content-core 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic.
- content_core/__init__.py +214 -0
- content_core/common/__init__.py +21 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/state.py +30 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +37 -0
- content_core/content/__init__.py +0 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +148 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/notebooks/run.ipynb +558 -0
- content_core/processors/audio.py +106 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +170 -0
- content_core/processors/text.py +37 -0
- content_core/processors/url.py +191 -0
- content_core/processors/video.py +167 -0
- content_core/processors/youtube.py +159 -0
- content_core/prompter.py +115 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +57 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-0.1.0.dist-info/METADATA +250 -0
- content_core-0.1.0.dist-info/RECORD +32 -0
- content_core-0.1.0.dist-info/WHEEL +4 -0
- content_core-0.1.0.dist-info/entry_points.txt +4 -0
content_core/__init__.py
ADDED
@@ -0,0 +1,214 @@
+import argparse
+import asyncio
+import json
+import os
+import sys
+from xml.etree import ElementTree as ET
+
+from dicttoxml import dicttoxml  # type: ignore
+from dotenv import load_dotenv
+from loguru import logger
+
+from content_core.common import ProcessSourceInput
+from content_core.content.cleanup import cleanup_content
+from content_core.content.extraction import extract_content
+from content_core.content.summary import summarize
+
+load_dotenv()
+
+# Configure loguru logger
+logger.remove()  # Remove default handler
+logger.add(sys.stderr, level="INFO")  # Default to INFO level
+
+
+def parse_content_format(content: str) -> str:
+    """Parse content that might be JSON or XML, extracting the 'content' field if present."""
+    try:
+        # Try JSON first
+        try:
+            json_data = json.loads(content)
+            if isinstance(json_data, dict) and "content" in json_data:
+                extracted = json_data["content"]
+                return str(extracted) if extracted is not None else content
+        except json.JSONDecodeError:
+            # Try XML
+            try:
+                root = ET.fromstring(content)
+                content_elem = root.find(".//content")
+                if content_elem is not None and content_elem.text is not None:
+                    return content_elem.text
+            except ET.ParseError:
+                pass
+        return content
+    except Exception as e:
+        logger.error(f"Error parsing content: {e}")
+        return content
+
+
+def get_content(args, parser, allow_empty=False):
+    """Helper to get content from args or stdin."""
+    if args.content is None:
+        if sys.stdin.isatty():
+            parser.error("No content provided. Provide content or pipe input.")
+        else:
+            content = sys.stdin.read().strip()
+    else:
+        content = args.content
+
+    if not content and not allow_empty:
+        parser.error("Empty input provided.")
+    return content
+
+
+async def process_input_content(content: str) -> str:
+    """Process input content, handling URLs and file paths."""
+    if "http" in content:
+        result = await extract_content(ProcessSourceInput(url=content))
+        content = result.content if result.content else str(result)
+    elif os.path.exists(content):
+        result = await extract_content(ProcessSourceInput(file_path=content))
+        content = result.content if result.content else str(result)
+    return content
+
+
+async def ccore_main():
+    """CLI logic for ccore (extract)."""
+    parser = argparse.ArgumentParser(
+        description="Content Core CLI: Extract content with formatting options."
+    )
+    parser.add_argument(
+        "-f",
+        "--format",
+        choices=["xml", "json", "text"],
+        default="text",
+        help="Output format (xml, json, or text). Default: text",
+    )
+    parser.add_argument(
+        "-d", "--debug", action="store_true", help="Enable debug logging."
+    )
+    parser.add_argument(
+        "content",
+        nargs="?",
+        help="Content to process (URL, file path, or text). If not provided, reads from stdin.",
+    )
+
+    args = parser.parse_args()
+
+    # Adjust logging level based on debug flag
+    if args.debug:
+        logger.remove()
+        logger.add(sys.stderr, level="DEBUG")
+        logger.debug("Debug logging enabled")
+
+    content = get_content(args, parser)
+
+    content = await process_input_content(content)
+
+    try:
+        result = await extract_content(ProcessSourceInput(content=content))
+        if args.format == "xml":
+            result = dicttoxml(
+                result.model_dump(), custom_root="result", attr_type=False
+            )
+        elif args.format == "json":
+            result = result.model_dump_json()
+        else:  # text
+            result = result.content
+        print(result)
+    except Exception as e:
+        logger.error(f"Error extracting content: {e}")
+        sys.exit(1)
+
+
+async def cclean_main():
+    """CLI logic for cclean."""
+    parser = argparse.ArgumentParser(
+        description="Content Core CLI: Clean content string."
+    )
+    parser.add_argument(
+        "-d", "--debug", action="store_true", help="Enable debug logging."
+    )
+    parser.add_argument(
+        "content",
+        nargs="?",
+        help="Content to clean (URL, file path, text, JSON, or XML). If not provided, reads from stdin.",
+    )
+
+    args = parser.parse_args()
+
+    # Adjust logging level based on debug flag
+    if args.debug:
+        logger.remove()
+        logger.add(sys.stderr, level="DEBUG")
+        logger.debug("Debug logging enabled")
+
+    content = get_content(args, parser)
+
+    content = await process_input_content(content)
+    content = parse_content_format(content)
+
+    try:
+        result = await cleanup_content(content)
+        print(result)
+    except Exception as e:
+        logger.error(f"Error cleaning content: {e}")
+        sys.exit(1)
+
+
+async def csum_main():
+    """CLI logic for csum."""
+    parser = argparse.ArgumentParser(
+        description="Content Core CLI: Summarize content with optional context."
+    )
+    parser.add_argument(
+        "--context",
+        default="",
+        help="Optional context for summarization (e.g., 'summarize as if explaining to a child').",
+    )
+    parser.add_argument(
+        "-d", "--debug", action="store_true", help="Enable debug logging."
+    )
+    parser.add_argument(
+        "content",
+        nargs="?",
+        help="Content to summarize (URL, file path, text, JSON, or XML). If not provided, reads from stdin.",
+    )
+
+    args = parser.parse_args()
+
+    # Adjust logging level based on debug flag
+    if args.debug:
+        logger.remove()
+        logger.add(sys.stderr, level="DEBUG")
+        logger.debug("Debug logging enabled")
+
+    content = get_content(args, parser)
+
+    content = await process_input_content(content)
+    content = parse_content_format(content)
+
+    try:
+        result = await summarize(content, args.context)
+        print(result)
+    except Exception as e:
+        logger.error(f"Error summarizing content: {e}")
+        sys.exit(1)
+
+
+def ccore():
+    """Synchronous wrapper for ccore."""
+    asyncio.run(ccore_main())
+
+
+def cclean():
+    """Synchronous wrapper for cclean."""
+    asyncio.run(cclean_main())
+
+
+def csum():
+    """Synchronous wrapper for csum."""
+    asyncio.run(csum_main())
+
+
+if __name__ == "__main__":
+    ccore()
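Beyond the ccore, cclean, and csum console entry points, the same coroutines can be driven from Python. A minimal sketch, assuming the package is installed and an OpenAI API key is available for the models hard-coded in config.py; the URL and context string are placeholders:

import asyncio

from content_core.common import ProcessSourceInput
from content_core.content.cleanup import cleanup_content
from content_core.content.extraction import extract_content
from content_core.content.summary import summarize


async def main():
    # Extract a page, then clean and summarize the extracted text.
    extracted = await extract_content(ProcessSourceInput(url="https://example.com"))
    cleaned = await cleanup_content(extracted.content)
    print(await summarize(cleaned, "one short paragraph"))


asyncio.run(main())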
content_core/common/__init__.py
ADDED
@@ -0,0 +1,21 @@
+"""Common utilities and shared code for content-core."""
+
+from .exceptions import (
+    ContentCoreError,
+    InvalidInputError,
+    NotFoundError,
+    UnsupportedTypeException,
+)
+from .state import ProcessSourceInput, ProcessSourceOutput, ProcessSourceState
+from .utils import process_input_content
+
+__all__ = [
+    "ContentCoreError",
+    "UnsupportedTypeException",
+    "InvalidInputError",
+    "NotFoundError",
+    "ProcessSourceInput",
+    "ProcessSourceState",
+    "ProcessSourceOutput",
+    "process_input_content",
+]
content_core/common/exceptions.py
ADDED
@@ -0,0 +1,70 @@
+class ContentCoreError(Exception):
+    """Base exception class for Open Notebook errors."""
+
+    pass
+
+
+class DatabaseOperationError(ContentCoreError):
+    """Raised when a database operation fails."""
+
+    pass
+
+
+class UnsupportedTypeException(ContentCoreError):
+    """Raised when an unsupported type is provided."""
+
+    pass
+
+
+class InvalidInputError(ContentCoreError):
+    """Raised when invalid input is provided."""
+
+    pass
+
+
+class NotFoundError(ContentCoreError):
+    """Raised when a requested resource is not found."""
+
+    pass
+
+
+class AuthenticationError(ContentCoreError):
+    """Raised when there's an authentication problem."""
+
+    pass
+
+
+class ConfigurationError(ContentCoreError):
+    """Raised when there's a configuration problem."""
+
+    pass
+
+
+class ExternalServiceError(ContentCoreError):
+    """Raised when an external service (e.g., AI model) fails."""
+
+    pass
+
+
+class RateLimitError(ContentCoreError):
+    """Raised when a rate limit is exceeded."""
+
+    pass
+
+
+class FileOperationError(ContentCoreError):
+    """Raised when a file operation fails."""
+
+    pass
+
+
+class NetworkError(ContentCoreError):
+    """Raised when a network operation fails."""
+
+    pass
+
+
+class NoTranscriptFound(ContentCoreError):
+    """Raised when no transcript is found for a video."""
+
+    pass
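Because the extraction graph raises UnsupportedTypeException for MIME types it cannot handle (see file_type_edge in graph.py below), callers can catch these specific subclasses instead of a bare Exception. A rough sketch; the file path is hypothetical:

import asyncio

from content_core.common import ProcessSourceInput, UnsupportedTypeException
from content_core.content.extraction import extract_content


async def safe_extract(path: str):
    try:
        return await extract_content(ProcessSourceInput(file_path=path))
    except UnsupportedTypeException as exc:
        # No processor is registered for this MIME type; skip instead of failing.
        print(f"Skipping {path}: {exc}")
        return None


asyncio.run(safe_extract("./some-unsupported-file.xyz"))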
content_core/common/state.py
ADDED
@@ -0,0 +1,30 @@
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class ProcessSourceState(BaseModel):
+    file_path: Optional[str] = ""
+    url: Optional[str] = ""
+    delete_source: bool = False
+    title: Optional[str] = ""
+    source_type: Optional[str] = ""
+    identified_type: Optional[str] = ""
+    identified_provider: Optional[str] = ""
+    metadata: Optional[dict] = Field(default_factory=lambda: {})
+    content: Optional[str] = ""
+
+
+class ProcessSourceInput(BaseModel):
+    content: Optional[str] = ""
+    file_path: Optional[str] = ""
+    url: Optional[str] = ""
+
+
+class ProcessSourceOutput(BaseModel):
+    title: Optional[str] = ""
+    source_type: Optional[str] = ""
+    identified_type: Optional[str] = ""
+    identified_provider: Optional[str] = ""
+    metadata: Optional[dict] = Field(default_factory=lambda: {})
+    content: Optional[str] = ""
content_core/common/utils.py
ADDED
@@ -0,0 +1,31 @@
+import os
+import re
+import validators
+
+from .state import ProcessSourceInput
+
+
+async def process_input_content(content: str) -> str:
+    """
+    Process input content to handle URLs and file paths.
+    If the input is a URL or file path, extract the content from it.
+    """
+    # Check if content is a URL
+    if validators.url(content):
+        from content_core.extraction import extract_content
+        content_input = ProcessSourceInput(url=content)
+        extracted = await extract_content(content_input)
+        return extracted.content if extracted.content else str(extracted)
+
+    # Check if content is a file path (simplified check for demonstration)
+    if re.match(r"^[a-zA-Z0-9_/\-\.]+\.[a-zA-Z0-9]+$", content):
+        if os.path.exists(content):
+            from content_core.extraction import extract_content
+            content_input = ProcessSourceInput(file_path=content)
+            extracted = await extract_content(content_input)
+            return extracted.content if extracted.content else str(extracted)
+        else:
+            raise ValueError(f"File not found: {content}")
+
+    # If neither URL nor file path, return content as is
+    return content
content_core/config.py
ADDED
@@ -0,0 +1,37 @@
+from esperanto import AIFactory
+from esperanto.providers.stt import SpeechToTextModel
+
+SPEECH_TO_TEXT_MODEL: SpeechToTextModel = AIFactory.create_speech_to_text(
+    "openai", "whisper-1"
+)
+
+DEFAULT_MODEL = AIFactory.create_language(
+    "openai",
+    "gpt-4o-mini",
+    config={
+        "temperature": 0.5,
+        "top_p": 1,
+        "max_tokens": 2000,
+    },
+)
+
+CLEANUP_MODEL = AIFactory.create_language(
+    "openai",
+    "gpt-4o-mini",
+    config={
+        "temperature": 0,
+        "max_tokens": 8000,
+        "output_format": "json",
+        # "stream": True,  # TODO: handle streaming
+    },
+)  # Fix deprecation
+
+SUMMARY_MODEL = AIFactory.create_language(
+    "openai",
+    "gpt-4o-mini",
+    config={
+        "temperature": 0,
+        "top_p": 1,
+        "max_tokens": 2000,
+    },
+)
content_core/content/__init__.py
File without changes
content_core/content/cleanup/core.py
ADDED
@@ -0,0 +1,15 @@
+from functools import partial
+
+from content_core.config import CLEANUP_MODEL
+from content_core.templated_message import TemplatedMessageInput, templated_message
+
+
+async def cleanup_content(content) -> str:
+    templated_summary_fn = partial(templated_message, model=CLEANUP_MODEL)
+    input = TemplatedMessageInput(
+        system_prompt_template="content/cleanup",
+        user_prompt_text=content,
+        data={"content": content},
+    )
+    result = await templated_summary_fn(input)
+    return result
content_core/content/extraction/__init__.py
ADDED
@@ -0,0 +1,13 @@
+from typing import Dict, Union
+
+from content_core.common import ProcessSourceInput, ProcessSourceOutput
+from content_core.content.extraction.graph import graph
+
+# todo: input/output schema do langgraph
+
+
+async def extract_content(data: Union[ProcessSourceInput, Dict]) -> ProcessSourceOutput:
+    if isinstance(data, dict):
+        data = ProcessSourceInput(**data)
+    result = await graph.ainvoke(data)
+    return ProcessSourceOutput(**result)
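Because extract_content coerces a plain dict into ProcessSourceInput before invoking the compiled LangGraph workflow, both call styles below are equivalent. A small illustration; the URL and file path are placeholders:

import asyncio

from content_core.common import ProcessSourceInput
from content_core.content.extraction import extract_content


async def demo():
    # Pydantic model input
    by_model = await extract_content(ProcessSourceInput(url="https://example.com"))
    # Plain dict input, converted to ProcessSourceInput internally
    by_dict = await extract_content({"file_path": "./notes.txt"})
    return by_model.content, by_dict.content


asyncio.run(demo())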
content_core/content/extraction/graph.py
ADDED
@@ -0,0 +1,148 @@
+import os
+from typing import Any, Dict, Optional
+
+import magic
+from langgraph.graph import END, START, StateGraph
+from loguru import logger
+
+from content_core.common import (
+    ProcessSourceInput,
+    ProcessSourceState,
+    UnsupportedTypeException,
+)
+from content_core.processors.audio import extract_audio  # type: ignore
+from content_core.processors.office import (
+    SUPPORTED_OFFICE_TYPES,
+    extract_office_content,
+)
+from content_core.processors.pdf import SUPPORTED_FITZ_TYPES, extract_pdf
+from content_core.processors.text import extract_txt
+from content_core.processors.url import extract_url, url_provider
+from content_core.processors.video import extract_best_audio_from_video
+from content_core.processors.youtube import extract_youtube_transcript
+
+
+async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
+    """
+    Identify the content source based on parameters
+    """
+    if state.content:
+        doc_type = "text"
+    elif state.file_path:
+        doc_type = "file"
+    elif state.url:
+        doc_type = "url"
+    else:
+        raise ValueError("No source provided.")
+
+    return {"source_type": doc_type}
+
+
+async def file_type(state: ProcessSourceState) -> Dict[str, Any]:
+    """
+    Identify the file using python-magic
+    """
+    return_dict = {}
+    file_path = state.file_path
+    if file_path is not None:
+        return_dict["identified_type"] = magic.from_file(file_path, mime=True)
+        return_dict["title"] = os.path.basename(file_path)
+    return return_dict
+
+
+async def file_type_edge(data: ProcessSourceState) -> str:
+    assert data.identified_type, "Type not identified"
+    identified_type = data.identified_type
+
+    if identified_type == "text/plain":
+        return "extract_txt"
+    elif identified_type in SUPPORTED_FITZ_TYPES:
+        return "extract_pdf"
+    elif identified_type in SUPPORTED_OFFICE_TYPES:
+        return "extract_office_content"
+    elif identified_type.startswith("video"):
+        return "extract_best_audio_from_video"
+    elif identified_type.startswith("audio"):
+        return "extract_audio"
+    else:
+        raise UnsupportedTypeException(f"Unsupported file type: {data.identified_type}")
+
+
+async def delete_file(data: ProcessSourceState) -> Dict[str, Any]:
+    if data.delete_source:
+        logger.debug(f"Deleting file: {data.file_path}")
+        file_path = data.file_path
+        if file_path is not None:
+            try:
+                os.remove(file_path)
+                return {"file_path": None}
+            except FileNotFoundError:
+                logger.warning(f"File not found while trying to delete: {file_path}")
+    else:
+        logger.debug("Not deleting file")
+    return {}
+
+
+async def url_type_router(x: ProcessSourceState) -> Optional[str]:
+    return x.identified_type
+
+
+async def source_type_router(x: ProcessSourceState) -> Optional[str]:
+    return x.source_type
+
+
+# Create workflow
+workflow = StateGraph(
+    ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
+)
+
+# Add nodes
+workflow.add_node("source", source_identification)
+workflow.add_node("url_provider", url_provider)
+workflow.add_node("file_type", file_type)
+workflow.add_node("extract_txt", extract_txt)
+workflow.add_node("extract_pdf", extract_pdf)
+workflow.add_node("extract_url", extract_url)
+workflow.add_node("extract_office_content", extract_office_content)
+workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video)
+workflow.add_node("extract_audio", extract_audio)
+workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
+workflow.add_node("delete_file", delete_file)
+
+# Add edges
+workflow.add_edge(START, "source")
+workflow.add_conditional_edges(
+    "source",
+    source_type_router,
+    {
+        "url": "url_provider",
+        "file": "file_type",
+        "text": END,
+    },
+)
+workflow.add_conditional_edges(
+    "file_type",
+    file_type_edge,
+)
+workflow.add_conditional_edges(
+    "url_provider",
+    url_type_router,
+    {"article": "extract_url", "youtube": "extract_youtube_transcript"},
+)
+workflow.add_edge("url_provider", END)
+workflow.add_edge("file_type", END)
+workflow.add_edge("extract_url", END)
+workflow.add_edge("extract_txt", END)
+workflow.add_edge("extract_youtube_transcript", END)
+
+workflow.add_edge("extract_pdf", "delete_file")
+workflow.add_edge("extract_office_content", "delete_file")
+workflow.add_edge("extract_best_audio_from_video", "extract_audio")
+workflow.add_edge("extract_audio", "delete_file")
+workflow.add_edge("delete_file", END)
+
+# Compile graph
+graph = workflow.compile()
+
+# Compile graph
+graph = workflow.compile()
content_core/content/summary/core.py
ADDED
@@ -0,0 +1,15 @@
+from functools import partial
+
+from content_core.config import SUMMARY_MODEL
+from content_core.templated_message import TemplatedMessageInput, templated_message
+
+
+async def summarize(content: str, context: str) -> str:
+    templated_message_fn = partial(templated_message, model=SUMMARY_MODEL)
+    response = await templated_message_fn(
+        TemplatedMessageInput(
+            user_prompt_template="content/summarize",
+            data={"content": content, "context": context},
+        )
+    )
+    return response