PyPI - content-core - Versions diffs - 0.6.0__tar.gz → 0.7.2__tar.gz - Mend

content-core 0.6.0__tar.gz → 0.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of content-core might be problematic. Click here for more details.

Files changed (59) hide show

{content_core-0.6.0 → content_core-0.7.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.6.0
+Version: 0.7.2
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -9,14 +9,12 @@ Requires-Dist: ai-prompter>=0.2.3
 Requires-Dist: aiohttp>=3.11
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
-Requires-Dist: esperanto>=1.2.0
-Requires-Dist: google-genai>=1.10.0
+Requires-Dist: esperanto[openai]>=1.2.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: langgraph>=0.3.29
 Requires-Dist: loguru>=0.7.3
 Requires-Dist: moviepy>=2.1.2
-Requires-Dist: openai>=1.73.0
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.2.3
 Requires-Dist: pymupdf>=1.25.5
@@ -28,7 +26,7 @@ Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
 Provides-Extra: docling
 Requires-Dist: asciidoc; extra == 'docling'
-Requires-Dist: docling[ocr]; extra == 'docling'
+Requires-Dist: docling; extra == 'docling'
 Requires-Dist: pandas; extra == 'docling'
 Requires-Dist: pillow; extra == 'docling'
 Description-Content-Type: text/markdown

{content_core-0.6.0 → content_core-0.7.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "content-core"
-version = "0.6.0"
+version = "0.7.2"
 description = "Extract what matters from any media source"
 readme = "README.md"
 homepage = "https://github.com/lfnovo/content-core"
@@ -11,12 +11,10 @@ requires-python = ">=3.10"
 dependencies = [
     "aiohttp>=3.11",
     "bs4>=0.0.2",
-    "esperanto>=1.2.0",
-    "google-genai>=1.10.0",
+    "esperanto[openai]>=1.2.0",
     "jinja2>=3.1.6",
     "langdetect>=1.0.9",
     "loguru>=0.7.3",
-    "openai>=1.73.0",
     "openpyxl>=3.1.5",
     "pandas>=2.2.3",
     "pymupdf>=1.25.5",
@@ -33,7 +31,7 @@ dependencies = [
 ]
 [project.optional-dependencies]
-docling = ["docling[ocr]", "Pillow", "pandas", "asciidoc"]
+docling = ["docling", "Pillow", "pandas", "asciidoc"]
 [project.scripts]
 ccore = "content_core:ccore"
@@ -54,6 +52,7 @@ package-dir = {"content_core" = "src/content_core"}
 dev = [
     "ipykernel>=4.0.1",
     "ipywidgets>=4.0.0",
+    "openai>=1.78.1",
     "pyperclip>=1.9.0",
     "pytest>=7.2.0",
     "pytest-asyncio>=0.21.0",

{content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/state.py RENAMED Viewed

@@ -13,8 +13,13 @@ class ProcessSourceState(BaseModel):
     identified_provider: Optional[str] = ""
     metadata: Optional[dict] = Field(default_factory=lambda: {})
     content: Optional[str] = ""
-    engine: Optional[str] = Field(default=None, description="Override extraction engine: 'legacy' or 'docling'")
-    output_format: Optional[str] = Field(default=None, description="Override Docling output format: 'markdown', 'html', or 'json'")
+    engine: Optional[str] = Field(
+        default=None, description="Override extraction engine: 'legacy' or 'docling'"
+    )
+    output_format: Optional[str] = Field(
+        default=None,
+        description="Override Docling output format: 'markdown', 'html', or 'json'",
+    )
 class ProcessSourceInput(BaseModel):
@@ -27,6 +32,8 @@ class ProcessSourceInput(BaseModel):
 class ProcessSourceOutput(BaseModel):
     title: Optional[str] = ""
+    file_path: Optional[str] = ""
+    url: Optional[str] = ""
     source_type: Optional[str] = ""
     identified_type: Optional[str] = ""
     identified_provider: Optional[str] = ""

{content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/extraction/graph.py RENAMED Viewed

@@ -15,10 +15,8 @@ from content_core.common import (
 from content_core.config import CONFIG  # type: ignore
 from content_core.logging import logger
 from content_core.processors.audio import extract_audio_data  # type: ignore
-from content_core.processors.docling import (
-    DOCLING_SUPPORTED,  # type: ignore
-    extract_with_docling,
-)
+from content_core.processors.docling import DOCLING_SUPPORTED  # type: ignore
+from content_core.processors.docling import extract_with_docling
 from content_core.processors.office import (
     SUPPORTED_OFFICE_TYPES,
     extract_office_content,
@@ -186,8 +184,3 @@ workflow.add_edge("download_remote_file", "file_type")
 # Compile graph
 graph = workflow.compile()
-# Compile graph
-graph = workflow.compile()
-# Compile graph
-graph = workflow.compile()

{content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/url.py RENAMED Viewed

@@ -1,7 +1,9 @@
 import re
 from urllib.parse import urlparse
+from io import BytesIO
 import aiohttp
+import docx
 from bs4 import BeautifulSoup, Comment
 from content_core.common import ProcessSourceState
@@ -12,6 +14,49 @@ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 # https://github.com/buriy/python-readability
 # also try readability: from readability import Document
+DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+async def _extract_docx_content(docx_bytes: bytes, url: str):
+    """
+    Extract content from DOCX file bytes.
+    """
+    try:
+        logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
+        doc = docx.Document(BytesIO(docx_bytes))
+        content_parts = [p.text for p in doc.paragraphs if p.text]
+        full_content = "\n\n".join(content_parts)
+        # Try to get a title from document properties or first heading
+        title = doc.core_properties.title
+        if not title and doc.paragraphs:
+            # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
+            for p in doc.paragraphs[:5]: # Check first 5 paragraphs
+                if p.style.name.startswith('Heading'):
+                    title = p.text
+                    break
+            if not title: # Fallback to first line if no heading found
+                 title = doc.paragraphs[0].text.strip() if doc.paragraphs[0].text.strip() else None
+        # If no title found, use filename from URL
+        if not title:
+            title = urlparse(url).path.split('/')[-1]
+        logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
+        return {
+            "title": title,
+            "content": full_content,
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
+    except Exception as e:
+        logger.error(f"Failed to process DOCX content from {url}: {e}")
+        # Fallback or re-raise, depending on desired error handling
+        return {
+            "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
+            "content": f"Failed to extract content from DOCX: {e}",
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
 async def url_provider(state: ProcessSourceState):
     """
@@ -54,6 +99,13 @@ async def extract_url_bs4(url: str):
             async with aiohttp.ClientSession() as session:
                 async with session.get(url, headers=headers, timeout=10) as response:
                     response.raise_for_status()
+                    # Check content type for DOCX
+                    if response.content_type == DOCX_MIME_TYPE:
+                        logger.debug(f"Detected DOCX content type for {url}")
+                        docx_bytes = await response.read()
+                        return await _extract_docx_content(docx_bytes, url)
+                    # If not DOCX, proceed as HTML
                     html_content = await response.text()
         soup = BeautifulSoup(html_content, "html.parser")

content-core 0.6.0__tar.gz → 0.7.2__tar.gz

Potentially problematic release.

content-core 0.6.0__tar.gz → 0.7.2__tar.gz