content-core 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic.

@@ -15,10 +15,8 @@ from content_core.common import (
15
15
  from content_core.config import CONFIG # type: ignore
16
16
  from content_core.logging import logger
17
17
  from content_core.processors.audio import extract_audio_data # type: ignore
18
- from content_core.processors.docling import (
19
- DOCLING_SUPPORTED, # type: ignore
20
- extract_with_docling,
21
- )
18
+ from content_core.processors.docling import DOCLING_SUPPORTED # type: ignore
19
+ from content_core.processors.docling import extract_with_docling
22
20
  from content_core.processors.office import (
23
21
  SUPPORTED_OFFICE_TYPES,
24
22
  extract_office_content,
@@ -186,8 +184,3 @@ workflow.add_edge("download_remote_file", "file_type")
 
 # Compile graph
 graph = workflow.compile()
-
-# Compile graph
-graph = workflow.compile()
-# Compile graph
-graph = workflow.compile()
@@ -1,7 +1,9 @@
 import re
 from urllib.parse import urlparse
+from io import BytesIO
 
 import aiohttp
+import docx
 from bs4 import BeautifulSoup, Comment
 
 from content_core.common import ProcessSourceState
@@ -12,6 +14,49 @@ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 # https://github.com/buriy/python-readability
 # also try readability: from readability import Document
 
+DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+async def _extract_docx_content(docx_bytes: bytes, url: str):
+    """
+    Extract content from DOCX file bytes.
+    """
+    try:
+        logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
+        doc = docx.Document(BytesIO(docx_bytes))
+        content_parts = [p.text for p in doc.paragraphs if p.text]
+        full_content = "\n\n".join(content_parts)
+
+        # Try to get a title from document properties or first heading
+        title = doc.core_properties.title
+        if not title and doc.paragraphs:
+            # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
+            for p in doc.paragraphs[:5]:  # Check first 5 paragraphs
+                if p.style.name.startswith('Heading'):
+                    title = p.text
+                    break
+            if not title:  # Fallback to first line if no heading found
+                title = doc.paragraphs[0].text.strip() if doc.paragraphs[0].text.strip() else None
+
+        # If no title found, use filename from URL
+        if not title:
+            title = urlparse(url).path.split('/')[-1]
+
+        logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
+        return {
+            "title": title,
+            "content": full_content,
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
+    except Exception as e:
+        logger.error(f"Failed to process DOCX content from {url}: {e}")
+        # Fallback or re-raise, depending on desired error handling
+        return {
+            "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
+            "content": f"Failed to extract content from DOCX: {e}",
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
 
 async def url_provider(state: ProcessSourceState):
     """
@@ -54,6 +99,13 @@ async def extract_url_bs4(url: str):
     async with aiohttp.ClientSession() as session:
         async with session.get(url, headers=headers, timeout=10) as response:
            response.raise_for_status()
+            # Check content type for DOCX
+            if response.content_type == DOCX_MIME_TYPE:
+                logger.debug(f"Detected DOCX content type for {url}")
+                docx_bytes = await response.read()
+                return await _extract_docx_content(docx_bytes, url)
+
+            # If not DOCX, proceed as HTML
             html_content = await response.text()
 
     soup = BeautifulSoup(html_content, "html.parser")
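With this branch in place, extract_url_bs4 inspects the response's Content-Type, routes DOCX responses through _extract_docx_content, and lets everything else continue down the existing BeautifulSoup HTML path, so callers get the same dict shape either way. A minimal sketch of the new flow, assuming extract_url_bs4 is importable from content_core.processors.url (the URL is illustrative):

    import asyncio

    from content_core.processors.url import extract_url_bs4  # assumed module path

    async def main():
        # A URL served with the DOCX MIME type is parsed with python-docx;
        # an ordinary HTML page still goes through BeautifulSoup.
        result = await extract_url_bs4("https://example.com/files/report.docx")
        print(result["title"], result["domain"])

    asyncio.run(main())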
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.6.0
+Version: 0.7.0
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -9,14 +9,12 @@ Requires-Dist: ai-prompter>=0.2.3
 Requires-Dist: aiohttp>=3.11
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
-Requires-Dist: esperanto>=1.2.0
-Requires-Dist: google-genai>=1.10.0
+Requires-Dist: esperanto[openai]>=1.2.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: langgraph>=0.3.29
 Requires-Dist: loguru>=0.7.3
 Requires-Dist: moviepy>=2.1.2
-Requires-Dist: openai>=1.73.0
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.2.3
 Requires-Dist: pymupdf>=1.25.5
@@ -28,7 +26,7 @@ Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
 Provides-Extra: docling
 Requires-Dist: asciidoc; extra == 'docling'
-Requires-Dist: docling[ocr]; extra == 'docling'
+Requires-Dist: docling; extra == 'docling'
 Requires-Dist: pandas; extra == 'docling'
 Requires-Dist: pillow; extra == 'docling'
 Description-Content-Type: text/markdown
@@ -14,7 +14,7 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=d5Hp7GS2dFpYQIHFTIFhU-7ySZ3lfipdDxZZpe2DXS8,6361
+content_core/content/extraction/graph.py,sha256=IKu-bV3YG2MigHnYixYYhtrQ-4qgGpETerXBEFn73zU,6304
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
@@ -23,15 +23,15 @@ content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_C
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
+content_core/processors/url.py,sha256=vmkBVfJ1xpZQzlhRdkO64V1J9xdTBr6nrXY4M74QzEo,9094
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.6.0.dist-info/METADATA,sha256=pn72ciBGpWE7tVvJ2j3NmQPmFB60cNrkHBmp5ziuyqk,10534
-content_core-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.6.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.6.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.6.0.dist-info/RECORD,,
+content_core-0.7.0.dist-info/METADATA,sha256=CFTVOA8hnMcofSlIlR-RwcCmvD9Hsa6mxFPjisBMKus,10471
+content_core-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.7.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.7.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.7.0.dist-info/RECORD,,