content-core 0.8.1__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of content-core might be problematic.

content_core/cc_config.yaml

@@ -33,3 +33,7 @@ extraction:
   engine: legacy # change to 'docling' to enable Docling engine
   docling:
     output_format: markdown # markdown | html | json
+
+youtube_transcripts:
+  preferred_languages: ["en", "es", "pt"]
+
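The new youtube_transcripts block is consumed through the package's CONFIG object; a minimal sketch of reading it, mirroring the CONFIG.get call that appears in the processors/youtube.py hunk further down (same ["en", "es", "pt"] fallback):

    from content_core.config import CONFIG

    # Falls back to the packaged defaults when the key is missing.
    languages = CONFIG.get("youtube_transcripts", {}).get(
        "preferred_languages", ["en", "es", "pt"]
    )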
content_core/content/__init__.py

@@ -1,5 +1,6 @@
 from .cleanup import cleanup_content
 from .extraction import extract_content
+from .identification import get_file_type
 from .summary import summarize
 
-__all__ = ["extract_content", "cleanup_content", "summarize"]
+__all__ = ["extract_content", "cleanup_content", "summarize", "get_file_type"]
content_core/content/extraction/graph.py

@@ -2,7 +2,6 @@ import os
 import tempfile
 from typing import Any, Dict, Optional
 from urllib.parse import urlparse
-from content_core.common.types import warn_if_deprecated_engine
 
 import aiohttp
 import magic
@@ -13,11 +12,14 @@ from content_core.common import (
     ProcessSourceState,
     UnsupportedTypeException,
 )
+from content_core.common.types import warn_if_deprecated_engine
 from content_core.config import CONFIG  # type: ignore
 from content_core.logging import logger
 from content_core.processors.audio import extract_audio_data  # type: ignore
-from content_core.processors.docling import DOCLING_SUPPORTED  # type: ignore
-from content_core.processors.docling import extract_with_docling
+from content_core.processors.docling import (
+    DOCLING_SUPPORTED,  # type: ignore
+    extract_with_docling,
+)
 from content_core.processors.office import (
     SUPPORTED_OFFICE_TYPES,
     extract_office_content,
@@ -60,6 +62,7 @@ async def file_type(state: ProcessSourceState) -> Dict[str, Any]:
 async def file_type_edge(data: ProcessSourceState) -> str:
     assert data.identified_type, "Type not identified"
     identified_type = data.identified_type
+    logger.debug(f"Identified type: {identified_type}")
 
     if identified_type == "text/plain":
         return "extract_txt"
@@ -91,16 +94,19 @@ async def delete_file(data: ProcessSourceState) -> Dict[str, Any]:
 
 
 async def url_type_router(x: ProcessSourceState) -> Optional[str]:
+    assert x.identified_type, "Type not identified"
     return x.identified_type
 
 
 async def source_type_router(x: ProcessSourceState) -> Optional[str]:
+    assert x.source_type, "Source type not identified"
     return x.source_type
 
 
 async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     url = state.url
     assert url, "No URL provided"
+    logger.debug(f"Downloading remote file: {url}")
     async with aiohttp.ClientSession() as session:
         async with session.get(url) as resp:
             resp.raise_for_status()
@@ -115,7 +121,6 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     return {"file_path": tmp, "identified_type": mime}
 
 
-
 async def file_type_router_docling(state: ProcessSourceState) -> str:
     """
     Route to Docling if enabled and supported; otherwise use simple file type edge.
@@ -125,18 +130,25 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
     engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
     warn_if_deprecated_engine(engine)
     if engine == "auto":
+        logger.debug("Using auto engine")
         # Try docling first; if it fails or is not supported, fallback to simple
         if state.identified_type in DOCLING_SUPPORTED:
             try:
+                logger.debug("Trying docling extraction")
                 return "extract_docling"
             except Exception as e:
-                logger.warning(f"Docling extraction failed in 'auto' mode, falling back to simple: {e}")
+                logger.warning(
+                    f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
+                )
         # Fallback to simple
+        logger.debug("Falling back to simple extraction")
        return await file_type_edge(state)
 
     if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
+        logger.debug("Using docling engine")
         return "extract_docling"
     # For 'simple' and 'legacy', use the default file type edge
+    logger.debug("Using simple engine")
     return await file_type_edge(state)
 
 
@@ -179,7 +191,12 @@ workflow.add_conditional_edges(
     "url_provider",
     url_type_router,
     {
-        **{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES},
+        **{
+            m: "download_remote_file"
+            for m in list(SUPPORTED_FITZ_TYPES)
+            + list(SUPPORTED_OFFICE_TYPES)
+            + list(DOCLING_SUPPORTED)
+        },
         "article": "extract_url",
         "youtube": "extract_youtube_transcript",
     },
@@ -197,5 +214,4 @@ workflow.add_edge("extract_audio_data", "delete_file")
 workflow.add_edge("delete_file", END)
 workflow.add_edge("download_remote_file", "file_type")
 
-# Compile graph
 graph = workflow.compile()
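Net effect of the graph changes: remote URLs whose MIME type is supported by PyMuPDF, Office, or Docling are all downloaded before extraction. A hypothetical standalone sketch of the broadened routing (route_url is not part of the package; the real mapping lives in the conditional-edges dict above):

    def route_url(mime: str) -> str:
        # The union of the three support sets now triggers a download first.
        downloadable = (
            set(SUPPORTED_FITZ_TYPES)
            | set(SUPPORTED_OFFICE_TYPES)
            | set(DOCLING_SUPPORTED)
        )
        if mime in downloadable:
            return "download_remote_file"
        # "article" and "youtube" keep their dedicated extraction paths.
        return {"article": "extract_url", "youtube": "extract_youtube_transcript"}[mime]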
content_core/content/identification/__init__.py

@@ -0,0 +1,8 @@
+import magic
+
+
+async def get_file_type(file_path: str) -> str:
+    """
+    Identify the file using python-magic
+    """
+    return magic.from_file(file_path, mime=True)
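The entire new module is the eight lines above; a minimal usage sketch (assuming python-magic's libmagic bindings are installed, and with "example.pdf" as a placeholder for a real local file):

    import asyncio

    from content_core.content import get_file_type

    # Prints a MIME string such as "application/pdf".
    print(asyncio.run(get_file_type("example.pdf")))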
content_core/processors/docling.py

@@ -26,7 +26,7 @@ DOCLING_SUPPORTED = {
     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     "text/markdown",
-    "text/plain",
+    # "text/plain", #docling currently not supporting txt
     "text/x-markdown",
     "text/csv",
     "text/html",
content_core/processors/url.py

@@ -1,68 +1,16 @@
 import os
-from io import BytesIO
-from urllib.parse import urlparse
 
 import aiohttp
-import docx
 from bs4 import BeautifulSoup
 from readability import Document
 
 from content_core.common import ProcessSourceState
 from content_core.common.types import warn_if_deprecated_engine
 from content_core.logging import logger
+from content_core.processors.docling import DOCLING_SUPPORTED
+from content_core.processors.office import SUPPORTED_OFFICE_TYPES
 from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 
-DOCX_MIME_TYPE = (
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-)
-
-
-async def _extract_docx_content(docx_bytes: bytes, url: str):
-    """
-    Extract content from DOCX file bytes.
-    """
-    try:
-        logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
-        doc = docx.Document(BytesIO(docx_bytes))
-        content_parts = [p.text for p in doc.paragraphs if p.text]
-        full_content = "\n\n".join(content_parts)
-
-        # Try to get a title from document properties or first heading
-        title = doc.core_properties.title
-        if not title and doc.paragraphs:
-            # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
-            for p in doc.paragraphs[:5]:  # Check first 5 paragraphs
-                if p.style.name.startswith("Heading"):
-                    title = p.text
-                    break
-            if not title:  # Fallback to first line if no heading found
-                title = (
-                    doc.paragraphs[0].text.strip()
-                    if doc.paragraphs[0].text.strip()
-                    else None
-                )
-
-        # If no title found, use filename from URL
-        if not title:
-            title = urlparse(url).path.split("/")[-1]
-
-        logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
-        return {
-            "title": title,
-            "content": full_content,
-            "domain": urlparse(url).netloc,
-            "url": url,
-        }
-    except Exception as e:
-        logger.error(f"Failed to process DOCX content from {url}: {e}")
-        # Fallback or re-raise, depending on desired error handling
-        return {
-            "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
-            "content": f"Failed to extract content from DOCX: {e}",
-            "domain": urlparse(url).netloc,
-            "url": url,
-        }
-
 
 async def url_provider(state: ProcessSourceState):
     """
@@ -81,12 +29,19 @@ async def url_provider(state: ProcessSourceState):
                 url, timeout=10, allow_redirects=True
             ) as resp:
                 mime = resp.headers.get("content-type", "").split(";", 1)[0]
+                logger.debug(f"MIME type for {url}: {mime}")
         except Exception as e:
-            logger.debug(f"HEAD check failed for {url}: {e}")
+            logger.warning(f"HEAD check failed for {url}: {e}")
             mime = "article"
-        if mime in SUPPORTED_FITZ_TYPES:
+        if (
+            mime in DOCLING_SUPPORTED
+            or mime in SUPPORTED_FITZ_TYPES
+            or mime in SUPPORTED_OFFICE_TYPES
+        ):
+            logger.warning(f"Identified type for {url}: {mime}")
             return_dict["identified_type"] = mime
         else:
+            logger.warning(f"Identified type for {url}: article")
             return_dict["identified_type"] = "article"
     return return_dict
 
content_core/processors/youtube.py

@@ -8,6 +8,7 @@ from youtube_transcript_api.formatters import TextFormatter  # type: ignore
 
 from content_core.common import ProcessSourceState
 from content_core.common.exceptions import NoTranscriptFound
+from content_core.config import CONFIG
 from content_core.logging import logger
 
 ssl._create_default_https_context = ssl._create_unverified_context
@@ -137,10 +138,11 @@ async def extract_youtube_transcript(state: ProcessSourceState):
     Parse the text file and print its content.
     """
 
-    languages = ["en", "es", "pt"]
-    # languages = CONFIG.get("youtube_transcripts", {}).get(
-    #     "preferred_languages", ["en", "es", "pt"]
-    # )
+    assert state.url, "No URL provided"
+    logger.warning(f"Extracting transcript from URL: {state.url}")
+    languages = CONFIG.get("youtube_transcripts", {}).get(
+        "preferred_languages", ["en", "es", "pt"]
+    )
 
     video_id = await _extract_youtube_id(state.url)
     transcript = await get_best_transcript(video_id, languages)
@@ -152,9 +154,24 @@ async def extract_youtube_transcript(state: ProcessSourceState):
     except Exception as e:
         logger.critical(f"Failed to get video title for video_id: {video_id}")
         logger.exception(e)
-        title = None
+        title = ""
+
+    try:
+        formatted_content = formatter.format_transcript(transcript)
+    except Exception as e:
+        logger.critical(f"Failed to format transcript for video_id: {video_id}")
+        logger.exception(e)
+        formatted_content = ""
+
+    try:
+        transcript_raw = transcript.to_raw_data()
+    except Exception as e:
+        logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
+        logger.exception(e)
+        transcript_raw = ""
+
     return {
-        "content": formatter.format_transcript(transcript),
+        "content": formatted_content,
         "title": title,
-        "metadata": {"video_id": video_id, "transcript": transcript.to_raw_data()},
+        "metadata": {"video_id": video_id, "transcript": transcript_raw},
     }
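A minimal sketch of exercising the hardened extractor directly (assuming ProcessSourceState accepts a url keyword, as the state.url assertion above implies; the video URL is a placeholder):

    import asyncio

    from content_core.common import ProcessSourceState
    from content_core.processors.youtube import extract_youtube_transcript

    # Title, formatted transcript, and raw transcript each degrade to ""
    # on failure instead of raising.
    state = ProcessSourceState(url="https://www.youtube.com/watch?v=VIDEO_ID")
    result = asyncio.run(extract_youtube_transcript(state))
    print(result["title"], len(result["content"]))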
content_core-0.8.5.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.8.1
+Version: 0.8.5
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
content_core-0.8.5.dist-info/RECORD

@@ -1,5 +1,5 @@
 content_core/__init__.py,sha256=ANKeslNXOGumwrkjqgRik23e5PdGps2C0FSup8_XH2Y,6515
-content_core/cc_config.yaml,sha256=w66fo5ut6TPaU3o4hkjnroqg2hkr8YuOG3BRtI50j1s,701
+content_core/cc_config.yaml,sha256=tfbnJ4h9DWuJUljJrnz72s6TD24hD5P-uEPA9K_pNVY,767
 content_core/config.py,sha256=-aUsTB6Z3fa_XIWdHNXhMgWkVLWjEW1kfyQXXB_-j54,1632
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
@@ -11,28 +11,29 @@ content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cI
 content_core/common/state.py,sha256=pO8Oq71KxznlZ4K5qUVfyLrNsZWd2yMO9bXKmrTIXQo,1427
 content_core/common/types.py,sha256=FpIzYadBvafGI4e1EuwGjjiPuawL1HitxsQOciNjTZo,497
 content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
-content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCrefU,171
+content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeFiQM,230
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=51B_j_hi7SsKh7kKNLFsMmxyR2HVS-mOYfKvDFyuYfw,7001
+content_core/content/extraction/graph.py,sha256=Z8IqcFQmWLJG44jJ4399mBDQVMH-mYuQQpBDHTBUEe0,7571
+content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
 content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
-content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
+content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=yt-uuzS4N-RAOJ8vo5x-b4bgnrFeTV-3SDIatRTRI3g,9462
+content_core/processors/url.py,sha256=qdtEIhZpi62zMXbwbCmmh86ySoomscwqxHdFib7QC-M,7898
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
-content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
+content_core/processors/youtube.py,sha256=g_A-rv5bzq2GIuwqMH70YAnDK-4BZqpgQP0IQ3j9zXE,6340
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.8.1.dist-info/METADATA,sha256=ZIW6gtawFeFo2uQqWkFH2ctSYIUq5PBrke4gyHQQAWU,11439
-content_core-0.8.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.8.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.8.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.8.1.dist-info/RECORD,,
+content_core-0.8.5.dist-info/METADATA,sha256=rba5vG3Vkm5WRKHfbTDay5xK4JD4kbPNFow9AoTNHDE,11439
+content_core-0.8.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.8.5.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.8.5.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.8.5.dist-info/RECORD,,