PyPI - content-core - Versions diffs - 0.8.5__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

content-core 0.8.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of content-core might be problematic. Click here for more details.

Files changed (12) hide show

content_core/__init__.py CHANGED Viewed

@@ -113,7 +113,7 @@ async def ccore_main():
         if args.format == "xml":
             result = dicttoxml(
                 result.model_dump(), custom_root="result", attr_type=False
-            )
+            ).decode('utf-8')
         elif args.format == "json":
             result = result.model_dump_json()
         else:  # text

content_core/cc_config.yaml CHANGED Viewed

@@ -30,7 +30,8 @@ summary_model:
     max_tokens: 2000
 extraction:
-  engine: legacy  # change to 'docling' to enable Docling engine
+  document_engine: auto  # auto | simple | docling - for files/documents
+  url_engine: auto  # auto | simple | firecrawl | jina | docling - for URLs
   docling:
     output_format: markdown  # markdown | html | json

content_core/common/state.py CHANGED Viewed

@@ -2,8 +2,7 @@ from typing import Optional
 from pydantic import BaseModel, Field
-from content_core.common.types import Engine
-from content_core.common.types import Engine
+from content_core.common.types import DocumentEngine, UrlEngine
 class ProcessSourceState(BaseModel):
@@ -16,9 +15,13 @@ class ProcessSourceState(BaseModel):
     identified_provider: Optional[str] = ""
     metadata: Optional[dict] = Field(default_factory=lambda: {})
     content: Optional[str] = ""
-    engine: Optional[Engine] = Field(
+    document_engine: Optional[DocumentEngine] = Field(
         default=None,
-        description="Override extraction engine: 'auto', 'simple', 'legacy', 'firecrawl', 'jina', or 'docling'",
+        description="Override document extraction engine: 'auto', 'simple', or 'docling'",
+    )
+    url_engine: Optional[UrlEngine] = Field(
+        default=None,
+        description="Override URL extraction engine: 'auto', 'simple', 'firecrawl', 'jina', or 'docling'",
     )
     output_format: Optional[str] = Field(
         default=None,
@@ -30,7 +33,8 @@ class ProcessSourceInput(BaseModel):
     content: Optional[str] = ""
     file_path: Optional[str] = ""
     url: Optional[str] = ""
-    engine: Optional[str] = None
+    document_engine: Optional[str] = None
+    url_engine: Optional[str] = None
     output_format: Optional[str] = None

content_core/common/types.py CHANGED Viewed

@@ -1,21 +1,14 @@
 from typing import Literal
-import warnings
-Engine = Literal[
+DocumentEngine = Literal[
+    "auto",
+    "simple",
+    "docling",
+]
+UrlEngine = Literal[
     "auto",
     "simple",
-    "legacy",
     "firecrawl",
     "jina",
-    "docling",
 ]
-DEPRECATED_ENGINES = {"legacy": "simple"}
-def warn_if_deprecated_engine(engine: str):
-    if engine in DEPRECATED_ENGINES:
-        warnings.warn(
-            f"Engine '{engine}' is deprecated and will be removed in a future release. Use '{DEPRECATED_ENGINES[engine]}' instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )

content_core/config.py CHANGED Viewed

@@ -35,9 +35,13 @@ def load_config():
 CONFIG = load_config()
 # Programmatic config overrides: use in notebooks or scripts
-def set_extraction_engine(engine: str):
-    """Override the extraction engine ('legacy' or 'docling')."""
-    CONFIG.setdefault("extraction", {})["engine"] = engine
+def set_document_engine(engine: str):
+    """Override the document extraction engine ('auto', 'simple', or 'docling')."""
+    CONFIG.setdefault("extraction", {})["document_engine"] = engine
+def set_url_engine(engine: str):
+    """Override the URL extraction engine ('auto', 'simple', 'firecrawl', 'jina', or 'docling')."""
+    CONFIG.setdefault("extraction", {})["url_engine"] = engine
 def set_docling_output_format(fmt: str):
     """Override Docling output_format ('markdown', 'html', or 'json')."""

content_core/content/extraction/graph.py CHANGED Viewed

@@ -12,7 +12,6 @@ from content_core.common import (
     ProcessSourceState,
     UnsupportedTypeException,
 )
-from content_core.common.types import warn_if_deprecated_engine
 from content_core.config import CONFIG  # type: ignore
 from content_core.logging import logger
 from content_core.processors.audio import extract_audio_data  # type: ignore
@@ -124,11 +123,10 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
 async def file_type_router_docling(state: ProcessSourceState) -> str:
     """
     Route to Docling if enabled and supported; otherwise use simple file type edge.
-    Supports 'auto', 'docling', 'simple', and 'legacy' (deprecated, alias for simple).
-    'auto' tries simple first, then falls back to docling if simple fails.
+    Supports 'auto', 'docling', and 'simple'.
+    'auto' tries docling first, then falls back to simple if docling fails.
     """
-    engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
-    warn_if_deprecated_engine(engine)
+    engine = state.document_engine or CONFIG.get("extraction", {}).get("document_engine", "auto")
     if engine == "auto":
         logger.debug("Using auto engine")
         # Try docling first; if it fails or is not supported, fallback to simple
@@ -147,7 +145,7 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
     if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
         logger.debug("Using docling engine")
         return "extract_docling"
-    # For 'simple' and 'legacy', use the default file type edge
+    # For 'simple', use the default file type edge
     logger.debug("Using simple engine")
     return await file_type_edge(state)
@@ -196,8 +194,10 @@ workflow.add_conditional_edges(
             for m in list(SUPPORTED_FITZ_TYPES)
             + list(SUPPORTED_OFFICE_TYPES)
             + list(DOCLING_SUPPORTED)
+            if m not in ["text/html"]  # Exclude HTML from file download, treat as web content
         },
         "article": "extract_url",
+        "text/html": "extract_url",  # Route HTML content to URL extraction
         "youtube": "extract_youtube_transcript",
     },
 )

content_core/processors/url.py CHANGED Viewed

@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
 from readability import Document
 from content_core.common import ProcessSourceState
-from content_core.common.types import warn_if_deprecated_engine
+from content_core.config import CONFIG
 from content_core.logging import logger
 from content_core.processors.docling import DOCLING_SUPPORTED
 from content_core.processors.office import SUPPORTED_OFFICE_TYPES
@@ -160,13 +160,12 @@ async def extract_url_firecrawl(url: str):
 async def extract_url(state: ProcessSourceState):
     """
-    Extract content from a URL using the engine specified in the state.
-    Supported engines: 'auto', 'simple', 'legacy' (deprecated), 'firecrawl', 'jina'.
+    Extract content from a URL using the url_engine specified in the state.
+    Supported engines: 'auto', 'simple', 'firecrawl', 'jina'.
     """
     assert state.url, "No URL provided"
     url = state.url
-    engine = state.engine or "auto"
-    warn_if_deprecated_engine(engine)
+    engine = state.url_engine or CONFIG.get("extraction", {}).get("url_engine", "auto")
     try:
         if engine == "auto":
             if os.environ.get("FIRECRAWL_API_KEY"):
@@ -182,19 +181,12 @@ async def extract_url(state: ProcessSourceState):
                     logger.error(f"Jina extraction error for URL: {url}: {e}")
                     logger.debug("Falling back to BeautifulSoup")
                     return await extract_url_bs4(url)
-        elif engine == "simple" or engine == "legacy":
-            # 'legacy' is deprecated alias for 'simple'
+        elif engine == "simple":
             return await extract_url_bs4(url)
         elif engine == "firecrawl":
             return await extract_url_firecrawl(url)
         elif engine == "jina":
             return await extract_url_jina(url)
-        elif engine == "docling":
-            from content_core.processors.docling import extract_with_docling
-            state.url = url
-            result_state = await extract_with_docling(state)
-            return {"title": None, "content": result_state.content}
         else:
             raise ValueError(f"Unknown engine: {engine}")
     except Exception as e:

{content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.8.5
+Version: 1.0.0
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -234,12 +234,18 @@ async def main():
     md_data = await extract_content({"file_path": "path/to/your/document.md"})
     print(md_data)
-    # Per-execution override with Docling
+    # Per-execution override with Docling for documents
     doc_data = await extract_content({
         "file_path": "path/to/your/document.pdf",
-        "engine": "docling",
+        "document_engine": "docling",
         "output_format": "html"
     })
+    # Per-execution override with Firecrawl for URLs
+    url_data = await extract_content({
+        "url": "https://www.example.com",
+        "url_engine": "firecrawl"
+    })
     print(doc_data)
 if __name__ == "__main__":
@@ -262,7 +268,8 @@ Docling is not the default engine when parsing documents. If you don't want to u
 In your `cc_config.yaml` or custom config, set:
 ```yaml
 extraction:
-  engine: docling       # 'legacy' (default) or 'docling'
+  document_engine: docling  # 'auto' (default), 'simple', or 'docling'
+  url_engine: auto          # 'auto' (default), 'simple', 'firecrawl', or 'jina'
   docling:
     output_format: markdown  # markdown | html | json
 ```
@@ -270,10 +277,13 @@ extraction:
 #### Programmatically in Python
 ```python
-from content_core.config import set_extraction_engine, set_docling_output_format
+from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
+# switch document engine to Docling
+set_document_engine("docling")
-# switch engine to Docling
-set_extraction_engine("docling")
+# switch URL engine to Firecrawl
+set_url_engine("firecrawl")
 # choose output format: 'markdown', 'html', or 'json'
 set_docling_output_format("html")

{content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
-content_core/__init__.py,sha256=ANKeslNXOGumwrkjqgRik23e5PdGps2C0FSup8_XH2Y,6515
-content_core/cc_config.yaml,sha256=tfbnJ4h9DWuJUljJrnz72s6TD24hD5P-uEPA9K_pNVY,767
-content_core/config.py,sha256=-aUsTB6Z3fa_XIWdHNXhMgWkVLWjEW1kfyQXXB_-j54,1632
+content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
+content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
+content_core/config.py,sha256=vbRgJy8lOTZABeY7GZc7MglNYwBQYpUNzu76kprv_c0,1854
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
 content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
@@ -8,14 +8,14 @@ content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
 content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
 content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
 content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
-content_core/common/state.py,sha256=pO8Oq71KxznlZ4K5qUVfyLrNsZWd2yMO9bXKmrTIXQo,1427
-content_core/common/types.py,sha256=FpIzYadBvafGI4e1EuwGjjiPuawL1HitxsQOciNjTZo,497
+content_core/common/state.py,sha256=K5jsDg4l2GSaoGyFYzdd1GW14vLaAxdxes8vUrPNVkE,1622
+content_core/common/types.py,sha256=DOQFW5ySHELc_mZU6G_7PUy1kmnP4aU4IpMyyXDQcBE,177
 content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
 content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeFiQM,230
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=Z8IqcFQmWLJG44jJ4399mBDQVMH-mYuQQpBDHTBUEe0,7571
+content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8YnijESGvnD0,7605
 content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
@@ -25,15 +25,15 @@ content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMgua
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=qdtEIhZpi62zMXbwbCmmh86ySoomscwqxHdFib7QC-M,7898
+content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=g_A-rv5bzq2GIuwqMH70YAnDK-4BZqpgQP0IQ3j9zXE,6340
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.8.5.dist-info/METADATA,sha256=rba5vG3Vkm5WRKHfbTDay5xK4JD4kbPNFow9AoTNHDE,11439
-content_core-0.8.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.8.5.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.8.5.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.8.5.dist-info/RECORD,,
+content_core-1.0.0.dist-info/METADATA,sha256=0TBaT17WQQ3u3YKX4dZYGXLkvnnyFwuxe1Z5uHQr9rQ,11819
+content_core-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-1.0.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-1.0.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-1.0.0.dist-info/RECORD,,

{content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

content-core 0.8.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

Potentially problematic release.

content-core 0.8.5__py3-none-any.whl → 1.0.0__py3-none-any.whl