PyPI - content-core - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

content-core 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of content-core might be problematic. Click here for more details.

Files changed (15) hide show

content_core/__init__.py +6 -2
content_core/cc_config.yaml +35 -0
content_core/common/state.py +4 -0
content_core/config.py +23 -4
content_core/content/extraction/graph.py +15 -1
content_core/notebooks/docling.ipynb +27 -0
content_core/notebooks/run.ipynb +74 -58
content_core/processors/docling.py +72 -0
content_core/templated_message.py +16 -24
{content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/METADATA +56 -2
{content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/RECORD +14 -12
content_core/prompter.py +0 -159
{content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/WHEEL +0 -0
{content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/entry_points.txt +0 -0
{content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/licenses/LICENSE +0 -0

content_core/__init__.py CHANGED Viewed

@@ -5,9 +5,12 @@ import os
 import sys
 from xml.etree import ElementTree as ET
-from dicttoxml import dicttoxml  # type: ignore
 from dotenv import load_dotenv
+load_dotenv()
+from dicttoxml import dicttoxml  # type: ignore
 from content_core.common import ProcessSourceInput
 from content_core.content.cleanup import cleanup_content
 from content_core.content.extraction import extract_content
@@ -18,7 +21,6 @@ from content_core.logging import configure_logging, logger
 extract = extract_content
 clean = cleanup_content
-load_dotenv()
 # Configure loguru logger using centralized configuration
 configure_logging(debug=False)
@@ -212,3 +214,5 @@ def csum():
 if __name__ == "__main__":
     ccore()
+if __name__ == "__main__":
+    ccore()

content_core/cc_config.yaml ADDED Viewed

@@ -0,0 +1,35 @@
+# Content Core main configuration
+# Copy this file to your project root or set CCORE_CONFIG_PATH to its location
+speech_to_text:
+  provider: openai
+  model_name: whisper-1
+default_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0.5
+    top_p: 1
+    max_tokens: 2000
+cleanup_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0
+    max_tokens: 8000
+    output_format: json
+summary_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0
+    top_p: 1
+    max_tokens: 2000
+extraction:
+  engine: legacy  # change to 'docling' to enable Docling engine
+  docling:
+    output_format: markdown  # markdown | html | json

content_core/common/state.py CHANGED Viewed

@@ -13,12 +13,16 @@ class ProcessSourceState(BaseModel):
     identified_provider: Optional[str] = ""
     metadata: Optional[dict] = Field(default_factory=lambda: {})
     content: Optional[str] = ""
+    engine: Optional[str] = Field(default=None, description="Override extraction engine: 'legacy' or 'docling'")
+    output_format: Optional[str] = Field(default=None, description="Override Docling output format: 'markdown', 'html', or 'json'")
 class ProcessSourceInput(BaseModel):
     content: Optional[str] = ""
     file_path: Optional[str] = ""
     url: Optional[str] = ""
+    engine: Optional[str] = None
+    output_format: Optional[str] = None
 class ProcessSourceOutput(BaseModel):

content_core/config.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 import pkgutil
+import os  # needed for load_config env/path checks
 import yaml
 from dotenv import load_dotenv
@@ -9,7 +9,7 @@ load_dotenv()
 def load_config():
-    config_path = os.environ.get("CCORE_MODEL_CONFIG_PATH")
+    config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
     if config_path and os.path.exists(config_path):
         try:
             with open(config_path, "r") as file:
@@ -20,8 +20,27 @@ def load_config():
     default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
     if default_config_data:
-        return yaml.safe_load(default_config_data)
-    return {}
+        base = yaml.safe_load(default_config_data)
+    else:
+        base = {}
+    # load new cc_config.yaml defaults
+    cc_default = pkgutil.get_data("content_core", "cc_config.yaml")
+    if cc_default:
+        docling_cfg = yaml.safe_load(cc_default)
+        # merge extraction section
+        base["extraction"] = docling_cfg.get("extraction", {})
+    return base
 CONFIG = load_config()
+# Programmatic config overrides: use in notebooks or scripts
+def set_extraction_engine(engine: str):
+    """Override the extraction engine ('legacy' or 'docling')."""
+    CONFIG.setdefault("extraction", {})["engine"] = engine
+def set_docling_output_format(fmt: str):
+    """Override Docling output_format ('markdown', 'html', or 'json')."""
+    extraction = CONFIG.setdefault("extraction", {})
+    docling_cfg = extraction.setdefault("docling", {})
+    docling_cfg["output_format"] = fmt

content_core/content/extraction/graph.py CHANGED Viewed

@@ -20,10 +20,12 @@ from content_core.processors.text import extract_txt
 from content_core.processors.url import extract_url, url_provider
 from content_core.processors.video import extract_best_audio_from_video
 from content_core.processors.youtube import extract_youtube_transcript
+from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED  # type: ignore
 import aiohttp
 import tempfile
 from urllib.parse import urlparse
+from content_core.config import CONFIG  # type: ignore
 async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
@@ -110,6 +112,17 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     return {"file_path": tmp, "identified_type": mime}
+async def file_type_router_docling(state: ProcessSourceState) -> str:
+    """
+    Route to Docling if enabled and supported; otherwise use legacy file type edge.
+    """
+    # allow per-execution override of engine via state.engine
+    engine = state.engine or CONFIG.get("extraction", {}).get("engine", "legacy")
+    if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
+        return "extract_docling"
+    return await file_type_edge(state)
 # Create workflow
 workflow = StateGraph(
     ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
@@ -128,6 +141,7 @@ workflow.add_node("extract_audio", extract_audio)
 workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
 workflow.add_node("delete_file", delete_file)
 workflow.add_node("download_remote_file", download_remote_file)
+workflow.add_node("extract_docling", extract_with_docling)
 # Add edges
 workflow.add_edge(START, "source")
@@ -142,7 +156,7 @@ workflow.add_conditional_edges(
 )
 workflow.add_conditional_edges(
     "file_type",
-    file_type_edge,
+    file_type_router_docling,
 )
 workflow.add_conditional_edges(
     "url_provider",

content_core/notebooks/docling.ipynb ADDED Viewed

@@ -0,0 +1,27 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "\n",
+    "source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
+    "source_url = \"https://arxiv.org/pdf/2408.09869\"  # PDF path or URL\n",
+    "converter = DocumentConverter()\n",
+    "result = converter.convert(source)\n",
+    "print(result.document.export_to_markdown())"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

content-core 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

Potentially problematic release.

content-core 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl