content-core 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic.

content_core/__init__.py CHANGED
@@ -5,9 +5,12 @@ import os
 import sys
 from xml.etree import ElementTree as ET
 
-from dicttoxml import dicttoxml  # type: ignore
 from dotenv import load_dotenv
 
+load_dotenv()
+
+from dicttoxml import dicttoxml  # type: ignore
+
 from content_core.common import ProcessSourceInput
 from content_core.content.cleanup import cleanup_content
 from content_core.content.extraction import extract_content
@@ -18,7 +21,6 @@ from content_core.logging import configure_logging, logger
 extract = extract_content
 clean = cleanup_content
 
-load_dotenv()
 
 # Configure loguru logger using centralized configuration
 configure_logging(debug=False)
@@ -212,3 +214,5 @@ def csum():
 
 if __name__ == "__main__":
     ccore()
+if __name__ == "__main__":
+    ccore()
content_core/cc_config.yaml ADDED
@@ -0,0 +1,35 @@
+# Content Core main configuration
+# Copy this file to your project root or set CCORE_CONFIG_PATH to its location
+
+speech_to_text:
+  provider: openai
+  model_name: whisper-1
+
+default_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0.5
+    top_p: 1
+    max_tokens: 2000
+
+cleanup_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0
+    max_tokens: 8000
+    output_format: json
+
+summary_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0
+    top_p: 1
+    max_tokens: 2000
+
+extraction:
+  engine: legacy  # change to 'docling' to enable Docling engine
+  docling:
+    output_format: markdown  # markdown | html | json
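
The shipped defaults above can be overridden without editing the package: copy the file somewhere and point CCORE_CONFIG_PATH at it before importing. A minimal sketch, assuming a local copy of cc_config.yaml (the path is a placeholder):

    import os

    # load_config() reads CCORE_CONFIG_PATH when content_core.config is first
    # imported, so the variable must be set beforehand. Placeholder path below.
    os.environ["CCORE_CONFIG_PATH"] = "./cc_config.yaml"

    import content_core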
@@ -13,12 +13,16 @@ class ProcessSourceState(BaseModel):
     identified_provider: Optional[str] = ""
     metadata: Optional[dict] = Field(default_factory=lambda: {})
     content: Optional[str] = ""
+    engine: Optional[str] = Field(default=None, description="Override extraction engine: 'legacy' or 'docling'")
+    output_format: Optional[str] = Field(default=None, description="Override Docling output format: 'markdown', 'html', or 'json'")
 
 
 class ProcessSourceInput(BaseModel):
     content: Optional[str] = ""
     file_path: Optional[str] = ""
     url: Optional[str] = ""
+    engine: Optional[str] = None
+    output_format: Optional[str] = None
 
 
 class ProcessSourceOutput(BaseModel):
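
The new engine and output_format fields allow the extraction engine and Docling output format to be overridden per call rather than globally. A minimal sketch, assuming the package-level extract coroutine accepts a ProcessSourceInput; the file path is a placeholder:

    import asyncio

    from content_core import extract
    from content_core.common import ProcessSourceInput

    # Route only this extraction through Docling and request HTML output.
    request = ProcessSourceInput(
        file_path="document.pdf",  # placeholder path
        engine="docling",
        output_format="html",
    )
    result = asyncio.run(extract(request))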
content_core/config.py CHANGED
@@ -1,6 +1,6 @@
 import os
 import pkgutil
-
+import os  # needed for load_config env/path checks
 import yaml
 from dotenv import load_dotenv
 
@@ -9,7 +9,7 @@ load_dotenv()
 
 
 def load_config():
-    config_path = os.environ.get("CCORE_MODEL_CONFIG_PATH")
+    config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
     if config_path and os.path.exists(config_path):
         try:
             with open(config_path, "r") as file:
@@ -20,8 +20,27 @@ def load_config():
 
     default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
     if default_config_data:
-        return yaml.safe_load(default_config_data)
-    return {}
+        base = yaml.safe_load(default_config_data)
+    else:
+        base = {}
+    # load new cc_config.yaml defaults
+    cc_default = pkgutil.get_data("content_core", "cc_config.yaml")
+    if cc_default:
+        docling_cfg = yaml.safe_load(cc_default)
+        # merge extraction section
+        base["extraction"] = docling_cfg.get("extraction", {})
+    return base
 
 
 CONFIG = load_config()
+
+# Programmatic config overrides: use in notebooks or scripts
+def set_extraction_engine(engine: str):
+    """Override the extraction engine ('legacy' or 'docling')."""
+    CONFIG.setdefault("extraction", {})["engine"] = engine
+
+def set_docling_output_format(fmt: str):
+    """Override Docling output_format ('markdown', 'html', or 'json')."""
+    extraction = CONFIG.setdefault("extraction", {})
+    docling_cfg = extraction.setdefault("docling", {})
+    docling_cfg["output_format"] = fmt
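
For notebooks and scripts, the two helpers added above flip the same settings programmatically after import; a short usage sketch:

    from content_core.config import set_extraction_engine, set_docling_output_format

    set_extraction_engine("docling")        # 'legacy' or 'docling'
    set_docling_output_format("markdown")   # 'markdown', 'html', or 'json'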
@@ -20,10 +20,12 @@ from content_core.processors.text import extract_txt
 from content_core.processors.url import extract_url, url_provider
 from content_core.processors.video import extract_best_audio_from_video
 from content_core.processors.youtube import extract_youtube_transcript
+from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED  # type: ignore
 
 import aiohttp
 import tempfile
 from urllib.parse import urlparse
+from content_core.config import CONFIG  # type: ignore
 
 
 async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
@@ -110,6 +112,17 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     return {"file_path": tmp, "identified_type": mime}
 
 
+async def file_type_router_docling(state: ProcessSourceState) -> str:
+    """
+    Route to Docling if enabled and supported; otherwise use legacy file type edge.
+    """
+    # allow per-execution override of engine via state.engine
+    engine = state.engine or CONFIG.get("extraction", {}).get("engine", "legacy")
+    if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
+        return "extract_docling"
+    return await file_type_edge(state)
+
+
 # Create workflow
 workflow = StateGraph(
     ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
@@ -128,6 +141,7 @@ workflow.add_node("extract_audio", extract_audio)
 workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
 workflow.add_node("delete_file", delete_file)
 workflow.add_node("download_remote_file", download_remote_file)
+workflow.add_node("extract_docling", extract_with_docling)
 
 # Add edges
 workflow.add_edge(START, "source")
@@ -142,7 +156,7 @@ workflow.add_conditional_edges(
 )
 workflow.add_conditional_edges(
     "file_type",
-    file_type_edge,
+    file_type_router_docling,
 )
 workflow.add_conditional_edges(
     "url_provider",
@@ -0,0 +1,27 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "\n",
+    "source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
+    "source_url = \"https://arxiv.org/pdf/2408.09869\" # PDF path or URL\n",
+    "converter = DocumentConverter()\n",
+    "result = converter.convert(source)\n",
+    "print(result.document.export_to_markdown())"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+ }