content-core 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/__init__.py +6 -2
- content_core/cc_config.yaml +35 -0
- content_core/common/state.py +4 -0
- content_core/config.py +23 -4
- content_core/content/extraction/graph.py +15 -1
- content_core/notebooks/docling.ipynb +27 -0
- content_core/notebooks/run.ipynb +74 -58
- content_core/processors/docling.py +72 -0
- content_core/templated_message.py +16 -24
- {content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/METADATA +56 -2
- {content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/RECORD +14 -12
- content_core/prompter.py +0 -159
- {content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/WHEEL +0 -0
- {content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/entry_points.txt +0 -0
- {content_core-0.4.0.dist-info → content_core-0.5.1.dist-info}/licenses/LICENSE +0 -0
content_core/__init__.py
CHANGED
|
@@ -5,9 +5,12 @@ import os
|
|
|
5
5
|
import sys
|
|
6
6
|
from xml.etree import ElementTree as ET
|
|
7
7
|
|
|
8
|
-
from dicttoxml import dicttoxml # type: ignore
|
|
9
8
|
from dotenv import load_dotenv
|
|
10
9
|
|
|
10
|
+
load_dotenv()
|
|
11
|
+
|
|
12
|
+
from dicttoxml import dicttoxml # type: ignore
|
|
13
|
+
|
|
11
14
|
from content_core.common import ProcessSourceInput
|
|
12
15
|
from content_core.content.cleanup import cleanup_content
|
|
13
16
|
from content_core.content.extraction import extract_content
|
|
@@ -18,7 +21,6 @@ from content_core.logging import configure_logging, logger
|
|
|
18
21
|
extract = extract_content
|
|
19
22
|
clean = cleanup_content
|
|
20
23
|
|
|
21
|
-
load_dotenv()
|
|
22
24
|
|
|
23
25
|
# Configure loguru logger using centralized configuration
|
|
24
26
|
configure_logging(debug=False)
|
|
@@ -212,3 +214,5 @@ def csum():
|
|
|
212
214
|
|
|
213
215
|
if __name__ == "__main__":
|
|
214
216
|
ccore()
|
|
217
|
+
if __name__ == "__main__":
|
|
218
|
+
ccore()
|
content_core/cc_config.yaml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Content Core main configuration
|
|
2
|
+
# Copy this file to your project root or set CCORE_CONFIG_PATH to its location
|
|
3
|
+
|
|
4
|
+
speech_to_text:
|
|
5
|
+
provider: openai
|
|
6
|
+
model_name: whisper-1
|
|
7
|
+
|
|
8
|
+
default_model:
|
|
9
|
+
provider: openai
|
|
10
|
+
model_name: gpt-4o-mini
|
|
11
|
+
config:
|
|
12
|
+
temperature: 0.5
|
|
13
|
+
top_p: 1
|
|
14
|
+
max_tokens: 2000
|
|
15
|
+
|
|
16
|
+
cleanup_model:
|
|
17
|
+
provider: openai
|
|
18
|
+
model_name: gpt-4o-mini
|
|
19
|
+
config:
|
|
20
|
+
temperature: 0
|
|
21
|
+
max_tokens: 8000
|
|
22
|
+
output_format: json
|
|
23
|
+
|
|
24
|
+
summary_model:
|
|
25
|
+
provider: openai
|
|
26
|
+
model_name: gpt-4o-mini
|
|
27
|
+
config:
|
|
28
|
+
temperature: 0
|
|
29
|
+
top_p: 1
|
|
30
|
+
max_tokens: 2000
|
|
31
|
+
|
|
32
|
+
extraction:
|
|
33
|
+
engine: legacy # change to 'docling' to enable Docling engine
|
|
34
|
+
docling:
|
|
35
|
+
output_format: markdown # markdown | html | json
|
content_core/common/state.py
CHANGED
|
@@ -13,12 +13,16 @@ class ProcessSourceState(BaseModel):
|
|
|
13
13
|
identified_provider: Optional[str] = ""
|
|
14
14
|
metadata: Optional[dict] = Field(default_factory=lambda: {})
|
|
15
15
|
content: Optional[str] = ""
|
|
16
|
+
engine: Optional[str] = Field(default=None, description="Override extraction engine: 'legacy' or 'docling'")
|
|
17
|
+
output_format: Optional[str] = Field(default=None, description="Override Docling output format: 'markdown', 'html', or 'json'")
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
class ProcessSourceInput(BaseModel):
|
|
19
21
|
content: Optional[str] = ""
|
|
20
22
|
file_path: Optional[str] = ""
|
|
21
23
|
url: Optional[str] = ""
|
|
24
|
+
engine: Optional[str] = None
|
|
25
|
+
output_format: Optional[str] = None
|
|
22
26
|
|
|
23
27
|
|
|
24
28
|
class ProcessSourceOutput(BaseModel):
|
content_core/config.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import pkgutil
|
|
3
|
-
|
|
3
|
+
import os # needed for load_config env/path checks
|
|
4
4
|
import yaml
|
|
5
5
|
from dotenv import load_dotenv
|
|
6
6
|
|
|
@@ -9,7 +9,7 @@ load_dotenv()
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def load_config():
|
|
12
|
-
config_path = os.environ.get("CCORE_MODEL_CONFIG_PATH")
|
|
12
|
+
config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
|
|
13
13
|
if config_path and os.path.exists(config_path):
|
|
14
14
|
try:
|
|
15
15
|
with open(config_path, "r") as file:
|
|
@@ -20,8 +20,27 @@ def load_config():
|
|
|
20
20
|
|
|
21
21
|
default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
|
|
22
22
|
if default_config_data:
|
|
23
|
-
|
|
24
|
-
|
|
23
|
+
base = yaml.safe_load(default_config_data)
|
|
24
|
+
else:
|
|
25
|
+
base = {}
|
|
26
|
+
# load new cc_config.yaml defaults
|
|
27
|
+
cc_default = pkgutil.get_data("content_core", "cc_config.yaml")
|
|
28
|
+
if cc_default:
|
|
29
|
+
docling_cfg = yaml.safe_load(cc_default)
|
|
30
|
+
# merge extraction section
|
|
31
|
+
base["extraction"] = docling_cfg.get("extraction", {})
|
|
32
|
+
return base
|
|
25
33
|
|
|
26
34
|
|
|
27
35
|
CONFIG = load_config()
|
|
36
|
+
|
|
37
|
+
# Programmatic config overrides: use in notebooks or scripts
|
|
38
|
+
def set_extraction_engine(engine: str):
|
|
39
|
+
"""Override the extraction engine ('legacy' or 'docling')."""
|
|
40
|
+
CONFIG.setdefault("extraction", {})["engine"] = engine
|
|
41
|
+
|
|
42
|
+
def set_docling_output_format(fmt: str):
|
|
43
|
+
"""Override Docling output_format ('markdown', 'html', or 'json')."""
|
|
44
|
+
extraction = CONFIG.setdefault("extraction", {})
|
|
45
|
+
docling_cfg = extraction.setdefault("docling", {})
|
|
46
|
+
docling_cfg["output_format"] = fmt
|
content_core/content/extraction/graph.py
CHANGED
|
@@ -20,10 +20,12 @@ from content_core.processors.text import extract_txt
|
|
|
20
20
|
from content_core.processors.url import extract_url, url_provider
|
|
21
21
|
from content_core.processors.video import extract_best_audio_from_video
|
|
22
22
|
from content_core.processors.youtube import extract_youtube_transcript
|
|
23
|
+
from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED # type: ignore
|
|
23
24
|
|
|
24
25
|
import aiohttp
|
|
25
26
|
import tempfile
|
|
26
27
|
from urllib.parse import urlparse
|
|
28
|
+
from content_core.config import CONFIG # type: ignore
|
|
27
29
|
|
|
28
30
|
|
|
29
31
|
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
|
|
@@ -110,6 +112,17 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
|
|
|
110
112
|
return {"file_path": tmp, "identified_type": mime}
|
|
111
113
|
|
|
112
114
|
|
|
115
|
+
async def file_type_router_docling(state: ProcessSourceState) -> str:
|
|
116
|
+
"""
|
|
117
|
+
Route to Docling if enabled and supported; otherwise use legacy file type edge.
|
|
118
|
+
"""
|
|
119
|
+
# allow per-execution override of engine via state.engine
|
|
120
|
+
engine = state.engine or CONFIG.get("extraction", {}).get("engine", "legacy")
|
|
121
|
+
if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
|
|
122
|
+
return "extract_docling"
|
|
123
|
+
return await file_type_edge(state)
|
|
124
|
+
|
|
125
|
+
|
|
113
126
|
# Create workflow
|
|
114
127
|
workflow = StateGraph(
|
|
115
128
|
ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
|
|
@@ -128,6 +141,7 @@ workflow.add_node("extract_audio", extract_audio)
|
|
|
128
141
|
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
|
|
129
142
|
workflow.add_node("delete_file", delete_file)
|
|
130
143
|
workflow.add_node("download_remote_file", download_remote_file)
|
|
144
|
+
workflow.add_node("extract_docling", extract_with_docling)
|
|
131
145
|
|
|
132
146
|
# Add edges
|
|
133
147
|
workflow.add_edge(START, "source")
|
|
@@ -142,7 +156,7 @@ workflow.add_conditional_edges(
|
|
|
142
156
|
)
|
|
143
157
|
workflow.add_conditional_edges(
|
|
144
158
|
"file_type",
|
|
145
|
-
|
|
159
|
+
file_type_router_docling,
|
|
146
160
|
)
|
|
147
161
|
workflow.add_conditional_edges(
|
|
148
162
|
"url_provider",
|
content_core/notebooks/docling.ipynb
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "code",
|
|
5
|
+
"execution_count": null,
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"outputs": [],
|
|
8
|
+
"source": [
|
|
9
|
+
"from docling.document_converter import DocumentConverter\n",
|
|
10
|
+
"\n",
|
|
11
|
+
"\n",
|
|
12
|
+
"source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
|
|
13
|
+
"source_url = \"https://arxiv.org/pdf/2408.09869\" # PDF path or URL\n",
|
|
14
|
+
"converter = DocumentConverter()\n",
|
|
15
|
+
"result = converter.convert(source)\n",
|
|
16
|
+
"print(result.document.export_to_markdown())"
|
|
17
|
+
]
|
|
18
|
+
}
|
|
19
|
+
],
|
|
20
|
+
"metadata": {
|
|
21
|
+
"language_info": {
|
|
22
|
+
"name": "python"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"nbformat": 4,
|
|
26
|
+
"nbformat_minor": 2
|
|
27
|
+
}
|