content-core 0.3.1__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (63) hide show
  1. {content_core-0.3.1 → content_core-0.5.0}/.gitignore +1 -1
  2. {content_core-0.3.1 → content_core-0.5.0}/.windsurfrules +2 -2
  3. content_core-0.3.1/README.md → content_core-0.5.0/PKG-INFO +83 -1
  4. content_core-0.3.1/PKG-INFO → content_core-0.5.0/README.md +49 -30
  5. {content_core-0.3.1 → content_core-0.5.0}/docs/processors.md +26 -3
  6. {content_core-0.3.1 → content_core-0.5.0}/docs/usage.md +54 -0
  7. {content_core-0.3.1 → content_core-0.5.0}/pyproject.toml +4 -1
  8. content_core-0.5.0/src/content_core/cc_config.yaml +35 -0
  9. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/common/state.py +4 -0
  10. content_core-0.5.0/src/content_core/config.py +46 -0
  11. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/content/extraction/graph.py +37 -2
  12. content_core-0.5.0/src/content_core/notebooks/docling.ipynb +27 -0
  13. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/notebooks/run.ipynb +74 -58
  14. content_core-0.5.0/src/content_core/processors/docling.py +72 -0
  15. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/processors/url.py +14 -5
  16. {content_core-0.3.1 → content_core-0.5.0}/tests/integration/test_extraction.py +10 -0
  17. content_core-0.5.0/tests/unit/test_docling.py +55 -0
  18. {content_core-0.3.1 → content_core-0.5.0}/uv.lock +1487 -13
  19. content_core-0.3.1/src/content_core/config.py +0 -27
  20. {content_core-0.3.1 → content_core-0.5.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  21. {content_core-0.3.1 → content_core-0.5.0}/.github/workflows/publish.yml +0 -0
  22. {content_core-0.3.1 → content_core-0.5.0}/.python-version +0 -0
  23. {content_core-0.3.1 → content_core-0.5.0}/CONTRIBUTING.md +0 -0
  24. {content_core-0.3.1 → content_core-0.5.0}/LICENSE +0 -0
  25. {content_core-0.3.1 → content_core-0.5.0}/Makefile +0 -0
  26. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/__init__.py +0 -0
  27. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/common/__init__.py +0 -0
  28. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/common/exceptions.py +0 -0
  29. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/common/utils.py +0 -0
  30. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/content/__init__.py +0 -0
  31. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/content/cleanup/__init__.py +0 -0
  32. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/content/cleanup/core.py +0 -0
  33. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/content/extraction/__init__.py +0 -0
  34. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/content/summary/__init__.py +0 -0
  35. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/content/summary/core.py +0 -0
  36. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/logging.py +0 -0
  37. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/models.py +0 -0
  38. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/models_config.yaml +0 -0
  39. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/processors/audio.py +0 -0
  40. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/processors/office.py +0 -0
  41. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/processors/pdf.py +0 -0
  42. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/processors/text.py +0 -0
  43. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/processors/video.py +0 -0
  44. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/processors/youtube.py +0 -0
  45. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/prompter.py +0 -0
  46. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/prompts/content/cleanup.jinja +0 -0
  47. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/prompts/content/summarize.jinja +0 -0
  48. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/py.typed +0 -0
  49. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/templated_message.py +0 -0
  50. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/tools/__init__.py +0 -0
  51. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/tools/cleanup.py +0 -0
  52. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/tools/extract.py +0 -0
  53. {content_core-0.3.1 → content_core-0.5.0}/src/content_core/tools/summarize.py +0 -0
  54. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.docx +0 -0
  55. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.epub +0 -0
  56. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.md +0 -0
  57. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.mp3 +0 -0
  58. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.mp4 +0 -0
  59. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.pdf +0 -0
  60. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.pptx +0 -0
  61. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.txt +0 -0
  62. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file.xlsx +0 -0
  63. {content_core-0.3.1 → content_core-0.5.0}/tests/input_content/file_audio.mp3 +0 -0
@@ -20,4 +20,4 @@ ai_docs/
20
20
  todo.md
21
21
  WIP/
22
22
 
23
- *.ignore
23
+ *.ignore
@@ -4,10 +4,10 @@ All documentation (code or readmes) must be in english.
4
4
  Whenever I ask you to tag and release, make sure to run `make test` as part of the process.
5
5
 
6
6
  The full release process is:
7
- - Run `make test` to make sure everything is working
7
+ - Run `make test` to make sure everything is working (if we changed any code or import)
8
8
  - Update version on pyproject.toml
9
9
  - Run `uv sync` to update the lock file
10
10
  - Commit all that's needed
11
- - Merge to main
11
+ - Merge to main (if in a branch)
12
12
  - Tag the release
13
13
  - Push to GitHub
@@ -1,3 +1,37 @@
1
+ Metadata-Version: 2.4
2
+ Name: content-core
3
+ Version: 0.5.0
4
+ Summary: Extract what matters from any media source
5
+ Author-email: LUIS NOVO <lfnovo@gmail.com>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.10
8
+ Requires-Dist: aiohttp>=3.11
9
+ Requires-Dist: bs4>=0.0.2
10
+ Requires-Dist: dicttoxml>=1.7.16
11
+ Requires-Dist: esperanto>=1.2.0
12
+ Requires-Dist: google-genai>=1.10.0
13
+ Requires-Dist: jinja2>=3.1.6
14
+ Requires-Dist: langdetect>=1.0.9
15
+ Requires-Dist: langgraph>=0.3.29
16
+ Requires-Dist: loguru>=0.7.3
17
+ Requires-Dist: openai>=1.73.0
18
+ Requires-Dist: openpyxl>=3.1.5
19
+ Requires-Dist: pandas>=2.2.3
20
+ Requires-Dist: pydub>=0.25.1
21
+ Requires-Dist: pymupdf>=1.25.5
22
+ Requires-Dist: python-docx>=1.1.2
23
+ Requires-Dist: python-dotenv>=1.1.0
24
+ Requires-Dist: python-magic>=0.4.27
25
+ Requires-Dist: python-pptx>=1.0.2
26
+ Requires-Dist: validators>=0.34.0
27
+ Requires-Dist: youtube-transcript-api>=1.0.3
28
+ Provides-Extra: docling
29
+ Requires-Dist: asciidoc; extra == 'docling'
30
+ Requires-Dist: docling[ocr]; extra == 'docling'
31
+ Requires-Dist: pandas; extra == 'docling'
32
+ Requires-Dist: pillow; extra == 'docling'
33
+ Description-Content-Type: text/markdown
34
+
1
35
  # Content Core
2
36
 
3
37
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -25,8 +59,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
25
59
  Install Content Core using `pip`:
26
60
 
27
61
  ```bash
28
- # Install the package
62
+ # Install the package (without Docling)
29
63
  pip install content-core
64
+ # Install with Docling support
65
+ pip install content-core[docling]
30
66
  ```
31
67
 
32
68
  Alternatively, if you’re developing locally:
@@ -195,12 +231,58 @@ async def main():
195
231
  md_data = await extract_content({"file_path": "path/to/your/document.md"})
196
232
  print(md_data)
197
233
 
234
+ # Per-execution override with Docling
235
+ doc_data = await extract_content({
236
+ "file_path": "path/to/your/document.pdf",
237
+ "engine": "docling",
238
+ "output_format": "html"
239
+ })
240
+ print(doc_data)
241
+
198
242
  if __name__ == "__main__":
199
243
  asyncio.run(main())
200
244
  ```
201
245
 
202
246
  (See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
203
247
 
248
+ ## Docling Integration
249
+
250
+ Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
251
+
252
+ ### Installation
253
+
254
+ ```bash
255
+ # Install with Docling support
256
+ pip install content-core[docling]
257
+ ```
258
+
259
+ ### Enabling Docling
260
+
261
+ #### Via configuration file
262
+
263
+ In your `cc_config.yaml` or custom config, set:
264
+ ```yaml
265
+ extraction:
266
+ engine: docling # 'legacy' (default) or 'docling'
267
+ docling:
268
+ output_format: markdown # markdown | html | json
269
+ ```
270
+
271
+ #### Programmatically in Python
272
+
273
+ ```python
274
+ from content_core.config import set_extraction_engine, set_docling_output_format
275
+
276
+ # switch engine to Docling
277
+ set_extraction_engine("docling")
278
+
279
+ # choose output format: 'markdown', 'html', or 'json'
280
+ set_docling_output_format("html")
281
+
282
+ # now call cc.extract as usual (assuming `import content_core as cc`)
283
+ result = await cc.extract("document.pdf")
284
+ ```
285
+
204
286
  ## Configuration
205
287
 
206
288
  Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
@@ -1,32 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: content-core
3
- Version: 0.3.1
4
- Summary: Extract what matters from any media source
5
- Author-email: LUIS NOVO <lfnovo@gmail.com>
6
- License-File: LICENSE
7
- Requires-Python: >=3.10
8
- Requires-Dist: aiohttp>=3.11
9
- Requires-Dist: bs4>=0.0.2
10
- Requires-Dist: dicttoxml>=1.7.16
11
- Requires-Dist: esperanto>=1.2.0
12
- Requires-Dist: google-genai>=1.10.0
13
- Requires-Dist: jinja2>=3.1.6
14
- Requires-Dist: langdetect>=1.0.9
15
- Requires-Dist: langgraph>=0.3.29
16
- Requires-Dist: loguru>=0.7.3
17
- Requires-Dist: openai>=1.73.0
18
- Requires-Dist: openpyxl>=3.1.5
19
- Requires-Dist: pandas>=2.2.3
20
- Requires-Dist: pydub>=0.25.1
21
- Requires-Dist: pymupdf>=1.25.5
22
- Requires-Dist: python-docx>=1.1.2
23
- Requires-Dist: python-dotenv>=1.1.0
24
- Requires-Dist: python-magic>=0.4.27
25
- Requires-Dist: python-pptx>=1.0.2
26
- Requires-Dist: validators>=0.34.0
27
- Requires-Dist: youtube-transcript-api>=1.0.3
28
- Description-Content-Type: text/markdown
29
-
30
1
  # Content Core
31
2
 
32
3
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -54,8 +25,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
54
25
  Install Content Core using `pip`:
55
26
 
56
27
  ```bash
57
- # Install the package
28
+ # Install the package (without Docling)
58
29
  pip install content-core
30
+ # Install with Docling support
31
+ pip install content-core[docling]
59
32
  ```
60
33
 
61
34
  Alternatively, if you’re developing locally:
@@ -224,12 +197,58 @@ async def main():
224
197
  md_data = await extract_content({"file_path": "path/to/your/document.md"})
225
198
  print(md_data)
226
199
 
200
+ # Per-execution override with Docling
201
+ doc_data = await extract_content({
202
+ "file_path": "path/to/your/document.pdf",
203
+ "engine": "docling",
204
+ "output_format": "html"
205
+ })
206
+ print(doc_data)
207
+
227
208
  if __name__ == "__main__":
228
209
  asyncio.run(main())
229
210
  ```
230
211
 
231
212
  (See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
232
213
 
214
+ ## Docling Integration
215
+
216
+ Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
217
+
218
+ ### Installation
219
+
220
+ ```bash
221
+ # Install with Docling support
222
+ pip install content-core[docling]
223
+ ```
224
+
225
+ ### Enabling Docling
226
+
227
+ #### Via configuration file
228
+
229
+ In your `cc_config.yaml` or custom config, set:
230
+ ```yaml
231
+ extraction:
232
+ engine: docling # 'legacy' (default) or 'docling'
233
+ docling:
234
+ output_format: markdown # markdown | html | json
235
+ ```
236
+
237
+ #### Programmatically in Python
238
+
239
+ ```python
240
+ from content_core.config import set_extraction_engine, set_docling_output_format
241
+
242
+ # switch engine to Docling
243
+ set_extraction_engine("docling")
244
+
245
+ # choose output format: 'markdown', 'html', or 'json'
246
+ set_docling_output_format("html")
247
+
248
+ # now call cc.extract as usual (assuming `import content_core as cc`)
249
+ result = await cc.extract("document.pdf")
250
+ ```
251
+
233
252
  ## Configuration
234
253
 
235
254
  Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
@@ -14,11 +14,11 @@ Content Core uses a modular approach to process content from different sources.
14
14
  - **Returned Data**: The input text as-is, wrapped in a structured format compatible with Content Core's output schema.
15
15
  - **Location**: `src/content_core/processors/text.py`
16
16
 
17
- ### 2. **Web Processor**
17
+ ### 2. **Web (URL) Processor**
18
18
  - **Purpose**: Extracts content from web URLs, focusing on meaningful text while ignoring boilerplate (ads, navigation, etc.).
19
19
  - **Supported Input**: URLs (web pages).
20
20
  - **Returned Data**: Extracted text content from the web page, often in a cleaned format.
21
- - **Location**: `src/content_core/processors/web.py`
21
+ - **Location**: `src/content_core/processors/url.py`
22
22
 
23
23
  ### 3. **File Processor**
24
24
  - **Purpose**: Processes local files of various types, extracting content based on file format.
@@ -35,10 +35,33 @@ Content Core uses a modular approach to process content from different sources.
35
35
  - **Returned Data**: Transcribed text from the media content.
36
36
  - **Location**: `src/content_core/processors/transcription.py`
37
37
 
38
+ ### 5. **Docling Processor**
39
+ - **Purpose**: Uses the Docling library for rich document parsing (PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, images).
40
+ - **Supported Input**: PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, Images (PNG, JPEG, TIFF, BMP).
41
+ - **Returned Data**: Content converted to configured format (markdown, html, json).
42
+ - **Location**: `src/content_core/processors/docling.py`
43
+ - **Configuration**: Activate the Docling engine in `cc_config.yaml` or custom config:
44
+ ```yaml
45
+ extraction:
46
+ engine: docling # 'legacy' (default) or 'docling'
47
+ docling:
48
+ output_format: markdown # markdown | html | json
49
+ ```
50
+ - **Programmatic Toggle**: Use helper functions in Python:
51
+ ```python
52
+ from content_core.config import set_extraction_engine, set_docling_output_format
53
+
54
+ # switch engine to Docling
55
+ set_extraction_engine("docling")
56
+
57
+ # choose output format
58
+ set_docling_output_format("html")
59
+ ```
60
+
38
61
  ## How Processors Work
39
62
 
40
63
  Content Core automatically selects the appropriate processor based on the input type:
41
- - If a URL is provided, the Web Processor is used.
64
+ - If a URL is provided, the Web (URL) Processor is used.
42
65
  - If a file path is provided, the File Processor determines the file type and delegates to specialized handlers (like the Media Transcription Processor for audio/video).
43
66
  - If raw text is provided, the Text Processor handles it directly.
44
67
 
@@ -76,6 +76,60 @@ To simplify setup, we suggest copying the provided sample files:
76
76
 
77
77
  This will allow you to quickly start with customized settings without needing to create the files from scratch.
78
78
 
79
+ ### Docling Engine
80
+
81
+ Content Core supports an optional Docling engine for advanced document parsing. To enable:
82
+
83
+ #### In YAML config
84
+ Add under the `extraction` section:
85
+ ```yaml
86
+ extraction:
87
+ engine: docling # legacy (default) or docling
88
+ docling:
89
+ output_format: html # markdown | html | json
90
+ ```
91
+
92
+ #### Programmatically in Python
93
+ ```python
94
+ from content_core.config import set_extraction_engine, set_docling_output_format
95
+
96
+ # toggle to Docling
97
+ set_extraction_engine("docling")
98
+
99
+ # pick format
100
+ set_docling_output_format("json")
101
+ ```
102
+
103
+ #### Per-Execution Overrides
104
+ You can override the extraction engine and Docling output format on a per-call basis by including `engine` and `output_format` in your input:
105
+
106
+ ```python
107
+ from content_core.content.extraction import extract_content
108
+
109
+ # override engine and format for this document
110
+ result = await extract_content({
111
+ "file_path": "document.pdf",
112
+ "engine": "docling",
113
+ "output_format": "html"
114
+ })
115
+ print(result.content)
116
+ ```
117
+
118
+ Or using `ProcessSourceInput`:
119
+
120
+ ```python
121
+ from content_core.common.state import ProcessSourceInput
122
+ from content_core.content.extraction import extract_content
123
+
124
+ input = ProcessSourceInput(
125
+ file_path="document.pdf",
126
+ engine="docling",
127
+ output_format="json"
128
+ )
129
+ result = await extract_content(input)
130
+ print(result.content)
131
+ ```
132
+
79
133
  ## Support
80
134
 
81
135
  If you have questions or encounter issues while using the library, open an issue in the repository or contact the support team.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "0.3.1"
3
+ version = "0.5.0"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -31,6 +31,9 @@ dependencies = [
31
31
  "validators>=0.34.0",
32
32
  ]
33
33
 
34
+ [project.optional-dependencies]
35
+ docling = ["docling[ocr]", "Pillow", "pandas", "asciidoc"]
36
+
34
37
  [project.scripts]
35
38
  ccore = "content_core:ccore"
36
39
  cclean = "content_core:cclean"
@@ -0,0 +1,35 @@
1
+ # Content Core main configuration
2
+ # Copy this file to your project root or set CCORE_CONFIG_PATH to its location
3
+
4
+ speech_to_text:
5
+ provider: openai
6
+ model_name: whisper-1
7
+
8
+ default_model:
9
+ provider: openai
10
+ model_name: gpt-4o-mini
11
+ config:
12
+ temperature: 0.5
13
+ top_p: 1
14
+ max_tokens: 2000
15
+
16
+ cleanup_model:
17
+ provider: openai
18
+ model_name: gpt-4o-mini
19
+ config:
20
+ temperature: 0
21
+ max_tokens: 8000
22
+ output_format: json
23
+
24
+ summary_model:
25
+ provider: openai
26
+ model_name: gpt-4o-mini
27
+ config:
28
+ temperature: 0
29
+ top_p: 1
30
+ max_tokens: 2000
31
+
32
+ extraction:
33
+ engine: legacy # change to 'docling' to enable Docling engine
34
+ docling:
35
+ output_format: markdown # markdown | html | json
@@ -13,12 +13,16 @@ class ProcessSourceState(BaseModel):
13
13
  identified_provider: Optional[str] = ""
14
14
  metadata: Optional[dict] = Field(default_factory=lambda: {})
15
15
  content: Optional[str] = ""
16
+ engine: Optional[str] = Field(default=None, description="Override extraction engine: 'legacy' or 'docling'")
17
+ output_format: Optional[str] = Field(default=None, description="Override Docling output format: 'markdown', 'html', or 'json'")
16
18
 
17
19
 
18
20
  class ProcessSourceInput(BaseModel):
19
21
  content: Optional[str] = ""
20
22
  file_path: Optional[str] = ""
21
23
  url: Optional[str] = ""
24
+ engine: Optional[str] = None
25
+ output_format: Optional[str] = None
22
26
 
23
27
 
24
28
  class ProcessSourceOutput(BaseModel):
@@ -0,0 +1,46 @@
1
+ import os
2
+ import pkgutil
3
+ import os # needed for load_config env/path checks
4
+ import yaml
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables from .env file
8
+ load_dotenv()
9
+
10
+
11
def load_config():
    """
    Load the Content Core configuration.

    A user-supplied YAML file, located through the ``CCORE_CONFIG_PATH`` or
    ``CCORE_MODEL_CONFIG_PATH`` environment variable, takes precedence and is
    returned as-is. When that file is missing, unreadable, or does not hold a
    mapping at its top level, the packaged defaults are used instead:
    ``models_config.yaml`` with the ``extraction`` section of
    ``cc_config.yaml`` placed under the ``"extraction"`` key.

    Returns:
        dict: the configuration mapping.
    """
    config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
    if config_path and os.path.exists(config_path):
        try:
            # Explicit encoding: YAML is UTF-8 regardless of the platform locale.
            with open(config_path, "r", encoding="utf-8") as file:
                user_config = yaml.safe_load(file)
            # safe_load yields None for an empty document; callers treat the
            # result as a dict, so reject anything else and fall back below.
            if not isinstance(user_config, dict):
                raise ValueError("top-level YAML value must be a mapping")
            return user_config
        except Exception as e:
            print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
            print("Usando configurações padrão internas.")

    default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
    if default_config_data:
        # `or {}` guards against an empty packaged file (safe_load -> None).
        base = yaml.safe_load(default_config_data) or {}
    else:
        base = {}
    # load new cc_config.yaml defaults
    cc_default = pkgutil.get_data("content_core", "cc_config.yaml")
    if cc_default:
        docling_cfg = yaml.safe_load(cc_default) or {}
        # merge extraction section
        base["extraction"] = docling_cfg.get("extraction", {})
    return base
33
+
34
+
35
+ CONFIG = load_config()
36
+
37
+ # Programmatic config overrides: use in notebooks or scripts
38
def set_extraction_engine(engine: str):
    """Set the extraction engine in the module-level CONFIG ('legacy' or 'docling')."""
    # Create the "extraction" section on first use, then record the choice.
    extraction_section = CONFIG.setdefault("extraction", {})
    extraction_section["engine"] = engine
41
+
42
def set_docling_output_format(fmt: str):
    """Set the Docling output_format in the module-level CONFIG ('markdown', 'html', or 'json')."""
    # Both nested sections are created on demand before the value is stored.
    CONFIG.setdefault("extraction", {}).setdefault("docling", {})["output_format"] = fmt
@@ -20,6 +20,12 @@ from content_core.processors.text import extract_txt
20
20
  from content_core.processors.url import extract_url, url_provider
21
21
  from content_core.processors.video import extract_best_audio_from_video
22
22
  from content_core.processors.youtube import extract_youtube_transcript
23
+ from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED # type: ignore
24
+
25
+ import aiohttp
26
+ import tempfile
27
+ from urllib.parse import urlparse
28
+ from content_core.config import CONFIG # type: ignore
23
29
 
24
30
 
25
31
  async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
@@ -91,6 +97,32 @@ async def source_type_router(x: ProcessSourceState) -> Optional[str]:
91
97
  return x.source_type
92
98
 
93
99
 
100
async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
    """
    Download the resource at ``state.url`` into a temporary file.

    The temporary file keeps the extension found in the URL path. The body is
    streamed to disk in chunks rather than buffered whole in memory, and the
    temporary file is removed again if the download or write fails.

    Returns:
        A state update with ``file_path`` set to the temporary file and
        ``identified_type`` set to the response ``content-type`` header with
        any parameters (text after ``;``) removed.

    Raises:
        AssertionError: if ``state.url`` is empty.
        Errors from aiohttp, including the one raised by ``raise_for_status()``
        for error responses, propagate unchanged.
    """
    url = state.url
    assert url, "No URL provided"
    # splitext("") already yields "", so no separate empty-path guard is needed.
    suffix = os.path.splitext(urlparse(url).path)[1]
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            mime = resp.headers.get("content-type", "").split(";", 1)[0]
            fd, tmp = tempfile.mkstemp(suffix=suffix)
            try:
                # Reuse the descriptor from mkstemp instead of closing and reopening.
                with os.fdopen(fd, "wb") as f:
                    async for chunk in resp.content.iter_chunked(64 * 1024):
                        f.write(chunk)
            except BaseException:
                # Do not leave an orphaned temp file behind on failure or cancellation.
                try:
                    os.unlink(tmp)
                except OSError:
                    pass
                raise
    return {"file_path": tmp, "identified_type": mime}
113
+
114
+
115
async def file_type_router_docling(state: ProcessSourceState) -> str:
    """
    Choose the next node for a file: "extract_docling" when the Docling engine
    is selected and the identified type is in DOCLING_SUPPORTED, otherwise
    whatever file_type_edge returns.
    """
    # A per-execution engine on the state wins; the global CONFIG is only
    # consulted when the state does not set one.
    selected_engine = state.engine
    if not selected_engine:
        selected_engine = CONFIG.get("extraction", {}).get("engine", "legacy")

    use_docling = selected_engine == "docling" and state.identified_type in DOCLING_SUPPORTED
    if not use_docling:
        return await file_type_edge(state)
    return "extract_docling"
124
+
125
+
94
126
  # Create workflow
95
127
  workflow = StateGraph(
96
128
  ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
@@ -108,6 +140,8 @@ workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video
108
140
  workflow.add_node("extract_audio", extract_audio)
109
141
  workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
110
142
  workflow.add_node("delete_file", delete_file)
143
+ workflow.add_node("download_remote_file", download_remote_file)
144
+ workflow.add_node("extract_docling", extract_with_docling)
111
145
 
112
146
  # Add edges
113
147
  workflow.add_edge(START, "source")
@@ -122,12 +156,12 @@ workflow.add_conditional_edges(
122
156
  )
123
157
  workflow.add_conditional_edges(
124
158
  "file_type",
125
- file_type_edge,
159
+ file_type_router_docling,
126
160
  )
127
161
  workflow.add_conditional_edges(
128
162
  "url_provider",
129
163
  url_type_router,
130
- {"article": "extract_url", "youtube": "extract_youtube_transcript"},
164
+ {**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES}, "article": "extract_url", "youtube": "extract_youtube_transcript"},
131
165
  )
132
166
  workflow.add_edge("url_provider", END)
133
167
  workflow.add_edge("file_type", END)
@@ -140,6 +174,7 @@ workflow.add_edge("extract_office_content", "delete_file")
140
174
  workflow.add_edge("extract_best_audio_from_video", "extract_audio")
141
175
  workflow.add_edge("extract_audio", "delete_file")
142
176
  workflow.add_edge("delete_file", END)
177
+ workflow.add_edge("download_remote_file", "file_type")
143
178
 
144
179
  # Compile graph
145
180
  graph = workflow.compile()
@@ -0,0 +1,27 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from docling.document_converter import DocumentConverter\n",
10
+ "\n",
11
+ "\n",
12
+ "source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
13
+ "source_url = \"https://arxiv.org/pdf/2408.09869\" # PDF path or URL\n",
14
+ "converter = DocumentConverter()\n",
15
+ "result = converter.convert(source)\n",
16
+ "print(result.document.export_to_markdown())"
17
+ ]
18
+ }
19
+ ],
20
+ "metadata": {
21
+ "language_info": {
22
+ "name": "python"
23
+ }
24
+ },
25
+ "nbformat": 4,
26
+ "nbformat_minor": 2
27
+ }