content-core 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-0.4.0 → content_core-0.5.0}/.gitignore +1 -1
- content_core-0.4.0/README.md → content_core-0.5.0/PKG-INFO +83 -1
- content_core-0.4.0/PKG-INFO → content_core-0.5.0/README.md +49 -30
- {content_core-0.4.0 → content_core-0.5.0}/docs/processors.md +26 -3
- {content_core-0.4.0 → content_core-0.5.0}/docs/usage.md +54 -0
- {content_core-0.4.0 → content_core-0.5.0}/pyproject.toml +4 -1
- content_core-0.5.0/src/content_core/cc_config.yaml +35 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/common/state.py +4 -0
- content_core-0.5.0/src/content_core/config.py +46 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/content/extraction/graph.py +15 -1
- content_core-0.5.0/src/content_core/notebooks/docling.ipynb +27 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/notebooks/run.ipynb +74 -58
- content_core-0.5.0/src/content_core/processors/docling.py +72 -0
- content_core-0.5.0/tests/unit/test_docling.py +55 -0
- {content_core-0.4.0 → content_core-0.5.0}/uv.lock +1486 -12
- content_core-0.4.0/src/content_core/config.py +0 -27
- {content_core-0.4.0 → content_core-0.5.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/.github/workflows/publish.yml +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/.python-version +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/.windsurfrules +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/CONTRIBUTING.md +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/LICENSE +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/Makefile +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/__init__.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/common/__init__.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/common/utils.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/content/__init__.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/content/summary/core.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/logging.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/models.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/models_config.yaml +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/processors/audio.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/processors/office.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/processors/pdf.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/processors/text.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/processors/url.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/processors/video.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/processors/youtube.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/prompter.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/prompts/content/cleanup.jinja +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/prompts/content/summarize.jinja +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/py.typed +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/templated_message.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/tools/extract.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.docx +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.epub +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.md +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.mp3 +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.mp4 +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.pdf +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.pptx +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.txt +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file.xlsx +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-0.4.0 → content_core-0.5.0}/tests/integration/test_extraction.py +0 -0
|
@@ -1,3 +1,37 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: content-core
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Extract what matters from any media source
|
|
5
|
+
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: aiohttp>=3.11
|
|
9
|
+
Requires-Dist: bs4>=0.0.2
|
|
10
|
+
Requires-Dist: dicttoxml>=1.7.16
|
|
11
|
+
Requires-Dist: esperanto>=1.2.0
|
|
12
|
+
Requires-Dist: google-genai>=1.10.0
|
|
13
|
+
Requires-Dist: jinja2>=3.1.6
|
|
14
|
+
Requires-Dist: langdetect>=1.0.9
|
|
15
|
+
Requires-Dist: langgraph>=0.3.29
|
|
16
|
+
Requires-Dist: loguru>=0.7.3
|
|
17
|
+
Requires-Dist: openai>=1.73.0
|
|
18
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
19
|
+
Requires-Dist: pandas>=2.2.3
|
|
20
|
+
Requires-Dist: pydub>=0.25.1
|
|
21
|
+
Requires-Dist: pymupdf>=1.25.5
|
|
22
|
+
Requires-Dist: python-docx>=1.1.2
|
|
23
|
+
Requires-Dist: python-dotenv>=1.1.0
|
|
24
|
+
Requires-Dist: python-magic>=0.4.27
|
|
25
|
+
Requires-Dist: python-pptx>=1.0.2
|
|
26
|
+
Requires-Dist: validators>=0.34.0
|
|
27
|
+
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
28
|
+
Provides-Extra: docling
|
|
29
|
+
Requires-Dist: asciidoc; extra == 'docling'
|
|
30
|
+
Requires-Dist: docling[ocr]; extra == 'docling'
|
|
31
|
+
Requires-Dist: pandas; extra == 'docling'
|
|
32
|
+
Requires-Dist: pillow; extra == 'docling'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
1
35
|
# Content Core
|
|
2
36
|
|
|
3
37
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
@@ -25,8 +59,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
|
|
|
25
59
|
Install Content Core using `pip`:
|
|
26
60
|
|
|
27
61
|
```bash
|
|
28
|
-
# Install the package
|
|
62
|
+
# Install the package (without Docling)
|
|
29
63
|
pip install content-core
|
|
64
|
+
# Install with Docling support
|
|
65
|
+
pip install content-core[docling]
|
|
30
66
|
```
|
|
31
67
|
|
|
32
68
|
Alternatively, if you’re developing locally:
|
|
@@ -195,12 +231,58 @@ async def main():
|
|
|
195
231
|
md_data = await extract_content({"file_path": "path/to/your/document.md"})
|
|
196
232
|
print(md_data)
|
|
197
233
|
|
|
234
|
+
# Per-execution override with Docling
|
|
235
|
+
doc_data = await extract_content({
|
|
236
|
+
"file_path": "path/to/your/document.pdf",
|
|
237
|
+
"engine": "docling",
|
|
238
|
+
"output_format": "html"
|
|
239
|
+
})
|
|
240
|
+
print(doc_data)
|
|
241
|
+
|
|
198
242
|
if __name__ == "__main__":
|
|
199
243
|
asyncio.run(main())
|
|
200
244
|
```
|
|
201
245
|
|
|
202
246
|
(See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
|
|
203
247
|
|
|
248
|
+
## Docling Integration
|
|
249
|
+
|
|
250
|
+
Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
|
|
251
|
+
|
|
252
|
+
### Installation
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
# Install with Docling support
|
|
256
|
+
pip install content-core[docling]
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Enabling Docling
|
|
260
|
+
|
|
261
|
+
#### Via configuration file
|
|
262
|
+
|
|
263
|
+
In your `cc_config.yaml` or custom config, set:
|
|
264
|
+
```yaml
|
|
265
|
+
extraction:
|
|
266
|
+
engine: docling # 'legacy' (default) or 'docling'
|
|
267
|
+
docling:
|
|
268
|
+
output_format: markdown # markdown | html | json
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
#### Programmatically in Python
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
from content_core.config import set_extraction_engine, set_docling_output_format
|
|
275
|
+
|
|
276
|
+
# switch engine to Docling
|
|
277
|
+
set_extraction_engine("docling")
|
|
278
|
+
|
|
279
|
+
# choose output format: 'markdown', 'html', or 'json'
|
|
280
|
+
set_docling_output_format("html")
|
|
281
|
+
|
|
282
|
+
# now use ccore.extract or ccore.ccore
|
|
283
|
+
result = await cc.extract("document.pdf")
|
|
284
|
+
```
|
|
285
|
+
|
|
204
286
|
## Configuration
|
|
205
287
|
|
|
206
288
|
Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
|
|
@@ -1,32 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: content-core
|
|
3
|
-
Version: 0.4.0
|
|
4
|
-
Summary: Extract what matters from any media source
|
|
5
|
-
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
|
-
License-File: LICENSE
|
|
7
|
-
Requires-Python: >=3.10
|
|
8
|
-
Requires-Dist: aiohttp>=3.11
|
|
9
|
-
Requires-Dist: bs4>=0.0.2
|
|
10
|
-
Requires-Dist: dicttoxml>=1.7.16
|
|
11
|
-
Requires-Dist: esperanto>=1.2.0
|
|
12
|
-
Requires-Dist: google-genai>=1.10.0
|
|
13
|
-
Requires-Dist: jinja2>=3.1.6
|
|
14
|
-
Requires-Dist: langdetect>=1.0.9
|
|
15
|
-
Requires-Dist: langgraph>=0.3.29
|
|
16
|
-
Requires-Dist: loguru>=0.7.3
|
|
17
|
-
Requires-Dist: openai>=1.73.0
|
|
18
|
-
Requires-Dist: openpyxl>=3.1.5
|
|
19
|
-
Requires-Dist: pandas>=2.2.3
|
|
20
|
-
Requires-Dist: pydub>=0.25.1
|
|
21
|
-
Requires-Dist: pymupdf>=1.25.5
|
|
22
|
-
Requires-Dist: python-docx>=1.1.2
|
|
23
|
-
Requires-Dist: python-dotenv>=1.1.0
|
|
24
|
-
Requires-Dist: python-magic>=0.4.27
|
|
25
|
-
Requires-Dist: python-pptx>=1.0.2
|
|
26
|
-
Requires-Dist: validators>=0.34.0
|
|
27
|
-
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
28
|
-
Description-Content-Type: text/markdown
|
|
29
|
-
|
|
30
1
|
# Content Core
|
|
31
2
|
|
|
32
3
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
@@ -54,8 +25,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
|
|
|
54
25
|
Install Content Core using `pip`:
|
|
55
26
|
|
|
56
27
|
```bash
|
|
57
|
-
# Install the package
|
|
28
|
+
# Install the package (without Docling)
|
|
58
29
|
pip install content-core
|
|
30
|
+
# Install with Docling support
|
|
31
|
+
pip install content-core[docling]
|
|
59
32
|
```
|
|
60
33
|
|
|
61
34
|
Alternatively, if you’re developing locally:
|
|
@@ -224,12 +197,58 @@ async def main():
|
|
|
224
197
|
md_data = await extract_content({"file_path": "path/to/your/document.md"})
|
|
225
198
|
print(md_data)
|
|
226
199
|
|
|
200
|
+
# Per-execution override with Docling
|
|
201
|
+
doc_data = await extract_content({
|
|
202
|
+
"file_path": "path/to/your/document.pdf",
|
|
203
|
+
"engine": "docling",
|
|
204
|
+
"output_format": "html"
|
|
205
|
+
})
|
|
206
|
+
print(doc_data)
|
|
207
|
+
|
|
227
208
|
if __name__ == "__main__":
|
|
228
209
|
asyncio.run(main())
|
|
229
210
|
```
|
|
230
211
|
|
|
231
212
|
(See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
|
|
232
213
|
|
|
214
|
+
## Docling Integration
|
|
215
|
+
|
|
216
|
+
Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
|
|
217
|
+
|
|
218
|
+
### Installation
|
|
219
|
+
|
|
220
|
+
```bash
|
|
221
|
+
# Install with Docling support
|
|
222
|
+
pip install content-core[docling]
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Enabling Docling
|
|
226
|
+
|
|
227
|
+
#### Via configuration file
|
|
228
|
+
|
|
229
|
+
In your `cc_config.yaml` or custom config, set:
|
|
230
|
+
```yaml
|
|
231
|
+
extraction:
|
|
232
|
+
engine: docling # 'legacy' (default) or 'docling'
|
|
233
|
+
docling:
|
|
234
|
+
output_format: markdown # markdown | html | json
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
#### Programmatically in Python
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
from content_core.config import set_extraction_engine, set_docling_output_format
|
|
241
|
+
|
|
242
|
+
# switch engine to Docling
|
|
243
|
+
set_extraction_engine("docling")
|
|
244
|
+
|
|
245
|
+
# choose output format: 'markdown', 'html', or 'json'
|
|
246
|
+
set_docling_output_format("html")
|
|
247
|
+
|
|
248
|
+
# now use ccore.extract or ccore.ccore
|
|
249
|
+
result = await cc.extract("document.pdf")
|
|
250
|
+
```
|
|
251
|
+
|
|
233
252
|
## Configuration
|
|
234
253
|
|
|
235
254
|
Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
|
|
@@ -14,11 +14,11 @@ Content Core uses a modular approach to process content from different sources.
|
|
|
14
14
|
- **Returned Data**: The input text as-is, wrapped in a structured format compatible with Content Core's output schema.
|
|
15
15
|
- **Location**: `src/content_core/processors/text.py`
|
|
16
16
|
|
|
17
|
-
### 2. **Web Processor**
|
|
17
|
+
### 2. **Web (URL) Processor**
|
|
18
18
|
- **Purpose**: Extracts content from web URLs, focusing on meaningful text while ignoring boilerplate (ads, navigation, etc.).
|
|
19
19
|
- **Supported Input**: URLs (web pages).
|
|
20
20
|
- **Returned Data**: Extracted text content from the web page, often in a cleaned format.
|
|
21
|
-
- **Location**: `src/content_core/processors/
|
|
21
|
+
- **Location**: `src/content_core/processors/url.py`
|
|
22
22
|
|
|
23
23
|
### 3. **File Processor**
|
|
24
24
|
- **Purpose**: Processes local files of various types, extracting content based on file format.
|
|
@@ -35,10 +35,33 @@ Content Core uses a modular approach to process content from different sources.
|
|
|
35
35
|
- **Returned Data**: Transcribed text from the media content.
|
|
36
36
|
- **Location**: `src/content_core/processors/transcription.py`
|
|
37
37
|
|
|
38
|
+
### 5. **Docling Processor**
|
|
39
|
+
- **Purpose**: Use Docling library for rich document parsing (PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, images).
|
|
40
|
+
- **Supported Input**: PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, Images (PNG, JPEG, TIFF, BMP).
|
|
41
|
+
- **Returned Data**: Content converted to configured format (markdown, html, json).
|
|
42
|
+
- **Location**: `src/content_core/processors/docling.py`
|
|
43
|
+
- **Configuration**: Activate the Docling engine in `cc_config.yaml` or custom config:
|
|
44
|
+
```yaml
|
|
45
|
+
extraction:
|
|
46
|
+
engine: docling # 'legacy' (default) or 'docling'
|
|
47
|
+
docling:
|
|
48
|
+
output_format: markdown # markdown | html | json
|
|
49
|
+
```
|
|
50
|
+
- **Programmatic Toggle**: Use helper functions in Python:
|
|
51
|
+
```python
|
|
52
|
+
from content_core.config import set_extraction_engine, set_docling_output_format
|
|
53
|
+
|
|
54
|
+
# switch engine to Docling
|
|
55
|
+
set_extraction_engine("docling")
|
|
56
|
+
|
|
57
|
+
# choose output format
|
|
58
|
+
set_docling_output_format("html")
|
|
59
|
+
```
|
|
60
|
+
|
|
38
61
|
## How Processors Work
|
|
39
62
|
|
|
40
63
|
Content Core automatically selects the appropriate processor based on the input type:
|
|
41
|
-
- If a URL is provided, the Web Processor is used.
|
|
64
|
+
- If a URL is provided, the Web (URL) Processor is used.
|
|
42
65
|
- If a file path is provided, the File Processor determines the file type and delegates to specialized handlers (like the Media Transcription Processor for audio/video).
|
|
43
66
|
- If raw text is provided, the Text Processor handles it directly.
|
|
44
67
|
|
|
@@ -76,6 +76,60 @@ To simplify setup, we suggest copying the provided sample files:
|
|
|
76
76
|
|
|
77
77
|
This will allow you to quickly start with customized settings without needing to create the files from scratch.
|
|
78
78
|
|
|
79
|
+
### Docling Engine
|
|
80
|
+
|
|
81
|
+
Content Core supports an optional Docling engine for advanced document parsing. To enable:
|
|
82
|
+
|
|
83
|
+
#### In YAML config
|
|
84
|
+
Add under the `extraction` section:
|
|
85
|
+
```yaml
|
|
86
|
+
extraction:
|
|
87
|
+
engine: docling # legacy (default) or docling
|
|
88
|
+
docling:
|
|
89
|
+
output_format: html # markdown | html | json
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
#### Programmatically in Python
|
|
93
|
+
```python
|
|
94
|
+
from content_core.config import set_extraction_engine, set_docling_output_format
|
|
95
|
+
|
|
96
|
+
# toggle to Docling
|
|
97
|
+
set_extraction_engine("docling")
|
|
98
|
+
|
|
99
|
+
# pick format
|
|
100
|
+
set_docling_output_format("json")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
#### Per-Execution Overrides
|
|
104
|
+
You can override the extraction engine and Docling output format on a per-call basis by including `engine` and `output_format` in your input:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from content_core.content.extraction import extract_content
|
|
108
|
+
|
|
109
|
+
# override engine and format for this document
|
|
110
|
+
result = await extract_content({
|
|
111
|
+
"file_path": "document.pdf",
|
|
112
|
+
"engine": "docling",
|
|
113
|
+
"output_format": "html"
|
|
114
|
+
})
|
|
115
|
+
print(result.content)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Or using `ProcessSourceInput`:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from content_core.common.state import ProcessSourceInput
|
|
122
|
+
from content_core.content.extraction import extract_content
|
|
123
|
+
|
|
124
|
+
input = ProcessSourceInput(
|
|
125
|
+
file_path="document.pdf",
|
|
126
|
+
engine="docling",
|
|
127
|
+
output_format="json"
|
|
128
|
+
)
|
|
129
|
+
result = await extract_content(input)
|
|
130
|
+
print(result.content)
|
|
131
|
+
```
|
|
132
|
+
|
|
79
133
|
## Support
|
|
80
134
|
|
|
81
135
|
If you have questions or encounter issues while using the library, open an issue in the repository or contact the support team.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "0.4.0"
|
|
3
|
+
version = "0.5.0"
|
|
4
4
|
description = "Extract what matters from any media source"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -31,6 +31,9 @@ dependencies = [
|
|
|
31
31
|
"validators>=0.34.0",
|
|
32
32
|
]
|
|
33
33
|
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
docling = ["docling[ocr]", "Pillow", "pandas", "asciidoc"]
|
|
36
|
+
|
|
34
37
|
[project.scripts]
|
|
35
38
|
ccore = "content_core:ccore"
|
|
36
39
|
cclean = "content_core:cclean"
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Content Core main configuration
|
|
2
|
+
# Copy this file to your project root or set CCORE_CONFIG_PATH to its location
|
|
3
|
+
|
|
4
|
+
speech_to_text:
|
|
5
|
+
provider: openai
|
|
6
|
+
model_name: whisper-1
|
|
7
|
+
|
|
8
|
+
default_model:
|
|
9
|
+
provider: openai
|
|
10
|
+
model_name: gpt-4o-mini
|
|
11
|
+
config:
|
|
12
|
+
temperature: 0.5
|
|
13
|
+
top_p: 1
|
|
14
|
+
max_tokens: 2000
|
|
15
|
+
|
|
16
|
+
cleanup_model:
|
|
17
|
+
provider: openai
|
|
18
|
+
model_name: gpt-4o-mini
|
|
19
|
+
config:
|
|
20
|
+
temperature: 0
|
|
21
|
+
max_tokens: 8000
|
|
22
|
+
output_format: json
|
|
23
|
+
|
|
24
|
+
summary_model:
|
|
25
|
+
provider: openai
|
|
26
|
+
model_name: gpt-4o-mini
|
|
27
|
+
config:
|
|
28
|
+
temperature: 0
|
|
29
|
+
top_p: 1
|
|
30
|
+
max_tokens: 2000
|
|
31
|
+
|
|
32
|
+
extraction:
|
|
33
|
+
engine: legacy # change to 'docling' to enable Docling engine
|
|
34
|
+
docling:
|
|
35
|
+
output_format: markdown # markdown | html | json
|
|
@@ -13,12 +13,16 @@ class ProcessSourceState(BaseModel):
|
|
|
13
13
|
identified_provider: Optional[str] = ""
|
|
14
14
|
metadata: Optional[dict] = Field(default_factory=lambda: {})
|
|
15
15
|
content: Optional[str] = ""
|
|
16
|
+
engine: Optional[str] = Field(default=None, description="Override extraction engine: 'legacy' or 'docling'")
|
|
17
|
+
output_format: Optional[str] = Field(default=None, description="Override Docling output format: 'markdown', 'html', or 'json'")
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
class ProcessSourceInput(BaseModel):
|
|
19
21
|
content: Optional[str] = ""
|
|
20
22
|
file_path: Optional[str] = ""
|
|
21
23
|
url: Optional[str] = ""
|
|
24
|
+
engine: Optional[str] = None
|
|
25
|
+
output_format: Optional[str] = None
|
|
22
26
|
|
|
23
27
|
|
|
24
28
|
class ProcessSourceOutput(BaseModel):
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pkgutil
|
|
3
|
+
import os # needed for load_config env/path checks
|
|
4
|
+
import yaml
|
|
5
|
+
from dotenv import load_dotenv
|
|
6
|
+
|
|
7
|
+
# Load environment variables from .env file
|
|
8
|
+
load_dotenv()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def load_config():
|
|
12
|
+
config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
|
|
13
|
+
if config_path and os.path.exists(config_path):
|
|
14
|
+
try:
|
|
15
|
+
with open(config_path, "r") as file:
|
|
16
|
+
return yaml.safe_load(file)
|
|
17
|
+
except Exception as e:
|
|
18
|
+
print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
|
|
19
|
+
print("Usando configurações padrão internas.")
|
|
20
|
+
|
|
21
|
+
default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
|
|
22
|
+
if default_config_data:
|
|
23
|
+
base = yaml.safe_load(default_config_data)
|
|
24
|
+
else:
|
|
25
|
+
base = {}
|
|
26
|
+
# load new cc_config.yaml defaults
|
|
27
|
+
cc_default = pkgutil.get_data("content_core", "cc_config.yaml")
|
|
28
|
+
if cc_default:
|
|
29
|
+
docling_cfg = yaml.safe_load(cc_default)
|
|
30
|
+
# merge extraction section
|
|
31
|
+
base["extraction"] = docling_cfg.get("extraction", {})
|
|
32
|
+
return base
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
CONFIG = load_config()
|
|
36
|
+
|
|
37
|
+
# Programmatic config overrides: use in notebooks or scripts
|
|
38
|
+
def set_extraction_engine(engine: str):
|
|
39
|
+
"""Override the extraction engine ('legacy' or 'docling')."""
|
|
40
|
+
CONFIG.setdefault("extraction", {})["engine"] = engine
|
|
41
|
+
|
|
42
|
+
def set_docling_output_format(fmt: str):
|
|
43
|
+
"""Override Docling output_format ('markdown', 'html', or 'json')."""
|
|
44
|
+
extraction = CONFIG.setdefault("extraction", {})
|
|
45
|
+
docling_cfg = extraction.setdefault("docling", {})
|
|
46
|
+
docling_cfg["output_format"] = fmt
|
|
@@ -20,10 +20,12 @@ from content_core.processors.text import extract_txt
|
|
|
20
20
|
from content_core.processors.url import extract_url, url_provider
|
|
21
21
|
from content_core.processors.video import extract_best_audio_from_video
|
|
22
22
|
from content_core.processors.youtube import extract_youtube_transcript
|
|
23
|
+
from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED # type: ignore
|
|
23
24
|
|
|
24
25
|
import aiohttp
|
|
25
26
|
import tempfile
|
|
26
27
|
from urllib.parse import urlparse
|
|
28
|
+
from content_core.config import CONFIG # type: ignore
|
|
27
29
|
|
|
28
30
|
|
|
29
31
|
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
|
|
@@ -110,6 +112,17 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
|
|
|
110
112
|
return {"file_path": tmp, "identified_type": mime}
|
|
111
113
|
|
|
112
114
|
|
|
115
|
+
async def file_type_router_docling(state: ProcessSourceState) -> str:
|
|
116
|
+
"""
|
|
117
|
+
Route to Docling if enabled and supported; otherwise use legacy file type edge.
|
|
118
|
+
"""
|
|
119
|
+
# allow per-execution override of engine via state.engine
|
|
120
|
+
engine = state.engine or CONFIG.get("extraction", {}).get("engine", "legacy")
|
|
121
|
+
if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
|
|
122
|
+
return "extract_docling"
|
|
123
|
+
return await file_type_edge(state)
|
|
124
|
+
|
|
125
|
+
|
|
113
126
|
# Create workflow
|
|
114
127
|
workflow = StateGraph(
|
|
115
128
|
ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
|
|
@@ -128,6 +141,7 @@ workflow.add_node("extract_audio", extract_audio)
|
|
|
128
141
|
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
|
|
129
142
|
workflow.add_node("delete_file", delete_file)
|
|
130
143
|
workflow.add_node("download_remote_file", download_remote_file)
|
|
144
|
+
workflow.add_node("extract_docling", extract_with_docling)
|
|
131
145
|
|
|
132
146
|
# Add edges
|
|
133
147
|
workflow.add_edge(START, "source")
|
|
@@ -142,7 +156,7 @@ workflow.add_conditional_edges(
|
|
|
142
156
|
)
|
|
143
157
|
workflow.add_conditional_edges(
|
|
144
158
|
"file_type",
|
|
145
|
-
|
|
159
|
+
file_type_router_docling,
|
|
146
160
|
)
|
|
147
161
|
workflow.add_conditional_edges(
|
|
148
162
|
"url_provider",
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "code",
|
|
5
|
+
"execution_count": null,
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"outputs": [],
|
|
8
|
+
"source": [
|
|
9
|
+
"from docling.document_converter import DocumentConverter\n",
|
|
10
|
+
"\n",
|
|
11
|
+
"\n",
|
|
12
|
+
"source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
|
|
13
|
+
"source_url = \"https://arxiv.org/pdf/2408.09869\" # PDF path or URL\n",
|
|
14
|
+
"converter = DocumentConverter()\n",
|
|
15
|
+
"result = converter.convert(source)\n",
|
|
16
|
+
"print(result.document.export_to_markdown())"
|
|
17
|
+
]
|
|
18
|
+
}
|
|
19
|
+
],
|
|
20
|
+
"metadata": {
|
|
21
|
+
"language_info": {
|
|
22
|
+
"name": "python"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"nbformat": 4,
|
|
26
|
+
"nbformat_minor": 2
|
|
27
|
+
}
|