content-core 0.8.5__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-0.8.5 → content_core-1.0.0}/.gitignore +2 -1
- {content_core-0.8.5 → content_core-1.0.0}/PKG-INFO +17 -7
- {content_core-0.8.5 → content_core-1.0.0}/README.md +16 -6
- {content_core-0.8.5 → content_core-1.0.0}/docs/processors.md +12 -8
- {content_core-0.8.5 → content_core-1.0.0}/docs/usage.md +24 -13
- {content_core-0.8.5 → content_core-1.0.0}/pyproject.toml +1 -1
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/__init__.py +1 -1
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/cc_config.yaml +2 -1
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/common/state.py +9 -5
- content_core-1.0.0/src/content_core/common/types.py +14 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/config.py +7 -3
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/extraction/graph.py +6 -6
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/processors/url.py +5 -13
- content_core-1.0.0/tests/integration/test_cli.py +394 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/integration/test_extraction.py +10 -5
- {content_core-0.8.5 → content_core-1.0.0}/uv.lock +1 -1
- content_core-0.8.5/src/content_core/common/types.py +0 -21
- {content_core-0.8.5 → content_core-1.0.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/.github/workflows/publish.yml +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/.python-version +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/CONTRIBUTING.md +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/LICENSE +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/Makefile +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/prompts/content/cleanup.jinja +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/prompts/content/summarize.jinja +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/common/__init__.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/common/utils.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/__init__.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/summary/core.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/logging.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/models.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/models_config.yaml +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/processors/audio.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/processors/docling.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/processors/office.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/processors/pdf.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/processors/text.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/processors/video.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/processors/youtube.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/py.typed +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/templated_message.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/tools/extract.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.docx +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.epub +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.md +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.mp3 +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.mp4 +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.pdf +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.pptx +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.txt +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file.xlsx +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-0.8.5 → content_core-1.0.0}/tests/unit/test_docling.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 0.8.5
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -234,12 +234,18 @@ async def main():
|
|
|
234
234
|
md_data = await extract_content({"file_path": "path/to/your/document.md"})
|
|
235
235
|
print(md_data)
|
|
236
236
|
|
|
237
|
-
# Per-execution override with Docling
|
|
237
|
+
# Per-execution override with Docling for documents
|
|
238
238
|
doc_data = await extract_content({
|
|
239
239
|
"file_path": "path/to/your/document.pdf",
|
|
240
|
-
"
|
|
240
|
+
"document_engine": "docling",
|
|
241
241
|
"output_format": "html"
|
|
242
242
|
})
|
|
243
|
+
|
|
244
|
+
# Per-execution override with Firecrawl for URLs
|
|
245
|
+
url_data = await extract_content({
|
|
246
|
+
"url": "https://www.example.com",
|
|
247
|
+
"url_engine": "firecrawl"
|
|
248
|
+
})
|
|
243
249
|
print(doc_data)
|
|
244
250
|
|
|
245
251
|
if __name__ == "__main__":
|
|
@@ -262,7 +268,8 @@ Docling is not the default engine when parsing documents. If you don't want to u
|
|
|
262
268
|
In your `cc_config.yaml` or custom config, set:
|
|
263
269
|
```yaml
|
|
264
270
|
extraction:
|
|
265
|
-
|
|
271
|
+
document_engine: docling # 'auto' (default), 'simple', or 'docling'
|
|
272
|
+
url_engine: auto # 'auto' (default), 'simple', 'firecrawl', or 'jina'
|
|
266
273
|
docling:
|
|
267
274
|
output_format: markdown # markdown | html | json
|
|
268
275
|
```
|
|
@@ -270,10 +277,13 @@ extraction:
|
|
|
270
277
|
#### Programmatically in Python
|
|
271
278
|
|
|
272
279
|
```python
|
|
273
|
-
from content_core.config import
|
|
280
|
+
from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
|
|
281
|
+
|
|
282
|
+
# switch document engine to Docling
|
|
283
|
+
set_document_engine("docling")
|
|
274
284
|
|
|
275
|
-
# switch engine to
|
|
276
|
-
|
|
285
|
+
# switch URL engine to Firecrawl
|
|
286
|
+
set_url_engine("firecrawl")
|
|
277
287
|
|
|
278
288
|
# choose output format: 'markdown', 'html', or 'json'
|
|
279
289
|
set_docling_output_format("html")
|
|
@@ -201,12 +201,18 @@ async def main():
|
|
|
201
201
|
md_data = await extract_content({"file_path": "path/to/your/document.md"})
|
|
202
202
|
print(md_data)
|
|
203
203
|
|
|
204
|
-
# Per-execution override with Docling
|
|
204
|
+
# Per-execution override with Docling for documents
|
|
205
205
|
doc_data = await extract_content({
|
|
206
206
|
"file_path": "path/to/your/document.pdf",
|
|
207
|
-
"
|
|
207
|
+
"document_engine": "docling",
|
|
208
208
|
"output_format": "html"
|
|
209
209
|
})
|
|
210
|
+
|
|
211
|
+
# Per-execution override with Firecrawl for URLs
|
|
212
|
+
url_data = await extract_content({
|
|
213
|
+
"url": "https://www.example.com",
|
|
214
|
+
"url_engine": "firecrawl"
|
|
215
|
+
})
|
|
210
216
|
print(doc_data)
|
|
211
217
|
|
|
212
218
|
if __name__ == "__main__":
|
|
@@ -229,7 +235,8 @@ Docling is not the default engine when parsing documents. If you don't want to u
|
|
|
229
235
|
In your `cc_config.yaml` or custom config, set:
|
|
230
236
|
```yaml
|
|
231
237
|
extraction:
|
|
232
|
-
|
|
238
|
+
document_engine: docling # 'auto' (default), 'simple', or 'docling'
|
|
239
|
+
url_engine: auto # 'auto' (default), 'simple', 'firecrawl', or 'jina'
|
|
233
240
|
docling:
|
|
234
241
|
output_format: markdown # markdown | html | json
|
|
235
242
|
```
|
|
@@ -237,10 +244,13 @@ extraction:
|
|
|
237
244
|
#### Programmatically in Python
|
|
238
245
|
|
|
239
246
|
```python
|
|
240
|
-
from content_core.config import
|
|
247
|
+
from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
|
|
248
|
+
|
|
249
|
+
# switch document engine to Docling
|
|
250
|
+
set_document_engine("docling")
|
|
241
251
|
|
|
242
|
-
# switch engine to
|
|
243
|
-
|
|
252
|
+
# switch URL engine to Firecrawl
|
|
253
|
+
set_url_engine("firecrawl")
|
|
244
254
|
|
|
245
255
|
# choose output format: 'markdown', 'html', or 'json'
|
|
246
256
|
set_docling_output_format("html")
|
|
@@ -21,11 +21,11 @@ Content Core uses a modular approach to process content from different sources.
|
|
|
21
21
|
- **Supported Input**: URLs (web pages).
|
|
22
22
|
- **Returned Data**: Extracted text content from the web page, often in a cleaned format.
|
|
23
23
|
- **Location**: `src/content_core/processors/url.py`
|
|
24
|
-
- **Default Engine (`auto`) Logic**:
|
|
24
|
+
- **Default URL Engine (`auto`) Logic**:
|
|
25
25
|
- If `FIRECRAWL_API_KEY` is set, uses Firecrawl for extraction.
|
|
26
26
|
- Else it tries Jina until it fails because of rate limits (unless `JINA_API_KEY` is set).
|
|
27
27
|
- Else, falls back to BeautifulSoup-based extraction.
|
|
28
|
-
- You can explicitly specify
|
|
28
|
+
- You can explicitly specify a URL engine (`'firecrawl'`, `'jina'`, `'simple'`), but `'auto'` is now the default and recommended for most users.
|
|
29
29
|
|
|
30
30
|
### 3. **File Processor**
|
|
31
31
|
- **Purpose**: Processes local files of various types, extracting content based on file format.
|
|
@@ -47,23 +47,27 @@ Content Core uses a modular approach to process content from different sources.
|
|
|
47
47
|
- **Supported Input**: PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, Images (PNG, JPEG, TIFF, BMP).
|
|
48
48
|
- **Returned Data**: Content converted to configured format (markdown, html, json).
|
|
49
49
|
- **Location**: `src/content_core/processors/docling.py`
|
|
50
|
-
- **Default Engine (`auto`) Logic for Files/Documents**:
|
|
50
|
+
- **Default Document Engine (`auto`) Logic for Files/Documents**:
|
|
51
51
|
- Tries the `'docling'` extraction method first (robust document parsing for supported types).
|
|
52
52
|
- If `'docling'` fails or is not supported, automatically falls back to simple extraction (fast, lightweight for supported types).
|
|
53
|
-
- You can explicitly specify `'docling'
|
|
53
|
+
- You can explicitly specify `'docling'` or `'simple'` as the document engine, but `'auto'` is now the default and recommended for most users.
|
|
54
54
|
- **Configuration**: Activate the Docling engine in `cc_config.yaml` or custom config:
|
|
55
55
|
```yaml
|
|
56
56
|
extraction:
|
|
57
|
-
|
|
57
|
+
document_engine: docling # 'auto' (default), 'simple', or 'docling'
|
|
58
|
+
url_engine: auto # 'auto' (default), 'simple', 'firecrawl', or 'jina'
|
|
58
59
|
docling:
|
|
59
60
|
output_format: markdown # markdown | html | json
|
|
60
61
|
```
|
|
61
62
|
- **Programmatic Toggle**: Use helper functions in Python:
|
|
62
63
|
```python
|
|
63
|
-
from content_core.config import
|
|
64
|
+
from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
|
|
64
65
|
|
|
65
|
-
# switch engine to Docling
|
|
66
|
-
|
|
66
|
+
# switch document engine to Docling
|
|
67
|
+
set_document_engine("docling")
|
|
68
|
+
|
|
69
|
+
# switch URL engine to Firecrawl
|
|
70
|
+
set_url_engine("firecrawl")
|
|
67
71
|
|
|
68
72
|
# choose output format
|
|
69
73
|
set_docling_output_format("html")
|
|
@@ -80,11 +80,11 @@ This will allow you to quickly start with customized settings without needing to
|
|
|
80
80
|
|
|
81
81
|
### Extraction Engine Selection
|
|
82
82
|
|
|
83
|
-
By default, Content Core uses the `'auto'` engine for
|
|
84
|
-
- **For URLs
|
|
85
|
-
- **For files
|
|
83
|
+
By default, Content Core uses the `'auto'` engine for both document and URL extraction tasks. The logic is as follows:
|
|
84
|
+
- **For URLs** (`url_engine`): Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else Jina if `JINA_API_KEY` is set, else falls back to BeautifulSoup.
|
|
85
|
+
- **For files** (`document_engine`): Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
|
|
86
86
|
|
|
87
|
-
You can override this behavior by specifying
|
|
87
|
+
You can override this behavior by specifying separate engines for documents and URLs in your config or function call, but `'auto'` is recommended for most users.
|
|
88
88
|
|
|
89
89
|
#### Docling Engine
|
|
90
90
|
|
|
@@ -94,35 +94,46 @@ Content Core supports an optional Docling engine for advanced document parsing.
|
|
|
94
94
|
Add under the `extraction` section:
|
|
95
95
|
```yaml
|
|
96
96
|
extraction:
|
|
97
|
-
|
|
97
|
+
document_engine: docling # auto (default), simple, or docling
|
|
98
|
+
url_engine: auto # auto (default), simple, firecrawl, or jina
|
|
98
99
|
docling:
|
|
99
|
-
output_format: html
|
|
100
|
+
output_format: html # markdown | html | json
|
|
100
101
|
```
|
|
101
102
|
|
|
102
103
|
##### Programmatically in Python
|
|
103
104
|
```python
|
|
104
|
-
from content_core.config import
|
|
105
|
+
from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
|
|
105
106
|
|
|
106
|
-
# toggle to Docling
|
|
107
|
-
|
|
107
|
+
# toggle document engine to Docling
|
|
108
|
+
set_document_engine("docling")
|
|
109
|
+
|
|
110
|
+
# toggle URL engine to Firecrawl
|
|
111
|
+
set_url_engine("firecrawl")
|
|
108
112
|
|
|
109
113
|
# pick format
|
|
110
114
|
set_docling_output_format("json")
|
|
111
115
|
```
|
|
112
116
|
|
|
113
117
|
#### Per-Execution Overrides
|
|
114
|
-
You can override the extraction
|
|
118
|
+
You can override the extraction engines and Docling output format on a per-call basis by including `document_engine`, `url_engine` and `output_format` in your input:
|
|
115
119
|
|
|
116
120
|
```python
|
|
117
121
|
from content_core.content.extraction import extract_content
|
|
118
122
|
|
|
119
|
-
# override engine and format for this document
|
|
123
|
+
# override document engine and format for this document
|
|
120
124
|
result = await extract_content({
|
|
121
125
|
"file_path": "document.pdf",
|
|
122
|
-
"
|
|
126
|
+
"document_engine": "docling",
|
|
123
127
|
"output_format": "html"
|
|
124
128
|
})
|
|
125
129
|
print(result.content)
|
|
130
|
+
|
|
131
|
+
# override URL engine for this URL
|
|
132
|
+
result = await extract_content({
|
|
133
|
+
"url": "https://example.com",
|
|
134
|
+
"url_engine": "firecrawl"
|
|
135
|
+
})
|
|
136
|
+
print(result.content)
|
|
126
137
|
```
|
|
127
138
|
|
|
128
139
|
Or using `ProcessSourceInput`:
|
|
@@ -133,7 +144,7 @@ from content_core.content.extraction import extract_content
|
|
|
133
144
|
|
|
134
145
|
input = ProcessSourceInput(
|
|
135
146
|
file_path="document.pdf",
|
|
136
|
-
|
|
147
|
+
document_engine="docling",
|
|
137
148
|
output_format="json"
|
|
138
149
|
)
|
|
139
150
|
result = await extract_content(input)
|
|
@@ -113,7 +113,7 @@ async def ccore_main():
|
|
|
113
113
|
if args.format == "xml":
|
|
114
114
|
result = dicttoxml(
|
|
115
115
|
result.model_dump(), custom_root="result", attr_type=False
|
|
116
|
-
)
|
|
116
|
+
).decode('utf-8')
|
|
117
117
|
elif args.format == "json":
|
|
118
118
|
result = result.model_dump_json()
|
|
119
119
|
else: # text
|
|
@@ -30,7 +30,8 @@ summary_model:
|
|
|
30
30
|
max_tokens: 2000
|
|
31
31
|
|
|
32
32
|
extraction:
|
|
33
|
-
|
|
33
|
+
document_engine: auto # auto | simple | docling - for files/documents
|
|
34
|
+
url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
|
|
34
35
|
docling:
|
|
35
36
|
output_format: markdown # markdown | html | json
|
|
36
37
|
|
|
@@ -2,8 +2,7 @@ from typing import Optional
|
|
|
2
2
|
|
|
3
3
|
from pydantic import BaseModel, Field
|
|
4
4
|
|
|
5
|
-
from content_core.common.types import
|
|
6
|
-
from content_core.common.types import Engine
|
|
5
|
+
from content_core.common.types import DocumentEngine, UrlEngine
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
class ProcessSourceState(BaseModel):
|
|
@@ -16,9 +15,13 @@ class ProcessSourceState(BaseModel):
|
|
|
16
15
|
identified_provider: Optional[str] = ""
|
|
17
16
|
metadata: Optional[dict] = Field(default_factory=lambda: {})
|
|
18
17
|
content: Optional[str] = ""
|
|
19
|
-
|
|
18
|
+
document_engine: Optional[DocumentEngine] = Field(
|
|
20
19
|
default=None,
|
|
21
|
-
description="Override extraction engine: 'auto', 'simple',
|
|
20
|
+
description="Override document extraction engine: 'auto', 'simple', or 'docling'",
|
|
21
|
+
)
|
|
22
|
+
url_engine: Optional[UrlEngine] = Field(
|
|
23
|
+
default=None,
|
|
24
|
+
description="Override URL extraction engine: 'auto', 'simple', 'firecrawl', 'jina', or 'docling'",
|
|
22
25
|
)
|
|
23
26
|
output_format: Optional[str] = Field(
|
|
24
27
|
default=None,
|
|
@@ -30,7 +33,8 @@ class ProcessSourceInput(BaseModel):
|
|
|
30
33
|
content: Optional[str] = ""
|
|
31
34
|
file_path: Optional[str] = ""
|
|
32
35
|
url: Optional[str] = ""
|
|
33
|
-
|
|
36
|
+
document_engine: Optional[str] = None
|
|
37
|
+
url_engine: Optional[str] = None
|
|
34
38
|
output_format: Optional[str] = None
|
|
35
39
|
|
|
36
40
|
|
|
@@ -35,9 +35,13 @@ def load_config():
|
|
|
35
35
|
CONFIG = load_config()
|
|
36
36
|
|
|
37
37
|
# Programmatic config overrides: use in notebooks or scripts
|
|
38
|
-
def
|
|
39
|
-
"""Override the extraction engine ('
|
|
40
|
-
CONFIG.setdefault("extraction", {})["
|
|
38
|
+
def set_document_engine(engine: str):
|
|
39
|
+
"""Override the document extraction engine ('auto', 'simple', or 'docling')."""
|
|
40
|
+
CONFIG.setdefault("extraction", {})["document_engine"] = engine
|
|
41
|
+
|
|
42
|
+
def set_url_engine(engine: str):
|
|
43
|
+
"""Override the URL extraction engine ('auto', 'simple', 'firecrawl', 'jina', or 'docling')."""
|
|
44
|
+
CONFIG.setdefault("extraction", {})["url_engine"] = engine
|
|
41
45
|
|
|
42
46
|
def set_docling_output_format(fmt: str):
|
|
43
47
|
"""Override Docling output_format ('markdown', 'html', or 'json')."""
|
|
@@ -12,7 +12,6 @@ from content_core.common import (
|
|
|
12
12
|
ProcessSourceState,
|
|
13
13
|
UnsupportedTypeException,
|
|
14
14
|
)
|
|
15
|
-
from content_core.common.types import warn_if_deprecated_engine
|
|
16
15
|
from content_core.config import CONFIG # type: ignore
|
|
17
16
|
from content_core.logging import logger
|
|
18
17
|
from content_core.processors.audio import extract_audio_data # type: ignore
|
|
@@ -124,11 +123,10 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
|
|
|
124
123
|
async def file_type_router_docling(state: ProcessSourceState) -> str:
|
|
125
124
|
"""
|
|
126
125
|
Route to Docling if enabled and supported; otherwise use simple file type edge.
|
|
127
|
-
Supports 'auto', 'docling',
|
|
128
|
-
'auto' tries
|
|
126
|
+
Supports 'auto', 'docling', and 'simple'.
|
|
127
|
+
'auto' tries docling first, then falls back to simple if docling fails.
|
|
129
128
|
"""
|
|
130
|
-
engine = state.
|
|
131
|
-
warn_if_deprecated_engine(engine)
|
|
129
|
+
engine = state.document_engine or CONFIG.get("extraction", {}).get("document_engine", "auto")
|
|
132
130
|
if engine == "auto":
|
|
133
131
|
logger.debug("Using auto engine")
|
|
134
132
|
# Try docling first; if it fails or is not supported, fallback to simple
|
|
@@ -147,7 +145,7 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
|
|
|
147
145
|
if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
|
|
148
146
|
logger.debug("Using docling engine")
|
|
149
147
|
return "extract_docling"
|
|
150
|
-
# For 'simple'
|
|
148
|
+
# For 'simple', use the default file type edge
|
|
151
149
|
logger.debug("Using simple engine")
|
|
152
150
|
return await file_type_edge(state)
|
|
153
151
|
|
|
@@ -196,8 +194,10 @@ workflow.add_conditional_edges(
|
|
|
196
194
|
for m in list(SUPPORTED_FITZ_TYPES)
|
|
197
195
|
+ list(SUPPORTED_OFFICE_TYPES)
|
|
198
196
|
+ list(DOCLING_SUPPORTED)
|
|
197
|
+
if m not in ["text/html"] # Exclude HTML from file download, treat as web content
|
|
199
198
|
},
|
|
200
199
|
"article": "extract_url",
|
|
200
|
+
"text/html": "extract_url", # Route HTML content to URL extraction
|
|
201
201
|
"youtube": "extract_youtube_transcript",
|
|
202
202
|
},
|
|
203
203
|
)
|
|
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
|
|
5
5
|
from readability import Document
|
|
6
6
|
|
|
7
7
|
from content_core.common import ProcessSourceState
|
|
8
|
-
from content_core.
|
|
8
|
+
from content_core.config import CONFIG
|
|
9
9
|
from content_core.logging import logger
|
|
10
10
|
from content_core.processors.docling import DOCLING_SUPPORTED
|
|
11
11
|
from content_core.processors.office import SUPPORTED_OFFICE_TYPES
|
|
@@ -160,13 +160,12 @@ async def extract_url_firecrawl(url: str):
|
|
|
160
160
|
|
|
161
161
|
async def extract_url(state: ProcessSourceState):
|
|
162
162
|
"""
|
|
163
|
-
Extract content from a URL using the
|
|
164
|
-
Supported engines: 'auto', 'simple', '
|
|
163
|
+
Extract content from a URL using the url_engine specified in the state.
|
|
164
|
+
Supported engines: 'auto', 'simple', 'firecrawl', 'jina'.
|
|
165
165
|
"""
|
|
166
166
|
assert state.url, "No URL provided"
|
|
167
167
|
url = state.url
|
|
168
|
-
engine = state.
|
|
169
|
-
warn_if_deprecated_engine(engine)
|
|
168
|
+
engine = state.url_engine or CONFIG.get("extraction", {}).get("url_engine", "auto")
|
|
170
169
|
try:
|
|
171
170
|
if engine == "auto":
|
|
172
171
|
if os.environ.get("FIRECRAWL_API_KEY"):
|
|
@@ -182,19 +181,12 @@ async def extract_url(state: ProcessSourceState):
|
|
|
182
181
|
logger.error(f"Jina extraction error for URL: {url}: {e}")
|
|
183
182
|
logger.debug("Falling back to BeautifulSoup")
|
|
184
183
|
return await extract_url_bs4(url)
|
|
185
|
-
elif engine == "simple"
|
|
186
|
-
# 'legacy' is deprecated alias for 'simple'
|
|
184
|
+
elif engine == "simple":
|
|
187
185
|
return await extract_url_bs4(url)
|
|
188
186
|
elif engine == "firecrawl":
|
|
189
187
|
return await extract_url_firecrawl(url)
|
|
190
188
|
elif engine == "jina":
|
|
191
189
|
return await extract_url_jina(url)
|
|
192
|
-
elif engine == "docling":
|
|
193
|
-
from content_core.processors.docling import extract_with_docling
|
|
194
|
-
|
|
195
|
-
state.url = url
|
|
196
|
-
result_state = await extract_with_docling(state)
|
|
197
|
-
return {"title": None, "content": result_state.content}
|
|
198
190
|
else:
|
|
199
191
|
raise ValueError(f"Unknown engine: {engine}")
|
|
200
192
|
except Exception as e:
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import subprocess
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from xml.etree import ElementTree as ET
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@pytest.fixture
|
|
11
|
+
def fixture_path():
|
|
12
|
+
"""Provides the path to the directory containing test input files."""
|
|
13
|
+
return Path(__file__).parent.parent / "input_content"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def run_cli_command(command_args, input_data=None):
|
|
17
|
+
"""Helper to run CLI commands and capture output."""
|
|
18
|
+
try:
|
|
19
|
+
result = subprocess.run(
|
|
20
|
+
command_args,
|
|
21
|
+
input=input_data,
|
|
22
|
+
capture_output=True,
|
|
23
|
+
text=True,
|
|
24
|
+
timeout=30
|
|
25
|
+
)
|
|
26
|
+
return result
|
|
27
|
+
except subprocess.TimeoutExpired:
|
|
28
|
+
pytest.fail(f"Command {command_args} timed out")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class TestCcoreCLI:
|
|
32
|
+
"""Tests for the ccore CLI command."""
|
|
33
|
+
|
|
34
|
+
def test_ccore_help(self):
|
|
35
|
+
"""Test ccore help output."""
|
|
36
|
+
result = run_cli_command([sys.executable, "-m", "content_core", "--help"])
|
|
37
|
+
# Note: ccore is the default when running the module, but let's test the actual CLI entry points
|
|
38
|
+
|
|
39
|
+
def test_ccore_text_input(self):
|
|
40
|
+
"""Test ccore with direct text input."""
|
|
41
|
+
result = run_cli_command(["uv", "run", "ccore", "This is a test content."])
|
|
42
|
+
|
|
43
|
+
assert result.returncode == 0
|
|
44
|
+
assert "This is a test content." in result.stdout
|
|
45
|
+
assert result.stderr == ""
|
|
46
|
+
|
|
47
|
+
def test_ccore_file_input(self, fixture_path):
|
|
48
|
+
"""Test ccore with file input."""
|
|
49
|
+
md_file = fixture_path / "file.md"
|
|
50
|
+
if not md_file.exists():
|
|
51
|
+
pytest.skip(f"Fixture file not found: {md_file}")
|
|
52
|
+
|
|
53
|
+
result = run_cli_command(["uv", "run", "ccore", str(md_file)])
|
|
54
|
+
|
|
55
|
+
assert result.returncode == 0
|
|
56
|
+
assert len(result.stdout.strip()) > 0
|
|
57
|
+
assert "Buenos Aires" in result.stdout
|
|
58
|
+
|
|
59
|
+
def test_ccore_url_input(self):
|
|
60
|
+
"""Test ccore with URL input."""
|
|
61
|
+
result = run_cli_command(["uv", "run", "ccore", "https://www.example.com"])
|
|
62
|
+
|
|
63
|
+
assert result.returncode == 0
|
|
64
|
+
assert len(result.stdout.strip()) > 0
|
|
65
|
+
|
|
66
|
+
def test_ccore_json_format(self):
|
|
67
|
+
"""Test ccore with JSON output format."""
|
|
68
|
+
result = run_cli_command(["uv", "run", "ccore", "-f", "json", "Test content for JSON output."])
|
|
69
|
+
|
|
70
|
+
assert result.returncode == 0
|
|
71
|
+
|
|
72
|
+
# Verify it's valid JSON
|
|
73
|
+
output_data = json.loads(result.stdout)
|
|
74
|
+
assert isinstance(output_data, dict)
|
|
75
|
+
assert "content" in output_data
|
|
76
|
+
assert "Test content for JSON output." in output_data["content"]
|
|
77
|
+
|
|
78
|
+
def test_ccore_xml_format(self):
|
|
79
|
+
"""Test ccore with XML output format."""
|
|
80
|
+
result = run_cli_command(["uv", "run", "ccore", "-f", "xml", "Test content for XML output."])
|
|
81
|
+
|
|
82
|
+
assert result.returncode == 0
|
|
83
|
+
|
|
84
|
+
# Verify it's valid XML
|
|
85
|
+
root = ET.fromstring(result.stdout.strip())
|
|
86
|
+
assert root.tag == "result"
|
|
87
|
+
content_elem = root.find(".//content")
|
|
88
|
+
assert content_elem is not None
|
|
89
|
+
assert "Test content for XML output." in content_elem.text
|
|
90
|
+
|
|
91
|
+
def test_ccore_text_format_explicit(self):
|
|
92
|
+
"""Test ccore with explicit text format."""
|
|
93
|
+
result = run_cli_command(["uv", "run", "ccore", "-f", "text", "Test content for text output."])
|
|
94
|
+
|
|
95
|
+
assert result.returncode == 0
|
|
96
|
+
assert "Test content for text output." in result.stdout
|
|
97
|
+
|
|
98
|
+
def test_ccore_stdin_input(self):
|
|
99
|
+
"""Test ccore with stdin input."""
|
|
100
|
+
test_content = "This content comes from stdin."
|
|
101
|
+
result = run_cli_command(["uv", "run", "ccore"], input_data=test_content)
|
|
102
|
+
|
|
103
|
+
assert result.returncode == 0
|
|
104
|
+
assert test_content in result.stdout
|
|
105
|
+
|
|
106
|
+
def test_ccore_stdin_json_format(self):
|
|
107
|
+
"""Test ccore with stdin input and JSON format."""
|
|
108
|
+
test_content = "Stdin content with JSON format."
|
|
109
|
+
result = run_cli_command(["uv", "run", "ccore", "-f", "json"], input_data=test_content)
|
|
110
|
+
|
|
111
|
+
assert result.returncode == 0
|
|
112
|
+
|
|
113
|
+
# Verify it's valid JSON
|
|
114
|
+
output_data = json.loads(result.stdout)
|
|
115
|
+
assert test_content in output_data["content"]
|
|
116
|
+
|
|
117
|
+
def test_ccore_debug_flag(self):
|
|
118
|
+
"""Test ccore with debug flag."""
|
|
119
|
+
result = run_cli_command(["uv", "run", "ccore", "-d", "Debug test content."])
|
|
120
|
+
|
|
121
|
+
assert result.returncode == 0
|
|
122
|
+
assert "Debug test content." in result.stdout
|
|
123
|
+
# Debug output goes to stderr in loguru
|
|
124
|
+
|
|
125
|
+
def test_ccore_file_pdf(self, fixture_path):
|
|
126
|
+
"""Test ccore with PDF file."""
|
|
127
|
+
pdf_file = fixture_path / "file.pdf"
|
|
128
|
+
if not pdf_file.exists():
|
|
129
|
+
pytest.skip(f"Fixture file not found: {pdf_file}")
|
|
130
|
+
|
|
131
|
+
result = run_cli_command(["uv", "run", "ccore", str(pdf_file)])
|
|
132
|
+
|
|
133
|
+
assert result.returncode == 0
|
|
134
|
+
assert len(result.stdout.strip()) > 0
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class TestCcleanCLI:
|
|
138
|
+
"""Tests for the cclean CLI command."""
|
|
139
|
+
|
|
140
|
+
def test_cclean_text_input(self):
|
|
141
|
+
"""Test cclean with direct text input."""
|
|
142
|
+
messy_text = " This is messy text with extra spaces. "
|
|
143
|
+
result = run_cli_command(["uv", "run", "cclean", messy_text])
|
|
144
|
+
|
|
145
|
+
assert result.returncode == 0
|
|
146
|
+
cleaned = result.stdout.strip()
|
|
147
|
+
assert cleaned != messy_text
|
|
148
|
+
assert "This is messy text" in cleaned
|
|
149
|
+
|
|
150
|
+
def test_cclean_json_input(self):
|
|
151
|
+
"""Test cclean with JSON input containing content field."""
|
|
152
|
+
json_input = '{"content": " Messy JSON content "}'
|
|
153
|
+
result = run_cli_command(["uv", "run", "cclean"], input_data=json_input)
|
|
154
|
+
|
|
155
|
+
assert result.returncode == 0
|
|
156
|
+
cleaned = result.stdout.strip()
|
|
157
|
+
assert "Messy JSON content" in cleaned
|
|
158
|
+
|
|
159
|
+
def test_cclean_xml_input(self):
|
|
160
|
+
"""Test cclean with XML input containing content field."""
|
|
161
|
+
xml_input = '<root><content> Messy XML content </content></root>'
|
|
162
|
+
result = run_cli_command(["uv", "run", "cclean"], input_data=xml_input)
|
|
163
|
+
|
|
164
|
+
assert result.returncode == 0
|
|
165
|
+
cleaned = result.stdout.strip()
|
|
166
|
+
assert "Messy XML content" in cleaned
|
|
167
|
+
|
|
168
|
+
def test_cclean_file_input(self, fixture_path):
|
|
169
|
+
"""Test cclean with file input."""
|
|
170
|
+
txt_file = fixture_path / "file.txt"
|
|
171
|
+
if not txt_file.exists():
|
|
172
|
+
pytest.skip(f"Fixture file not found: {txt_file}")
|
|
173
|
+
|
|
174
|
+
result = run_cli_command(["uv", "run", "cclean", str(txt_file)])
|
|
175
|
+
|
|
176
|
+
assert result.returncode == 0
|
|
177
|
+
assert len(result.stdout.strip()) > 0
|
|
178
|
+
|
|
179
|
+
def test_cclean_url_input(self):
|
|
180
|
+
"""Test cclean with URL input."""
|
|
181
|
+
result = run_cli_command(["uv", "run", "cclean", "https://www.example.com"])
|
|
182
|
+
|
|
183
|
+
assert result.returncode == 0
|
|
184
|
+
assert len(result.stdout.strip()) > 0
|
|
185
|
+
|
|
186
|
+
def test_cclean_stdin_input(self):
|
|
187
|
+
"""Test cclean with stdin input."""
|
|
188
|
+
messy_content = " This has too many spaces and needs cleaning. "
|
|
189
|
+
result = run_cli_command(["uv", "run", "cclean"], input_data=messy_content)
|
|
190
|
+
|
|
191
|
+
assert result.returncode == 0
|
|
192
|
+
cleaned = result.stdout.strip()
|
|
193
|
+
assert "This has too many spaces" in cleaned
|
|
194
|
+
|
|
195
|
+
def test_cclean_debug_flag(self):
|
|
196
|
+
"""Test cclean with debug flag."""
|
|
197
|
+
result = run_cli_command(["uv", "run", "cclean", "-d", "Debug clean test."])
|
|
198
|
+
|
|
199
|
+
assert result.returncode == 0
|
|
200
|
+
assert "Debug clean test" in result.stdout
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class TestCsumCLI:
|
|
204
|
+
"""Tests for the csum CLI command."""
|
|
205
|
+
|
|
206
|
+
def test_csum_text_input(self):
|
|
207
|
+
"""Test csum with direct text input."""
|
|
208
|
+
long_text = "Artificial Intelligence is revolutionizing industries across the globe. From healthcare to finance, AI technologies are enabling automation, improving decision-making, and creating new possibilities for innovation."
|
|
209
|
+
result = run_cli_command(["uv", "run", "csum", long_text])
|
|
210
|
+
|
|
211
|
+
assert result.returncode == 0
|
|
212
|
+
summary = result.stdout.strip()
|
|
213
|
+
assert len(summary) > 0
|
|
214
|
+
assert len(summary) < len(long_text) # Summary should be shorter
|
|
215
|
+
|
|
216
|
+
def test_csum_with_context(self):
|
|
217
|
+
"""Test csum with context parameter."""
|
|
218
|
+
text = "Machine learning algorithms process vast amounts of data to identify patterns and make predictions."
|
|
219
|
+
context = "explain in simple terms"
|
|
220
|
+
result = run_cli_command(["uv", "run", "csum", "--context", context, text])
|
|
221
|
+
|
|
222
|
+
assert result.returncode == 0
|
|
223
|
+
summary = result.stdout.strip()
|
|
224
|
+
assert len(summary) > 0
|
|
225
|
+
|
|
226
|
+
def test_csum_file_input(self, fixture_path):
|
|
227
|
+
"""Test csum with file input."""
|
|
228
|
+
md_file = fixture_path / "file.md"
|
|
229
|
+
if not md_file.exists():
|
|
230
|
+
pytest.skip(f"Fixture file not found: {md_file}")
|
|
231
|
+
|
|
232
|
+
result = run_cli_command(["uv", "run", "csum", str(md_file)])
|
|
233
|
+
|
|
234
|
+
assert result.returncode == 0
|
|
235
|
+
assert len(result.stdout.strip()) > 0
|
|
236
|
+
|
|
237
|
+
def test_csum_url_input(self):
|
|
238
|
+
"""Test csum with URL input."""
|
|
239
|
+
result = run_cli_command(["uv", "run", "csum", "https://www.example.com"])
|
|
240
|
+
|
|
241
|
+
assert result.returncode == 0
|
|
242
|
+
assert len(result.stdout.strip()) > 0
|
|
243
|
+
|
|
244
|
+
def test_csum_json_input(self):
|
|
245
|
+
"""Test csum with JSON input containing content field."""
|
|
246
|
+
json_input = '{"content": "This is a long article about technology trends. It discusses various aspects of innovation, digital transformation, and the future of work in the digital age."}'
|
|
247
|
+
result = run_cli_command(["uv", "run", "csum"], input_data=json_input)
|
|
248
|
+
|
|
249
|
+
assert result.returncode == 0
|
|
250
|
+
summary = result.stdout.strip()
|
|
251
|
+
assert len(summary) > 0
|
|
252
|
+
|
|
253
|
+
def test_csum_xml_input(self):
|
|
254
|
+
"""Test csum with XML input containing content field."""
|
|
255
|
+
xml_input = '<article><content>This is a comprehensive guide to understanding cloud computing. It covers infrastructure, platforms, software services, and deployment models.</content></article>'
|
|
256
|
+
result = run_cli_command(["uv", "run", "csum"], input_data=xml_input)
|
|
257
|
+
|
|
258
|
+
assert result.returncode == 0
|
|
259
|
+
summary = result.stdout.strip()
|
|
260
|
+
assert len(summary) > 0
|
|
261
|
+
|
|
262
|
+
def test_csum_stdin_input(self):
|
|
263
|
+
"""Test csum with stdin input."""
|
|
264
|
+
long_content = "The Internet of Things (IoT) represents a network of interconnected devices that communicate and exchange data. This technology has applications in smart homes, industrial automation, healthcare monitoring, and environmental sensing. As IoT devices become more prevalent, they are transforming how we interact with our environment and creating new opportunities for data-driven insights."
|
|
265
|
+
result = run_cli_command(["uv", "run", "csum"], input_data=long_content)
|
|
266
|
+
|
|
267
|
+
assert result.returncode == 0
|
|
268
|
+
summary = result.stdout.strip()
|
|
269
|
+
assert len(summary) > 0
|
|
270
|
+
assert len(summary) < len(long_content)
|
|
271
|
+
|
|
272
|
+
def test_csum_context_bullet_points(self):
|
|
273
|
+
"""Test csum with bullet points context."""
|
|
274
|
+
text = "Blockchain technology provides a decentralized approach to data storage and transaction processing. It ensures security through cryptographic methods and maintains transparency through distributed ledgers."
|
|
275
|
+
result = run_cli_command(["uv", "run", "csum", "--context", "in bullet points", text])
|
|
276
|
+
|
|
277
|
+
assert result.returncode == 0
|
|
278
|
+
summary = result.stdout.strip()
|
|
279
|
+
assert len(summary) > 0
|
|
280
|
+
|
|
281
|
+
def test_csum_debug_flag(self):
|
|
282
|
+
"""Test csum with debug flag."""
|
|
283
|
+
result = run_cli_command(["uv", "run", "csum", "-d", "Debug summary test content."])
|
|
284
|
+
|
|
285
|
+
assert result.returncode == 0
|
|
286
|
+
assert len(result.stdout.strip()) > 0
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class TestCLIErrorHandling:
|
|
290
|
+
"""Tests for CLI error handling and edge cases."""
|
|
291
|
+
|
|
292
|
+
def test_ccore_empty_input_error(self):
|
|
293
|
+
"""Test ccore with empty input should error."""
|
|
294
|
+
result = run_cli_command(["uv", "run", "ccore", ""])
|
|
295
|
+
|
|
296
|
+
assert result.returncode != 0
|
|
297
|
+
|
|
298
|
+
def test_cclean_empty_input_error(self):
|
|
299
|
+
"""Test cclean with empty input should error."""
|
|
300
|
+
result = run_cli_command(["uv", "run", "cclean", ""])
|
|
301
|
+
|
|
302
|
+
assert result.returncode != 0
|
|
303
|
+
|
|
304
|
+
def test_csum_empty_input_error(self):
|
|
305
|
+
"""Test csum with empty input should error."""
|
|
306
|
+
result = run_cli_command(["uv", "run", "csum", ""])
|
|
307
|
+
|
|
308
|
+
assert result.returncode != 0
|
|
309
|
+
|
|
310
|
+
def test_ccore_invalid_format(self):
|
|
311
|
+
"""Test ccore with invalid format option."""
|
|
312
|
+
result = run_cli_command(["uv", "run", "ccore", "-f", "invalid", "test"])
|
|
313
|
+
|
|
314
|
+
assert result.returncode != 0
|
|
315
|
+
assert "invalid choice" in result.stderr.lower()
|
|
316
|
+
|
|
317
|
+
def test_ccore_nonexistent_file(self):
|
|
318
|
+
"""Test ccore with non-existent file."""
|
|
319
|
+
result = run_cli_command(["uv", "run", "ccore", "/path/to/nonexistent/file.txt"])
|
|
320
|
+
|
|
321
|
+
# Should not error but treat as text content
|
|
322
|
+
assert result.returncode == 0
|
|
323
|
+
assert "/path/to/nonexistent/file.txt" in result.stdout
|
|
324
|
+
|
|
325
|
+
def test_stdin_no_content_error(self):
|
|
326
|
+
"""Test CLI with no content and no stdin should error."""
|
|
327
|
+
# This is tricky to test as it involves TTY detection
|
|
328
|
+
# We'll skip this for now as it requires special handling
|
|
329
|
+
pass
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
class TestCLIIntegration:
|
|
333
|
+
"""Integration tests combining multiple CLI features."""
|
|
334
|
+
|
|
335
|
+
def test_pipeline_extract_clean_summarize(self, fixture_path):
|
|
336
|
+
"""Test a pipeline of extract -> clean -> summarize."""
|
|
337
|
+
md_file = fixture_path / "file.md"
|
|
338
|
+
if not md_file.exists():
|
|
339
|
+
pytest.skip(f"Fixture file not found: {md_file}")
|
|
340
|
+
|
|
341
|
+
# Extract content
|
|
342
|
+
extract_result = run_cli_command(["uv", "run", "ccore", str(md_file)])
|
|
343
|
+
assert extract_result.returncode == 0
|
|
344
|
+
|
|
345
|
+
# Clean extracted content
|
|
346
|
+
clean_result = run_cli_command(["uv", "run", "cclean"], input_data=extract_result.stdout)
|
|
347
|
+
assert clean_result.returncode == 0
|
|
348
|
+
|
|
349
|
+
# Summarize cleaned content
|
|
350
|
+
summary_result = run_cli_command(["uv", "run", "csum"], input_data=clean_result.stdout)
|
|
351
|
+
assert summary_result.returncode == 0
|
|
352
|
+
|
|
353
|
+
assert len(summary_result.stdout.strip()) > 0
|
|
354
|
+
|
|
355
|
+
def test_json_pipeline(self):
|
|
356
|
+
"""Test pipeline with JSON format."""
|
|
357
|
+
text = "This is a test for JSON pipeline processing."
|
|
358
|
+
|
|
359
|
+
# Extract as JSON
|
|
360
|
+
extract_result = run_cli_command(["uv", "run", "ccore", "-f", "json", text])
|
|
361
|
+
assert extract_result.returncode == 0
|
|
362
|
+
|
|
363
|
+
# Verify JSON output
|
|
364
|
+
json_data = json.loads(extract_result.stdout)
|
|
365
|
+
assert text in json_data["content"]
|
|
366
|
+
|
|
367
|
+
# Clean JSON content
|
|
368
|
+
clean_result = run_cli_command(["uv", "run", "cclean"], input_data=extract_result.stdout)
|
|
369
|
+
assert clean_result.returncode == 0
|
|
370
|
+
|
|
371
|
+
# Summarize cleaned content
|
|
372
|
+
summary_result = run_cli_command(["uv", "run", "csum"], input_data=clean_result.stdout)
|
|
373
|
+
assert summary_result.returncode == 0
|
|
374
|
+
|
|
375
|
+
def test_xml_processing(self):
|
|
376
|
+
"""Test XML format processing."""
|
|
377
|
+
text = "This is test content for XML processing and validation."
|
|
378
|
+
|
|
379
|
+
# Extract as XML
|
|
380
|
+
extract_result = run_cli_command(["uv", "run", "ccore", "-f", "xml", text])
|
|
381
|
+
assert extract_result.returncode == 0
|
|
382
|
+
|
|
383
|
+
# Verify XML output
|
|
384
|
+
root = ET.fromstring(extract_result.stdout.strip())
|
|
385
|
+
content_elem = root.find(".//content")
|
|
386
|
+
assert content_elem is not None
|
|
387
|
+
assert text in content_elem.text
|
|
388
|
+
|
|
389
|
+
# Process XML content through clean and summarize
|
|
390
|
+
clean_result = run_cli_command(["uv", "run", "cclean"], input_data=extract_result.stdout)
|
|
391
|
+
assert clean_result.returncode == 0
|
|
392
|
+
|
|
393
|
+
summary_result = run_cli_command(["uv", "run", "csum", "--context", "one sentence"], input_data=clean_result.stdout)
|
|
394
|
+
assert summary_result.returncode == 0
|
|
@@ -26,7 +26,7 @@ async def test_extract_content_from_text():
|
|
|
26
26
|
async def test_extract_content_from_url(fixture_path):
|
|
27
27
|
"""Tests content extraction from a URL."""
|
|
28
28
|
# Using a known URL from the notebook example
|
|
29
|
-
input_data = {"url": "https://www.supernovalabs.com", "
|
|
29
|
+
input_data = {"url": "https://www.supernovalabs.com", "url_engine": "simple"}
|
|
30
30
|
result = await extract_content(input_data)
|
|
31
31
|
|
|
32
32
|
assert hasattr(result, "source_type")
|
|
@@ -41,8 +41,13 @@ async def test_extract_content_from_url(fixture_path):
|
|
|
41
41
|
@pytest.mark.asyncio
|
|
42
42
|
async def test_extract_content_from_url_firecrawl(fixture_path):
|
|
43
43
|
"""Tests content extraction from a URL."""
|
|
44
|
+
try:
|
|
45
|
+
import firecrawl
|
|
46
|
+
except ImportError:
|
|
47
|
+
pytest.skip("Firecrawl not installed")
|
|
48
|
+
|
|
44
49
|
# Using a known URL from the notebook example
|
|
45
|
-
input_data = {"url": "https://www.supernovalabs.com", "
|
|
50
|
+
input_data = {"url": "https://www.supernovalabs.com", "url_engine": "firecrawl"}
|
|
46
51
|
result = await extract_content(input_data)
|
|
47
52
|
|
|
48
53
|
assert hasattr(result, "source_type")
|
|
@@ -58,7 +63,7 @@ async def test_extract_content_from_url_firecrawl(fixture_path):
|
|
|
58
63
|
async def test_extract_content_from_url_jina(fixture_path):
|
|
59
64
|
"""Tests content extraction from a URL."""
|
|
60
65
|
# Using a known URL from the notebook example
|
|
61
|
-
input_data = {"url": "https://www.supernovalabs.com", "
|
|
66
|
+
input_data = {"url": "https://www.supernovalabs.com", "url_engine": "jina"}
|
|
62
67
|
result = await extract_content(input_data)
|
|
63
68
|
|
|
64
69
|
assert hasattr(result, "source_type")
|
|
@@ -222,7 +227,7 @@ async def test_extract_content_from_xlsx(fixture_path):
|
|
|
222
227
|
if not xlsx_file.exists():
|
|
223
228
|
pytest.skip(f"Fixture file not found: {xlsx_file}")
|
|
224
229
|
|
|
225
|
-
result = await extract_content(dict(file_path=str(xlsx_file),
|
|
230
|
+
result = await extract_content(dict(file_path=str(xlsx_file), document_engine="simple"))
|
|
226
231
|
|
|
227
232
|
assert result.source_type == "file"
|
|
228
233
|
assert (
|
|
@@ -240,7 +245,7 @@ async def test_extract_content_from_xlsx(fixture_path):
|
|
|
240
245
|
# if not xlsx_file.exists():
|
|
241
246
|
# pytest.skip(f"Fixture file not found: {xlsx_file}")
|
|
242
247
|
|
|
243
|
-
# result = await extract_content(dict(file_path=str(xlsx_file),
|
|
248
|
+
# result = await extract_content(dict(file_path=str(xlsx_file), document_engine="docling"))
|
|
244
249
|
|
|
245
250
|
# assert result.source_type == "file"
|
|
246
251
|
# assert (
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
from typing import Literal
|
|
2
|
-
import warnings
|
|
3
|
-
|
|
4
|
-
Engine = Literal[
|
|
5
|
-
"auto",
|
|
6
|
-
"simple",
|
|
7
|
-
"legacy",
|
|
8
|
-
"firecrawl",
|
|
9
|
-
"jina",
|
|
10
|
-
"docling",
|
|
11
|
-
]
|
|
12
|
-
|
|
13
|
-
DEPRECATED_ENGINES = {"legacy": "simple"}
|
|
14
|
-
|
|
15
|
-
def warn_if_deprecated_engine(engine: str):
|
|
16
|
-
if engine in DEPRECATED_ENGINES:
|
|
17
|
-
warnings.warn(
|
|
18
|
-
f"Engine '{engine}' is deprecated and will be removed in a future release. Use '{DEPRECATED_ENGINES[engine]}' instead.",
|
|
19
|
-
DeprecationWarning,
|
|
20
|
-
stacklevel=2,
|
|
21
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{content_core-0.8.5 → content_core-1.0.0}/src/content_core/content/identification/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|