content-core 0.8.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/__init__.py +1 -1
- content_core/cc_config.yaml +2 -1
- content_core/common/state.py +9 -5
- content_core/common/types.py +7 -14
- content_core/config.py +7 -3
- content_core/content/extraction/graph.py +6 -6
- content_core/processors/url.py +5 -13
- {content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/METADATA +17 -7
- {content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/RECORD +12 -12
- {content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/WHEEL +0 -0
- {content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/entry_points.txt +0 -0
- {content_core-0.8.5.dist-info → content_core-1.0.0.dist-info}/licenses/LICENSE +0 -0
content_core/__init__.py
CHANGED
|
@@ -113,7 +113,7 @@ async def ccore_main():
|
|
|
113
113
|
if args.format == "xml":
|
|
114
114
|
result = dicttoxml(
|
|
115
115
|
result.model_dump(), custom_root="result", attr_type=False
|
|
116
|
-
)
|
|
116
|
+
).decode('utf-8')
|
|
117
117
|
elif args.format == "json":
|
|
118
118
|
result = result.model_dump_json()
|
|
119
119
|
else: # text
|
content_core/cc_config.yaml
CHANGED
|
@@ -30,7 +30,8 @@ summary_model:
|
|
|
30
30
|
max_tokens: 2000
|
|
31
31
|
|
|
32
32
|
extraction:
|
|
33
|
-
|
|
33
|
+
document_engine: auto # auto | simple | docling - for files/documents
|
|
34
|
+
url_engine: auto # auto | simple | firecrawl | jina - for URLs
|
|
34
35
|
docling:
|
|
35
36
|
output_format: markdown # markdown | html | json
|
|
36
37
|
|
content_core/common/state.py
CHANGED
|
@@ -2,8 +2,7 @@ from typing import Optional
|
|
|
2
2
|
|
|
3
3
|
from pydantic import BaseModel, Field
|
|
4
4
|
|
|
5
|
-
from content_core.common.types import
|
|
6
|
-
from content_core.common.types import Engine
|
|
5
|
+
from content_core.common.types import DocumentEngine, UrlEngine
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
class ProcessSourceState(BaseModel):
|
|
@@ -16,9 +15,13 @@ class ProcessSourceState(BaseModel):
|
|
|
16
15
|
identified_provider: Optional[str] = ""
|
|
17
16
|
metadata: Optional[dict] = Field(default_factory=lambda: {})
|
|
18
17
|
content: Optional[str] = ""
|
|
19
|
-
|
|
18
|
+
document_engine: Optional[DocumentEngine] = Field(
|
|
20
19
|
default=None,
|
|
21
|
-
description="Override extraction engine: 'auto', 'simple',
|
|
20
|
+
description="Override document extraction engine: 'auto', 'simple', or 'docling'",
|
|
21
|
+
)
|
|
22
|
+
url_engine: Optional[UrlEngine] = Field(
|
|
23
|
+
default=None,
|
|
24
|
+
description="Override URL extraction engine: 'auto', 'simple', 'firecrawl', 'jina', or 'docling'",
|
|
22
25
|
)
|
|
23
26
|
output_format: Optional[str] = Field(
|
|
24
27
|
default=None,
|
|
@@ -30,7 +33,8 @@ class ProcessSourceInput(BaseModel):
|
|
|
30
33
|
content: Optional[str] = ""
|
|
31
34
|
file_path: Optional[str] = ""
|
|
32
35
|
url: Optional[str] = ""
|
|
33
|
-
|
|
36
|
+
document_engine: Optional[str] = None
|
|
37
|
+
url_engine: Optional[str] = None
|
|
34
38
|
output_format: Optional[str] = None
|
|
35
39
|
|
|
36
40
|
|
content_core/common/types.py
CHANGED
|
@@ -1,21 +1,14 @@
|
|
|
1
1
|
from typing import Literal
|
|
2
|
-
import warnings
|
|
3
2
|
|
|
4
|
-
|
|
3
|
+
DocumentEngine = Literal[
|
|
4
|
+
"auto",
|
|
5
|
+
"simple",
|
|
6
|
+
"docling",
|
|
7
|
+
]
|
|
8
|
+
|
|
9
|
+
UrlEngine = Literal[
|
|
5
10
|
"auto",
|
|
6
11
|
"simple",
|
|
7
|
-
"legacy",
|
|
8
12
|
"firecrawl",
|
|
9
13
|
"jina",
|
|
10
|
-
"docling",
|
|
11
14
|
]
|
|
12
|
-
|
|
13
|
-
DEPRECATED_ENGINES = {"legacy": "simple"}
|
|
14
|
-
|
|
15
|
-
def warn_if_deprecated_engine(engine: str):
|
|
16
|
-
if engine in DEPRECATED_ENGINES:
|
|
17
|
-
warnings.warn(
|
|
18
|
-
f"Engine '{engine}' is deprecated and will be removed in a future release. Use '{DEPRECATED_ENGINES[engine]}' instead.",
|
|
19
|
-
DeprecationWarning,
|
|
20
|
-
stacklevel=2,
|
|
21
|
-
)
|
content_core/config.py
CHANGED
|
@@ -35,9 +35,13 @@ def load_config():
|
|
|
35
35
|
CONFIG = load_config()
|
|
36
36
|
|
|
37
37
|
# Programmatic config overrides: use in notebooks or scripts
|
|
38
|
-
def
|
|
39
|
-
"""Override the extraction engine ('
|
|
40
|
-
CONFIG.setdefault("extraction", {})["
|
|
38
|
+
def set_document_engine(engine: str):
|
|
39
|
+
"""Override the document extraction engine ('auto', 'simple', or 'docling')."""
|
|
40
|
+
CONFIG.setdefault("extraction", {})["document_engine"] = engine
|
|
41
|
+
|
|
42
|
+
def set_url_engine(engine: str):
|
|
43
|
+
"""Override the URL extraction engine ('auto', 'simple', 'firecrawl', or 'jina')."""
|
|
44
|
+
CONFIG.setdefault("extraction", {})["url_engine"] = engine
|
|
41
45
|
|
|
42
46
|
def set_docling_output_format(fmt: str):
|
|
43
47
|
"""Override Docling output_format ('markdown', 'html', or 'json')."""
|
|
@@ -12,7 +12,6 @@ from content_core.common import (
|
|
|
12
12
|
ProcessSourceState,
|
|
13
13
|
UnsupportedTypeException,
|
|
14
14
|
)
|
|
15
|
-
from content_core.common.types import warn_if_deprecated_engine
|
|
16
15
|
from content_core.config import CONFIG # type: ignore
|
|
17
16
|
from content_core.logging import logger
|
|
18
17
|
from content_core.processors.audio import extract_audio_data # type: ignore
|
|
@@ -124,11 +123,10 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
|
|
|
124
123
|
async def file_type_router_docling(state: ProcessSourceState) -> str:
|
|
125
124
|
"""
|
|
126
125
|
Route to Docling if enabled and supported; otherwise use simple file type edge.
|
|
127
|
-
Supports 'auto', 'docling',
|
|
128
|
-
'auto' tries
|
|
126
|
+
Supports 'auto', 'docling', and 'simple'.
|
|
127
|
+
'auto' tries docling first, then falls back to simple if docling fails.
|
|
129
128
|
"""
|
|
130
|
-
engine = state.
|
|
131
|
-
warn_if_deprecated_engine(engine)
|
|
129
|
+
engine = state.document_engine or CONFIG.get("extraction", {}).get("document_engine", "auto")
|
|
132
130
|
if engine == "auto":
|
|
133
131
|
logger.debug("Using auto engine")
|
|
134
132
|
# Try docling first; if it fails or is not supported, fallback to simple
|
|
@@ -147,7 +145,7 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
|
|
|
147
145
|
if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
|
|
148
146
|
logger.debug("Using docling engine")
|
|
149
147
|
return "extract_docling"
|
|
150
|
-
# For 'simple'
|
|
148
|
+
# For 'simple', use the default file type edge
|
|
151
149
|
logger.debug("Using simple engine")
|
|
152
150
|
return await file_type_edge(state)
|
|
153
151
|
|
|
@@ -196,8 +194,10 @@ workflow.add_conditional_edges(
|
|
|
196
194
|
for m in list(SUPPORTED_FITZ_TYPES)
|
|
197
195
|
+ list(SUPPORTED_OFFICE_TYPES)
|
|
198
196
|
+ list(DOCLING_SUPPORTED)
|
|
197
|
+
if m not in ["text/html"] # Exclude HTML from file download, treat as web content
|
|
199
198
|
},
|
|
200
199
|
"article": "extract_url",
|
|
200
|
+
"text/html": "extract_url", # Route HTML content to URL extraction
|
|
201
201
|
"youtube": "extract_youtube_transcript",
|
|
202
202
|
},
|
|
203
203
|
)
|
content_core/processors/url.py
CHANGED
|
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
|
|
5
5
|
from readability import Document
|
|
6
6
|
|
|
7
7
|
from content_core.common import ProcessSourceState
|
|
8
|
-
from content_core.
|
|
8
|
+
from content_core.config import CONFIG
|
|
9
9
|
from content_core.logging import logger
|
|
10
10
|
from content_core.processors.docling import DOCLING_SUPPORTED
|
|
11
11
|
from content_core.processors.office import SUPPORTED_OFFICE_TYPES
|
|
@@ -160,13 +160,12 @@ async def extract_url_firecrawl(url: str):
|
|
|
160
160
|
|
|
161
161
|
async def extract_url(state: ProcessSourceState):
|
|
162
162
|
"""
|
|
163
|
-
Extract content from a URL using the
|
|
164
|
-
Supported engines: 'auto', 'simple', '
|
|
163
|
+
Extract content from a URL using the url_engine specified in the state.
|
|
164
|
+
Supported engines: 'auto', 'simple', 'firecrawl', 'jina'.
|
|
165
165
|
"""
|
|
166
166
|
assert state.url, "No URL provided"
|
|
167
167
|
url = state.url
|
|
168
|
-
engine = state.
|
|
169
|
-
warn_if_deprecated_engine(engine)
|
|
168
|
+
engine = state.url_engine or CONFIG.get("extraction", {}).get("url_engine", "auto")
|
|
170
169
|
try:
|
|
171
170
|
if engine == "auto":
|
|
172
171
|
if os.environ.get("FIRECRAWL_API_KEY"):
|
|
@@ -182,19 +181,12 @@ async def extract_url(state: ProcessSourceState):
|
|
|
182
181
|
logger.error(f"Jina extraction error for URL: {url}: {e}")
|
|
183
182
|
logger.debug("Falling back to BeautifulSoup")
|
|
184
183
|
return await extract_url_bs4(url)
|
|
185
|
-
elif engine == "simple"
|
|
186
|
-
# 'legacy' is deprecated alias for 'simple'
|
|
184
|
+
elif engine == "simple":
|
|
187
185
|
return await extract_url_bs4(url)
|
|
188
186
|
elif engine == "firecrawl":
|
|
189
187
|
return await extract_url_firecrawl(url)
|
|
190
188
|
elif engine == "jina":
|
|
191
189
|
return await extract_url_jina(url)
|
|
192
|
-
elif engine == "docling":
|
|
193
|
-
from content_core.processors.docling import extract_with_docling
|
|
194
|
-
|
|
195
|
-
state.url = url
|
|
196
|
-
result_state = await extract_with_docling(state)
|
|
197
|
-
return {"title": None, "content": result_state.content}
|
|
198
190
|
else:
|
|
199
191
|
raise ValueError(f"Unknown engine: {engine}")
|
|
200
192
|
except Exception as e:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -234,12 +234,18 @@ async def main():
|
|
|
234
234
|
md_data = await extract_content({"file_path": "path/to/your/document.md"})
|
|
235
235
|
print(md_data)
|
|
236
236
|
|
|
237
|
-
# Per-execution override with Docling
|
|
237
|
+
# Per-execution override with Docling for documents
|
|
238
238
|
doc_data = await extract_content({
|
|
239
239
|
"file_path": "path/to/your/document.pdf",
|
|
240
|
-
"
|
|
240
|
+
"document_engine": "docling",
|
|
241
241
|
"output_format": "html"
|
|
242
242
|
})
|
|
243
|
+
|
|
244
|
+
# Per-execution override with Firecrawl for URLs
|
|
245
|
+
url_data = await extract_content({
|
|
246
|
+
"url": "https://www.example.com",
|
|
247
|
+
"url_engine": "firecrawl"
|
|
248
|
+
})
|
|
243
249
|
print(doc_data)
|
|
244
250
|
|
|
245
251
|
if __name__ == "__main__":
|
|
@@ -262,7 +268,8 @@ Docling is not the default engine when parsing documents. If you don't want to u
|
|
|
262
268
|
In your `cc_config.yaml` or custom config, set:
|
|
263
269
|
```yaml
|
|
264
270
|
extraction:
|
|
265
|
-
|
|
271
|
+
document_engine: docling # 'auto' (default), 'simple', or 'docling'
|
|
272
|
+
url_engine: auto # 'auto' (default), 'simple', 'firecrawl', or 'jina'
|
|
266
273
|
docling:
|
|
267
274
|
output_format: markdown # markdown | html | json
|
|
268
275
|
```
|
|
@@ -270,10 +277,13 @@ extraction:
|
|
|
270
277
|
#### Programmatically in Python
|
|
271
278
|
|
|
272
279
|
```python
|
|
273
|
-
from content_core.config import
|
|
280
|
+
from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
|
|
281
|
+
|
|
282
|
+
# switch document engine to Docling
|
|
283
|
+
set_document_engine("docling")
|
|
274
284
|
|
|
275
|
-
# switch engine to
|
|
276
|
-
|
|
285
|
+
# switch URL engine to Firecrawl
|
|
286
|
+
set_url_engine("firecrawl")
|
|
277
287
|
|
|
278
288
|
# choose output format: 'markdown', 'html', or 'json'
|
|
279
289
|
set_docling_output_format("html")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
content_core/__init__.py,sha256=
|
|
2
|
-
content_core/cc_config.yaml,sha256=
|
|
3
|
-
content_core/config.py,sha256
|
|
1
|
+
content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
|
|
2
|
+
content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
|
|
3
|
+
content_core/config.py,sha256=vbRgJy8lOTZABeY7GZc7MglNYwBQYpUNzu76kprv_c0,1854
|
|
4
4
|
content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
|
|
5
5
|
content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
|
|
6
6
|
content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
|
|
@@ -8,14 +8,14 @@ content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
|
|
|
8
8
|
content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
|
|
9
9
|
content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
|
|
10
10
|
content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
|
|
11
|
-
content_core/common/state.py,sha256=
|
|
12
|
-
content_core/common/types.py,sha256=
|
|
11
|
+
content_core/common/state.py,sha256=K5jsDg4l2GSaoGyFYzdd1GW14vLaAxdxes8vUrPNVkE,1622
|
|
12
|
+
content_core/common/types.py,sha256=DOQFW5ySHELc_mZU6G_7PUy1kmnP4aU4IpMyyXDQcBE,177
|
|
13
13
|
content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
|
|
14
14
|
content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeFiQM,230
|
|
15
15
|
content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
|
|
16
16
|
content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
|
|
17
17
|
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
18
|
-
content_core/content/extraction/graph.py,sha256=
|
|
18
|
+
content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8YnijESGvnD0,7605
|
|
19
19
|
content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
|
|
20
20
|
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
21
21
|
content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
|
|
@@ -25,15 +25,15 @@ content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMgua
|
|
|
25
25
|
content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
|
|
26
26
|
content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
|
|
27
27
|
content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
|
|
28
|
-
content_core/processors/url.py,sha256=
|
|
28
|
+
content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
|
|
29
29
|
content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
|
|
30
30
|
content_core/processors/youtube.py,sha256=g_A-rv5bzq2GIuwqMH70YAnDK-4BZqpgQP0IQ3j9zXE,6340
|
|
31
31
|
content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
|
|
32
32
|
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
33
33
|
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
34
34
|
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
35
|
-
content_core-0.
|
|
36
|
-
content_core-0.
|
|
37
|
-
content_core-0.
|
|
38
|
-
content_core-0.
|
|
39
|
-
content_core-0.
|
|
35
|
+
content_core-1.0.0.dist-info/METADATA,sha256=0TBaT17WQQ3u3YKX4dZYGXLkvnnyFwuxe1Z5uHQr9rQ,11819
|
|
36
|
+
content_core-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
37
|
+
content_core-1.0.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
|
|
38
|
+
content_core-1.0.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
39
|
+
content_core-1.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|