content-core 0.8.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

content_core/__init__.py CHANGED
@@ -113,7 +113,7 @@ async def ccore_main():
113
113
  if args.format == "xml":
114
114
  result = dicttoxml(
115
115
  result.model_dump(), custom_root="result", attr_type=False
116
- )
116
+ ).decode('utf-8')
117
117
  elif args.format == "json":
118
118
  result = result.model_dump_json()
119
119
  else: # text
@@ -30,7 +30,8 @@ summary_model:
30
30
  max_tokens: 2000
31
31
 
32
32
  extraction:
33
- engine: legacy # change to 'docling' to enable Docling engine
33
+ document_engine: auto # auto | simple | docling - for files/documents
34
+ url_engine: auto # auto | simple | firecrawl | jina - for URLs
34
35
  docling:
35
36
  output_format: markdown # markdown | html | json
36
37
 
@@ -2,8 +2,7 @@ from typing import Optional
2
2
 
3
3
  from pydantic import BaseModel, Field
4
4
 
5
- from content_core.common.types import Engine
6
- from content_core.common.types import Engine
5
+ from content_core.common.types import DocumentEngine, UrlEngine
7
6
 
8
7
 
9
8
  class ProcessSourceState(BaseModel):
@@ -16,9 +15,13 @@ class ProcessSourceState(BaseModel):
16
15
  identified_provider: Optional[str] = ""
17
16
  metadata: Optional[dict] = Field(default_factory=lambda: {})
18
17
  content: Optional[str] = ""
19
- engine: Optional[Engine] = Field(
18
+ document_engine: Optional[DocumentEngine] = Field(
20
19
  default=None,
21
- description="Override extraction engine: 'auto', 'simple', 'legacy', 'firecrawl', 'jina', or 'docling'",
20
+ description="Override document extraction engine: 'auto', 'simple', or 'docling'",
21
+ )
22
+ url_engine: Optional[UrlEngine] = Field(
23
+ default=None,
24
+ description="Override URL extraction engine: 'auto', 'simple', 'firecrawl', or 'jina'",
22
25
  )
23
26
  output_format: Optional[str] = Field(
24
27
  default=None,
@@ -30,7 +33,8 @@ class ProcessSourceInput(BaseModel):
30
33
  content: Optional[str] = ""
31
34
  file_path: Optional[str] = ""
32
35
  url: Optional[str] = ""
33
- engine: Optional[str] = None
36
+ document_engine: Optional[str] = None
37
+ url_engine: Optional[str] = None
34
38
  output_format: Optional[str] = None
35
39
 
36
40
 
@@ -1,21 +1,14 @@
1
1
  from typing import Literal
2
- import warnings
3
2
 
4
- Engine = Literal[
3
+ DocumentEngine = Literal[
4
+ "auto",
5
+ "simple",
6
+ "docling",
7
+ ]
8
+
9
+ UrlEngine = Literal[
5
10
  "auto",
6
11
  "simple",
7
- "legacy",
8
12
  "firecrawl",
9
13
  "jina",
10
- "docling",
11
14
  ]
12
-
13
- DEPRECATED_ENGINES = {"legacy": "simple"}
14
-
15
- def warn_if_deprecated_engine(engine: str):
16
- if engine in DEPRECATED_ENGINES:
17
- warnings.warn(
18
- f"Engine '{engine}' is deprecated and will be removed in a future release. Use '{DEPRECATED_ENGINES[engine]}' instead.",
19
- DeprecationWarning,
20
- stacklevel=2,
21
- )
content_core/config.py CHANGED
@@ -35,9 +35,13 @@ def load_config():
35
35
  CONFIG = load_config()
36
36
 
37
37
  # Programmatic config overrides: use in notebooks or scripts
38
- def set_extraction_engine(engine: str):
39
- """Override the extraction engine ('legacy' or 'docling')."""
40
- CONFIG.setdefault("extraction", {})["engine"] = engine
38
+ def set_document_engine(engine: str):
39
+ """Override the document extraction engine ('auto', 'simple', or 'docling')."""
40
+ CONFIG.setdefault("extraction", {})["document_engine"] = engine
41
+
42
+ def set_url_engine(engine: str):
43
+ """Override the URL extraction engine ('auto', 'simple', 'firecrawl', or 'jina')."""
44
+ CONFIG.setdefault("extraction", {})["url_engine"] = engine
41
45
 
42
46
  def set_docling_output_format(fmt: str):
43
47
  """Override Docling output_format ('markdown', 'html', or 'json')."""
@@ -12,7 +12,6 @@ from content_core.common import (
12
12
  ProcessSourceState,
13
13
  UnsupportedTypeException,
14
14
  )
15
- from content_core.common.types import warn_if_deprecated_engine
16
15
  from content_core.config import CONFIG # type: ignore
17
16
  from content_core.logging import logger
18
17
  from content_core.processors.audio import extract_audio_data # type: ignore
@@ -124,11 +123,10 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
124
123
  async def file_type_router_docling(state: ProcessSourceState) -> str:
125
124
  """
126
125
  Route to Docling if enabled and supported; otherwise use simple file type edge.
127
- Supports 'auto', 'docling', 'simple', and 'legacy' (deprecated, alias for simple).
128
- 'auto' tries simple first, then falls back to docling if simple fails.
126
+ Supports 'auto', 'docling', and 'simple'.
127
+ 'auto' tries docling first, then falls back to simple if docling fails.
129
128
  """
130
- engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
131
- warn_if_deprecated_engine(engine)
129
+ engine = state.document_engine or CONFIG.get("extraction", {}).get("document_engine", "auto")
132
130
  if engine == "auto":
133
131
  logger.debug("Using auto engine")
134
132
  # Try docling first; if it fails or is not supported, fallback to simple
@@ -147,7 +145,7 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
147
145
  if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
148
146
  logger.debug("Using docling engine")
149
147
  return "extract_docling"
150
- # For 'simple' and 'legacy', use the default file type edge
148
+ # For 'simple', use the default file type edge
151
149
  logger.debug("Using simple engine")
152
150
  return await file_type_edge(state)
153
151
 
@@ -196,8 +194,10 @@ workflow.add_conditional_edges(
196
194
  for m in list(SUPPORTED_FITZ_TYPES)
197
195
  + list(SUPPORTED_OFFICE_TYPES)
198
196
  + list(DOCLING_SUPPORTED)
197
+ if m not in ["text/html"] # Exclude HTML from file download, treat as web content
199
198
  },
200
199
  "article": "extract_url",
200
+ "text/html": "extract_url", # Route HTML content to URL extraction
201
201
  "youtube": "extract_youtube_transcript",
202
202
  },
203
203
  )
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
5
5
  from readability import Document
6
6
 
7
7
  from content_core.common import ProcessSourceState
8
- from content_core.common.types import warn_if_deprecated_engine
8
+ from content_core.config import CONFIG
9
9
  from content_core.logging import logger
10
10
  from content_core.processors.docling import DOCLING_SUPPORTED
11
11
  from content_core.processors.office import SUPPORTED_OFFICE_TYPES
@@ -160,13 +160,12 @@ async def extract_url_firecrawl(url: str):
160
160
 
161
161
  async def extract_url(state: ProcessSourceState):
162
162
  """
163
- Extract content from a URL using the engine specified in the state.
164
- Supported engines: 'auto', 'simple', 'legacy' (deprecated), 'firecrawl', 'jina'.
163
+ Extract content from a URL using the url_engine specified in the state.
164
+ Supported engines: 'auto', 'simple', 'firecrawl', 'jina'.
165
165
  """
166
166
  assert state.url, "No URL provided"
167
167
  url = state.url
168
- engine = state.engine or "auto"
169
- warn_if_deprecated_engine(engine)
168
+ engine = state.url_engine or CONFIG.get("extraction", {}).get("url_engine", "auto")
170
169
  try:
171
170
  if engine == "auto":
172
171
  if os.environ.get("FIRECRAWL_API_KEY"):
@@ -182,19 +181,12 @@ async def extract_url(state: ProcessSourceState):
182
181
  logger.error(f"Jina extraction error for URL: {url}: {e}")
183
182
  logger.debug("Falling back to BeautifulSoup")
184
183
  return await extract_url_bs4(url)
185
- elif engine == "simple" or engine == "legacy":
186
- # 'legacy' is deprecated alias for 'simple'
184
+ elif engine == "simple":
187
185
  return await extract_url_bs4(url)
188
186
  elif engine == "firecrawl":
189
187
  return await extract_url_firecrawl(url)
190
188
  elif engine == "jina":
191
189
  return await extract_url_jina(url)
192
- elif engine == "docling":
193
- from content_core.processors.docling import extract_with_docling
194
-
195
- state.url = url
196
- result_state = await extract_with_docling(state)
197
- return {"title": None, "content": result_state.content}
198
190
  else:
199
191
  raise ValueError(f"Unknown engine: {engine}")
200
192
  except Exception as e:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.8.5
3
+ Version: 1.0.0
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -234,12 +234,18 @@ async def main():
234
234
  md_data = await extract_content({"file_path": "path/to/your/document.md"})
235
235
  print(md_data)
236
236
 
237
- # Per-execution override with Docling
237
+ # Per-execution override with Docling for documents
238
238
  doc_data = await extract_content({
239
239
  "file_path": "path/to/your/document.pdf",
240
- "engine": "docling",
240
+ "document_engine": "docling",
241
241
  "output_format": "html"
242
242
  })
243
+
244
+ # Per-execution override with Firecrawl for URLs
245
+ url_data = await extract_content({
246
+ "url": "https://www.example.com",
247
+ "url_engine": "firecrawl"
248
+ })
243
249
  print(doc_data)
244
250
 
245
251
  if __name__ == "__main__":
@@ -262,7 +268,8 @@ Docling is not the default engine when parsing documents. If you don't want to u
262
268
  In your `cc_config.yaml` or custom config, set:
263
269
  ```yaml
264
270
  extraction:
265
- engine: docling # 'legacy' (default) or 'docling'
271
+ document_engine: docling # 'auto' (default), 'simple', or 'docling'
272
+ url_engine: auto # 'auto' (default), 'simple', 'firecrawl', or 'jina'
266
273
  docling:
267
274
  output_format: markdown # markdown | html | json
268
275
  ```
@@ -270,10 +277,13 @@ extraction:
270
277
  #### Programmatically in Python
271
278
 
272
279
  ```python
273
- from content_core.config import set_extraction_engine, set_docling_output_format
280
+ from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
281
+
282
+ # switch document engine to Docling
283
+ set_document_engine("docling")
274
284
 
275
- # switch engine to Docling
276
- set_extraction_engine("docling")
285
+ # switch URL engine to Firecrawl
286
+ set_url_engine("firecrawl")
277
287
 
278
288
  # choose output format: 'markdown', 'html', or 'json'
279
289
  set_docling_output_format("html")
@@ -1,6 +1,6 @@
1
- content_core/__init__.py,sha256=ANKeslNXOGumwrkjqgRik23e5PdGps2C0FSup8_XH2Y,6515
2
- content_core/cc_config.yaml,sha256=tfbnJ4h9DWuJUljJrnz72s6TD24hD5P-uEPA9K_pNVY,767
3
- content_core/config.py,sha256=-aUsTB6Z3fa_XIWdHNXhMgWkVLWjEW1kfyQXXB_-j54,1632
1
+ content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
2
+ content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
3
+ content_core/config.py,sha256=vbRgJy8lOTZABeY7GZc7MglNYwBQYpUNzu76kprv_c0,1854
4
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
5
5
  content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
6
6
  content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
@@ -8,14 +8,14 @@ content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
8
8
  content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
9
9
  content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
10
10
  content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
11
- content_core/common/state.py,sha256=pO8Oq71KxznlZ4K5qUVfyLrNsZWd2yMO9bXKmrTIXQo,1427
12
- content_core/common/types.py,sha256=FpIzYadBvafGI4e1EuwGjjiPuawL1HitxsQOciNjTZo,497
11
+ content_core/common/state.py,sha256=K5jsDg4l2GSaoGyFYzdd1GW14vLaAxdxes8vUrPNVkE,1622
12
+ content_core/common/types.py,sha256=DOQFW5ySHELc_mZU6G_7PUy1kmnP4aU4IpMyyXDQcBE,177
13
13
  content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
14
14
  content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeFiQM,230
15
15
  content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
16
16
  content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
17
17
  content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
18
- content_core/content/extraction/graph.py,sha256=Z8IqcFQmWLJG44jJ4399mBDQVMH-mYuQQpBDHTBUEe0,7571
18
+ content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8YnijESGvnD0,7605
19
19
  content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
20
20
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
21
21
  content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
@@ -25,15 +25,15 @@ content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMgua
25
25
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
26
26
  content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
27
27
  content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
28
- content_core/processors/url.py,sha256=qdtEIhZpi62zMXbwbCmmh86ySoomscwqxHdFib7QC-M,7898
28
+ content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
29
29
  content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
30
30
  content_core/processors/youtube.py,sha256=g_A-rv5bzq2GIuwqMH70YAnDK-4BZqpgQP0IQ3j9zXE,6340
31
31
  content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
32
32
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
33
33
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
34
34
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
35
- content_core-0.8.5.dist-info/METADATA,sha256=rba5vG3Vkm5WRKHfbTDay5xK4JD4kbPNFow9AoTNHDE,11439
36
- content_core-0.8.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
37
- content_core-0.8.5.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
38
- content_core-0.8.5.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
39
- content_core-0.8.5.dist-info/RECORD,,
35
+ content_core-1.0.0.dist-info/METADATA,sha256=0TBaT17WQQ3u3YKX4dZYGXLkvnnyFwuxe1Z5uHQr9rQ,11819
36
+ content_core-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
37
+ content_core-1.0.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
38
+ content_core-1.0.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
39
+ content_core-1.0.0.dist-info/RECORD,,