content-core 0.8.5__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (63)
  1. {content_core-0.8.5 → content_core-1.0.1}/.gitignore +2 -1
  2. {content_core-0.8.5 → content_core-1.0.1}/PKG-INFO +17 -7
  3. {content_core-0.8.5 → content_core-1.0.1}/README.md +16 -6
  4. {content_core-0.8.5 → content_core-1.0.1}/docs/processors.md +12 -8
  5. {content_core-0.8.5 → content_core-1.0.1}/docs/usage.md +24 -13
  6. {content_core-0.8.5 → content_core-1.0.1}/pyproject.toml +1 -1
  7. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/__init__.py +1 -1
  8. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/cc_config.yaml +2 -1
  9. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/common/state.py +9 -5
  10. content_core-1.0.1/src/content_core/common/types.py +14 -0
  11. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/config.py +7 -3
  12. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/content/extraction/graph.py +6 -6
  13. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/processors/url.py +5 -13
  14. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/processors/youtube.py +80 -62
  15. content_core-1.0.1/tests/integration/test_cli.py +394 -0
  16. {content_core-0.8.5 → content_core-1.0.1}/tests/integration/test_extraction.py +10 -5
  17. {content_core-0.8.5 → content_core-1.0.1}/uv.lock +1 -1
  18. content_core-0.8.5/src/content_core/common/types.py +0 -21
  19. {content_core-0.8.5 → content_core-1.0.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  20. {content_core-0.8.5 → content_core-1.0.1}/.github/workflows/publish.yml +0 -0
  21. {content_core-0.8.5 → content_core-1.0.1}/.python-version +0 -0
  22. {content_core-0.8.5 → content_core-1.0.1}/CONTRIBUTING.md +0 -0
  23. {content_core-0.8.5 → content_core-1.0.1}/LICENSE +0 -0
  24. {content_core-0.8.5 → content_core-1.0.1}/Makefile +0 -0
  25. {content_core-0.8.5 → content_core-1.0.1}/prompts/content/cleanup.jinja +0 -0
  26. {content_core-0.8.5 → content_core-1.0.1}/prompts/content/summarize.jinja +0 -0
  27. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/common/__init__.py +0 -0
  28. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/common/exceptions.py +0 -0
  29. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/common/utils.py +0 -0
  30. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/content/__init__.py +0 -0
  31. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/content/cleanup/__init__.py +0 -0
  32. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/content/cleanup/core.py +0 -0
  33. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/content/extraction/__init__.py +0 -0
  34. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/content/identification/__init__.py +0 -0
  35. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/content/summary/__init__.py +0 -0
  36. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/content/summary/core.py +0 -0
  37. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/logging.py +0 -0
  38. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/models.py +0 -0
  39. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/models_config.yaml +0 -0
  40. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/notebooks/run.ipynb +0 -0
  41. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/processors/audio.py +0 -0
  42. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/processors/docling.py +0 -0
  43. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/processors/office.py +0 -0
  44. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/processors/pdf.py +0 -0
  45. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/processors/text.py +0 -0
  46. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/processors/video.py +0 -0
  47. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/py.typed +0 -0
  48. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/templated_message.py +0 -0
  49. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/tools/__init__.py +0 -0
  50. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/tools/cleanup.py +0 -0
  51. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/tools/extract.py +0 -0
  52. {content_core-0.8.5 → content_core-1.0.1}/src/content_core/tools/summarize.py +0 -0
  53. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.docx +0 -0
  54. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.epub +0 -0
  55. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.md +0 -0
  56. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.mp3 +0 -0
  57. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.mp4 +0 -0
  58. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.pdf +0 -0
  59. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.pptx +0 -0
  60. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.txt +0 -0
  61. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file.xlsx +0 -0
  62. {content_core-0.8.5 → content_core-1.0.1}/tests/input_content/file_audio.mp3 +0 -0
  63. {content_core-0.8.5 → content_core-1.0.1}/tests/unit/test_docling.py +0 -0
@@ -21,4 +21,5 @@ todo.md
21
21
  WIP/
22
22
 
23
23
  *.ignore
24
- .windsurfrules
24
+ .windsurfrules
25
+ CLAUDE.md
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.8.5
3
+ Version: 1.0.1
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -234,12 +234,18 @@ async def main():
234
234
  md_data = await extract_content({"file_path": "path/to/your/document.md"})
235
235
  print(md_data)
236
236
 
237
- # Per-execution override with Docling
237
+ # Per-execution override with Docling for documents
238
238
  doc_data = await extract_content({
239
239
  "file_path": "path/to/your/document.pdf",
240
- "engine": "docling",
240
+ "document_engine": "docling",
241
241
  "output_format": "html"
242
242
  })
243
+
244
+ # Per-execution override with Firecrawl for URLs
245
+ url_data = await extract_content({
246
+ "url": "https://www.example.com",
247
+ "url_engine": "firecrawl"
248
+ })
243
249
  print(doc_data)
244
250
 
245
251
  if __name__ == "__main__":
@@ -262,7 +268,8 @@ Docling is not the default engine when parsing documents. If you don't want to u
262
268
  In your `cc_config.yaml` or custom config, set:
263
269
  ```yaml
264
270
  extraction:
265
- engine: docling # 'legacy' (default) or 'docling'
271
+ document_engine: docling # 'auto' (default), 'simple', or 'docling'
272
+ url_engine: auto # 'auto' (default), 'simple', 'firecrawl', or 'jina'
266
273
  docling:
267
274
  output_format: markdown # markdown | html | json
268
275
  ```
@@ -270,10 +277,13 @@ extraction:
270
277
  #### Programmatically in Python
271
278
 
272
279
  ```python
273
- from content_core.config import set_extraction_engine, set_docling_output_format
280
+ from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
281
+
282
+ # switch document engine to Docling
283
+ set_document_engine("docling")
274
284
 
275
- # switch engine to Docling
276
- set_extraction_engine("docling")
285
+ # switch URL engine to Firecrawl
286
+ set_url_engine("firecrawl")
277
287
 
278
288
  # choose output format: 'markdown', 'html', or 'json'
279
289
  set_docling_output_format("html")
@@ -201,12 +201,18 @@ async def main():
201
201
  md_data = await extract_content({"file_path": "path/to/your/document.md"})
202
202
  print(md_data)
203
203
 
204
- # Per-execution override with Docling
204
+ # Per-execution override with Docling for documents
205
205
  doc_data = await extract_content({
206
206
  "file_path": "path/to/your/document.pdf",
207
- "engine": "docling",
207
+ "document_engine": "docling",
208
208
  "output_format": "html"
209
209
  })
210
+
211
+ # Per-execution override with Firecrawl for URLs
212
+ url_data = await extract_content({
213
+ "url": "https://www.example.com",
214
+ "url_engine": "firecrawl"
215
+ })
210
216
  print(doc_data)
211
217
 
212
218
  if __name__ == "__main__":
@@ -229,7 +235,8 @@ Docling is not the default engine when parsing documents. If you don't want to u
229
235
  In your `cc_config.yaml` or custom config, set:
230
236
  ```yaml
231
237
  extraction:
232
- engine: docling # 'legacy' (default) or 'docling'
238
+ document_engine: docling # 'auto' (default), 'simple', or 'docling'
239
+ url_engine: auto # 'auto' (default), 'simple', 'firecrawl', or 'jina'
233
240
  docling:
234
241
  output_format: markdown # markdown | html | json
235
242
  ```
@@ -237,10 +244,13 @@ extraction:
237
244
  #### Programmatically in Python
238
245
 
239
246
  ```python
240
- from content_core.config import set_extraction_engine, set_docling_output_format
247
+ from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
248
+
249
+ # switch document engine to Docling
250
+ set_document_engine("docling")
241
251
 
242
- # switch engine to Docling
243
- set_extraction_engine("docling")
252
+ # switch URL engine to Firecrawl
253
+ set_url_engine("firecrawl")
244
254
 
245
255
  # choose output format: 'markdown', 'html', or 'json'
246
256
  set_docling_output_format("html")
@@ -21,11 +21,11 @@ Content Core uses a modular approach to process content from different sources.
21
21
  - **Supported Input**: URLs (web pages).
22
22
  - **Returned Data**: Extracted text content from the web page, often in a cleaned format.
23
23
  - **Location**: `src/content_core/processors/url.py`
24
- - **Default Engine (`auto`) Logic**:
24
+ - **Default URL Engine (`auto`) Logic**:
25
25
  - If `FIRECRAWL_API_KEY` is set, uses Firecrawl for extraction.
26
26
  - Else, it tries Jina, which may fail because of rate limits unless `JINA_API_KEY` is set.
27
27
  - Else, falls back to BeautifulSoup-based extraction.
28
- - You can explicitly specify an engine (`'firecrawl'`, `'jina'`, `'simple'`, etc.), but `'auto'` is now the default and recommended for most users.
28
+ - You can explicitly specify a URL engine (`'firecrawl'`, `'jina'`, `'simple'`), but `'auto'` is now the default and recommended for most users.
29
29
 
30
30
  ### 3. **File Processor**
31
31
  - **Purpose**: Processes local files of various types, extracting content based on file format.
@@ -47,23 +47,27 @@ Content Core uses a modular approach to process content from different sources.
47
47
  - **Supported Input**: PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, Images (PNG, JPEG, TIFF, BMP).
48
48
  - **Returned Data**: Content converted to configured format (markdown, html, json).
49
49
  - **Location**: `src/content_core/processors/docling.py`
50
- - **Default Engine (`auto`) Logic for Files/Documents**:
50
+ - **Default Document Engine (`auto`) Logic for Files/Documents**:
51
51
  - Tries the `'docling'` extraction method first (robust document parsing for supported types).
52
52
  - If `'docling'` fails or is not supported, automatically falls back to simple extraction (fast, lightweight for supported types).
53
- - You can explicitly specify `'docling'`, `'simple'`, or `'legacy'` as the engine, but `'auto'` is now the default and recommended for most users.
53
+ - You can explicitly specify `'docling'` or `'simple'` as the document engine, but `'auto'` is now the default and recommended for most users.
54
54
  - **Configuration**: Activate the Docling engine in `cc_config.yaml` or custom config:
55
55
  ```yaml
56
56
  extraction:
57
- engine: docling # 'auto' (default), 'docling', or 'simple'
57
+ document_engine: docling # 'auto' (default), 'simple', or 'docling'
58
+ url_engine: auto # 'auto' (default), 'simple', 'firecrawl', or 'jina'
58
59
  docling:
59
60
  output_format: markdown # markdown | html | json
60
61
  ```
61
62
  - **Programmatic Toggle**: Use helper functions in Python:
62
63
  ```python
63
- from content_core.config import set_extraction_engine, set_docling_output_format
64
+ from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
64
65
 
65
- # switch engine to Docling
66
- set_extraction_engine("docling")
66
+ # switch document engine to Docling
67
+ set_document_engine("docling")
68
+
69
+ # switch URL engine to Firecrawl
70
+ set_url_engine("firecrawl")
67
71
 
68
72
  # choose output format
69
73
  set_docling_output_format("html")
@@ -80,11 +80,11 @@ This will allow you to quickly start with customized settings without needing to
80
80
 
81
81
  ### Extraction Engine Selection
82
82
 
83
- By default, Content Core uses the `'auto'` engine for all extraction tasks. The logic is as follows:
84
- - **For URLs**: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else Jina if `JINA_API_KEY` is set, else falls back to BeautifulSoup.
85
- - **For files**: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
83
+ By default, Content Core uses the `'auto'` engine for both document and URL extraction tasks. The logic is as follows:
84
+ - **For URLs** (`url_engine`): Uses Firecrawl if `FIRECRAWL_API_KEY` is set; otherwise tries Jina (setting `JINA_API_KEY` avoids rate limits) and falls back to BeautifulSoup if Jina fails.
85
+ - **For files** (`document_engine`): Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
86
86
 
87
- You can override this behavior by specifying an engine in your config or function call, but `'auto'` is recommended for most users.
87
+ You can override this behavior by specifying separate engines for documents and URLs in your config or function call, but `'auto'` is recommended for most users.
88
88
 
89
89
  #### Docling Engine
90
90
 
@@ -94,35 +94,46 @@ Content Core supports an optional Docling engine for advanced document parsing.
94
94
  Add under the `extraction` section:
95
95
  ```yaml
96
96
  extraction:
97
- engine: docling # auto (default), docling, or simple
97
+ document_engine: docling # auto (default), simple, or docling
98
+ url_engine: auto # auto (default), simple, firecrawl, or jina
98
99
  docling:
99
- output_format: html # markdown | html | json
100
+ output_format: html # markdown | html | json
100
101
  ```
101
102
 
102
103
  ##### Programmatically in Python
103
104
  ```python
104
- from content_core.config import set_extraction_engine, set_docling_output_format
105
+ from content_core.config import set_document_engine, set_url_engine, set_docling_output_format
105
106
 
106
- # toggle to Docling
107
- set_extraction_engine("docling")
107
+ # toggle document engine to Docling
108
+ set_document_engine("docling")
109
+
110
+ # toggle URL engine to Firecrawl
111
+ set_url_engine("firecrawl")
108
112
 
109
113
  # pick format
110
114
  set_docling_output_format("json")
111
115
  ```
112
116
 
113
117
  #### Per-Execution Overrides
114
- You can override the extraction engine and Docling output format on a per-call basis by including `engine` and `output_format` in your input:
118
+ You can override the extraction engines and Docling output format on a per-call basis by including `document_engine`, `url_engine` and `output_format` in your input:
115
119
 
116
120
  ```python
117
121
  from content_core.content.extraction import extract_content
118
122
 
119
- # override engine and format for this document
123
+ # override document engine and format for this document
120
124
  result = await extract_content({
121
125
  "file_path": "document.pdf",
122
- "engine": "docling",
126
+ "document_engine": "docling",
123
127
  "output_format": "html"
124
128
  })
125
129
  print(result.content)
130
+
131
+ # override URL engine for this URL
132
+ result = await extract_content({
133
+ "url": "https://example.com",
134
+ "url_engine": "firecrawl"
135
+ })
136
+ print(result.content)
126
137
  ```
127
138
 
128
139
  Or using `ProcessSourceInput`:
@@ -133,7 +144,7 @@ from content_core.content.extraction import extract_content
133
144
 
134
145
  input = ProcessSourceInput(
135
146
  file_path="document.pdf",
136
- engine="docling",
147
+ document_engine="docling",
137
148
  output_format="json"
138
149
  )
139
150
  result = await extract_content(input)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "0.8.5"
3
+ version = "1.0.1"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -113,7 +113,7 @@ async def ccore_main():
113
113
  if args.format == "xml":
114
114
  result = dicttoxml(
115
115
  result.model_dump(), custom_root="result", attr_type=False
116
- )
116
+ ).decode('utf-8')
117
117
  elif args.format == "json":
118
118
  result = result.model_dump_json()
119
119
  else: # text
@@ -30,7 +30,8 @@ summary_model:
30
30
  max_tokens: 2000
31
31
 
32
32
  extraction:
33
- engine: legacy # change to 'docling' to enable Docling engine
33
+ document_engine: auto # auto | simple | docling - for files/documents
34
+ url_engine: auto # auto | simple | firecrawl | jina - for URLs
34
35
  docling:
35
36
  output_format: markdown # markdown | html | json
36
37
 
@@ -2,8 +2,7 @@ from typing import Optional
2
2
 
3
3
  from pydantic import BaseModel, Field
4
4
 
5
- from content_core.common.types import Engine
6
- from content_core.common.types import Engine
5
+ from content_core.common.types import DocumentEngine, UrlEngine
7
6
 
8
7
 
9
8
  class ProcessSourceState(BaseModel):
@@ -16,9 +15,13 @@ class ProcessSourceState(BaseModel):
16
15
  identified_provider: Optional[str] = ""
17
16
  metadata: Optional[dict] = Field(default_factory=lambda: {})
18
17
  content: Optional[str] = ""
19
- engine: Optional[Engine] = Field(
18
+ document_engine: Optional[DocumentEngine] = Field(
20
19
  default=None,
21
- description="Override extraction engine: 'auto', 'simple', 'legacy', 'firecrawl', 'jina', or 'docling'",
20
+ description="Override document extraction engine: 'auto', 'simple', or 'docling'",
21
+ )
22
+ url_engine: Optional[UrlEngine] = Field(
23
+ default=None,
24
+ description="Override URL extraction engine: 'auto', 'simple', 'firecrawl', 'jina', or 'docling'",
22
25
  )
23
26
  output_format: Optional[str] = Field(
24
27
  default=None,
@@ -30,7 +33,8 @@ class ProcessSourceInput(BaseModel):
30
33
  content: Optional[str] = ""
31
34
  file_path: Optional[str] = ""
32
35
  url: Optional[str] = ""
33
- engine: Optional[str] = None
36
+ document_engine: Optional[str] = None
37
+ url_engine: Optional[str] = None
34
38
  output_format: Optional[str] = None
35
39
 
36
40
 
@@ -0,0 +1,14 @@
1
+ from typing import Literal
2
+
3
+ DocumentEngine = Literal[
4
+ "auto",
5
+ "simple",
6
+ "docling",
7
+ ]
8
+
9
+ UrlEngine = Literal[
10
+ "auto",
11
+ "simple",
12
+ "firecrawl",
13
+ "jina",
14
+ ]
@@ -35,9 +35,13 @@ def load_config():
35
35
  CONFIG = load_config()
36
36
 
37
37
  # Programmatic config overrides: use in notebooks or scripts
38
- def set_extraction_engine(engine: str):
39
- """Override the extraction engine ('legacy' or 'docling')."""
40
- CONFIG.setdefault("extraction", {})["engine"] = engine
38
+ def set_document_engine(engine: str):
39
+ """Override the document extraction engine ('auto', 'simple', or 'docling')."""
40
+ CONFIG.setdefault("extraction", {})["document_engine"] = engine
41
+
42
+ def set_url_engine(engine: str):
43
+ """Override the URL extraction engine ('auto', 'simple', 'firecrawl', or 'jina')."""
44
+ CONFIG.setdefault("extraction", {})["url_engine"] = engine
41
45
 
42
46
  def set_docling_output_format(fmt: str):
43
47
  """Override Docling output_format ('markdown', 'html', or 'json')."""
@@ -12,7 +12,6 @@ from content_core.common import (
12
12
  ProcessSourceState,
13
13
  UnsupportedTypeException,
14
14
  )
15
- from content_core.common.types import warn_if_deprecated_engine
16
15
  from content_core.config import CONFIG # type: ignore
17
16
  from content_core.logging import logger
18
17
  from content_core.processors.audio import extract_audio_data # type: ignore
@@ -124,11 +123,10 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
124
123
  async def file_type_router_docling(state: ProcessSourceState) -> str:
125
124
  """
126
125
  Route to Docling if enabled and supported; otherwise use simple file type edge.
127
- Supports 'auto', 'docling', 'simple', and 'legacy' (deprecated, alias for simple).
128
- 'auto' tries simple first, then falls back to docling if simple fails.
126
+ Supports 'auto', 'docling', and 'simple'.
127
+ 'auto' tries docling first, then falls back to simple if docling fails.
129
128
  """
130
- engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
131
- warn_if_deprecated_engine(engine)
129
+ engine = state.document_engine or CONFIG.get("extraction", {}).get("document_engine", "auto")
132
130
  if engine == "auto":
133
131
  logger.debug("Using auto engine")
134
132
  # Try docling first; if it fails or is not supported, fallback to simple
@@ -147,7 +145,7 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
147
145
  if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
148
146
  logger.debug("Using docling engine")
149
147
  return "extract_docling"
150
- # For 'simple' and 'legacy', use the default file type edge
148
+ # For 'simple', use the default file type edge
151
149
  logger.debug("Using simple engine")
152
150
  return await file_type_edge(state)
153
151
 
@@ -196,8 +194,10 @@ workflow.add_conditional_edges(
196
194
  for m in list(SUPPORTED_FITZ_TYPES)
197
195
  + list(SUPPORTED_OFFICE_TYPES)
198
196
  + list(DOCLING_SUPPORTED)
197
+ if m not in ["text/html"] # Exclude HTML from file download, treat as web content
199
198
  },
200
199
  "article": "extract_url",
200
+ "text/html": "extract_url", # Route HTML content to URL extraction
201
201
  "youtube": "extract_youtube_transcript",
202
202
  },
203
203
  )
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
5
5
  from readability import Document
6
6
 
7
7
  from content_core.common import ProcessSourceState
8
- from content_core.common.types import warn_if_deprecated_engine
8
+ from content_core.config import CONFIG
9
9
  from content_core.logging import logger
10
10
  from content_core.processors.docling import DOCLING_SUPPORTED
11
11
  from content_core.processors.office import SUPPORTED_OFFICE_TYPES
@@ -160,13 +160,12 @@ async def extract_url_firecrawl(url: str):
160
160
 
161
161
  async def extract_url(state: ProcessSourceState):
162
162
  """
163
- Extract content from a URL using the engine specified in the state.
164
- Supported engines: 'auto', 'simple', 'legacy' (deprecated), 'firecrawl', 'jina'.
163
+ Extract content from a URL using the url_engine specified in the state.
164
+ Supported engines: 'auto', 'simple', 'firecrawl', 'jina'.
165
165
  """
166
166
  assert state.url, "No URL provided"
167
167
  url = state.url
168
- engine = state.engine or "auto"
169
- warn_if_deprecated_engine(engine)
168
+ engine = state.url_engine or CONFIG.get("extraction", {}).get("url_engine", "auto")
170
169
  try:
171
170
  if engine == "auto":
172
171
  if os.environ.get("FIRECRAWL_API_KEY"):
@@ -182,19 +181,12 @@ async def extract_url(state: ProcessSourceState):
182
181
  logger.error(f"Jina extraction error for URL: {url}: {e}")
183
182
  logger.debug("Falling back to BeautifulSoup")
184
183
  return await extract_url_bs4(url)
185
- elif engine == "simple" or engine == "legacy":
186
- # 'legacy' is deprecated alias for 'simple'
184
+ elif engine == "simple":
187
185
  return await extract_url_bs4(url)
188
186
  elif engine == "firecrawl":
189
187
  return await extract_url_firecrawl(url)
190
188
  elif engine == "jina":
191
189
  return await extract_url_jina(url)
192
- elif engine == "docling":
193
- from content_core.processors.docling import extract_with_docling
194
-
195
- state.url = url
196
- result_state = await extract_with_docling(state)
197
- return {"title": None, "content": result_state.content}
198
190
  else:
199
191
  raise ValueError(f"Unknown engine: {engine}")
200
192
  except Exception as e:
@@ -1,3 +1,4 @@
1
+ import asyncio
1
2
  import re
2
3
  import ssl
3
4
 
@@ -68,69 +69,86 @@ async def _extract_youtube_id(url):
68
69
 
69
70
 
70
71
  async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
71
- try:
72
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
73
-
74
- # First try: Manual transcripts in preferred languages
75
- manual_transcripts = []
76
- try:
77
- for transcript in transcript_list:
78
- if not transcript.is_generated and not transcript.is_translatable:
79
- manual_transcripts.append(transcript)
80
-
81
- if manual_transcripts:
82
- # Sort based on preferred language order
83
- for lang in preferred_langs:
84
- for transcript in manual_transcripts:
85
- if transcript.language_code == lang:
86
- return transcript.fetch()
87
- # If no preferred language found, return first manual transcript
88
- return manual_transcripts[0].fetch()
89
- except NoTranscriptFound:
90
- pass
91
-
92
- # Second try: Auto-generated transcripts in preferred languages
93
- generated_transcripts = []
94
- try:
95
- for transcript in transcript_list:
96
- if transcript.is_generated and not transcript.is_translatable:
97
- generated_transcripts.append(transcript)
98
-
99
- if generated_transcripts:
100
- # Sort based on preferred language order
101
- for lang in preferred_langs:
102
- for transcript in generated_transcripts:
103
- if transcript.language_code == lang:
104
- return transcript.fetch()
105
- # If no preferred language found, return first generated transcript
106
- return generated_transcripts[0].fetch()
107
- except NoTranscriptFound:
108
- pass
109
-
110
- # Last try: Translated transcripts in preferred languages
111
- translated_transcripts = []
72
+ max_attempts = 5
73
+ for attempt in range(max_attempts):
112
74
  try:
113
- for transcript in transcript_list:
114
- if transcript.is_translatable:
115
- translated_transcripts.append(transcript)
116
-
117
- if translated_transcripts:
118
- # Sort based on preferred language order
119
- for lang in preferred_langs:
120
- for transcript in translated_transcripts:
121
- if transcript.language_code == lang:
122
- return transcript.fetch()
123
- # If no preferred language found, return translation to first preferred language
124
- translation = translated_transcripts[0].translate(preferred_langs[0])
125
- return translation.fetch()
126
- except NoTranscriptFound:
127
- pass
128
-
129
- raise Exception("No suitable transcript found")
130
-
131
- except Exception as e:
132
- logger.error(f"Failed to get transcript for video {video_id}: {e}")
133
- return None
75
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
76
+
77
+ # First try: Manual transcripts in preferred languages
78
+ manual_transcripts = []
79
+ try:
80
+ for transcript in transcript_list:
81
+ if not transcript.is_generated and not transcript.is_translatable:
82
+ manual_transcripts.append(transcript)
83
+
84
+ if manual_transcripts:
85
+ # Sort based on preferred language order
86
+ for lang in preferred_langs:
87
+ for transcript in manual_transcripts:
88
+ if transcript.language_code == lang:
89
+ return transcript.fetch()
90
+ # If no preferred language found, return first manual transcript
91
+ return manual_transcripts[0].fetch()
92
+ except NoTranscriptFound:
93
+ pass
94
+
95
+ # Second try: Auto-generated transcripts in preferred languages
96
+ generated_transcripts = []
97
+ try:
98
+ for transcript in transcript_list:
99
+ if transcript.is_generated and not transcript.is_translatable:
100
+ generated_transcripts.append(transcript)
101
+
102
+ if generated_transcripts:
103
+ # Sort based on preferred language order
104
+ for lang in preferred_langs:
105
+ for transcript in generated_transcripts:
106
+ if transcript.language_code == lang:
107
+ return transcript.fetch()
108
+ # If no preferred language found, return first generated transcript
109
+ return generated_transcripts[0].fetch()
110
+ except NoTranscriptFound:
111
+ pass
112
+
113
+ # Last try: Translated transcripts in preferred languages
114
+ translated_transcripts = []
115
+ try:
116
+ for transcript in transcript_list:
117
+ if transcript.is_translatable:
118
+ translated_transcripts.append(transcript)
119
+
120
+ if translated_transcripts:
121
+ # Sort based on preferred language order
122
+ for lang in preferred_langs:
123
+ for transcript in translated_transcripts:
124
+ if transcript.language_code == lang:
125
+ return transcript.fetch()
126
+ # If no preferred language found, return translation to first preferred language
127
+ translation = translated_transcripts[0].translate(
128
+ preferred_langs[0]
129
+ )
130
+ return translation.fetch()
131
+ except NoTranscriptFound:
132
+ pass
133
+
134
+ raise Exception("No suitable transcript found")
135
+
136
+ except Exception as e:
137
+ if e.__class__.__name__ == "ParserError":
138
+ logger.warning(
139
+ f"ParserError on attempt {attempt+1}/5 for video {video_id}. Retrying..."
140
+ )
141
+ if attempt == max_attempts - 1:
142
+ logger.error(
143
+ f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
144
+ )
145
+ return None
146
+ await asyncio.sleep(2)
147
+ continue
148
+ else:
149
+ logger.error(f"Failed to get transcript for video {video_id}: {e}")
150
+ return None
151
+ return None
134
152
 
135
153
 
136
154
  async def extract_youtube_transcript(state: ProcessSourceState):
@@ -0,0 +1,394 @@
1
+ import json
2
+ import subprocess
3
+ import sys
4
+ from pathlib import Path
5
+ from xml.etree import ElementTree as ET
6
+
7
+ import pytest
8
+
9
+
10
+ @pytest.fixture
11
+ def fixture_path():
12
+ """Provides the path to the directory containing test input files."""
13
+ return Path(__file__).parent.parent / "input_content"
14
+
15
+
16
+ def run_cli_command(command_args, input_data=None):
17
+ """Helper to run CLI commands and capture output."""
18
+ try:
19
+ result = subprocess.run(
20
+ command_args,
21
+ input=input_data,
22
+ capture_output=True,
23
+ text=True,
24
+ timeout=30
25
+ )
26
+ return result
27
+ except subprocess.TimeoutExpired:
28
+ pytest.fail(f"Command {command_args} timed out")
29
+
30
+
31
+ class TestCcoreCLI:
32
+ """Tests for the ccore CLI command."""
33
+
34
+ def test_ccore_help(self):
35
+ """Test ccore help output."""
36
+ result = run_cli_command([sys.executable, "-m", "content_core", "--help"])
37
+ # Note: ccore is the default when running the module, but let's test the actual CLI entry points
38
+
39
+ def test_ccore_text_input(self):
40
+ """Test ccore with direct text input."""
41
+ result = run_cli_command(["uv", "run", "ccore", "This is a test content."])
42
+
43
+ assert result.returncode == 0
44
+ assert "This is a test content." in result.stdout
45
+ assert result.stderr == ""
46
+
47
+ def test_ccore_file_input(self, fixture_path):
48
+ """Test ccore with file input."""
49
+ md_file = fixture_path / "file.md"
50
+ if not md_file.exists():
51
+ pytest.skip(f"Fixture file not found: {md_file}")
52
+
53
+ result = run_cli_command(["uv", "run", "ccore", str(md_file)])
54
+
55
+ assert result.returncode == 0
56
+ assert len(result.stdout.strip()) > 0
57
+ assert "Buenos Aires" in result.stdout
58
+
59
+ def test_ccore_url_input(self):
60
+ """Test ccore with URL input."""
61
+ result = run_cli_command(["uv", "run", "ccore", "https://www.example.com"])
62
+
63
+ assert result.returncode == 0
64
+ assert len(result.stdout.strip()) > 0
65
+
66
+ def test_ccore_json_format(self):
67
+ """Test ccore with JSON output format."""
68
+ result = run_cli_command(["uv", "run", "ccore", "-f", "json", "Test content for JSON output."])
69
+
70
+ assert result.returncode == 0
71
+
72
+ # Verify it's valid JSON
73
+ output_data = json.loads(result.stdout)
74
+ assert isinstance(output_data, dict)
75
+ assert "content" in output_data
76
+ assert "Test content for JSON output." in output_data["content"]
77
+
78
+ def test_ccore_xml_format(self):
79
+ """Test ccore with XML output format."""
80
+ result = run_cli_command(["uv", "run", "ccore", "-f", "xml", "Test content for XML output."])
81
+
82
+ assert result.returncode == 0
83
+
84
+ # Verify it's valid XML
85
+ root = ET.fromstring(result.stdout.strip())
86
+ assert root.tag == "result"
87
+ content_elem = root.find(".//content")
88
+ assert content_elem is not None
89
+ assert "Test content for XML output." in content_elem.text
90
+
91
+ def test_ccore_text_format_explicit(self):
92
+ """Test ccore with explicit text format."""
93
+ result = run_cli_command(["uv", "run", "ccore", "-f", "text", "Test content for text output."])
94
+
95
+ assert result.returncode == 0
96
+ assert "Test content for text output." in result.stdout
97
+
98
+ def test_ccore_stdin_input(self):
99
+ """Test ccore with stdin input."""
100
+ test_content = "This content comes from stdin."
101
+ result = run_cli_command(["uv", "run", "ccore"], input_data=test_content)
102
+
103
+ assert result.returncode == 0
104
+ assert test_content in result.stdout
105
+
106
+ def test_ccore_stdin_json_format(self):
107
+ """Test ccore with stdin input and JSON format."""
108
+ test_content = "Stdin content with JSON format."
109
+ result = run_cli_command(["uv", "run", "ccore", "-f", "json"], input_data=test_content)
110
+
111
+ assert result.returncode == 0
112
+
113
+ # Verify it's valid JSON
114
+ output_data = json.loads(result.stdout)
115
+ assert test_content in output_data["content"]
116
+
117
+ def test_ccore_debug_flag(self):
118
+ """Test ccore with debug flag."""
119
+ result = run_cli_command(["uv", "run", "ccore", "-d", "Debug test content."])
120
+
121
+ assert result.returncode == 0
122
+ assert "Debug test content." in result.stdout
123
+ # Debug output goes to stderr in loguru
124
+
125
+ def test_ccore_file_pdf(self, fixture_path):
126
+ """Test ccore with PDF file."""
127
+ pdf_file = fixture_path / "file.pdf"
128
+ if not pdf_file.exists():
129
+ pytest.skip(f"Fixture file not found: {pdf_file}")
130
+
131
+ result = run_cli_command(["uv", "run", "ccore", str(pdf_file)])
132
+
133
+ assert result.returncode == 0
134
+ assert len(result.stdout.strip()) > 0
135
+
136
+
137
+ class TestCcleanCLI:
138
+ """Tests for the cclean CLI command."""
139
+
140
+ def test_cclean_text_input(self):
141
+ """Test cclean with direct text input."""
142
+ messy_text = " This is messy text with extra spaces. "
143
+ result = run_cli_command(["uv", "run", "cclean", messy_text])
144
+
145
+ assert result.returncode == 0
146
+ cleaned = result.stdout.strip()
147
+ assert cleaned != messy_text
148
+ assert "This is messy text" in cleaned
149
+
150
+ def test_cclean_json_input(self):
151
+ """Test cclean with JSON input containing content field."""
152
+ json_input = '{"content": " Messy JSON content "}'
153
+ result = run_cli_command(["uv", "run", "cclean"], input_data=json_input)
154
+
155
+ assert result.returncode == 0
156
+ cleaned = result.stdout.strip()
157
+ assert "Messy JSON content" in cleaned
158
+
159
+ def test_cclean_xml_input(self):
160
+ """Test cclean with XML input containing content field."""
161
+ xml_input = '<root><content> Messy XML content </content></root>'
162
+ result = run_cli_command(["uv", "run", "cclean"], input_data=xml_input)
163
+
164
+ assert result.returncode == 0
165
+ cleaned = result.stdout.strip()
166
+ assert "Messy XML content" in cleaned
167
+
168
+ def test_cclean_file_input(self, fixture_path):
169
+ """Test cclean with file input."""
170
+ txt_file = fixture_path / "file.txt"
171
+ if not txt_file.exists():
172
+ pytest.skip(f"Fixture file not found: {txt_file}")
173
+
174
+ result = run_cli_command(["uv", "run", "cclean", str(txt_file)])
175
+
176
+ assert result.returncode == 0
177
+ assert len(result.stdout.strip()) > 0
178
+
179
+ def test_cclean_url_input(self):
180
+ """Test cclean with URL input."""
181
+ result = run_cli_command(["uv", "run", "cclean", "https://www.example.com"])
182
+
183
+ assert result.returncode == 0
184
+ assert len(result.stdout.strip()) > 0
185
+
186
+ def test_cclean_stdin_input(self):
187
+ """Test cclean with stdin input."""
188
+ messy_content = " This has too many spaces and needs cleaning. "
189
+ result = run_cli_command(["uv", "run", "cclean"], input_data=messy_content)
190
+
191
+ assert result.returncode == 0
192
+ cleaned = result.stdout.strip()
193
+ assert "This has too many spaces" in cleaned
194
+
195
+ def test_cclean_debug_flag(self):
196
+ """Test cclean with debug flag."""
197
+ result = run_cli_command(["uv", "run", "cclean", "-d", "Debug clean test."])
198
+
199
+ assert result.returncode == 0
200
+ assert "Debug clean test" in result.stdout
201
+
202
+
203
+ class TestCsumCLI:
204
+ """Tests for the csum CLI command."""
205
+
206
+ def test_csum_text_input(self):
207
+ """Test csum with direct text input."""
208
+ long_text = "Artificial Intelligence is revolutionizing industries across the globe. From healthcare to finance, AI technologies are enabling automation, improving decision-making, and creating new possibilities for innovation."
209
+ result = run_cli_command(["uv", "run", "csum", long_text])
210
+
211
+ assert result.returncode == 0
212
+ summary = result.stdout.strip()
213
+ assert len(summary) > 0
214
+ assert len(summary) < len(long_text) # Summary should be shorter
215
+
216
+ def test_csum_with_context(self):
217
+ """Test csum with context parameter."""
218
+ text = "Machine learning algorithms process vast amounts of data to identify patterns and make predictions."
219
+ context = "explain in simple terms"
220
+ result = run_cli_command(["uv", "run", "csum", "--context", context, text])
221
+
222
+ assert result.returncode == 0
223
+ summary = result.stdout.strip()
224
+ assert len(summary) > 0
225
+
226
+ def test_csum_file_input(self, fixture_path):
227
+ """Test csum with file input."""
228
+ md_file = fixture_path / "file.md"
229
+ if not md_file.exists():
230
+ pytest.skip(f"Fixture file not found: {md_file}")
231
+
232
+ result = run_cli_command(["uv", "run", "csum", str(md_file)])
233
+
234
+ assert result.returncode == 0
235
+ assert len(result.stdout.strip()) > 0
236
+
237
+ def test_csum_url_input(self):
238
+ """Test csum with URL input."""
239
+ result = run_cli_command(["uv", "run", "csum", "https://www.example.com"])
240
+
241
+ assert result.returncode == 0
242
+ assert len(result.stdout.strip()) > 0
243
+
244
+ def test_csum_json_input(self):
245
+ """Test csum with JSON input containing content field."""
246
+ json_input = '{"content": "This is a long article about technology trends. It discusses various aspects of innovation, digital transformation, and the future of work in the digital age."}'
247
+ result = run_cli_command(["uv", "run", "csum"], input_data=json_input)
248
+
249
+ assert result.returncode == 0
250
+ summary = result.stdout.strip()
251
+ assert len(summary) > 0
252
+
253
+ def test_csum_xml_input(self):
254
+ """Test csum with XML input containing content field."""
255
+ xml_input = '<article><content>This is a comprehensive guide to understanding cloud computing. It covers infrastructure, platforms, software services, and deployment models.</content></article>'
256
+ result = run_cli_command(["uv", "run", "csum"], input_data=xml_input)
257
+
258
+ assert result.returncode == 0
259
+ summary = result.stdout.strip()
260
+ assert len(summary) > 0
261
+
262
+ def test_csum_stdin_input(self):
263
+ """Test csum with stdin input."""
264
+ long_content = "The Internet of Things (IoT) represents a network of interconnected devices that communicate and exchange data. This technology has applications in smart homes, industrial automation, healthcare monitoring, and environmental sensing. As IoT devices become more prevalent, they are transforming how we interact with our environment and creating new opportunities for data-driven insights."
265
+ result = run_cli_command(["uv", "run", "csum"], input_data=long_content)
266
+
267
+ assert result.returncode == 0
268
+ summary = result.stdout.strip()
269
+ assert len(summary) > 0
270
+ assert len(summary) < len(long_content)
271
+
272
+ def test_csum_context_bullet_points(self):
273
+ """Test csum with bullet points context."""
274
+ text = "Blockchain technology provides a decentralized approach to data storage and transaction processing. It ensures security through cryptographic methods and maintains transparency through distributed ledgers."
275
+ result = run_cli_command(["uv", "run", "csum", "--context", "in bullet points", text])
276
+
277
+ assert result.returncode == 0
278
+ summary = result.stdout.strip()
279
+ assert len(summary) > 0
280
+
281
+ def test_csum_debug_flag(self):
282
+ """Test csum with debug flag."""
283
+ result = run_cli_command(["uv", "run", "csum", "-d", "Debug summary test content."])
284
+
285
+ assert result.returncode == 0
286
+ assert len(result.stdout.strip()) > 0
287
+
288
+
289
+ class TestCLIErrorHandling:
290
+ """Tests for CLI error handling and edge cases."""
291
+
292
+ def test_ccore_empty_input_error(self):
293
+ """Test ccore with empty input should error."""
294
+ result = run_cli_command(["uv", "run", "ccore", ""])
295
+
296
+ assert result.returncode != 0
297
+
298
+ def test_cclean_empty_input_error(self):
299
+ """Test cclean with empty input should error."""
300
+ result = run_cli_command(["uv", "run", "cclean", ""])
301
+
302
+ assert result.returncode != 0
303
+
304
+ def test_csum_empty_input_error(self):
305
+ """Test csum with empty input should error."""
306
+ result = run_cli_command(["uv", "run", "csum", ""])
307
+
308
+ assert result.returncode != 0
309
+
310
+ def test_ccore_invalid_format(self):
311
+ """Test ccore with invalid format option."""
312
+ result = run_cli_command(["uv", "run", "ccore", "-f", "invalid", "test"])
313
+
314
+ assert result.returncode != 0
315
+ assert "invalid choice" in result.stderr.lower()
316
+
317
+ def test_ccore_nonexistent_file(self):
318
+ """Test ccore with non-existent file."""
319
+ result = run_cli_command(["uv", "run", "ccore", "/path/to/nonexistent/file.txt"])
320
+
321
+ # Should not error but treat as text content
322
+ assert result.returncode == 0
323
+ assert "/path/to/nonexistent/file.txt" in result.stdout
324
+
325
+ def test_stdin_no_content_error(self):
326
+ """Test CLI with no content and no stdin should error."""
327
+ # This is tricky to test as it involves TTY detection
328
+ # We'll skip this for now as it requires special handling
329
+ pass
330
+
331
+
332
+ class TestCLIIntegration:
333
+ """Integration tests combining multiple CLI features."""
334
+
335
+ def test_pipeline_extract_clean_summarize(self, fixture_path):
336
+ """Test a pipeline of extract -> clean -> summarize."""
337
+ md_file = fixture_path / "file.md"
338
+ if not md_file.exists():
339
+ pytest.skip(f"Fixture file not found: {md_file}")
340
+
341
+ # Extract content
342
+ extract_result = run_cli_command(["uv", "run", "ccore", str(md_file)])
343
+ assert extract_result.returncode == 0
344
+
345
+ # Clean extracted content
346
+ clean_result = run_cli_command(["uv", "run", "cclean"], input_data=extract_result.stdout)
347
+ assert clean_result.returncode == 0
348
+
349
+ # Summarize cleaned content
350
+ summary_result = run_cli_command(["uv", "run", "csum"], input_data=clean_result.stdout)
351
+ assert summary_result.returncode == 0
352
+
353
+ assert len(summary_result.stdout.strip()) > 0
354
+
355
+ def test_json_pipeline(self):
356
+ """Test pipeline with JSON format."""
357
+ text = "This is a test for JSON pipeline processing."
358
+
359
+ # Extract as JSON
360
+ extract_result = run_cli_command(["uv", "run", "ccore", "-f", "json", text])
361
+ assert extract_result.returncode == 0
362
+
363
+ # Verify JSON output
364
+ json_data = json.loads(extract_result.stdout)
365
+ assert text in json_data["content"]
366
+
367
+ # Clean JSON content
368
+ clean_result = run_cli_command(["uv", "run", "cclean"], input_data=extract_result.stdout)
369
+ assert clean_result.returncode == 0
370
+
371
+ # Summarize cleaned content
372
+ summary_result = run_cli_command(["uv", "run", "csum"], input_data=clean_result.stdout)
373
+ assert summary_result.returncode == 0
374
+
375
+ def test_xml_processing(self):
376
+ """Test XML format processing."""
377
+ text = "This is test content for XML processing and validation."
378
+
379
+ # Extract as XML
380
+ extract_result = run_cli_command(["uv", "run", "ccore", "-f", "xml", text])
381
+ assert extract_result.returncode == 0
382
+
383
+ # Verify XML output
384
+ root = ET.fromstring(extract_result.stdout.strip())
385
+ content_elem = root.find(".//content")
386
+ assert content_elem is not None
387
+ assert text in content_elem.text
388
+
389
+ # Process XML content through clean and summarize
390
+ clean_result = run_cli_command(["uv", "run", "cclean"], input_data=extract_result.stdout)
391
+ assert clean_result.returncode == 0
392
+
393
+ summary_result = run_cli_command(["uv", "run", "csum", "--context", "one sentence"], input_data=clean_result.stdout)
394
+ assert summary_result.returncode == 0
@@ -26,7 +26,7 @@ async def test_extract_content_from_text():
26
26
  async def test_extract_content_from_url(fixture_path):
27
27
  """Tests content extraction from a URL."""
28
28
  # Using a known URL from the notebook example
29
- input_data = {"url": "https://www.supernovalabs.com", "engine": "simple"}
29
+ input_data = {"url": "https://www.supernovalabs.com", "url_engine": "simple"}
30
30
  result = await extract_content(input_data)
31
31
 
32
32
  assert hasattr(result, "source_type")
@@ -41,8 +41,13 @@ async def test_extract_content_from_url(fixture_path):
41
41
  @pytest.mark.asyncio
42
42
  async def test_extract_content_from_url_firecrawl(fixture_path):
43
43
  """Tests content extraction from a URL."""
44
+ try:
45
+ import firecrawl
46
+ except ImportError:
47
+ pytest.skip("Firecrawl not installed")
48
+
44
49
  # Using a known URL from the notebook example
45
- input_data = {"url": "https://www.supernovalabs.com", "engine": "firecrawl"}
50
+ input_data = {"url": "https://www.supernovalabs.com", "url_engine": "firecrawl"}
46
51
  result = await extract_content(input_data)
47
52
 
48
53
  assert hasattr(result, "source_type")
@@ -58,7 +63,7 @@ async def test_extract_content_from_url_firecrawl(fixture_path):
58
63
  async def test_extract_content_from_url_jina(fixture_path):
59
64
  """Tests content extraction from a URL."""
60
65
  # Using a known URL from the notebook example
61
- input_data = {"url": "https://www.supernovalabs.com", "engine": "jina"}
66
+ input_data = {"url": "https://www.supernovalabs.com", "url_engine": "jina"}
62
67
  result = await extract_content(input_data)
63
68
 
64
69
  assert hasattr(result, "source_type")
@@ -222,7 +227,7 @@ async def test_extract_content_from_xlsx(fixture_path):
222
227
  if not xlsx_file.exists():
223
228
  pytest.skip(f"Fixture file not found: {xlsx_file}")
224
229
 
225
- result = await extract_content(dict(file_path=str(xlsx_file), engine="simple"))
230
+ result = await extract_content(dict(file_path=str(xlsx_file), document_engine="simple"))
226
231
 
227
232
  assert result.source_type == "file"
228
233
  assert (
@@ -240,7 +245,7 @@ async def test_extract_content_from_xlsx(fixture_path):
240
245
  # if not xlsx_file.exists():
241
246
  # pytest.skip(f"Fixture file not found: {xlsx_file}")
242
247
 
243
- # result = await extract_content(dict(file_path=str(xlsx_file), engine="docling"))
248
+ # result = await extract_content(dict(file_path=str(xlsx_file), document_engine="docling"))
244
249
 
245
250
  # assert result.source_type == "file"
246
251
  # assert (
@@ -410,7 +410,7 @@ wheels = [
410
410
 
411
411
  [[package]]
412
412
  name = "content-core"
413
- version = "0.8.5"
413
+ version = "1.0.1"
414
414
  source = { editable = "." }
415
415
  dependencies = [
416
416
  { name = "ai-prompter" },
@@ -1,21 +0,0 @@
1
- from typing import Literal
2
- import warnings
3
-
4
- Engine = Literal[
5
- "auto",
6
- "simple",
7
- "legacy",
8
- "firecrawl",
9
- "jina",
10
- "docling",
11
- ]
12
-
13
- DEPRECATED_ENGINES = {"legacy": "simple"}
14
-
15
- def warn_if_deprecated_engine(engine: str):
16
- if engine in DEPRECATED_ENGINES:
17
- warnings.warn(
18
- f"Engine '{engine}' is deprecated and will be removed in a future release. Use '{DEPRECATED_ENGINES[engine]}' instead.",
19
- DeprecationWarning,
20
- stacklevel=2,
21
- )
File without changes
File without changes