content-core 0.7.2__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (61) hide show
  1. {content_core-0.7.2 → content_core-0.8.1}/PKG-INFO +17 -17
  2. {content_core-0.7.2 → content_core-0.8.1}/README.md +11 -11
  3. {content_core-0.7.2 → content_core-0.8.1}/docs/processors.md +12 -1
  4. {content_core-0.7.2 → content_core-0.8.1}/docs/usage.md +15 -5
  5. {content_core-0.7.2 → content_core-0.8.1}/pyproject.toml +6 -4
  6. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/common/state.py +6 -2
  7. content_core-0.8.1/src/content_core/common/types.py +21 -0
  8. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/content/extraction/graph.py +18 -3
  9. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/processors/audio.py +19 -11
  10. content_core-0.8.1/src/content_core/processors/url.py +248 -0
  11. {content_core-0.7.2 → content_core-0.8.1}/tests/integration/test_extraction.py +53 -4
  12. {content_core-0.7.2 → content_core-0.8.1}/uv.lock +314 -171
  13. content_core-0.7.2/src/content_core/processors/url.py +0 -252
  14. {content_core-0.7.2 → content_core-0.8.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  15. {content_core-0.7.2 → content_core-0.8.1}/.github/workflows/publish.yml +0 -0
  16. {content_core-0.7.2 → content_core-0.8.1}/.gitignore +0 -0
  17. {content_core-0.7.2 → content_core-0.8.1}/.python-version +0 -0
  18. {content_core-0.7.2 → content_core-0.8.1}/CONTRIBUTING.md +0 -0
  19. {content_core-0.7.2 → content_core-0.8.1}/LICENSE +0 -0
  20. {content_core-0.7.2 → content_core-0.8.1}/Makefile +0 -0
  21. {content_core-0.7.2 → content_core-0.8.1}/prompts/content/cleanup.jinja +0 -0
  22. {content_core-0.7.2 → content_core-0.8.1}/prompts/content/summarize.jinja +0 -0
  23. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/__init__.py +0 -0
  24. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/cc_config.yaml +0 -0
  25. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/common/__init__.py +0 -0
  26. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/common/exceptions.py +0 -0
  27. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/common/utils.py +0 -0
  28. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/config.py +0 -0
  29. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/content/__init__.py +0 -0
  30. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/content/cleanup/__init__.py +0 -0
  31. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/content/cleanup/core.py +0 -0
  32. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/content/extraction/__init__.py +0 -0
  33. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/content/summary/__init__.py +0 -0
  34. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/content/summary/core.py +0 -0
  35. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/logging.py +0 -0
  36. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/models.py +0 -0
  37. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/models_config.yaml +0 -0
  38. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/notebooks/run.ipynb +0 -0
  39. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/processors/docling.py +0 -0
  40. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/processors/office.py +0 -0
  41. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/processors/pdf.py +0 -0
  42. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/processors/text.py +0 -0
  43. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/processors/video.py +0 -0
  44. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/processors/youtube.py +0 -0
  45. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/py.typed +0 -0
  46. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/templated_message.py +0 -0
  47. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/tools/__init__.py +0 -0
  48. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/tools/cleanup.py +0 -0
  49. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/tools/extract.py +0 -0
  50. {content_core-0.7.2 → content_core-0.8.1}/src/content_core/tools/summarize.py +0 -0
  51. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.docx +0 -0
  52. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.epub +0 -0
  53. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.md +0 -0
  54. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.mp3 +0 -0
  55. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.mp4 +0 -0
  56. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.pdf +0 -0
  57. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.pptx +0 -0
  58. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.txt +0 -0
  59. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file.xlsx +0 -0
  60. {content_core-0.7.2 → content_core-0.8.1}/tests/input_content/file_audio.mp3 +0 -0
  61. {content_core-0.7.2 → content_core-0.8.1}/tests/unit/test_docling.py +0 -0
@@ -1,15 +1,18 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.7.2
3
+ Version: 0.8.1
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
7
7
  Requires-Python: >=3.10
8
8
  Requires-Dist: ai-prompter>=0.2.3
9
9
  Requires-Dist: aiohttp>=3.11
10
+ Requires-Dist: asciidoc>=10.2.1
10
11
  Requires-Dist: bs4>=0.0.2
11
12
  Requires-Dist: dicttoxml>=1.7.16
13
+ Requires-Dist: docling>=2.34.0
12
14
  Requires-Dist: esperanto[openai]>=1.2.0
15
+ Requires-Dist: firecrawl-py>=2.7.0
13
16
  Requires-Dist: jinja2>=3.1.6
14
17
  Requires-Dist: langdetect>=1.0.9
15
18
  Requires-Dist: langgraph>=0.3.29
@@ -17,18 +20,15 @@ Requires-Dist: loguru>=0.7.3
17
20
  Requires-Dist: moviepy>=2.1.2
18
21
  Requires-Dist: openpyxl>=3.1.5
19
22
  Requires-Dist: pandas>=2.2.3
23
+ Requires-Dist: pillow>=10.4.0
20
24
  Requires-Dist: pymupdf>=1.25.5
21
25
  Requires-Dist: python-docx>=1.1.2
22
26
  Requires-Dist: python-dotenv>=1.1.0
23
27
  Requires-Dist: python-magic>=0.4.27
24
28
  Requires-Dist: python-pptx>=1.0.2
29
+ Requires-Dist: readability-lxml>=0.8.4.1
25
30
  Requires-Dist: validators>=0.34.0
26
31
  Requires-Dist: youtube-transcript-api>=1.0.3
27
- Provides-Extra: docling
28
- Requires-Dist: asciidoc; extra == 'docling'
29
- Requires-Dist: docling; extra == 'docling'
30
- Requires-Dist: pandas; extra == 'docling'
31
- Requires-Dist: pillow; extra == 'docling'
32
32
  Description-Content-Type: text/markdown
33
33
 
34
34
  # Content Core
@@ -39,6 +39,8 @@ Description-Content-Type: text/markdown
39
39
 
40
40
  ## Overview
41
41
 
42
+ > **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
43
+
42
44
  The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
43
45
 
44
46
  ## Key Features
@@ -48,6 +50,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
48
50
  * Web URLs (using robust extraction methods).
49
51
  * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
50
52
  * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
53
+ * **Smart Engine Selection:** By default, Content Core uses the `'auto'` engine, which:
54
+ * For URLs: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else tries Jina. Jina might fail because of rate limits, which can be fixed by adding `JINA_API_KEY`. If Jina fails, BeautifulSoup is used as a fallback.
55
+ * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
56
+ * You can override this by specifying an engine, but `'auto'` is recommended for most users.
51
57
  * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
52
58
  * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
53
59
 
@@ -60,8 +66,6 @@ Install Content Core using `pip`:
60
66
  ```bash
61
67
  # Install the package (without Docling)
62
68
  pip install content-core
63
- # Install with Docling support
64
- pip install content-core[docling]
65
69
  ```
66
70
 
67
71
  Alternatively, if you’re developing locally:
@@ -218,15 +222,15 @@ async def main():
218
222
  text_data = await extract_content({"content": "This is my sample text content."})
219
223
  print(text_data)
220
224
 
221
- # Extract from a URL
225
+ # Extract from a URL (uses 'auto' engine by default)
222
226
  url_data = await extract_content({"url": "https://www.example.com"})
223
227
  print(url_data)
224
228
 
225
- # Extract from a local video file (gets transcript)
229
+ # Extract from a local video file (gets transcript, engine='auto' by default)
226
230
  video_data = await extract_content({"file_path": "path/to/your/video.mp4"})
227
231
  print(video_data)
228
232
 
229
- # Extract from a local markdown file
233
+ # Extract from a local markdown file (engine='auto' by default)
230
234
  md_data = await extract_content({"file_path": "path/to/your/document.md"})
231
235
  print(md_data)
232
236
 
@@ -248,15 +252,11 @@ if __name__ == "__main__":
248
252
 
249
253
  Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
250
254
 
251
- ### Installation
252
-
253
- ```bash
254
- # Install with Docling support
255
- pip install content-core[docling]
256
- ```
257
255
 
258
256
  ### Enabling Docling
259
257
 
258
+ Docling is now tried first by default when parsing documents (via the `'auto'` engine). If you don't want to use it, you need to set engine to "simple".
259
+
260
260
  #### Via configuration file
261
261
 
262
262
  In your `cc_config.yaml` or custom config, set:
@@ -6,6 +6,8 @@
6
6
 
7
7
  ## Overview
8
8
 
9
+ > **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
10
+
9
11
  The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
10
12
 
11
13
  ## Key Features
@@ -15,6 +17,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
15
17
  * Web URLs (using robust extraction methods).
16
18
  * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
17
19
  * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
20
+ * **Smart Engine Selection:** By default, Content Core uses the `'auto'` engine, which:
21
+ * For URLs: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else tries Jina. Jina might fail because of rate limits, which can be fixed by adding `JINA_API_KEY`. If Jina fails, BeautifulSoup is used as a fallback.
22
+ * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
23
+ * You can override this by specifying an engine, but `'auto'` is recommended for most users.
18
24
  * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
19
25
  * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
20
26
 
@@ -27,8 +33,6 @@ Install Content Core using `pip`:
27
33
  ```bash
28
34
  # Install the package (without Docling)
29
35
  pip install content-core
30
- # Install with Docling support
31
- pip install content-core[docling]
32
36
  ```
33
37
 
34
38
  Alternatively, if you’re developing locally:
@@ -185,15 +189,15 @@ async def main():
185
189
  text_data = await extract_content({"content": "This is my sample text content."})
186
190
  print(text_data)
187
191
 
188
- # Extract from a URL
192
+ # Extract from a URL (uses 'auto' engine by default)
189
193
  url_data = await extract_content({"url": "https://www.example.com"})
190
194
  print(url_data)
191
195
 
192
- # Extract from a local video file (gets transcript)
196
+ # Extract from a local video file (gets transcript, engine='auto' by default)
193
197
  video_data = await extract_content({"file_path": "path/to/your/video.mp4"})
194
198
  print(video_data)
195
199
 
196
- # Extract from a local markdown file
200
+ # Extract from a local markdown file (engine='auto' by default)
197
201
  md_data = await extract_content({"file_path": "path/to/your/document.md"})
198
202
  print(md_data)
199
203
 
@@ -215,15 +219,11 @@ if __name__ == "__main__":
215
219
 
216
220
  Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
217
221
 
218
- ### Installation
219
-
220
- ```bash
221
- # Install with Docling support
222
- pip install content-core[docling]
223
- ```
224
222
 
225
223
  ### Enabling Docling
226
224
 
225
+ Docling is now tried first by default when parsing documents (via the `'auto'` engine). If you don't want to use it, you need to set engine to "simple".
226
+
227
227
  #### Via configuration file
228
228
 
229
229
  In your `cc_config.yaml` or custom config, set:
@@ -1,5 +1,7 @@
1
1
  # Content Core Processors
2
2
 
3
+ **Note:** As of v0.8, the default extraction engine is now `'auto'`. This means Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. See details below.
4
+
3
5
  This document provides an overview of the content processors available in Content Core. These processors are responsible for extracting and handling content from various sources and file types.
4
6
 
5
7
  ## Overview
@@ -19,6 +21,11 @@ Content Core uses a modular approach to process content from different sources.
19
21
  - **Supported Input**: URLs (web pages).
20
22
  - **Returned Data**: Extracted text content from the web page, often in a cleaned format.
21
23
  - **Location**: `src/content_core/processors/url.py`
24
+ - **Default Engine (`auto`) Logic**:
25
+ - If `FIRECRAWL_API_KEY` is set, uses Firecrawl for extraction.
26
+ - Otherwise, tries Jina, which may fail because of rate limits unless `JINA_API_KEY` is set.
27
+ - If Jina fails, falls back to BeautifulSoup-based extraction.
28
+ - You can explicitly specify an engine (`'firecrawl'`, `'jina'`, `'simple'`, etc.), but `'auto'` is now the default and recommended for most users.
22
29
 
23
30
  ### 3. **File Processor**
24
31
  - **Purpose**: Processes local files of various types, extracting content based on file format.
@@ -40,10 +47,14 @@ Content Core uses a modular approach to process content from different sources.
40
47
  - **Supported Input**: PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, Images (PNG, JPEG, TIFF, BMP).
41
48
  - **Returned Data**: Content converted to configured format (markdown, html, json).
42
49
  - **Location**: `src/content_core/processors/docling.py`
50
+ - **Default Engine (`auto`) Logic for Files/Documents**:
51
+ - Tries the `'docling'` extraction method first (robust document parsing for supported types).
52
+ - If `'docling'` fails or is not supported, automatically falls back to simple extraction (fast, lightweight for supported types).
53
+ - You can explicitly specify `'docling'`, `'simple'`, or `'legacy'` as the engine, but `'auto'` is now the default and recommended for most users.
43
54
  - **Configuration**: Activate the Docling engine in `cc_config.yaml` or custom config:
44
55
  ```yaml
45
56
  extraction:
46
- engine: docling # 'legacy' (default) or 'docling'
57
+ engine: docling # 'auto' (default), 'docling', or 'simple'
47
58
  docling:
48
59
  output_format: markdown # markdown | html | json
49
60
  ```
@@ -1,5 +1,7 @@
1
1
  # Using the Content Core Library
2
2
 
3
+ > **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
4
+
3
5
  This documentation explains how to configure and use the **Content Core** library in your projects. The library allows customization of AI model settings through a YAML file and environment variables.
4
6
 
5
7
  ## Environment Variable for Configuration
@@ -76,20 +78,28 @@ To simplify setup, we suggest copying the provided sample files:
76
78
 
77
79
  This will allow you to quickly start with customized settings without needing to create the files from scratch.
78
80
 
79
- ### Docling Engine
81
+ ### Extraction Engine Selection
82
+
83
+ By default, Content Core uses the `'auto'` engine for all extraction tasks. The logic is as follows:
84
+ - **For URLs**: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else Jina if `JINA_API_KEY` is set, else falls back to BeautifulSoup.
85
+ - **For files**: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
86
+
87
+ You can override this behavior by specifying an engine in your config or function call, but `'auto'` is recommended for most users.
88
+
89
+ #### Docling Engine
80
90
 
81
- Content Core supports an optional Docling engine for advanced document parsing. To enable:
91
+ Content Core supports an optional Docling engine for advanced document parsing. To enable Docling explicitly:
82
92
 
83
- #### In YAML config
93
+ ##### In YAML config
84
94
  Add under the `extraction` section:
85
95
  ```yaml
86
96
  extraction:
87
- engine: docling # legacy (default) or docling
97
+ engine: docling # auto (default), docling, or simple
88
98
  docling:
89
99
  output_format: html # markdown | html | json
90
100
  ```
91
101
 
92
- #### Programmatically in Python
102
+ ##### Programmatically in Python
93
103
  ```python
94
104
  from content_core.config import set_extraction_engine, set_docling_output_format
95
105
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "0.7.2"
3
+ version = "0.8.1"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -28,11 +28,13 @@ dependencies = [
28
28
  "validators>=0.34.0",
29
29
  "ai-prompter>=0.2.3",
30
30
  "moviepy>=2.1.2",
31
+ "readability-lxml>=0.8.4.1",
32
+ "firecrawl-py>=2.7.0",
33
+ "docling>=2.34.0",
34
+ "pillow>=10.4.0",
35
+ "asciidoc>=10.2.1",
31
36
  ]
32
37
 
33
- [project.optional-dependencies]
34
- docling = ["docling", "Pillow", "pandas", "asciidoc"]
35
-
36
38
  [project.scripts]
37
39
  ccore = "content_core:ccore"
38
40
  cclean = "content_core:cclean"
@@ -2,6 +2,9 @@ from typing import Optional
2
2
 
3
3
  from pydantic import BaseModel, Field
4
4
 
5
+ from content_core.common.types import Engine
6
+ from content_core.common.types import Engine
7
+
5
8
 
6
9
  class ProcessSourceState(BaseModel):
7
10
  file_path: Optional[str] = ""
@@ -13,8 +16,9 @@ class ProcessSourceState(BaseModel):
13
16
  identified_provider: Optional[str] = ""
14
17
  metadata: Optional[dict] = Field(default_factory=lambda: {})
15
18
  content: Optional[str] = ""
16
- engine: Optional[str] = Field(
17
- default=None, description="Override extraction engine: 'legacy' or 'docling'"
19
+ engine: Optional[Engine] = Field(
20
+ default=None,
21
+ description="Override extraction engine: 'auto', 'simple', 'legacy', 'firecrawl', 'jina', or 'docling'",
18
22
  )
19
23
  output_format: Optional[str] = Field(
20
24
  default=None,
@@ -0,0 +1,21 @@
1
+ from typing import Literal
2
+ import warnings
3
+
4
+ Engine = Literal[
5
+ "auto",
6
+ "simple",
7
+ "legacy",
8
+ "firecrawl",
9
+ "jina",
10
+ "docling",
11
+ ]
12
+
13
+ DEPRECATED_ENGINES = {"legacy": "simple"}
14
+
15
+ def warn_if_deprecated_engine(engine: str):
16
+ if engine in DEPRECATED_ENGINES:
17
+ warnings.warn(
18
+ f"Engine '{engine}' is deprecated and will be removed in a future release. Use '{DEPRECATED_ENGINES[engine]}' instead.",
19
+ DeprecationWarning,
20
+ stacklevel=2,
21
+ )
@@ -2,6 +2,7 @@ import os
2
2
  import tempfile
3
3
  from typing import Any, Dict, Optional
4
4
  from urllib.parse import urlparse
5
+ from content_core.common.types import warn_if_deprecated_engine
5
6
 
6
7
  import aiohttp
7
8
  import magic
@@ -114,14 +115,28 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
114
115
  return {"file_path": tmp, "identified_type": mime}
115
116
 
116
117
 
118
+
117
119
  async def file_type_router_docling(state: ProcessSourceState) -> str:
118
120
  """
119
- Route to Docling if enabled and supported; otherwise use legacy file type edge.
121
+ Route to Docling if enabled and supported; otherwise use simple file type edge.
122
+ Supports 'auto', 'docling', 'simple', and 'legacy' (deprecated, alias for simple).
123
+ 'auto' routes to docling when the identified file type is supported, otherwise falls back to simple.
120
124
  """
121
- # allow per-execution override of engine via state.engine
122
- engine = state.engine or CONFIG.get("extraction", {}).get("engine", "legacy")
125
+ engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
126
+ warn_if_deprecated_engine(engine)
127
+ if engine == "auto":
128
+ # Try docling first; if it fails or is not supported, fallback to simple
129
+ if state.identified_type in DOCLING_SUPPORTED:
130
+ try:
131
+ return "extract_docling"
132
+ except Exception as e:
133
+ logger.warning(f"Docling extraction failed in 'auto' mode, falling back to simple: {e}")
134
+ # Fallback to simple
135
+ return await file_type_edge(state)
136
+
123
137
  if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
124
138
  return "extract_docling"
139
+ # For 'simple' and 'legacy', use the default file type edge
125
140
  return await file_type_edge(state)
126
141
 
127
142
 
@@ -1,9 +1,10 @@
1
1
  import asyncio
2
+ import math
2
3
  import os
3
4
  import tempfile
4
- import math
5
5
  import traceback
6
6
  from functools import partial
7
+
7
8
  from moviepy import AudioFileClip
8
9
 
9
10
  from content_core.common import ProcessSourceState
@@ -64,7 +65,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
64
65
  )
65
66
 
66
67
 
67
- def extract_audio(input_file: str, output_file: str, start_time: float = None, end_time: float = None) -> None:
68
+ def extract_audio(
69
+ input_file: str, output_file: str, start_time: float = None, end_time: float = None
70
+ ) -> None:
68
71
  """
69
72
  Extract audio from a video or audio file and save it as an MP3 file.
70
73
  If start_time and end_time are provided, only that segment of audio is extracted.
@@ -78,17 +81,17 @@ def extract_audio(input_file: str, output_file: str, start_time: float = None, e
78
81
  try:
79
82
  # Load the file as an AudioFileClip
80
83
  audio_clip = AudioFileClip(input_file)
81
-
82
- # If start_time and end_time are provided, trim the audio
84
+
85
+ # If start_time and/or end_time are provided, trim the audio using subclipped
83
86
  if start_time is not None and end_time is not None:
84
- audio_clip = audio_clip.cutout(0, start_time).cutout(end_time - start_time, audio_clip.duration)
87
+ audio_clip = audio_clip.subclipped(start_time, end_time)
85
88
  elif start_time is not None:
86
- audio_clip = audio_clip.cutout(0, start_time)
89
+ audio_clip = audio_clip.subclipped(start_time)
87
90
  elif end_time is not None:
88
- audio_clip = audio_clip.cutout(end_time, audio_clip.duration)
91
+ audio_clip = audio_clip.subclipped(0, end_time)
89
92
 
90
93
  # Export the audio as MP3
91
- audio_clip.write_audiofile(output_file, codec='mp3')
94
+ audio_clip.write_audiofile(output_file, codec="mp3")
92
95
  audio_clip.close()
93
96
  except Exception as e:
94
97
  logger.error(f"Error extracting audio: {str(e)}")
@@ -117,7 +120,9 @@ async def extract_audio_data(data: ProcessSourceState):
117
120
  output_files = []
118
121
 
119
122
  if duration_s > segment_length_s:
120
- logger.info(f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments")
123
+ logger.info(
124
+ f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
125
+ )
121
126
  for i in range(math.ceil(duration_s / segment_length_s)):
122
127
  start_time = i * segment_length_s
123
128
  end_time = min((i + 1) * segment_length_s, audio.duration)
@@ -134,15 +139,18 @@ async def extract_audio_data(data: ProcessSourceState):
134
139
 
135
140
  # Transcribe audio files
136
141
  from content_core.models import ModelFactory
142
+
137
143
  speech_to_text_model = ModelFactory.get_model("speech_to_text")
138
144
  transcriptions = []
139
145
  for audio_file in output_files:
140
- transcription = await transcribe_audio_segment(audio_file, speech_to_text_model)
146
+ transcription = await transcribe_audio_segment(
147
+ audio_file, speech_to_text_model
148
+ )
141
149
  transcriptions.append(transcription)
142
150
 
143
151
  return {
144
152
  "metadata": {"audio_files": output_files},
145
- "content": " ".join(transcriptions)
153
+ "content": " ".join(transcriptions),
146
154
  }
147
155
  except Exception as e:
148
156
  logger.error(f"Error processing audio: {str(e)}")