content-core 1.2.0__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core-1.2.2/.claude/commands/pr-review.md +6 -0
- {content_core-1.2.0 → content_core-1.2.2}/PKG-INFO +22 -3
- {content_core-1.2.0 → content_core-1.2.2}/README.md +21 -2
- {content_core-1.2.0 → content_core-1.2.2}/docs/mcp.md +28 -0
- {content_core-1.2.0 → content_core-1.2.2}/docs/processors.md +5 -4
- {content_core-1.2.0 → content_core-1.2.2}/docs/usage.md +14 -1
- {content_core-1.2.0 → content_core-1.2.2}/pyproject.toml +1 -1
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/config.py +37 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/extraction/graph.py +33 -21
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/processors/docling.py +13 -6
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/processors/url.py +5 -4
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/processors/youtube.py +4 -3
- content_core-1.2.2/tests/unit/test_config.py +109 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/unit/test_docling.py +4 -1
- {content_core-1.2.0 → content_core-1.2.2}/uv.lock +1 -1
- {content_core-1.2.0 → content_core-1.2.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/.github/workflows/claude-code-review.yml +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/.github/workflows/claude.yml +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/.github/workflows/publish.yml +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/.gitignore +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/.python-version +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/CONTRIBUTING.md +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/LICENSE +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/Makefile +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/docs/macos.md +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/docs/raycast.md +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/mcp.md +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/new_pdf.pdf +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/prompts/content/summarize.jinja +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/.eslintrc.json +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/CHANGELOG.md +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/README.md +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/assets/command-icon.png +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/package-lock.json +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/package.json +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/raycast-env.d.ts +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/src/extract-content.tsx +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/src/quick-extract.tsx +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/src/summarize-content.tsx +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/src/utils/content-core.ts +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/src/utils/types.ts +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/raycast-content-core/tsconfig.json +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/common/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/common/state.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/common/types.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/common/utils.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/summary/core.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/logging.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/mcp/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/mcp/server.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/models.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/models_config.yaml +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/notebooks/urls.ipynb +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/processors/audio.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/processors/office.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/processors/text.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/processors/video.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/py.typed +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/templated_message.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/tools/extract.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/test.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.docx +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.epub +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.md +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.mp3 +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.mp4 +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.pdf +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.pptx +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.txt +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file.xlsx +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/integration/test_cli.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/integration/test_extraction.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/unit/test_mcp_server.py +0 -0
- {content_core-1.2.0 → content_core-1.2.2}/tests/unit/test_pymupdf_ocr.py +0 -0
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
|
|
2
|
+
When I ask you to do a PR, you will commit the necessary files and open up a pull request. As soon as you open the pull request, an automatic review process will start. The review process will provide feedback on the code quality and best practices, potential bugs or issues, performance considerations, security concerns, and test coverage.
|
|
3
|
+
|
|
4
|
+
You should query the PR after opening it to get the comments from the review tool. Then, you will assess the comments and propose to me which of them we need to address.
|
|
5
|
+
|
|
6
|
+
Then, we'll make the changes, commit them and add a comment back to the PR so that the review tool understands what we changed and what we ignored.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.2
|
|
4
4
|
Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -112,11 +112,17 @@ summary = await cc.summarize_content(result, context="explain to a child")
|
|
|
112
112
|
Install Content Core using `pip`:
|
|
113
113
|
|
|
114
114
|
```bash
|
|
115
|
-
#
|
|
115
|
+
# Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
|
|
116
116
|
pip install content-core
|
|
117
117
|
|
|
118
|
-
#
|
|
118
|
+
# With enhanced document processing (adds Docling)
|
|
119
|
+
pip install content-core[docling]
|
|
120
|
+
|
|
121
|
+
# With MCP server support
|
|
119
122
|
pip install content-core[mcp]
|
|
123
|
+
|
|
124
|
+
# Full installation
|
|
125
|
+
pip install content-core[docling,mcp]
|
|
120
126
|
```
|
|
121
127
|
|
|
122
128
|
Alternatively, if you’re developing locally:
|
|
@@ -526,8 +532,21 @@ Example `.env`:
|
|
|
526
532
|
```plaintext
|
|
527
533
|
OPENAI_API_KEY=your-key-here
|
|
528
534
|
GOOGLE_API_KEY=your-key-here
|
|
535
|
+
|
|
536
|
+
# Engine Selection (optional)
|
|
537
|
+
CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
|
|
538
|
+
CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
|
|
529
539
|
```
|
|
530
540
|
|
|
541
|
+
### Engine Selection via Environment Variables
|
|
542
|
+
|
|
543
|
+
For deployment scenarios like MCP servers or Raycast extensions, you can override the extraction engines using environment variables:
|
|
544
|
+
|
|
545
|
+
- **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
|
|
546
|
+
- **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
|
|
547
|
+
|
|
548
|
+
These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
|
|
549
|
+
|
|
531
550
|
### Custom Prompt Templates
|
|
532
551
|
|
|
533
552
|
Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
|
|
@@ -74,11 +74,17 @@ summary = await cc.summarize_content(result, context="explain to a child")
|
|
|
74
74
|
Install Content Core using `pip`:
|
|
75
75
|
|
|
76
76
|
```bash
|
|
77
|
-
#
|
|
77
|
+
# Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
|
|
78
78
|
pip install content-core
|
|
79
79
|
|
|
80
|
-
#
|
|
80
|
+
# With enhanced document processing (adds Docling)
|
|
81
|
+
pip install content-core[docling]
|
|
82
|
+
|
|
83
|
+
# With MCP server support
|
|
81
84
|
pip install content-core[mcp]
|
|
85
|
+
|
|
86
|
+
# Full installation
|
|
87
|
+
pip install content-core[docling,mcp]
|
|
82
88
|
```
|
|
83
89
|
|
|
84
90
|
Alternatively, if you’re developing locally:
|
|
@@ -488,8 +494,21 @@ Example `.env`:
|
|
|
488
494
|
```plaintext
|
|
489
495
|
OPENAI_API_KEY=your-key-here
|
|
490
496
|
GOOGLE_API_KEY=your-key-here
|
|
497
|
+
|
|
498
|
+
# Engine Selection (optional)
|
|
499
|
+
CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
|
|
500
|
+
CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
|
|
491
501
|
```
|
|
492
502
|
|
|
503
|
+
### Engine Selection via Environment Variables
|
|
504
|
+
|
|
505
|
+
For deployment scenarios like MCP servers or Raycast extensions, you can override the extraction engines using environment variables:
|
|
506
|
+
|
|
507
|
+
- **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
|
|
508
|
+
- **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
|
|
509
|
+
|
|
510
|
+
These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
|
|
511
|
+
|
|
493
512
|
### Custom Prompt Templates
|
|
494
513
|
|
|
495
514
|
Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
|
|
@@ -292,6 +292,34 @@ export GOOGLE_API_KEY="your-google-key"
|
|
|
292
292
|
- **Firecrawl**: Visit [Firecrawl](https://www.firecrawl.dev/) for enhanced web scraping
|
|
293
293
|
- **Jina**: Visit [Jina AI](https://jina.ai/) for alternative web extraction
|
|
294
294
|
|
|
295
|
+
### Engine Selection via Environment Variables
|
|
296
|
+
|
|
297
|
+
For advanced users, you can override the extraction engines:
|
|
298
|
+
|
|
299
|
+
```json
|
|
300
|
+
{
|
|
301
|
+
"mcpServers": {
|
|
302
|
+
"content-core": {
|
|
303
|
+
"env": {
|
|
304
|
+
"OPENAI_API_KEY": "sk-...",
|
|
305
|
+
"FIRECRAWL_API_KEY": "fc-...",
|
|
306
|
+
"CCORE_DOCUMENT_ENGINE": "simple", // Skip docling, use PyMuPDF
|
|
307
|
+
"CCORE_URL_ENGINE": "auto" // Or firecrawl, jina
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
**Available engines:**
|
|
315
|
+
- **Document**: `auto`, `simple`, `docling` (requires `content-core[docling]`)
|
|
316
|
+
- **URL**: `auto`, `simple`, `firecrawl`, `jina`
|
|
317
|
+
|
|
318
|
+
**Use cases:**
|
|
319
|
+
- Set `CCORE_DOCUMENT_ENGINE=simple` to avoid docling dependency issues
|
|
320
|
+
- Set `CCORE_URL_ENGINE=firecrawl` to always use paid service for better reliability
|
|
321
|
+
- Set `CCORE_URL_ENGINE=simple` for faster processing without external API calls
|
|
322
|
+
|
|
295
323
|
### Custom Prompts
|
|
296
324
|
|
|
297
325
|
You can customize Content Core's behavior by setting a custom prompt path:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Content Core Processors
|
|
2
2
|
|
|
3
|
-
**Note:** As of vNEXT, the default extraction engine is now `'auto'`. This means Content Core will automatically select the best extraction method based on your environment and available
|
|
3
|
+
**Note:** As of vNEXT, the default extraction engine is now `'auto'`. This means Content Core will automatically select the best extraction method based on your environment and available packages, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first (if installed with `pip install content-core[docling]`), then falls back to enhanced PyMuPDF extraction (with quality flags and table detection), then to basic simple extraction. See details below.
|
|
4
4
|
|
|
5
5
|
This document provides an overview of the content processors available in Content Core. These processors are responsible for extracting and handling content from various sources and file types.
|
|
6
6
|
|
|
@@ -62,14 +62,15 @@ Content Core uses a modular approach to process content from different sources.
|
|
|
62
62
|
```
|
|
63
63
|
- **Performance**: Standard extraction maintains baseline performance; OCR only triggers selectively on formula-heavy pages
|
|
64
64
|
|
|
65
|
-
### 6. **Docling Processor**
|
|
65
|
+
### 6. **Docling Processor (Optional)**
|
|
66
66
|
- **Purpose**: Use Docling library for rich document parsing (PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, images).
|
|
67
|
+
- **Installation**: Requires `pip install content-core[docling]`
|
|
67
68
|
- **Supported Input**: PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, Images (PNG, JPEG, TIFF, BMP).
|
|
68
69
|
- **Returned Data**: Content converted to configured format (markdown, html, json).
|
|
69
70
|
- **Location**: `src/content_core/processors/docling.py`
|
|
70
71
|
- **Default Document Engine (`auto`) Logic for Files/Documents**:
|
|
71
|
-
- Tries the `'docling'` extraction method first (
|
|
72
|
-
- If `'docling'`
|
|
72
|
+
- Tries the `'docling'` extraction method first (if installed with `content-core[docling]`).
|
|
73
|
+
- If `'docling'` is not installed or fails, automatically falls back to enhanced PyMuPDF extraction (fast, with quality flags and table detection).
|
|
73
74
|
- Final fallback to basic simple extraction if needed.
|
|
74
75
|
- You can explicitly specify `'docling'` or `'simple'` as the document engine, but `'auto'` is now the default and recommended for most users.
|
|
75
76
|
- **Configuration**: Activate the Docling engine in `cc_config.yaml` or custom config:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Using the Content Core Library
|
|
2
2
|
|
|
3
|
-
> **Note:** As of vNEXT, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available
|
|
3
|
+
> **Note:** As of vNEXT, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available packages, with a smart fallback order for both URLs and files. For files/documents, `'auto'` tries Docling first (if installed with `pip install content-core[docling]`), then falls back to enhanced PyMuPDF extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
|
|
4
4
|
|
|
5
5
|
This documentation explains how to configure and use the **Content Core** library in your projects. The library allows customization of AI model settings through a YAML file and environment variables.
|
|
6
6
|
|
|
@@ -12,8 +12,21 @@ To set the environment variable, add the following line to your `.env` file or s
|
|
|
12
12
|
|
|
13
13
|
```
|
|
14
14
|
CCORE_MODEL_CONFIG_PATH=/path/to/your/models_config.yaml
|
|
15
|
+
|
|
16
|
+
# Optional: Override extraction engines
|
|
17
|
+
CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
|
|
18
|
+
CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
|
|
15
19
|
```
|
|
16
20
|
|
|
21
|
+
### Engine Selection Environment Variables
|
|
22
|
+
|
|
23
|
+
Content Core supports environment variable overrides for extraction engines, useful for deployment scenarios:
|
|
24
|
+
|
|
25
|
+
- **`CCORE_DOCUMENT_ENGINE`**: Override document engine (`auto`, `simple`, `docling`)
|
|
26
|
+
- **`CCORE_URL_ENGINE`**: Override URL engine (`auto`, `simple`, `firecrawl`, `jina`)
|
|
27
|
+
|
|
28
|
+
These environment variables take precedence over configuration file settings. An engine passed explicitly on a per-call basis still takes precedence over the environment variables.
|
|
29
|
+
|
|
17
30
|
## YAML File Schema
|
|
18
31
|
|
|
19
32
|
The YAML configuration file defines the AI models that the library will use. The structure of the file is as follows:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "1.2.
|
|
3
|
+
version = "1.2.2"
|
|
4
4
|
description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -6,6 +6,10 @@ from dotenv import load_dotenv
|
|
|
6
6
|
# Load environment variables from .env file
|
|
7
7
|
load_dotenv()
|
|
8
8
|
|
|
9
|
+
# Allowed engine values for validation
|
|
10
|
+
ALLOWED_DOCUMENT_ENGINES = {"auto", "simple", "docling"}
|
|
11
|
+
ALLOWED_URL_ENGINES = {"auto", "simple", "firecrawl", "jina"}
|
|
12
|
+
|
|
9
13
|
|
|
10
14
|
def load_config():
|
|
11
15
|
config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
|
|
@@ -33,6 +37,39 @@ def load_config():
|
|
|
33
37
|
|
|
34
38
|
CONFIG = load_config()
|
|
35
39
|
|
|
40
|
+
# Environment variable engine selectors for MCP/Raycast users
|
|
41
|
+
def get_document_engine():
|
|
42
|
+
"""Get document engine with environment variable override and validation."""
|
|
43
|
+
env_engine = os.environ.get("CCORE_DOCUMENT_ENGINE")
|
|
44
|
+
if env_engine:
|
|
45
|
+
if env_engine not in ALLOWED_DOCUMENT_ENGINES:
|
|
46
|
+
# Import logger here to avoid circular imports
|
|
47
|
+
from content_core.logging import logger
|
|
48
|
+
logger.warning(
|
|
49
|
+
f"Invalid CCORE_DOCUMENT_ENGINE: '{env_engine}'. "
|
|
50
|
+
f"Allowed values: {', '.join(sorted(ALLOWED_DOCUMENT_ENGINES))}. "
|
|
51
|
+
f"Using default from config."
|
|
52
|
+
)
|
|
53
|
+
return CONFIG.get("extraction", {}).get("document_engine", "auto")
|
|
54
|
+
return env_engine
|
|
55
|
+
return CONFIG.get("extraction", {}).get("document_engine", "auto")
|
|
56
|
+
|
|
57
|
+
def get_url_engine():
|
|
58
|
+
"""Get URL engine with environment variable override and validation."""
|
|
59
|
+
env_engine = os.environ.get("CCORE_URL_ENGINE")
|
|
60
|
+
if env_engine:
|
|
61
|
+
if env_engine not in ALLOWED_URL_ENGINES:
|
|
62
|
+
# Import logger here to avoid circular imports
|
|
63
|
+
from content_core.logging import logger
|
|
64
|
+
logger.warning(
|
|
65
|
+
f"Invalid CCORE_URL_ENGINE: '{env_engine}'. "
|
|
66
|
+
f"Allowed values: {', '.join(sorted(ALLOWED_URL_ENGINES))}. "
|
|
67
|
+
f"Using default from config."
|
|
68
|
+
)
|
|
69
|
+
return CONFIG.get("extraction", {}).get("url_engine", "auto")
|
|
70
|
+
return env_engine
|
|
71
|
+
return CONFIG.get("extraction", {}).get("url_engine", "auto")
|
|
72
|
+
|
|
36
73
|
# Programmatic config overrides: use in notebooks or scripts
|
|
37
74
|
def set_document_engine(engine: str):
|
|
38
75
|
"""Override the document extraction engine ('auto', 'simple', or 'docling')."""
|
|
@@ -12,13 +12,19 @@ from content_core.common import (
|
|
|
12
12
|
ProcessSourceState,
|
|
13
13
|
UnsupportedTypeException,
|
|
14
14
|
)
|
|
15
|
-
from content_core.config import
|
|
15
|
+
from content_core.config import get_document_engine
|
|
16
16
|
from content_core.logging import logger
|
|
17
17
|
from content_core.processors.audio import extract_audio_data # type: ignore
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
try:
|
|
19
|
+
from content_core.processors.docling import (
|
|
20
|
+
DOCLING_SUPPORTED, # type: ignore
|
|
21
|
+
extract_with_docling,
|
|
22
|
+
DOCLING_AVAILABLE,
|
|
23
|
+
)
|
|
24
|
+
except ImportError:
|
|
25
|
+
DOCLING_AVAILABLE = False
|
|
26
|
+
DOCLING_SUPPORTED = set()
|
|
27
|
+
extract_with_docling = None
|
|
22
28
|
from content_core.processors.office import (
|
|
23
29
|
SUPPORTED_OFFICE_TYPES,
|
|
24
30
|
extract_office_content,
|
|
@@ -126,26 +132,30 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
|
|
|
126
132
|
Supports 'auto', 'docling', and 'simple'.
|
|
127
133
|
'auto' tries docling first, then falls back to simple if docling fails.
|
|
128
134
|
"""
|
|
129
|
-
|
|
135
|
+
# Use environment-aware engine selection
|
|
136
|
+
engine = state.document_engine or get_document_engine()
|
|
137
|
+
|
|
130
138
|
if engine == "auto":
|
|
131
139
|
logger.debug("Using auto engine")
|
|
132
|
-
#
|
|
133
|
-
if state.identified_type in DOCLING_SUPPORTED:
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
return "extract_docling"
|
|
137
|
-
except Exception as e:
|
|
138
|
-
logger.warning(
|
|
139
|
-
f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
|
|
140
|
-
)
|
|
140
|
+
# Check if docling is available AND supports the file type
|
|
141
|
+
if DOCLING_AVAILABLE and state.identified_type in DOCLING_SUPPORTED:
|
|
142
|
+
logger.debug("Using docling extraction (auto mode)")
|
|
143
|
+
return "extract_docling"
|
|
141
144
|
# Fallback to simple
|
|
142
|
-
logger.debug("Falling back to simple extraction")
|
|
145
|
+
logger.debug("Falling back to simple extraction (docling unavailable or unsupported)")
|
|
143
146
|
return await file_type_edge(state)
|
|
144
147
|
|
|
145
|
-
if engine == "docling"
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
148
|
+
if engine == "docling":
|
|
149
|
+
if not DOCLING_AVAILABLE:
|
|
150
|
+
raise ImportError("Docling engine requested but docling package not installed. Install with: pip install content-core[docling]")
|
|
151
|
+
if state.identified_type in DOCLING_SUPPORTED:
|
|
152
|
+
logger.debug("Using docling engine")
|
|
153
|
+
return "extract_docling"
|
|
154
|
+
# If docling doesn't support this file type, fall back to simple
|
|
155
|
+
logger.debug("Docling doesn't support this file type, using simple engine")
|
|
156
|
+
return await file_type_edge(state)
|
|
157
|
+
|
|
158
|
+
# For 'simple' or any other engine
|
|
149
159
|
logger.debug("Using simple engine")
|
|
150
160
|
return await file_type_edge(state)
|
|
151
161
|
|
|
@@ -168,7 +178,9 @@ workflow.add_node("extract_audio_data", extract_audio_data)
|
|
|
168
178
|
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
|
|
169
179
|
workflow.add_node("delete_file", delete_file)
|
|
170
180
|
workflow.add_node("download_remote_file", download_remote_file)
|
|
171
|
-
|
|
181
|
+
# Only add docling node if available
|
|
182
|
+
if DOCLING_AVAILABLE:
|
|
183
|
+
workflow.add_node("extract_docling", extract_with_docling)
|
|
172
184
|
|
|
173
185
|
# Add edges
|
|
174
186
|
workflow.add_edge(START, "source")
|
|
@@ -2,22 +2,29 @@
|
|
|
2
2
|
Docling-based document extraction processor.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
from content_core.common.state import ProcessSourceState
|
|
6
|
+
from content_core.config import CONFIG
|
|
7
|
+
|
|
8
|
+
DOCLING_AVAILABLE = False
|
|
5
9
|
try:
|
|
6
10
|
from docling.document_converter import DocumentConverter
|
|
11
|
+
DOCLING_AVAILABLE = True
|
|
7
12
|
except ImportError:
|
|
8
13
|
|
|
9
14
|
class DocumentConverter:
|
|
10
15
|
"""Stub when docling is not installed."""
|
|
11
16
|
|
|
12
17
|
def __init__(self):
|
|
13
|
-
raise ImportError(
|
|
18
|
+
raise ImportError(
|
|
19
|
+
"Docling not installed. Install with: pip install content-core[docling] "
|
|
20
|
+
"or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
|
|
21
|
+
)
|
|
14
22
|
|
|
15
23
|
def convert(self, source: str):
|
|
16
|
-
raise ImportError(
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
from content_core.config import CONFIG
|
|
24
|
+
raise ImportError(
|
|
25
|
+
"Docling not installed. Install with: pip install content-core[docling] "
|
|
26
|
+
"or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
|
|
27
|
+
)
|
|
21
28
|
|
|
22
29
|
# Supported MIME types for Docling extraction
|
|
23
30
|
DOCLING_SUPPORTED = {
|
|
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
|
|
5
5
|
from readability import Document
|
|
6
6
|
|
|
7
7
|
from content_core.common import ProcessSourceState
|
|
8
|
-
from content_core.config import
|
|
8
|
+
from content_core.config import get_url_engine
|
|
9
9
|
from content_core.logging import logger
|
|
10
10
|
from content_core.processors.docling import DOCLING_SUPPORTED
|
|
11
11
|
from content_core.processors.office import SUPPORTED_OFFICE_TYPES
|
|
@@ -38,10 +38,10 @@ async def url_provider(state: ProcessSourceState):
|
|
|
38
38
|
or mime in SUPPORTED_FITZ_TYPES
|
|
39
39
|
or mime in SUPPORTED_OFFICE_TYPES
|
|
40
40
|
):
|
|
41
|
-
logger.
|
|
41
|
+
logger.debug(f"Identified type for {url}: {mime}")
|
|
42
42
|
return_dict["identified_type"] = mime
|
|
43
43
|
else:
|
|
44
|
-
logger.
|
|
44
|
+
logger.debug(f"Identified type for {url}: article")
|
|
45
45
|
return_dict["identified_type"] = "article"
|
|
46
46
|
return return_dict
|
|
47
47
|
|
|
@@ -165,7 +165,8 @@ async def extract_url(state: ProcessSourceState):
|
|
|
165
165
|
"""
|
|
166
166
|
assert state.url, "No URL provided"
|
|
167
167
|
url = state.url
|
|
168
|
-
|
|
168
|
+
# Use environment-aware engine selection
|
|
169
|
+
engine = state.url_engine or get_url_engine()
|
|
169
170
|
try:
|
|
170
171
|
if engine == "auto":
|
|
171
172
|
if os.environ.get("FIRECRAWL_API_KEY"):
|
|
@@ -3,12 +3,13 @@ import ssl
|
|
|
3
3
|
|
|
4
4
|
import aiohttp
|
|
5
5
|
from bs4 import BeautifulSoup
|
|
6
|
+
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
7
|
+
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
8
|
+
|
|
6
9
|
from content_core.common import ProcessSourceState
|
|
7
10
|
from content_core.common.exceptions import NoTranscriptFound
|
|
8
11
|
from content_core.config import CONFIG
|
|
9
12
|
from content_core.logging import logger
|
|
10
|
-
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
11
|
-
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
12
13
|
|
|
13
14
|
ssl._create_default_https_context = ssl._create_unverified_context
|
|
14
15
|
|
|
@@ -172,7 +173,7 @@ async def extract_youtube_transcript(state: ProcessSourceState):
|
|
|
172
173
|
"""
|
|
173
174
|
|
|
174
175
|
assert state.url, "No URL provided"
|
|
175
|
-
logger.
|
|
176
|
+
logger.debug(f"Extracting transcript from URL: {state.url}")
|
|
176
177
|
languages = CONFIG.get("youtube_transcripts", {}).get(
|
|
177
178
|
"preferred_languages", ["en", "es", "pt"]
|
|
178
179
|
)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Tests for configuration functions and environment variable handling."""
|
|
2
|
+
import pytest
|
|
3
|
+
from unittest.mock import patch, MagicMock
|
|
4
|
+
from content_core.config import (
|
|
5
|
+
get_document_engine,
|
|
6
|
+
get_url_engine,
|
|
7
|
+
ALLOWED_DOCUMENT_ENGINES,
|
|
8
|
+
ALLOWED_URL_ENGINES,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TestDocumentEngineSelection:
|
|
13
|
+
"""Test document engine selection with environment variables."""
|
|
14
|
+
|
|
15
|
+
def test_default_document_engine(self):
|
|
16
|
+
"""Test default document engine when no env var is set."""
|
|
17
|
+
with patch.dict('os.environ', {}, clear=False):
|
|
18
|
+
# Remove the env var if it exists
|
|
19
|
+
if 'CCORE_DOCUMENT_ENGINE' in __import__('os').environ:
|
|
20
|
+
del __import__('os').environ['CCORE_DOCUMENT_ENGINE']
|
|
21
|
+
engine = get_document_engine()
|
|
22
|
+
assert engine == "auto" # Default from config
|
|
23
|
+
|
|
24
|
+
def test_valid_document_engine_env_var(self):
|
|
25
|
+
"""Test valid document engine environment variable override."""
|
|
26
|
+
for engine in ALLOWED_DOCUMENT_ENGINES:
|
|
27
|
+
with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': engine}):
|
|
28
|
+
assert get_document_engine() == engine
|
|
29
|
+
|
|
30
|
+
def test_invalid_document_engine_env_var(self):
|
|
31
|
+
"""Test invalid document engine environment variable falls back to default."""
|
|
32
|
+
with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': 'invalid_engine'}):
|
|
33
|
+
engine = get_document_engine()
|
|
34
|
+
assert engine == "auto" # Should fallback to default
|
|
35
|
+
|
|
36
|
+
def test_case_sensitive_document_engine_env_var(self):
|
|
37
|
+
"""Test that document engine environment variable is case sensitive."""
|
|
38
|
+
with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': 'AUTO'}): # uppercase
|
|
39
|
+
engine = get_document_engine()
|
|
40
|
+
assert engine == "auto" # Should fallback to default
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TestUrlEngineSelection:
|
|
44
|
+
"""Test URL engine selection with environment variables."""
|
|
45
|
+
|
|
46
|
+
def test_default_url_engine(self):
|
|
47
|
+
"""Test default URL engine when no env var is set."""
|
|
48
|
+
with patch.dict('os.environ', {}, clear=False):
|
|
49
|
+
# Remove the env var if it exists
|
|
50
|
+
if 'CCORE_URL_ENGINE' in __import__('os').environ:
|
|
51
|
+
del __import__('os').environ['CCORE_URL_ENGINE']
|
|
52
|
+
engine = get_url_engine()
|
|
53
|
+
assert engine == "auto" # Default from config
|
|
54
|
+
|
|
55
|
+
def test_valid_url_engine_env_var(self):
|
|
56
|
+
"""Test valid URL engine environment variable override."""
|
|
57
|
+
for engine in ALLOWED_URL_ENGINES:
|
|
58
|
+
with patch.dict('os.environ', {'CCORE_URL_ENGINE': engine}):
|
|
59
|
+
assert get_url_engine() == engine
|
|
60
|
+
|
|
61
|
+
def test_invalid_url_engine_env_var(self):
|
|
62
|
+
"""Test invalid URL engine environment variable falls back to default."""
|
|
63
|
+
with patch.dict('os.environ', {'CCORE_URL_ENGINE': 'invalid_engine'}):
|
|
64
|
+
engine = get_url_engine()
|
|
65
|
+
assert engine == "auto" # Should fallback to default
|
|
66
|
+
|
|
67
|
+
def test_case_sensitive_url_engine_env_var(self):
|
|
68
|
+
"""Test that URL engine environment variable is case sensitive."""
|
|
69
|
+
with patch.dict('os.environ', {'CCORE_URL_ENGINE': 'FIRECRAWL'}): # uppercase
|
|
70
|
+
engine = get_url_engine()
|
|
71
|
+
assert engine == "auto" # Should fallback to default
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class TestEngineConstants:
|
|
75
|
+
"""Test that engine constants contain expected values."""
|
|
76
|
+
|
|
77
|
+
def test_document_engine_constants(self):
|
|
78
|
+
"""Test document engine allowed values."""
|
|
79
|
+
expected = {"auto", "simple", "docling"}
|
|
80
|
+
assert ALLOWED_DOCUMENT_ENGINES == expected
|
|
81
|
+
|
|
82
|
+
def test_url_engine_constants(self):
|
|
83
|
+
"""Test URL engine allowed values."""
|
|
84
|
+
expected = {"auto", "simple", "firecrawl", "jina"}
|
|
85
|
+
assert ALLOWED_URL_ENGINES == expected
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class TestEdgeCases:
|
|
89
|
+
"""Test edge cases and error conditions."""
|
|
90
|
+
|
|
91
|
+
def test_empty_string_document_engine(self):
|
|
92
|
+
"""Test empty string for document engine env var."""
|
|
93
|
+
with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': ''}):
|
|
94
|
+
# Empty string should be falsy and use default
|
|
95
|
+
engine = get_document_engine()
|
|
96
|
+
assert engine == "auto"
|
|
97
|
+
|
|
98
|
+
def test_empty_string_url_engine(self):
|
|
99
|
+
"""Test empty string for URL engine env var."""
|
|
100
|
+
with patch.dict('os.environ', {'CCORE_URL_ENGINE': ''}):
|
|
101
|
+
# Empty string should be falsy and use default
|
|
102
|
+
engine = get_url_engine()
|
|
103
|
+
assert engine == "auto"
|
|
104
|
+
|
|
105
|
+
def test_whitespace_engine_values(self):
|
|
106
|
+
"""Test whitespace in engine values are treated as invalid."""
|
|
107
|
+
with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': ' auto '}):
|
|
108
|
+
engine = get_document_engine()
|
|
109
|
+
assert engine == "auto" # Should fallback to default
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import pytest
|
|
3
3
|
from types import SimpleNamespace
|
|
4
|
-
from content_core.processors.docling import extract_with_docling
|
|
4
|
+
from content_core.processors.docling import extract_with_docling, DOCLING_AVAILABLE
|
|
5
5
|
from content_core.common.state import ProcessSourceState
|
|
6
6
|
|
|
7
7
|
class DummyDoc:
|
|
@@ -31,6 +31,7 @@ def patch_converter(monkeypatch):
|
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
@pytest.mark.asyncio
|
|
34
|
+
@pytest.mark.skipif(not DOCLING_AVAILABLE, reason="Docling not installed")
|
|
34
35
|
async def test_extract_file(tmp_path):
|
|
35
36
|
# File input with explicit markdown format
|
|
36
37
|
fp = tmp_path / "test.txt"
|
|
@@ -40,6 +41,7 @@ async def test_extract_file(tmp_path):
|
|
|
40
41
|
assert new_state.content == "md:file:" + str(fp)
|
|
41
42
|
|
|
42
43
|
@pytest.mark.asyncio
|
|
44
|
+
@pytest.mark.skipif(not DOCLING_AVAILABLE, reason="Docling not installed")
|
|
43
45
|
async def test_extract_block_html():
|
|
44
46
|
# Block input with HTML format
|
|
45
47
|
state = ProcessSourceState(content="block content", metadata={"docling_format": "html"})
|
|
@@ -47,6 +49,7 @@ async def test_extract_block_html():
|
|
|
47
49
|
assert new_state.content == "<p>blk:block content</p>"
|
|
48
50
|
|
|
49
51
|
@pytest.mark.asyncio
|
|
52
|
+
@pytest.mark.skipif(not DOCLING_AVAILABLE, reason="Docling not installed")
|
|
50
53
|
async def test_default_to_markdown():
|
|
51
54
|
# Default format should fallback to markdown
|
|
52
55
|
state = ProcessSourceState(content="plain text")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{content_core-1.2.0 → content_core-1.2.2}/src/content_core/content/identification/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|