content-core 1.2.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (89)
  1. {content_core-1.2.0 → content_core-1.2.1}/PKG-INFO +22 -3
  2. {content_core-1.2.0 → content_core-1.2.1}/README.md +21 -2
  3. {content_core-1.2.0 → content_core-1.2.1}/docs/mcp.md +28 -0
  4. {content_core-1.2.0 → content_core-1.2.1}/docs/processors.md +5 -4
  5. {content_core-1.2.0 → content_core-1.2.1}/docs/usage.md +14 -1
  6. {content_core-1.2.0 → content_core-1.2.1}/pyproject.toml +1 -1
  7. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/config.py +37 -0
  8. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/content/extraction/graph.py +33 -21
  9. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/processors/docling.py +13 -6
  10. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/processors/url.py +3 -2
  11. content_core-1.2.1/tests/unit/test_config.py +109 -0
  12. {content_core-1.2.0 → content_core-1.2.1}/tests/unit/test_docling.py +4 -1
  13. {content_core-1.2.0 → content_core-1.2.1}/uv.lock +1 -1
  14. {content_core-1.2.0 → content_core-1.2.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  15. {content_core-1.2.0 → content_core-1.2.1}/.github/workflows/claude-code-review.yml +0 -0
  16. {content_core-1.2.0 → content_core-1.2.1}/.github/workflows/claude.yml +0 -0
  17. {content_core-1.2.0 → content_core-1.2.1}/.github/workflows/publish.yml +0 -0
  18. {content_core-1.2.0 → content_core-1.2.1}/.gitignore +0 -0
  19. {content_core-1.2.0 → content_core-1.2.1}/.python-version +0 -0
  20. {content_core-1.2.0 → content_core-1.2.1}/CONTRIBUTING.md +0 -0
  21. {content_core-1.2.0 → content_core-1.2.1}/LICENSE +0 -0
  22. {content_core-1.2.0 → content_core-1.2.1}/Makefile +0 -0
  23. {content_core-1.2.0 → content_core-1.2.1}/docs/macos.md +0 -0
  24. {content_core-1.2.0 → content_core-1.2.1}/docs/raycast.md +0 -0
  25. {content_core-1.2.0 → content_core-1.2.1}/mcp.md +0 -0
  26. {content_core-1.2.0 → content_core-1.2.1}/new_pdf.pdf +0 -0
  27. {content_core-1.2.0 → content_core-1.2.1}/prompts/content/cleanup.jinja +0 -0
  28. {content_core-1.2.0 → content_core-1.2.1}/prompts/content/summarize.jinja +0 -0
  29. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/.eslintrc.json +0 -0
  30. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/CHANGELOG.md +0 -0
  31. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/README.md +0 -0
  32. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/assets/command-icon.png +0 -0
  33. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/package-lock.json +0 -0
  34. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/package.json +0 -0
  35. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/raycast-env.d.ts +0 -0
  36. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/src/extract-content.tsx +0 -0
  37. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/src/quick-extract.tsx +0 -0
  38. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/src/summarize-content.tsx +0 -0
  39. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/src/utils/content-core.ts +0 -0
  40. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/src/utils/types.ts +0 -0
  41. {content_core-1.2.0 → content_core-1.2.1}/raycast-content-core/tsconfig.json +0 -0
  42. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/__init__.py +0 -0
  43. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/cc_config.yaml +0 -0
  44. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/common/__init__.py +0 -0
  45. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/common/exceptions.py +0 -0
  46. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/common/state.py +0 -0
  47. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/common/types.py +0 -0
  48. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/common/utils.py +0 -0
  49. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/content/__init__.py +0 -0
  50. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/content/cleanup/__init__.py +0 -0
  51. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/content/cleanup/core.py +0 -0
  52. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/content/extraction/__init__.py +0 -0
  53. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/content/identification/__init__.py +0 -0
  54. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/content/summary/__init__.py +0 -0
  55. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/content/summary/core.py +0 -0
  56. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/logging.py +0 -0
  57. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/mcp/__init__.py +0 -0
  58. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/mcp/server.py +0 -0
  59. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/models.py +0 -0
  60. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/models_config.yaml +0 -0
  61. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/notebooks/run.ipynb +0 -0
  62. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/notebooks/urls.ipynb +0 -0
  63. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/processors/audio.py +0 -0
  64. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/processors/office.py +0 -0
  65. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/processors/pdf.py +0 -0
  66. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/processors/text.py +0 -0
  67. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/processors/video.py +0 -0
  68. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/processors/youtube.py +0 -0
  69. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/py.typed +0 -0
  70. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/templated_message.py +0 -0
  71. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/tools/__init__.py +0 -0
  72. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/tools/cleanup.py +0 -0
  73. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/tools/extract.py +0 -0
  74. {content_core-1.2.0 → content_core-1.2.1}/src/content_core/tools/summarize.py +0 -0
  75. {content_core-1.2.0 → content_core-1.2.1}/test.py +0 -0
  76. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.docx +0 -0
  77. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.epub +0 -0
  78. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.md +0 -0
  79. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.mp3 +0 -0
  80. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.mp4 +0 -0
  81. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.pdf +0 -0
  82. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.pptx +0 -0
  83. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.txt +0 -0
  84. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file.xlsx +0 -0
  85. {content_core-1.2.0 → content_core-1.2.1}/tests/input_content/file_audio.mp3 +0 -0
  86. {content_core-1.2.0 → content_core-1.2.1}/tests/integration/test_cli.py +0 -0
  87. {content_core-1.2.0 → content_core-1.2.1}/tests/integration/test_extraction.py +0 -0
  88. {content_core-1.2.0 → content_core-1.2.1}/tests/unit/test_mcp_server.py +0 -0
  89. {content_core-1.2.0 → content_core-1.2.1}/tests/unit/test_pymupdf_ocr.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -112,11 +112,17 @@ summary = await cc.summarize_content(result, context="explain to a child")
112
112
  Install Content Core using `pip`:
113
113
 
114
114
  ```bash
115
- # Install the package
115
+ # Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
116
116
  pip install content-core
117
117
 
118
- # Install with MCP server support
118
+ # With enhanced document processing (adds Docling)
119
+ pip install content-core[docling]
120
+
121
+ # With MCP server support
119
122
  pip install content-core[mcp]
123
+
124
+ # Full installation
125
+ pip install content-core[docling,mcp]
120
126
  ```
121
127
 
122
128
  Alternatively, if you’re developing locally:
@@ -526,8 +532,21 @@ Example `.env`:
526
532
  ```plaintext
527
533
  OPENAI_API_KEY=your-key-here
528
534
  GOOGLE_API_KEY=your-key-here
535
+
536
+ # Engine Selection (optional)
537
+ CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
538
+ CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
529
539
  ```
530
540
 
541
+ ### Engine Selection via Environment Variables
542
+
543
+ For deployment scenarios like MCP servers or Raycast extensions, you can override the extraction engines using environment variables:
544
+
545
+ - **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
546
+ - **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
547
+
548
+ These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
549
+
531
550
  ### Custom Prompt Templates
532
551
 
533
552
  Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
@@ -74,11 +74,17 @@ summary = await cc.summarize_content(result, context="explain to a child")
74
74
  Install Content Core using `pip`:
75
75
 
76
76
  ```bash
77
- # Install the package
77
+ # Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
78
78
  pip install content-core
79
79
 
80
- # Install with MCP server support
80
+ # With enhanced document processing (adds Docling)
81
+ pip install content-core[docling]
82
+
83
+ # With MCP server support
81
84
  pip install content-core[mcp]
85
+
86
+ # Full installation
87
+ pip install content-core[docling,mcp]
82
88
  ```
83
89
 
84
90
  Alternatively, if you’re developing locally:
@@ -488,8 +494,21 @@ Example `.env`:
488
494
  ```plaintext
489
495
  OPENAI_API_KEY=your-key-here
490
496
  GOOGLE_API_KEY=your-key-here
497
+
498
+ # Engine Selection (optional)
499
+ CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
500
+ CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
491
501
  ```
492
502
 
503
+ ### Engine Selection via Environment Variables
504
+
505
+ For deployment scenarios like MCP servers or Raycast extensions, you can override the extraction engines using environment variables:
506
+
507
+ - **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
508
+ - **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
509
+
510
+ These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
511
+
493
512
  ### Custom Prompt Templates
494
513
 
495
514
  Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
@@ -292,6 +292,34 @@ export GOOGLE_API_KEY="your-google-key"
292
292
  - **Firecrawl**: Visit [Firecrawl](https://www.firecrawl.dev/) for enhanced web scraping
293
293
  - **Jina**: Visit [Jina AI](https://jina.ai/) for alternative web extraction
294
294
 
295
+ ### Engine Selection via Environment Variables
296
+
297
+ For advanced users, you can override the extraction engines:
298
+
299
+ ```json
300
+ {
301
+ "mcpServers": {
302
+ "content-core": {
303
+ "env": {
304
+ "OPENAI_API_KEY": "sk-...",
305
+ "FIRECRAWL_API_KEY": "fc-...",
306
+ "CCORE_DOCUMENT_ENGINE": "simple", // Skip docling, use PyMuPDF
307
+ "CCORE_URL_ENGINE": "auto" // Or firecrawl, jina
308
+ }
309
+ }
310
+ }
311
+ }
312
+ ```
313
+
314
+ **Available engines:**
315
+ - **Document**: `auto`, `simple`, `docling` (requires `content-core[docling]`)
316
+ - **URL**: `auto`, `simple`, `firecrawl`, `jina`
317
+
318
+ **Use cases:**
319
+ - Set `CCORE_DOCUMENT_ENGINE=simple` to avoid docling dependency issues
320
+ - Set `CCORE_URL_ENGINE=firecrawl` to always use paid service for better reliability
321
+ - Set `CCORE_URL_ENGINE=simple` for faster processing without external API calls
322
+
295
323
  ### Custom Prompts
296
324
 
297
325
  You can customize Content Core's behavior by setting a custom prompt path:
@@ -1,6 +1,6 @@
1
1
  # Content Core Processors
2
2
 
3
- **Note:** As of vNEXT, the default extraction engine is now `'auto'`. This means Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to enhanced PyMuPDF extraction (with quality flags and table detection), then to basic simple extraction. See details below.
3
+ **Note:** As of vNEXT, the default extraction engine is now `'auto'`. This means Content Core will automatically select the best extraction method based on your environment and available packages, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first (if installed with `pip install content-core[docling]`), then falls back to enhanced PyMuPDF extraction (with quality flags and table detection), then to basic simple extraction. See details below.
4
4
 
5
5
  This document provides an overview of the content processors available in Content Core. These processors are responsible for extracting and handling content from various sources and file types.
6
6
 
@@ -62,14 +62,15 @@ Content Core uses a modular approach to process content from different sources.
62
62
  ```
63
63
  - **Performance**: Standard extraction maintains baseline performance; OCR only triggers selectively on formula-heavy pages
64
64
 
65
- ### 6. **Docling Processor**
65
+ ### 6. **Docling Processor (Optional)**
66
66
  - **Purpose**: Use Docling library for rich document parsing (PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, images).
67
+ - **Installation**: Requires `pip install content-core[docling]`
67
68
  - **Supported Input**: PDF, DOCX, XLSX, PPTX, Markdown, AsciiDoc, HTML, CSV, Images (PNG, JPEG, TIFF, BMP).
68
69
  - **Returned Data**: Content converted to configured format (markdown, html, json).
69
70
  - **Location**: `src/content_core/processors/docling.py`
70
71
  - **Default Document Engine (`auto`) Logic for Files/Documents**:
71
- - Tries the `'docling'` extraction method first (robust document parsing for supported types).
72
- - If `'docling'` fails or is not supported, automatically falls back to enhanced PyMuPDF extraction (fast, with quality flags and table detection).
72
+ - Tries the `'docling'` extraction method first (if installed with `content-core[docling]`).
73
+ - If `'docling'` is not installed or fails, automatically falls back to enhanced PyMuPDF extraction (fast, with quality flags and table detection).
73
74
  - Final fallback to basic simple extraction if needed.
74
75
  - You can explicitly specify `'docling'` or `'simple'` as the document engine, but `'auto'` is now the default and recommended for most users.
75
76
  - **Configuration**: Activate the Docling engine in `cc_config.yaml` or custom config:
@@ -1,6 +1,6 @@
1
1
  # Using the Content Core Library
2
2
 
3
- > **Note:** As of vNEXT, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
3
+ > **Note:** As of vNEXT, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available packages, with a smart fallback order for both URLs and files. For files/documents, `'auto'` tries Docling first (if installed with `pip install content-core[docling]`), then falls back to enhanced PyMuPDF extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
4
4
 
5
5
  This documentation explains how to configure and use the **Content Core** library in your projects. The library allows customization of AI model settings through a YAML file and environment variables.
6
6
 
@@ -12,8 +12,21 @@ To set the environment variable, add the following line to your `.env` file or s
12
12
 
13
13
  ```
14
14
  CCORE_MODEL_CONFIG_PATH=/path/to/your/models_config.yaml
15
+
16
+ # Optional: Override extraction engines
17
+ CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
18
+ CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
15
19
  ```
16
20
 
21
+ ### Engine Selection Environment Variables
22
+
23
+ Content Core supports environment variable overrides for extraction engines, useful for deployment scenarios:
24
+
25
+ - **`CCORE_DOCUMENT_ENGINE`**: Override document engine (`auto`, `simple`, `docling`)
26
+ - **`CCORE_URL_ENGINE`**: Override URL engine (`auto`, `simple`, `firecrawl`, `jina`)
27
+
28
+ These environment variables take precedence over configuration file settings. An engine passed explicitly for a single call (per-call override) still takes precedence over the environment variable.
29
+
17
30
  ## YAML File Schema
18
31
 
19
32
  The YAML configuration file defines the AI models that the library will use. The structure of the file is as follows:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "1.2.0"
3
+ version = "1.2.1"
4
4
  description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -6,6 +6,10 @@ from dotenv import load_dotenv
6
6
  # Load environment variables from .env file
7
7
  load_dotenv()
8
8
 
9
+ # Allowed engine values for validation
10
+ ALLOWED_DOCUMENT_ENGINES = {"auto", "simple", "docling"}
11
+ ALLOWED_URL_ENGINES = {"auto", "simple", "firecrawl", "jina"}
12
+
9
13
 
10
14
  def load_config():
11
15
  config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
@@ -33,6 +37,39 @@ def load_config():
33
37
 
34
38
  CONFIG = load_config()
35
39
 
40
+ # Environment variable engine selectors for MCP/Raycast users
41
+ def get_document_engine():
42
+ """Get document engine with environment variable override and validation."""
43
+ env_engine = os.environ.get("CCORE_DOCUMENT_ENGINE")
44
+ if env_engine:
45
+ if env_engine not in ALLOWED_DOCUMENT_ENGINES:
46
+ # Import logger here to avoid circular imports
47
+ from content_core.logging import logger
48
+ logger.warning(
49
+ f"Invalid CCORE_DOCUMENT_ENGINE: '{env_engine}'. "
50
+ f"Allowed values: {', '.join(sorted(ALLOWED_DOCUMENT_ENGINES))}. "
51
+ f"Using default from config."
52
+ )
53
+ return CONFIG.get("extraction", {}).get("document_engine", "auto")
54
+ return env_engine
55
+ return CONFIG.get("extraction", {}).get("document_engine", "auto")
56
+
57
+ def get_url_engine():
58
+ """Get URL engine with environment variable override and validation."""
59
+ env_engine = os.environ.get("CCORE_URL_ENGINE")
60
+ if env_engine:
61
+ if env_engine not in ALLOWED_URL_ENGINES:
62
+ # Import logger here to avoid circular imports
63
+ from content_core.logging import logger
64
+ logger.warning(
65
+ f"Invalid CCORE_URL_ENGINE: '{env_engine}'. "
66
+ f"Allowed values: {', '.join(sorted(ALLOWED_URL_ENGINES))}. "
67
+ f"Using default from config."
68
+ )
69
+ return CONFIG.get("extraction", {}).get("url_engine", "auto")
70
+ return env_engine
71
+ return CONFIG.get("extraction", {}).get("url_engine", "auto")
72
+
36
73
  # Programmatic config overrides: use in notebooks or scripts
37
74
  def set_document_engine(engine: str):
38
75
  """Override the document extraction engine ('auto', 'simple', or 'docling')."""
@@ -12,13 +12,19 @@ from content_core.common import (
12
12
  ProcessSourceState,
13
13
  UnsupportedTypeException,
14
14
  )
15
- from content_core.config import CONFIG # type: ignore
15
+ from content_core.config import get_document_engine
16
16
  from content_core.logging import logger
17
17
  from content_core.processors.audio import extract_audio_data # type: ignore
18
- from content_core.processors.docling import (
19
- DOCLING_SUPPORTED, # type: ignore
20
- extract_with_docling,
21
- )
18
+ try:
19
+ from content_core.processors.docling import (
20
+ DOCLING_SUPPORTED, # type: ignore
21
+ extract_with_docling,
22
+ DOCLING_AVAILABLE,
23
+ )
24
+ except ImportError:
25
+ DOCLING_AVAILABLE = False
26
+ DOCLING_SUPPORTED = set()
27
+ extract_with_docling = None
22
28
  from content_core.processors.office import (
23
29
  SUPPORTED_OFFICE_TYPES,
24
30
  extract_office_content,
@@ -126,26 +132,30 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
126
132
  Supports 'auto', 'docling', and 'simple'.
127
133
  'auto' tries docling first, then falls back to simple if docling fails.
128
134
  """
129
- engine = state.document_engine or CONFIG.get("extraction", {}).get("document_engine", "auto")
135
+ # Use environment-aware engine selection
136
+ engine = state.document_engine or get_document_engine()
137
+
130
138
  if engine == "auto":
131
139
  logger.debug("Using auto engine")
132
- # Try docling first; if it fails or is not supported, fallback to simple
133
- if state.identified_type in DOCLING_SUPPORTED:
134
- try:
135
- logger.debug("Trying docling extraction")
136
- return "extract_docling"
137
- except Exception as e:
138
- logger.warning(
139
- f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
140
- )
140
+ # Check if docling is available AND supports the file type
141
+ if DOCLING_AVAILABLE and state.identified_type in DOCLING_SUPPORTED:
142
+ logger.debug("Using docling extraction (auto mode)")
143
+ return "extract_docling"
141
144
  # Fallback to simple
142
- logger.debug("Falling back to simple extraction")
145
+ logger.debug("Falling back to simple extraction (docling unavailable or unsupported)")
143
146
  return await file_type_edge(state)
144
147
 
145
- if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
146
- logger.debug("Using docling engine")
147
- return "extract_docling"
148
- # For 'simple', use the default file type edge
148
+ if engine == "docling":
149
+ if not DOCLING_AVAILABLE:
150
+ raise ImportError("Docling engine requested but docling package not installed. Install with: pip install content-core[docling]")
151
+ if state.identified_type in DOCLING_SUPPORTED:
152
+ logger.debug("Using docling engine")
153
+ return "extract_docling"
154
+ # If docling doesn't support this file type, fall back to simple
155
+ logger.debug("Docling doesn't support this file type, using simple engine")
156
+ return await file_type_edge(state)
157
+
158
+ # For 'simple' or any other engine
149
159
  logger.debug("Using simple engine")
150
160
  return await file_type_edge(state)
151
161
 
@@ -168,7 +178,9 @@ workflow.add_node("extract_audio_data", extract_audio_data)
168
178
  workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
169
179
  workflow.add_node("delete_file", delete_file)
170
180
  workflow.add_node("download_remote_file", download_remote_file)
171
- workflow.add_node("extract_docling", extract_with_docling)
181
+ # Only add docling node if available
182
+ if DOCLING_AVAILABLE:
183
+ workflow.add_node("extract_docling", extract_with_docling)
172
184
 
173
185
  # Add edges
174
186
  workflow.add_edge(START, "source")
@@ -2,22 +2,29 @@
2
2
  Docling-based document extraction processor.
3
3
  """
4
4
 
5
+ from content_core.common.state import ProcessSourceState
6
+ from content_core.config import CONFIG
7
+
8
+ DOCLING_AVAILABLE = False
5
9
  try:
6
10
  from docling.document_converter import DocumentConverter
11
+ DOCLING_AVAILABLE = True
7
12
  except ImportError:
8
13
 
9
14
  class DocumentConverter:
10
15
  """Stub when docling is not installed."""
11
16
 
12
17
  def __init__(self):
13
- raise ImportError("Docling not installed")
18
+ raise ImportError(
19
+ "Docling not installed. Install with: pip install content-core[docling] "
20
+ "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
21
+ )
14
22
 
15
23
  def convert(self, source: str):
16
- raise ImportError("Docling not installed")
17
-
18
-
19
- from content_core.common.state import ProcessSourceState
20
- from content_core.config import CONFIG
24
+ raise ImportError(
25
+ "Docling not installed. Install with: pip install content-core[docling] "
26
+ "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
27
+ )
21
28
 
22
29
  # Supported MIME types for Docling extraction
23
30
  DOCLING_SUPPORTED = {
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
5
5
  from readability import Document
6
6
 
7
7
  from content_core.common import ProcessSourceState
8
- from content_core.config import CONFIG
8
+ from content_core.config import get_url_engine
9
9
  from content_core.logging import logger
10
10
  from content_core.processors.docling import DOCLING_SUPPORTED
11
11
  from content_core.processors.office import SUPPORTED_OFFICE_TYPES
@@ -165,7 +165,8 @@ async def extract_url(state: ProcessSourceState):
165
165
  """
166
166
  assert state.url, "No URL provided"
167
167
  url = state.url
168
- engine = state.url_engine or CONFIG.get("extraction", {}).get("url_engine", "auto")
168
+ # Use environment-aware engine selection
169
+ engine = state.url_engine or get_url_engine()
169
170
  try:
170
171
  if engine == "auto":
171
172
  if os.environ.get("FIRECRAWL_API_KEY"):
@@ -0,0 +1,109 @@
1
+ """Tests for configuration functions and environment variable handling."""
2
+ import pytest
3
+ from unittest.mock import patch, MagicMock
4
+ from content_core.config import (
5
+ get_document_engine,
6
+ get_url_engine,
7
+ ALLOWED_DOCUMENT_ENGINES,
8
+ ALLOWED_URL_ENGINES,
9
+ )
10
+
11
+
12
+ class TestDocumentEngineSelection:
13
+ """Test document engine selection with environment variables."""
14
+
15
+ def test_default_document_engine(self):
16
+ """Test default document engine when no env var is set."""
17
+ with patch.dict('os.environ', {}, clear=False):
18
+ # Remove the env var if it exists
19
+ if 'CCORE_DOCUMENT_ENGINE' in __import__('os').environ:
20
+ del __import__('os').environ['CCORE_DOCUMENT_ENGINE']
21
+ engine = get_document_engine()
22
+ assert engine == "auto" # Default from config
23
+
24
+ def test_valid_document_engine_env_var(self):
25
+ """Test valid document engine environment variable override."""
26
+ for engine in ALLOWED_DOCUMENT_ENGINES:
27
+ with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': engine}):
28
+ assert get_document_engine() == engine
29
+
30
+ def test_invalid_document_engine_env_var(self):
31
+ """Test invalid document engine environment variable falls back to default."""
32
+ with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': 'invalid_engine'}):
33
+ engine = get_document_engine()
34
+ assert engine == "auto" # Should fallback to default
35
+
36
+ def test_case_sensitive_document_engine_env_var(self):
37
+ """Test that document engine environment variable is case sensitive."""
38
+ with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': 'AUTO'}): # uppercase
39
+ engine = get_document_engine()
40
+ assert engine == "auto" # Should fallback to default
41
+
42
+
43
+ class TestUrlEngineSelection:
44
+ """Test URL engine selection with environment variables."""
45
+
46
+ def test_default_url_engine(self):
47
+ """Test default URL engine when no env var is set."""
48
+ with patch.dict('os.environ', {}, clear=False):
49
+ # Remove the env var if it exists
50
+ if 'CCORE_URL_ENGINE' in __import__('os').environ:
51
+ del __import__('os').environ['CCORE_URL_ENGINE']
52
+ engine = get_url_engine()
53
+ assert engine == "auto" # Default from config
54
+
55
+ def test_valid_url_engine_env_var(self):
56
+ """Test valid URL engine environment variable override."""
57
+ for engine in ALLOWED_URL_ENGINES:
58
+ with patch.dict('os.environ', {'CCORE_URL_ENGINE': engine}):
59
+ assert get_url_engine() == engine
60
+
61
+ def test_invalid_url_engine_env_var(self):
62
+ """Test invalid URL engine environment variable falls back to default."""
63
+ with patch.dict('os.environ', {'CCORE_URL_ENGINE': 'invalid_engine'}):
64
+ engine = get_url_engine()
65
+ assert engine == "auto" # Should fallback to default
66
+
67
+ def test_case_sensitive_url_engine_env_var(self):
68
+ """Test that URL engine environment variable is case sensitive."""
69
+ with patch.dict('os.environ', {'CCORE_URL_ENGINE': 'FIRECRAWL'}): # uppercase
70
+ engine = get_url_engine()
71
+ assert engine == "auto" # Should fallback to default
72
+
73
+
74
+ class TestEngineConstants:
75
+ """Test that engine constants contain expected values."""
76
+
77
+ def test_document_engine_constants(self):
78
+ """Test document engine allowed values."""
79
+ expected = {"auto", "simple", "docling"}
80
+ assert ALLOWED_DOCUMENT_ENGINES == expected
81
+
82
+ def test_url_engine_constants(self):
83
+ """Test URL engine allowed values."""
84
+ expected = {"auto", "simple", "firecrawl", "jina"}
85
+ assert ALLOWED_URL_ENGINES == expected
86
+
87
+
88
+ class TestEdgeCases:
89
+ """Test edge cases and error conditions."""
90
+
91
+ def test_empty_string_document_engine(self):
92
+ """Test empty string for document engine env var."""
93
+ with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': ''}):
94
+ # Empty string should be falsy and use default
95
+ engine = get_document_engine()
96
+ assert engine == "auto"
97
+
98
+ def test_empty_string_url_engine(self):
99
+ """Test empty string for URL engine env var."""
100
+ with patch.dict('os.environ', {'CCORE_URL_ENGINE': ''}):
101
+ # Empty string should be falsy and use default
102
+ engine = get_url_engine()
103
+ assert engine == "auto"
104
+
105
+ def test_whitespace_engine_values(self):
106
+ """Test whitespace in engine values are treated as invalid."""
107
+ with patch.dict('os.environ', {'CCORE_DOCUMENT_ENGINE': ' auto '}):
108
+ engine = get_document_engine()
109
+ assert engine == "auto" # Should fallback to default
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  import pytest
3
3
  from types import SimpleNamespace
4
- from content_core.processors.docling import extract_with_docling
4
+ from content_core.processors.docling import extract_with_docling, DOCLING_AVAILABLE
5
5
  from content_core.common.state import ProcessSourceState
6
6
 
7
7
  class DummyDoc:
@@ -31,6 +31,7 @@ def patch_converter(monkeypatch):
31
31
  )
32
32
 
33
33
  @pytest.mark.asyncio
34
+ @pytest.mark.skipif(not DOCLING_AVAILABLE, reason="Docling not installed")
34
35
  async def test_extract_file(tmp_path):
35
36
  # File input with explicit markdown format
36
37
  fp = tmp_path / "test.txt"
@@ -40,6 +41,7 @@ async def test_extract_file(tmp_path):
40
41
  assert new_state.content == "md:file:" + str(fp)
41
42
 
42
43
  @pytest.mark.asyncio
44
+ @pytest.mark.skipif(not DOCLING_AVAILABLE, reason="Docling not installed")
43
45
  async def test_extract_block_html():
44
46
  # Block input with HTML format
45
47
  state = ProcessSourceState(content="block content", metadata={"docling_format": "html"})
@@ -47,6 +49,7 @@ async def test_extract_block_html():
47
49
  assert new_state.content == "<p>blk:block content</p>"
48
50
 
49
51
  @pytest.mark.asyncio
52
+ @pytest.mark.skipif(not DOCLING_AVAILABLE, reason="Docling not installed")
50
53
  async def test_default_to_markdown():
51
54
  # Default format should fallback to markdown
52
55
  state = ProcessSourceState(content="plain text")
@@ -419,7 +419,7 @@ wheels = [
419
419
 
420
420
  [[package]]
421
421
  name = "content-core"
422
- version = "1.2.0"
422
+ version = "1.2.1"
423
423
  source = { editable = "." }
424
424
  dependencies = [
425
425
  { name = "ai-prompter" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes