content-core 1.2.0 → 1.2.1 (content_core-1.2.0-py3-none-any.whl → content_core-1.2.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

content_core/config.py CHANGED
@@ -6,6 +6,10 @@ from dotenv import load_dotenv
6
6
  # Load environment variables from .env file
7
7
  load_dotenv()
8
8
 
9
+ # Allowed engine values for validation
10
+ ALLOWED_DOCUMENT_ENGINES = {"auto", "simple", "docling"}
11
+ ALLOWED_URL_ENGINES = {"auto", "simple", "firecrawl", "jina"}
12
+
9
13
 
10
14
  def load_config():
11
15
  config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
@@ -33,6 +37,39 @@ def load_config():
33
37
 
34
38
  CONFIG = load_config()
35
39
 
40
+ # Environment variable engine selectors for MCP/Raycast users
41
+ def get_document_engine():
42
+ """Get document engine with environment variable override and validation."""
43
+ env_engine = os.environ.get("CCORE_DOCUMENT_ENGINE")
44
+ if env_engine:
45
+ if env_engine not in ALLOWED_DOCUMENT_ENGINES:
46
+ # Import logger here to avoid circular imports
47
+ from content_core.logging import logger
48
+ logger.warning(
49
+ f"Invalid CCORE_DOCUMENT_ENGINE: '{env_engine}'. "
50
+ f"Allowed values: {', '.join(sorted(ALLOWED_DOCUMENT_ENGINES))}. "
51
+ f"Using default from config."
52
+ )
53
+ return CONFIG.get("extraction", {}).get("document_engine", "auto")
54
+ return env_engine
55
+ return CONFIG.get("extraction", {}).get("document_engine", "auto")
56
+
57
+ def get_url_engine():
58
+ """Get URL engine with environment variable override and validation."""
59
+ env_engine = os.environ.get("CCORE_URL_ENGINE")
60
+ if env_engine:
61
+ if env_engine not in ALLOWED_URL_ENGINES:
62
+ # Import logger here to avoid circular imports
63
+ from content_core.logging import logger
64
+ logger.warning(
65
+ f"Invalid CCORE_URL_ENGINE: '{env_engine}'. "
66
+ f"Allowed values: {', '.join(sorted(ALLOWED_URL_ENGINES))}. "
67
+ f"Using default from config."
68
+ )
69
+ return CONFIG.get("extraction", {}).get("url_engine", "auto")
70
+ return env_engine
71
+ return CONFIG.get("extraction", {}).get("url_engine", "auto")
72
+
36
73
  # Programmatic config overrides: use in notebooks or scripts
37
74
  def set_document_engine(engine: str):
38
75
  """Override the document extraction engine ('auto', 'simple', or 'docling')."""
content_core/content/extraction/graph.py CHANGED
@@ -12,13 +12,19 @@ from content_core.common import (
12
12
  ProcessSourceState,
13
13
  UnsupportedTypeException,
14
14
  )
15
- from content_core.config import CONFIG # type: ignore
15
+ from content_core.config import get_document_engine
16
16
  from content_core.logging import logger
17
17
  from content_core.processors.audio import extract_audio_data # type: ignore
18
- from content_core.processors.docling import (
19
- DOCLING_SUPPORTED, # type: ignore
20
- extract_with_docling,
21
- )
18
+ try:
19
+ from content_core.processors.docling import (
20
+ DOCLING_SUPPORTED, # type: ignore
21
+ extract_with_docling,
22
+ DOCLING_AVAILABLE,
23
+ )
24
+ except ImportError:
25
+ DOCLING_AVAILABLE = False
26
+ DOCLING_SUPPORTED = set()
27
+ extract_with_docling = None
22
28
  from content_core.processors.office import (
23
29
  SUPPORTED_OFFICE_TYPES,
24
30
  extract_office_content,
@@ -126,26 +132,30 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
126
132
  Supports 'auto', 'docling', and 'simple'.
127
133
  'auto' tries docling first, then falls back to simple if docling fails.
128
134
  """
129
- engine = state.document_engine or CONFIG.get("extraction", {}).get("document_engine", "auto")
135
+ # Use environment-aware engine selection
136
+ engine = state.document_engine or get_document_engine()
137
+
130
138
  if engine == "auto":
131
139
  logger.debug("Using auto engine")
132
- # Try docling first; if it fails or is not supported, fallback to simple
133
- if state.identified_type in DOCLING_SUPPORTED:
134
- try:
135
- logger.debug("Trying docling extraction")
136
- return "extract_docling"
137
- except Exception as e:
138
- logger.warning(
139
- f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
140
- )
140
+ # Check if docling is available AND supports the file type
141
+ if DOCLING_AVAILABLE and state.identified_type in DOCLING_SUPPORTED:
142
+ logger.debug("Using docling extraction (auto mode)")
143
+ return "extract_docling"
141
144
  # Fallback to simple
142
- logger.debug("Falling back to simple extraction")
145
+ logger.debug("Falling back to simple extraction (docling unavailable or unsupported)")
143
146
  return await file_type_edge(state)
144
147
 
145
- if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
146
- logger.debug("Using docling engine")
147
- return "extract_docling"
148
- # For 'simple', use the default file type edge
148
+ if engine == "docling":
149
+ if not DOCLING_AVAILABLE:
150
+ raise ImportError("Docling engine requested but docling package not installed. Install with: pip install content-core[docling]")
151
+ if state.identified_type in DOCLING_SUPPORTED:
152
+ logger.debug("Using docling engine")
153
+ return "extract_docling"
154
+ # If docling doesn't support this file type, fall back to simple
155
+ logger.debug("Docling doesn't support this file type, using simple engine")
156
+ return await file_type_edge(state)
157
+
158
+ # For 'simple' or any other engine
149
159
  logger.debug("Using simple engine")
150
160
  return await file_type_edge(state)
151
161
 
@@ -168,7 +178,9 @@ workflow.add_node("extract_audio_data", extract_audio_data)
168
178
  workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
169
179
  workflow.add_node("delete_file", delete_file)
170
180
  workflow.add_node("download_remote_file", download_remote_file)
171
- workflow.add_node("extract_docling", extract_with_docling)
181
+ # Only add docling node if available
182
+ if DOCLING_AVAILABLE:
183
+ workflow.add_node("extract_docling", extract_with_docling)
172
184
 
173
185
  # Add edges
174
186
  workflow.add_edge(START, "source")
content_core/processors/docling.py CHANGED
@@ -2,22 +2,29 @@
2
2
  Docling-based document extraction processor.
3
3
  """
4
4
 
5
+ from content_core.common.state import ProcessSourceState
6
+ from content_core.config import CONFIG
7
+
8
+ DOCLING_AVAILABLE = False
5
9
  try:
6
10
  from docling.document_converter import DocumentConverter
11
+ DOCLING_AVAILABLE = True
7
12
  except ImportError:
8
13
 
9
14
  class DocumentConverter:
10
15
  """Stub when docling is not installed."""
11
16
 
12
17
  def __init__(self):
13
- raise ImportError("Docling not installed")
18
+ raise ImportError(
19
+ "Docling not installed. Install with: pip install content-core[docling] "
20
+ "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
21
+ )
14
22
 
15
23
  def convert(self, source: str):
16
- raise ImportError("Docling not installed")
17
-
18
-
19
- from content_core.common.state import ProcessSourceState
20
- from content_core.config import CONFIG
24
+ raise ImportError(
25
+ "Docling not installed. Install with: pip install content-core[docling] "
26
+ "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
27
+ )
21
28
 
22
29
  # Supported MIME types for Docling extraction
23
30
  DOCLING_SUPPORTED = {
content_core/processors/url.py CHANGED
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
5
5
  from readability import Document
6
6
 
7
7
  from content_core.common import ProcessSourceState
8
- from content_core.config import CONFIG
8
+ from content_core.config import get_url_engine
9
9
  from content_core.logging import logger
10
10
  from content_core.processors.docling import DOCLING_SUPPORTED
11
11
  from content_core.processors.office import SUPPORTED_OFFICE_TYPES
@@ -165,7 +165,8 @@ async def extract_url(state: ProcessSourceState):
165
165
  """
166
166
  assert state.url, "No URL provided"
167
167
  url = state.url
168
- engine = state.url_engine or CONFIG.get("extraction", {}).get("url_engine", "auto")
168
+ # Use environment-aware engine selection
169
+ engine = state.url_engine or get_url_engine()
169
170
  try:
170
171
  if engine == "auto":
171
172
  if os.environ.get("FIRECRAWL_API_KEY"):
content_core-1.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -112,11 +112,17 @@ summary = await cc.summarize_content(result, context="explain to a child")
112
112
  Install Content Core using `pip`:
113
113
 
114
114
  ```bash
115
- # Install the package
115
+ # Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
116
116
  pip install content-core
117
117
 
118
- # Install with MCP server support
118
+ # With enhanced document processing (adds Docling)
119
+ pip install content-core[docling]
120
+
121
+ # With MCP server support
119
122
  pip install content-core[mcp]
123
+
124
+ # Full installation
125
+ pip install content-core[docling,mcp]
120
126
  ```
121
127
 
122
128
  Alternatively, if you’re developing locally:
@@ -526,8 +532,21 @@ Example `.env`:
526
532
  ```plaintext
527
533
  OPENAI_API_KEY=your-key-here
528
534
  GOOGLE_API_KEY=your-key-here
535
+
536
+ # Engine Selection (optional)
537
+ CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
538
+ CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
529
539
  ```
530
540
 
541
+ ### Engine Selection via Environment Variables
542
+
543
+ For deployment scenarios like MCP servers or Raycast extensions, you can override the extraction engines using environment variables:
544
+
545
+ - **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
546
+ - **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
547
+
548
+ These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
549
+
531
550
  ### Custom Prompt Templates
532
551
 
533
552
  Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
content_core-1.2.1.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
1
1
  content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
2
2
  content_core/cc_config.yaml,sha256=hjTt5z1Z9b5LShVIqNT3OiAnTAdmr0LB5y8RTyH-fNA,1119
3
- content_core/config.py,sha256=OBwI58W4Twr00UiYD2mdw_rZDcuXxjBanE0IoA8ox-M,2601
3
+ content_core/config.py,sha256=3XAsMF3EhDJ6aCpzk1UZG_m3-SFdYe3cHiDPH7eVGwQ,4312
4
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
5
5
  content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
6
6
  content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
@@ -15,7 +15,7 @@ content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeF
15
15
  content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
16
16
  content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
17
17
  content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
18
- content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8YnijESGvnD0,7605
18
+ content_core/content/extraction/graph.py,sha256=sjk6NpzOMOzMbUOM0bqrDSlB3cLQzboviLDNbj48pjY,8074
19
19
  content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
20
20
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
21
21
  content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
@@ -24,19 +24,19 @@ content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,70
24
24
  content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
25
25
  content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
26
26
  content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
27
- content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
27
+ content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
28
28
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
29
29
  content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
30
30
  content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
31
- content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
31
+ content_core/processors/url.py,sha256=YoWw2CjZbqSKBi1CpY0Qowu4hfqGVGJjLZEXUjz7wxs,7536
32
32
  content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
33
33
  content_core/processors/youtube.py,sha256=MOeZboVfM9_C87L5mnUVvsbQeKoznwJoYn1wP1_hA_U,7869
34
34
  content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
35
35
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
36
36
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
37
37
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
38
- content_core-1.2.0.dist-info/METADATA,sha256=wAEQSfn6tTd4hQwAZY8sKeB5e7QpHm6qeTz2akFZwWw,18881
39
- content_core-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
40
- content_core-1.2.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
41
- content_core-1.2.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
42
- content_core-1.2.0.dist-info/RECORD,,
38
+ content_core-1.2.1.dist-info/METADATA,sha256=1LpANnMvECxIekt6kKQr0hnZ1ULGaD2xEmhRh_uzTdk,19676
39
+ content_core-1.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
40
+ content_core-1.2.1.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
41
+ content_core-1.2.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
42
+ content_core-1.2.1.dist-info/RECORD,,