content-core 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/config.py +37 -0
- content_core/content/extraction/graph.py +33 -21
- content_core/processors/docling.py +13 -6
- content_core/processors/url.py +3 -2
- {content_core-1.2.0.dist-info → content_core-1.2.1.dist-info}/METADATA +22 -3
- {content_core-1.2.0.dist-info → content_core-1.2.1.dist-info}/RECORD +9 -9
- {content_core-1.2.0.dist-info → content_core-1.2.1.dist-info}/WHEEL +0 -0
- {content_core-1.2.0.dist-info → content_core-1.2.1.dist-info}/entry_points.txt +0 -0
- {content_core-1.2.0.dist-info → content_core-1.2.1.dist-info}/licenses/LICENSE +0 -0
content_core/config.py
CHANGED
|
@@ -6,6 +6,10 @@ from dotenv import load_dotenv
|
|
|
6
6
|
# Load environment variables from .env file
|
|
7
7
|
load_dotenv()
|
|
8
8
|
|
|
9
|
+
# Allowed engine values for validation
|
|
10
|
+
ALLOWED_DOCUMENT_ENGINES = {"auto", "simple", "docling"}
|
|
11
|
+
ALLOWED_URL_ENGINES = {"auto", "simple", "firecrawl", "jina"}
|
|
12
|
+
|
|
9
13
|
|
|
10
14
|
def load_config():
|
|
11
15
|
config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
|
|
@@ -33,6 +37,39 @@ def load_config():
|
|
|
33
37
|
|
|
34
38
|
CONFIG = load_config()
|
|
35
39
|
|
|
40
|
+
# Environment variable engine selectors for MCP/Raycast users
|
|
41
|
+
def get_document_engine():
|
|
42
|
+
"""Get document engine with environment variable override and validation."""
|
|
43
|
+
env_engine = os.environ.get("CCORE_DOCUMENT_ENGINE")
|
|
44
|
+
if env_engine:
|
|
45
|
+
if env_engine not in ALLOWED_DOCUMENT_ENGINES:
|
|
46
|
+
# Import logger here to avoid circular imports
|
|
47
|
+
from content_core.logging import logger
|
|
48
|
+
logger.warning(
|
|
49
|
+
f"Invalid CCORE_DOCUMENT_ENGINE: '{env_engine}'. "
|
|
50
|
+
f"Allowed values: {', '.join(sorted(ALLOWED_DOCUMENT_ENGINES))}. "
|
|
51
|
+
f"Using default from config."
|
|
52
|
+
)
|
|
53
|
+
return CONFIG.get("extraction", {}).get("document_engine", "auto")
|
|
54
|
+
return env_engine
|
|
55
|
+
return CONFIG.get("extraction", {}).get("document_engine", "auto")
|
|
56
|
+
|
|
57
|
+
def get_url_engine():
|
|
58
|
+
"""Get URL engine with environment variable override and validation."""
|
|
59
|
+
env_engine = os.environ.get("CCORE_URL_ENGINE")
|
|
60
|
+
if env_engine:
|
|
61
|
+
if env_engine not in ALLOWED_URL_ENGINES:
|
|
62
|
+
# Import logger here to avoid circular imports
|
|
63
|
+
from content_core.logging import logger
|
|
64
|
+
logger.warning(
|
|
65
|
+
f"Invalid CCORE_URL_ENGINE: '{env_engine}'. "
|
|
66
|
+
f"Allowed values: {', '.join(sorted(ALLOWED_URL_ENGINES))}. "
|
|
67
|
+
f"Using default from config."
|
|
68
|
+
)
|
|
69
|
+
return CONFIG.get("extraction", {}).get("url_engine", "auto")
|
|
70
|
+
return env_engine
|
|
71
|
+
return CONFIG.get("extraction", {}).get("url_engine", "auto")
|
|
72
|
+
|
|
36
73
|
# Programmatic config overrides: use in notebooks or scripts
|
|
37
74
|
def set_document_engine(engine: str):
|
|
38
75
|
"""Override the document extraction engine ('auto', 'simple', or 'docling')."""
|
|
@@ -12,13 +12,19 @@ from content_core.common import (
|
|
|
12
12
|
ProcessSourceState,
|
|
13
13
|
UnsupportedTypeException,
|
|
14
14
|
)
|
|
15
|
-
from content_core.config import
|
|
15
|
+
from content_core.config import get_document_engine
|
|
16
16
|
from content_core.logging import logger
|
|
17
17
|
from content_core.processors.audio import extract_audio_data # type: ignore
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
try:
|
|
19
|
+
from content_core.processors.docling import (
|
|
20
|
+
DOCLING_SUPPORTED, # type: ignore
|
|
21
|
+
extract_with_docling,
|
|
22
|
+
DOCLING_AVAILABLE,
|
|
23
|
+
)
|
|
24
|
+
except ImportError:
|
|
25
|
+
DOCLING_AVAILABLE = False
|
|
26
|
+
DOCLING_SUPPORTED = set()
|
|
27
|
+
extract_with_docling = None
|
|
22
28
|
from content_core.processors.office import (
|
|
23
29
|
SUPPORTED_OFFICE_TYPES,
|
|
24
30
|
extract_office_content,
|
|
@@ -126,26 +132,30 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
|
|
|
126
132
|
Supports 'auto', 'docling', and 'simple'.
|
|
127
133
|
'auto' tries docling first, then falls back to simple if docling fails.
|
|
128
134
|
"""
|
|
129
|
-
|
|
135
|
+
# Use environment-aware engine selection
|
|
136
|
+
engine = state.document_engine or get_document_engine()
|
|
137
|
+
|
|
130
138
|
if engine == "auto":
|
|
131
139
|
logger.debug("Using auto engine")
|
|
132
|
-
#
|
|
133
|
-
if state.identified_type in DOCLING_SUPPORTED:
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
return "extract_docling"
|
|
137
|
-
except Exception as e:
|
|
138
|
-
logger.warning(
|
|
139
|
-
f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
|
|
140
|
-
)
|
|
140
|
+
# Check if docling is available AND supports the file type
|
|
141
|
+
if DOCLING_AVAILABLE and state.identified_type in DOCLING_SUPPORTED:
|
|
142
|
+
logger.debug("Using docling extraction (auto mode)")
|
|
143
|
+
return "extract_docling"
|
|
141
144
|
# Fallback to simple
|
|
142
|
-
logger.debug("Falling back to simple extraction")
|
|
145
|
+
logger.debug("Falling back to simple extraction (docling unavailable or unsupported)")
|
|
143
146
|
return await file_type_edge(state)
|
|
144
147
|
|
|
145
|
-
if engine == "docling"
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
148
|
+
if engine == "docling":
|
|
149
|
+
if not DOCLING_AVAILABLE:
|
|
150
|
+
raise ImportError("Docling engine requested but docling package not installed. Install with: pip install content-core[docling]")
|
|
151
|
+
if state.identified_type in DOCLING_SUPPORTED:
|
|
152
|
+
logger.debug("Using docling engine")
|
|
153
|
+
return "extract_docling"
|
|
154
|
+
# If docling doesn't support this file type, fall back to simple
|
|
155
|
+
logger.debug("Docling doesn't support this file type, using simple engine")
|
|
156
|
+
return await file_type_edge(state)
|
|
157
|
+
|
|
158
|
+
# For 'simple' or any other engine
|
|
149
159
|
logger.debug("Using simple engine")
|
|
150
160
|
return await file_type_edge(state)
|
|
151
161
|
|
|
@@ -168,7 +178,9 @@ workflow.add_node("extract_audio_data", extract_audio_data)
|
|
|
168
178
|
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
|
|
169
179
|
workflow.add_node("delete_file", delete_file)
|
|
170
180
|
workflow.add_node("download_remote_file", download_remote_file)
|
|
171
|
-
|
|
181
|
+
# Only add docling node if available
|
|
182
|
+
if DOCLING_AVAILABLE:
|
|
183
|
+
workflow.add_node("extract_docling", extract_with_docling)
|
|
172
184
|
|
|
173
185
|
# Add edges
|
|
174
186
|
workflow.add_edge(START, "source")
|
|
@@ -2,22 +2,29 @@
|
|
|
2
2
|
Docling-based document extraction processor.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
from content_core.common.state import ProcessSourceState
|
|
6
|
+
from content_core.config import CONFIG
|
|
7
|
+
|
|
8
|
+
DOCLING_AVAILABLE = False
|
|
5
9
|
try:
|
|
6
10
|
from docling.document_converter import DocumentConverter
|
|
11
|
+
DOCLING_AVAILABLE = True
|
|
7
12
|
except ImportError:
|
|
8
13
|
|
|
9
14
|
class DocumentConverter:
|
|
10
15
|
"""Stub when docling is not installed."""
|
|
11
16
|
|
|
12
17
|
def __init__(self):
|
|
13
|
-
raise ImportError(
|
|
18
|
+
raise ImportError(
|
|
19
|
+
"Docling not installed. Install with: pip install content-core[docling] "
|
|
20
|
+
"or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
|
|
21
|
+
)
|
|
14
22
|
|
|
15
23
|
def convert(self, source: str):
|
|
16
|
-
raise ImportError(
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
from content_core.config import CONFIG
|
|
24
|
+
raise ImportError(
|
|
25
|
+
"Docling not installed. Install with: pip install content-core[docling] "
|
|
26
|
+
"or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
|
|
27
|
+
)
|
|
21
28
|
|
|
22
29
|
# Supported MIME types for Docling extraction
|
|
23
30
|
DOCLING_SUPPORTED = {
|
content_core/processors/url.py
CHANGED
|
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
|
|
5
5
|
from readability import Document
|
|
6
6
|
|
|
7
7
|
from content_core.common import ProcessSourceState
|
|
8
|
-
from content_core.config import
|
|
8
|
+
from content_core.config import get_url_engine
|
|
9
9
|
from content_core.logging import logger
|
|
10
10
|
from content_core.processors.docling import DOCLING_SUPPORTED
|
|
11
11
|
from content_core.processors.office import SUPPORTED_OFFICE_TYPES
|
|
@@ -165,7 +165,8 @@ async def extract_url(state: ProcessSourceState):
|
|
|
165
165
|
"""
|
|
166
166
|
assert state.url, "No URL provided"
|
|
167
167
|
url = state.url
|
|
168
|
-
|
|
168
|
+
# Use environment-aware engine selection
|
|
169
|
+
engine = state.url_engine or get_url_engine()
|
|
169
170
|
try:
|
|
170
171
|
if engine == "auto":
|
|
171
172
|
if os.environ.get("FIRECRAWL_API_KEY"):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -112,11 +112,17 @@ summary = await cc.summarize_content(result, context="explain to a child")
|
|
|
112
112
|
Install Content Core using `pip`:
|
|
113
113
|
|
|
114
114
|
```bash
|
|
115
|
-
#
|
|
115
|
+
# Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
|
|
116
116
|
pip install content-core
|
|
117
117
|
|
|
118
|
-
#
|
|
118
|
+
# With enhanced document processing (adds Docling)
|
|
119
|
+
pip install content-core[docling]
|
|
120
|
+
|
|
121
|
+
# With MCP server support
|
|
119
122
|
pip install content-core[mcp]
|
|
123
|
+
|
|
124
|
+
# Full installation
|
|
125
|
+
pip install content-core[docling,mcp]
|
|
120
126
|
```
|
|
121
127
|
|
|
122
128
|
Alternatively, if you’re developing locally:
|
|
@@ -526,8 +532,21 @@ Example `.env`:
|
|
|
526
532
|
```plaintext
|
|
527
533
|
OPENAI_API_KEY=your-key-here
|
|
528
534
|
GOOGLE_API_KEY=your-key-here
|
|
535
|
+
|
|
536
|
+
# Engine Selection (optional)
|
|
537
|
+
CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
|
|
538
|
+
CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
|
|
529
539
|
```
|
|
530
540
|
|
|
541
|
+
### Engine Selection via Environment Variables
|
|
542
|
+
|
|
543
|
+
For deployment scenarios like MCP servers or Raycast extensions, you can override the extraction engines using environment variables:
|
|
544
|
+
|
|
545
|
+
- **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
|
|
546
|
+
- **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
|
|
547
|
+
|
|
548
|
+
These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
|
|
549
|
+
|
|
531
550
|
### Custom Prompt Templates
|
|
532
551
|
|
|
533
552
|
Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
|
|
2
2
|
content_core/cc_config.yaml,sha256=hjTt5z1Z9b5LShVIqNT3OiAnTAdmr0LB5y8RTyH-fNA,1119
|
|
3
|
-
content_core/config.py,sha256=
|
|
3
|
+
content_core/config.py,sha256=3XAsMF3EhDJ6aCpzk1UZG_m3-SFdYe3cHiDPH7eVGwQ,4312
|
|
4
4
|
content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
|
|
5
5
|
content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
|
|
6
6
|
content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
|
|
@@ -15,7 +15,7 @@ content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeF
|
|
|
15
15
|
content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
|
|
16
16
|
content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
|
|
17
17
|
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
18
|
-
content_core/content/extraction/graph.py,sha256=
|
|
18
|
+
content_core/content/extraction/graph.py,sha256=sjk6NpzOMOzMbUOM0bqrDSlB3cLQzboviLDNbj48pjY,8074
|
|
19
19
|
content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
|
|
20
20
|
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
21
21
|
content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
|
|
@@ -24,19 +24,19 @@ content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,70
|
|
|
24
24
|
content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
|
|
25
25
|
content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
|
|
26
26
|
content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
|
|
27
|
-
content_core/processors/docling.py,sha256=
|
|
27
|
+
content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
|
|
28
28
|
content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
|
|
29
29
|
content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
|
|
30
30
|
content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
|
|
31
|
-
content_core/processors/url.py,sha256=
|
|
31
|
+
content_core/processors/url.py,sha256=YoWw2CjZbqSKBi1CpY0Qowu4hfqGVGJjLZEXUjz7wxs,7536
|
|
32
32
|
content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
|
|
33
33
|
content_core/processors/youtube.py,sha256=MOeZboVfM9_C87L5mnUVvsbQeKoznwJoYn1wP1_hA_U,7869
|
|
34
34
|
content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
|
|
35
35
|
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
36
36
|
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
37
37
|
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
38
|
-
content_core-1.2.
|
|
39
|
-
content_core-1.2.
|
|
40
|
-
content_core-1.2.
|
|
41
|
-
content_core-1.2.
|
|
42
|
-
content_core-1.2.
|
|
38
|
+
content_core-1.2.1.dist-info/METADATA,sha256=1LpANnMvECxIekt6kKQr0hnZ1ULGaD2xEmhRh_uzTdk,19676
|
|
39
|
+
content_core-1.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
40
|
+
content_core-1.2.1.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
|
|
41
|
+
content_core-1.2.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
42
|
+
content_core-1.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|