content-core 1.1.2-py3-none-any.whl → 1.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/cc_config.yaml +4 -0
- content_core/config.py +57 -2
- content_core/content/extraction/graph.py +33 -21
- content_core/notebooks/urls.ipynb +154 -0
- content_core/processors/docling.py +13 -6
- content_core/processors/pdf.py +152 -28
- content_core/processors/url.py +3 -2
- {content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/METADATA +170 -22
- {content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/RECORD +12 -11
- {content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/WHEEL +0 -0
- {content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/entry_points.txt +0 -0
- {content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/licenses/LICENSE +0 -0
content_core/cc_config.yaml
CHANGED
@@ -34,6 +34,10 @@ extraction:
   url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
   docling:
     output_format: markdown # markdown | html | json
+  pymupdf:
+    enable_formula_ocr: false # Enable OCR for formula-heavy pages (requires Tesseract)
+    formula_threshold: 3 # Minimum formulas per page to trigger OCR
+    ocr_fallback: true # Gracefully fallback to standard extraction if OCR fails
 
   youtube_transcripts:
     preferred_languages: ["en", "es", "pt"]
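These four new `pymupdf` keys are consumed by the reworked PDF processor further down in this diff; a minimal sketch of the lookup pattern it uses, with the shipped defaults as fallbacks when the keys are absent:

```python
from content_core.config import CONFIG

# Same lookup the new pdf.py hunk performs; defaults mirror the YAML above.
ocr_config = CONFIG.get("extraction", {}).get("pymupdf", {})
enable_ocr = ocr_config.get("enable_formula_ocr", False)
formula_threshold = ocr_config.get("formula_threshold", 3)
ocr_fallback = ocr_config.get("ocr_fallback", True)
```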
content_core/config.py
CHANGED
@@ -6,6 +6,10 @@ from dotenv import load_dotenv
 # Load environment variables from .env file
 load_dotenv()
 
+# Allowed engine values for validation
+ALLOWED_DOCUMENT_ENGINES = {"auto", "simple", "docling"}
+ALLOWED_URL_ENGINES = {"auto", "simple", "firecrawl", "jina"}
+
 
 def load_config():
     config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")

@@ -14,8 +18,8 @@ def load_config():
         with open(config_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
-        print(f"
-        print("
+        print(f"Error loading configuration file from {config_path}: {e}")
+        print("Using internal default settings.")
 
     default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
     if default_config_data:

@@ -33,6 +37,39 @@ def load_config():
 
 CONFIG = load_config()
 
+# Environment variable engine selectors for MCP/Raycast users
+def get_document_engine():
+    """Get document engine with environment variable override and validation."""
+    env_engine = os.environ.get("CCORE_DOCUMENT_ENGINE")
+    if env_engine:
+        if env_engine not in ALLOWED_DOCUMENT_ENGINES:
+            # Import logger here to avoid circular imports
+            from content_core.logging import logger
+            logger.warning(
+                f"Invalid CCORE_DOCUMENT_ENGINE: '{env_engine}'. "
+                f"Allowed values: {', '.join(sorted(ALLOWED_DOCUMENT_ENGINES))}. "
+                f"Using default from config."
+            )
+            return CONFIG.get("extraction", {}).get("document_engine", "auto")
+        return env_engine
+    return CONFIG.get("extraction", {}).get("document_engine", "auto")
+
+def get_url_engine():
+    """Get URL engine with environment variable override and validation."""
+    env_engine = os.environ.get("CCORE_URL_ENGINE")
+    if env_engine:
+        if env_engine not in ALLOWED_URL_ENGINES:
+            # Import logger here to avoid circular imports
+            from content_core.logging import logger
+            logger.warning(
+                f"Invalid CCORE_URL_ENGINE: '{env_engine}'. "
+                f"Allowed values: {', '.join(sorted(ALLOWED_URL_ENGINES))}. "
+                f"Using default from config."
+            )
+            return CONFIG.get("extraction", {}).get("url_engine", "auto")
+        return env_engine
+    return CONFIG.get("extraction", {}).get("url_engine", "auto")
+
 # Programmatic config overrides: use in notebooks or scripts
 def set_document_engine(engine: str):
     """Override the document extraction engine ('auto', 'simple', or 'docling')."""

@@ -47,3 +84,21 @@ def set_docling_output_format(fmt: str):
     extraction = CONFIG.setdefault("extraction", {})
     docling_cfg = extraction.setdefault("docling", {})
     docling_cfg["output_format"] = fmt
+
+def set_pymupdf_ocr_enabled(enabled: bool):
+    """Enable or disable PyMuPDF OCR for formula-heavy pages."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["enable_formula_ocr"] = enabled
+
+def set_pymupdf_formula_threshold(threshold: int):
+    """Set the minimum number of formulas per page to trigger OCR."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["formula_threshold"] = threshold
+
+def set_pymupdf_ocr_fallback(enabled: bool):
+    """Enable or disable fallback to standard extraction when OCR fails."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["ocr_fallback"] = enabled
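A quick sketch of the override behavior from the caller's side (not from the package itself; the printed fallback assumes the shipped default of `auto`):

```python
import os
from content_core.config import get_document_engine, get_url_engine

os.environ["CCORE_DOCUMENT_ENGINE"] = "docling"
print(get_document_engine())  # valid values pass through unchanged: "docling"

os.environ["CCORE_URL_ENGINE"] = "scrapy"  # not in ALLOWED_URL_ENGINES
print(get_url_engine())  # logs a warning, returns the config default ("auto")
```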
content_core/content/extraction/graph.py
CHANGED

@@ -12,13 +12,19 @@ from content_core.common import (
     ProcessSourceState,
     UnsupportedTypeException,
 )
-from content_core.config import 
+from content_core.config import get_document_engine
 from content_core.logging import logger
 from content_core.processors.audio import extract_audio_data  # type: ignore
-
-
-
-
+try:
+    from content_core.processors.docling import (
+        DOCLING_SUPPORTED,  # type: ignore
+        extract_with_docling,
+        DOCLING_AVAILABLE,
+    )
+except ImportError:
+    DOCLING_AVAILABLE = False
+    DOCLING_SUPPORTED = set()
+    extract_with_docling = None
 from content_core.processors.office import (
     SUPPORTED_OFFICE_TYPES,
     extract_office_content,

@@ -126,26 +132,30 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
     Supports 'auto', 'docling', and 'simple'.
     'auto' tries docling first, then falls back to simple if docling fails.
     """
-
+    # Use environment-aware engine selection
+    engine = state.document_engine or get_document_engine()
+
     if engine == "auto":
         logger.debug("Using auto engine")
-        #
-        if state.identified_type in DOCLING_SUPPORTED:
-
-
-                return "extract_docling"
-        except Exception as e:
-            logger.warning(
-                f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
-            )
+        # Check if docling is available AND supports the file type
+        if DOCLING_AVAILABLE and state.identified_type in DOCLING_SUPPORTED:
+            logger.debug("Using docling extraction (auto mode)")
+            return "extract_docling"
         # Fallback to simple
-        logger.debug("Falling back to simple extraction")
+        logger.debug("Falling back to simple extraction (docling unavailable or unsupported)")
         return await file_type_edge(state)
 
-    if engine == "docling"
-
-
-
+    if engine == "docling":
+        if not DOCLING_AVAILABLE:
+            raise ImportError("Docling engine requested but docling package not installed. Install with: pip install content-core[docling]")
+        if state.identified_type in DOCLING_SUPPORTED:
+            logger.debug("Using docling engine")
+            return "extract_docling"
+        # If docling doesn't support this file type, fall back to simple
+        logger.debug("Docling doesn't support this file type, using simple engine")
+        return await file_type_edge(state)
+
+    # For 'simple' or any other engine
     logger.debug("Using simple engine")
     return await file_type_edge(state)

@@ -168,7 +178,9 @@ workflow.add_node("extract_audio_data", extract_audio_data)
 workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
 workflow.add_node("delete_file", delete_file)
 workflow.add_node("download_remote_file", download_remote_file)
-
+# Only add docling node if available
+if DOCLING_AVAILABLE:
+    workflow.add_node("extract_docling", extract_with_docling)
 
 # Add edges
 workflow.add_edge(START, "source")
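The practical effect of the gated node registration: requesting the docling engine without the optional dependency now fails fast with an install hint instead of a missing-node error. A sketch to run in an async context (the `file_path` key is a hypothetical illustration; the notebook below uses the same dict-based API with a `url` key):

```python
from content_core.content.extraction import extract_content

try:
    # file_path is assumed for illustration; engine mirrors the router above
    result = await extract_content(dict(file_path="paper.pdf", engine="docling"))
except ImportError as e:
    print(e)  # "Docling engine requested but docling package not installed. ..."
```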
content_core/notebooks/urls.ipynb
ADDED

@@ -0,0 +1,154 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "873a872b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from content_core.content.extraction import extract_content\n",
+    "\n",
+    "async def process_url(url):\n",
+    "    print(\"Processing: \", url)\n",
+    "    print(\"Simple: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"simple\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"Jina: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"jina\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"Firecrawl: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"firecrawl\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"=============================\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "263dc3af",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing:  https://www.supernovalabs.com.br/\n",
+      "Simple: -------\n",
+      "Readability failed: No content extracted by readability\n",
+      "Supernova Labs | AI Consulting\n",
+      "Supernova Labs | AI Consulting\n",
+      "Jina: -------\n",
+      "Supernova Labs | Elite AI Consulting to help you build the Future\n",
+      "URL Source: https://www.supernovalabs.com.br/\n",
+      "\n",
+      "Markdown Content:\n",
+      "Supernova Labs\n",
+      "\n",
+      "[About](https://www\n",
+      "Firecrawl: -------\n",
+      "Supernova Labs | AI Consulting\n",
+      "# Unleash Your AI Edge. Fast.\n",
+      "\n",
+      "We turn your data, tech and capabilities into impact with lean AI sol\n",
+      "=============================\n",
+      "None\n",
+      "Processing:  https://building.nubank.com/fine-tuning-transaction-user-models/\n",
+      "Simple: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "Fine-Tuning Transaction User Models Learn how we combine transaction embeddings with tabular data us\n",
+      "Jina: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "URL Source: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
+      "\n",
+      "Published Time: 2025-0\n",
+      "Firecrawl: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "# Fine-Tuning Transaction User Models\n",
+      "\n",
+      "Learn how we combine transaction embeddings with tabular data\n",
+      "=============================\n",
+      "None\n",
+      "Processing:  https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\n",
+      "Simple: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
+      "Most people think they need to quit their job to build a new life. I thought that too. You scroll th\n",
+      "Jina: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job\n",
+      "URL Source: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quittin\n",
+      "Firecrawl: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
+      "[Sitemap](https://medium.com/sitemap/sitemap.xml)\n",
+      "\n",
+      "[Open in app](https://rsci.app.link/?%24canonical\n",
+      "=============================\n",
+      "None\n",
+      "Processing:  https://github.com/mirkonasato/pyodconverter\n",
+      "Simple: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "This repository was archived by the owner on Dec 1, 2023. It is now read-only. mirkonasato/pyodconve\n",
+      "Jina: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "URL Source: https://github.com/mirkonasato/pyodconverter\n",
+      "\n",
+      "Markdown Content:\n",
+      "GitHub - mirkonasato/pyo\n",
+      "Firecrawl: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "[Skip to content](https://github.com/mirkonasato/pyodconverter#start-of-content)\n",
+      "\n",
+      "You signed in with\n",
+      "=============================\n",
+      "None\n",
+      "Processing:  https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\n",
+      "Simple: -------\n",
+      "Error processing URL https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR: HTTP error: 500\n",
+      "Error\n",
+      "Failed to extract content: HTTP error: 500\n",
+      "Jina: -------\n",
+      "Ultra-aprendizado: domine habilidades valiosas, seja mais esperto que a competição e dê um impulso n\n",
+      "URL Source: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-ca\n",
+      "Firecrawl: -------\n",
+      "Amazon.com.br\n",
+      "#### Digite os caracteres que você vê abaixo\n",
+      "\n",
+      "Desculpe pelo inconveniente. Para continuar realizando\n",
+      "=============================\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "urls= [\"https://www.supernovalabs.com.br/\", \"https://building.nubank.com/fine-tuning-transaction-user-models/\", \"https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\", \"https://github.com/mirkonasato/pyodconverter\", \"https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\"]\n",
+    "for url in urls:\n",
+    "    result = await process_url(url=url)\n",
+    "    print(result)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
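Side note on the recorded output: `process_url` only prints and returns `None`, which is why every block above ends with `None`. A return-based variant of the same comparison, assuming the same `extract_content` API the notebook uses:

```python
from content_core.content.extraction import extract_content

async def compare_engines(url, engines=("simple", "jina", "firecrawl")):
    """Return {engine: (title_preview, content_preview)} for a quick comparison."""
    results = {}
    for engine in engines:
        result = await extract_content(dict(url=url, engine=engine))
        results[engine] = (result.title[:100], result.content[:100])
    return results
```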
content_core/processors/docling.py
CHANGED

@@ -2,22 +2,29 @@
 Docling-based document extraction processor.
 """
 
+from content_core.common.state import ProcessSourceState
+from content_core.config import CONFIG
+
+DOCLING_AVAILABLE = False
 try:
     from docling.document_converter import DocumentConverter
+    DOCLING_AVAILABLE = True
 except ImportError:
 
     class DocumentConverter:
         """Stub when docling is not installed."""
 
         def __init__(self):
-            raise ImportError(
+            raise ImportError(
+                "Docling not installed. Install with: pip install content-core[docling] "
+                "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
+            )
 
         def convert(self, source: str):
-            raise ImportError(
-
-
-
-from content_core.config import CONFIG
+            raise ImportError(
+                "Docling not installed. Install with: pip install content-core[docling] "
+                "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
+            )
 
 # Supported MIME types for Docling extraction
 DOCLING_SUPPORTED = {
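A sketch of the stub's behavior when the `[docling]` extra is absent, grounded in the hunk above (the flag flips to `True` only when the real import succeeds):

```python
from content_core.processors.docling import DOCLING_AVAILABLE, DocumentConverter

print(DOCLING_AVAILABLE)  # False without the [docling] extra
converter = DocumentConverter()  # stub raises ImportError with the install hint
```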
content_core/processors/pdf.py
CHANGED
@@ -5,20 +5,90 @@ import unicodedata
 import fitz  # type: ignore
 
 from content_core.common import ProcessSourceState
+from content_core.config import CONFIG
 from content_core.logging import logger
 
-
-
-
-
-
-
-
-
-
-
-
-
+def count_formula_placeholders(text):
+    """
+    Count the number of formula placeholders in extracted text.
+
+    Args:
+        text (str): Extracted text content
+    Returns:
+        int: Number of formula placeholders found
+    """
+    if not text:
+        return 0
+    return text.count('<!-- formula-not-decoded -->')
+
+
+def extract_page_with_ocr(page, page_num):
+    """
+    Extract text from a page using OCR (Tesseract).
+
+    Args:
+        page: PyMuPDF page object
+        page_num (int): Page number for logging
+    Returns:
+        str: OCR-extracted text or None if OCR fails
+    """
+    try:
+        logger.debug(f"Attempting OCR extraction for page {page_num}")
+        # Create TextPage using OCR
+        textpage = page.get_textpage_ocr()
+        if textpage:
+            # Extract text from the OCR TextPage
+            ocr_text = textpage.extractText()
+            logger.debug(f"OCR successful for page {page_num}, extracted {len(ocr_text)} characters")
+            return ocr_text
+        else:
+            logger.warning(f"OCR TextPage creation failed for page {page_num}")
+            return None
+    except (ImportError, RuntimeError, OSError) as e:
+        # Common errors: Tesseract not installed, OCR failure, file access issues
+        logger.debug(f"OCR extraction failed for page {page_num}: {e}")
+        return None
+    except Exception as e:
+        # Unexpected errors - log as warning for debugging
+        logger.warning(f"Unexpected error during OCR extraction for page {page_num}: {e}")
+        return None
+
+
+def convert_table_to_markdown(table):
+    """
+    Convert a PyMuPDF table to markdown format.
+
+    Args:
+        table: Table data from PyMuPDF (list of lists)
+    Returns:
+        str: Markdown-formatted table
+    """
+    if not table or not table[0]:
+        return ""
+
+    # Build markdown table
+    markdown_lines = []
+
+    # Header row
+    header = table[0]
+    header_row = "| " + " | ".join(str(cell) if cell else "" for cell in header) + " |"
+    markdown_lines.append(header_row)
+
+    # Separator row
+    separator = "|" + "|".join([" --- " for _ in header]) + "|"
+    markdown_lines.append(separator)
+
+    # Data rows
+    for row in table[1:]:
+        if row:  # Skip empty rows
+            row_text = "| " + " | ".join(str(cell) if cell else "" for cell in row) + " |"
+            markdown_lines.append(row_text)
+
+    return "\n".join(markdown_lines) + "\n"
+
+# Configuration constants
+DEFAULT_FORMULA_THRESHOLD = 3
+DEFAULT_OCR_FALLBACK = True
 
 SUPPORTED_FITZ_TYPES = [
     "application/pdf",

@@ -116,30 +186,84 @@ def clean_pdf_text(text):
     return text.strip()
 
 
-async def _extract_text_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    try:
-        text = ""
-        logger.debug(f"Found {len(doc)} pages in PDF")
-        for page in doc:
-            text += page.get_text()
-        normalized_text = clean_pdf_text(text)
-        return normalized_text
-    finally:
-        doc.close()
 
 
 async def _extract_text_from_pdf(pdf_path):
-    """Extract text from PDF asynchronously"""
+    """Extract text from PDF asynchronously with table detection"""
 
     def _extract():
         doc = fitz.open(pdf_path)
         try:
-
+            full_text = []
             logger.debug(f"Found {len(doc)} pages in PDF")
-
-
-
+
+            # Use quality improvement flags for better text extraction
+            extraction_flags = (
+                fitz.TEXT_PRESERVE_LIGATURES |  # Better character rendering
+                fitz.TEXT_PRESERVE_WHITESPACE |  # Better spacing preservation
+                fitz.TEXT_PRESERVE_IMAGES  # Better image-text integration
+            )
+
+            # Get OCR configuration
+            ocr_config = CONFIG.get('extraction', {}).get('pymupdf', {})
+            enable_ocr = ocr_config.get('enable_formula_ocr', False)
+            formula_threshold = ocr_config.get('formula_threshold', DEFAULT_FORMULA_THRESHOLD)
+            ocr_fallback = ocr_config.get('ocr_fallback', DEFAULT_OCR_FALLBACK)
+
+            for page_num, page in enumerate(doc):
+                # Extract regular text with quality flags
+                standard_text = page.get_text(flags=extraction_flags)
+
+                # Check if we should try OCR for this page
+                formula_count = count_formula_placeholders(standard_text)
+                use_ocr = (enable_ocr and
+                           formula_count >= formula_threshold and
+                           formula_count > 0)
+
+                if use_ocr:
+                    logger.debug(f"Page {page_num + 1} has {formula_count} formulas, attempting OCR")
+                    ocr_text = extract_page_with_ocr(page, page_num + 1)
+
+                    if ocr_text and ocr_fallback:
+                        # Use OCR text but preserve table extraction from standard text
+                        page_text = ocr_text
+                        logger.debug(f"Using OCR text for page {page_num + 1}")
+                    else:
+                        # OCR failed, use standard text
+                        page_text = standard_text
+                        if not ocr_text:
+                            logger.debug(f"OCR failed for page {page_num + 1}, using standard extraction")
+                else:
+                    page_text = standard_text
+
+                # Try to find and extract tables (regardless of OCR)
+                try:
+                    tables = page.find_tables()
+                    if tables:
+                        logger.debug(f"Found {len(tables)} table(s) on page {page_num + 1}")
+
+                        # For each table found, convert to markdown and append
+                        for table_num, table in enumerate(tables):
+                            # Extract table data
+                            table_data = table.extract()
+                            # Validate table has actual content (not just empty rows/cells)
+                            if table_data and len(table_data) > 0 and any(
+                                any(str(cell).strip() for cell in row if cell) for row in table_data if row
+                            ):
+                                # Add a marker before the table
+                                page_text += f"\n\n[Table {table_num + 1} from page {page_num + 1}]\n"
+                                # Convert to markdown
+                                markdown_table = convert_table_to_markdown(table_data)
+                                page_text += markdown_table + "\n"
+                except Exception as e:
+                    # If table extraction fails, continue with regular text
+                    logger.debug(f"Table extraction failed on page {page_num + 1}: {e}")
+
+                full_text.append(page_text)
+
+            # Join all pages and clean
+            combined_text = "".join(full_text)
+            return clean_pdf_text(combined_text)
         finally:
             doc.close()
 
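For reference, here is what `convert_table_to_markdown` produces for a small input, worked out from the code above (example data only):

```python
from content_core.processors.pdf import convert_table_to_markdown

table = [["Name", "Score"], ["alpha", 1], ["beta", None]]
print(convert_table_to_markdown(table))
# | Name | Score |
# | --- | --- |
# | alpha | 1 |
# | beta |  |
```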
content_core/processors/url.py
CHANGED
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
 from readability import Document
 
 from content_core.common import ProcessSourceState
-from content_core.config import 
+from content_core.config import get_url_engine
 from content_core.logging import logger
 from content_core.processors.docling import DOCLING_SUPPORTED
 from content_core.processors.office import SUPPORTED_OFFICE_TYPES

@@ -165,7 +165,8 @@ async def extract_url(state: ProcessSourceState):
     """
     assert state.url, "No URL provided"
     url = state.url
-
+    # Use environment-aware engine selection
+    engine = state.url_engine or get_url_engine()
     try:
         if engine == "auto":
             if os.environ.get("FIRECRAWL_API_KEY"):
{content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 1.1
+Version: 1.2.1
 Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE

@@ -10,7 +10,6 @@ Requires-Dist: aiohttp>=3.11
 Requires-Dist: asciidoc>=10.2.1
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
-Requires-Dist: docling>=2.34.0
 Requires-Dist: esperanto>=1.2.0
 Requires-Dist: firecrawl-py>=2.7.0
 Requires-Dist: jinja2>=3.1.6

@@ -31,6 +30,8 @@ Requires-Dist: pytubefix>=9.1.1
 Requires-Dist: readability-lxml>=0.8.4.1
 Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
+Provides-Extra: docling
+Requires-Dist: docling>=2.34.0; extra == 'docling'
 Provides-Extra: mcp
 Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
 Description-Content-Type: text/markdown

@@ -39,29 +40,70 @@ Description-Content-Type: text/markdown
 
 [](https://opensource.org/licenses/MIT)
 
-**Content Core** is a
+**Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
 
-##
+## 🚀 What You Can Do
 
-
+**Extract content from anywhere:**
+- 📄 **Documents** - PDF, Word, PowerPoint, Excel, Markdown, HTML, EPUB
+- 🎥 **Media** - Videos (MP4, AVI, MOV) with automatic transcription
+- 🎵 **Audio** - MP3, WAV, M4A with speech-to-text conversion
+- 🌐 **Web** - Any URL with intelligent content extraction
+- 🖼️ **Images** - JPG, PNG, TIFF with OCR text recognition
+- 📦 **Archives** - ZIP, TAR, GZ with content analysis
 
-
+**Process with AI:**
+- ✨ **Clean & format** extracted content automatically
+- 📝 **Generate summaries** with customizable styles (bullet points, executive summary, etc.)
+- 🎯 **Context-aware processing** - explain to a child, technical summary, action items
+- 🔄 **Smart engine selection** - automatically chooses the best extraction method
 
-##
+## 🛠️ Multiple Ways to Use
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+### 🖥️ Command Line (Zero Install)
+```bash
+# Extract content from any source
+uvx --from "content-core" ccore https://example.com
+uvx --from "content-core" ccore document.pdf
+
+# Generate AI summaries
+uvx --from "content-core" csum video.mp4 --context "bullet points"
+```
+
+### 🤖 Claude Desktop Integration
+One-click setup with Model Context Protocol (MCP) - extract content directly in Claude conversations.
+
+### 🔍 Raycast Extension
+Smart auto-detection commands:
+- **Extract Content** - Full interface with format options
+- **Summarize Content** - 9 summary styles available
+- **Quick Extract** - Instant clipboard extraction
+
+### 🖱️ macOS Right-Click Integration
+Right-click any file in Finder → Services → Extract or Summarize content instantly.
+
+### 🐍 Python Library
+```python
+import content_core as cc
+
+# Extract from any source
+result = await cc.extract("https://example.com/article")
+summary = await cc.summarize_content(result, context="explain to a child")
+```
+
+## ⚡ Key Features
+
+* **🎯 Intelligent Auto-Detection:** Automatically selects the best extraction method based on content type and available services
+* **🔧 Smart Engine Selection:**
+  * **URLs:** Firecrawl → Jina → BeautifulSoup fallback chain
+  * **Documents:** Docling → Enhanced PyMuPDF → Simple extraction fallback
+  * **Media:** OpenAI Whisper transcription
+  * **Images:** OCR with multiple engine support
+* **📊 Enhanced PDF Processing:** Advanced PyMuPDF engine with quality flags, table detection, and optional OCR for mathematical formulas
+* **🌍 Multiple Integrations:** CLI, Python library, MCP server, Raycast extension, macOS Services
+* **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
+* **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
+* **🔄 Asynchronous:** Built with `asyncio` for efficient processing
 
 ## Getting Started
 

@@ -70,11 +112,17 @@ The primary goal of Content Core is to simplify the process of ingesting content
 Install Content Core using `pip`:
 
 ```bash
-#
+# Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
 pip install content-core
 
-#
+# With enhanced document processing (adds Docling)
+pip install content-core[docling]
+
+# With MCP server support
 pip install content-core[mcp]
+
+# Full installation
+pip install content-core[docling,mcp]
 ```
 
 Alternatively, if you’re developing locally:

@@ -245,6 +293,49 @@ Add to your `claude_desktop_config.json`:
 
 For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
 
+## Enhanced PDF Processing
+
+Content Core features an optimized PyMuPDF extraction engine with significant improvements for scientific documents and complex PDFs.
+
+### Key Improvements
+
+- **🔬 Mathematical Formula Extraction**: Enhanced quality flags eliminate `<!-- formula-not-decoded -->` placeholders
+- **📊 Automatic Table Detection**: Tables converted to markdown format for LLM consumption
+- **🔧 Quality Text Rendering**: Better ligature, whitespace, and image-text integration
+- **⚡ Optional OCR Enhancement**: Selective OCR for formula-heavy pages (requires Tesseract)
+
+### Configuration for Scientific Documents
+
+For documents with heavy mathematical content, enable OCR enhancement:
+
+```yaml
+# In cc_config.yaml
+extraction:
+  pymupdf:
+    enable_formula_ocr: true # Enable OCR for formula-heavy pages
+    formula_threshold: 3 # Min formulas per page to trigger OCR
+    ocr_fallback: true # Graceful fallback if OCR fails
+```
+
+```python
+# Runtime configuration
+from content_core.config import set_pymupdf_ocr_enabled
+set_pymupdf_ocr_enabled(True)
+```
+
+### Requirements for OCR Enhancement
+
+```bash
+# Install Tesseract OCR (optional, for formula enhancement)
+# macOS
+brew install tesseract
+
+# Ubuntu/Debian
+sudo apt-get install tesseract-ocr
+```
+
+**Note**: OCR is optional - you get improved PDF extraction automatically without any additional setup.
+
 ## macOS Services Integration
 
 Content Core provides powerful right-click integration with macOS Finder, allowing you to extract and summarize content from any file without installation. Choose between clipboard or TextEdit output for maximum flexibility.

@@ -288,6 +379,50 @@ Create **4 convenient services** for different workflows:
 
 For complete setup instructions with copy-paste scripts, see [macOS Services Documentation](docs/macos.md).
 
+## Raycast Extension
+
+Content Core provides a powerful Raycast extension with smart auto-detection that handles both URLs and file paths seamlessly. Extract and summarize content directly from your Raycast interface without switching applications.
+
+### Quick Setup
+
+**From Raycast Store** (coming soon):
+1. Open Raycast and search for "Content Core"
+2. Install the extension by `luis_novo`
+3. Configure API keys in preferences
+
+**Manual Installation**:
+1. Download the extension from the repository
+2. Open Raycast → "Import Extension"
+3. Select the `raycast-content-core` folder
+
+### Commands
+
+**🔍 Extract Content** - Smart URL/file detection with full interface
+- Auto-detects URLs vs file paths in real-time
+- Multiple output formats (Text, JSON, XML)
+- Drag & drop support for files
+- Rich results view with metadata
+
+**📝 Summarize Content** - AI-powered summaries with customizable styles
+- 9 different summary styles (bullet points, executive summary, etc.)
+- Auto-detects source type with visual feedback
+- One-click snippet creation and quicklinks
+
+**⚡ Quick Extract** - Instant extraction to clipboard
+- Type → Tab → Paste source → Enter
+- No UI, works directly from command bar
+- Perfect for quick workflows
+
+### Features
+
+- **Smart Auto-Detection**: Instantly recognizes URLs vs file paths
+- **Zero Installation**: Uses `uvx` for Content Core execution
+- **Rich Integration**: Keyboard shortcuts, clipboard actions, Raycast snippets
+- **All File Types**: Documents, videos, audio, images, archives
+- **Visual Feedback**: Real-time type detection with icons
+
+For detailed setup, configuration, and usage examples, see [Raycast Extension Documentation](docs/raycast.md).
+
 ## Using with Langchain
 
 For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.

@@ -397,8 +532,21 @@ Example `.env`:
 ```plaintext
 OPENAI_API_KEY=your-key-here
 GOOGLE_API_KEY=your-key-here
+
+# Engine Selection (optional)
+CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
+CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
 ```
 
+### Engine Selection via Environment Variables
+
+For deployment scenarios like MCP servers or Raycast extensions, you can override the extraction engines using environment variables:
+
+- **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
+- **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
+
+These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
+
 ### Custom Prompt Templates
 
 Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
{content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
-content_core/cc_config.yaml,sha256=
-content_core/config.py,sha256=
+content_core/cc_config.yaml,sha256=hjTt5z1Z9b5LShVIqNT3OiAnTAdmr0LB5y8RTyH-fNA,1119
+content_core/config.py,sha256=3XAsMF3EhDJ6aCpzk1UZG_m3-SFdYe3cHiDPH7eVGwQ,4312
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
 content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444

@@ -15,27 +15,28 @@ content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeF
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=
+content_core/content/extraction/graph.py,sha256=sjk6NpzOMOzMbUOM0bqrDSlB3cLQzboviLDNbj48pjY,8074
 content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
 content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
 content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
+content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
 content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
-content_core/processors/docling.py,sha256=
+content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
-content_core/processors/pdf.py,sha256=
+content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=
+content_core/processors/url.py,sha256=YoWw2CjZbqSKBi1CpY0Qowu4hfqGVGJjLZEXUjz7wxs,7536
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=MOeZboVfM9_C87L5mnUVvsbQeKoznwJoYn1wP1_hA_U,7869
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-1.1.
-content_core-1.1.
-content_core-1.1.
-content_core-1.1.
-content_core-1.1.
+content_core-1.2.1.dist-info/METADATA,sha256=1LpANnMvECxIekt6kKQr0hnZ1ULGaD2xEmhRh_uzTdk,19676
+content_core-1.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-1.2.1.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
+content_core-1.2.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-1.2.1.dist-info/RECORD,,

{content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/WHEEL
File without changes

{content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/entry_points.txt
File without changes

{content_core-1.1.2.dist-info → content_core-1.2.1.dist-info}/licenses/LICENSE
File without changes