content-core 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/__init__.py +216 -0
- content_core/cc_config.yaml +86 -0
- content_core/common/__init__.py +38 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/retry.py +325 -0
- content_core/common/state.py +64 -0
- content_core/common/types.py +15 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +575 -0
- content_core/content/__init__.py +6 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +252 -0
- content_core/content/identification/__init__.py +9 -0
- content_core/content/identification/file_detector.py +505 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/logging.py +15 -0
- content_core/mcp/__init__.py +5 -0
- content_core/mcp/server.py +214 -0
- content_core/models.py +60 -0
- content_core/models_config.yaml +31 -0
- content_core/notebooks/run.ipynb +359 -0
- content_core/notebooks/urls.ipynb +154 -0
- content_core/processors/audio.py +272 -0
- content_core/processors/docling.py +79 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +292 -0
- content_core/processors/text.py +36 -0
- content_core/processors/url.py +324 -0
- content_core/processors/video.py +166 -0
- content_core/processors/youtube.py +262 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +70 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-1.10.0.dist-info/METADATA +742 -0
- content_core-1.10.0.dist-info/RECORD +44 -0
- content_core-1.10.0.dist-info/WHEEL +4 -0
- content_core-1.10.0.dist-info/entry_points.txt +5 -0
- content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
|
|
6
|
+
import aiohttp
|
|
7
|
+
from langgraph.graph import END, START, StateGraph
|
|
8
|
+
|
|
9
|
+
from content_core.common import (
|
|
10
|
+
ProcessSourceInput,
|
|
11
|
+
ProcessSourceState,
|
|
12
|
+
UnsupportedTypeException,
|
|
13
|
+
)
|
|
14
|
+
from content_core.common.retry import retry_download
|
|
15
|
+
from content_core.config import get_document_engine, get_proxy
|
|
16
|
+
from content_core.logging import logger
|
|
17
|
+
from content_core.processors.audio import extract_audio_data # type: ignore
|
|
18
|
+
try:
|
|
19
|
+
from content_core.processors.docling import (
|
|
20
|
+
DOCLING_SUPPORTED, # type: ignore
|
|
21
|
+
extract_with_docling,
|
|
22
|
+
DOCLING_AVAILABLE,
|
|
23
|
+
)
|
|
24
|
+
except ImportError:
|
|
25
|
+
DOCLING_AVAILABLE = False
|
|
26
|
+
DOCLING_SUPPORTED = set()
|
|
27
|
+
extract_with_docling = None
|
|
28
|
+
from content_core.processors.office import (
|
|
29
|
+
SUPPORTED_OFFICE_TYPES,
|
|
30
|
+
extract_office_content,
|
|
31
|
+
)
|
|
32
|
+
from content_core.processors.pdf import SUPPORTED_FITZ_TYPES, extract_pdf
|
|
33
|
+
from content_core.processors.text import extract_txt
|
|
34
|
+
from content_core.processors.url import extract_url, url_provider
|
|
35
|
+
from content_core.processors.video import extract_best_audio_from_video
|
|
36
|
+
from content_core.processors.youtube import extract_youtube_transcript
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
    """
    Classify the request by which source field of the state is populated.

    The fields are checked in priority order -- ``content``, then
    ``file_path``, then ``url`` -- and the first truthy one decides the result.

    Args:
        state: Current processing state.

    Returns:
        Dict with a single ``"source_type"`` key set to ``"text"``,
        ``"file"`` or ``"url"``.

    Raises:
        ValueError: If none of the three fields is set.
    """
    # Inline content takes precedence over any file or URL that is also set.
    if state.content:
        return {"source_type": "text"}
    if state.file_path:
        return {"source_type": "file"}
    if state.url:
        return {"source_type": "url"}
    raise ValueError("No source provided.")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def file_type(state: ProcessSourceState) -> Dict[str, Any]:
    """
    Detect the type of the local file referenced by the state.

    Args:
        state: Current processing state; only ``file_path`` is read.

    Returns:
        An empty dict when ``file_path`` is ``None``. Otherwise a dict with
        ``"identified_type"`` (the value returned by ``get_file_type``) and
        ``"title"`` (the base name of the file).
    """
    # Imported inside the function, as in the original implementation.
    from content_core.content.identification import get_file_type

    path = state.file_path
    if path is None:
        return {}

    detected = await get_file_type(path)
    return {"identified_type": detected, "title": os.path.basename(path)}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
async def file_type_edge(data: ProcessSourceState) -> str:
    """
    Pick the extraction node for a file based on its identified type.

    Checks run in this order: exact ``text/plain``, membership in
    ``SUPPORTED_FITZ_TYPES``, membership in ``SUPPORTED_OFFICE_TYPES``, then
    the ``video`` and ``audio`` type prefixes.

    Args:
        data: Current processing state; ``identified_type`` must be set.

    Returns:
        Name of the graph node that should handle the file.

    Raises:
        AssertionError: If ``identified_type`` is empty or ``None``.
        UnsupportedTypeException: If no extractor matches the type.
    """
    assert data.identified_type, "Type not identified"
    identified_type = data.identified_type
    logger.debug(f"Identified type: {identified_type}")

    if identified_type == "text/plain":
        return "extract_txt"
    if identified_type in SUPPORTED_FITZ_TYPES:
        return "extract_pdf"
    if identified_type in SUPPORTED_OFFICE_TYPES:
        return "extract_office_content"

    # Media files are matched on the top-level type; video is tested first.
    prefix_routes = (
        ("video", "extract_best_audio_from_video"),
        ("audio", "extract_audio_data"),
    )
    for prefix, node_name in prefix_routes:
        if identified_type.startswith(prefix):
            return node_name

    raise UnsupportedTypeException(f"Unsupported file type: {data.identified_type}")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
async def delete_file(data: ProcessSourceState) -> Dict[str, Any]:
    """
    Remove the source file from disk when the state asks for it.

    Args:
        data: Current processing state; reads ``delete_source`` and
            ``file_path``.

    Returns:
        ``{"file_path": None}`` after a successful removal. An empty dict when
        deletion is not requested, when there is no path, or when the file
        was already gone.

    Raises:
        OSError: Any removal error other than ``FileNotFoundError`` propagates.
    """
    if not data.delete_source:
        logger.debug("Not deleting file")
        return {}

    logger.debug(f"Deleting file: {data.file_path}")
    file_path = data.file_path
    if file_path is None:
        return {}

    try:
        os.remove(file_path)
    except FileNotFoundError:
        # A missing file is only worth a warning; the state is left untouched.
        logger.warning(f"File not found while trying to delete: {file_path}")
        return {}
    return {"file_path": None}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
async def url_type_router(x: ProcessSourceState) -> Optional[str]:
    """
    Return the state's identified type for use as a routing key.

    Raises:
        AssertionError: If ``identified_type`` is empty or ``None``.
    """
    route_key = x.identified_type
    assert route_key, "Type not identified"
    return route_key
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
async def source_type_router(x: ProcessSourceState) -> Optional[str]:
    """
    Return the state's source type for use as a routing key.

    Raises:
        AssertionError: If ``source_type`` is empty or ``None``.
    """
    route_key = x.source_type
    assert route_key, "Source type not identified"
    return route_key
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@retry_download()
async def _fetch_remote_file(url: str, proxy: str | None = None) -> tuple[str, bytes]:
    """
    Download a remote file and report its declared media type.

    Wrapped with ``retry_download`` so that failures are retried according to
    the policy defined by that decorator.

    Args:
        url: Address of the resource to download.
        proxy: Optional proxy override; passed through ``get_proxy`` to obtain
            the proxy actually used for the request.

    Returns:
        ``(mime, content)`` where ``mime`` is the media type from the
        ``Content-Type`` header with parameters removed, surrounding
        whitespace stripped and lower-cased (``""`` when the header is
        absent), and ``content`` is the full response body.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error
            status (via ``raise_for_status``).
    """
    resolved_proxy = get_proxy(proxy)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=resolved_proxy) as resp:
            resp.raise_for_status()
            raw_content_type = resp.headers.get("content-type", "")
            # Media types are case-insensitive and may carry whitespace before
            # the ";" that starts the parameters; normalise so the value can
            # be compared against lowercase MIME type constants.
            mime = raw_content_type.split(";", 1)[0].strip().lower()
            content = await resp.read()
            return mime, content
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
    """
    Download the state's URL into a temporary file.

    The download itself is delegated to ``_fetch_remote_file``, which carries
    the retry logic. The temporary file keeps the extension found in the URL
    path, if any.

    Args:
        state: ProcessSourceState containing the URL to download and an
            optional proxy.

    Returns:
        Dict with ``file_path`` (path of the temporary file) and
        ``identified_type`` (media type reported by the server).

    Raises:
        AssertionError: If the state has no URL.
        Exception: Whatever ``_fetch_remote_file`` raises once it gives up,
            or any error raised while writing the temporary file.
    """
    url = state.url
    assert url, "No URL provided"
    logger.debug(f"Downloading remote file: {url}")

    mime, content = await _fetch_remote_file(url, state.proxy)

    # splitext("") yields an empty extension, so a URL without a path needs
    # no special case.
    suffix = os.path.splitext(urlparse(url).path)[1]
    fd, tmp = tempfile.mkstemp(suffix=suffix)
    try:
        # Write through the descriptor mkstemp already opened instead of
        # closing it and reopening the file by name.
        with os.fdopen(fd, "wb") as f:
            f.write(content)
    except BaseException:
        # Do not leave a partial temporary file behind when the write fails.
        try:
            os.remove(tmp)
        except OSError:
            pass  # Best-effort cleanup; the original error is re-raised below.
        raise

    return {"file_path": tmp, "identified_type": mime}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
async def file_type_router_docling(state: ProcessSourceState) -> str:
    """
    Choose between the Docling node and the simple per-type extractors.

    The engine comes from ``state.document_engine`` or, when that is unset,
    from ``get_document_engine()``:

    * ``"docling"`` -- requires Docling to be installed; uses it when the
      identified type is in ``DOCLING_SUPPORTED``, otherwise routes through
      ``file_type_edge``.
    * ``"auto"`` -- uses Docling only when it is installed and supports the
      identified type; otherwise routes through ``file_type_edge``.
    * anything else (including ``"simple"``) -- routes through
      ``file_type_edge``.

    The decision is made up front from availability and type support only.

    Returns:
        ``"extract_docling"`` or the node name chosen by ``file_type_edge``.

    Raises:
        ImportError: If the ``"docling"`` engine is requested but Docling is
            not installed.
    """
    # Use environment-aware engine selection
    engine = state.document_engine or get_document_engine()

    if engine == "docling":
        if not DOCLING_AVAILABLE:
            raise ImportError("Docling engine requested but docling package not installed. Install with: pip install content-core[docling]")
        if state.identified_type not in DOCLING_SUPPORTED:
            # If docling doesn't support this file type, fall back to simple
            logger.debug("Docling doesn't support this file type, using simple engine")
            return await file_type_edge(state)
        logger.debug("Using docling engine")
        return "extract_docling"

    if engine != "auto":
        # For 'simple' or any other engine
        logger.debug("Using simple engine")
        return await file_type_edge(state)

    logger.debug("Using auto engine")
    # Docling must be both installed and able to handle this type.
    docling_usable = DOCLING_AVAILABLE and state.identified_type in DOCLING_SUPPORTED
    if not docling_usable:
        logger.debug("Falling back to simple extraction (docling unavailable or unsupported)")
        return await file_type_edge(state)
    logger.debug("Using docling extraction (auto mode)")
    return "extract_docling"
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# Create workflow
workflow = StateGraph(
    ProcessSourceState,
    input=ProcessSourceInput,
    output=ProcessSourceState,
)

# Add nodes (registered in this exact order)
for _node_name, _node_fn in (
    ("source", source_identification),
    ("url_provider", url_provider),
    ("file_type", file_type),
    ("extract_txt", extract_txt),
    ("extract_pdf", extract_pdf),
    ("extract_url", extract_url),
    ("extract_office_content", extract_office_content),
    ("extract_best_audio_from_video", extract_best_audio_from_video),
    ("extract_audio_data", extract_audio_data),
    ("extract_youtube_transcript", extract_youtube_transcript),
    ("delete_file", delete_file),
    ("download_remote_file", download_remote_file),
):
    workflow.add_node(_node_name, _node_fn)

# Only add docling node if available
if DOCLING_AVAILABLE:
    workflow.add_node("extract_docling", extract_with_docling)

# Add edges
workflow.add_edge(START, "source")

# The source type decides whether a URL or a file is inspected next;
# inline text needs no extraction and ends the run immediately.
workflow.add_conditional_edges(
    "source",
    source_type_router,
    {"url": "url_provider", "file": "file_type", "text": END},
)

# The router's return value is used directly as the next node name.
workflow.add_conditional_edges("file_type", file_type_router_docling)

# Every downloadable document type goes through download_remote_file.
# Exclude HTML from file download, treat as web content.
_url_routes = {
    mime: "download_remote_file"
    for mime in (*SUPPORTED_FITZ_TYPES, *SUPPORTED_OFFICE_TYPES, *DOCLING_SUPPORTED)
    if mime != "text/html"
}
_url_routes.update(
    {
        "article": "extract_url",
        "text/html": "extract_url",  # Route HTML content to URL extraction
        "youtube": "extract_youtube_transcript",
    }
)
workflow.add_conditional_edges("url_provider", url_type_router, _url_routes)

# Fixed edges, added in the same order as before.
for _edge_start, _edge_end in (
    ("url_provider", END),
    ("file_type", END),
    ("extract_url", END),
    ("extract_txt", END),
    ("extract_youtube_transcript", END),
    ("extract_pdf", "delete_file"),
    ("extract_office_content", "delete_file"),
    ("extract_best_audio_from_video", "extract_audio_data"),
    ("extract_audio_data", "delete_file"),
    ("delete_file", END),
    ("download_remote_file", "file_type"),
):
    workflow.add_edge(_edge_start, _edge_end)

graph = workflow.compile()
|