content-core 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,252 @@
1
+ import os
2
+ import tempfile
3
+ from typing import Any, Dict, Optional
4
+ from urllib.parse import urlparse
5
+
6
+ import aiohttp
7
+ from langgraph.graph import END, START, StateGraph
8
+
9
+ from content_core.common import (
10
+ ProcessSourceInput,
11
+ ProcessSourceState,
12
+ UnsupportedTypeException,
13
+ )
14
+ from content_core.common.retry import retry_download
15
+ from content_core.config import get_document_engine, get_proxy
16
+ from content_core.logging import logger
17
+ from content_core.processors.audio import extract_audio_data # type: ignore
18
+ try:
19
+ from content_core.processors.docling import (
20
+ DOCLING_SUPPORTED, # type: ignore
21
+ extract_with_docling,
22
+ DOCLING_AVAILABLE,
23
+ )
24
+ except ImportError:
25
+ DOCLING_AVAILABLE = False
26
+ DOCLING_SUPPORTED = set()
27
+ extract_with_docling = None
28
+ from content_core.processors.office import (
29
+ SUPPORTED_OFFICE_TYPES,
30
+ extract_office_content,
31
+ )
32
+ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES, extract_pdf
33
+ from content_core.processors.text import extract_txt
34
+ from content_core.processors.url import extract_url, url_provider
35
+ from content_core.processors.video import extract_best_audio_from_video
36
+ from content_core.processors.youtube import extract_youtube_transcript
37
+
38
+
39
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
    """
    Classify the incoming request by which source field is populated.

    The fields are checked in priority order: ``content`` first, then
    ``file_path``, then ``url``. The first truthy one decides the result.

    Returns:
        A dict with a single ``source_type`` key whose value is ``"text"``,
        ``"file"`` or ``"url"``.

    Raises:
        ValueError: if none of the three fields is truthy.
    """
    if state.content:
        return {"source_type": "text"}
    if state.file_path:
        return {"source_type": "file"}
    if state.url:
        return {"source_type": "url"}
    raise ValueError("No source provided.")
53
+
54
+
55
async def file_type(state: ProcessSourceState) -> Dict[str, Any]:
    """
    Identify the type of the local file referenced by ``state.file_path``.

    Returns:
        An empty dict when ``state.file_path`` is None. Otherwise a dict with
        ``identified_type`` (the value returned by ``get_file_type``) and
        ``title`` (the base name of the file path).
    """
    # Function-scope import, kept local exactly as in the original code.
    from content_core.content.identification import get_file_type

    path = state.file_path
    if path is None:
        return {}

    detected = await get_file_type(path)
    return {
        "identified_type": detected,
        "title": os.path.basename(path),
    }
67
+
68
+
69
async def file_type_edge(data: ProcessSourceState) -> str:
    """
    Choose the extraction node for an already-identified file type.

    Checks run in this order: exact ``text/plain``, membership in
    ``SUPPORTED_FITZ_TYPES``, membership in ``SUPPORTED_OFFICE_TYPES``, then
    the ``video`` and ``audio`` prefixes.

    Returns:
        The name of the workflow node that should process the file.

    Raises:
        AssertionError: if ``data.identified_type`` is empty.
        UnsupportedTypeException: if no rule matches the identified type.
    """
    assert data.identified_type, "Type not identified"
    identified_type = data.identified_type
    logger.debug(f"Identified type: {identified_type}")

    if identified_type == "text/plain":
        return "extract_txt"
    if identified_type in SUPPORTED_FITZ_TYPES:
        return "extract_pdf"
    if identified_type in SUPPORTED_OFFICE_TYPES:
        return "extract_office_content"

    # Prefix rules, evaluated in order: video is tested before audio.
    prefix_routes = (
        ("video", "extract_best_audio_from_video"),
        ("audio", "extract_audio_data"),
    )
    for prefix, node_name in prefix_routes:
        if identified_type.startswith(prefix):
            return node_name

    raise UnsupportedTypeException(f"Unsupported file type: {data.identified_type}")
86
+
87
+
88
async def delete_file(data: ProcessSourceState) -> Dict[str, Any]:
    """
    Remove the source file from disk when ``data.delete_source`` is set.

    Returns:
        ``{"file_path": None}`` after a successful removal. An empty dict in
        every other case: deletion not requested, no file path, or the file
        was already missing.
    """
    if not data.delete_source:
        logger.debug("Not deleting file")
        return {}

    logger.debug(f"Deleting file: {data.file_path}")
    file_path = data.file_path
    if file_path is None:
        return {}

    try:
        os.remove(file_path)
    except FileNotFoundError:
        # A missing file is only logged; other OS errors still propagate.
        logger.warning(f"File not found while trying to delete: {file_path}")
        return {}
    return {"file_path": None}
101
+
102
+
103
async def url_type_router(x: ProcessSourceState) -> Optional[str]:
    """
    Return ``x.identified_type`` as the routing key for URL sources.

    Raises:
        AssertionError: if the identified type is empty.
    """
    routing_key = x.identified_type
    assert routing_key, "Type not identified"
    return routing_key
106
+
107
+
108
async def source_type_router(x: ProcessSourceState) -> Optional[str]:
    """
    Return ``x.source_type`` as the routing key for the first branch.

    Raises:
        AssertionError: if the source type is empty.
    """
    routing_key = x.source_type
    assert routing_key, "Source type not identified"
    return routing_key
111
+
112
+
113
@retry_download()
async def _fetch_remote_file(url: str, proxy: str | None = None) -> tuple[str, bytes]:
    """
    Download ``url`` and return its media type together with the raw body.

    The function is decorated with ``retry_download()``, so the retry policy
    is whatever that decorator implements.

    Args:
        url: Address of the remote resource.
        proxy: Optional proxy value. It is passed through ``get_proxy`` and
            the result is handed to aiohttp.

    Returns:
        ``(mime, content)``. ``mime`` is the Content-Type header value with
        any parameters (everything from the first ``;``) removed, surrounding
        whitespace stripped and letters lower-cased. It is an empty string
        when the header is absent. ``content`` is the full response body.

    Raises:
        aiohttp.ClientResponseError: raised by ``raise_for_status()`` for an
            HTTP error status.
    """
    resolved_proxy = get_proxy(proxy)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=resolved_proxy) as resp:
            resp.raise_for_status()
            content_type = resp.headers.get("content-type", "")
            # Media types are case-insensitive and optional whitespace may
            # precede the ";" that starts the parameters. Normalise so that
            # "Application/PDF ; charset=x" becomes "application/pdf".
            mime = content_type.split(";", 1)[0].strip().lower()
            content = await resp.read()
            return mime, content
123
+
124
+
125
async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
    """
    Download the resource at ``state.url`` into a temporary file.

    The fetch itself is delegated to ``_fetch_remote_file``. The temporary
    file keeps the extension found in the URL path, if any.

    Args:
        state: ProcessSourceState containing the URL to download (and an
            optional ``proxy`` value forwarded to the fetch helper).

    Returns:
        Dict with ``file_path`` (path of the temporary file) and
        ``identified_type`` (the media type reported by the fetch helper).

    Raises:
        AssertionError: if ``state.url`` is empty.
        OSError: if the temporary file cannot be written. The partially
            written file is removed before the error is re-raised.
    """
    url = state.url
    assert url, "No URL provided"
    logger.debug(f"Downloading remote file: {url}")

    mime, content = await _fetch_remote_file(url, state.proxy)

    # splitext("") already yields "", so an empty URL path needs no guard.
    suffix = os.path.splitext(urlparse(url).path)[1]
    fd, tmp = tempfile.mkstemp(suffix=suffix)
    try:
        # Write through the descriptor returned by mkstemp instead of
        # closing it and reopening the file by path.
        with os.fdopen(fd, "wb") as f:
            f.write(content)
    except OSError:
        # Do not leave a partial temp file behind when the write fails.
        try:
            os.remove(tmp)
        except OSError:
            logger.warning(f"Could not remove temporary file: {tmp}")
        raise

    return {"file_path": tmp, "identified_type": mime}
150
+
151
+
152
async def file_type_router_docling(state: ProcessSourceState) -> str:
    """
    Pick the extraction node, preferring Docling when the engine allows it.

    The engine is ``state.document_engine`` or, when that is falsy, the value
    returned by ``get_document_engine()``.

    * ``"auto"``: use Docling when it is available and the identified type
      is in ``DOCLING_SUPPORTED``; otherwise use the simple routing.
    * ``"docling"``: Docling must be available; types it does not support
      still go through the simple routing.
    * anything else: simple routing via ``file_type_edge``.

    Returns:
        ``"extract_docling"`` or the node name chosen by ``file_type_edge``.

    Raises:
        ImportError: if the ``"docling"`` engine is requested but Docling is
            not available.
    """
    engine = state.document_engine or get_document_engine()

    if engine == "auto":
        logger.debug("Using auto engine")
        if DOCLING_AVAILABLE and state.identified_type in DOCLING_SUPPORTED:
            logger.debug("Using docling extraction (auto mode)")
            return "extract_docling"
        fallback_note = (
            "Falling back to simple extraction (docling unavailable or unsupported)"
        )
    elif engine == "docling":
        if not DOCLING_AVAILABLE:
            raise ImportError(
                "Docling engine requested but docling package not installed. "
                "Install with: pip install content-core[docling]"
            )
        if state.identified_type in DOCLING_SUPPORTED:
            logger.debug("Using docling engine")
            return "extract_docling"
        fallback_note = "Docling doesn't support this file type, using simple engine"
    else:
        fallback_note = "Using simple engine"

    # Every non-Docling outcome ends in the plain type-based routing.
    logger.debug(fallback_note)
    return await file_type_edge(state)
184
+
185
+
186
def _build_workflow() -> StateGraph:
    """Assemble the extraction state graph: nodes first, then edges."""
    builder = StateGraph(
        ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
    )

    # Nodes, registered in a fixed order.
    node_table = (
        ("source", source_identification),
        ("url_provider", url_provider),
        ("file_type", file_type),
        ("extract_txt", extract_txt),
        ("extract_pdf", extract_pdf),
        ("extract_url", extract_url),
        ("extract_office_content", extract_office_content),
        ("extract_best_audio_from_video", extract_best_audio_from_video),
        ("extract_audio_data", extract_audio_data),
        ("extract_youtube_transcript", extract_youtube_transcript),
        ("delete_file", delete_file),
        ("download_remote_file", download_remote_file),
    )
    for node_name, node_fn in node_table:
        builder.add_node(node_name, node_fn)
    # The Docling node exists only when Docling is available.
    # NOTE(review): no outgoing edge is registered for "extract_docling"
    # in this block - confirm that is intended.
    if DOCLING_AVAILABLE:
        builder.add_node("extract_docling", extract_with_docling)

    # Entry point and first branch on the kind of source.
    builder.add_edge(START, "source")
    builder.add_conditional_edges(
        "source",
        source_type_router,
        {
            "url": "url_provider",
            "file": "file_type",
            "text": END,
        },
    )

    # Local files: the router returns the target node name directly.
    builder.add_conditional_edges("file_type", file_type_router_docling)

    # URLs: every known document type is downloaded first, except HTML,
    # which is treated as web content.
    url_routes = {}
    for type_group in (SUPPORTED_FITZ_TYPES, SUPPORTED_OFFICE_TYPES, DOCLING_SUPPORTED):
        for mime in type_group:
            if mime != "text/html":
                url_routes[mime] = "download_remote_file"
    url_routes["article"] = "extract_url"
    url_routes["text/html"] = "extract_url"  # Route HTML content to URL extraction
    url_routes["youtube"] = "extract_youtube_transcript"
    builder.add_conditional_edges("url_provider", url_type_router, url_routes)

    # Plain edges, added in a fixed order.
    edge_table = (
        ("url_provider", END),
        ("file_type", END),
        ("extract_url", END),
        ("extract_txt", END),
        ("extract_youtube_transcript", END),
        ("extract_pdf", "delete_file"),
        ("extract_office_content", "delete_file"),
        ("extract_best_audio_from_video", "extract_audio_data"),
        ("extract_audio_data", "delete_file"),
        ("delete_file", END),
        ("download_remote_file", "file_type"),
    )
    for edge_from, edge_to in edge_table:
        builder.add_edge(edge_from, edge_to)

    return builder


# Create workflow
workflow = _build_workflow()

graph = workflow.compile()
@@ -0,0 +1,9 @@
1
+ from .file_detector import FileDetector
2
+
3
+
4
async def get_file_type(file_path: str) -> str:
    """
    Detect the type of the file at ``file_path``.

    A new ``FileDetector`` is created for each call and its ``detect``
    coroutine is awaited; the result is returned unchanged.
    """
    return await FileDetector().detect(file_path)