content_core-1.10.0-py3-none-any.whl

This diff represents the contents of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
Files changed (44)
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
content_core/processors/url.py
@@ -0,0 +1,324 @@
+import os
+
+import aiohttp
+from bs4 import BeautifulSoup
+from readability import Document
+
+from content_core.common import ProcessSourceState
+from content_core.common.retry import retry_url_api, retry_url_network
+from content_core.config import get_proxy, get_url_engine
+from content_core.logging import logger
+from content_core.processors.docling import DOCLING_SUPPORTED
+from content_core.processors.office import SUPPORTED_OFFICE_TYPES
+from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
+
+
+@retry_url_network()
+async def _fetch_url_mime_type(url: str, proxy: str | None = None) -> str:
+    """Internal function to fetch URL MIME type - wrapped with retry logic."""
+    resolved_proxy = get_proxy(proxy)
+    async with aiohttp.ClientSession() as session:
+        async with session.head(
+            url, timeout=10, allow_redirects=True, proxy=resolved_proxy
+        ) as resp:
+            mime = resp.headers.get("content-type", "").split(";", 1)[0]
+            logger.debug(f"MIME type for {url}: {mime}")
+            return mime
+
+
+async def url_provider(state: ProcessSourceState):
+    """
+    Identify the provider with retry logic for network requests.
+    """
+    return_dict = {}
+    url = state.url
+    if url:
+        if "youtube.com" in url or "youtu.be" in url:
+            return_dict["identified_type"] = "youtube"
+        else:
+            # remote URL: check content-type to catch PDFs
+            try:
+                mime = await _fetch_url_mime_type(url, state.proxy)
+            except Exception as e:
+                logger.warning(f"HEAD check failed for {url} after retries: {e}")
+                mime = "article"
+            if (
+                mime in DOCLING_SUPPORTED
+                or mime in SUPPORTED_FITZ_TYPES
+                or mime in SUPPORTED_OFFICE_TYPES
+            ):
+                logger.debug(f"Identified type for {url}: {mime}")
+                return_dict["identified_type"] = mime
+            else:
+                logger.debug(f"Identified type for {url}: article")
+                return_dict["identified_type"] = "article"
+    return return_dict
+
+
+@retry_url_network()
+async def _fetch_url_html(url: str, proxy: str | None = None) -> str:
+    """Internal function to fetch URL HTML content - wrapped with retry logic."""
+    resolved_proxy = get_proxy(proxy)
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url, timeout=10, proxy=resolved_proxy) as response:
+            # Raise ClientResponseError so retry logic can inspect status code
+            # (5xx and 429 will be retried, 4xx will not)
+            response.raise_for_status()
+            return await response.text()
+
+
+async def extract_url_bs4(url: str, proxy: str | None = None) -> dict:
+    """
+    Get the title and content of a URL using readability with a fallback to BeautifulSoup.
+    Includes retry logic for network failures.
+
+    Args:
+        url (str): The URL of the webpage to extract content from.
+        proxy (str | None): Optional proxy URL to use for this request.
+
+    Returns:
+        dict: A dictionary containing the 'title' and 'content' of the webpage.
+    """
+    try:
+        # Fetch the webpage content with retry
+        html = await _fetch_url_html(url, proxy)
+
+        # Try extracting with readability
+        try:
+            doc = Document(html)
+            title = doc.title() or "No title found"
+            # Extract content as plain text by parsing the cleaned HTML
+            soup = BeautifulSoup(doc.summary(), "lxml")
+            content = soup.get_text(separator=" ", strip=True)
+            if not content.strip():
+                raise ValueError("No content extracted by readability")
+        except Exception as e:
+            logger.debug(f"Readability failed: {e}")
+            # Fallback to BeautifulSoup
+            soup = BeautifulSoup(html, "lxml")
+            # Extract title
+            title_tag = (
+                soup.find("title")
+                or soup.find("h1")
+                or soup.find("meta", property="og:title")
+            )
+            title = (
+                title_tag.get_text(strip=True) if title_tag else "No title found"
+            )
+            # Extract content from common content tags
+            content_tags = soup.select(
+                'article, .content, .post, main, [role="main"], div[class*="content"], div[class*="article"]'
+            )
+            content = (
+                " ".join(
+                    tag.get_text(separator=" ", strip=True) for tag in content_tags
+                )
+                if content_tags
+                else soup.get_text(separator=" ", strip=True)
+            )
+            content = content.strip() or "No content found"
+
+        return {
+            "title": title,
+            "content": content,
+        }
+
+    except Exception as e:
+        logger.error(f"Error processing URL {url} after retries: {e}")
+        return {
+            "title": "Error",
+            "content": f"Failed to extract content: {str(e)}",
+        }
+
+
+@retry_url_api()
+async def _fetch_url_jina(url: str, headers: dict, proxy: str | None = None) -> str:
+    """Internal function to fetch URL content via Jina - wrapped with retry logic."""
+    resolved_proxy = get_proxy(proxy)
+    async with aiohttp.ClientSession() as session:
+        async with session.get(
+            f"https://r.jina.ai/{url}", headers=headers, proxy=resolved_proxy
+        ) as response:
+            # Raise ClientResponseError so retry logic can inspect status code
+            # (5xx and 429 will be retried, 4xx will not)
+            response.raise_for_status()
+            return await response.text()
+
+
+async def extract_url_jina(url: str, proxy: str | None = None) -> dict:
+    """
+    Get the content of a URL using Jina. Uses Bearer token if JINA_API_KEY is set.
+    Includes retry logic for transient API failures.
+
+    Args:
+        url (str): The URL to extract content from.
+        proxy (str | None): Optional proxy URL to use for this request.
+    """
+    headers = {}
+    api_key = os.environ.get("JINA_API_KEY")
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    try:
+        text = await _fetch_url_jina(url, headers, proxy)
+        if text.startswith("Title:") and "\n" in text:
+            title_end = text.index("\n")
+            title = text[6:title_end].strip()
+            content = text[title_end + 1 :].strip()
+            logger.debug(
+                f"Processed url: {url}, found title: {title}, content: {content[:100]}..."
+            )
+            return {"title": title, "content": content}
+        else:
+            logger.debug(
+                f"Processed url: {url}, does not have Title prefix, returning full content: {text[:100]}..."
+            )
+            return {"content": text}
+    except Exception as e:
+        logger.error(f"Jina extraction failed for {url} after retries: {e}")
+        raise
+
+
+@retry_url_api()
+async def _fetch_url_firecrawl(url: str, proxy: str | None = None) -> dict:
+    """Internal function to fetch URL content via Firecrawl - wrapped with retry logic."""
+    from firecrawl import AsyncFirecrawlApp
+
+    # Note: firecrawl-py does not support client-side proxy configuration
+    # Proxy must be configured on the Firecrawl server side
+    resolved_proxy = get_proxy(proxy)
+    if resolved_proxy:
+        logger.warning(
+            "Proxy is configured but Firecrawl does not support client-side proxy. "
+            "Proxy will NOT be used for this request. Configure proxy on Firecrawl server instead."
+        )
+
+    app = AsyncFirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
+    scrape_result = await app.scrape(url, formats=["markdown", "html"])
+    return {
+        "title": scrape_result.metadata.title or "",
+        "content": scrape_result.markdown or "",
+    }
+
+
+async def extract_url_firecrawl(url: str, proxy: str | None = None) -> dict | None:
+    """
+    Get the content of a URL using Firecrawl.
+    Returns {"title": ..., "content": ...} or None on failure.
+    Includes retry logic for transient API failures.
+
+    Note: Firecrawl does not support client-side proxy configuration.
+    """
+    try:
+        return await _fetch_url_firecrawl(url, proxy)
+    except Exception as e:
+        logger.error(f"Firecrawl extraction failed for {url} after retries: {e}")
+        return None
+
+
+@retry_url_api()
+async def _fetch_url_crawl4ai(url: str, proxy: str | None = None) -> dict:
+    """Internal function to fetch URL content via Crawl4AI - wrapped with retry logic."""
+    try:
+        from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ProxyConfig
+    except ImportError:
+        raise ImportError(
+            "Crawl4AI is not installed. Install it with: pip install content-core[crawl4ai]"
+        )
+
+    resolved_proxy = get_proxy(proxy)
+
+    # Configure proxy if available
+    run_config = None
+    if resolved_proxy:
+        try:
+            run_config = CrawlerRunConfig(
+                proxy_config=ProxyConfig.from_string(resolved_proxy)
+            )
+            logger.debug(f"Crawl4AI using proxy: {resolved_proxy}")
+        except Exception as e:
+            logger.warning(f"Failed to configure proxy for Crawl4AI: {e}")
+
+    async with AsyncWebCrawler() as crawler:
+        if run_config:
+            result = await crawler.arun(url=url, config=run_config)
+        else:
+            result = await crawler.arun(url=url)
+
+        # Extract title from metadata if available
+        title = ""
+        if hasattr(result, "metadata") and result.metadata:
+            title = result.metadata.get("title", "")
+
+        # Get markdown content
+        content = result.markdown if hasattr(result, "markdown") else ""
+
+        return {
+            "title": title or "No title found",
+            "content": content,
+        }
+
+
+async def extract_url_crawl4ai(url: str, proxy: str | None = None) -> dict | None:
+    """
+    Get the content of a URL using Crawl4AI (local browser automation).
+    Returns {"title": ..., "content": ...} or None on failure.
+    Includes retry logic for transient failures.
+
+    Args:
+        url (str): The URL to extract content from.
+        proxy (str | None): Optional proxy URL to use for this request.
+    """
+    try:
+        return await _fetch_url_crawl4ai(url, proxy)
+    except Exception:
+        return None
+
+
+async def extract_url(state: ProcessSourceState):
+    """
+    Extract content from a URL using the url_engine specified in the state.
+    Supported engines: 'auto', 'simple', 'firecrawl', 'jina', 'crawl4ai'.
+
+    Proxy configuration is passed through state.proxy and resolved using get_proxy().
+    """
+    assert state.url, "No URL provided"
+    url = state.url
+    proxy = state.proxy
+    # Use environment-aware engine selection
+    engine = state.url_engine or get_url_engine()
+    try:
+        if engine == "auto":
+            if os.environ.get("FIRECRAWL_API_KEY"):
+                logger.debug(
+                    "Engine 'auto' selected: using Firecrawl (FIRECRAWL_API_KEY detected)"
+                )
+                return await extract_url_firecrawl(url, proxy)
+            else:
+                try:
+                    logger.debug("Trying to use Jina to extract URL")
+                    return await extract_url_jina(url, proxy)
+                except Exception as e:
+                    logger.error(f"Jina extraction error for URL: {url}: {e}")
+                    # Try Crawl4AI before falling back to BeautifulSoup
+                    logger.debug("Trying to use Crawl4AI to extract URL")
+                    result = await extract_url_crawl4ai(url, proxy)
+                    if result is not None:
+                        return result
+                    logger.debug(
+                        "Crawl4AI failed or not installed, falling back to BeautifulSoup"
+                    )
+                    return await extract_url_bs4(url, proxy)
+        elif engine == "simple":
+            return await extract_url_bs4(url, proxy)
+        elif engine == "firecrawl":
+            return await extract_url_firecrawl(url, proxy)
+        elif engine == "jina":
+            return await extract_url_jina(url, proxy)
+        elif engine == "crawl4ai":
+            return await extract_url_crawl4ai(url, proxy)
+        else:
+            raise ValueError(f"Unknown engine: {engine}")
+    except Exception as e:
+        logger.error(f"URL extraction failed for URL: {url}")
+        logger.exception(e)
+        return None
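
For orientation, here is a minimal usage sketch for the module above (not part of the package diff). It drives extract_url() with the 'simple' engine, which requires no API keys. It assumes ProcessSourceState can be constructed with url and url_engine keyword arguments; that matches the attributes read in extract_url() but the model definition itself is not shown in this hunk.

# Usage sketch, not part of the diff. Assumes ProcessSourceState accepts
# url/url_engine keyword arguments (the model lives in content_core/common/state.py).
import asyncio

from content_core.common import ProcessSourceState
from content_core.processors.url import extract_url

async def main():
    state = ProcessSourceState(url="https://example.com/post", url_engine="simple")
    result = await extract_url(state)  # {"title": ..., "content": ...} or None on failure
    if result:
        print(result["title"])

asyncio.run(main())

With engine 'auto', the same call would prefer Firecrawl when FIRECRAWL_API_KEY is set, then Jina, then Crawl4AI, then the BeautifulSoup fallback, as implemented in extract_url() above.
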
content_core/processors/video.py
@@ -0,0 +1,166 @@
+import asyncio
+import json
+import os
+import subprocess
+from functools import partial
+
+from content_core.common import ProcessSourceState
+from content_core.logging import logger
+
+
+async def extract_audio_from_video(input_file, output_file, stream_index):
+    """
+    Extract the specified audio stream to MP3 format asynchronously
+    """
+
+    def _extract(input_file, output_file, stream_index):
+        try:
+            cmd = [
+                "ffmpeg",
+                "-i",
+                input_file,
+                "-map",
+                f"0:a:{stream_index}",  # Select specific audio stream
+                "-codec:a",
+                "libmp3lame",  # Use MP3 codec
+                "-q:a",
+                "2",  # High quality setting
+                "-y",  # Overwrite output file if exists
+                output_file,
+            ]
+
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode != 0:
+                raise Exception(f"FFmpeg failed: {result.stderr}")
+
+            return True
+
+        except Exception as e:
+            logger.error(f"Error extracting audio: {str(e)}")
+            return False
+
+    return await asyncio.get_event_loop().run_in_executor(
+        None, partial(_extract, input_file, output_file, stream_index)
+    )
+
+
+async def get_audio_streams(input_file):
+    """
+    Analyze video file and return information about all audio streams asynchronously
+    """
+
+    def _analyze(input_file):
+        logger.debug(f"Analyzing video file {input_file} for audio streams")
+        try:
+            cmd = [
+                "ffprobe",
+                "-v",
+                "quiet",
+                "-print_format",
+                "json",
+                "-show_streams",
+                "-select_streams",
+                "a",
+                input_file,
+            ]
+
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode != 0:
+                raise Exception(f"FFprobe failed: {result.stderr}")
+
+            data = json.loads(result.stdout)
+            logger.debug(data)
+            return data.get("streams", [])
+        except Exception as e:
+            logger.error(f"Error analyzing file: {str(e)}")
+            return []
+
+    return await asyncio.get_event_loop().run_in_executor(
+        None, partial(_analyze, input_file)
+    )
+
+
+async def select_best_audio_stream(streams):
+    """
+    Select the best audio stream based on various quality metrics
+    """
+
+    def _select(streams):
+        if not streams:
+            logger.debug("No audio streams found")
+            return None
+        else:
+            logger.debug(f"Found {len(streams)} audio streams")
+
+        # Score each stream based on various factors
+        scored_streams = []
+        for stream in streams:
+            score = 0
+
+            # Prefer higher bit rates
+            bit_rate = stream.get("bit_rate")
+            if bit_rate:
+                score += int(int(bit_rate) / 1000000)  # Convert to Mbps and ensure int
+
+            # Prefer more channels (stereo over mono)
+            channels = stream.get("channels", 0)
+            score += channels * 10
+
+            # Prefer higher sample rates
+            sample_rate = stream.get("sample_rate", "0")
+            score += int(int(sample_rate) / 48000)
+
+            scored_streams.append((score, stream))
+
+        # Return the stream with highest score
+        return max(scored_streams, key=lambda x: x[0])[1]
+
+    return await asyncio.get_event_loop().run_in_executor(
+        None, partial(_select, streams)
+    )
+
+
+async def extract_best_audio_from_video(data: ProcessSourceState):
+    """
+    Main function to extract the best audio stream from a video file asynchronously
+    """
+    input_file = data.file_path
+    assert input_file is not None, "Input file path must be provided"
+
+    def _check_file(path):
+        return os.path.exists(path)
+
+    file_exists = await asyncio.get_event_loop().run_in_executor(
+        None, partial(_check_file, input_file)
+    )
+
+    if not file_exists:
+        logger.critical(f"Input file not found: {input_file}")
+        return False
+
+    base_name = os.path.splitext(input_file)[0]
+    output_file = f"{base_name}_audio.mp3"
+
+    # Get all audio streams
+    streams = await get_audio_streams(input_file)
+    if not streams:
+        logger.debug("No audio streams found in the file")
+        return False
+
+    # Select best stream
+    best_stream = await select_best_audio_stream(streams)
+    if not best_stream:
+        logger.error("Could not determine best audio stream")
+        return False
+
+    # Extract the selected stream
+    stream_index = streams.index(best_stream)
+    success = await extract_audio_from_video(input_file, output_file, stream_index)
+
+    if success:
+        logger.debug(f"Successfully extracted audio to: {output_file}")
+        logger.debug(f"- Channels: {best_stream.get('channels', 'unknown')}")
+        logger.debug(f"- Sample rate: {best_stream.get('sample_rate', 'unknown')} Hz")
+        logger.debug(f"- Bit rate: {best_stream.get('bit_rate', 'unknown')} bits/s")
+
+    return {"file_path": output_file, "identified_type": "audio/mp3"}