content-core 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,325 @@
1
+ """
2
+ Retry decorators for handling transient failures in external operations.
3
+
4
+ This module provides pre-configured retry decorators using tenacity for different
5
+ operation types (YouTube, URL extraction, audio transcription, LLM calls, downloads).
6
+
7
+ Each decorator uses exponential backoff with jitter to prevent thundering herd problems.
8
+
9
+ Usage:
10
+ from content_core.common.retry import retry_youtube, retry_url_api
11
+
12
+ @retry_youtube()
13
+ async def get_video_title(video_id):
14
+ ...
15
+
16
+ @retry_url_api()
17
+ async def extract_url_jina(url):
18
+ ...
19
+ """
20
+
21
+ from typing import Callable, Optional
22
+
23
+ import aiohttp
24
+ from tenacity import (
25
+ RetryError,
26
+ retry,
27
+ retry_if_exception,
28
+ stop_after_attempt,
29
+ wait_random_exponential,
30
+ )
31
+
32
+ from content_core.common.exceptions import NoTranscriptFound, NotFoundError
33
+ from content_core.config import get_retry_config
34
+ from content_core.logging import logger
35
+
36
+
37
# Permanent failures — retrying these can never succeed.
NON_RETRYABLE_EXCEPTIONS = (
    NoTranscriptFound,
    NotFoundError,
    ValueError,
    TypeError,
    KeyError,
    AttributeError,
)

# Message fragments that suggest a generic exception is transient.
_TRANSIENT_MARKERS = (
    "timeout", "timed out", "connection", "network", "temporary",
    "unavailable", "rate limit", "too many requests", "503", "502", "500",
)


def is_retryable_exception(exception: BaseException) -> bool:
    """
    Decide whether *exception* warrants another attempt.

    Returns True for transient/network errors, False for permanent failures.
    """
    # Known-permanent failure types are never retried.
    if isinstance(exception, NON_RETRYABLE_EXCEPTIONS):
        return False

    # Network-layer errors are retryable, except HTTP client errors (4xx),
    # which are usually permanent — 429 (rate limit) being the exception.
    if isinstance(exception, (aiohttp.ClientError, ConnectionError, TimeoutError, OSError)):
        if not isinstance(exception, aiohttp.ClientResponseError):
            return True
        code = exception.status
        # Retry server errors (5xx) and rate limits (429).
        return code == 429 or code >= 500

    # Fall back to scanning the message for transient-looking hints.
    message = str(exception).lower()
    return any(marker in message for marker in _TRANSIENT_MARKERS)
74
+
75
+
76
def log_retry_attempt(retry_state) -> None:
    """
    Emit a warning describing a single retry attempt.

    Used as the before_sleep callback for the tenacity decorators below.

    Args:
        retry_state: Tenacity retry state containing attempt info and exception
    """
    name = retry_state.fn.__name__ if retry_state.fn else "unknown"
    attempt = retry_state.attempt_number
    error = retry_state.outcome.exception() if retry_state.outcome else None

    if error is None:
        logger.warning(f"Retry {attempt} for {name}: unknown error")
        return

    # Truncate long messages to keep log lines bounded.
    logger.warning(
        f"Retry {attempt} for {name}: {type(error).__name__}: {str(error)[:200]}"
    )
97
+
98
+
99
def log_retry_exhausted(retry_state) -> None:
    """
    Emit an error once every retry attempt has been used up.

    Args:
        retry_state: Tenacity retry state containing final attempt info
    """
    name = retry_state.fn.__name__ if retry_state.fn else "unknown"
    attempt = retry_state.attempt_number
    error = retry_state.outcome.exception() if retry_state.outcome else None

    if error is None:
        logger.error(f"All {attempt} retries exhausted for {name}")
        return

    # Allow a longer excerpt here than per-attempt logging: this is terminal.
    logger.error(
        f"All {attempt} retries exhausted for {name}: {type(error).__name__}: {str(error)[:500]}"
    )
118
+
119
+
120
def _build_retry_decorator(
    config_key: str,
    max_attempts: Optional[int],
    base_delay: Optional[float],
    max_delay: Optional[float],
) -> Callable:
    """
    Build a tenacity retry decorator for one operation type.

    Shared implementation for all the public retry_* factories below: reads
    defaults from config, applies caller overrides, and configures exponential
    backoff with jitter, retrying only exceptions deemed transient by
    ``is_retryable_exception``.

    Args:
        config_key: Section name passed to ``get_retry_config``.
        max_attempts: Override max retry attempts (None -> config value).
        base_delay: Override base delay in seconds (None -> config value).
        max_delay: Override max delay in seconds (None -> config value).

    Returns:
        Configured tenacity retry decorator (reraises the last exception).
    """
    config = get_retry_config(config_key)
    attempts = max_attempts if max_attempts is not None else config["max_attempts"]
    base = base_delay if base_delay is not None else config["base_delay"]
    max_wait = max_delay if max_delay is not None else config["max_delay"]

    return retry(
        stop=stop_after_attempt(attempts),
        wait=wait_random_exponential(multiplier=base, max=max_wait),
        retry=retry_if_exception(is_retryable_exception),
        before_sleep=log_retry_attempt,
        reraise=True,
    )


def retry_youtube(
    max_attempts: Optional[int] = None,
    base_delay: Optional[float] = None,
    max_delay: Optional[float] = None,
) -> Callable:
    """
    Retry decorator for YouTube operations.

    Uses longer delays due to YouTube's aggressive rate limiting.
    Does NOT retry permanent failures like NoTranscriptFound.

    Args:
        max_attempts: Override max retry attempts (default from config: 5)
        base_delay: Override base delay in seconds (default from config: 2)
        max_delay: Override max delay in seconds (default from config: 60)

    Returns:
        Configured tenacity retry decorator
    """
    return _build_retry_decorator("youtube", max_attempts, base_delay, max_delay)


def retry_url_api(
    max_attempts: Optional[int] = None,
    base_delay: Optional[float] = None,
    max_delay: Optional[float] = None,
) -> Callable:
    """
    Retry decorator for API-based URL extraction (Jina, Firecrawl).

    Retries on network errors and server errors (5xx, 429), but not client errors (4xx).

    Args:
        max_attempts: Override max retry attempts (default from config: 3)
        base_delay: Override base delay in seconds (default from config: 1)
        max_delay: Override max delay in seconds (default from config: 30)

    Returns:
        Configured tenacity retry decorator
    """
    return _build_retry_decorator("url_api", max_attempts, base_delay, max_delay)


def retry_url_network(
    max_attempts: Optional[int] = None,
    base_delay: Optional[float] = None,
    max_delay: Optional[float] = None,
) -> Callable:
    """
    Retry decorator for network-only URL operations (BeautifulSoup, HEAD requests).

    Uses shorter delays as these are typically network-only issues.

    Args:
        max_attempts: Override max retry attempts (default from config: 3)
        base_delay: Override base delay in seconds (default from config: 0.5)
        max_delay: Override max delay in seconds (default from config: 10)

    Returns:
        Configured tenacity retry decorator
    """
    return _build_retry_decorator("url_network", max_attempts, base_delay, max_delay)


def retry_audio_transcription(
    max_attempts: Optional[int] = None,
    base_delay: Optional[float] = None,
    max_delay: Optional[float] = None,
) -> Callable:
    """
    Retry decorator for audio transcription (speech-to-text API calls).

    Retries on transient errors, but not on permanent failures like invalid files.

    Args:
        max_attempts: Override max retry attempts (default from config: 3)
        base_delay: Override base delay in seconds (default from config: 2)
        max_delay: Override max delay in seconds (default from config: 30)

    Returns:
        Configured tenacity retry decorator
    """
    return _build_retry_decorator("audio", max_attempts, base_delay, max_delay)


def retry_llm(
    max_attempts: Optional[int] = None,
    base_delay: Optional[float] = None,
    max_delay: Optional[float] = None,
) -> Callable:
    """
    Retry decorator for LLM API calls (summary, cleanup).

    Retries on transient errors like rate limits and timeouts, but not on
    permanent failures like invalid API keys or malformed requests.

    Args:
        max_attempts: Override max retry attempts (default from config: 3)
        base_delay: Override base delay in seconds (default from config: 1)
        max_delay: Override max delay in seconds (default from config: 30)

    Returns:
        Configured tenacity retry decorator
    """
    return _build_retry_decorator("llm", max_attempts, base_delay, max_delay)


def retry_download(
    max_attempts: Optional[int] = None,
    base_delay: Optional[float] = None,
    max_delay: Optional[float] = None,
) -> Callable:
    """
    Retry decorator for remote file downloads.

    Retries on network errors and server errors (5xx, 429), but not client errors (4xx).

    Args:
        max_attempts: Override max retry attempts (default from config: 3)
        base_delay: Override base delay in seconds (default from config: 1)
        max_delay: Override max delay in seconds (default from config: 15)

    Returns:
        Configured tenacity retry decorator
    """
    return _build_retry_decorator("download", max_attempts, base_delay, max_delay)
312
+
313
+
314
# Public API of this module. RetryError (re-exported from tenacity) is listed
# so callers can catch "all retries exhausted" without importing tenacity.
__all__ = [
    "retry_youtube",
    "retry_url_api",
    "retry_url_network",
    "retry_audio_transcription",
    "retry_llm",
    "retry_download",
    "log_retry_attempt",
    "log_retry_exhausted",
    "RetryError",
]
@@ -0,0 +1,64 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from content_core.common.types import DocumentEngine, UrlEngine
6
+
7
+
8
class ProcessSourceState(BaseModel):
    """State carried through the content-extraction pipeline for one source.

    Holds the input location (``file_path`` or ``url``), identification
    results, per-request engine/model overrides, and the extracted content.
    """

    file_path: Optional[str] = ""
    url: Optional[str] = ""
    # NOTE(review): presumably triggers deletion of the source file after
    # processing — confirm against the extraction graph.
    delete_source: bool = False
    title: Optional[str] = ""
    source_type: Optional[str] = ""
    identified_type: Optional[str] = ""
    identified_provider: Optional[str] = ""
    # `dict` itself is the idiomatic default_factory; the lambda was redundant.
    metadata: Optional[dict] = Field(default_factory=dict)
    content: Optional[str] = ""
    document_engine: Optional[DocumentEngine] = Field(
        default=None,
        description="Override document extraction engine: 'auto', 'simple', or 'docling'",
    )
    url_engine: Optional[UrlEngine] = Field(
        default=None,
        description="Override URL extraction engine: 'auto', 'simple', 'firecrawl', 'jina', 'crawl4ai', or 'docling'",
    )
    output_format: Optional[str] = Field(
        default=None,
        description="Override Docling output format: 'markdown', 'html', or 'json'",
    )
    audio_provider: Optional[str] = Field(
        default=None,
        description="Override speech-to-text provider (e.g., 'openai', 'google')",
    )
    audio_model: Optional[str] = Field(
        default=None,
        description="Override speech-to-text model name (e.g., 'whisper-1', 'chirp')",
    )
    proxy: Optional[str] = Field(
        default=None,
        description="Override proxy URL for this request (e.g., 'http://proxy:8080')",
    )
43
+
44
class ProcessSourceInput(BaseModel):
    """Caller-facing input for source processing.

    Carries raw ``content`` or a ``file_path``/``url`` to extract, plus the
    same per-request override fields exposed on ``ProcessSourceState``.
    """

    content: Optional[str] = ""
    file_path: Optional[str] = ""
    url: Optional[str] = ""
    # Overrides mirror ProcessSourceState but are plain strings here.
    document_engine: Optional[str] = None
    url_engine: Optional[str] = None
    output_format: Optional[str] = None
    audio_provider: Optional[str] = None
    audio_model: Optional[str] = None
    proxy: Optional[str] = None
54
+
55
+
56
class ProcessSourceOutput(BaseModel):
    """Result of source processing: identification details plus extracted content."""

    title: Optional[str] = ""
    file_path: Optional[str] = ""
    url: Optional[str] = ""
    source_type: Optional[str] = ""
    identified_type: Optional[str] = ""
    identified_provider: Optional[str] = ""
    # `dict` itself is the idiomatic default_factory; the lambda was redundant.
    metadata: Optional[dict] = Field(default_factory=dict)
    content: Optional[str] = ""
@@ -0,0 +1,15 @@
1
+ from typing import Literal
2
+
3
# Engines available for document (file) extraction.
DocumentEngine = Literal[
    "auto",
    "simple",
    "docling",
]

# Engines available for URL extraction. "docling" is included to match the
# options advertised in ProcessSourceState.url_engine's field description.
UrlEngine = Literal[
    "auto",
    "simple",
    "firecrawl",
    "jina",
    "crawl4ai",
    "docling",
]
@@ -0,0 +1,31 @@
1
+ import os
2
+ import re
3
+ import validators
4
+
5
+ from .state import ProcessSourceInput
6
+
7
+
8
async def process_input_content(content: str) -> str:
    """
    Resolve *content* that may be a URL or a file path.

    URLs and existing file paths are run through the extraction pipeline;
    any other text is returned unchanged.
    """

    async def _extract(**kwargs) -> str:
        # NOTE(review): imported lazily, presumably to avoid an import cycle
        # at module load time — confirm.
        from content_core.extraction import extract_content

        result = await extract_content(ProcessSourceInput(**kwargs))
        return result.content if result.content else str(result)

    # URL input: extract the page content.
    if validators.url(content):
        return await _extract(url=content)

    # File-path-looking input (simplified check for demonstration).
    if re.match(r"^[a-zA-Z0-9_/\-\.]+\.[a-zA-Z0-9]+$", content):
        if not os.path.exists(content):
            raise ValueError(f"File not found: {content}")
        return await _extract(file_path=content)

    # Neither URL nor file path: hand the text back untouched.
    return content