content-core 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
content_core/config.py ADDED
@@ -0,0 +1,575 @@
1
+ import os
2
+ import pkgutil
3
+ from typing import Any, Dict, cast
4
+
5
+ import yaml # type: ignore[import]
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables from .env file
9
+ load_dotenv()
10
+
11
# Engine names accepted by the CCORE_DOCUMENT_ENGINE / CCORE_URL_ENGINE overrides.
ALLOWED_DOCUMENT_ENGINES = {
    "auto",
    "simple",
    "docling",
}
ALLOWED_URL_ENGINES = {
    "auto",
    "simple",
    "firecrawl",
    "jina",
    "crawl4ai",
}

# Operation types that get_retry_config() recognises.
ALLOWED_RETRY_OPERATIONS = {"youtube", "url_api", "url_network", "audio", "llm", "download"}

# Inclusive bounds, in seconds, applied to timeout environment overrides.
MIN_TIMEOUT_SECONDS = 1
MAX_TIMEOUT_SECONDS = 3600

# Per-operation retry settings used when the config file does not provide them.
DEFAULT_RETRY_CONFIG = {
    "youtube": dict(max_attempts=5, base_delay=2, max_delay=60),
    "url_api": dict(max_attempts=3, base_delay=1, max_delay=30),
    "url_network": dict(max_attempts=3, base_delay=0.5, max_delay=10),
    "audio": dict(max_attempts=3, base_delay=2, max_delay=30),
    "llm": dict(max_attempts=3, base_delay=1, max_delay=30),
    "download": dict(max_attempts=3, base_delay=1, max_delay=15),
}
38
+
39
+
40
def _warn_invalid_timeout(var_name: str, value: str, reason: str):
    """
    Emit a warning that a timeout environment override was rejected.

    Args:
        var_name: Name of the environment variable that held the bad value.
        value: The raw string that was read from the environment.
        reason: Short sentence describing why the value was rejected.
    """
    # Imported lazily, like every other logger use in this module.
    from content_core.logging import logger

    message = (
        f"Invalid {var_name}: '{value}'. {reason} "
        f"(expected {MIN_TIMEOUT_SECONDS}-{MAX_TIMEOUT_SECONDS} seconds). "
        f"Using timeout from config."
    )
    logger.warning(message)
49
+
50
+
51
def _parse_timeout_env(var_name: str):
    """
    Read a timeout (in seconds) from the environment variable ``var_name``.

    Returns:
        int | None: The parsed timeout when the variable holds an integer within
        MIN_TIMEOUT_SECONDS..MAX_TIMEOUT_SECONDS (inclusive); None when the
        variable is unset, empty, not an integer, or out of range. The last two
        cases also log a warning.
    """
    raw = os.environ.get(var_name)
    if not raw:
        # Unset and empty are both treated as "no override" without a warning.
        return None

    try:
        seconds = int(raw)
    except ValueError:
        _warn_invalid_timeout(var_name, raw, "Must be an integer value")
        return None

    if MIN_TIMEOUT_SECONDS <= seconds <= MAX_TIMEOUT_SECONDS:
        return seconds

    _warn_invalid_timeout(
        var_name,
        raw,
        f"Must be between {MIN_TIMEOUT_SECONDS} and {MAX_TIMEOUT_SECONDS} seconds",
    )
    return None
77
+
78
+
79
def _timeout_section(parent: dict, key: str):
    """Return ``parent[key]`` as a dict, creating it when missing or null; None for any other type."""
    section = parent.get(key)
    if section is None:
        # Covers both an absent key and a YAML key written with no value.
        section = parent[key] = {}
    return section if isinstance(section, dict) else None


def apply_timeout_env_overrides(config: dict):
    """
    Apply environment variable overrides for Esperanto timeouts.

    Priority order (highest to lowest):
    1. YAML configuration defaults
    2. Environment variables (ESPERANTO_LLM_TIMEOUT / ESPERANTO_STT_TIMEOUT) used as fallback when YAML does not set a timeout

    The mapping is modified in place. ESPERANTO_LLM_TIMEOUT fills
    ``<alias>.config.timeout`` for the ``default_model``, ``cleanup_model`` and
    ``summary_model`` aliases; ESPERANTO_STT_TIMEOUT fills
    ``speech_to_text.timeout``. A timeout already present and not None is never
    overwritten.

    Sections that exist but are null are replaced by an empty mapping before the
    timeout is written. Sections holding some other non-mapping value are left
    untouched instead of raising, because this function runs at import time.

    Args:
        config: The loaded configuration. Anything that is not a dict is ignored.
    """
    if not isinstance(config, dict):
        return

    llm_timeout = _parse_timeout_env("ESPERANTO_LLM_TIMEOUT")
    if llm_timeout is not None:
        for alias in ("default_model", "cleanup_model", "summary_model"):
            alias_cfg = _timeout_section(config, alias)
            if alias_cfg is None:
                continue
            model_cfg = _timeout_section(alias_cfg, "config")
            if model_cfg is None:
                continue
            if model_cfg.get("timeout") is None:
                model_cfg["timeout"] = llm_timeout

    stt_timeout = _parse_timeout_env("ESPERANTO_STT_TIMEOUT")
    if stt_timeout is not None:
        stt_cfg = _timeout_section(config, "speech_to_text")
        if stt_cfg is not None and stt_cfg.get("timeout") is None:
            stt_cfg["timeout"] = stt_timeout
103
+
104
+
105
def load_config() -> Dict[str, Any]:
    """
    Build the configuration mapping used by the rest of the package.

    Resolution order:
    1. The YAML file named by CCORE_CONFIG_PATH (or, if that is unset,
       CCORE_MODEL_CONFIG_PATH) when the path exists and the file parses to a
       mapping. It is returned as-is, without merging the packaged defaults.
    2. The packaged ``models_config.yaml``, with the ``extraction`` and ``proxy``
       sections taken from the packaged ``cc_config.yaml``.

    Problems with the user file are reported with ``print`` and the packaged
    defaults are used instead.

    Returns:
        Dict[str, Any]: Always a dict (possibly empty), never None.
    """
    config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get(
        "CCORE_MODEL_CONFIG_PATH"
    )
    if config_path and os.path.exists(config_path):
        try:
            # Binary mode lets PyYAML detect the encoding (UTF-8 unless a BOM says
            # otherwise) instead of depending on the platform locale.
            with open(config_path, "rb") as file:
                user_config = yaml.safe_load(file)
        except Exception as e:
            # Deliberately broad: a broken user file must not prevent import.
            print(f"Error loading configuration file from {config_path}: {e}")
            print("Using internal default settings.")
        else:
            if isinstance(user_config, dict):
                return user_config
            # An empty file parses to None and a top-level list/scalar is not
            # usable either; callers rely on CONFIG.get(...).
            print(
                f"Error loading configuration file from {config_path}: "
                f"expected a mapping at the top level, got {type(user_config).__name__}"
            )
            print("Using internal default settings.")

    default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
    base = yaml.safe_load(default_config_data) if default_config_data else None
    if not isinstance(base, dict):
        base = {}

    # load new cc_config.yaml defaults
    cc_default = pkgutil.get_data("content_core", "cc_config.yaml")
    if cc_default:
        cc_cfg = yaml.safe_load(cc_default)
        if isinstance(cc_cfg, dict):
            # ``or {}`` also covers a section key that is present but null.
            base["extraction"] = cc_cfg.get("extraction") or {}
            base["proxy"] = cc_cfg.get("proxy") or {}
    return base
131
+
132
+
133
# Module-level configuration mapping, built once when this module is imported.
CONFIG: Dict[str, Any] = load_config()
# Mutates CONFIG in place: ESPERANTO_LLM_TIMEOUT / ESPERANTO_STT_TIMEOUT only fill
# timeouts that the loaded configuration leaves unset.
apply_timeout_env_overrides(CONFIG)
135
+
136
+
137
+ # Environment variable engine selectors for MCP/Raycast users
138
def get_document_engine():
    """
    Resolve the document extraction engine.

    CCORE_DOCUMENT_ENGINE wins when it is set to one of ALLOWED_DOCUMENT_ENGINES
    (the comparison is exact, so case matters). An unrecognised value is reported
    with a warning and ignored. Otherwise the result is
    ``extraction.document_engine`` from CONFIG, defaulting to "auto".
    """
    requested = os.environ.get("CCORE_DOCUMENT_ENGINE")
    if requested and requested in ALLOWED_DOCUMENT_ENGINES:
        return requested

    if requested:
        # Import logger here to avoid circular imports
        from content_core.logging import logger

        allowed = ", ".join(sorted(ALLOWED_DOCUMENT_ENGINES))
        logger.warning(
            f"Invalid CCORE_DOCUMENT_ENGINE: '{requested}'. "
            f"Allowed values: {allowed}. "
            f"Using default from config."
        )

    extraction_cfg = CONFIG.get("extraction", {})
    return extraction_cfg.get("document_engine", "auto")
154
+
155
+
156
def get_url_engine():
    """
    Resolve the URL extraction engine.

    CCORE_URL_ENGINE wins when it is set to one of ALLOWED_URL_ENGINES (the
    comparison is exact, so case matters). An unrecognised value is reported with
    a warning and ignored. Otherwise the result is ``extraction.url_engine`` from
    CONFIG, defaulting to "auto".
    """
    requested = os.environ.get("CCORE_URL_ENGINE")
    if requested and requested in ALLOWED_URL_ENGINES:
        return requested

    if requested:
        # Import logger here to avoid circular imports
        from content_core.logging import logger

        allowed = ", ".join(sorted(ALLOWED_URL_ENGINES))
        logger.warning(
            f"Invalid CCORE_URL_ENGINE: '{requested}'. "
            f"Allowed values: {allowed}. "
            f"Using default from config."
        )

    extraction_cfg = CONFIG.get("extraction", {})
    return extraction_cfg.get("url_engine", "auto")
172
+
173
+
174
def get_audio_concurrency():
    """
    Resolve how many audio transcriptions may run concurrently.

    Configuration priority (highest to lowest):
        1. CCORE_AUDIO_CONCURRENCY environment variable
        2. extraction.audio.concurrency in YAML config
        3. Default value: 3

    Validation of the environment variable:
        - It must parse as an integer between 1 and 10 (inclusive).
        - Anything else logs a warning and falls through to the config value.

    The config value itself is returned as stored; it is not range-checked here.

    Returns:
        The environment override when valid, otherwise the config value (3 when
        the config does not define one).

    Examples:
        >>> import os
        >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "5"
        >>> get_audio_concurrency()
        5

        >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "20"  # Too high
        >>> get_audio_concurrency()  # Falls back to config / default
        3
    """
    raw = os.environ.get("CCORE_AUDIO_CONCURRENCY")
    if raw:
        try:
            requested = int(raw)
        except ValueError:
            problem = "Must be a valid integer."
        else:
            if 1 <= requested <= 10:
                return requested
            problem = "Must be between 1 and 10."

        # Import logger here to avoid circular imports
        from content_core.logging import logger

        logger.warning(
            f"Invalid CCORE_AUDIO_CONCURRENCY: '{raw}'. "
            f"{problem} "
            f"Using default from config."
        )

    audio_cfg = CONFIG.get("extraction", {}).get("audio", {})
    return audio_cfg.get("concurrency", 3)
232
+
233
+
234
+ # Programmatic config overrides: use in notebooks or scripts
235
def set_document_engine(engine: str):
    """
    Store ``engine`` as ``extraction.document_engine`` in CONFIG.

    The value is written as given; it is not checked against
    ALLOWED_DOCUMENT_ENGINES ('auto', 'simple', 'docling').
    """
    extraction_cfg = CONFIG.setdefault("extraction", {})
    extraction_cfg["document_engine"] = engine
238
+
239
+
240
def set_url_engine(engine: str):
    """
    Store ``engine`` as ``extraction.url_engine`` in CONFIG.

    The value is written as given; it is not checked against ALLOWED_URL_ENGINES
    ('auto', 'simple', 'firecrawl', 'jina', 'crawl4ai').

    NOTE(review): earlier documentation also listed 'docling' as a URL engine,
    but it is not in ALLOWED_URL_ENGINES - confirm whether it is still supported.
    """
    extraction_cfg = CONFIG.setdefault("extraction", {})
    extraction_cfg["url_engine"] = engine
243
+
244
+
245
def set_docling_output_format(fmt: str):
    """
    Store ``fmt`` as ``extraction.docling.output_format`` in CONFIG.

    Intended values are 'markdown', 'html' or 'json'; the value is not validated
    here. Missing intermediate sections are created.
    """
    CONFIG.setdefault("extraction", {}).setdefault("docling", {})["output_format"] = fmt
250
+
251
+
252
def set_pymupdf_ocr_enabled(enabled: bool):
    """
    Store ``enabled`` as ``extraction.pymupdf.enable_formula_ocr`` in CONFIG.

    Toggles PyMuPDF OCR for formula-heavy pages. Missing intermediate sections
    are created.
    """
    CONFIG.setdefault("extraction", {}).setdefault("pymupdf", {})["enable_formula_ocr"] = enabled
257
+
258
+
259
def set_pymupdf_formula_threshold(threshold: int):
    """
    Store ``threshold`` as ``extraction.pymupdf.formula_threshold`` in CONFIG.

    The threshold is the minimum number of formulas per page that triggers OCR.
    The value is not validated here. Missing intermediate sections are created.
    """
    CONFIG.setdefault("extraction", {}).setdefault("pymupdf", {})["formula_threshold"] = threshold
264
+
265
+
266
def set_pymupdf_ocr_fallback(enabled: bool):
    """
    Store ``enabled`` as ``extraction.pymupdf.ocr_fallback`` in CONFIG.

    Toggles the fallback to standard extraction when OCR fails. Missing
    intermediate sections are created.
    """
    CONFIG.setdefault("extraction", {}).setdefault("pymupdf", {})["ocr_fallback"] = enabled
271
+
272
+
273
def set_audio_concurrency(concurrency: int):
    """
    Override the audio concurrency setting (1-10).

    The value is stored as ``extraction.audio.concurrency`` in CONFIG.

    Args:
        concurrency (int): Number of concurrent audio transcriptions (1-10).
            Booleans are rejected even though ``bool`` is a subclass of ``int``.

    Raises:
        ValueError: If concurrency is not an integer between 1 and 10
    """
    is_plain_int = isinstance(concurrency, int) and not isinstance(concurrency, bool)
    if not is_plain_int or not 1 <= concurrency <= 10:
        raise ValueError(
            f"Audio concurrency must be an integer between 1 and 10, got: {concurrency}"
        )
    extraction = CONFIG.setdefault("extraction", {})
    audio_cfg = extraction.setdefault("audio", {})
    audio_cfg["concurrency"] = concurrency
290
+
291
+
292
def _retry_env_override(var_name, current, cast_fn, lower, upper, kind):
    """Return the value of env var ``var_name`` if it parses and lies in [lower, upper], else ``current`` (with a warning)."""
    raw = os.environ.get(var_name)
    if not raw:
        return current

    try:
        candidate = cast_fn(raw)
    except ValueError:
        problem = f"Must be a valid {kind}."
    else:
        if lower <= candidate <= upper:
            return candidate
        problem = f"Must be between {lower} and {upper}."

    from content_core.logging import logger

    logger.warning(
        f"Invalid {var_name}: '{raw}'. "
        f"{problem} Using config value: {current}"
    )
    return current


def get_retry_config(operation_type: str) -> dict:
    """
    Get retry configuration for a specific operation type.

    Configuration priority (highest to lowest):
        1. Environment variables (CCORE_{TYPE}_MAX_RETRIES, CCORE_{TYPE}_BASE_DELAY, CCORE_{TYPE}_MAX_DELAY)
        2. YAML config (retry.{type}.{param})
        3. Hardcoded defaults

    Accepted environment ranges: MAX_RETRIES 1-20 (integer), BASE_DELAY 0.1-60,
    MAX_DELAY 1-300. A value that does not parse or is out of range is ignored
    with a warning. A YAML ``retry`` section, per-operation section, or
    individual value that is missing or null falls back to the hardcoded default.

    Args:
        operation_type: One of 'youtube', 'url_api', 'url_network', 'audio', 'llm', 'download'.
            Any other value logs a warning and is treated as 'url_network'.

    Returns:
        dict: Configuration with 'max_attempts' (int), 'base_delay' (float) and
        'max_delay' (float)

    Examples:
        >>> get_retry_config("youtube")
        {'max_attempts': 5, 'base_delay': 2.0, 'max_delay': 60.0}

        >>> import os
        >>> os.environ["CCORE_YOUTUBE_MAX_RETRIES"] = "10"
        >>> get_retry_config("youtube")
        {'max_attempts': 10, 'base_delay': 2.0, 'max_delay': 60.0}
    """
    if operation_type not in ALLOWED_RETRY_OPERATIONS:
        from content_core.logging import logger

        logger.warning(
            f"Unknown retry operation type: '{operation_type}'. "
            f"Allowed values: {', '.join(sorted(ALLOWED_RETRY_OPERATIONS))}. "
            f"Using default config for 'url_network'."
        )
        operation_type = "url_network"

    # Get defaults
    defaults = DEFAULT_RETRY_CONFIG.get(
        operation_type, DEFAULT_RETRY_CONFIG["url_network"]
    )

    # Get from YAML config (falls back to defaults). A section written as a bare
    # key parses to None, so every level is checked before use.
    retry_section = CONFIG.get("retry")
    yaml_config = (
        retry_section.get(operation_type) if isinstance(retry_section, dict) else None
    )
    if not isinstance(yaml_config, dict):
        yaml_config = {}

    def _setting(key):
        configured = yaml_config.get(key)
        return defaults[key] if configured is None else configured

    max_attempts = int(_setting("max_attempts"))
    base_delay: float = float(_setting("base_delay"))
    max_delay: float = float(_setting("max_delay"))

    # Environment variable overrides
    env_prefix = f"CCORE_{operation_type.upper()}"
    max_attempts = _retry_env_override(
        f"{env_prefix}_MAX_RETRIES", max_attempts, int, 1, 20, "integer"
    )
    base_delay = _retry_env_override(
        f"{env_prefix}_BASE_DELAY", base_delay, float, 0.1, 60, "number"
    )
    max_delay = _retry_env_override(
        f"{env_prefix}_MAX_DELAY", max_delay, float, 1, 300, "number"
    )

    return {
        "max_attempts": max_attempts,
        "base_delay": base_delay,
        "max_delay": max_delay,
    }
409
+
410
+
411
# Proxy configuration
# Priority: Per-request > Programmatic (_PROXY_OVERRIDE) > Env var > YAML config
# States as read by get_proxy(): None = no programmatic override (fall through to
# env var / YAML), "" = proxy explicitly disabled, any other string = proxy URL.
_PROXY_OVERRIDE: str | None = None
414
+
415
+
416
def _redact_proxy_url(proxy_url: str) -> str:
    """
    Redact credentials from a proxy URL for safe logging.

    Args:
        proxy_url: The proxy URL that may contain credentials

    Returns:
        - The URL with credentials replaced (e.g., http://***:***@host:port) when
          user info is present. Only scheme, host, port and path are kept;
          params, query and fragment are dropped.
        - The input unchanged when it carries no user info.
        - The placeholder "<proxy configured>" when the URL cannot be parsed, or
          when it contains "@" but no network location (for example a
          scheme-less "user:pass@host:8080"), because such a string may still
          contain credentials that cannot be located reliably.
    """
    from urllib.parse import urlparse, urlunparse

    placeholder = "<proxy configured>"
    try:
        parsed = urlparse(proxy_url)
        if parsed.username or parsed.password:
            host = parsed.hostname
            if host is None:
                return placeholder
            if ":" in host:
                # ``hostname`` strips the brackets of an IPv6 literal; restore
                # them so the port stays unambiguous.
                host = f"[{host}]"
            redacted_netloc = f"***:***@{host}"
            if parsed.port:
                redacted_netloc += f":{parsed.port}"
            return urlunparse(
                (parsed.scheme, redacted_netloc, parsed.path, "", "", "")
            )
        if not parsed.netloc and "@" in proxy_url:
            # Without "//" the user info ends up in the path and is not parsed
            # as credentials; never echo such a string.
            return placeholder
        return proxy_url
    except Exception:
        # This helper only feeds log messages, so it must never raise.
        return placeholder
442
+
443
+
444
+ def get_proxy(request_proxy: str | None = None) -> str | None:
445
+ """
446
+ Get the proxy URL with priority resolution.
447
+
448
+ Configuration priority (highest to lowest):
449
+ 1. Per-request proxy (passed as argument)
450
+ 2. Programmatic override via set_proxy()
451
+ 3. Environment variables (CCORE_HTTP_PROXY, HTTP_PROXY, HTTPS_PROXY)
452
+ 4. YAML config (proxy.url)
453
+
454
+ Args:
455
+ request_proxy: Optional per-request proxy override. Pass empty string "" to
456
+ explicitly disable proxy for this request.
457
+
458
+ Returns:
459
+ str | None: Proxy URL or None if no proxy configured
460
+
461
+ Examples:
462
+ >>> get_proxy() # Returns from env/config or None
463
+ 'http://proxy.example.com:8080'
464
+
465
+ >>> get_proxy("http://custom-proxy:8080") # Per-request override
466
+ 'http://custom-proxy:8080'
467
+
468
+ >>> get_proxy("") # Explicitly disable proxy for this request
469
+ None
470
+ """
471
+ # 1. Per-request override (highest priority)
472
+ if request_proxy is not None:
473
+ if request_proxy == "":
474
+ return None # Explicitly disabled
475
+ from content_core.logging import logger
476
+
477
+ logger.debug(f"Using per-request proxy: {_redact_proxy_url(request_proxy)}")
478
+ return request_proxy
479
+
480
+ # 2. Programmatic override
481
+ global _PROXY_OVERRIDE
482
+ if _PROXY_OVERRIDE is not None:
483
+ if _PROXY_OVERRIDE == "":
484
+ return None # Explicitly disabled
485
+ from content_core.logging import logger
486
+
487
+ logger.debug(f"Using programmatic proxy: {_redact_proxy_url(_PROXY_OVERRIDE)}")
488
+ return _PROXY_OVERRIDE
489
+
490
+ # 3. Environment variables
491
+ env_proxy = (
492
+ os.environ.get("CCORE_HTTP_PROXY")
493
+ or os.environ.get("HTTP_PROXY")
494
+ or os.environ.get("HTTPS_PROXY")
495
+ )
496
+ if env_proxy:
497
+ from content_core.logging import logger
498
+
499
+ logger.debug(f"Using environment proxy: {_redact_proxy_url(env_proxy)}")
500
+ return env_proxy
501
+
502
+ # 4. YAML config
503
+ yaml_proxy = CONFIG.get("proxy", {}).get("url")
504
+ if yaml_proxy:
505
+ from content_core.logging import logger
506
+
507
+ logger.debug(f"Using YAML config proxy: {_redact_proxy_url(yaml_proxy)}")
508
+ return yaml_proxy
509
+
510
+ return None
511
+
512
+
513
def set_proxy(proxy_url: str | None) -> None:
    """
    Set the proxy URL programmatically.

    This sets a global proxy override that takes precedence over environment
    variables and YAML config, but can still be overridden per-request.

    Args:
        proxy_url: One of
            - a proxy URL (e.g., 'http://proxy:8080' or 'http://user:pass@proxy:8080');
            - "" to disable proxying, regardless of environment variables and
              YAML config;
            - None to remove the override. get_proxy() treats a None override as
              "not set", so environment variables and YAML config apply again
              (same effect as clear_proxy()).

    The URL is logged at debug level with credentials redacted.

    Examples:
        >>> set_proxy("http://proxy.example.com:8080")  # Enable proxy
        >>> set_proxy("http://user:pass@proxy:8080")  # With authentication
        >>> set_proxy("")  # Disable proxy
        >>> set_proxy(None)  # Remove the override (env vars / YAML apply)
    """
    global _PROXY_OVERRIDE
    _PROXY_OVERRIDE = proxy_url
    from content_core.logging import logger

    if proxy_url:
        # Never log the raw URL: it may embed user:password.
        logger.debug(f"Proxy set programmatically: {_redact_proxy_url(proxy_url)}")
    elif proxy_url is None:
        logger.debug("Programmatic proxy override cleared")
    else:
        logger.debug("Proxy disabled programmatically")
538
+
539
+
540
def clear_proxy() -> None:
    """
    Drop the programmatic proxy override.

    Resets the module-level override to None, so later get_proxy() calls resolve
    the proxy from environment variables and then YAML config.

    Examples:
        >>> set_proxy("http://proxy:8080")
        >>> clear_proxy()  # Now uses env vars or YAML config
    """
    global _PROXY_OVERRIDE

    # Reset first, then import the logger, so the override is cleared even if
    # logging cannot be imported.
    _PROXY_OVERRIDE = None

    from content_core.logging import logger

    logger.debug("Programmatic proxy override cleared")
556
+
557
+
558
def get_no_proxy() -> list:
    """
    Return the hosts that should bypass the proxy.

    NO_PROXY (or, if that is unset or empty, no_proxy) is split on commas;
    entries are stripped and empty entries dropped. When neither variable has a
    value, ``proxy.no_proxy`` from CONFIG is returned as stored, defaulting to
    ['localhost', '127.0.0.1'].

    Returns:
        list: List of hostnames/patterns to bypass proxy

    Examples:
        >>> get_no_proxy()
        ['localhost', '127.0.0.1']
    """
    raw_hosts = os.environ.get("NO_PROXY") or os.environ.get("no_proxy")
    if not raw_hosts:
        # Fall back to YAML config
        return CONFIG.get("proxy", {}).get("no_proxy", ["localhost", "127.0.0.1"])

    trimmed = (entry.strip() for entry in raw_hosts.split(","))
    return [host for host in trimmed if host]
@@ -0,0 +1,6 @@
1
+ from .cleanup import cleanup_content
2
+ from .extraction import extract_content
3
+ from .identification import get_file_type
4
+ from .summary import summarize
5
+
6
+ __all__ = ["extract_content", "cleanup_content", "summarize", "get_file_type"]
@@ -0,0 +1,5 @@
1
+ """Content cleaning functionality for content-core."""
2
+
3
+ from .core import cleanup_content
4
+
5
+ __all__ = ["cleanup_content"]
@@ -0,0 +1,15 @@
1
+ from functools import partial
2
+
3
+ from content_core.models import ModelFactory
4
+ from content_core.templated_message import TemplatedMessageInput, templated_message
5
+
6
+
7
async def cleanup_content(content) -> str:
    """
    Run ``content`` through the model registered as 'cleanup_model'.

    The content is passed both as the user prompt text and as the ``content``
    entry of the template data, with "content/cleanup" as the system prompt
    template.

    Returns:
        Whatever ``templated_message`` produces for that input (annotated as str).
    """
    # The model is resolved before the input is built, as in the original flow.
    cleanup_model = ModelFactory.get_model('cleanup_model')
    message_input = TemplatedMessageInput(
        system_prompt_template="content/cleanup",
        user_prompt_text=content,
        data={"content": content},
    )
    return await templated_message(message_input, model=cleanup_model)
@@ -0,0 +1,13 @@
1
+ from typing import Dict, Union
2
+
3
+ from content_core.common import ProcessSourceInput, ProcessSourceOutput
4
+ from content_core.content.extraction.graph import graph
5
+
6
+ # todo: input/output schema do langgraph
7
+
8
+
9
async def extract_content(data: Union[ProcessSourceInput, Dict]) -> ProcessSourceOutput:
    """
    Invoke the extraction graph on ``data`` and wrap the result.

    Args:
        data: A ProcessSourceInput, or a dict of keyword arguments from which one
            is constructed.

    Returns:
        ProcessSourceOutput built from the mapping returned by ``graph.ainvoke``.
    """
    source_input = ProcessSourceInput(**data) if isinstance(data, dict) else data
    graph_result = await graph.ainvoke(source_input)
    return ProcessSourceOutput(**graph_result)