content-core 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,216 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ import os
5
+ import sys
6
+ from xml.etree import ElementTree as ET
7
+
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+
12
+ from dicttoxml import dicttoxml # type: ignore
13
+
14
+ from content_core.common import ProcessSourceInput
15
+ from content_core.content.cleanup import cleanup_content
16
+ from content_core.content.extraction import extract_content
17
+ from content_core.content.summary import summarize
18
+ from content_core.logging import configure_logging, logger
19
+
20
# Short aliases so callers that do `import content_core as cc` can write
# `cc.extract(...)` and `cc.clean(...)`.
extract, clean = extract_content, cleanup_content

# Set up the shared loguru logger through the centralized helper; the CLI
# entry points below call it again once the --debug flag has been parsed.
configure_logging(debug=False)
27
+
28
+
29
def parse_content_format(content: str) -> str:
    """Return the ``content`` payload embedded in a JSON or XML string.

    The input is first parsed as JSON. If it is a JSON object holding a
    non-null ``"content"`` key, that value is returned as a string. If the
    input is not valid JSON it is parsed as XML and the text of the first
    descendant ``<content>`` element is returned. In every other case,
    including any unexpected error (which is logged), the input is returned
    unchanged.
    """
    try:
        try:
            payload = json.loads(content)
        except json.JSONDecodeError:
            # Not JSON: fall back to looking for a <content> element in XML.
            try:
                node = ET.fromstring(content).find(".//content")
            except ET.ParseError:
                return content
            if node is None or node.text is None:
                return content
            return node.text

        # Valid JSON: only an object with a "content" key is unwrapped.
        if not isinstance(payload, dict) or "content" not in payload:
            return content
        value = payload["content"]
        if value is None:
            return content
        return str(value)
    except Exception as e:
        logger.error(f"Error parsing content: {e}")
        return content
51
+
52
+
53
def get_content(args, parser, allow_empty=False):
    """Resolve the CLI input text from the positional argument or stdin.

    Args:
        args: Parsed argparse namespace; its ``content`` attribute is used
            when it is not ``None``.
        parser: The argparse parser, used to report usage errors.
        allow_empty: When true, an empty input is returned instead of being
            rejected.

    Returns:
        The positional argument as given, or the stripped text read from
        stdin when no argument was supplied and stdin is not a terminal.
    """
    if args.content is not None:
        text = args.content
    elif sys.stdin.isatty():
        # Nothing on the command line and nothing piped in.
        parser.error("No content provided. Provide content or pipe input.")
    else:
        text = sys.stdin.read().strip()

    if not (text or allow_empty):
        parser.error("Empty input provided.")
    return text
66
+
67
+
68
async def process_input_content(content: str) -> str:
    """Resolve a URL or file path to its extracted text; pass other text through.

    The input is treated as a URL only when, after stripping surrounding
    whitespace, it is a single whitespace-free token starting with
    ``http://`` or ``https://`` (case-insensitive). A plain substring check
    for "http" would send any prose that merely mentions a link or the
    protocol to the URL extractor. Otherwise, if the input names an existing
    filesystem path, that path is extracted. Anything else is returned
    unchanged.

    Args:
        content: A URL, a file path, or literal text.

    Returns:
        The extracted ``content`` of the URL or file (or ``str(result)`` when
        the extraction result has no content), or the input itself.
    """
    candidate = content.strip()
    looks_like_url = (
        candidate.lower().startswith(("http://", "https://"))
        and len(candidate.split()) == 1
    )

    if looks_like_url:
        result = await extract_content(ProcessSourceInput(url=candidate))
        return result.content if result.content else str(result)

    if os.path.exists(content):
        result = await extract_content(ProcessSourceInput(file_path=content))
        return result.content if result.content else str(result)

    return content
77
+
78
+
79
async def ccore_main():
    """CLI logic for ccore (extract).

    Parses the command line, reads the input from the positional argument or
    stdin, resolves URLs and file paths to text, runs content extraction, and
    prints the result in the requested format:

    * ``xml``  - the dumped result model serialized with ``dicttoxml``
    * ``json`` - the result model's JSON dump
    * ``text`` - only the extracted ``content`` field (default)

    Any failure while resolving or extracting the input is logged and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Content Core CLI: Extract content with formatting options."
    )
    parser.add_argument(
        "-f",
        "--format",
        choices=["xml", "json", "text"],
        default="text",
        help="Output format (xml, json, or text). Default: text",
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    parser.add_argument(
        "content",
        nargs="?",
        help="Content to process (URL, file path, or text). If not provided, reads from stdin.",
    )

    args = parser.parse_args()

    # Adjust logging level based on debug flag using centralized configuration
    configure_logging(debug=args.debug)
    if args.debug:
        logger.debug("Debug logging enabled")

    content = get_content(args, parser)

    try:
        # Resolving a URL or file path can fail just like the extraction
        # itself, so it runs under the same error handling.
        content = await process_input_content(content)
        result = await extract_content(ProcessSourceInput(content=content))

        if args.format == "xml":
            output = dicttoxml(
                result.model_dump(), custom_root="result", attr_type=False
            ).decode("utf-8")
        elif args.format == "json":
            output = result.model_dump_json()
        else:  # text
            output = result.content
        print(output)
    except Exception as e:
        logger.error(f"Error extracting content: {e}")
        sys.exit(1)
125
+
126
+
127
async def cclean_main():
    """CLI logic for cclean.

    Parses the command line, reads the input from the positional argument or
    stdin, resolves URLs and file paths to text, unwraps a JSON/XML
    ``content`` field when one is present, then prints the cleaned content.

    Any failure while resolving or cleaning the input is logged and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Content Core CLI: Clean content string."
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    parser.add_argument(
        "content",
        nargs="?",
        help="Content to clean (URL, file path, text, JSON, or XML). If not provided, reads from stdin.",
    )

    args = parser.parse_args()

    # Adjust logging level based on debug flag using centralized configuration
    configure_logging(debug=args.debug)
    if args.debug:
        logger.debug("Debug logging enabled")

    content = get_content(args, parser)

    try:
        # Resolving a URL or file path can fail just like the cleanup call,
        # so it runs under the same error handling.
        content = await process_input_content(content)
        content = parse_content_format(content)
        result = await cleanup_content(content)
        print(result)
    except Exception as e:
        logger.error(f"Error cleaning content: {e}")
        sys.exit(1)
159
+
160
+
161
async def csum_main():
    """CLI logic for csum.

    Parses the command line, reads the input from the positional argument or
    stdin, resolves URLs and file paths to text, unwraps a JSON/XML
    ``content`` field when one is present, then prints a summary produced
    with the optional ``--context`` string.

    Any failure while resolving or summarizing the input is logged and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Content Core CLI: Summarize content with optional context."
    )
    parser.add_argument(
        "--context",
        default="",
        help="Optional context for summarization (e.g., 'summarize as if explaining to a child').",
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    parser.add_argument(
        "content",
        nargs="?",
        help="Content to summarize (URL, file path, text, JSON, or XML). If not provided, reads from stdin.",
    )

    args = parser.parse_args()

    # Adjust logging level based on debug flag using centralized configuration
    configure_logging(debug=args.debug)
    if args.debug:
        logger.debug("Debug logging enabled")

    content = get_content(args, parser)

    try:
        # Resolving a URL or file path can fail just like the summary call,
        # so it runs under the same error handling.
        content = await process_input_content(content)
        content = parse_content_format(content)
        result = await summarize(content, args.context)
        print(result)
    except Exception as e:
        logger.error(f"Error summarizing content: {e}")
        sys.exit(1)
198
+
199
+
200
def ccore():
    """Blocking entry point that drives the async ``ccore_main`` to completion."""
    main_coroutine = ccore_main()
    asyncio.run(main_coroutine)
203
+
204
+
205
def cclean():
    """Blocking entry point that drives the async ``cclean_main`` to completion."""
    main_coroutine = cclean_main()
    asyncio.run(main_coroutine)
208
+
209
+
210
def csum():
    """Blocking entry point that drives the async ``csum_main`` to completion."""
    main_coroutine = csum_main()
    asyncio.run(main_coroutine)
213
+
214
+
215
# When this module is executed as a script, run the ccore (extract) CLI.
if __name__ == "__main__":
    ccore()
@@ -0,0 +1,86 @@
1
+ # Content Core main configuration
2
+ # Copy this file to your project root or set CCORE_CONFIG_PATH to its location
3
+
4
+ speech_to_text:
5
+ provider: openai
6
+ model_name: gpt-4o-transcribe-diarize
7
+ timeout: 3600 # 1 hour - for processing very long audio files
8
+
9
+ default_model:
10
+ provider: openai
11
+ model_name: gpt-4o-mini
12
+ config:
13
+ temperature: 0.5
14
+ top_p: 1
15
+ max_tokens: 2000
16
+ timeout: 300 # 5 minutes - for general language model operations
17
+
18
+ cleanup_model:
19
+ provider: openai
20
+ model_name: gpt-4o-mini
21
+ config:
22
+ temperature: 0
23
+ max_tokens: 8000
24
+ output_format: json
25
+ timeout: 600 # 10 minutes - for complex content cleaning with large inputs
26
+
27
+ summary_model:
28
+ provider: openai
29
+ model_name: gpt-4o-mini
30
+ config:
31
+ temperature: 0
32
+ top_p: 1
33
+ max_tokens: 2000
34
+ timeout: 300 # 5 minutes - for content summarization
35
+
36
+ extraction:
37
+ document_engine: auto # auto | simple | docling - for files/documents
38
+ url_engine: auto # auto | simple | firecrawl | jina | crawl4ai | docling - for URLs
39
+ audio:
40
+ concurrency: 3 # Number of concurrent audio transcriptions (1-10)
41
+ docling:
42
+ output_format: markdown # markdown | html | json
43
+ pymupdf:
44
+ enable_formula_ocr: false # Enable OCR for formula-heavy pages (requires Tesseract)
45
+ formula_threshold: 3 # Minimum formulas per page to trigger OCR
46
+ ocr_fallback: true # Gracefully fall back to standard extraction if OCR fails
47
+
48
+ youtube_transcripts:
49
+ preferred_languages: ["en", "es", "pt"]
50
+
51
+ # Proxy configuration for HTTP/HTTPS requests
52
+ # Environment variables CCORE_HTTP_PROXY, HTTP_PROXY, or HTTPS_PROXY take precedence
53
+ proxy:
54
+ url: null # Proxy URL (e.g., "http://proxy.example.com:8080" or "http://user:pass@proxy:8080")
55
+ no_proxy: # List of hosts to bypass proxy
56
+ - "localhost"
57
+ - "127.0.0.1"
58
+
59
+ # Retry configuration for transient failures
60
+ # Each operation type can be configured independently
61
+ # Environment variables override these settings (e.g., CCORE_YOUTUBE_MAX_RETRIES)
62
+ retry:
63
+ youtube:
64
+ max_attempts: 5 # Number of retry attempts (YouTube has aggressive rate limiting)
65
+ base_delay: 2 # Base delay in seconds for exponential backoff
66
+ max_delay: 60 # Maximum delay between retries
67
+ url_api:
68
+ max_attempts: 3 # For API-based URL extraction (Jina, Firecrawl) and Crawl4AI
69
+ base_delay: 1
70
+ max_delay: 30
71
+ url_network:
72
+ max_attempts: 3 # For network-only operations (BeautifulSoup, HEAD requests)
73
+ base_delay: 0.5
74
+ max_delay: 10
75
+ audio:
76
+ max_attempts: 3 # For speech-to-text API calls
77
+ base_delay: 2
78
+ max_delay: 30
79
+ llm:
80
+ max_attempts: 3 # For LLM API calls (summary, cleanup)
81
+ base_delay: 1
82
+ max_delay: 30
83
+ download:
84
+ max_attempts: 3 # For remote file downloads
85
+ base_delay: 1
86
+ max_delay: 15
@@ -0,0 +1,38 @@
1
+ """Common utilities and shared code for content-core."""
2
+
3
+ from .exceptions import (
4
+ ContentCoreError,
5
+ InvalidInputError,
6
+ NotFoundError,
7
+ UnsupportedTypeException,
8
+ )
9
+ from .retry import (
10
+ RetryError,
11
+ retry_audio_transcription,
12
+ retry_download,
13
+ retry_llm,
14
+ retry_url_api,
15
+ retry_url_network,
16
+ retry_youtube,
17
+ )
18
+ from .state import ProcessSourceInput, ProcessSourceOutput, ProcessSourceState
19
+ from .utils import process_input_content
20
+
21
# Names re-exported by this package; each one is imported above from
# .exceptions, .state, .utils, or .retry.
__all__ = [
    # Exceptions (from .exceptions)
    "ContentCoreError",
    "UnsupportedTypeException",
    "InvalidInputError",
    "NotFoundError",
    # Processing state models (from .state)
    "ProcessSourceInput",
    "ProcessSourceState",
    "ProcessSourceOutput",
    # Input helper (from .utils)
    "process_input_content",
    # Retry decorators
    "retry_youtube",
    "retry_url_api",
    "retry_url_network",
    "retry_audio_transcription",
    "retry_llm",
    "retry_download",
    # Also from .retry
    "RetryError",
]
@@ -0,0 +1,70 @@
1
class ContentCoreError(Exception):
    """Root of the content-core exception hierarchy.

    The other exception classes in this module derive from it, so callers
    can catch this single type to handle any of them.
    """
5
+
6
+
7
class DatabaseOperationError(ContentCoreError):
    """Signals that a database operation failed."""
11
+
12
+
13
class UnsupportedTypeException(ContentCoreError):
    """Signals that the provided type is not supported."""
17
+
18
+
19
class InvalidInputError(ContentCoreError):
    """Signals that the provided input is invalid."""
23
+
24
+
25
class NotFoundError(ContentCoreError):
    """Signals that a requested resource could not be found."""
29
+
30
+
31
class AuthenticationError(ContentCoreError):
    """Signals an authentication problem."""
35
+
36
+
37
class ConfigurationError(ContentCoreError):
    """Signals a configuration problem."""
41
+
42
+
43
class ExternalServiceError(ContentCoreError):
    """Signals that an external service (e.g., an AI model) failed."""
47
+
48
+
49
class RateLimitError(ContentCoreError):
    """Signals that a rate limit was exceeded."""
53
+
54
+
55
class FileOperationError(ContentCoreError):
    """Signals that a file operation failed."""
59
+
60
+
61
class NetworkError(ContentCoreError):
    """Signals that a network operation failed."""
65
+
66
+
67
class NoTranscriptFound(ContentCoreError):
    """Signals that no transcript was found for a video."""