content-core 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/__init__.py +216 -0
- content_core/cc_config.yaml +86 -0
- content_core/common/__init__.py +38 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/retry.py +325 -0
- content_core/common/state.py +64 -0
- content_core/common/types.py +15 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +575 -0
- content_core/content/__init__.py +6 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +252 -0
- content_core/content/identification/__init__.py +9 -0
- content_core/content/identification/file_detector.py +505 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/logging.py +15 -0
- content_core/mcp/__init__.py +5 -0
- content_core/mcp/server.py +214 -0
- content_core/models.py +60 -0
- content_core/models_config.yaml +31 -0
- content_core/notebooks/run.ipynb +359 -0
- content_core/notebooks/urls.ipynb +154 -0
- content_core/processors/audio.py +272 -0
- content_core/processors/docling.py +79 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +292 -0
- content_core/processors/text.py +36 -0
- content_core/processors/url.py +324 -0
- content_core/processors/video.py +166 -0
- content_core/processors/youtube.py +262 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +70 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-1.10.0.dist-info/METADATA +742 -0
- content_core-1.10.0.dist-info/RECORD +44 -0
- content_core-1.10.0.dist-info/WHEEL +4 -0
- content_core-1.10.0.dist-info/entry_points.txt +5 -0
- content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
content_core/__init__.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import asyncio
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from xml.etree import ElementTree as ET
|
|
7
|
+
|
|
8
|
+
from dotenv import load_dotenv
|
|
9
|
+
|
|
10
|
+
load_dotenv()
|
|
11
|
+
|
|
12
|
+
from dicttoxml import dicttoxml # type: ignore
|
|
13
|
+
|
|
14
|
+
from content_core.common import ProcessSourceInput
|
|
15
|
+
from content_core.content.cleanup import cleanup_content
|
|
16
|
+
from content_core.content.extraction import extract_content
|
|
17
|
+
from content_core.content.summary import summarize
|
|
18
|
+
from content_core.logging import configure_logging, logger
|
|
19
|
+
|
|
20
|
+
# Exposing functions for direct access when importing content_core as cc
|
|
21
|
+
extract = extract_content
|
|
22
|
+
clean = cleanup_content
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Configure loguru logger using centralized configuration
|
|
26
|
+
configure_logging(debug=False)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_content_format(content: str) -> str:
    """Extract the ``content`` field from a JSON or XML payload.

    The input is tried as JSON first. If it decodes to an object holding a
    ``content`` key, that value is returned as a string (a ``null`` value
    leaves the input untouched). Valid JSON without such a key is returned
    unchanged and is not retried as XML.

    Input that is not valid JSON is then tried as XML. The text of the first
    ``content`` element is returned, whether that element is the document
    root or one of its descendants.

    Args:
        content: Raw text that may be JSON, XML, or plain text.

    Returns:
        The extracted ``content`` value, or ``content`` itself when nothing
        could be extracted or an unexpected error occurred.
    """
    try:
        try:
            payload = json.loads(content)
        except json.JSONDecodeError:
            payload_is_json = False
        else:
            payload_is_json = True

        if payload_is_json:
            if isinstance(payload, dict) and "content" in payload:
                extracted = payload["content"]
                return str(extracted) if extracted is not None else content
            return content

        # NOTE(review): xml.etree is not hardened against maliciously
        # constructed XML; if this ever parses untrusted input, consider a
        # hardened parser.
        try:
            root = ET.fromstring(content)
        except ET.ParseError:
            return content

        # Element.find() only searches below the element it is called on, so
        # a document whose root *is* <content> has to be checked explicitly.
        node = root if root.tag == "content" else root.find(".//content")
        if node is not None and node.text is not None:
            return node.text
        return content
    except Exception as e:
        # Best effort: any unexpected failure falls back to the raw input.
        logger.error(f"Error parsing content: {e}")
        return content
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_content(args, parser, allow_empty=False):
    """Resolve the CLI input text from the positional argument or stdin.

    Args:
        args: Parsed arguments exposing a ``content`` attribute.
        parser: Parser whose ``error`` method is used to report bad input.
        allow_empty: When true, an empty input is returned instead of being
            reported as an error.

    Returns:
        The positional ``content`` argument as given, or the stripped text
        read from stdin when no argument was supplied.
    """
    provided = args.content
    if provided is not None:
        text = provided
    elif not sys.stdin.isatty():
        # Input is being piped in; surrounding whitespace is dropped.
        text = sys.stdin.read().strip()
    else:
        # Interactive terminal and no argument: nothing to read.
        parser.error("No content provided. Provide content or pipe input.")

    if not (text or allow_empty):
        parser.error("Empty input provided.")
    return text
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
async def process_input_content(content: str) -> str:
    """Resolve a URL or file path to its extracted text.

    The input is treated as a URL only when, after stripping surrounding
    whitespace, it is a single whitespace-free token starting with
    ``http://`` or ``https://``. Free text that merely mentions "http"
    somewhere is therefore no longer sent to the URL extractor. Otherwise,
    if the input names an existing filesystem path, that file is extracted.
    Anything else is returned unchanged.

    Args:
        content: A URL, a file path, or literal text.

    Returns:
        The extracted content for URLs and files (or the string form of the
        extraction result when its ``content`` is empty), otherwise the
        original ``content``.
    """
    candidate = content.strip()
    looks_like_url = candidate.lower().startswith(
        ("http://", "https://")
    ) and not any(ch.isspace() for ch in candidate)

    if looks_like_url:
        result = await extract_content(ProcessSourceInput(url=candidate))
    elif os.path.exists(content):
        result = await extract_content(ProcessSourceInput(file_path=content))
    else:
        # Plain text: nothing to resolve.
        return content

    return result.content if result.content else str(result)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
async def ccore_main():
    """CLI logic for ccore (extract).

    Parses the command line, resolves the input (argument or stdin; URL,
    file path, or text), runs extraction, and prints the result as XML,
    JSON, or plain text depending on ``--format``.

    Any failure while resolving or extracting the input is logged and the
    process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Content Core CLI: Extract content with formatting options."
    )
    parser.add_argument(
        "-f",
        "--format",
        choices=["xml", "json", "text"],
        default="text",
        help="Output format (xml, json, or text). Default: text",
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    parser.add_argument(
        "content",
        nargs="?",
        help="Content to process (URL, file path, or text). If not provided, reads from stdin.",
    )

    args = parser.parse_args()

    # Adjust logging level based on debug flag using centralized configuration
    configure_logging(debug=args.debug)
    if args.debug:
        logger.debug("Debug logging enabled")

    content = get_content(args, parser)

    try:
        # Resolving a URL or file performs extraction too, so it belongs
        # inside the guarded section alongside the main extraction call.
        content = await process_input_content(content)
        result = await extract_content(ProcessSourceInput(content=content))
        if args.format == "xml":
            output = dicttoxml(
                result.model_dump(), custom_root="result", attr_type=False
            ).decode("utf-8")
        elif args.format == "json":
            output = result.model_dump_json()
        else:  # text
            output = result.content
        print(output)
    except Exception as e:
        logger.error(f"Error extracting content: {e}")
        sys.exit(1)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
async def cclean_main():
    """CLI logic for cclean.

    Parses the command line, resolves the input (argument or stdin; URL,
    file path, text, JSON, or XML), unwraps a JSON/XML ``content`` field if
    present, runs cleanup, and prints the result.

    Any failure while resolving, unwrapping, or cleaning the input is logged
    and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Content Core CLI: Clean content string."
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    parser.add_argument(
        "content",
        nargs="?",
        help="Content to clean (URL, file path, text, JSON, or XML). If not provided, reads from stdin.",
    )

    args = parser.parse_args()

    # Adjust logging level based on debug flag using centralized configuration
    configure_logging(debug=args.debug)
    if args.debug:
        logger.debug("Debug logging enabled")

    content = get_content(args, parser)

    try:
        # URL/file resolution can fail just like the cleanup call, so it is
        # guarded by the same handler.
        content = await process_input_content(content)
        content = parse_content_format(content)
        result = await cleanup_content(content)
        print(result)
    except Exception as e:
        logger.error(f"Error cleaning content: {e}")
        sys.exit(1)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
async def csum_main():
    """CLI logic for csum.

    Parses the command line, resolves the input (argument or stdin; URL,
    file path, text, JSON, or XML), unwraps a JSON/XML ``content`` field if
    present, summarizes it with the optional ``--context``, and prints the
    result.

    Any failure while resolving, unwrapping, or summarizing the input is
    logged and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Content Core CLI: Summarize content with optional context."
    )
    parser.add_argument(
        "--context",
        default="",
        help="Optional context for summarization (e.g., 'summarize as if explaining to a child').",
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    parser.add_argument(
        "content",
        nargs="?",
        help="Content to summarize (URL, file path, text, JSON, or XML). If not provided, reads from stdin.",
    )

    args = parser.parse_args()

    # Adjust logging level based on debug flag using centralized configuration
    configure_logging(debug=args.debug)
    if args.debug:
        logger.debug("Debug logging enabled")

    content = get_content(args, parser)

    try:
        # URL/file resolution can fail just like the summarization call, so
        # it is guarded by the same handler.
        content = await process_input_content(content)
        content = parse_content_format(content)
        result = await summarize(content, args.context)
        print(result)
    except Exception as e:
        logger.error(f"Error summarizing content: {e}")
        sys.exit(1)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def ccore():
    """Blocking entry point that drives ``ccore_main`` to completion."""
    main_coroutine = ccore_main()
    asyncio.run(main_coroutine)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def cclean():
    """Blocking entry point that drives ``cclean_main`` to completion."""
    main_coroutine = cclean_main()
    asyncio.run(main_coroutine)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def csum():
    """Blocking entry point that drives ``csum_main`` to completion."""
    main_coroutine = csum_main()
    asyncio.run(main_coroutine)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
if __name__ == "__main__":
|
|
216
|
+
ccore()
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Content Core main configuration
|
|
2
|
+
# Copy this file to your project root or set CCORE_CONFIG_PATH to its location
|
|
3
|
+
|
|
4
|
+
speech_to_text:
|
|
5
|
+
provider: openai
|
|
6
|
+
model_name: gpt-4o-transcribe-diarize
|
|
7
|
+
timeout: 3600 # 1 hour - for processing very long audio files
|
|
8
|
+
|
|
9
|
+
default_model:
|
|
10
|
+
provider: openai
|
|
11
|
+
model_name: gpt-4o-mini
|
|
12
|
+
config:
|
|
13
|
+
temperature: 0.5
|
|
14
|
+
top_p: 1
|
|
15
|
+
max_tokens: 2000
|
|
16
|
+
timeout: 300 # 5 minutes - for general language model operations
|
|
17
|
+
|
|
18
|
+
cleanup_model:
|
|
19
|
+
provider: openai
|
|
20
|
+
model_name: gpt-4o-mini
|
|
21
|
+
config:
|
|
22
|
+
temperature: 0
|
|
23
|
+
max_tokens: 8000
|
|
24
|
+
output_format: json
|
|
25
|
+
timeout: 600 # 10 minutes - for complex content cleaning with large inputs
|
|
26
|
+
|
|
27
|
+
summary_model:
|
|
28
|
+
provider: openai
|
|
29
|
+
model_name: gpt-4o-mini
|
|
30
|
+
config:
|
|
31
|
+
temperature: 0
|
|
32
|
+
top_p: 1
|
|
33
|
+
max_tokens: 2000
|
|
34
|
+
timeout: 300 # 5 minutes - for content summarization
|
|
35
|
+
|
|
36
|
+
extraction:
|
|
37
|
+
document_engine: auto # auto | simple | docling - for files/documents
|
|
38
|
+
url_engine: auto # auto | simple | firecrawl | jina | crawl4ai | docling - for URLs
|
|
39
|
+
audio:
|
|
40
|
+
concurrency: 3 # Number of concurrent audio transcriptions (1-10)
|
|
41
|
+
docling:
|
|
42
|
+
output_format: markdown # markdown | html | json
|
|
43
|
+
pymupdf:
|
|
44
|
+
enable_formula_ocr: false # Enable OCR for formula-heavy pages (requires Tesseract)
|
|
45
|
+
formula_threshold: 3 # Minimum formulas per page to trigger OCR
|
|
46
|
+
ocr_fallback: true # Gracefully fall back to standard extraction if OCR fails
|
|
47
|
+
|
|
48
|
+
youtube_transcripts:
|
|
49
|
+
preferred_languages: ["en", "es", "pt"]
|
|
50
|
+
|
|
51
|
+
# Proxy configuration for HTTP/HTTPS requests
|
|
52
|
+
# Environment variables CCORE_HTTP_PROXY, HTTP_PROXY, or HTTPS_PROXY take precedence
|
|
53
|
+
proxy:
|
|
54
|
+
url: null # Proxy URL (e.g., "http://proxy.example.com:8080" or "http://user:pass@proxy:8080")
|
|
55
|
+
no_proxy: # List of hosts to bypass proxy
|
|
56
|
+
- "localhost"
|
|
57
|
+
- "127.0.0.1"
|
|
58
|
+
|
|
59
|
+
# Retry configuration for transient failures
|
|
60
|
+
# Each operation type can be configured independently
|
|
61
|
+
# Environment variables override these settings (e.g., CCORE_YOUTUBE_MAX_RETRIES)
|
|
62
|
+
retry:
|
|
63
|
+
youtube:
|
|
64
|
+
max_attempts: 5 # Number of retry attempts (YouTube has aggressive rate limiting)
|
|
65
|
+
base_delay: 2 # Base delay in seconds for exponential backoff
|
|
66
|
+
max_delay: 60 # Maximum delay between retries
|
|
67
|
+
url_api:
|
|
68
|
+
max_attempts: 3 # For API-based URL extraction (Jina, Firecrawl) and Crawl4AI
|
|
69
|
+
base_delay: 1
|
|
70
|
+
max_delay: 30
|
|
71
|
+
url_network:
|
|
72
|
+
max_attempts: 3 # For network-only operations (BeautifulSoup, HEAD requests)
|
|
73
|
+
base_delay: 0.5
|
|
74
|
+
max_delay: 10
|
|
75
|
+
audio:
|
|
76
|
+
max_attempts: 3 # For speech-to-text API calls
|
|
77
|
+
base_delay: 2
|
|
78
|
+
max_delay: 30
|
|
79
|
+
llm:
|
|
80
|
+
max_attempts: 3 # For LLM API calls (summary, cleanup)
|
|
81
|
+
base_delay: 1
|
|
82
|
+
max_delay: 30
|
|
83
|
+
download:
|
|
84
|
+
max_attempts: 3 # For remote file downloads
|
|
85
|
+
base_delay: 1
|
|
86
|
+
max_delay: 15
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Common utilities and shared code for content-core."""
|
|
2
|
+
|
|
3
|
+
from .exceptions import (
|
|
4
|
+
ContentCoreError,
|
|
5
|
+
InvalidInputError,
|
|
6
|
+
NotFoundError,
|
|
7
|
+
UnsupportedTypeException,
|
|
8
|
+
)
|
|
9
|
+
from .retry import (
|
|
10
|
+
RetryError,
|
|
11
|
+
retry_audio_transcription,
|
|
12
|
+
retry_download,
|
|
13
|
+
retry_llm,
|
|
14
|
+
retry_url_api,
|
|
15
|
+
retry_url_network,
|
|
16
|
+
retry_youtube,
|
|
17
|
+
)
|
|
18
|
+
from .state import ProcessSourceInput, ProcessSourceOutput, ProcessSourceState
|
|
19
|
+
from .utils import process_input_content
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"ContentCoreError",
|
|
23
|
+
"UnsupportedTypeException",
|
|
24
|
+
"InvalidInputError",
|
|
25
|
+
"NotFoundError",
|
|
26
|
+
"ProcessSourceInput",
|
|
27
|
+
"ProcessSourceState",
|
|
28
|
+
"ProcessSourceOutput",
|
|
29
|
+
"process_input_content",
|
|
30
|
+
# Retry decorators
|
|
31
|
+
"retry_youtube",
|
|
32
|
+
"retry_url_api",
|
|
33
|
+
"retry_url_network",
|
|
34
|
+
"retry_audio_transcription",
|
|
35
|
+
"retry_llm",
|
|
36
|
+
"retry_download",
|
|
37
|
+
"RetryError",
|
|
38
|
+
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
class ContentCoreError(Exception):
    """Base exception class for Content Core errors.

    The other exception classes in this module derive from it, so callers
    can catch this type to handle any of them.
    """
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DatabaseOperationError(ContentCoreError):
    """Signals that a database operation did not succeed."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class UnsupportedTypeException(ContentCoreError):
    """Signals that the supplied type is not supported."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class InvalidInputError(ContentCoreError):
    """Signals that the supplied input is invalid."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NotFoundError(ContentCoreError):
    """Signals that a requested resource could not be found."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AuthenticationError(ContentCoreError):
    """Signals an authentication problem."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ConfigurationError(ContentCoreError):
    """Signals a configuration problem."""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ExternalServiceError(ContentCoreError):
    """Signals that an external service (e.g., an AI model) failed."""
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class RateLimitError(ContentCoreError):
    """Signals that a rate limit has been exceeded."""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class FileOperationError(ContentCoreError):
    """Signals that a file operation did not succeed."""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class NetworkError(ContentCoreError):
    """Signals that a network operation did not succeed."""
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class NoTranscriptFound(ContentCoreError):
    """Signals that no transcript was found for a video."""
|