content-core 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/__init__.py +216 -0
- content_core/cc_config.yaml +86 -0
- content_core/common/__init__.py +38 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/retry.py +325 -0
- content_core/common/state.py +64 -0
- content_core/common/types.py +15 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +575 -0
- content_core/content/__init__.py +6 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +252 -0
- content_core/content/identification/__init__.py +9 -0
- content_core/content/identification/file_detector.py +505 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/logging.py +15 -0
- content_core/mcp/__init__.py +5 -0
- content_core/mcp/server.py +214 -0
- content_core/models.py +60 -0
- content_core/models_config.yaml +31 -0
- content_core/notebooks/run.ipynb +359 -0
- content_core/notebooks/urls.ipynb +154 -0
- content_core/processors/audio.py +272 -0
- content_core/processors/docling.py +79 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +292 -0
- content_core/processors/text.py +36 -0
- content_core/processors/url.py +324 -0
- content_core/processors/video.py +166 -0
- content_core/processors/youtube.py +262 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +70 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-1.10.0.dist-info/METADATA +742 -0
- content_core-1.10.0.dist-info/RECORD +44 -0
- content_core-1.10.0.dist-info/WHEEL +4 -0
- content_core-1.10.0.dist-info/entry_points.txt +5 -0
- content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""Content Core MCP Server implementation."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from io import StringIO
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, Optional
|
|
10
|
+
|
|
11
|
+
from fastmcp import FastMCP
|
|
12
|
+
from loguru import logger
|
|
13
|
+
|
|
14
|
+
# Suppress MoviePy/ffmpeg log output for MCP compatibility: the MCP STDIO
# transport speaks JSON-RPC over stdout, so any stray print/log there would
# corrupt the protocol stream.
os.environ["IMAGEIO_LOG_LEVEL"] = "error"
os.environ["FFMPEG_LOG_LEVEL"] = "error"

# Configure loguru to not output to stdout (which would interfere with MCP)
logger.remove()  # Remove default handler (writes to stderr, but reset anyway)
logger.add(sys.stderr, level="INFO")  # Add stderr handler only
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@contextmanager
def suppress_stdout():
    """Temporarily redirect ``sys.stdout`` into an in-memory buffer.

    Wrapped around extraction calls so that chatty third-party libraries
    (e.g. MoviePy) cannot write to the real stdout, which carries the MCP
    STDIO protocol. The original stream is always restored, even on error.
    """
    saved_stream = sys.stdout
    sys.stdout = StringIO()
    try:
        yield
    finally:
        sys.stdout = saved_stream
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Make content_core importable when this module is run from a source checkout:
# prepend the repository root (three directory levels up) to sys.path.
# NOTE(review): harmless but redundant when the package is pip-installed.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

import content_core as cc

# Initialize MCP server
mcp = FastMCP("Content Core MCP Server")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _error_response(
    error: str,
    source_type: Optional[str],
    source: Optional[str],
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build the uniform failure payload returned by ``_extract_content_impl``."""
    return {
        "success": False,
        "error": error,
        "source_type": source_type,
        "source": source,
        "content": None,
        "metadata": metadata,
    }


async def _extract_content_impl(
    url: Optional[str] = None, file_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extract content from a URL or file using Content Core's auto engine. This is useful for processing Youtube transcripts, website content, PDFs, ePUB, Office files, etc. You can also use it to extract transcripts from audio or video files.

    Args:
        url: Optional URL to extract content from
        file_path: Optional file path to extract content from

    Returns:
        Dict with keys ``success``, ``error``, ``source_type``, ``source``,
        ``content`` and ``metadata``. All failures (bad arguments, missing
        file, extraction errors) are reported via ``success=False`` plus an
        ``error`` message — this function never raises to the caller.
    """
    # Validate input - exactly one of url / file_path must be provided.
    # (url is None) == (file_path is None) covers both "neither" and "both".
    if (url is None) == (file_path is None):
        return _error_response(
            "Exactly one of 'url' or 'file_path' must be provided", None, None
        )

    # Determine source type and validate
    source_type = "url" if url else "file"
    source = url if url else file_path

    if file_path:
        path = Path(file_path)
        if not path.exists():
            return _error_response(f"File not found: {file_path}", source_type, source)

        try:
            # Canonicalize symlinks and ".." segments before handing the path
            # to the extractor.
            # NOTE(review): resolve() alone does not restrict access to any
            # directory — add an allow-list / sandbox check here if needed.
            resolved = path.resolve()
        except Exception as e:
            return _error_response(f"Invalid file path: {str(e)}", source_type, source)

        extraction_request = {"file_path": str(resolved)}
    else:
        extraction_request = {"url": url}

    # Track start time. Timezone-aware now(); datetime.utcnow() is deprecated
    # since Python 3.12. Keep the original "...Z" timestamp formatting.
    start_time = datetime.now(timezone.utc)
    started_iso = start_time.isoformat().replace("+00:00", "Z")

    try:
        # Use Content Core's extract_content with auto engine
        logger.info(f"Extracting content from {source_type}: {source}")

        # Suppress stdout so MoviePy and other libraries cannot corrupt the
        # MCP STDIO protocol with stray prints.
        with suppress_stdout():
            result = await cc.extract_content(extraction_request)

        extraction_time = (datetime.now(timezone.utc) - start_time).total_seconds()

        # Build response - result is a ProcessSourceOutput object
        response = {
            "success": True,
            "error": None,
            "source_type": source_type,
            "source": source,
            "content": result.content or "",
            "metadata": {
                "extraction_time_seconds": extraction_time,
                "extraction_timestamp": started_iso,
                "content_length": len(result.content or ""),
                "identified_type": result.identified_type or "unknown",
                "identified_provider": result.identified_provider or "",
            },
        }

        # Merge any extractor-provided metadata (may overwrite the keys above).
        if result.metadata:
            response["metadata"].update(result.metadata)

        # Add specific metadata based on source type.
        if result.title:
            response["metadata"]["title"] = result.title
        if source_type == "url":
            if result.url:
                response["metadata"]["final_url"] = result.url
        else:
            if result.file_path:
                response["metadata"]["file_path"] = result.file_path
            response["metadata"]["file_size"] = Path(file_path).stat().st_size
            response["metadata"]["file_extension"] = Path(file_path).suffix

        logger.info(f"Successfully extracted content from {source_type}: {source}")
        return response

    except Exception as e:
        logger.error(f"Error extracting content from {source_type} {source}: {str(e)}")
        return _error_response(
            str(e),
            source_type,
            source,
            metadata={
                "extraction_timestamp": started_iso,
                "error_type": type(e).__name__,
            },
        )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@mcp.tool
async def extract_content(
    url: Optional[str] = None, file_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extract content from a URL or file using Content Core's auto engine.

    Thin MCP-facing wrapper: all validation and extraction logic lives in
    ``_extract_content_impl``; exactly one of the two arguments must be given.

    Args:
        url: Optional URL to extract content from
        file_path: Optional file path to extract content from

    Returns:
        JSON object containing extracted content and metadata; on failure the
        object has ``success=False`` and an ``error`` message.
    """
    return await _extract_content_impl(url=url, file_path=file_path)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def main():
    """Entry point for the MCP server (STDIO transport)."""
    # Pre-fetch ffmpeg via MoviePy so its download chatter happens before the
    # MCP protocol starts; if MoviePy is absent or misconfigured, skip quietly.
    try:
        import moviepy.config as mp_config

        mp_config.check_and_download_cmd("ffmpeg")
    except Exception:
        pass

    logger.info("Starting Content Core MCP Server")

    # Run with STDIO transport for MCP compatibility.
    mcp.run()


if __name__ == "__main__":
    main()
|
content_core/models.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from esperanto import AIFactory
|
|
2
|
+
|
|
3
|
+
from .config import CONFIG, get_proxy
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ModelFactory:
    """Process-wide cache of esperanto model instances.

    Instances are keyed by model alias and invalidated automatically when the
    configured proxy changes between calls.
    """

    _instances = {}
    _proxy_at_creation = {}  # proxy value in effect when each instance was built

    @staticmethod
    def get_model(model_alias):
        """Return a cached model for *model_alias*, building it on first use."""
        active_proxy = get_proxy()

        if model_alias in ModelFactory._instances:
            # Cache hit is only valid if the proxy has not changed since creation.
            if ModelFactory._proxy_at_creation.get(model_alias) == active_proxy:
                return ModelFactory._instances[model_alias]
            # Proxy changed -> drop the stale instance and rebuild below.
            ModelFactory._instances.pop(model_alias, None)
            ModelFactory._proxy_at_creation.pop(model_alias, None)

        entry = CONFIG.get(model_alias, {})
        if not entry:
            raise ValueError(
                f"Configuração para o modelo {model_alias} não encontrada."
            )

        provider = entry.get("provider")
        model_name = entry.get("model_name")

        if model_alias == "speech_to_text":
            # STT models receive timeout/proxy via a plain config dict.
            stt_config = {}
            timeout = entry.get("timeout")
            if timeout:
                stt_config["timeout"] = timeout
            if active_proxy:
                stt_config["proxy"] = active_proxy
            instance = AIFactory.create_speech_to_text(
                provider, model_name, stt_config
            )
        else:
            lm_config = entry.get("config", {}).copy()
            if active_proxy:
                lm_config["proxy"] = active_proxy
            instance = AIFactory.create_language(
                provider, model_name, config=lm_config
            )

        ModelFactory._instances[model_alias] = instance
        ModelFactory._proxy_at_creation[model_alias] = active_proxy
        return instance

    @staticmethod
    def clear_cache():
        """Drop every cached model instance (e.g. after a proxy config change)."""
        ModelFactory._instances.clear()
        ModelFactory._proxy_at_creation.clear()
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
speech_to_text:
|
|
2
|
+
provider: openai
|
|
3
|
+
model_name: whisper-1
|
|
4
|
+
timeout: 3600 # 1 hour - for processing very long audio files
|
|
5
|
+
|
|
6
|
+
default_model:
|
|
7
|
+
provider: openai
|
|
8
|
+
model_name: gpt-4o-mini
|
|
9
|
+
config:
|
|
10
|
+
temperature: 0.5
|
|
11
|
+
top_p: 1
|
|
12
|
+
max_tokens: 2000
|
|
13
|
+
timeout: 300 # 5 minutes - for general language model operations
|
|
14
|
+
|
|
15
|
+
cleanup_model:
|
|
16
|
+
provider: openai
|
|
17
|
+
model_name: gpt-4o-mini
|
|
18
|
+
config:
|
|
19
|
+
temperature: 0
|
|
20
|
+
max_tokens: 8000
|
|
21
|
+
output_format: json
|
|
22
|
+
timeout: 600 # 10 minutes - for complex content cleaning with large inputs
|
|
23
|
+
|
|
24
|
+
summary_model:
|
|
25
|
+
provider: openai
|
|
26
|
+
model_name: gpt-4o-mini
|
|
27
|
+
config:
|
|
28
|
+
temperature: 0
|
|
29
|
+
top_p: 1
|
|
30
|
+
max_tokens: 2000
|
|
31
|
+
timeout: 300 # 5 minutes - for content summarization
|