content-core 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,214 @@
1
+ """Content Core MCP Server implementation."""
2
+
3
import os
import sys
from contextlib import contextmanager
from datetime import datetime, timezone
from io import StringIO
from pathlib import Path
from typing import Any, Dict, Optional

from fastmcp import FastMCP
from loguru import logger
13
+
14
# MCP speaks its protocol over stdout, so third-party libraries must not write
# there. Quiet imageio/ffmpeg (used by MoviePy) via their environment knobs.
os.environ["IMAGEIO_LOG_LEVEL"] = "error"
os.environ["FFMPEG_LOG_LEVEL"] = "error"

# Route all loguru output to stderr only; stdout stays reserved for MCP.
logger.remove()  # drop loguru's default handler (writes to stderr by default, but be explicit)
logger.add(sys.stderr, level="INFO")  # keep a single stderr sink at INFO level
21
+
22
+
23
@contextmanager
def suppress_stdout():
    """Temporarily redirect ``sys.stdout`` into an in-memory sink.

    Output produced inside the ``with`` body is captured and discarded; the
    previous stdout object is restored on exit even if the body raises.
    """
    saved, sys.stdout = sys.stdout, StringIO()
    try:
        yield
    finally:
        sys.stdout = saved
32
+
33
+
34
# Make the package importable when this file is run straight from a source
# checkout: prepend the grandparent directory (the repo root) to sys.path.
# NOTE(review): redundant when the package is pip-installed — confirm it is
# still needed before removing.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

import content_core as cc

# Module-level FastMCP server instance; tools below register against it.
mcp = FastMCP("Content Core MCP Server")
41
+
42
+
43
+ async def _extract_content_impl(
44
+ url: Optional[str] = None, file_path: Optional[str] = None
45
+ ) -> Dict[str, Any]:
46
+ """
47
+ Extract content from a URL or file using Content Core's auto engine. This is useful for processing Youtube transcripts, website content, PDFs, ePUB, Office files, etc. You can also use it to extract transcripts from audio or video files.
48
+
49
+ Args:
50
+ url: Optional URL to extract content from
51
+ file_path: Optional file path to extract content from
52
+
53
+ Returns:
54
+ JSON object containing extracted content and metadata
55
+
56
+ Raises:
57
+ ValueError: If neither or both url and file_path are provided
58
+ """
59
+ # Validate input - exactly one must be provided
60
+ if (url is None and file_path is None) or (
61
+ url is not None and file_path is not None
62
+ ):
63
+ return {
64
+ "success": False,
65
+ "error": "Exactly one of 'url' or 'file_path' must be provided",
66
+ "source_type": None,
67
+ "source": None,
68
+ "content": None,
69
+ "metadata": None,
70
+ }
71
+
72
+ # Determine source type and validate
73
+ source_type = "url" if url else "file"
74
+ source = url if url else file_path
75
+
76
+ # Additional validation for file paths
77
+ if file_path:
78
+ path = Path(file_path)
79
+ if not path.exists():
80
+ return {
81
+ "success": False,
82
+ "error": f"File not found: {file_path}",
83
+ "source_type": source_type,
84
+ "source": source,
85
+ "content": None,
86
+ "metadata": None,
87
+ }
88
+
89
+ # Security check - ensure no directory traversal
90
+ try:
91
+ # Resolve to absolute path and ensure it's not trying to access sensitive areas
92
+ path.resolve()
93
+ # You might want to add additional checks here based on your security requirements
94
+ except Exception as e:
95
+ return {
96
+ "success": False,
97
+ "error": f"Invalid file path: {str(e)}",
98
+ "source_type": source_type,
99
+ "source": source,
100
+ "content": None,
101
+ "metadata": None,
102
+ }
103
+
104
+ # Build extraction request
105
+ extraction_request = {}
106
+ if url:
107
+ extraction_request["url"] = url
108
+ else:
109
+ extraction_request["file_path"] = str(Path(file_path).resolve())
110
+
111
+ # Track start time
112
+ start_time = datetime.utcnow()
113
+
114
+ try:
115
+ # Use Content Core's extract_content with auto engine
116
+ logger.info(f"Extracting content from {source_type}: {source}")
117
+
118
+ # Suppress stdout to prevent MoviePy and other libraries from interfering with MCP protocol
119
+ with suppress_stdout():
120
+ result = await cc.extract_content(extraction_request)
121
+
122
+ # Calculate extraction time
123
+ extraction_time = (datetime.utcnow() - start_time).total_seconds()
124
+
125
+ # Build response - result is a ProcessSourceOutput object
126
+ response = {
127
+ "success": True,
128
+ "error": None,
129
+ "source_type": source_type,
130
+ "source": source,
131
+ "content": result.content or "",
132
+ "metadata": {
133
+ "extraction_time_seconds": extraction_time,
134
+ "extraction_timestamp": start_time.isoformat() + "Z",
135
+ "content_length": len(result.content or ""),
136
+ "identified_type": result.identified_type or "unknown",
137
+ "identified_provider": result.identified_provider or "",
138
+ },
139
+ }
140
+
141
+ # Add metadata from the result
142
+ if result.metadata:
143
+ response["metadata"].update(result.metadata)
144
+
145
+ # Add specific metadata based on source type
146
+ if source_type == "url":
147
+ if result.title:
148
+ response["metadata"]["title"] = result.title
149
+ if result.url:
150
+ response["metadata"]["final_url"] = result.url
151
+ elif source_type == "file":
152
+ if result.title:
153
+ response["metadata"]["title"] = result.title
154
+ if result.file_path:
155
+ response["metadata"]["file_path"] = result.file_path
156
+ response["metadata"]["file_size"] = Path(file_path).stat().st_size
157
+ response["metadata"]["file_extension"] = Path(file_path).suffix
158
+
159
+ logger.info(f"Successfully extracted content from {source_type}: {source}")
160
+ return response
161
+
162
+ except Exception as e:
163
+ logger.error(f"Error extracting content from {source_type} {source}: {str(e)}")
164
+ return {
165
+ "success": False,
166
+ "error": str(e),
167
+ "source_type": source_type,
168
+ "source": source,
169
+ "content": None,
170
+ "metadata": {
171
+ "extraction_timestamp": start_time.isoformat() + "Z",
172
+ "error_type": type(e).__name__,
173
+ },
174
+ }
175
+
176
+
177
@mcp.tool
async def extract_content(
    url: Optional[str] = None, file_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    MCP tool: extract content from exactly one source — a URL or a local file —
    using Content Core's auto engine.

    Args:
        url: Optional URL to extract content from
        file_path: Optional file path to extract content from

    Returns:
        JSON object containing extracted content and metadata; on invalid
        input the object carries ``success: False`` and an ``error`` message.
    """
    return await _extract_content_impl(url=url, file_path=file_path)
195
+
196
+
197
def main():
    """Launch the Content Core MCP server over STDIO."""
    # Best effort: warm up MoviePy's ffmpeg lookup now so it does not emit
    # download chatter in the middle of a later extraction.
    try:
        import moviepy.config as mp_config

        mp_config.check_and_download_cmd("ffmpeg")
    except Exception:
        # MoviePy missing or unconfigured — features that need it are simply
        # unavailable; nothing to do here.
        pass

    logger.info("Starting Content Core MCP Server")
    # mcp.run() defaults to the STDIO transport expected by MCP clients.
    mcp.run()


if __name__ == "__main__":
    main()
content_core/models.py ADDED
@@ -0,0 +1,60 @@
1
+ from esperanto import AIFactory
2
+
3
+ from .config import CONFIG, get_proxy
4
+
5
+
6
class ModelFactory:
    """Lazily builds and caches esperanto model instances by config alias.

    A cached instance is discarded and rebuilt whenever the configured proxy
    changes between calls, so callers always receive a model wired to the
    current proxy settings.
    """

    _instances = {}  # alias -> cached model instance
    _proxy_at_creation = {}  # alias -> proxy value the instance was built with

    @staticmethod
    def get_model(model_alias):
        """Return the (possibly cached) model registered under ``model_alias``."""
        active_proxy = get_proxy()

        # Invalidate a stale cache entry if the proxy changed since creation.
        if (
            model_alias in ModelFactory._instances
            and ModelFactory._proxy_at_creation.get(model_alias) != active_proxy
        ):
            del ModelFactory._instances[model_alias]
            del ModelFactory._proxy_at_creation[model_alias]

        if model_alias not in ModelFactory._instances:
            config = CONFIG.get(model_alias, {})
            if not config:
                raise ValueError(
                    f"Configuração para o modelo {model_alias} não encontrada."
                )

            provider = config.get("provider")
            model_name = config.get("model_name")

            if model_alias == "speech_to_text":
                # STT models take their options (timeout, proxy) via a plain
                # config dict rather than keyword arguments.
                stt_config = {}
                timeout = config.get("timeout")
                if timeout:
                    stt_config["timeout"] = timeout
                if active_proxy:
                    stt_config["proxy"] = active_proxy
                instance = AIFactory.create_speech_to_text(
                    provider, model_name, stt_config
                )
            else:
                model_config = config.get("config", {}).copy()
                if active_proxy:
                    model_config["proxy"] = active_proxy
                instance = AIFactory.create_language(
                    provider, model_name, config=model_config
                )

            ModelFactory._instances[model_alias] = instance
            # Remember which proxy this instance was built with.
            ModelFactory._proxy_at_creation[model_alias] = active_proxy

        return ModelFactory._instances[model_alias]

    @staticmethod
    def clear_cache():
        """Drop every cached model instance (e.g. after a proxy change)."""
        ModelFactory._instances.clear()
        ModelFactory._proxy_at_creation.clear()
@@ -0,0 +1,31 @@
1
+ speech_to_text:
2
+ provider: openai
3
+ model_name: whisper-1
4
+ timeout: 3600 # 1 hour - for processing very long audio files
5
+
6
+ default_model:
7
+ provider: openai
8
+ model_name: gpt-4o-mini
9
+ config:
10
+ temperature: 0.5
11
+ top_p: 1
12
+ max_tokens: 2000
13
+ timeout: 300 # 5 minutes - for general language model operations
14
+
15
+ cleanup_model:
16
+ provider: openai
17
+ model_name: gpt-4o-mini
18
+ config:
19
+ temperature: 0
20
+ max_tokens: 8000
21
+ output_format: json
22
+ timeout: 600 # 10 minutes - for complex content cleaning with large inputs
23
+
24
+ summary_model:
25
+ provider: openai
26
+ model_name: gpt-4o-mini
27
+ config:
28
+ temperature: 0
29
+ top_p: 1
30
+ max_tokens: 2000
31
+ timeout: 300 # 5 minutes - for content summarization