content-core 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

content_core/config.py CHANGED
@@ -1,6 +1,5 @@
1
1
  import os
2
2
  import pkgutil
3
- import os # needed for load_config env/path checks
4
3
  import yaml
5
4
  from dotenv import load_dotenv
6
5
 
@@ -0,0 +1,5 @@
1
+ """Content Core MCP Server module."""
2
+
3
+ from .server import mcp, main
4
+
5
+ __all__ = ["mcp", "main"]
@@ -0,0 +1,211 @@
1
+ """Content Core MCP Server implementation."""
2
+
3
+ import os
4
+ import sys
5
+ from contextlib import contextmanager
6
+ from datetime import datetime
7
+ from io import StringIO
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional
10
+
11
+ from fastmcp import FastMCP
12
+ from loguru import logger
13
+
14
+ # Suppress MoviePy output for MCP compatibility
15
+ os.environ["IMAGEIO_LOG_LEVEL"] = "error"
16
+ os.environ["FFMPEG_LOG_LEVEL"] = "error"
17
+
18
+ # Configure loguru to not output to stdout (which would interfere with MCP)
19
+ logger.remove() # Remove default handler
20
+ logger.add(sys.stderr, level="INFO") # Add stderr handler only
21
+
22
+
23
# Bookkeeping shared by every active suppress_stdout() call. The real stream is
# captured when the first suppression starts and put back when the last one
# ends, regardless of the order in which overlapping suppressions finish.
_stdout_suppression = {"depth": 0, "saved": None}


@contextmanager
def suppress_stdout():
    """Discard everything written to ``sys.stdout`` while the block is active.

    The swap is depth-counted: only the outermost entry replaces
    ``sys.stdout`` and only the last exit restores it. A naive per-call
    save/restore breaks when two suppressions overlap without being strictly
    nested (for example two coroutines that both hold the context manager
    across an ``await``): the one entered second would "restore" the
    throw-away buffer of the first and leave stdout silenced.

    The counter is plain module state and is not protected by a lock, so this
    is intended for use from a single thread.
    """
    state = _stdout_suppression
    if state["depth"] == 0:
        # Outermost entry: remember the real stream and install a sink.
        state["saved"] = sys.stdout
        sys.stdout = StringIO()
    state["depth"] += 1
    try:
        yield
    finally:
        state["depth"] -= 1
        if state["depth"] == 0:
            # Last one out puts the original stream back.
            sys.stdout = state["saved"]
            state["saved"] = None
32
+
33
+ # Add parent directory to path to import content_core
34
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
35
+
36
+ import content_core as cc
37
+
38
+ # Initialize MCP server
39
+ mcp = FastMCP("Content Core MCP Server")
40
+
41
def _extraction_failure(error, source_type=None, source=None, metadata=None):
    """Build the failure payload shared by every error path of the extractor."""
    return {
        "success": False,
        "error": error,
        "source_type": source_type,
        "source": source,
        "content": None,
        "metadata": metadata,
    }


async def _extract_content_impl(
    url: Optional[str] = None,
    file_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extract content from a URL or file using Content Core's auto engine.

    Args:
        url: Optional URL to extract content from
        file_path: Optional file path to extract content from

    Returns:
        A dict with the keys ``success``, ``error``, ``source_type``,
        ``source``, ``content`` and ``metadata``. Invalid arguments, a missing
        file and extraction errors are all reported through this dict
        (``success`` is False and ``error`` holds the message); they are not
        raised to the caller.

    An empty string is treated the same as an omitted argument, so exactly one
    of ``url`` / ``file_path`` must be a non-empty string.
    """
    # Function-scope imports: the module only imports `datetime` itself.
    import time
    from datetime import timezone

    # Validate on truthiness, which is also what selects the source below.
    # Checking `is None` here but truthiness later let `url=""` slip through
    # and crash on Path(None).
    has_url = bool(url)
    has_file = bool(file_path)
    if has_url == has_file:
        return _extraction_failure(
            "Exactly one of 'url' or 'file_path' must be provided"
        )

    source_type = "url" if has_url else "file"
    source = url if has_url else file_path

    extraction_request: Dict[str, Any] = {}
    if has_url:
        extraction_request["url"] = url
    else:
        path = Path(file_path)
        if not path.exists():
            return _extraction_failure(
                f"File not found: {file_path}", source_type, source
            )
        # NOTE(review): resolving only normalises the path. It does not confine
        # access to any directory; add an allow-list here if that is required.
        try:
            resolved_path = path.resolve()
        except Exception as e:
            return _extraction_failure(
                f"Invalid file path: {str(e)}", source_type, source
            )
        extraction_request["file_path"] = str(resolved_path)

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo only for formatting so the "<iso>Z" timestamp shape is unchanged.
    started_at = datetime.now(timezone.utc)
    timestamp = started_at.replace(tzinfo=None).isoformat() + "Z"
    # A monotonic clock keeps the duration immune to wall-clock adjustments.
    timer_start = time.perf_counter()

    try:
        logger.info(f"Extracting content from {source_type}: {source}")

        # Keep third-party prints (e.g. MoviePy) off stdout, which carries the
        # MCP protocol stream.
        with suppress_stdout():
            result = await cc.extract_content(extraction_request)

        extraction_time = time.perf_counter() - timer_start
        content = result.content or ""

        metadata = {
            "extraction_time_seconds": extraction_time,
            "extraction_timestamp": timestamp,
            "content_length": len(content),
            "identified_type": result.identified_type or "unknown",
            "identified_provider": result.identified_provider or "",
        }

        # Extractor-provided metadata is merged on top of the computed fields.
        if result.metadata:
            metadata.update(result.metadata)

        if result.title:
            metadata["title"] = result.title
        if source_type == "url":
            if result.url:
                metadata["final_url"] = result.url
        elif result.file_path:
            metadata["file_path"] = result.file_path
            metadata["file_size"] = Path(file_path).stat().st_size
            metadata["file_extension"] = Path(file_path).suffix

        logger.info(f"Successfully extracted content from {source_type}: {source}")
        return {
            "success": True,
            "error": None,
            "source_type": source_type,
            "source": source,
            "content": content,
            "metadata": metadata,
        }

    except Exception as e:
        # Boundary handler: every extraction failure becomes an error payload.
        logger.error(f"Error extracting content from {source_type} {source}: {str(e)}")
        return _extraction_failure(
            str(e),
            source_type,
            source,
            {
                "extraction_timestamp": timestamp,
                "error_type": type(e).__name__,
            },
        )
172
+
173
+
174
@mcp.tool
async def extract_content(
    url: Optional[str] = None,
    file_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extract content from a URL or file using Content Core's auto engine.

    Exactly one of url or file_path must be provided.

    Args:
        url: Optional URL to extract content from
        file_path: Optional file path to extract content from

    Returns:
        JSON object containing extracted content and metadata. Invalid
        arguments and extraction failures are reported in the result
        ("success" is false and "error" holds the message) rather than raised.
    """
    # Thin registration wrapper: the logic lives in _extract_content_impl so it
    # can be called without going through the MCP tool decorator.
    return await _extract_content_impl(url=url, file_path=file_path)
193
+
194
+
195
def main():
    """Entry point for the MCP server.

    Performs a best-effort MoviePy/ffmpeg pre-check, then starts the server
    with ``mcp.run()``. A failing pre-check never prevents startup.
    """
    # Additional MoviePy configuration to suppress all output
    try:
        import moviepy.config as mp_config
        mp_config.check_and_download_cmd("ffmpeg")  # Pre-download to avoid logs later
    except Exception as exc:
        # Still ignored (MoviePy may be missing or not configured), but leave a
        # debug-level trace instead of swallowing the reason silently.
        logger.debug(f"Skipping MoviePy ffmpeg pre-check: {exc}")

    logger.info("Starting Content Core MCP Server")

    # Run with STDIO transport for MCP compatibility
    mcp.run()
208
+
209
+
210
+ if __name__ == "__main__":
211
+ main()
content_core/models.py CHANGED
@@ -1,5 +1,4 @@
1
1
  from esperanto import AIFactory
2
- from esperanto.providers.stt import SpeechToTextModel
3
2
  from .config import CONFIG
4
3
 
5
4
  class ModelFactory:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.0.4
3
+ Version: 1.1.0
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -31,6 +31,8 @@ Requires-Dist: pytubefix>=9.1.1
31
31
  Requires-Dist: readability-lxml>=0.8.4.1
32
32
  Requires-Dist: validators>=0.34.0
33
33
  Requires-Dist: youtube-transcript-api>=1.0.3
34
+ Provides-Extra: mcp
35
+ Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
34
36
  Description-Content-Type: text/markdown
35
37
 
36
38
  # Content Core
@@ -57,6 +59,7 @@ The primary goal of Content Core is to simplify the process of ingesting content
57
59
  * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
58
60
  * You can override this by specifying an engine, but `'auto'` is recommended for most users.
59
61
  * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
62
+ * **MCP Server:** Includes a Model Context Protocol (MCP) server for seamless integration with Claude Desktop and other MCP-compatible applications.
60
63
  * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
61
64
 
62
65
  ## Getting Started
@@ -66,8 +69,11 @@ The primary goal of Content Core is to simplify the process of ingesting content
66
69
  Install Content Core using `pip`:
67
70
 
68
71
  ```bash
69
- # Install the package (without Docling)
72
+ # Install the package
70
73
  pip install content-core
74
+
75
+ # Install with MCP server support
76
+ pip install content-core[mcp]
71
77
  ```
72
78
 
73
79
  Alternatively, if you’re developing locally:
@@ -194,6 +200,38 @@ summary = await cc.summarize_content("long article text", context="explain to a
194
200
 
195
201
  For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
196
202
 
203
+ ## MCP Server Integration
204
+
205
+ Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
206
+
207
+ ### Quick Setup with Claude Desktop
208
+
209
+ ```bash
210
+ # Install with MCP support
211
+ pip install content-core[mcp]
212
+
213
+ # Or use directly with uvx (no installation required)
214
+ uvx --from "content-core[mcp]" content-core-mcp
215
+ ```
216
+
217
+ Add to your `claude_desktop_config.json`:
218
+ ```json
219
+ {
220
+ "mcpServers": {
221
+ "content-core": {
222
+ "command": "uvx",
223
+ "args": [
224
+ "--from",
225
+ "content-core[mcp]",
226
+ "content-core-mcp"
227
+ ]
228
+ }
229
+ }
230
+ }
231
+ ```
232
+
233
+ For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
234
+
197
235
  ## Using with Langchain
198
236
 
199
237
  For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
@@ -1,8 +1,8 @@
1
1
  content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
2
2
  content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
3
- content_core/config.py,sha256=vbRgJy8lOTZABeY7GZc7MglNYwBQYpUNzu76kprv_c0,1854
3
+ content_core/config.py,sha256=vyx0fioR6r0mcZfVdwAFDhFrRNoG0ZNG8RNxIDnhNlo,1802
4
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
5
- content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
5
+ content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
6
6
  content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
7
7
  content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
8
8
  content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
@@ -19,6 +19,8 @@ content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8
19
19
  content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
20
20
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
21
21
  content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
22
+ content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
23
+ content_core/mcp/server.py,sha256=m2A63Qle3nJ_Lw46uWkwVvYERtEw84hd7NHAn1rwdAQ,6968
22
24
  content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
23
25
  content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
24
26
  content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
@@ -32,8 +34,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
32
34
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
33
35
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
34
36
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
35
- content_core-1.0.4.dist-info/METADATA,sha256=SdXexgOV0tc4ArCYWjxrZog4esHJxW0zh8pdnZFqLi8,11908
36
- content_core-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
37
- content_core-1.0.4.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
38
- content_core-1.0.4.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
39
- content_core-1.0.4.dist-info/RECORD,,
37
+ content_core-1.1.0.dist-info/METADATA,sha256=9-ppXQ7o-s8BCb2lH5xBiaiYBHmOFmXFrWntHuo9G_o,13017
38
+ content_core-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
39
+ content_core-1.1.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
40
+ content_core-1.1.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
41
+ content_core-1.1.0.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  [console_scripts]
2
2
  cclean = content_core:cclean
3
3
  ccore = content_core:ccore
4
+ content-core-mcp = content_core.mcp.server:main
4
5
  csum = content_core:csum