content-core 1.0.4__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

content_core/config.py CHANGED
@@ -1,6 +1,5 @@
1
1
  import os
2
2
  import pkgutil
3
- import os # needed for load_config env/path checks
4
3
  import yaml
5
4
  from dotenv import load_dotenv
6
5
 
@@ -0,0 +1,5 @@
1
+ """Content Core MCP Server module."""
2
+
3
+ from .server import mcp, main
4
+
5
+ __all__ = ["mcp", "main"]
@@ -0,0 +1,214 @@
1
+ """Content Core MCP Server implementation."""
2
+
3
+ import os
4
+ import sys
5
+ from contextlib import contextmanager
6
+ from datetime import datetime
7
+ from io import StringIO
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional
10
+
11
+ from fastmcp import FastMCP
12
+ from loguru import logger
13
+
14
# Ask the media stack to log errors only by setting both variables to "error".
# They are assigned here, before content_core is imported further down in this
# module.
# NOTE(review): confirm that the installed imageio / ffmpeg tooling actually
# honours these variable names.
os.environ["IMAGEIO_LOG_LEVEL"] = "error"
os.environ["FFMPEG_LOG_LEVEL"] = "error"

# The server is run over the STDIO transport, so log lines on stdout would
# interfere with MCP. Drop the handlers loguru currently has and register a
# single stderr sink at INFO level instead.
logger.remove()  # Remove default handler
logger.add(sys.stderr, level="INFO")  # Add stderr handler only
21
+
22
+
23
@contextmanager
def suppress_stdout():
    """Temporarily discard everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` is replaced by an in-memory
    buffer that is thrown away afterwards. The previous stream is restored on
    exit, including when the body raises.

    Only Python-level writes through ``sys.stdout`` are affected; output that
    a subprocess or C extension writes directly to file descriptor 1 is not
    captured.

    Yields:
        None.
    """
    # Imported locally so this function does not depend on an extra
    # module-level import.
    from contextlib import redirect_stdout

    # redirect_stdout is the standard-library form of the manual
    # save / replace / try-finally-restore sequence.
    with redirect_stdout(StringIO()):
        yield
32
+
33
+
34
# Put the directory three levels above this file at the front of sys.path so
# that `import content_core` below can be resolved from there.
# NOTE(review): for an installed wheel that directory is presumably already on
# sys.path, which would make this a no-op -- confirm before removing.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

import content_core as cc

# Server instance on which the tool defined below is registered and which
# main() runs.
mcp = FastMCP("Content Core MCP Server")
41
+
42
+
43
def _error_response(
    error: str,
    source_type: Optional[str] = None,
    source: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build the failure payload shared by every error path."""
    return {
        "success": False,
        "error": error,
        "source_type": source_type,
        "source": source,
        "content": None,
        "metadata": metadata,
    }


async def _extract_content_impl(
    url: Optional[str] = None, file_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extract content from a URL or file using Content Core's auto engine. This is useful for processing YouTube transcripts, website content, PDFs, ePUB, Office files, etc. You can also use it to extract transcripts from audio or video files.

    Exactly one of ``url`` and ``file_path`` must be given. ``None`` and the
    empty string both count as "not given".

    Args:
        url: Optional URL to extract content from
        file_path: Optional file path to extract content from

    Returns:
        Dict with the keys "success", "error", "source_type", "source",
        "content" and "metadata". This function does not raise for invalid
        arguments, missing files or extraction failures; those are reported
        with "success" set to False and a message in "error".
    """
    # Imported locally so this function does not depend on an extra
    # module-level import.
    from datetime import timezone

    # Validate input - exactly one must be provided. Truthiness (rather than
    # `is None`) is used so that an empty string cannot slip through and later
    # be handed to Path() as if it were a real path.
    has_url = bool(url)
    has_file = bool(file_path)
    if has_url == has_file:
        return _error_response("Exactly one of 'url' or 'file_path' must be provided")

    source_type = "url" if has_url else "file"
    source = url if has_url else file_path

    if has_url:
        extraction_request = {"url": url}
    else:
        path = Path(file_path)
        if not path.exists():
            return _error_response(f"File not found: {file_path}", source_type, source)

        # NOTE: resolve() only normalises the path (absolute, symlinks and
        # ".." collapsed). No allow-list or sandbox is enforced here, so any
        # file readable by the server process can be requested.
        try:
            resolved_path = path.resolve()
        except Exception as e:
            return _error_response(f"Invalid file path: {str(e)}", source_type, source)
        extraction_request = {"file_path": str(resolved_path)}

    # Track start time. The timezone-aware call replaces the deprecated
    # datetime.utcnow(); tzinfo is stripped before formatting so the emitted
    # timestamp keeps its "<naive ISO 8601>Z" shape.
    started_at = datetime.now(timezone.utc)
    timestamp = started_at.replace(tzinfo=None).isoformat() + "Z"

    try:
        logger.info(f"Extracting content from {source_type}: {source}")

        # Suppress stdout to prevent MoviePy and other libraries from interfering with MCP protocol
        with suppress_stdout():
            result = await cc.extract_content(extraction_request)

        extraction_time = (datetime.now(timezone.utc) - started_at).total_seconds()

        # Build response - result is a ProcessSourceOutput object
        content = result.content or ""
        metadata = {
            "extraction_time_seconds": extraction_time,
            "extraction_timestamp": timestamp,
            "content_length": len(content),
            "identified_type": result.identified_type or "unknown",
            "identified_provider": result.identified_provider or "",
        }

        # Metadata reported by the extractor is merged on top of the computed
        # fields; the source-specific fields below are written last.
        if result.metadata:
            metadata.update(result.metadata)

        if result.title:
            metadata["title"] = result.title
        if source_type == "url":
            if result.url:
                metadata["final_url"] = result.url
        elif result.file_path:
            metadata["file_path"] = result.file_path
            metadata["file_size"] = path.stat().st_size
            metadata["file_extension"] = path.suffix

        logger.info(f"Successfully extracted content from {source_type}: {source}")
        return {
            "success": True,
            "error": None,
            "source_type": source_type,
            "source": source,
            "content": content,
            "metadata": metadata,
        }

    except Exception as e:
        # Deliberately broad: this is the tool boundary, and every failure is
        # returned to the caller as a structured error instead of propagating.
        logger.error(f"Error extracting content from {source_type} {source}: {str(e)}")
        return _error_response(
            str(e),
            source_type,
            source,
            {
                "extraction_timestamp": timestamp,
                "error_type": type(e).__name__,
            },
        )
175
+
176
+
177
@mcp.tool
async def extract_content(
    url: Optional[str] = None, file_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extract content from a URL or file using Content Core's auto engine.

    Provide exactly one of the two arguments.

    Args:
        url: Optional URL to extract content from
        file_path: Optional file path to extract content from

    Returns:
        JSON object with the keys "success", "error", "source_type", "source",
        "content" and "metadata". Problems (both or neither argument given,
        file not found, extraction failure) are reported with "success" set to
        false and a message in "error"; they are not raised as exceptions.
    """
    return await _extract_content_impl(url=url, file_path=file_path)
195
+
196
+
197
def main():
    """Entry point for the MCP server.

    Attempts an optional MoviePy/ffmpeg pre-check, then starts the server. The
    pre-check is best effort: any failure is logged at debug level and
    otherwise ignored, so the server still starts when MoviePy is missing.
    """
    try:
        # Anything printed during this step is discarded so it cannot end up
        # on stdout, which the STDIO transport started below relies on.
        with suppress_stdout():
            import moviepy.config as mp_config

            # NOTE(review): confirm moviepy.config really exposes
            # check_and_download_cmd; if it does not, the AttributeError is
            # caught below and this pre-check is effectively a no-op.
            mp_config.check_and_download_cmd("ffmpeg")  # Pre-download to avoid logs later
    except Exception as exc:
        # Ignore if MoviePy isn't available or configured, but leave a trace
        # for anyone debugging with a lower log level.
        logger.debug(f"Skipping MoviePy ffmpeg pre-check: {type(exc).__name__}: {exc}")

    logger.info("Starting Content Core MCP Server")

    # Run with STDIO transport for MCP compatibility
    mcp.run()


if __name__ == "__main__":
    main()
content_core/models.py CHANGED
@@ -1,5 +1,4 @@
1
1
  from esperanto import AIFactory
2
- from esperanto.providers.stt import SpeechToTextModel
3
2
  from .config import CONFIG
4
3
 
5
4
  class ModelFactory:
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.0.4
4
- Summary: Extract what matters from any media source
3
+ Version: 1.1.2
4
+ Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
7
7
  Requires-Python: >=3.10
@@ -31,6 +31,8 @@ Requires-Dist: pytubefix>=9.1.1
31
31
  Requires-Dist: readability-lxml>=0.8.4.1
32
32
  Requires-Dist: validators>=0.34.0
33
33
  Requires-Dist: youtube-transcript-api>=1.0.3
34
+ Provides-Extra: mcp
35
+ Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
34
36
  Description-Content-Type: text/markdown
35
37
 
36
38
  # Content Core
@@ -57,6 +59,8 @@ The primary goal of Content Core is to simplify the process of ingesting content
57
59
  * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
58
60
  * You can override this by specifying an engine, but `'auto'` is recommended for most users.
59
61
  * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
62
+ * **MCP Server:** Includes a Model Context Protocol (MCP) server for seamless integration with Claude Desktop and other MCP-compatible applications.
63
+ * **macOS Services:** Right-click context menu integration for Finder (extract and summarize files directly).
60
64
  * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
61
65
 
62
66
  ## Getting Started
@@ -66,8 +70,11 @@ The primary goal of Content Core is to simplify the process of ingesting content
66
70
  Install Content Core using `pip`:
67
71
 
68
72
  ```bash
69
- # Install the package (without Docling)
73
+ # Install the package
70
74
  pip install content-core
75
+
76
+ # Install with MCP server support
77
+ pip install content-core[mcp]
71
78
  ```
72
79
 
73
80
  Alternatively, if you’re developing locally:
@@ -86,6 +93,18 @@ uv sync
86
93
  Content Core provides three CLI commands for extracting, cleaning, and summarizing content:
87
94
  ccore, cclean, and csum. These commands support input from text, URLs, files, or piped data (e.g., via cat file | command).
88
95
 
96
+ **Zero-install usage with uvx:**
97
+ ```bash
98
+ # Extract content
99
+ uvx --from "content-core" ccore https://example.com
100
+
101
+ # Clean content
102
+ uvx --from "content-core" cclean "messy content"
103
+
104
+ # Summarize content
105
+ uvx --from "content-core" csum "long text" --context "bullet points"
106
+ ```
107
+
89
108
  #### ccore - Extract Content
90
109
 
91
110
  Extracts content from text, URLs, or files, with optional formatting.
@@ -194,6 +213,81 @@ summary = await cc.summarize_content("long article text", context="explain to a
194
213
 
195
214
  For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
196
215
 
216
+ ## MCP Server Integration
217
+
218
+ Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
219
+
220
+ ### Quick Setup with Claude Desktop
221
+
222
+ ```bash
223
+ # Install with MCP support
224
+ pip install content-core[mcp]
225
+
226
+ # Or use directly with uvx (no installation required)
227
+ uvx --from "content-core[mcp]" content-core-mcp
228
+ ```
229
+
230
+ Add to your `claude_desktop_config.json`:
231
+ ```json
232
+ {
233
+ "mcpServers": {
234
+ "content-core": {
235
+ "command": "uvx",
236
+ "args": [
237
+ "--from",
238
+ "content-core[mcp]",
239
+ "content-core-mcp"
240
+ ]
241
+ }
242
+ }
243
+ }
244
+ ```
245
+
246
+ For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
247
+
248
+ ## macOS Services Integration
249
+
250
+ Content Core provides powerful right-click integration with macOS Finder, allowing you to extract and summarize content from any file without installation. Choose between clipboard or TextEdit output for maximum flexibility.
251
+
252
+ ### Available Services
253
+
254
+ Create **4 convenient services** for different workflows:
255
+
256
+ - **Extract Content → Clipboard** - Quick copy for immediate pasting
257
+ - **Extract Content → TextEdit** - Review before using
258
+ - **Summarize Content → Clipboard** - Quick summary copying
259
+ - **Summarize Content → TextEdit** - Formatted summary with headers
260
+
261
+ ### Quick Setup
262
+
263
+ 1. **Install uv** (if not already installed):
264
+ ```bash
265
+ curl -LsSf https://astral.sh/uv/install.sh | sh
266
+ ```
267
+
268
+ 2. **Create services manually** using Automator (about 5 minutes of setup)
269
+
270
+ ### Usage
271
+
272
+ **Right-click any supported file** in Finder → **Services** → Choose your option:
273
+
274
+ - **PDFs, Word docs** - Instant text extraction
275
+ - **Videos, audio files** - Automatic transcription
276
+ - **Images** - OCR text recognition
277
+ - **Web content** - Clean text extraction
278
+ - **Multiple files** - Batch processing support
279
+
280
+ ### Features
281
+
282
+ - **Zero-install processing**: Uses `uvx` for isolated execution
283
+ - **Multiple output options**: Clipboard or TextEdit display
284
+ - **System notifications**: Visual feedback on completion
285
+ - **Wide format support**: 20+ file types supported
286
+ - **Batch processing**: Handle multiple files at once
287
+ - **Keyboard shortcuts**: Assignable hotkeys for power users
288
+
289
+ For complete setup instructions with copy-paste scripts, see [macOS Services Documentation](docs/macos.md).
290
+
197
291
  ## Using with Langchain
198
292
 
199
293
  For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
@@ -1,8 +1,8 @@
1
1
  content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
2
2
  content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
3
- content_core/config.py,sha256=vbRgJy8lOTZABeY7GZc7MglNYwBQYpUNzu76kprv_c0,1854
3
+ content_core/config.py,sha256=vyx0fioR6r0mcZfVdwAFDhFrRNoG0ZNG8RNxIDnhNlo,1802
4
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
5
- content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
5
+ content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
6
6
  content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
7
7
  content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
8
8
  content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
@@ -19,6 +19,8 @@ content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8
19
19
  content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
20
20
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
21
21
  content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
22
+ content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
23
+ content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
22
24
  content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
23
25
  content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
24
26
  content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
@@ -32,8 +34,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
32
34
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
33
35
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
34
36
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
35
- content_core-1.0.4.dist-info/METADATA,sha256=SdXexgOV0tc4ArCYWjxrZog4esHJxW0zh8pdnZFqLi8,11908
36
- content_core-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
37
- content_core-1.0.4.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
38
- content_core-1.0.4.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
39
- content_core-1.0.4.dist-info/RECORD,,
37
+ content_core-1.1.2.dist-info/METADATA,sha256=_0Rg4yeU-05hDB_91dvcMXYKMaKcMcU5C8SpkYhtiRs,15072
38
+ content_core-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
39
+ content_core-1.1.2.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
40
+ content_core-1.1.2.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
41
+ content_core-1.1.2.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  [console_scripts]
2
2
  cclean = content_core:cclean
3
3
  ccore = content_core:ccore
4
+ content-core-mcp = content_core.mcp.server:main
4
5
  csum = content_core:csum