content-core 1.0.3__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-1.0.3 → content_core-1.1.0}/Makefile +4 -1
- {content_core-1.0.3 → content_core-1.1.0}/PKG-INFO +40 -2
- {content_core-1.0.3 → content_core-1.1.0}/README.md +37 -1
- content_core-1.1.0/docs/mcp.md +345 -0
- {content_core-1.0.3 → content_core-1.1.0}/pyproject.toml +7 -1
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/config.py +0 -1
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/content/summary/core.py +1 -1
- content_core-1.1.0/src/content_core/mcp/__init__.py +5 -0
- content_core-1.1.0/src/content_core/mcp/server.py +211 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/models.py +0 -1
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/processors/youtube.py +17 -10
- content_core-1.1.0/tests/unit/test_mcp_server.py +124 -0
- {content_core-1.0.3 → content_core-1.1.0}/uv.lock +174 -2
- {content_core-1.0.3 → content_core-1.1.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/.github/workflows/publish.yml +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/.gitignore +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/.python-version +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/CONTRIBUTING.md +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/LICENSE +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/docs/processors.md +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/docs/usage.md +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/prompts/content/summarize.jinja +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/common/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/common/state.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/common/types.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/common/utils.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/content/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/content/extraction/graph.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/logging.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/models_config.yaml +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/processors/audio.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/processors/docling.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/processors/office.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/processors/text.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/processors/url.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/processors/video.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/py.typed +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/templated_message.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/tools/extract.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.docx +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.epub +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.md +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.mp3 +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.mp4 +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.pdf +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.pptx +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.txt +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file.xlsx +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/integration/test_cli.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/integration/test_extraction.py +0 -0
- {content_core-1.0.3 → content_core-1.1.0}/tests/unit/test_docling.py +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
.PHONY: tag test build-docs ruff
|
|
1
|
+
.PHONY: tag test build-docs ruff mcp-server
|
|
2
2
|
|
|
3
3
|
tag:
|
|
4
4
|
@version=$$(grep '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/'); \
|
|
@@ -14,3 +14,6 @@ build-docs:
|
|
|
14
14
|
|
|
15
15
|
ruff:
|
|
16
16
|
ruff check . --fix
|
|
17
|
+
|
|
18
|
+
mcp-server:
|
|
19
|
+
uv run content-core-mcp
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.0.3
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -31,6 +31,8 @@ Requires-Dist: pytubefix>=9.1.1
|
|
|
31
31
|
Requires-Dist: readability-lxml>=0.8.4.1
|
|
32
32
|
Requires-Dist: validators>=0.34.0
|
|
33
33
|
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
34
|
+
Provides-Extra: mcp
|
|
35
|
+
Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
|
|
34
36
|
Description-Content-Type: text/markdown
|
|
35
37
|
|
|
36
38
|
# Content Core
|
|
@@ -57,6 +59,7 @@ The primary goal of Content Core is to simplify the process of ingesting content
|
|
|
57
59
|
* For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
|
|
58
60
|
* You can override this by specifying an engine, but `'auto'` is recommended for most users.
|
|
59
61
|
* **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
|
|
62
|
+
* **MCP Server:** Includes a Model Context Protocol (MCP) server for seamless integration with Claude Desktop and other MCP-compatible applications.
|
|
60
63
|
* **Asynchronous:** Built with `asyncio` for efficient I/O operations.
|
|
61
64
|
|
|
62
65
|
## Getting Started
|
|
@@ -66,8 +69,11 @@ The primary goal of Content Core is to simplify the process of ingesting content
|
|
|
66
69
|
Install Content Core using `pip`:
|
|
67
70
|
|
|
68
71
|
```bash
|
|
69
|
-
# Install the package
|
|
72
|
+
# Install the package
|
|
70
73
|
pip install content-core
|
|
74
|
+
|
|
75
|
+
# Install with MCP server support
|
|
76
|
+
pip install content-core[mcp]
|
|
71
77
|
```
|
|
72
78
|
|
|
73
79
|
Alternatively, if you’re developing locally:
|
|
@@ -194,6 +200,38 @@ summary = await cc.summarize_content("long article text", context="explain to a
|
|
|
194
200
|
|
|
195
201
|
For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
|
|
196
202
|
|
|
203
|
+
## MCP Server Integration
|
|
204
|
+
|
|
205
|
+
Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
|
|
206
|
+
|
|
207
|
+
### Quick Setup with Claude Desktop
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
# Install with MCP support
|
|
211
|
+
pip install content-core[mcp]
|
|
212
|
+
|
|
213
|
+
# Or use directly with uvx (no installation required)
|
|
214
|
+
uvx --from "content-core[mcp]" content-core-mcp
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Add to your `claude_desktop_config.json`:
|
|
218
|
+
```json
|
|
219
|
+
{
|
|
220
|
+
"mcpServers": {
|
|
221
|
+
"content-core": {
|
|
222
|
+
"command": "uvx",
|
|
223
|
+
"args": [
|
|
224
|
+
"--from",
|
|
225
|
+
"content-core[mcp]",
|
|
226
|
+
"content-core-mcp"
|
|
227
|
+
]
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
|
|
234
|
+
|
|
197
235
|
## Using with Langchain
|
|
198
236
|
|
|
199
237
|
For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
|
|
@@ -22,6 +22,7 @@ The primary goal of Content Core is to simplify the process of ingesting content
|
|
|
22
22
|
* For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
|
|
23
23
|
* You can override this by specifying an engine, but `'auto'` is recommended for most users.
|
|
24
24
|
* **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
|
|
25
|
+
* **MCP Server:** Includes a Model Context Protocol (MCP) server for seamless integration with Claude Desktop and other MCP-compatible applications.
|
|
25
26
|
* **Asynchronous:** Built with `asyncio` for efficient I/O operations.
|
|
26
27
|
|
|
27
28
|
## Getting Started
|
|
@@ -31,8 +32,11 @@ The primary goal of Content Core is to simplify the process of ingesting content
|
|
|
31
32
|
Install Content Core using `pip`:
|
|
32
33
|
|
|
33
34
|
```bash
|
|
34
|
-
# Install the package
|
|
35
|
+
# Install the package
|
|
35
36
|
pip install content-core
|
|
37
|
+
|
|
38
|
+
# Install with MCP server support
|
|
39
|
+
pip install content-core[mcp]
|
|
36
40
|
```
|
|
37
41
|
|
|
38
42
|
Alternatively, if you’re developing locally:
|
|
@@ -159,6 +163,38 @@ summary = await cc.summarize_content("long article text", context="explain to a
|
|
|
159
163
|
|
|
160
164
|
For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
|
|
161
165
|
|
|
166
|
+
## MCP Server Integration
|
|
167
|
+
|
|
168
|
+
Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
|
|
169
|
+
|
|
170
|
+
### Quick Setup with Claude Desktop
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
# Install with MCP support
|
|
174
|
+
pip install content-core[mcp]
|
|
175
|
+
|
|
176
|
+
# Or use directly with uvx (no installation required)
|
|
177
|
+
uvx --from "content-core[mcp]" content-core-mcp
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Add to your `claude_desktop_config.json`:
|
|
181
|
+
```json
|
|
182
|
+
{
|
|
183
|
+
"mcpServers": {
|
|
184
|
+
"content-core": {
|
|
185
|
+
"command": "uvx",
|
|
186
|
+
"args": [
|
|
187
|
+
"--from",
|
|
188
|
+
"content-core[mcp]",
|
|
189
|
+
"content-core-mcp"
|
|
190
|
+
]
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
|
|
197
|
+
|
|
162
198
|
## Using with Langchain
|
|
163
199
|
|
|
164
200
|
For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
# MCP Server Documentation
|
|
2
|
+
|
|
3
|
+
Content Core includes a Model Context Protocol (MCP) server that provides powerful content extraction capabilities to Claude Desktop and other MCP-compatible applications. The server exposes a single, easy-to-use tool that can extract content from URLs and files using Content Core's advanced extraction engines.
|
|
4
|
+
|
|
5
|
+
## What is MCP?
|
|
6
|
+
|
|
7
|
+
The [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) is an open standard that enables AI applications to securely connect to external data sources and tools. Content Core's MCP server allows Claude Desktop to directly extract content from various sources, making it easy to process web pages, documents, videos, and other media within your conversations.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Single tool interface**: `extract_content` function accepts either URLs or file paths
|
|
12
|
+
- **Auto engine selection**: Uses Content Core's intelligent 'auto' engine for optimal extraction
|
|
13
|
+
- **Rich metadata**: Returns detailed information about extraction process and content
|
|
14
|
+
- **Structured JSON responses**: Consistent format with success/error handling
|
|
15
|
+
- **Wide format support**: Handles web pages, PDFs, Word docs, videos, audio files, and more
|
|
16
|
+
- **Zero-install option**: Run with `uvx` without local installation
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
### Option 1: Install with pip (Recommended for local development)
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Install Content Core with MCP support
|
|
24
|
+
pip install content-core[mcp]
|
|
25
|
+
|
|
26
|
+
# The content-core-mcp command becomes available
|
|
27
|
+
content-core-mcp
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Option 2: Use with uvx (Recommended for production)
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# Run directly without installation
|
|
34
|
+
uvx --from "content-core[mcp]" content-core-mcp
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Claude Desktop Setup
|
|
38
|
+
|
|
39
|
+
### Configuration
|
|
40
|
+
|
|
41
|
+
Add Content Core to your Claude Desktop configuration file:
|
|
42
|
+
|
|
43
|
+
**Location of config file:**
|
|
44
|
+
- **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
|
|
45
|
+
- **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
|
|
46
|
+
|
|
47
|
+
### Production Configuration (using uvx)
|
|
48
|
+
|
|
49
|
+
```json
|
|
50
|
+
{
|
|
51
|
+
"mcpServers": {
|
|
52
|
+
"content-core": {
|
|
53
|
+
"command": "uvx",
|
|
54
|
+
"args": [
|
|
55
|
+
"--from",
|
|
56
|
+
"content-core[mcp]",
|
|
57
|
+
"content-core-mcp"
|
|
58
|
+
]
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Local Development Configuration
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{
|
|
68
|
+
"mcpServers": {
|
|
69
|
+
"content-core": {
|
|
70
|
+
"command": "uv",
|
|
71
|
+
"args": [
|
|
72
|
+
"--directory",
|
|
73
|
+
"/path/to/your/content-core",
|
|
74
|
+
"run",
|
|
75
|
+
"content-core-mcp"
|
|
76
|
+
]
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### With Environment Variables
|
|
83
|
+
|
|
84
|
+
If you need to pass API keys or other configuration:
|
|
85
|
+
|
|
86
|
+
```json
|
|
87
|
+
{
|
|
88
|
+
"mcpServers": {
|
|
89
|
+
"content-core": {
|
|
90
|
+
"command": "uvx",
|
|
91
|
+
"args": [
|
|
92
|
+
"--from",
|
|
93
|
+
"content-core[mcp]",
|
|
94
|
+
"content-core-mcp"
|
|
95
|
+
],
|
|
96
|
+
"env": {
|
|
97
|
+
"OPENAI_API_KEY": "your-openai-key",
|
|
98
|
+
"FIRECRAWL_API_KEY": "your-firecrawl-key",
|
|
99
|
+
"JINA_API_KEY": "your-jina-key"
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Usage
|
|
107
|
+
|
|
108
|
+
After setting up the MCP server, you can use it directly in Claude Desktop conversations. The server provides one main tool:
|
|
109
|
+
|
|
110
|
+
### extract_content
|
|
111
|
+
|
|
112
|
+
Extract content from URLs or files using Content Core's auto engine.
|
|
113
|
+
|
|
114
|
+
**Parameters:**
|
|
115
|
+
- `url` (optional): URL to extract content from
|
|
116
|
+
- `file_path` (optional): Local file path to extract content from
|
|
117
|
+
|
|
118
|
+
**Note**: Exactly one parameter must be provided (either `url` OR `file_path`, not both).
|
|
119
|
+
|
|
120
|
+
## Examples
|
|
121
|
+
|
|
122
|
+
### Extracting from URLs
|
|
123
|
+
|
|
124
|
+
**Prompt in Claude Desktop:**
|
|
125
|
+
```
|
|
126
|
+
Please extract the content from https://example.com/article
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**What happens:**
|
|
130
|
+
Claude will use the MCP server to extract the article content, including the title, main text, and metadata.
|
|
131
|
+
|
|
132
|
+
### Extracting from Files
|
|
133
|
+
|
|
134
|
+
**Prompt in Claude Desktop:**
|
|
135
|
+
```
|
|
136
|
+
Extract the content from /path/to/document.pdf
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
**What happens:**
|
|
140
|
+
Claude will extract text content from the PDF, including any embedded text, tables, and structural information.
|
|
141
|
+
|
|
142
|
+
### Working with Videos
|
|
143
|
+
|
|
144
|
+
**Prompt in Claude Desktop:**
|
|
145
|
+
```
|
|
146
|
+
Please extract the transcript from /path/to/video.mp4
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**What happens:**
|
|
150
|
+
Content Core will extract audio from the video, transcribe it to text, and return the full transcript.
|
|
151
|
+
|
|
152
|
+
### Complex Workflows
|
|
153
|
+
|
|
154
|
+
**Prompt in Claude Desktop:**
|
|
155
|
+
```
|
|
156
|
+
Extract content from https://www.youtube.com/watch?v=example and summarize the key points in bullet format
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**What happens:**
|
|
160
|
+
1. Claude extracts the YouTube video transcript using the MCP server
|
|
161
|
+
2. Claude then processes and summarizes the content as requested
|
|
162
|
+
|
|
163
|
+
## Response Format
|
|
164
|
+
|
|
165
|
+
The MCP server returns structured JSON responses:
|
|
166
|
+
|
|
167
|
+
```json
|
|
168
|
+
{
|
|
169
|
+
"success": true,
|
|
170
|
+
"error": null,
|
|
171
|
+
"source_type": "url",
|
|
172
|
+
"source": "https://example.com/article",
|
|
173
|
+
"content": "Extracted article content...",
|
|
174
|
+
"metadata": {
|
|
175
|
+
"extraction_time_seconds": 2.34,
|
|
176
|
+
"extraction_timestamp": "2025-06-19T13:00:00Z",
|
|
177
|
+
"content_length": 1234,
|
|
178
|
+
"identified_type": "text/html",
|
|
179
|
+
"title": "Article Title",
|
|
180
|
+
"final_url": "https://example.com/article",
|
|
181
|
+
"...": "additional metadata specific to content type"
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Error Response
|
|
187
|
+
|
|
188
|
+
```json
|
|
189
|
+
{
|
|
190
|
+
"success": false,
|
|
191
|
+
"error": "File not found: /path/to/nonexistent.pdf",
|
|
192
|
+
"source_type": "file",
|
|
193
|
+
"source": "/path/to/nonexistent.pdf",
|
|
194
|
+
"content": null,
|
|
195
|
+
"metadata": {
|
|
196
|
+
"extraction_timestamp": "2025-06-19T13:00:00Z",
|
|
197
|
+
"error_type": "FileNotFoundError"
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## Supported Content Types
|
|
203
|
+
|
|
204
|
+
The MCP server supports all content types that Content Core can handle:
|
|
205
|
+
|
|
206
|
+
### Web Content
|
|
207
|
+
- HTML pages
|
|
208
|
+
- YouTube videos (transcript extraction)
|
|
209
|
+
- Social media posts
|
|
210
|
+
- Articles and blogs
|
|
211
|
+
- Documentation sites
|
|
212
|
+
|
|
213
|
+
### Document Formats
|
|
214
|
+
- PDF files
|
|
215
|
+
- Microsoft Word (.docx)
|
|
216
|
+
- PowerPoint (.pptx)
|
|
217
|
+
- Excel (.xlsx)
|
|
218
|
+
- Markdown files
|
|
219
|
+
- Plain text files
|
|
220
|
+
- CSV files
|
|
221
|
+
|
|
222
|
+
### Media Files
|
|
223
|
+
- Video files (MP4, AVI, MOV, etc.) - extracts transcript
|
|
224
|
+
- Audio files (MP3, WAV, M4A, etc.) - transcribes to text
|
|
225
|
+
- Images (JPG, PNG, etc.) - OCR text extraction
|
|
226
|
+
|
|
227
|
+
### Other Formats
|
|
228
|
+
- ZIP archives (extracts text from contained files)
|
|
229
|
+
- EPUB books
|
|
230
|
+
- AsciiDoc files
|
|
231
|
+
- HTML files
|
|
232
|
+
|
|
233
|
+
## Configuration
|
|
234
|
+
|
|
235
|
+
### Engine Selection
|
|
236
|
+
|
|
237
|
+
Content Core's MCP server uses the 'auto' engine by default, which automatically picks the best available extraction method for each source type, trying engines in the following order:
|
|
238
|
+
|
|
239
|
+
- **URLs**: Firecrawl (if API key available) → Jina (if API key available) → BeautifulSoup
|
|
240
|
+
- **Files**: Docling → Simple extraction
|
|
241
|
+
|
|
242
|
+
### API Keys
|
|
243
|
+
|
|
244
|
+
To get the best extraction results, configure these optional API keys:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
# For enhanced web extraction
|
|
248
|
+
export FIRECRAWL_API_KEY="your-firecrawl-key"
|
|
249
|
+
export JINA_API_KEY="your-jina-key"
|
|
250
|
+
|
|
251
|
+
# For AI-powered content cleaning and summarization
|
|
252
|
+
export OPENAI_API_KEY="your-openai-key"
|
|
253
|
+
export GOOGLE_API_KEY="your-google-key"
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Custom Prompts
|
|
257
|
+
|
|
258
|
+
You can customize Content Core's behavior by setting a custom prompt path:
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
export PROMPT_PATH="/path/to/your/custom/prompts"
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
## Troubleshooting
|
|
265
|
+
|
|
266
|
+
### Common Issues
|
|
267
|
+
|
|
268
|
+
**"MCP content-core: Unexpected token" errors:**
|
|
269
|
+
- This usually indicates output to stdout that interferes with the MCP protocol
|
|
270
|
+
- Content Core v1.0.5+ includes fixes to suppress MoviePy and other library outputs
|
|
271
|
+
|
|
272
|
+
**Connection failures:**
|
|
273
|
+
```bash
|
|
274
|
+
# Test the MCP server directly
|
|
275
|
+
content-core-mcp
|
|
276
|
+
|
|
277
|
+
# Or with uvx
|
|
278
|
+
uvx --from "content-core[mcp]" content-core-mcp
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
**Missing dependencies:**
|
|
282
|
+
```bash
|
|
283
|
+
# Reinstall with MCP dependencies
|
|
284
|
+
pip install --force-reinstall content-core[mcp]
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### Debug Mode
|
|
288
|
+
|
|
289
|
+
For development and debugging, you can run the server with additional logging:
|
|
290
|
+
|
|
291
|
+
```bash
|
|
292
|
+
# Set debug level
|
|
293
|
+
export LOGURU_LEVEL=DEBUG
|
|
294
|
+
content-core-mcp
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
### Performance Considerations
|
|
298
|
+
|
|
299
|
+
- **Large files**: Video and audio files may take longer to process due to transcription
|
|
300
|
+
- **API rate limits**: Some web extraction services have rate limits
|
|
301
|
+
- **Network connectivity**: URL extraction requires internet access
|
|
302
|
+
|
|
303
|
+
## Development
|
|
304
|
+
|
|
305
|
+
### Running Locally
|
|
306
|
+
|
|
307
|
+
```bash
|
|
308
|
+
# Clone the repository
|
|
309
|
+
git clone https://github.com/lfnovo/content-core
|
|
310
|
+
cd content-core
|
|
311
|
+
|
|
312
|
+
# Install with MCP dependencies
|
|
313
|
+
uv sync --extra mcp
|
|
314
|
+
|
|
315
|
+
# Run the server
|
|
316
|
+
make mcp-server
|
|
317
|
+
# or
|
|
318
|
+
uv run content-core-mcp
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
### Testing
|
|
322
|
+
|
|
323
|
+
```bash
|
|
324
|
+
# Run MCP-specific tests
|
|
325
|
+
uv run pytest tests/unit/test_mcp_server.py -v
|
|
326
|
+
|
|
327
|
+
# Run all tests
|
|
328
|
+
make test
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
## Contributing
|
|
332
|
+
|
|
333
|
+
Contributions to the MCP server are welcome! Please see our [Contributing Guide](../CONTRIBUTING.md) for development setup and guidelines.
|
|
334
|
+
|
|
335
|
+
## Support
|
|
336
|
+
|
|
337
|
+
- **Issues**: [GitHub Issues](https://github.com/lfnovo/content-core/issues)
|
|
338
|
+
- **Documentation**: [Main Documentation](usage.md)
|
|
339
|
+
- **MCP Protocol**: [Model Context Protocol](https://modelcontextprotocol.io/)
|
|
340
|
+
|
|
341
|
+
## Version History
|
|
342
|
+
|
|
343
|
+
- **v1.0.4**: Initial MCP server implementation
|
|
344
|
+
- **v1.0.5**: Added output suppression for better MCP compatibility
|
|
345
|
+
- **Latest**: Enhanced error handling and metadata support
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "1.0.3"
|
|
3
|
+
version = "1.1.0"
|
|
4
4
|
description = "Extract what matters from any media source"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -37,10 +37,14 @@ dependencies = [
|
|
|
37
37
|
"pytubefix>=9.1.1",
|
|
38
38
|
]
|
|
39
39
|
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
mcp = ["fastmcp>=0.5.0"]
|
|
42
|
+
|
|
40
43
|
[project.scripts]
|
|
41
44
|
ccore = "content_core:ccore"
|
|
42
45
|
cclean = "content_core:cclean"
|
|
43
46
|
csum = "content_core:csum"
|
|
47
|
+
content-core-mcp = "content_core.mcp.server:main"
|
|
44
48
|
|
|
45
49
|
[tool.hatch.metadata]
|
|
46
50
|
allow-direct-references = true
|
|
@@ -52,6 +56,8 @@ build-backend = "hatchling.build"
|
|
|
52
56
|
[tool.setuptools]
|
|
53
57
|
package-dir = {"content_core" = "src/content_core"}
|
|
54
58
|
|
|
59
|
+
[tool.uv.sources]
|
|
60
|
+
|
|
55
61
|
[dependency-groups]
|
|
56
62
|
dev = [
|
|
57
63
|
"ipykernel>=4.0.1",
|
|
@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
|
|
|
8
8
|
templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
|
|
9
9
|
response = await templated_message_fn(
|
|
10
10
|
TemplatedMessageInput(
|
|
11
|
-
user_prompt_template="content/summarize",
|
|
11
|
+
user_prompt_template="prompts/content/summarize",
|
|
12
12
|
data={"content": content, "context": context},
|
|
13
13
|
)
|
|
14
14
|
)
|