content-core 1.2.2__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-1.2.2 → content_core-1.3.0}/.gitignore +3 -1
- {content_core-1.2.2 → content_core-1.3.0}/PKG-INFO +10 -11
- {content_core-1.2.2 → content_core-1.3.0}/README.md +8 -8
- {content_core-1.2.2 → content_core-1.3.0}/docs/mcp.md +8 -8
- content_core-1.3.0/examples/main.py +241 -0
- {content_core-1.2.2 → content_core-1.3.0}/pyproject.toml +2 -2
- {content_core-1.2.2 → content_core-1.3.0}/uv.lock +4 -6
- content_core-1.2.2/.claude/commands/pr-review.md +0 -6
- content_core-1.2.2/mcp.md +0 -248
- content_core-1.2.2/test.py +0 -16
- {content_core-1.2.2 → content_core-1.3.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/.github/workflows/claude-code-review.yml +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/.github/workflows/claude.yml +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/.github/workflows/publish.yml +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/.python-version +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/CONTRIBUTING.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/LICENSE +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/Makefile +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/docs/macos.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/docs/processors.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/docs/raycast.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/docs/usage.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/prompts/content/summarize.jinja +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/.eslintrc.json +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/CHANGELOG.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/README.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/assets/command-icon.png +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/package-lock.json +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/package.json +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/raycast-env.d.ts +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/src/extract-content.tsx +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/src/quick-extract.tsx +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/src/summarize-content.tsx +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/src/utils/content-core.ts +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/src/utils/types.ts +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/raycast-content-core/tsconfig.json +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/common/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/common/state.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/common/types.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/common/utils.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/config.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/extraction/graph.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/summary/core.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/logging.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/mcp/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/mcp/server.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/models.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/models_config.yaml +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/notebooks/urls.ipynb +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/processors/audio.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/processors/docling.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/processors/office.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/processors/text.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/processors/url.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/processors/video.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/processors/youtube.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/py.typed +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/templated_message.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/tools/extract.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.docx +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.epub +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.md +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.mp3 +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.mp4 +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.pdf +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.pptx +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.txt +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file.xlsx +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.2.2 → content_core-1.3.0/tests/input_content}/new_pdf.pdf +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/integration/test_cli.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/integration/test_extraction.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/unit/test_config.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/unit/test_docling.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/unit/test_mcp_server.py +0 -0
- {content_core-1.2.2 → content_core-1.3.0}/tests/unit/test_pymupdf_ocr.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -11,6 +11,7 @@ Requires-Dist: asciidoc>=10.2.1
|
|
|
11
11
|
Requires-Dist: bs4>=0.0.2
|
|
12
12
|
Requires-Dist: dicttoxml>=1.7.16
|
|
13
13
|
Requires-Dist: esperanto>=1.2.0
|
|
14
|
+
Requires-Dist: fastmcp>=0.5.0
|
|
14
15
|
Requires-Dist: firecrawl-py>=2.7.0
|
|
15
16
|
Requires-Dist: jinja2>=3.1.6
|
|
16
17
|
Requires-Dist: langdetect>=1.0.9
|
|
@@ -32,8 +33,6 @@ Requires-Dist: validators>=0.34.0
|
|
|
32
33
|
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
33
34
|
Provides-Extra: docling
|
|
34
35
|
Requires-Dist: docling>=2.34.0; extra == 'docling'
|
|
35
|
-
Provides-Extra: mcp
|
|
36
|
-
Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
|
|
37
36
|
Description-Content-Type: text/markdown
|
|
38
37
|
|
|
39
38
|
# Content Core
|
|
@@ -118,11 +117,11 @@ pip install content-core
|
|
|
118
117
|
# With enhanced document processing (adds Docling)
|
|
119
118
|
pip install content-core[docling]
|
|
120
119
|
|
|
121
|
-
# With MCP server support
|
|
122
|
-
pip install content-core[mcp]
|
|
120
|
+
# With MCP server support (now included by default)
|
|
121
|
+
pip install content-core
|
|
123
122
|
|
|
124
|
-
# Full installation
|
|
125
|
-
pip install content-core[docling,mcp]
|
|
123
|
+
# Full installation (with enhanced document processing)
|
|
124
|
+
pip install content-core[docling]
|
|
126
125
|
```
|
|
127
126
|
|
|
128
127
|
Alternatively, if you’re developing locally:
|
|
@@ -268,11 +267,11 @@ Content Core includes a Model Context Protocol (MCP) server that enables seamles
|
|
|
268
267
|
### Quick Setup with Claude Desktop
|
|
269
268
|
|
|
270
269
|
```bash
|
|
271
|
-
# Install
|
|
272
|
-
pip install content-core[mcp]
|
|
270
|
+
# Install Content Core (MCP server included)
|
|
271
|
+
pip install content-core
|
|
273
272
|
|
|
274
273
|
# Or use directly with uvx (no installation required)
|
|
275
|
-
uvx --from "content-core[mcp]" content-core-mcp
|
|
274
|
+
uvx --from "content-core" content-core-mcp
|
|
276
275
|
```
|
|
277
276
|
|
|
278
277
|
Add to your `claude_desktop_config.json`:
|
|
@@ -283,7 +282,7 @@ Add to your `claude_desktop_config.json`:
|
|
|
283
282
|
"command": "uvx",
|
|
284
283
|
"args": [
|
|
285
284
|
"--from",
|
|
286
|
-
"content-core
|
|
285
|
+
"content-core",
|
|
287
286
|
"content-core-mcp"
|
|
288
287
|
]
|
|
289
288
|
}
|
|
@@ -80,11 +80,11 @@ pip install content-core
|
|
|
80
80
|
# With enhanced document processing (adds Docling)
|
|
81
81
|
pip install content-core[docling]
|
|
82
82
|
|
|
83
|
-
# With MCP server support
|
|
84
|
-
pip install content-core[mcp]
|
|
83
|
+
# With MCP server support (now included by default)
|
|
84
|
+
pip install content-core
|
|
85
85
|
|
|
86
|
-
# Full installation
|
|
87
|
-
pip install content-core[docling,mcp]
|
|
86
|
+
# Full installation (with enhanced document processing)
|
|
87
|
+
pip install content-core[docling]
|
|
88
88
|
```
|
|
89
89
|
|
|
90
90
|
Alternatively, if you’re developing locally:
|
|
@@ -230,11 +230,11 @@ Content Core includes a Model Context Protocol (MCP) server that enables seamles
|
|
|
230
230
|
### Quick Setup with Claude Desktop
|
|
231
231
|
|
|
232
232
|
```bash
|
|
233
|
-
# Install
|
|
234
|
-
pip install content-core[mcp]
|
|
233
|
+
# Install Content Core (MCP server included)
|
|
234
|
+
pip install content-core
|
|
235
235
|
|
|
236
236
|
# Or use directly with uvx (no installation required)
|
|
237
|
-
uvx --from "content-core[mcp]" content-core-mcp
|
|
237
|
+
uvx --from "content-core" content-core-mcp
|
|
238
238
|
```
|
|
239
239
|
|
|
240
240
|
Add to your `claude_desktop_config.json`:
|
|
@@ -245,7 +245,7 @@ Add to your `claude_desktop_config.json`:
|
|
|
245
245
|
"command": "uvx",
|
|
246
246
|
"args": [
|
|
247
247
|
"--from",
|
|
248
|
-
"content-core
|
|
248
|
+
"content-core",
|
|
249
249
|
"content-core-mcp"
|
|
250
250
|
]
|
|
251
251
|
}
|
|
@@ -20,8 +20,8 @@ The [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) is an open
|
|
|
20
20
|
### Option 1: Install with pip (Recommended for local development)
|
|
21
21
|
|
|
22
22
|
```bash
|
|
23
|
-
# Install Content Core
|
|
24
|
-
pip install content-core[mcp]
|
|
23
|
+
# Install Content Core (MCP server included by default)
|
|
24
|
+
pip install content-core
|
|
25
25
|
|
|
26
26
|
# The content-core-mcp command becomes available
|
|
27
27
|
content-core-mcp
|
|
@@ -31,7 +31,7 @@ content-core-mcp
|
|
|
31
31
|
|
|
32
32
|
```bash
|
|
33
33
|
# Run MCP server directly without installation
|
|
34
|
-
uvx --from "content-core[mcp]" content-core-mcp
|
|
34
|
+
uvx --from "content-core" content-core-mcp
|
|
35
35
|
|
|
36
36
|
# Also works for CLI tools
|
|
37
37
|
uvx --from "content-core" ccore https://example.com
|
|
@@ -58,7 +58,7 @@ Add Content Core to your Claude Desktop configuration file:
|
|
|
58
58
|
"command": "uvx",
|
|
59
59
|
"args": [
|
|
60
60
|
"--from",
|
|
61
|
-
"content-core
|
|
61
|
+
"content-core",
|
|
62
62
|
"content-core-mcp"
|
|
63
63
|
]
|
|
64
64
|
}
|
|
@@ -102,7 +102,7 @@ For optimal functionality, you'll need to configure API keys. Here's what each k
|
|
|
102
102
|
"command": "uvx",
|
|
103
103
|
"args": [
|
|
104
104
|
"--from",
|
|
105
|
-
"content-core
|
|
105
|
+
"content-core",
|
|
106
106
|
"content-core-mcp"
|
|
107
107
|
],
|
|
108
108
|
"env": {
|
|
@@ -342,13 +342,13 @@ export PROMPT_PATH="/path/to/your/custom/prompts"
|
|
|
342
342
|
content-core-mcp
|
|
343
343
|
|
|
344
344
|
# Or with uvx
|
|
345
|
-
uvx --from "content-core[mcp]" content-core-mcp
|
|
345
|
+
uvx --from "content-core" content-core-mcp
|
|
346
346
|
```
|
|
347
347
|
|
|
348
348
|
**Missing dependencies:**
|
|
349
349
|
```bash
|
|
350
|
-
# Reinstall
|
|
351
|
-
pip install --force-reinstall content-core[mcp]
|
|
350
|
+
# Reinstall Content Core
|
|
351
|
+
pip install --force-reinstall content-core
|
|
352
352
|
```
|
|
353
353
|
|
|
354
354
|
**Audio/video extraction failing:**
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
MarkDowny - Convert files and URLs to Markdown using Microsoft's MarkItDown library.
|
|
4
|
+
|
|
5
|
+
This script processes all files in the input_content/ directory and URLs from urls.txt,
|
|
6
|
+
converting them to Markdown format and saving the results to separate files.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Tuple
|
|
13
|
+
from urllib.parse import urlparse
|
|
14
|
+
|
|
15
|
+
from loguru import logger
|
|
16
|
+
from markitdown import MarkItDown
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def setup_logging() -> None:
    """Install the two loguru sinks used by this script.

    All previously registered handlers are removed first. A sink on
    ``sys.stderr`` then receives records at ``INFO`` and above using loguru
    colour markup, and a sink on ``processing.log`` receives records at
    ``DEBUG`` and above in plain text, rotating at 10 MB.
    """
    console_format = (
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
        "<level>{message}</level>"
    )
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"

    # Start from a clean slate so only the sinks below are active.
    logger.remove()
    logger.add(sys.stderr, format=console_format, level="INFO")
    logger.add("processing.log", format=file_format, level="DEBUG", rotation="10 MB")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def create_output_directory(output_dir: Path) -> None:
    """Ensure *output_dir* exists and log its location.

    Missing parent directories are created too, so the call succeeds for a
    nested path whose ancestors do not exist yet. An already existing
    directory is left untouched.

    Args:
        output_dir: Directory that will receive the converted Markdown files.
    """
    # parents=True: without it mkdir raises FileNotFoundError when any
    # ancestor of output_dir is missing.
    output_dir.mkdir(parents=True, exist_ok=True)
    logger.info(f"Output directory: {output_dir}")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Characters that are unsafe in file names: Windows-reserved punctuation plus
# the ASCII control characters 0x00-0x1F (NUL, tab, newline, ...).
_UNSAFE_FILENAME_CHARS = '<>:"/\\|?*' + ''.join(map(chr, range(32)))
_UNSAFE_FILENAME_TABLE = str.maketrans(dict.fromkeys(_UNSAFE_FILENAME_CHARS, '_'))


def sanitize_filename(filename: str) -> str:
    """Return *filename* with characters unsafe for file systems replaced.

    Leading and trailing whitespace is stripped first; every remaining
    character from ``<>:"/\\|?*`` and every ASCII control character is then
    replaced by an underscore in a single pass.

    Args:
        filename: Candidate file name (not a full path; separators are
            replaced, not interpreted).

    Returns:
        The sanitized name. May be empty if the input was empty or only
        whitespace.
    """
    # Strip before translating so surrounding whitespace is dropped rather
    # than turned into underscores.
    return filename.strip().translate(_UNSAFE_FILENAME_TABLE)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def process_file(md_converter: MarkItDown, file_path: Path, output_dir: Path) -> bool:
    """Convert one local file to Markdown and write it to *output_dir*.

    The output file is named ``<name-with-dots-as-underscores>_converted.md``
    and starts with a metadata header (source name, path, title, conversion
    timestamp, size) followed by the converted text.

    Args:
        md_converter: MarkItDown instance used for the conversion.
        file_path: Path to the input file.
        output_dir: Existing directory that receives the output file.

    Returns:
        True if the file was converted and written, False if any step
        raised; the failure is logged, never propagated.
    """
    # Function-scope import: only the metadata header needs a timestamp.
    from datetime import datetime, timezone

    try:
        logger.info(f"Processing file: {file_path.name}")

        result = md_converter.convert(str(file_path))

        # Keep the extension in the stem (dots flattened) so inputs that
        # differ only by extension do not overwrite each other.
        safe_name = sanitize_filename(file_path.name.replace('.', '_'))
        output_filename = f"{safe_name}_converted.md"
        output_path = output_dir / output_filename

        # The title may be missing or None; report it under its own label
        # instead of passing it off as the conversion date.
        title = getattr(result, 'title', None) or 'N/A'
        converted_at = datetime.now(timezone.utc).isoformat(timespec='seconds')

        content = "\n".join([
            f"# Converted from: {file_path.name}",
            "",
            f"**Source File:** {file_path.name}",
            f"**Source Path:** {file_path}",
            f"**Title:** {title}",
            f"**Conversion Date:** {converted_at}",
            f"**File Size:** {file_path.stat().st_size} bytes",
            "",
            "---",
            "",
            f"{result.text_content}",
            "",
        ])

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

        logger.success(f"Successfully converted {file_path.name} -> {output_filename}")
        return True

    except Exception as e:
        # Deliberate catch-all: one bad input must not abort the batch.
        logger.error(f"Failed to process {file_path.name}: {str(e)}")
        return False
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def process_url(md_converter: MarkItDown, url: str, output_dir: Path, index: int) -> bool:
    """Convert one URL to Markdown and write it to *output_dir*.

    The output file is named ``url_<index>_<domain>_converted.md`` and starts
    with a metadata header (URL, domain, title, conversion timestamp)
    followed by the converted text.

    Args:
        md_converter: MarkItDown instance used for the conversion.
        url: URL to process.
        output_dir: Existing directory that receives the output file.
        index: Number used in the output file name, zero-padded to two digits.

    Returns:
        True if the URL was converted and written, False if any step
        raised; the failure is logged, never propagated.
    """
    # Function-scope import: only the metadata header needs a timestamp.
    from datetime import datetime, timezone

    try:
        logger.info(f"Processing URL: {url}")

        result = md_converter.convert(url)

        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        # Strip only a leading "www."; a blanket replace would also mangle
        # hosts that merely contain that substring.
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        safe_domain = sanitize_filename(domain)
        output_filename = f"url_{index:02d}_{safe_domain}_converted.md"
        output_path = output_dir / output_filename

        # The title may be missing or None; report it under its own label
        # instead of passing it off as the conversion date.
        title = getattr(result, 'title', None) or 'N/A'
        converted_at = datetime.now(timezone.utc).isoformat(timespec='seconds')

        content = "\n".join([
            f"# Converted from URL: {url}",
            "",
            f"**Source URL:** {url}",
            f"**Domain:** {parsed_url.netloc}",
            f"**Title:** {title}",
            f"**Conversion Date:** {converted_at}",
            "",
            "---",
            "",
            f"{result.text_content}",
            "",
        ])

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

        logger.success(f"Successfully converted {url} -> {output_filename}")
        return True

    except Exception as e:
        # Deliberate catch-all: one bad URL must not abort the batch.
        logger.error(f"Failed to process URL {url}: {str(e)}")
        return False
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def load_urls(urls_file: Path) -> List[str]:
    """Read the list of URLs stored in *urls_file*.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    starting with ``#`` are ignored.

    Args:
        urls_file: UTF-8 text file with one URL per line.

    Returns:
        The URLs in file order, or an empty list if the file could not be
        read (the error is logged).
    """
    try:
        collected: List[str] = []
        with open(urls_file, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                candidate = raw_line.strip()
                if not candidate or candidate.startswith('#'):
                    continue
                collected.append(candidate)
        logger.info(f"Loaded {len(collected)} URLs from {urls_file}")
        return collected
    except Exception as e:
        logger.error(f"Failed to load URLs from {urls_file}: {str(e)}")
        return []
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def get_input_files(input_dir: Path) -> List[Path]:
    """Return every regular file directly inside *input_dir*, sorted by path.

    No extension filtering is applied: audio and video files are returned
    like any other file. Subdirectories are not descended into.

    Args:
        input_dir: Directory to scan.

    Returns:
        Sorted list of file paths, or an empty list if the directory could
        not be read (the error is logged).
    """
    try:
        # iterdir() yields entries in arbitrary order; sort so runs are
        # reproducible.
        files = sorted(entry for entry in input_dir.iterdir() if entry.is_file())
        logger.info(f"Found {len(files)} files in {input_dir}")
        return files
    except Exception as e:
        logger.error(f"Failed to read files from {input_dir}: {str(e)}")
        return []
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def main():
    """Run the whole conversion batch and report the outcome.

    Files under ``input_content/`` and URLs listed in ``urls.txt`` (both
    located next to this script) are converted into ``output/``. A summary is
    logged at the end, and the process exits with status 1 if any item failed.
    """
    setup_logging()
    logger.info("Starting MarkDowny processing...")

    # Every path is anchored at the directory that contains this script.
    base_dir = Path(__file__).parent
    input_dir = base_dir / "input_content"
    urls_file = base_dir / "urls.txt"
    output_dir = base_dir / "output"

    create_output_directory(output_dir)
    md_converter = MarkItDown()

    # --- local files ---
    file_results = []
    if not input_dir.exists():
        logger.warning(f"Input directory {input_dir} does not exist")
    else:
        input_files = get_input_files(input_dir)
        logger.info(f"Processing {len(input_files)} files...")
        file_results = [
            process_file(md_converter, path, output_dir) for path in input_files
        ]
    files_processed = sum(1 for ok in file_results if ok)
    files_failed = len(file_results) - files_processed

    # --- URLs ---
    url_results = []
    if not urls_file.exists():
        logger.warning(f"URLs file {urls_file} does not exist")
    else:
        urls = load_urls(urls_file)
        logger.info(f"Processing {len(urls)} URLs...")
        url_results = [
            process_url(md_converter, url, output_dir, position)
            for position, url in enumerate(urls, 1)
        ]
    urls_processed = sum(1 for ok in url_results if ok)
    urls_failed = len(url_results) - urls_processed

    # --- summary ---
    banner = "=" * 50
    logger.info(banner)
    logger.info("PROCESSING SUMMARY")
    logger.info(banner)
    logger.info(f"Files: {files_processed} successful, {files_failed} failed")
    logger.info(f"URLs: {urls_processed} successful, {urls_failed} failed")
    logger.info(f"Total: {files_processed + urls_processed} successful, {files_failed + urls_failed} failed")
    logger.info(f"Output directory: {output_dir}")

    if files_failed + urls_failed == 0:
        logger.success("All items processed successfully!")
        return

    logger.warning("Some items failed to process. Check the logs for details.")
    sys.exit(1)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
if __name__ == "__main__":
|
|
241
|
+
main()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "1.2.2"
|
|
3
|
+
version = "1.3.0"
|
|
4
4
|
description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -34,10 +34,10 @@ dependencies = [
|
|
|
34
34
|
"asciidoc>=10.2.1",
|
|
35
35
|
"python-magic-bin==0.4.14; sys_platform == 'win32'",
|
|
36
36
|
"pytubefix>=9.1.1",
|
|
37
|
+
"fastmcp>=0.5.0",
|
|
37
38
|
]
|
|
38
39
|
|
|
39
40
|
[project.optional-dependencies]
|
|
40
|
-
mcp = ["fastmcp>=0.5.0"]
|
|
41
41
|
docling = ["docling>=2.34.0"]
|
|
42
42
|
|
|
43
43
|
[project.scripts]
|
|
@@ -419,7 +419,7 @@ wheels = [
|
|
|
419
419
|
|
|
420
420
|
[[package]]
|
|
421
421
|
name = "content-core"
|
|
422
|
-
version = "1.2.2"
|
|
422
|
+
version = "1.3.0"
|
|
423
423
|
source = { editable = "." }
|
|
424
424
|
dependencies = [
|
|
425
425
|
{ name = "ai-prompter" },
|
|
@@ -428,6 +428,7 @@ dependencies = [
|
|
|
428
428
|
{ name = "bs4" },
|
|
429
429
|
{ name = "dicttoxml" },
|
|
430
430
|
{ name = "esperanto" },
|
|
431
|
+
{ name = "fastmcp" },
|
|
431
432
|
{ name = "firecrawl-py" },
|
|
432
433
|
{ name = "jinja2" },
|
|
433
434
|
{ name = "langdetect" },
|
|
@@ -453,9 +454,6 @@ dependencies = [
|
|
|
453
454
|
docling = [
|
|
454
455
|
{ name = "docling" },
|
|
455
456
|
]
|
|
456
|
-
mcp = [
|
|
457
|
-
{ name = "fastmcp" },
|
|
458
|
-
]
|
|
459
457
|
|
|
460
458
|
[package.dev-dependencies]
|
|
461
459
|
dev = [
|
|
@@ -476,7 +474,7 @@ requires-dist = [
|
|
|
476
474
|
{ name = "dicttoxml", specifier = ">=1.7.16" },
|
|
477
475
|
{ name = "docling", marker = "extra == 'docling'", specifier = ">=2.34.0" },
|
|
478
476
|
{ name = "esperanto", specifier = ">=1.2.0" },
|
|
479
|
-
{ name = "fastmcp", marker = "extra == 'mcp'", specifier = ">=0.5.0" },
|
|
477
|
+
{ name = "fastmcp", specifier = ">=0.5.0" },
|
|
480
478
|
{ name = "firecrawl-py", specifier = ">=2.7.0" },
|
|
481
479
|
{ name = "jinja2", specifier = ">=3.1.6" },
|
|
482
480
|
{ name = "langdetect", specifier = ">=1.0.9" },
|
|
@@ -497,7 +495,7 @@ requires-dist = [
|
|
|
497
495
|
{ name = "validators", specifier = ">=0.34.0" },
|
|
498
496
|
{ name = "youtube-transcript-api", specifier = ">=1.0.3" },
|
|
499
497
|
]
|
|
500
|
-
provides-extras = ["
|
|
498
|
+
provides-extras = ["docling"]
|
|
501
499
|
|
|
502
500
|
[package.metadata.requires-dev]
|
|
503
501
|
dev = [
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
When I ask you to do a PR, you will commit the necessary files and open up a pull request. As soon as you open the pull request, an automatic review process will start. The review process will provide feedback on the code quality and best practices, potential bugs or issues, performance considerations, security concerns, and test coverage.
|
|
3
|
-
|
|
4
|
-
You should query the PR after opening it to get the comments from the review tool. Then, you will assess the comments and propose to me which of them we need to address.
|
|
5
|
-
|
|
6
|
-
Then, we'll make the changes, commit them and add a comment back to the PR so that the review tool understands what we changed and what we ignored.
|
content_core-1.2.2/mcp.md
DELETED
|
@@ -1,248 +0,0 @@
|
|
|
1
|
-
# MCP Server Implementation Plan for Content Core
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
Implement a FastMCP server that exposes Content Core's extraction functionality through the Model Context Protocol (MCP). The server will provide a single tool `extract_content` that accepts either a URL or file path and returns extracted content in JSON format using the 'auto' engine.
|
|
5
|
-
|
|
6
|
-
## Architecture
|
|
7
|
-
|
|
8
|
-
### 1. Dependencies
|
|
9
|
-
```toml
|
|
10
|
-
# Add to pyproject.toml as optional dependency
|
|
11
|
-
[project.optional-dependencies]
|
|
12
|
-
mcp = ["fastmcp>=0.5.0"]
|
|
13
|
-
|
|
14
|
-
# Add script entry point for uvx
|
|
15
|
-
[project.scripts]
|
|
16
|
-
content-core-mcp = "content_core.mcp.server:main"
|
|
17
|
-
```
|
|
18
|
-
|
|
19
|
-
This allows users to install with MCP support using:
|
|
20
|
-
```bash
|
|
21
|
-
pip install content-core[mcp]
|
|
22
|
-
# or with uv
|
|
23
|
-
uv pip install content-core[mcp]
|
|
24
|
-
```
|
|
25
|
-
|
|
26
|
-
### 2. Server Structure
|
|
27
|
-
```
|
|
28
|
-
src/content_core/
|
|
29
|
-
├── mcp/
|
|
30
|
-
│ ├── __init__.py
|
|
31
|
-
│ └── server.py # Main MCP server implementation
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
### 3. Implementation Details
|
|
35
|
-
|
|
36
|
-
#### Server Setup (`server.py`)
|
|
37
|
-
```python
|
|
38
|
-
from fastmcp import FastMCP
|
|
39
|
-
from typing import Optional, Dict, Any
|
|
40
|
-
import content_core as cc
|
|
41
|
-
|
|
42
|
-
# Initialize MCP server
|
|
43
|
-
mcp = FastMCP("Content Core MCP Server")
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
#### Tool Definition
|
|
47
|
-
The `extract_content` tool will:
|
|
48
|
-
- Accept either `url` or `file_path` as optional parameters
|
|
49
|
-
- Validate that exactly one is provided
|
|
50
|
-
- Return extracted content in JSON format
|
|
51
|
-
- Use the 'auto' engine by default
|
|
52
|
-
|
|
53
|
-
```python
|
|
54
|
-
@mcp.tool
|
|
55
|
-
async def extract_content(
|
|
56
|
-
url: Optional[str] = None,
|
|
57
|
-
file_path: Optional[str] = None
|
|
58
|
-
) -> Dict[str, Any]:
|
|
59
|
-
"""
|
|
60
|
-
Extract content from a URL or file using Content Core's auto engine.
|
|
61
|
-
|
|
62
|
-
Args:
|
|
63
|
-
url: Optional URL to extract content from
|
|
64
|
-
file_path: Optional file path to extract content from
|
|
65
|
-
|
|
66
|
-
Returns:
|
|
67
|
-
JSON object containing extracted content and metadata
|
|
68
|
-
|
|
69
|
-
Raises:
|
|
70
|
-
ValueError: If neither or both url and file_path are provided
|
|
71
|
-
"""
|
|
72
|
-
# Implementation details below
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
#### Input Validation
|
|
76
|
-
- Ensure exactly one input source is provided
|
|
77
|
-
- Validate URL format if URL is provided
|
|
78
|
-
- Validate file existence if file_path is provided
|
|
79
|
-
|
|
80
|
-
#### Integration with Content Core
|
|
81
|
-
```python
|
|
82
|
-
# Build extraction request
|
|
83
|
-
extraction_request = {}
|
|
84
|
-
if url:
|
|
85
|
-
extraction_request["url"] = url
|
|
86
|
-
elif file_path:
|
|
87
|
-
extraction_request["file_path"] = file_path
|
|
88
|
-
|
|
89
|
-
# Use Content Core's extract_content with auto engine
|
|
90
|
-
result = await cc.extract_content(extraction_request)
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
#### Return Format
|
|
94
|
-
The tool will return a JSON structure containing:
|
|
95
|
-
```json
|
|
96
|
-
{
|
|
97
|
-
"source_type": "url" | "file",
|
|
98
|
-
"source": "<url or file_path>",
|
|
99
|
-
"content": "<extracted content>",
|
|
100
|
-
"metadata": {
|
|
101
|
-
"engine_used": "<actual engine used by auto>",
|
|
102
|
-
"content_type": "<detected content type>",
|
|
103
|
-
"extraction_time": "<ISO timestamp>",
|
|
104
|
-
// Additional metadata from Content Core
|
|
105
|
-
},
|
|
106
|
-
"success": true,
|
|
107
|
-
"error": null // or error message if extraction failed
|
|
108
|
-
}
|
|
109
|
-
```
|
|
110
|
-
|
|
111
|
-
#### Error Handling
|
|
112
|
-
- Wrap extraction in try/except block
|
|
113
|
-
- Return structured error response on failure
|
|
114
|
-
- Log errors using Context if needed
|
|
115
|
-
- Handle specific Content Core exceptions
|
|
116
|
-
|
|
117
|
-
### 4. Running the Server
|
|
118
|
-
|
|
119
|
-
#### Entry Point (`main` function)
|
|
120
|
-
```python
|
|
121
|
-
def main():
|
|
122
|
-
"""Entry point for the MCP server."""
|
|
123
|
-
import sys
|
|
124
|
-
# Run with STDIO transport for MCP compatibility
|
|
125
|
-
mcp.run()
|
|
126
|
-
|
|
127
|
-
if __name__ == "__main__":
|
|
128
|
-
main()
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
#### Usage with uvx
|
|
132
|
-
Users can run the server directly with uvx (no installation required):
|
|
133
|
-
```bash
|
|
134
|
-
# Run the MCP server
|
|
135
|
-
uvx --from "content-core[mcp]" content-core-mcp
|
|
136
|
-
```
|
|
137
|
-
|
|
138
|
-
#### Claude Desktop Configuration
|
|
139
|
-
Add to `claude_desktop_config.json`:
|
|
140
|
-
```json
|
|
141
|
-
{
|
|
142
|
-
"mcpServers": {
|
|
143
|
-
"content-core": {
|
|
144
|
-
"command": "uvx",
|
|
145
|
-
"args": [
|
|
146
|
-
"--from",
|
|
147
|
-
"content-core[mcp]",
|
|
148
|
-
"content-core-mcp"
|
|
149
|
-
]
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
}
|
|
153
|
-
```
|
|
154
|
-
|
|
155
|
-
#### Alternative: Local Development
|
|
156
|
-
```bash
|
|
157
|
-
# Install with MCP support
|
|
158
|
-
uv pip install -e ".[mcp]"
|
|
159
|
-
|
|
160
|
-
# Run directly
|
|
161
|
-
content-core-mcp
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
### 5. Testing Strategy
|
|
165
|
-
|
|
166
|
-
#### Unit Tests
|
|
167
|
-
- Test input validation logic
|
|
168
|
-
- Mock Content Core extraction calls
|
|
169
|
-
- Verify JSON response structure
|
|
170
|
-
|
|
171
|
-
#### Integration Tests
|
|
172
|
-
- Test with real URLs
|
|
173
|
-
- Test with various file types
|
|
174
|
-
- Verify auto engine selection works correctly
|
|
175
|
-
|
|
176
|
-
### 6. Documentation
|
|
177
|
-
|
|
178
|
-
#### Usage Example
|
|
179
|
-
```python
|
|
180
|
-
# Client code example
|
|
181
|
-
from fastmcp import Client
|
|
182
|
-
|
|
183
|
-
client = Client()
|
|
184
|
-
result = await client.call_tool(
|
|
185
|
-
"extract_content",
|
|
186
|
-
{"url": "https://example.com/article"}
|
|
187
|
-
)
|
|
188
|
-
```
|
|
189
|
-
|
|
190
|
-
### 7. Configuration
|
|
191
|
-
|
|
192
|
-
#### Environment Variables
|
|
193
|
-
- Support existing Content Core environment variables
|
|
194
|
-
- Allow MCP-specific configuration if needed
|
|
195
|
-
|
|
196
|
-
#### Config File Support
|
|
197
|
-
- Use existing `cc_config.yaml` if present
|
|
198
|
-
- Allow override via MCP server initialization
|
|
199
|
-
|
|
200
|
-
### 8. Future Enhancements (Not in initial implementation)
|
|
201
|
-
- Add streaming support for large files
|
|
202
|
-
- Support for multiple URLs/files in one request
|
|
203
|
-
- Add content cleaning and summarization tools
|
|
204
|
-
- Support custom extraction engines per request
|
|
205
|
-
- Add resource endpoints for browsing extracted content
|
|
206
|
-
|
|
207
|
-
### 9. Implementation Steps
|
|
208
|
-
|
|
209
|
-
1. Create `mcp/` directory structure
|
|
210
|
-
2. Implement basic server with extract_content tool
|
|
211
|
-
3. Add input validation and error handling
|
|
212
|
-
4. Integrate with Content Core extraction
|
|
213
|
-
5. Format JSON response properly
|
|
214
|
-
6. Add comprehensive logging
|
|
215
|
-
7. Write unit tests
|
|
216
|
-
8. Write integration tests
|
|
217
|
-
9. Add documentation and examples
|
|
218
|
-
10. Test with various MCP clients
|
|
219
|
-
|
|
220
|
-
### 10. Key Considerations
|
|
221
|
-
|
|
222
|
-
- **Async First**: Use async/await throughout since Content Core is async
|
|
223
|
-
- **Error Messages**: Provide clear, actionable error messages
|
|
224
|
-
- **Performance**: Consider caching for repeated requests
|
|
225
|
-
- **Security**: Validate file paths to prevent directory traversal
|
|
226
|
-
- **Compatibility**: Ensure works with all Content Core extraction engines
|
|
227
|
-
|
|
228
|
-
### 11. Publishing and Distribution
|
|
229
|
-
|
|
230
|
-
#### PyPI Package
|
|
231
|
-
The MCP server will be included as an optional extra in the main content-core package:
|
|
232
|
-
- Users install with `pip install content-core[mcp]`
|
|
233
|
-
- The `content-core-mcp` command becomes available after installation
|
|
234
|
-
- Works seamlessly with `uvx` for zero-install usage
|
|
235
|
-
|
|
236
|
-
#### Benefits of uvx approach:
|
|
237
|
-
1. **No installation required**: Users can run directly with `uvx`
|
|
238
|
-
2. **Automatic updates**: Always uses the latest published version
|
|
239
|
-
3. **Isolation**: Runs in isolated environment, avoiding dependency conflicts
|
|
240
|
-
4. **Simple configuration**: Just add to `claude_desktop_config.json`
|
|
241
|
-
|
|
242
|
-
#### Example MCP server listing entry:
|
|
243
|
-
```yaml
|
|
244
|
-
name: content-core
|
|
245
|
-
description: Extract content from URLs and files using Content Core
|
|
246
|
-
commands:
|
|
247
|
-
- uvx --from "content-core[mcp]" content-core-mcp
|
|
248
|
-
```
|
content_core-1.2.2/test.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
|
|
3
|
-
from crawl4ai import *
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
async def main():
|
|
7
|
-
async with AsyncWebCrawler() as crawler:
|
|
8
|
-
result = await crawler.arun(
|
|
9
|
-
url="https://www.nbcnews.com/business",
|
|
10
|
-
)
|
|
11
|
-
print(result.markdown)
|
|
12
|
-
print(result.title)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
if __name__ == "__main__":
|
|
16
|
-
asyncio.run(main())
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{content_core-1.2.2 → content_core-1.3.0}/src/content_core/content/identification/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|