mistral-ocr-mcp 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistral_ocr_mcp/__init__.py +7 -0
- mistral_ocr_mcp/__main__.py +27 -0
- mistral_ocr_mcp/config.py +88 -0
- mistral_ocr_mcp/extraction.py +159 -0
- mistral_ocr_mcp/images.py +175 -0
- mistral_ocr_mcp/markdown_rewrite.py +118 -0
- mistral_ocr_mcp/mistral_client.py +129 -0
- mistral_ocr_mcp/path_sandbox.py +143 -0
- mistral_ocr_mcp/server.py +86 -0
- mistral_ocr_mcp-0.1.3.dist-info/METADATA +407 -0
- mistral_ocr_mcp-0.1.3.dist-info/RECORD +13 -0
- mistral_ocr_mcp-0.1.3.dist-info/WHEEL +4 -0
- mistral_ocr_mcp-0.1.3.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""MCP server implementation for Mistral OCR."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from .config import ConfigurationError, load_config
|
|
5
|
+
from .server import run
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main() -> int:
    """Run the MCP server and map the outcome to a process exit code.

    The configuration is loaded (and thereby validated) before the server
    is started, so a bad environment is reported before any serving begins.

    Returns:
        0 when the server ran and returned normally; 1 when a
        ConfigurationError, a NotImplementedError, or any other exception
        was raised. The error is printed to stderr in each failing case.
    """
    exit_code = 1
    try:
        # Fail fast on a bad environment before the server starts.
        load_config()
        run()
    except ConfigurationError as exc:
        print(f"Configuration error: {exc}", file=sys.stderr)
    except NotImplementedError as exc:
        print(f"Error: {exc}", file=sys.stderr)
    except Exception as exc:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"Unexpected error: {exc}", file=sys.stderr)
    else:
        exit_code = 0
    return exit_code


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Configuration module for Mistral OCR MCP server.
|
|
2
|
+
|
|
3
|
+
This module loads and validates environment variables required for the server.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import NamedTuple
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Config(NamedTuple):
    """Immutable settings for the Mistral OCR MCP server.

    Attributes:
        api_key: Mistral API key; it must never be written to logs.
        allowed_dir_original: The allowed directory exactly as it was given
            in the environment.
        allowed_dir_resolved: The allowed directory as a resolved,
            canonical path.
    """

    api_key: str
    allowed_dir_original: str
    allowed_dir_resolved: Path
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ConfigurationError(Exception):
    """Signals that the server configuration is missing or invalid."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def load_config() -> Config:
    """Load and validate configuration from environment variables.

    Reads:
        - MISTRAL_API_KEY: Required API key for the Mistral OCR service.
        - MISTRAL_OCR_ALLOWED_DIR: Required absolute path to the allowed
          directory.

    Returns:
        Config holding the API key, the allowed directory string as given,
        and its resolved canonical path.

    Raises:
        ConfigurationError: If a required variable is missing or empty, or
            if the allowed directory is not absolute, does not exist,
            cannot be resolved, or is not a directory.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise ConfigurationError(
            "Missing required environment variable: MISTRAL_API_KEY"
        )

    allowed_dir_str = os.getenv("MISTRAL_OCR_ALLOWED_DIR")
    if not allowed_dir_str:
        raise ConfigurationError(
            "Missing required environment variable: MISTRAL_OCR_ALLOWED_DIR"
        )

    allowed_path = Path(allowed_dir_str)

    # Verify it's an absolute path BEFORE canonicalization (SRS FR-5.3):
    # resolve() would silently anchor a relative path at the current
    # working directory.
    if not allowed_path.is_absolute():
        raise ConfigurationError(
            f"MISTRAL_OCR_ALLOWED_DIR must be an absolute path: {allowed_dir_str}"
        )

    try:
        allowed_dir = allowed_path.resolve(strict=True)
    except FileNotFoundError as e:
        raise ConfigurationError(
            f"MISTRAL_OCR_ALLOWED_DIR does not exist: {allowed_dir_str}"
        ) from e
    except (OSError, RuntimeError) as e:
        # RuntimeError: symlink loop on Python < 3.13. OSError: symlink loop
        # on Python >= 3.13, plus PermissionError / NotADirectoryError on
        # any version. All of them mean the configured directory is unusable.
        raise ConfigurationError(
            f"Invalid MISTRAL_OCR_ALLOWED_DIR: {allowed_dir_str} - {e}"
        ) from e

    if not allowed_dir.is_dir():
        raise ConfigurationError(
            f"MISTRAL_OCR_ALLOWED_DIR is not a directory: {allowed_dir_str}"
        )

    return Config(
        api_key=api_key,
        allowed_dir_original=allowed_dir_str,
        allowed_dir_resolved=allowed_dir,
    )
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Extraction orchestration for Mistral OCR MCP server.
|
|
2
|
+
|
|
3
|
+
This module provides the main extraction functions that orchestrate OCR calls,
|
|
4
|
+
image saving, and markdown rewriting.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List
|
|
10
|
+
|
|
11
|
+
from .config import load_config
|
|
12
|
+
from .images import save_images
|
|
13
|
+
from .markdown_rewrite import rewrite_markdown
|
|
14
|
+
from .mistral_client import process_local_file
|
|
15
|
+
from .path_sandbox import PathValidationError, validate_file_path, validate_output_dir
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def extract_markdown(file_path: str) -> str:
    """Run OCR on a file and return its markdown, without any images.

    Args:
        file_path: Absolute path to the input file (PDF or image).

    Returns:
        The markdown of every page, joined with a blank line between pages.

    Raises:
        PathValidationError: If file_path fails validation.
        MistralOCRAPIError: If the OCR API call fails.
        MistralOCRFileError: If reading the file for upload fails.
    """
    source = validate_file_path(file_path)

    # Images are not requested: only the text is returned to the caller.
    ocr_result = process_local_file(source, include_image_base64=False)

    return "\n\n".join(page.markdown for page in ocr_result.pages)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def extract_markdown_with_images(file_path: str, output_dir: str) -> Dict[str, Any]:
    """Run OCR on a file and write its markdown and images to disk.

    Steps, in order:
        1. Load the configuration to learn the allowed directory.
        2. Validate file_path, then validate output_dir against the sandbox.
        3. Create a unique subdirectory of output_dir for this file.
        4. Call OCR with include_image_base64=True.
        5. Save every returned image into the subdirectory.
        6. Rewrite the markdown so base64 URIs become relative paths.
        7. Write the rewritten markdown to content.md.

    Args:
        file_path: Absolute path to the input file (PDF or image).
        output_dir: Absolute path to the output directory (must be within
            the allowed directory).

    Returns:
        Dictionary with keys:
            - output_directory: Absolute path to the output subdirectory.
            - markdown_file: Absolute path to the content.md file.
            - images: List of saved image filenames (not full paths).

    Raises:
        PathValidationError: If file_path or output_dir is invalid.
        MistralOCRAPIError: If the OCR API call fails.
        MistralOCRFileError: If reading the file for upload fails.
    """
    settings = load_config()

    source = validate_file_path(file_path)
    target_root = validate_output_dir(
        output_dir,
        settings.allowed_dir_resolved,
        settings.allowed_dir_original,
    )

    # The subdirectory is created before the OCR call is made.
    output_subdir = _create_output_subdirectory(target_root, source)

    ocr_result = process_local_file(source, include_image_base64=True)

    # Flatten the images of all pages into plain dictionaries.
    images: List[dict] = []
    for page in ocr_result.pages:
        page_images = getattr(page, "images", None)
        if not page_images:
            continue
        for img in page_images:
            # SDK objects expose model_dump(); anything else is used as-is.
            images.append(img.model_dump() if hasattr(img, "model_dump") else img)

    saved_filenames = save_images(output_subdir, images)

    markdown_content = "\n\n".join(page.markdown for page in ocr_result.pages)
    rewritten_markdown = rewrite_markdown(markdown_content, images, saved_filenames)

    markdown_file_path = output_subdir / "content.md"
    markdown_file_path.write_text(rewritten_markdown, encoding="utf-8")

    return {
        "output_directory": str(output_subdir),
        "markdown_file": str(markdown_file_path),
        "images": saved_filenames,
    }
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _create_output_subdirectory(output_dir: Path, file_path: Path) -> Path:
    """Create a unique output subdirectory for a file's extracted content.

    The subdirectory is named after the file stem (name without extension).
    If that name is already taken, a timestamp suffix in the format
    _YYYYMMDD_HHMMSS is appended instead.

    Creation is atomic: the directory is claimed with mkdir(exist_ok=False)
    and a FileExistsError means "taken". A separate exists() check followed
    by mkdir(exist_ok=True) would let two concurrent calls end up sharing
    one directory.

    Args:
        output_dir: The validated output directory.
        file_path: The validated input file path.

    Returns:
        Path to the newly created output subdirectory.
    """
    # Local import: this module's import header does not provide `time`.
    import time

    base_name = file_path.stem
    subdir_path = output_dir / base_name

    try:
        subdir_path.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        pass  # Name already taken (by a directory or a file): add a suffix.
    else:
        return subdir_path

    while True:
        timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")
        timestamped_path = output_dir / f"{base_name}{timestamp}"
        try:
            timestamped_path.mkdir(parents=True, exist_ok=False)
        except FileExistsError:
            # The timestamp has one-second resolution, so a collision only
            # clears once the clock ticks over; wait briefly and retry.
            time.sleep(0.05)
        else:
            return timestamped_path
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Image handling for Mistral OCR MCP server.
|
|
2
|
+
|
|
3
|
+
This module handles parsing and saving base64-encoded images from OCR responses.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import binascii
|
|
8
|
+
import re
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import List, Tuple
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ImageError(Exception):
    """Signals a failure while parsing, decoding, or handling an image."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_data_uri(data_uri: str) -> Tuple[str, str]:
    """Split a base64 data URI into its MIME type and base64 payload.

    Args:
        data_uri: Data URI string like `data:image/jpeg;base64,<...>`.

    Returns:
        Tuple of (mime_type, raw_base64_string).

    Raises:
        ImageError: If the URI is empty, does not have the
            `data:<mime>;base64,<data>` shape, or has an empty MIME type
            or an empty payload.
    """
    if not data_uri:
        raise ImageError("data_uri cannot be empty")

    # Expected shape: data:<mime>;base64,<data>
    parsed = re.match(r"^data:([^;]*);base64,(.*)$", data_uri)
    if parsed is None:
        raise ImageError(
            f"Invalid data URI format, expected 'data:<mime>;base64,<data>': {data_uri[:50]}..."
        )

    mime_type, raw_b64 = parsed.groups()

    # The pattern allows both parts to be empty, so check them explicitly.
    if not mime_type:
        raise ImageError(f"Missing MIME type in data URI: {data_uri[:50]}...")
    if not raw_b64:
        raise ImageError(f"Missing base64 data in data URI: {data_uri[:50]}...")

    return mime_type, raw_b64
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_extension_from_mime(mime_type: str) -> str:
    """Map an image MIME type to a file extension.

    The lookup is case-insensitive. MIME types that are not in the table
    fall back to '.png'.

    Args:
        mime_type: MIME type string like 'image/jpeg'.

    Returns:
        File extension including the leading dot (e.g., '.jpeg').
    """
    known_extensions = {
        "image/jpeg": ".jpeg",
        "image/jpg": ".jpg",
        "image/png": ".png",
        "image/webp": ".webp",
        "image/gif": ".gif",
    }

    try:
        return known_extensions[mime_type.lower()]
    except KeyError:
        # Unknown MIME type: default to .png.
        return ".png"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def save_base64_image(output_dir: Path, image_id: str, data_uri: str) -> str:
    """Decode a base64 data URI and save it as an image file.

    The filename is derived from image_id after sanitizing it so that it
    cannot escape output_dir, and the extension implied by the data URI's
    MIME type is appended unless the sanitized id already ends with it.

    Args:
        output_dir: Directory to save the image in (created if missing).
        image_id: Identifier for the image (used as filename; may already
            include an extension).
        data_uri: Base64 data URI string.

    Returns:
        Filename of the saved image (without directory path).

    Raises:
        ImageError: If the data URI cannot be parsed or its payload cannot
            be base64-decoded.
        OSError: If creating the directory or writing the file fails; these
            errors are not wrapped.
    """
    mime_type, raw_b64 = parse_data_uri(data_uri)
    ext = get_extension_from_mime(mime_type)

    # Sanitize image_id to prevent path traversal: neutralize null bytes,
    # directory separators, and parent-directory references.
    sanitized_id = image_id.replace("\0", "_")
    sanitized_id = sanitized_id.replace("/", "_").replace("\\", "_")
    sanitized_id = sanitized_id.replace("..", "__")

    # Only allow word characters, hyphens, and dots.
    sanitized_id = re.sub(r"[^\w\-.]", "_", sanitized_id)

    # Must not start with a dot (hidden file) or a dash (looks like a flag).
    if sanitized_id.startswith((".", "-")):
        sanitized_id = "_" + sanitized_id.lstrip(".-")

    # Remove any remaining runs of dots.
    sanitized_id = re.sub(r"\.\.+", "__", sanitized_id)

    # Limit length to prevent filesystem issues; leaves room for extension.
    max_id_length = 200
    sanitized_id = sanitized_id[:max_id_length]

    # An empty id would otherwise produce a bare-extension dotfile such as
    # ".png", defeating the "no leading dot" rule above.
    if not sanitized_id:
        sanitized_id = "image"

    # Avoid duplicate extensions like image.jpeg.jpeg (case-insensitive).
    if sanitized_id.lower().endswith(ext.lower()):
        filename = sanitized_id
    else:
        filename = f"{sanitized_id}{ext}"

    # Only the decode step is guarded, so the error message stays accurate.
    try:
        image_data = base64.b64decode(raw_b64)
    except (binascii.Error, ValueError) as e:
        raise ImageError(
            f"Failed to decode base64 image data for image_id={image_id}: {e}"
        ) from e

    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / filename).write_bytes(image_data)

    return filename
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def save_images(output_dir: Path, images: List[dict]) -> List[str]:
    """Save every base64-encoded image of an OCR response to disk.

    Images are processed in order and processing stops at the first
    failure; images saved before the failure stay on disk.

    Args:
        output_dir: Directory to save images in.
        images: List of image dictionaries with 'id' and 'image_base64' keys.

    Returns:
        List of saved filenames in the same order as the input images.

    Raises:
        ImageError: If an image lacks 'id' or 'image_base64', or if saving
            an image raises ImageError.
    """
    written: List[str] = []

    for entry in images:
        image_id = entry.get("id")
        encoded = entry.get("image_base64")

        # The id is checked first so the second message can name the image.
        if image_id is None:
            raise ImageError("Image missing 'id' field")
        if encoded is None:
            raise ImageError(f"Image '{image_id}' missing 'image_base64' field")

        written.append(save_base64_image(output_dir, image_id, encoded))

    return written
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Markdown rewrite module for Mistral OCR MCP server.
|
|
2
|
+
|
|
3
|
+
This module rewrites markdown content to replace embedded base64 image URIs
|
|
4
|
+
with relative file paths.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def rewrite_markdown(
    markdown: str, images: List[dict], output_filenames: Optional[List[str]] = None
) -> str:
    """Rewrite markdown to replace embedded base64 image URIs with relative paths.

    The strategy is chosen solely by whether output_filenames is given:
        - Given: every image's exact 'image_base64' string is replaced by
          './<filename>', pairing images with output_filenames by position.
        - None: data URIs found by regex are replaced in document order.

    Args:
        markdown: The original markdown content with embedded base64 images.
        images: List of image dictionaries from the OCR response with
            'image_base64' keys.
        output_filenames: Optional list of filenames for exact-match
            replacement. Must match the length of images.

    Returns:
        Rewritten markdown with base64 URIs replaced by relative paths
        like './img_id.ext'.

    Raises:
        ValueError: If output_filenames is provided but its length differs
            from the length of images.
    """
    if output_filenames is None:
        # No filenames supplied: sequential regex replacement.
        return _rewrite_sequential(markdown, images)

    if len(output_filenames) != len(images):
        raise ValueError(
            f"output_filenames length ({len(output_filenames)}) "
            f"must match images length ({len(images)})"
        )

    # Filenames supplied: exact-match replacement.
    return _rewrite_exact_match(markdown, images, output_filenames)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _rewrite_exact_match(
    markdown: str, images: List[dict], output_filenames: List[str]
) -> str:
    """Replace each image's exact data URI string with './<filename>'.

    Images and filenames are paired by position. Images whose
    'image_base64' value is missing or empty are skipped.

    Args:
        markdown: Original markdown content.
        images: List of image dicts with 'image_base64' keys.
        output_filenames: List of filenames to use for replacement.

    Returns:
        Rewritten markdown.
    """
    rewritten = markdown
    for image, saved_name in zip(images, output_filenames):
        embedded_uri = image.get("image_base64")
        if not embedded_uri:
            continue
        # Every occurrence of the exact URI string is replaced.
        rewritten = rewritten.replace(embedded_uri, f"./{saved_name}")
    return rewritten
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _rewrite_sequential(markdown: str, images: List[dict]) -> str:
    """Replace data URIs in document order with './<id><ext>' paths.

    The n-th `data:image/...;base64,...` occurrence in the markdown is
    paired with the n-th entry of images. The extension comes from that
    entry's 'image_base64' data URI (or from the matched text when the key
    is absent) and defaults to '.png' when it cannot be determined.
    Occurrences beyond the number of images are left unchanged.

    The replacement is done in a single substitution pass. Editing the
    string in place while walking match offsets computed beforehand would
    shift every later offset and corrupt the output.

    Args:
        markdown: Original markdown content.
        images: List of image dicts with 'id' and 'image_base64' keys.

    Returns:
        Rewritten markdown.
    """
    # Import here to avoid circular dependency
    from .images import get_extension_from_mime, parse_data_uri

    # Matches: data:image/...;base64,... (case-insensitive), ending at a
    # quote or closing parenthesis.
    data_uri_pattern = re.compile(r'data:image/[^;]+;base64,[^"\'\)]+', re.IGNORECASE)

    seen = 0  # Number of data URIs encountered so far, in document order.

    def _replacement(match: "re.Match[str]") -> str:
        nonlocal seen
        index = seen
        seen += 1

        if index >= len(images):
            # More data URIs in markdown than images - leave extras as-is.
            return match.group(0)

        img = images[index]
        image_id = img.get("id", f"image_{index}")
        data_uri = img.get("image_base64", match.group(0))

        try:
            mime_type, _ = parse_data_uri(data_uri)
            ext = get_extension_from_mime(mime_type)
        except Exception:
            # Deliberate best effort: an unparseable URI must not abort the
            # rewrite, so fall back to .png.
            ext = ".png"

        return f"./{image_id}{ext}"

    return data_uri_pattern.sub(_replacement, markdown)
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Mistral OCR client adapter.
|
|
2
|
+
|
|
3
|
+
This module wraps the official `mistralai` SDK behind a small adapter function.
|
|
4
|
+
It centralizes:
|
|
5
|
+
- client initialization from environment-based config
|
|
6
|
+
- the upload -> signed URL -> OCR process flow
|
|
7
|
+
- consistent error normalization (FR-6)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from types import SimpleNamespace
|
|
14
|
+
from typing import Any, Optional
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
from mistralai import Mistral, models
|
|
18
|
+
except ModuleNotFoundError: # pragma: no cover
|
|
19
|
+
# Allow offline unit tests to inject a fake client without requiring the SDK.
|
|
20
|
+
Mistral = None # type: ignore[assignment]
|
|
21
|
+
models = SimpleNamespace( # type: ignore[assignment]
|
|
22
|
+
OCRResponse=Any,
|
|
23
|
+
MistralError=Exception,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from .config import load_config
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _mistral_error_types() -> tuple[type[BaseException], ...]:
    """Return the Mistral SDK exception types we normalize (FR-6.2).

    `models.MistralError` is always included; `models.SDKError` is appended
    only when the `models` namespace defines it.
    """
    optional_sdk_error = getattr(models, "SDKError", None)
    if optional_sdk_error is None:
        return (models.MistralError,)
    return (models.MistralError, optional_sdk_error)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _format_mistral_error(e: BaseException) -> str:
    """Build the normalized error message for a failed Mistral request.

    The text comes from the exception's `message` attribute, falling back
    to `str(e)`. The status code is included only when the exception has a
    `status_code` attribute that is not None.
    """
    status_code = getattr(e, "status_code", None)
    message = getattr(e, "message", str(e))

    status_part = "" if status_code is None else f" (status={status_code})"
    return f"Mistral OCR request failed{status_part}: {message}"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class MistralOCRClientError(RuntimeError):
    """Root of the exceptions raised by the Mistral OCR client adapter."""


class MistralOCRAPIError(MistralOCRClientError):
    """The Mistral API (or its SDK) reported an error."""


class MistralOCRFileError(MistralOCRClientError):
    """An operation on the local filesystem failed."""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def process_local_file(
    path: Path,
    *,
    include_image_base64: bool = False,
    client: Optional[Mistral] = None,
) -> models.OCRResponse:
    """Upload a local file to Mistral and run OCR on it.

    Flow:
        1) Upload the file via `client.files.upload(..., purpose="ocr")`.
        2) Fetch a signed URL via `client.files.get_signed_url(...)`.
        3) Call `client.ocr.process(...)`, passing the signed URL as a
           `document_url` when the path ends in .pdf (case-insensitive)
           and as an `image_url` otherwise.

    Args:
        path: Local filesystem path to a PDF or image.
        include_image_base64: Whether to include base64 images in the OCR
            response.
        client: Optional injected Mistral client (useful for unit tests).
            When omitted, a client is created from the loaded configuration.

    Returns:
        Whatever `client.ocr.process(...)` returns (the SDK's OCRResponse).

    Raises:
        MistralOCRAPIError: For SDK/API errors (includes status code + message).
        MistralOCRFileError: For an OSError raised while opening or
            uploading the file (includes the path).
        MistralOCRClientError: If no client is injected and the SDK is not
            installed.
    """

    def _run(sdk: Mistral) -> models.OCRResponse:
        # Step 1: upload. The file handle stays open only for the upload.
        try:
            with path.open("rb") as handle:
                upload_result = sdk.files.upload(
                    file={"file_name": path.name, "content": handle},
                    purpose="ocr",
                )
        except OSError as exc:
            raise MistralOCRFileError(
                f"Filesystem error during open/read for upload: path={path!s}"
            ) from exc
        except _mistral_error_types() as exc:
            raise MistralOCRAPIError(_format_mistral_error(exc)) from exc

        # Steps 2 and 3: signed URL, then the OCR call itself.
        try:
            signed = sdk.files.get_signed_url(file_id=upload_result.id)
            url_kind = "document_url" if path.suffix.lower() == ".pdf" else "image_url"
            return sdk.ocr.process(
                model="mistral-ocr-latest",
                document={"type": url_kind, url_kind: signed.url},
                include_image_base64=bool(include_image_base64),
            )
        except _mistral_error_types() as exc:
            raise MistralOCRAPIError(_format_mistral_error(exc)) from exc

    if client is None:
        if Mistral is None:
            raise MistralOCRClientError(
                "mistralai SDK is required when no client is injected"
            )
        settings = load_config()
        with Mistral(api_key=settings.api_key) as sdk_client:
            return _run(sdk_client)

    return _run(client)
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Path sandbox validation for Mistral OCR MCP server.
|
|
2
|
+
|
|
3
|
+
This module provides validation for file paths and output directories,
|
|
4
|
+
ensuring they are within the allowed directory sandbox.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import tempfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Set
|
|
11
|
+
|
|
12
|
+
# File suffixes (lower-case, with the leading dot) accepted as OCR input.
SUPPORTED_EXTENSIONS: Set[str] = {".pdf", ".png", ".jpg", ".jpeg", ".webp", ".gif"}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PathValidationError(Exception):
    """Signals that a file path or output directory failed validation."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def validate_file_path(file_path: str) -> Path:
    """Validate and canonicalize an input file path.

    Args:
        file_path: Absolute path to the input file.

    Returns:
        Resolved canonical Path to the file.

    Raises:
        PathValidationError: If the path is not absolute, does not exist,
            cannot be resolved (permission problem, symlink loop, a path
            component that is not a directory), is not a regular file, or
            has an unsupported extension.
    """
    path = Path(file_path)

    if not path.is_absolute():
        raise PathValidationError(
            f"validate file_path: must be an absolute path: {file_path}"
        )

    # Canonicalize and check existence in one step.
    try:
        resolved_path = path.resolve(strict=True)
    except FileNotFoundError as e:
        raise PathValidationError(
            f"validate file_path: resolve failed, path does not exist: {file_path}"
        ) from e
    except (OSError, RuntimeError) as e:
        # RuntimeError: symlink loop on Python < 3.13. OSError: symlink loop
        # on Python >= 3.13, plus PermissionError / NotADirectoryError.
        raise PathValidationError(
            f"validate file_path: resolve failed: {file_path} - {e}"
        ) from e

    # A directory (or other non-regular file) named e.g. "scan.pdf" would
    # otherwise pass validation and only fail later when opened for upload.
    if not resolved_path.is_file():
        raise PathValidationError(
            f"validate file_path: path is not a regular file: {file_path}"
        )

    # The extension is checked on the resolved path, so a symlink is judged
    # by its target's name.
    if resolved_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
        raise PathValidationError(
            f"validate file_path: unsupported file type '{resolved_path.suffix}'. "
            f"Supported types: .pdf, .png, .jpg, .jpeg, .webp, .gif. Path: {file_path}"
        )

    return resolved_path
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def validate_output_dir(
    output_dir: str,
    allowed_dir_resolved: Path,
    allowed_dir_original: str,
) -> Path:
    """Validate and canonicalize an output directory path.

    The sandbox check runs before anything is written: the writability
    probe creates a temporary file, and that must never happen in a
    directory outside the allowed directory.

    Args:
        output_dir: Absolute path to the output directory.
        allowed_dir_resolved: Canonical path to the allowed directory.
        allowed_dir_original: Original string from the environment (used
            only in error messages).

    Returns:
        Resolved canonical Path to the output directory.

    Raises:
        PathValidationError: If the path is not absolute, does not exist,
            cannot be resolved, lies outside the allowed directory, is not
            a directory, or is not writable.
    """
    path = Path(output_dir)

    if not path.is_absolute():
        raise PathValidationError(
            f"validate output_dir: must be an absolute path: {output_dir}"
        )

    # Canonicalize and check existence in one step.
    try:
        resolved_path = path.resolve(strict=True)
    except FileNotFoundError as e:
        raise PathValidationError(
            f"validate output_dir: resolve failed, path does not exist: {output_dir}"
        ) from e
    except (OSError, RuntimeError) as e:
        # RuntimeError: symlink loop on Python < 3.13. OSError: symlink loop
        # on Python >= 3.13, plus PermissionError / NotADirectoryError.
        raise PathValidationError(
            f"validate output_dir: resolve failed: {output_dir} - {e}"
        ) from e

    # Sandbox enforcement: output_dir must be the allowed directory or a
    # descendant of it. Checked on the resolved path so symlinks cannot be
    # used to escape.
    try:
        resolved_path.relative_to(allowed_dir_resolved)
    except ValueError as e:
        raise PathValidationError(
            f"output_dir must be within the allowed directory: {allowed_dir_original}"
        ) from e

    if not resolved_path.is_dir():
        raise PathValidationError(
            f"validate output_dir: path is not a directory: {output_dir}"
        )

    # Writability probe: mkstemp creates a uniquely named file atomically,
    # which avoids predictable names and pre-existing symlinks.
    try:
        fd, temp_path = tempfile.mkstemp(dir=str(resolved_path))
        try:
            os.close(fd)
        finally:
            # Remove the probe file even if closing the descriptor failed.
            os.unlink(temp_path)
    except PermissionError as e:
        raise PathValidationError(
            f"validate output_dir: writability check failed, directory not writable: {output_dir}"
        ) from e
    except OSError as e:
        raise PathValidationError(
            f"validate output_dir: writability check failed: {output_dir} - {e}"
        ) from e

    return resolved_path
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""MCP server implementation for Mistral OCR."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from mcp.server.fastmcp import FastMCP
|
|
6
|
+
|
|
7
|
+
from .extraction import extract_markdown, extract_markdown_with_images
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Module-wide FastMCP application object named "Mistral OCR"; the
# @mcp.tool decorators in this module attach their tools to it.
mcp = FastMCP("Mistral OCR")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# NOTE(review): FastMCP presumably publishes this function's docstring to
# clients as the tool description -- confirm before rewording it.
@mcp.tool(name="extract_markdown")
def extract_markdown_tool(file_path: str) -> str:
    """Extract markdown text from a PDF or image file.

    Args:
        file_path: Absolute path to the input file (PDF or image)

    Returns:
        Extracted markdown content as a string
    """
    # Thin MCP adapter: the actual work is delegated to
    # extraction.extract_markdown, imported at module level.
    markdown = extract_markdown(file_path)
    return markdown
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# NOTE(review): FastMCP presumably publishes this function's docstring to
# clients as the tool description -- confirm before rewording it.
@mcp.tool(name="extract_markdown_with_images")
def extract_markdown_with_images_tool(
    file_path: str, output_dir: str
) -> dict[str, Any]:
    """Extract markdown with embedded images and save them as separate files.

    Args:
        file_path: Absolute path to the input file (PDF or image)
        output_dir: Absolute path to an existing output directory (must be within allowed dir)

    Returns:
        Dictionary with:
        - output_directory: Absolute path to the output subdirectory
        - markdown_file: Absolute path to the content.md file
        - images: List of saved image filenames (not full paths)
    """
    # Thin MCP adapter: hand both arguments straight to the extraction layer
    # and return its result unchanged.
    return extract_markdown_with_images(file_path, output_dir)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def list_tools_impl() -> list[str]:
    """Return the names of the tools this module exposes (test helper).

    Returns:
        A new list holding the two tool names, in the order
        "extract_markdown", "extract_markdown_with_images".
    """
    tool_names = [
        "extract_markdown",
        "extract_markdown_with_images",
    ]
    return tool_names
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def call_tool_impl(name: str, arguments: dict[str, Any]) -> Any:
    """Invoke a tool's implementation directly by name (test helper).

    Args:
        name: Tool name, either "extract_markdown" or
            "extract_markdown_with_images".
        arguments: Mapping of argument names to values for that tool.

    Returns:
        Whatever the underlying extraction function returns.

    Raises:
        ValueError: If the tool name is unknown, or if a required argument
            ("file_path", and for the image variant "output_dir") is absent
            from ``arguments``.
    """
    with_images = name == "extract_markdown_with_images"
    if not with_images and name != "extract_markdown":
        raise ValueError(f"Unknown tool: {name}")

    # Required keys are checked in call order, so the first missing one is
    # the one reported.
    required = ("file_path", "output_dir") if with_images else ("file_path",)
    for key in required:
        if key not in arguments:
            raise ValueError(f"Missing required argument: {key}")

    file_path = arguments["file_path"]
    if with_images:
        return extract_markdown_with_images(file_path, arguments["output_dir"])
    return extract_markdown(file_path)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def run() -> None:
    """Start the MCP server.

    Synchronous entry point that calls ``mcp.run()`` with its default
    arguments; per the project README the server then runs in stdio mode
    and waits for MCP client connections.
    """
    mcp.run()
|
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mistral-ocr-mcp
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: MCP server for extracting text and images from documents using Mistral OCR API
|
|
5
|
+
Project-URL: Homepage, https://github.com/ORDIS-Co-Ltd/mistral-ocr-mcp
|
|
6
|
+
Project-URL: Repository, https://github.com/ORDIS-Co-Ltd/mistral-ocr-mcp
|
|
7
|
+
Author: Ordis
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Requires-Dist: mcp>=1.0.0
|
|
11
|
+
Requires-Dist: mistralai>=1.0.0
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# Mistral OCR MCP Server
|
|
17
|
+
|
|
18
|
+
A Model Context Protocol (MCP) server that provides tools for extracting text and images from PDF and image files using the Mistral OCR API.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- **Simple Text Extraction**: Extract markdown content from documents without handling images
|
|
23
|
+
- **Full Extraction with Images**: Extract markdown and save embedded images to disk with proper relative links
|
|
24
|
+
- **Security Sandbox**: Restricts file writes to a configured allowed directory
|
|
25
|
+
- **Zero-Install Deployment**: Run with `uvx` without prior installation
|
|
26
|
+
- **Supported Formats**: PDF (`.pdf`), PNG (`.png`), JPEG (`.jpg`, `.jpeg`), WebP (`.webp`), GIF (`.gif`)
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Quickstart
|
|
31
|
+
|
|
32
|
+
Run the server directly with `uvx` (no installation required):
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
MISTRAL_API_KEY="your-api-key-here" \
|
|
36
|
+
MISTRAL_OCR_ALLOWED_DIR="/absolute/path/to/allowed/directory" \
|
|
37
|
+
uvx mistral-ocr-mcp
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**Important**: `MISTRAL_OCR_ALLOWED_DIR` must be:
|
|
41
|
+
- An **absolute path** (e.g., `/Users/username/documents`, not `~/documents`)
|
|
42
|
+
- An **existing directory** on your filesystem
|
|
43
|
+
- The location where you want to allow the server to write extracted images
|
|
44
|
+
|
|
45
|
+
The server will start in stdio mode and wait for MCP client connections.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
### For Use with MCP Clients
|
|
52
|
+
|
|
53
|
+
Install via pip:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install mistral-ocr-mcp
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Then configure your MCP client (e.g., Claude Desktop) to run:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
mistral-ocr-mcp
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### For Development
|
|
66
|
+
|
|
67
|
+
Clone the repository and install with development dependencies:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
git clone https://github.com/ORDIS-Co-Ltd/mistral-ocr-mcp
|
|
71
|
+
cd mistral-ocr-mcp
|
|
72
|
+
pip install -e '.[dev]'
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Run the server:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
MISTRAL_API_KEY="your-key" \
|
|
79
|
+
MISTRAL_OCR_ALLOWED_DIR="/path/to/allowed/dir" \
|
|
80
|
+
python -m mistral_ocr_mcp
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Configuration
|
|
86
|
+
|
|
87
|
+
### Required Environment Variables
|
|
88
|
+
|
|
89
|
+
| Variable | Description | Example |
|
|
90
|
+
|----------|-------------|---------|
|
|
91
|
+
| `MISTRAL_API_KEY` | Your Mistral API key (never logged) | `sk-abc123...` |
|
|
92
|
+
| `MISTRAL_OCR_ALLOWED_DIR` | Absolute path to allowed write directory | `/Users/username/workdir` |
|
|
93
|
+
|
|
94
|
+
### Security Sandbox
|
|
95
|
+
|
|
96
|
+
The server enforces a **write directory sandbox** to prevent unauthorized file writes:
|
|
97
|
+
|
|
98
|
+
- **`extract_markdown`**: Not subject to the sandbox, because it writes no files (read-only operation)
|
|
99
|
+
- **`extract_markdown_with_images`**: The `output_dir` parameter **must** be within `MISTRAL_OCR_ALLOWED_DIR`
|
|
100
|
+
|
|
101
|
+
**Validation Examples:**
|
|
102
|
+
|
|
103
|
+
| `MISTRAL_OCR_ALLOWED_DIR` | `output_dir` | Result |
|
|
104
|
+
|---------------------------|--------------|--------|
|
|
105
|
+
| `/Users/username/workdir` | `/Users/username/workdir/project/output` | ✅ Allowed |
|
|
106
|
+
| `/Users/username/workdir` | `/Users/username/workdir` | ✅ Allowed (exact match) |
|
|
107
|
+
| `/Users/username/workdir` | `/Users/username/documents` | ❌ Rejected |
|
|
108
|
+
| `/Users/username/workdir` | `/Users/username/workdir/../documents` | ❌ Rejected (resolves outside) |
|
|
109
|
+
|
|
110
|
+
**Security Notes:**
|
|
111
|
+
- All paths are canonicalized (symlinks resolved, `..` eliminated) before validation
|
|
112
|
+
- Image filenames are sanitized to prevent path traversal attacks
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Client Configuration
|
|
117
|
+
|
|
118
|
+
### Claude Desktop
|
|
119
|
+
|
|
120
|
+
Add this to your `claude_desktop_config.json`:
|
|
121
|
+
|
|
122
|
+
- **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
|
|
123
|
+
- **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
|
|
124
|
+
|
|
125
|
+
```json
|
|
126
|
+
{
|
|
127
|
+
"mcpServers": {
|
|
128
|
+
"mistral-ocr": {
|
|
129
|
+
"command": "uvx",
|
|
130
|
+
"args": ["mistral-ocr-mcp"],
|
|
131
|
+
"env": {
|
|
132
|
+
"MISTRAL_API_KEY": "your-api-key-here",
|
|
133
|
+
"MISTRAL_OCR_ALLOWED_DIR": "/absolute/path/to/allowed/directory"
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### OpenCode
|
|
141
|
+
|
|
142
|
+
Add this to the `mcp` section of your configuration file:
|
|
143
|
+
|
|
144
|
+
```json
|
|
145
|
+
{
|
|
146
|
+
"mcp": {
|
|
147
|
+
"mistral-ocr": {
|
|
148
|
+
"type": "local",
|
|
149
|
+
"command": ["uvx", "mistral-ocr-mcp"],
|
|
150
|
+
"enabled": true,
|
|
151
|
+
"environment": {
|
|
152
|
+
"MISTRAL_API_KEY": "your-api-key-here",
|
|
153
|
+
"MISTRAL_OCR_ALLOWED_DIR": "/absolute/path/to/allowed/directory"
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Codex
|
|
161
|
+
|
|
162
|
+
If you use the Codex CLI, you can add the server with:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
codex mcp add mistral-ocr -- uvx mistral-ocr-mcp
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Make sure the environment variables `MISTRAL_API_KEY` and `MISTRAL_OCR_ALLOWED_DIR` are set in your shell environment.
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Tool Reference
|
|
173
|
+
|
|
174
|
+
### Tool 1: `extract_markdown`
|
|
175
|
+
|
|
176
|
+
Extract markdown content from a document **without** saving images.
|
|
177
|
+
|
|
178
|
+
**Arguments:**
|
|
179
|
+
|
|
180
|
+
```json
|
|
181
|
+
{
|
|
182
|
+
"file_path": "/absolute/path/to/document.pdf"
|
|
183
|
+
}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
| Parameter | Type | Required | Description |
|
|
187
|
+
|-----------|------|----------|-------------|
|
|
188
|
+
| `file_path` | `string` | Yes | Absolute path to input file (PDF or image) |
|
|
189
|
+
|
|
190
|
+
**Returns:**
|
|
191
|
+
|
|
192
|
+
```json
|
|
193
|
+
"# Document Title\n\nExtracted markdown content from all pages..."
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Returns a single string containing concatenated markdown from all pages.
|
|
197
|
+
|
|
198
|
+
**Example:**
|
|
199
|
+
|
|
200
|
+
```json
|
|
201
|
+
{
|
|
202
|
+
"tool": "extract_markdown",
|
|
203
|
+
"arguments": {
|
|
204
|
+
"file_path": "/Users/username/documents/report.pdf"
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
### Tool 2: `extract_markdown_with_images`
|
|
212
|
+
|
|
213
|
+
Extract markdown content **and** save embedded images to disk.
|
|
214
|
+
|
|
215
|
+
**Arguments:**
|
|
216
|
+
|
|
217
|
+
```json
|
|
218
|
+
{
|
|
219
|
+
"file_path": "/absolute/path/to/document.pdf",
|
|
220
|
+
"output_dir": "/absolute/path/to/output/parent"
|
|
221
|
+
}
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
| Parameter | Type | Required | Description |
|
|
225
|
+
|-----------|------|----------|-------------|
|
|
226
|
+
| `file_path` | `string` | Yes | Absolute path to input file (PDF or image) |
|
|
227
|
+
| `output_dir` | `string` | Yes | Absolute path to output parent directory (must exist and be writable, must be within `MISTRAL_OCR_ALLOWED_DIR`) |
|
|
228
|
+
|
|
229
|
+
**Returns:**
|
|
230
|
+
|
|
231
|
+
```json
|
|
232
|
+
{
|
|
233
|
+
"output_directory": "/absolute/path/to/output/parent/document",
|
|
234
|
+
"markdown_file": "/absolute/path/to/output/parent/document/content.md",
|
|
235
|
+
"images": ["img_abc123.png", "img_def456.jpeg"]
|
|
236
|
+
}
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
| Field | Type | Description |
|
|
240
|
+
|-------|------|-------------|
|
|
241
|
+
| `output_directory` | `string` | Absolute path to created subdirectory |
|
|
242
|
+
| `markdown_file` | `string` | Absolute path to `content.md` file |
|
|
243
|
+
| `images` | `array[string]` | List of saved image filenames (not full paths) |
|
|
244
|
+
|
|
245
|
+
**Behavior:**
|
|
246
|
+
|
|
247
|
+
1. Creates a subdirectory named after the input file stem (e.g., `report` for `report.pdf`)
|
|
248
|
+
2. If the subdirectory already exists, appends a timestamp: `report_20260102_143022`
|
|
249
|
+
3. Saves all extracted images as `<sanitized_id>.<ext>` (e.g., `img_abc123.png`)
|
|
250
|
+
4. Saves markdown to `content.md` with relative image links (e.g., `![img_abc123](img_abc123.png)`)
|
|
251
|
+
|
|
252
|
+
**Example:**
|
|
253
|
+
|
|
254
|
+
```json
|
|
255
|
+
{
|
|
256
|
+
"tool": "extract_markdown_with_images",
|
|
257
|
+
"arguments": {
|
|
258
|
+
"file_path": "/Users/username/documents/quarterly-report.pdf",
|
|
259
|
+
"output_dir": "/Users/username/workdir/extracted"
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
**Output Structure:**
|
|
265
|
+
|
|
266
|
+
```
|
|
267
|
+
/Users/username/workdir/extracted/
|
|
268
|
+
quarterly-report/
|
|
269
|
+
content.md # Markdown with relative image links
|
|
270
|
+
img_abc123.png # First extracted image
|
|
271
|
+
img_def456.jpeg # Second extracted image
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## Example Client Usage
|
|
277
|
+
|
|
278
|
+
Here's a minimal Python example using the MCP SDK to call the tools:
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
import asyncio
|
|
282
|
+
from mcp import ClientSession, StdioServerParameters
|
|
283
|
+
from mcp.client.stdio import stdio_client
|
|
284
|
+
|
|
285
|
+
async def extract_document():
|
|
286
|
+
server_params = StdioServerParameters(
|
|
287
|
+
command="mistral-ocr-mcp",
|
|
288
|
+
env={
|
|
289
|
+
"MISTRAL_API_KEY": "your-api-key",
|
|
290
|
+
"MISTRAL_OCR_ALLOWED_DIR": "/Users/username/workdir"
|
|
291
|
+
}
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
async with stdio_client(server_params) as (read, write):
|
|
295
|
+
async with ClientSession(read, write) as session:
|
|
296
|
+
await session.initialize()
|
|
297
|
+
|
|
298
|
+
# Simple extraction
|
|
299
|
+
result = await session.call_tool(
|
|
300
|
+
"extract_markdown",
|
|
301
|
+
arguments={"file_path": "/path/to/document.pdf"}
|
|
302
|
+
)
|
|
303
|
+
print(result.content[0].text)
|
|
304
|
+
|
|
305
|
+
# Extraction with images
|
|
306
|
+
result = await session.call_tool(
|
|
307
|
+
"extract_markdown_with_images",
|
|
308
|
+
arguments={
|
|
309
|
+
"file_path": "/path/to/document.pdf",
|
|
310
|
+
"output_dir": "/Users/username/workdir/output"
|
|
311
|
+
}
|
|
312
|
+
)
|
|
313
|
+
print(result.content[0].text)
|
|
314
|
+
|
|
315
|
+
asyncio.run(extract_document())
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
---
|
|
319
|
+
|
|
320
|
+
## Troubleshooting
|
|
321
|
+
|
|
322
|
+
| Error | Cause | Solution |
|
|
323
|
+
|-------|-------|----------|
|
|
324
|
+
| `Missing required environment variable: MISTRAL_API_KEY` | `MISTRAL_API_KEY` not set | Set the environment variable before running the server |
|
|
325
|
+
| `Missing required environment variable: MISTRAL_OCR_ALLOWED_DIR` | `MISTRAL_OCR_ALLOWED_DIR` not set | Set the environment variable to an absolute path |
|
|
326
|
+
| `MISTRAL_OCR_ALLOWED_DIR must be an absolute path` | Relative path or unexpanded `~` provided (e.g., `~/documents`) | Use a fully expanded absolute path (e.g., `/Users/username/documents`) |
|
|
327
|
+
| `MISTRAL_OCR_ALLOWED_DIR does not exist` | Directory does not exist on filesystem | Create the directory first: `mkdir -p /path/to/dir` |
|
|
328
|
+
| `MISTRAL_OCR_ALLOWED_DIR is not a directory` | Path points to a file, not a directory | Ensure the path is a directory |
|
|
329
|
+
| `validate file_path: must be an absolute path: {path}` | Relative path provided for input file | Use an absolute path (e.g., `/Users/username/file.pdf`) |
|
|
330
|
+
| `validate file_path: resolve failed, path does not exist: {path}` | Input file does not exist | Check the file path and ensure the file exists |
|
|
331
|
+
| `validate file_path: unsupported file type '{suffix}'. Supported types: ...` | File extension not supported | Use `.pdf`, `.png`, `.jpg`, `.jpeg`, `.webp`, or `.gif` |
|
|
332
|
+
| `validate output_dir: resolve failed, path does not exist: {path}` | Output directory does not exist | Create the directory first: `mkdir -p {path}` |
|
|
333
|
+
| `validate output_dir: path is not a directory: {path}` | Path points to a file, not a directory | Ensure the path is a directory |
|
|
334
|
+
| `validate output_dir: writability check failed, directory not writable: {path}` | Output directory exists but is not writable | Check directory permissions: `chmod u+w {path}` |
|
|
335
|
+
| `output_dir must be within the allowed directory` | `output_dir` is outside `MISTRAL_OCR_ALLOWED_DIR` | Use a path within the allowed directory |
|
|
336
|
+
| `Mistral OCR request failed (status=401): {message}` | Invalid API key | Check your `MISTRAL_API_KEY` |
|
|
337
|
+
| `Mistral OCR request failed (status=429): {message}` | Rate limit exceeded | Wait and retry, or check your API quota |
|
|
338
|
+
|
|
339
|
+
---
|
|
340
|
+
|
|
341
|
+
## Development
|
|
342
|
+
|
|
343
|
+
### Install Development Dependencies
|
|
344
|
+
|
|
345
|
+
```bash
|
|
346
|
+
pip install -e '.[dev]'
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### Run Tests
|
|
350
|
+
|
|
351
|
+
Run the full test suite:
|
|
352
|
+
|
|
353
|
+
```bash
|
|
354
|
+
pytest
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
Run tests with verbose output:
|
|
358
|
+
|
|
359
|
+
```bash
|
|
360
|
+
pytest -v
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
Run tests in quiet mode:
|
|
364
|
+
|
|
365
|
+
```bash
|
|
366
|
+
pytest -q
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
### Project Structure
|
|
370
|
+
|
|
371
|
+
```
|
|
372
|
+
mistral-ocr-mcp/
|
|
373
|
+
├── src/
|
|
374
|
+
│ └── mistral_ocr_mcp/
|
|
375
|
+
│ ├── __init__.py
|
|
376
|
+
│ ├── __main__.py # Entry point
|
|
377
|
+
│ ├── server.py # MCP server and tool definitions
|
|
378
|
+
│ ├── config.py # Configuration loading and validation
|
|
379
|
+
│ ├── extraction.py # OCR orchestration logic
|
|
380
|
+
│ ├── mistral_client.py # Mistral API client
|
|
381
|
+
│ ├── images.py # Image parsing and saving
|
|
382
|
+
│ ├── markdown_rewrite.py # Markdown link rewriting
|
|
383
|
+
│ └── path_sandbox.py # Path validation and sandbox enforcement
|
|
384
|
+
├── tests/ # Unit tests
|
|
385
|
+
├── pyproject.toml # Package configuration
|
|
386
|
+
└── README.md # This file
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
---
|
|
390
|
+
|
|
391
|
+
## License
|
|
392
|
+
|
|
393
|
+
MIT
|
|
394
|
+
|
|
395
|
+
---
|
|
396
|
+
|
|
397
|
+
## Contributing
|
|
398
|
+
|
|
399
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
400
|
+
|
|
401
|
+
---
|
|
402
|
+
|
|
403
|
+
## Links
|
|
404
|
+
|
|
405
|
+
- **GitHub Repository**: https://github.com/ORDIS-Co-Ltd/mistral-ocr-mcp
|
|
406
|
+
- **MCP Specification**: https://modelcontextprotocol.io
|
|
407
|
+
- **Mistral AI**: https://mistral.ai
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
mistral_ocr_mcp/__init__.py,sha256=UwK-luoFhJsyps4rpij6BGZiWyxMXX-AL2q_7tWLF5k,196
|
|
2
|
+
mistral_ocr_mcp/__main__.py,sha256=pDlN6SI9E51xUGgrAy6pJC_F1vEqI3vjIvDIGFOBqZ0,696
|
|
3
|
+
mistral_ocr_mcp/config.py,sha256=eHm-eQq2jIbuQoLDPUsi8llE0HKyd1v_rYGrknfiAhM,2658
|
|
4
|
+
mistral_ocr_mcp/extraction.py,sha256=wbRqjMGY86CiDPqEKYMP1rRYMJsxUDXG_PNBx9PeYRk,5408
|
|
5
|
+
mistral_ocr_mcp/images.py,sha256=aIb3k8bNDtdPZ0zg19YJzB_aLWrb0pCw4UOJdYa8Xxk,5357
|
|
6
|
+
mistral_ocr_mcp/markdown_rewrite.py,sha256=ktLnqEo1zwmMBE89N6VI557hAEsknANBC2sSs_ArauI,4245
|
|
7
|
+
mistral_ocr_mcp/mistral_client.py,sha256=aCo6HjmibTuID3itHqGdVfxBpie9kMteVx3GsS6l3SY,4223
|
|
8
|
+
mistral_ocr_mcp/path_sandbox.py,sha256=Xi5CuyTQvZ0L_9woTvkmttsbzdz6Sd8oRnlwjX79Eag,4377
|
|
9
|
+
mistral_ocr_mcp/server.py,sha256=425FeAvFNLRGDXZm7a4g5ADQpMWNSp1_EWkrXlPJnO0,2585
|
|
10
|
+
mistral_ocr_mcp-0.1.3.dist-info/METADATA,sha256=n8qinc4uKnB39GbH8XVpdKR0GNVQcqO4aYm9qyPD2tY,12059
|
|
11
|
+
mistral_ocr_mcp-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
12
|
+
mistral_ocr_mcp-0.1.3.dist-info/entry_points.txt,sha256=J-qJQ5P8-pJ6a8W7KXaesUuMFU3rFgYNyAsMCanQoqs,66
|
|
13
|
+
mistral_ocr_mcp-0.1.3.dist-info/RECORD,,
|