mistral-ocr-mcp 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ """Mistral OCR MCP Server.
2
+
3
+ A Model Context Protocol (MCP) server that provides tools for extracting
4
+ text and images from PDF and image files using the Mistral OCR API.
5
+ """
6
+
7
# Package version; kept in sync with the distribution version declared in the
# package metadata (0.1.3) instead of the stale initial value.
__version__ = "0.1.3"
@@ -0,0 +1,27 @@
1
+ """MCP server implementation for Mistral OCR."""
2
+
3
+ import sys
4
+ from .config import ConfigurationError, load_config
5
+ from .server import run
6
+
7
+
8
def main() -> int:
    """Validate configuration, run the MCP server, and report an exit status.

    Returns:
        0 when configuration loads and the server run returns normally;
        1 when either step raises (the error is printed to stderr).
    """
    exit_code = 1
    try:
        # Fail fast on bad configuration before the server starts.
        load_config()
        run()
    except ConfigurationError as exc:
        print(f"Configuration error: {exc}", file=sys.stderr)
    except NotImplementedError as exc:
        print(f"Error: {exc}", file=sys.stderr)
    except Exception as exc:
        # Top-level boundary: report anything else instead of a traceback.
        print(f"Unexpected error: {exc}", file=sys.stderr)
    else:
        exit_code = 0
    return exit_code
24
+
25
+
26
+ if __name__ == "__main__":
27
+ sys.exit(main())
@@ -0,0 +1,88 @@
1
+ """Configuration module for Mistral OCR MCP server.
2
+
3
+ This module loads and validates environment variables required for the server.
4
+ """
5
+
6
+ import os
7
+ from pathlib import Path
8
+ from typing import NamedTuple
9
+
10
+
11
class Config(NamedTuple):
    """Immutable settings bundle for the Mistral OCR MCP server.

    Fields, in positional order:
        api_key: Mistral API key (must never be logged).
        allowed_dir_original: The allowed directory exactly as given in the
            environment.
        allowed_dir_resolved: Canonical, resolved path of the allowed directory.
    """

    # Secret credential for the Mistral API.
    api_key: str
    # Raw environment value, kept for user-facing messages.
    allowed_dir_original: str
    # Canonical form used for sandbox comparisons.
    allowed_dir_resolved: Path
23
+
24
+
25
class ConfigurationError(Exception):
    """Signals that server configuration is missing or invalid."""
29
+
30
+
31
def load_config() -> Config:
    """Load and validate configuration from environment variables.

    Reads:
        - MISTRAL_API_KEY: Required API key for Mistral OCR service
        - MISTRAL_OCR_ALLOWED_DIR: Required absolute path to allowed directory

    Returns:
        Config object with the API key, the original allowed-directory string,
        and its resolved canonical path.

    Raises:
        ConfigurationError: If a required environment variable is missing or
            empty, or if the allowed directory is not absolute, does not
            exist, cannot be resolved, or is not a directory.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise ConfigurationError(
            "Missing required environment variable: MISTRAL_API_KEY"
        )

    allowed_dir_str = os.getenv("MISTRAL_OCR_ALLOWED_DIR")
    if not allowed_dir_str:
        raise ConfigurationError(
            "Missing required environment variable: MISTRAL_OCR_ALLOWED_DIR"
        )

    # Verify it's an absolute path BEFORE canonicalization (SRS FR-5.3)
    raw_path = Path(allowed_dir_str)
    if not raw_path.is_absolute():
        raise ConfigurationError(
            f"MISTRAL_OCR_ALLOWED_DIR must be an absolute path: {allowed_dir_str}"
        )

    # Validate and canonicalize the allowed directory.
    try:
        allowed_dir = raw_path.resolve(strict=True)
    except FileNotFoundError as e:
        raise ConfigurationError(
            f"MISTRAL_OCR_ALLOWED_DIR does not exist: {allowed_dir_str}"
        ) from e
    except (OSError, RuntimeError) as e:
        # RuntimeError: symlink loops on older Python versions.
        # OSError: symlink loops on Python 3.13+, permission problems, or a
        # non-directory path component. All are configuration problems.
        raise ConfigurationError(
            f"Invalid MISTRAL_OCR_ALLOWED_DIR: {allowed_dir_str} - {e}"
        ) from e

    if not allowed_dir.is_dir():
        raise ConfigurationError(
            f"MISTRAL_OCR_ALLOWED_DIR is not a directory: {allowed_dir_str}"
        )

    return Config(
        api_key=api_key,
        allowed_dir_original=allowed_dir_str,
        allowed_dir_resolved=allowed_dir,
    )
@@ -0,0 +1,159 @@
1
+ """Extraction orchestration for Mistral OCR MCP server.
2
+
3
+ This module provides the main extraction functions that orchestrate OCR calls,
4
+ image saving, and markdown rewriting.
5
+ """
6
+
7
+ import datetime
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List
10
+
11
+ from .config import load_config
12
+ from .images import save_images
13
+ from .markdown_rewrite import rewrite_markdown
14
+ from .mistral_client import process_local_file
15
+ from .path_sandbox import PathValidationError, validate_file_path, validate_output_dir
16
+
17
+
18
def extract_markdown(file_path: str) -> str:
    """Run OCR on a file and return its markdown text, without images.

    Args:
        file_path: Absolute path to the input file (PDF or image)

    Returns:
        The markdown of every page, joined with a blank line between pages

    Raises:
        PathValidationError: If file_path is invalid
        MistralOCRAPIError: If the OCR API call fails
        MistralOCRFileError: If filesystem operations fail
    """
    source = validate_file_path(file_path)

    # Images are not requested for the text-only tool.
    ocr_result = process_local_file(source, include_image_base64=False)

    return "\n\n".join(page.markdown for page in ocr_result.pages)
41
+
42
+
43
def extract_markdown_with_images(file_path: str, output_dir: str) -> Dict[str, Any]:
    """Run OCR on a file, save its images, and write the rewritten markdown.

    Steps, in order:
        1. Load config and validate file_path and output_dir (sandboxed)
        2. Create a unique output subdirectory named after the input file
        3. Call OCR with include_image_base64=True
        4. Save the returned images into the subdirectory
        5. Rewrite the markdown so image references point at the saved files
        6. Write the result to content.md in the subdirectory

    Args:
        file_path: Absolute path to the input file (PDF or image)
        output_dir: Absolute path to the output directory (must be within allowed dir)

    Returns:
        Dictionary with keys:
            - output_directory: Absolute path to the output subdirectory
            - markdown_file: Absolute path to the content.md file
            - images: List of saved image filenames (not full paths)

    Raises:
        PathValidationError: If file_path or output_dir is invalid
        MistralOCRAPIError: If the OCR API call fails
        MistralOCRFileError: If filesystem operations fail
    """
    config = load_config()
    source = validate_file_path(file_path)
    target_root = validate_output_dir(
        output_dir,
        config.allowed_dir_resolved,
        config.allowed_dir_original,
    )

    # The subdirectory is created before the OCR call, as in the documented flow.
    output_subdir = _create_output_subdirectory(target_root, source)

    response = process_local_file(source, include_image_base64=True)

    # Flatten per-page images into plain dicts where the object supports it.
    images: List[dict] = []
    for page in response.pages:
        page_images = getattr(page, "images", None)
        if not page_images:
            continue
        for img in page_images:
            images.append(img.model_dump() if hasattr(img, "model_dump") else img)

    saved_filenames = save_images(output_subdir, images)

    markdown_content = "\n\n".join(page.markdown for page in response.pages)
    rewritten_markdown = rewrite_markdown(markdown_content, images, saved_filenames)

    markdown_file_path = output_subdir / "content.md"
    markdown_file_path.write_text(rewritten_markdown, encoding="utf-8")

    return {
        "output_directory": str(output_subdir),
        "markdown_file": str(markdown_file_path),
        "images": saved_filenames,
    }
122
+
123
+
124
def _create_output_subdirectory(output_dir: Path, file_path: Path) -> Path:
    """Create a unique output subdirectory for a file's extracted content.

    The subdirectory name is the input file's stem (name without extension).
    If that name is already taken, a timestamp suffix in the format
    _YYYYMMDD_HHMMSS is appended; if the timestamped name is taken too, the
    function waits briefly and retries with a fresh timestamp.

    Directories are claimed with ``mkdir(exist_ok=False)`` so that the
    existence check and the creation are a single step: two callers racing for
    the same name can never both receive the same directory.

    Args:
        output_dir: The validated output directory
        file_path: The validated input file path

    Returns:
        Path to the newly created output subdirectory
    """
    import time  # local import: only needed on the rare collision path

    def _claim(candidate: Path) -> bool:
        """Try to create *candidate*; return False if the name already exists."""
        try:
            candidate.mkdir(parents=True, exist_ok=False)
        except FileExistsError:
            return False
        return True

    base_name = file_path.stem

    plain_path = output_dir / base_name
    if _claim(plain_path):
        return plain_path

    # Base name is taken: fall back to timestamped names until one is free.
    while True:
        timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")
        timestamped_path = output_dir / f"{base_name}{timestamp}"
        if _claim(timestamped_path):
            return timestamped_path

        # The timestamp has one-second resolution, so a collision only clears
        # once the clock ticks; avoid a tight spin while waiting.
        time.sleep(0.05)
@@ -0,0 +1,175 @@
1
+ """Image handling for Mistral OCR MCP server.
2
+
3
+ This module handles parsing and saving base64-encoded images from OCR responses.
4
+ """
5
+
6
+ import base64
7
+ import binascii
8
+ import re
9
+ from pathlib import Path
10
+ from typing import List, Tuple
11
+
12
+
13
class ImageError(Exception):
    """Error type for failures while processing images."""
17
+
18
+
19
def parse_data_uri(data_uri: str) -> Tuple[str, str]:
    """Split a base64 data URI into its MIME type and base64 payload.

    Args:
        data_uri: Data URI string like `data:image/jpeg;base64,<...>`

    Returns:
        Tuple of (mime_type, raw_base64_string)

    Raises:
        ImageError: If the URI is empty, does not have the
            `data:<mime>;base64,<data>` shape, or has an empty MIME type
            or payload
    """
    if not data_uri:
        raise ImageError("data_uri cannot be empty")

    # Expected shape: data:<mime>;base64,<data>
    parsed = re.match(r"^data:([^;]*);base64,(.*)$", data_uri)
    if parsed is None:
        raise ImageError(
            f"Invalid data URI format, expected 'data:<mime>;base64,<data>': {data_uri[:50]}..."
        )

    mime_type, raw_b64 = parsed.groups()

    # Both captured parts may legally match as empty strings; reject those.
    if not mime_type:
        raise ImageError(f"Missing MIME type in data URI: {data_uri[:50]}...")
    if not raw_b64:
        raise ImageError(f"Missing base64 data in data URI: {data_uri[:50]}...")

    return mime_type, raw_b64
51
+
52
+
53
def get_extension_from_mime(mime_type: str) -> str:
    """Map an image MIME type to a file extension.

    The lookup is case-insensitive. MIME types that are not in the table
    fall back to '.png'.

    Args:
        mime_type: MIME type string like 'image/jpeg'

    Returns:
        File extension including the leading dot (e.g., '.jpeg')
    """
    known_types = (
        ("image/jpeg", ".jpeg"),
        ("image/jpg", ".jpg"),
        ("image/png", ".png"),
        ("image/webp", ".webp"),
        ("image/gif", ".gif"),
    )

    normalized = mime_type.lower()
    for known_mime, extension in known_types:
        if normalized == known_mime:
            return extension

    # Unknown type: default to .png
    return ".png"
79
+
80
+
81
def save_base64_image(output_dir: Path, image_id: str, data_uri: str) -> str:
    """Decode a base64 data URI and save it as an image file.

    The filename is derived from image_id after sanitization (path separators,
    NUL bytes, '..' sequences and any character other than word characters,
    '-' and '.' are replaced; leading '.'/'-' are stripped; length is capped).
    An ID that sanitizes to an empty string falls back to 'image' so the
    result is never a bare-extension hidden file such as '.png'.

    Args:
        output_dir: Directory to save the image in (created if missing)
        image_id: Identifier for the image (used as filename; may already include an extension)
        data_uri: Base64 data URI string

    Returns:
        Filename of the saved image (without directory path)

    Raises:
        ImageError: If the data URI cannot be parsed or its payload cannot be
            base64-decoded
        OSError: If creating the directory or writing the file fails
            (propagated unchanged)
    """
    mime_type, raw_b64 = parse_data_uri(data_uri)
    ext = get_extension_from_mime(mime_type)

    # Sanitize image_id to prevent path traversal attacks.
    # Neutralize NUL bytes, directory separators and parent references first.
    sanitized_id = image_id.replace("\0", "_")
    sanitized_id = sanitized_id.replace("/", "_").replace("\\", "_")
    sanitized_id = sanitized_id.replace("..", "__")

    # Only word characters, hyphens, and dots survive.
    sanitized_id = re.sub(r"[^\w\-.]", "_", sanitized_id)

    # Ensure it doesn't start with a dot (hidden file) or dash (flag).
    if sanitized_id.startswith((".", "-")):
        sanitized_id = "_" + sanitized_id.lstrip(".-")

    # Remove any remaining runs of dots.
    sanitized_id = re.sub(r"\.\.+", "__", sanitized_id)

    # Limit length to prevent filesystem issues (leaves room for extension).
    max_id_length = 200
    sanitized_id = sanitized_id[:max_id_length]

    # An empty ID would otherwise yield a filename consisting only of the
    # extension, i.e. a hidden dotfile.
    if not sanitized_id:
        sanitized_id = "image"

    # Don't append the extension if the ID already ends with it
    # (case-insensitive); prevents names like image.jpeg.jpeg.
    if sanitized_id.lower().endswith(ext.lower()):
        filename = sanitized_id
    else:
        filename = f"{sanitized_id}{ext}"

    # Decode before touching the filesystem so a bad payload leaves no trace.
    try:
        image_data = base64.b64decode(raw_b64)
    except (binascii.Error, ValueError) as e:
        raise ImageError(
            f"Failed to decode base64 image data for image_id={image_id}: {e}"
        ) from e

    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / filename).write_bytes(image_data)

    return filename
145
+
146
+
147
def save_images(output_dir: Path, images: List[dict]) -> List[str]:
    """Write every image of an OCR response to disk.

    Images are processed in order; processing stops at the first invalid
    entry, after any earlier images have already been saved.

    Args:
        output_dir: Directory to save images in
        images: List of image dictionaries with 'id' and 'image_base64' keys

    Returns:
        List of saved filenames in the same order as input images

    Raises:
        ImageError: If an entry lacks 'id' or 'image_base64', or if an image
            fails to parse or decode
    """
    written: List[str] = []

    for entry in images:
        image_id = entry.get("id")
        payload = entry.get("image_base64")

        if image_id is None:
            raise ImageError("Image missing 'id' field")
        if payload is None:
            raise ImageError(f"Image '{image_id}' missing 'image_base64' field")

        written.append(save_base64_image(output_dir, image_id, payload))

    return written
@@ -0,0 +1,118 @@
1
+ """Markdown rewrite module for Mistral OCR MCP server.
2
+
3
+ This module rewrites markdown content to replace embedded base64 image URIs
4
+ with relative file paths.
5
+ """
6
+
7
+ import re
8
+ from typing import List, Optional
9
+
10
+
11
def rewrite_markdown(
    markdown: str, images: List[dict], output_filenames: Optional[List[str]] = None
) -> str:
    """Rewrite markdown so embedded base64 image URIs become relative paths.

    The strategy is chosen solely by whether output_filenames is given:
        - With output_filenames: exact-match replacement of each image's
          'image_base64' string by './<filename>'.
        - Without output_filenames: sequential regex replacement of data URIs
          in document order.

    Args:
        markdown: The original markdown content with embedded base64 images
        images: List of image dictionaries from OCR response with 'image_base64' keys
        output_filenames: Optional list of filenames for exact-match replacement.
            Must match the length of images. If None, the sequential strategy is used.

    Returns:
        Rewritten markdown with base64 URIs replaced by relative paths like './img_id.ext'

    Raises:
        ValueError: If output_filenames is provided but doesn't match images length
    """
    if output_filenames is None:
        # Strategy 2: Sequential regex replacement
        return _rewrite_sequential(markdown, images)

    if len(output_filenames) != len(images):
        raise ValueError(
            f"output_filenames length ({len(output_filenames)}) "
            f"must match images length ({len(images)})"
        )

    # Strategy 1: Exact-match replacement
    return _rewrite_exact_match(markdown, images, output_filenames)
45
+
46
+
47
def _rewrite_exact_match(
    markdown: str, images: List[dict], output_filenames: List[str]
) -> str:
    """Replace each image's exact data URI string with './<filename>'.

    Images are paired with filenames positionally. Entries whose
    'image_base64' value is missing or empty are skipped.

    Args:
        markdown: Original markdown content
        images: List of image dicts with 'image_base64' keys
        output_filenames: List of filenames to use for replacement

    Returns:
        Rewritten markdown
    """
    rewritten = markdown
    for image, name in zip(images, output_filenames):
        uri = image.get("image_base64")
        if not uri:
            continue
        rewritten = rewritten.replace(uri, f"./{name}")
    return rewritten
67
+
68
+
69
def _rewrite_sequential(markdown: str, images: List[dict]) -> str:
    """Rewrite using sequential regex replacement.

    Finds all data:image/...;base64,... patterns and replaces the i-th match
    with ./<id><ext>, where <id> comes from the i-th image (default
    'image_<i>') and <ext> is derived from that image's data URI (or from the
    matched URI when the image has no 'image_base64' key). If the URI cannot
    be parsed the extension defaults to '.png'. Data URIs beyond the number of
    images are left untouched.

    The output is assembled from slices of the ORIGINAL string, so match
    offsets stay valid no matter how much each replacement changes the length.

    Args:
        markdown: Original markdown content
        images: List of image dicts with 'id' and 'image_base64' keys

    Returns:
        Rewritten markdown
    """
    # Import here to avoid circular dependency
    from .images import get_extension_from_mime, parse_data_uri

    # Matches: data:image/...;base64,... up to a quote or closing parenthesis
    # (case-insensitive).
    data_uri_pattern = re.compile(r'data:image/[^;]+;base64,[^"\'\)]+', re.IGNORECASE)

    pieces: List[str] = []
    cursor = 0  # end of the last consumed match in the original string

    for i, match in enumerate(data_uri_pattern.finditer(markdown)):
        if i >= len(images):
            # More data URIs in markdown than images - leave extras as-is
            break

        img = images[i]
        image_id = img.get("id", f"image_{i}")
        data_uri = img.get("image_base64", match.group(0))

        try:
            mime_type, _ = parse_data_uri(data_uri)
            ext = get_extension_from_mime(mime_type)
        except Exception:
            # Deliberate best-effort: any parsing failure defaults to .png
            ext = ".png"

        pieces.append(markdown[cursor : match.start()])
        pieces.append(f"./{image_id}{ext}")
        cursor = match.end()

    # Everything after the last replaced match is copied through unchanged.
    pieces.append(markdown[cursor:])
    return "".join(pieces)
@@ -0,0 +1,129 @@
1
+ """Mistral OCR client adapter.
2
+
3
+ This module wraps the official `mistralai` SDK behind a small adapter function.
4
+ It centralizes:
5
+ - client initialization from environment-based config
6
+ - the upload -> signed URL -> OCR process flow
7
+ - consistent error normalization (FR-6)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+ from types import SimpleNamespace
14
+ from typing import Any, Optional
15
+
16
+ try:
17
+ from mistralai import Mistral, models
18
+ except ModuleNotFoundError: # pragma: no cover
19
+ # Allow offline unit tests to inject a fake client without requiring the SDK.
20
+ Mistral = None # type: ignore[assignment]
21
+ models = SimpleNamespace( # type: ignore[assignment]
22
+ OCRResponse=Any,
23
+ MistralError=Exception,
24
+ )
25
+
26
+ from .config import load_config
27
+
28
+
29
def _mistral_error_types() -> tuple[type[BaseException], ...]:
    """Collect the SDK exception classes that get normalized (FR-6.2).

    Always includes ``models.MistralError``; ``models.SDKError`` is added
    when the ``models`` namespace provides it.
    """
    base_error = models.MistralError
    optional_sdk_error = getattr(models, "SDKError", None)
    if optional_sdk_error is None:
        return (base_error,)
    return (base_error, optional_sdk_error)
37
+
38
+
39
def _format_mistral_error(e: BaseException) -> str:
    """Build a uniform error message from an SDK exception.

    Uses the exception's ``message`` attribute when present (otherwise
    ``str(e)``) and includes ``status_code`` when the exception carries one.
    """
    message = getattr(e, "message", str(e))
    status_code = getattr(e, "status_code", None)

    if status_code is not None:
        return f"Mistral OCR request failed (status={status_code}): {message}"

    return f"Mistral OCR request failed: {message}"
47
+
48
+
49
class MistralOCRClientError(RuntimeError):
    """Root of the exception hierarchy raised by the Mistral OCR client adapter."""


class MistralOCRAPIError(MistralOCRClientError):
    """Signals an error reported by the Mistral API."""


class MistralOCRFileError(MistralOCRClientError):
    """Signals a failed local filesystem operation."""
59
+
60
+
61
def process_local_file(
    path: Path,
    *,
    include_image_base64: bool = False,
    client: Optional[Mistral] = None,
) -> models.OCRResponse:
    """Upload a local file to Mistral and run OCR on it.

    Flow:
        1) Upload the file via `client.files.upload(..., purpose="ocr")`
        2) Fetch a signed URL via `client.files.get_signed_url(...)`
        3) Call `client.ocr.process(...)`, passing the signed URL as a
           `document_url` for `.pdf` files and as an `image_url` otherwise.

    Args:
        path: Local filesystem path to a PDF or image.
        include_image_base64: Whether to include base64 images in OCR response.
        client: Optional injected Mistral client (useful for unit tests). When
            omitted, a client is created from the environment-based config.

    Returns:
        The SDK's OCRResponse.

    Raises:
        MistralOCRAPIError: For SDK/API errors (includes status code + message).
        MistralOCRFileError: For local filesystem errors during upload (includes path).
        MistralOCRClientError: If no client is injected and the SDK is unavailable.
    """

    def _process(mistral: Mistral) -> models.OCRResponse:
        # Step 1: upload while the file handle is open.
        try:
            with path.open("rb") as handle:
                upload_result = mistral.files.upload(
                    file={"file_name": path.name, "content": handle},
                    purpose="ocr",
                )
        except OSError as exc:
            raise MistralOCRFileError(
                f"Filesystem error during open/read for upload: path={path!s}"
            ) from exc
        except _mistral_error_types() as exc:
            raise MistralOCRAPIError(_format_mistral_error(exc)) from exc

        # Steps 2 and 3: signed URL, then OCR.
        try:
            signed = mistral.files.get_signed_url(file_id=upload_result.id)

            # PDFs are submitted as documents; everything else as an image.
            url_kind = "document_url" if path.suffix.lower() == ".pdf" else "image_url"
            document = {"type": url_kind, url_kind: signed.url}

            return mistral.ocr.process(
                model="mistral-ocr-latest",
                document=document,
                include_image_base64=bool(include_image_base64),
            )
        except _mistral_error_types() as exc:
            raise MistralOCRAPIError(_format_mistral_error(exc)) from exc

    if client is None:
        if Mistral is None:
            raise MistralOCRClientError(
                "mistralai SDK is required when no client is injected"
            )
        config = load_config()
        with Mistral(api_key=config.api_key) as mistral:
            return _process(mistral)

    return _process(client)
@@ -0,0 +1,143 @@
1
+ """Path sandbox validation for Mistral OCR MCP server.
2
+
3
+ This module provides validation for file paths and output directories,
4
+ ensuring they are within the allowed directory sandbox.
5
+ """
6
+
7
+ import os
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import Set
11
+
12
+ # Supported file extensions for OCR processing
13
+ SUPPORTED_EXTENSIONS: Set[str] = {
14
+ ".pdf",
15
+ ".png",
16
+ ".jpg",
17
+ ".jpeg",
18
+ ".webp",
19
+ ".gif",
20
+ }
21
+
22
+
23
+ class PathValidationError(Exception):
24
+ """Exception raised for path validation errors."""
25
+
26
+ pass
27
+
28
+
29
def validate_file_path(file_path: str) -> Path:
    """Validate and canonicalize an input file path.

    Args:
        file_path: Absolute path to the input file

    Returns:
        Resolved canonical Path to the file

    Raises:
        PathValidationError: If the path is not absolute, does not exist,
            cannot be resolved (symlink loop, permission problem, invalid
            path component, embedded NUL byte), or has an unsupported
            extension
    """
    path = Path(file_path)

    if not path.is_absolute():
        raise PathValidationError(
            f"validate file_path: must be an absolute path: {file_path}"
        )

    # Canonicalize and check existence
    try:
        resolved_path = path.resolve(strict=True)
    except FileNotFoundError as e:
        raise PathValidationError(
            f"validate file_path: resolve failed, path does not exist: {file_path}"
        ) from e
    except (OSError, RuntimeError, ValueError) as e:
        # RuntimeError: symlink loops on older Python versions.
        # OSError: symlink loops on Python 3.13+, permission errors, or a
        # non-directory path component.
        # ValueError: the path contains an embedded NUL byte.
        raise PathValidationError(
            f"validate file_path: resolve failed: {file_path} - {e}"
        ) from e

    # The extension is checked on the resolved path, so a symlink is judged
    # by its target's name.
    if resolved_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
        raise PathValidationError(
            f"validate file_path: unsupported file type '{resolved_path.suffix}'. "
            f"Supported types: .pdf, .png, .jpg, .jpeg, .webp, .gif. Path: {file_path}"
        )

    return resolved_path
71
+
72
+
73
def validate_output_dir(
    output_dir: str,
    allowed_dir_resolved: Path,
    allowed_dir_original: str,
) -> Path:
    """Validate and canonicalize an output directory path.

    Checks run in this order: absolute path, resolvable/existing, is a
    directory, inside the allowed directory, writable. The sandbox check
    deliberately comes BEFORE the writability probe, because the probe
    creates a temporary file and nothing may be written outside the sandbox.

    Args:
        output_dir: Absolute path to the output directory
        allowed_dir_resolved: Canonical path to the allowed directory
        allowed_dir_original: Original string from environment (for error messages)

    Returns:
        Resolved canonical Path to the output directory

    Raises:
        PathValidationError: If the path is not absolute, doesn't exist,
            cannot be resolved, is not a directory, lies outside the allowed
            directory, or is not writable
    """
    path = Path(output_dir)

    if not path.is_absolute():
        raise PathValidationError(
            f"validate output_dir: must be an absolute path: {output_dir}"
        )

    # Canonicalize and check existence
    try:
        resolved_path = path.resolve(strict=True)
    except FileNotFoundError as e:
        raise PathValidationError(
            f"validate output_dir: resolve failed, path does not exist: {output_dir}"
        ) from e
    except (OSError, RuntimeError, ValueError) as e:
        # RuntimeError/OSError: symlink loops (version dependent), permission
        # errors, invalid path components. ValueError: embedded NUL byte.
        raise PathValidationError(
            f"validate output_dir: resolve failed: {output_dir} - {e}"
        ) from e

    if not resolved_path.is_dir():
        raise PathValidationError(
            f"validate output_dir: path is not a directory: {output_dir}"
        )

    # Sandbox enforcement: output_dir must be within allowed directory.
    try:
        resolved_path.relative_to(allowed_dir_resolved)
    except ValueError as e:
        # output_dir is not a descendant of allowed_dir
        raise PathValidationError(
            f"output_dir must be within the allowed directory: {allowed_dir_original}"
        ) from e

    # Check writability by creating (and removing) a temporary file.
    try:
        # mkstemp creates a unique file atomically: no predictable name and
        # no following of pre-existing symlinks.
        fd, temp_path = tempfile.mkstemp(dir=str(resolved_path))
        os.close(fd)
        os.unlink(temp_path)
    except PermissionError as e:
        raise PathValidationError(
            f"validate output_dir: writability check failed, directory not writable: {output_dir}"
        ) from e
    except OSError as e:
        raise PathValidationError(
            f"validate output_dir: writability check failed: {output_dir} - {e}"
        ) from e

    return resolved_path
@@ -0,0 +1,86 @@
1
+ """MCP server implementation for Mistral OCR."""
2
+
3
+ from typing import Any
4
+
5
+ from mcp.server.fastmcp import FastMCP
6
+
7
+ from .extraction import extract_markdown, extract_markdown_with_images
8
+
9
+
10
+ # Create the MCP server instance
11
+ mcp = FastMCP("Mistral OCR")
12
+
13
+
14
@mcp.tool(name="extract_markdown")
def extract_markdown_tool(file_path: str) -> str:
    """Extract markdown text from a PDF or image file.

    Args:
        file_path: Absolute path to the input file (PDF or image)

    Returns:
        Extracted markdown content as a string
    """
    # NOTE(review): docstring kept verbatim - presumably surfaced to MCP
    # clients as the tool description; confirm before rewording.
    # Thin adapter: all validation and OCR work happens in extract_markdown.
    markdown = extract_markdown(file_path)
    return markdown
25
+
26
+
27
@mcp.tool(name="extract_markdown_with_images")
def extract_markdown_with_images_tool(
    file_path: str, output_dir: str
) -> dict[str, Any]:
    """Extract markdown with embedded images and save them as separate files.

    Args:
        file_path: Absolute path to the input file (PDF or image)
        output_dir: Absolute path to an existing output directory (must be within allowed dir)

    Returns:
        Dictionary with:
            - output_directory: Absolute path to the output subdirectory
            - markdown_file: Absolute path to the content.md file
            - images: List of saved image filenames (not full paths)
    """
    # NOTE(review): docstring kept verbatim - presumably surfaced to MCP
    # clients as the tool description; confirm before rewording.
    # Thin adapter over the extraction module.
    return extract_markdown_with_images(file_path, output_dir)
45
+
46
+
47
def list_tools_impl() -> list[str]:
    """Return the names of the available tools (testing helper).

    A new list is built on every call.
    """
    tool_names = ("extract_markdown", "extract_markdown_with_images")
    return list(tool_names)
50
+
51
+
52
def call_tool_impl(name: str, arguments: dict[str, Any]) -> Any:
    """Dispatch a tool call by name (testing helper).

    Args:
        name: Tool name to call
        arguments: Tool arguments as a dictionary

    Returns:
        Whatever the selected extraction function returns

    Raises:
        ValueError: If the tool name is unknown or a required argument is
            missing from arguments
    """
    if name == "extract_markdown":
        if "file_path" not in arguments:
            raise ValueError("Missing required argument: file_path")
        return extract_markdown(arguments["file_path"])

    if name != "extract_markdown_with_images":
        raise ValueError(f"Unknown tool: {name}")

    # Required arguments are checked in signature order.
    if "file_path" not in arguments:
        raise ValueError("Missing required argument: file_path")
    if "output_dir" not in arguments:
        raise ValueError("Missing required argument: output_dir")

    return extract_markdown_with_images(
        arguments["file_path"], arguments["output_dir"]
    )
79
+
80
+
81
def run() -> None:
    """Start the MCP server.

    Synchronous wrapper that delegates to the module-level FastMCP
    instance's ``run()``.
    """
    mcp.run()
@@ -0,0 +1,407 @@
1
+ Metadata-Version: 2.4
2
+ Name: mistral-ocr-mcp
3
+ Version: 0.1.3
4
+ Summary: MCP server for extracting text and images from documents using Mistral OCR API
5
+ Project-URL: Homepage, https://github.com/ORDIS-Co-Ltd/mistral-ocr-mcp
6
+ Project-URL: Repository, https://github.com/ORDIS-Co-Ltd/mistral-ocr-mcp
7
+ Author: Ordis
8
+ License: MIT
9
+ Requires-Python: >=3.10
10
+ Requires-Dist: mcp>=1.0.0
11
+ Requires-Dist: mistralai>=1.0.0
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
14
+ Description-Content-Type: text/markdown
15
+
16
+ # Mistral OCR MCP Server
17
+
18
+ A Model Context Protocol (MCP) server that provides tools for extracting text and images from PDF and image files using the Mistral OCR API.
19
+
20
+ ## Features
21
+
22
+ - **Simple Text Extraction**: Extract markdown content from documents without handling images
23
+ - **Full Extraction with Images**: Extract markdown and save embedded images to disk with proper relative links
24
+ - **Security Sandbox**: Restricts file writes to a configured allowed directory
25
+ - **Zero-Install Deployment**: Run with `uvx` without prior installation
26
+ - **Supported Formats**: PDF (`.pdf`), PNG (`.png`), JPEG (`.jpg`, `.jpeg`), WebP (`.webp`), GIF (`.gif`)
27
+
28
+ ---
29
+
30
+ ## Quickstart
31
+
32
+ Run the server directly with `uvx` (no installation required):
33
+
34
+ ```bash
35
+ MISTRAL_API_KEY="your-api-key-here" \
36
+ MISTRAL_OCR_ALLOWED_DIR="/absolute/path/to/allowed/directory" \
37
+ uvx mistral-ocr-mcp
38
+ ```
39
+
40
+ **Important**: `MISTRAL_OCR_ALLOWED_DIR` must be:
41
+ - An **absolute path** (e.g., `/Users/username/documents`, not `~/documents`)
42
+ - An **existing directory** on your filesystem
43
+ - The location where you want to allow the server to write extracted images
44
+
45
+ The server will start in stdio mode and wait for MCP client connections.
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ ### For Use with MCP Clients
52
+
53
+ Install via pip:
54
+
55
+ ```bash
56
+ pip install mistral-ocr-mcp
57
+ ```
58
+
59
+ Then configure your MCP client (e.g., Claude Desktop) to run:
60
+
61
+ ```bash
62
+ mistral-ocr-mcp
63
+ ```
64
+
65
+ ### For Development
66
+
67
+ Clone the repository and install with development dependencies:
68
+
69
+ ```bash
70
+ git clone https://github.com/ORDIS-Co-Ltd/mistral-ocr-mcp
71
+ cd mistral-ocr-mcp
72
+ pip install -e '.[dev]'
73
+ ```
74
+
75
+ Run the server:
76
+
77
+ ```bash
78
+ MISTRAL_API_KEY="your-key" \
79
+ MISTRAL_OCR_ALLOWED_DIR="/path/to/allowed/dir" \
80
+ python -m mistral_ocr_mcp
81
+ ```
82
+
83
+ ---
84
+
85
+ ## Configuration
86
+
87
+ ### Required Environment Variables
88
+
89
+ | Variable | Description | Example |
90
+ |----------|-------------|---------|
91
+ | `MISTRAL_API_KEY` | Your Mistral API key (never logged) | `sk-abc123...` |
92
+ | `MISTRAL_OCR_ALLOWED_DIR` | Absolute path to allowed write directory | `/Users/username/workdir` |
93
+
94
+ ### Security Sandbox
95
+
96
+ The server enforces a **write directory sandbox** to prevent unauthorized file writes:
97
+
98
+ - **`extract_markdown`**: No write restrictions (read-only operation)
99
+ - **`extract_markdown_with_images`**: The `output_dir` parameter **must** be within `MISTRAL_OCR_ALLOWED_DIR`
100
+
101
+ **Validation Examples:**
102
+
103
+ | `MISTRAL_OCR_ALLOWED_DIR` | `output_dir` | Result |
104
+ |---------------------------|--------------|--------|
105
+ | `/Users/username/workdir` | `/Users/username/workdir/project/output` | ✅ Allowed |
106
+ | `/Users/username/workdir` | `/Users/username/workdir` | ✅ Allowed (exact match) |
107
+ | `/Users/username/workdir` | `/Users/username/documents` | ❌ Rejected |
108
+ | `/Users/username/workdir` | `/Users/username/workdir/../documents` | ❌ Rejected (resolves outside) |
109
+
110
+ **Security Notes:**
111
+ - All paths are canonicalized (symlinks resolved, `..` eliminated) before validation
112
+ - Image filenames are sanitized to prevent path traversal attacks
113
+
114
+ ---
115
+
116
+ ## Client Configuration
117
+
118
+ ### Claude Desktop
119
+
120
+ Add this to your `claude_desktop_config.json`:
121
+
122
+ - **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
123
+ - **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
124
+
125
+ ```json
126
+ {
127
+ "mcpServers": {
128
+ "mistral-ocr": {
129
+ "command": "uvx",
130
+ "args": ["mistral-ocr-mcp"],
131
+ "env": {
132
+ "MISTRAL_API_KEY": "your-api-key-here",
133
+ "MISTRAL_OCR_ALLOWED_DIR": "/absolute/path/to/allowed/directory"
134
+ }
135
+ }
136
+ }
137
+ }
138
+ ```
139
+
140
+ ### OpenCode
141
+
142
+ Add this to the `mcp` section of your configuration file:
143
+
144
+ ```json
145
+ {
146
+ "mcp": {
147
+ "mistral-ocr": {
148
+ "type": "local",
149
+ "command": ["uvx", "mistral-ocr-mcp"],
150
+ "enabled": true,
151
+ "environment": {
152
+ "MISTRAL_API_KEY": "your-api-key-here",
153
+ "MISTRAL_OCR_ALLOWED_DIR": "/absolute/path/to/allowed/directory"
154
+ }
155
+ }
156
+ }
157
+ }
158
+ ```
159
+
160
+ ### Codex
161
+
162
+ If you use the Codex CLI, you can add the server with:
163
+
164
+ ```bash
165
+ codex mcp add mistral-ocr -- uvx mistral-ocr-mcp
166
+ ```
167
+
168
+ Make sure the environment variables `MISTRAL_API_KEY` and `MISTRAL_OCR_ALLOWED_DIR` are set in your shell environment.
169
+
170
+ ---
171
+
172
+ ## Tool Reference
173
+
174
+ ### Tool 1: `extract_markdown`
175
+
176
+ Extract markdown content from a document **without** saving images.
177
+
178
+ **Arguments:**
179
+
180
+ ```json
181
+ {
182
+ "file_path": "/absolute/path/to/document.pdf"
183
+ }
184
+ ```
185
+
186
+ | Parameter | Type | Required | Description |
187
+ |-----------|------|----------|-------------|
188
+ | `file_path` | `string` | Yes | Absolute path to input file (PDF or image) |
189
+
190
+ **Returns:**
191
+
192
+ ```json
193
+ "# Document Title\n\nExtracted markdown content from all pages..."
194
+ ```
195
+
196
+ Returns a single string containing concatenated markdown from all pages.
197
+
198
+ **Example:**
199
+
200
+ ```json
201
+ {
202
+ "tool": "extract_markdown",
203
+ "arguments": {
204
+ "file_path": "/Users/username/documents/report.pdf"
205
+ }
206
+ }
207
+ ```
208
+
209
+ ---
210
+
211
+ ### Tool 2: `extract_markdown_with_images`
212
+
213
+ Extract markdown content **and** save embedded images to disk.
214
+
215
+ **Arguments:**
216
+
217
+ ```json
218
+ {
219
+ "file_path": "/absolute/path/to/document.pdf",
220
+ "output_dir": "/absolute/path/to/output/parent"
221
+ }
222
+ ```
223
+
224
+ | Parameter | Type | Required | Description |
225
+ |-----------|------|----------|-------------|
226
+ | `file_path` | `string` | Yes | Absolute path to input file (PDF or image) |
227
+ | `output_dir` | `string` | Yes | Absolute path to output parent directory (must exist and be writable, must be within `MISTRAL_OCR_ALLOWED_DIR`) |
228
+
229
+ **Returns:**
230
+
231
+ ```json
232
+ {
233
+ "output_directory": "/absolute/path/to/output/parent/document",
234
+ "markdown_file": "/absolute/path/to/output/parent/document/content.md",
235
+ "images": ["img_abc123.png", "img_def456.jpeg"]
236
+ }
237
+ ```
238
+
239
+ | Field | Type | Description |
240
+ |-------|------|-------------|
241
+ | `output_directory` | `string` | Absolute path to created subdirectory |
242
+ | `markdown_file` | `string` | Absolute path to `content.md` file |
243
+ | `images` | `array[string]` | List of saved image filenames (not full paths) |
244
+
245
+ **Behavior:**
246
+
247
+ 1. Creates a subdirectory named after the input file stem (e.g., `report` for `report.pdf`)
248
+ 2. If the subdirectory already exists, appends a timestamp: `report_20260102_143022`
249
+ 3. Saves all extracted images as `<sanitized_id>.<ext>` (e.g., `img_abc123.png`)
250
+ 4. Saves markdown to `content.md` with relative image links (e.g., `![](./img_abc123.png)`)
251
+
252
+ **Example:**
253
+
254
+ ```json
255
+ {
256
+ "tool": "extract_markdown_with_images",
257
+ "arguments": {
258
+ "file_path": "/Users/username/documents/quarterly-report.pdf",
259
+ "output_dir": "/Users/username/workdir/extracted"
260
+ }
261
+ }
262
+ ```
263
+
264
+ **Output Structure:**
265
+
266
+ ```
267
+ /Users/username/workdir/extracted/
268
+ quarterly-report/
269
+ content.md # Markdown with relative image links
270
+ img_abc123.png # First extracted image
271
+ img_def456.jpeg # Second extracted image
272
+ ```
273
+
274
+ ---
275
+
276
+ ## Example Client Usage
277
+
278
+ Here's a minimal Python example using the MCP SDK to call the tools:
279
+
280
+ ```python
281
+ import asyncio
282
+ from mcp import ClientSession, StdioServerParameters
283
+ from mcp.client.stdio import stdio_client
284
+
285
+ async def extract_document():
286
+ server_params = StdioServerParameters(
287
+ command="mistral-ocr-mcp",
288
+ env={
289
+ "MISTRAL_API_KEY": "your-api-key",
290
+ "MISTRAL_OCR_ALLOWED_DIR": "/Users/username/workdir"
291
+ }
292
+ )
293
+
294
+ async with stdio_client(server_params) as (read, write):
295
+ async with ClientSession(read, write) as session:
296
+ await session.initialize()
297
+
298
+ # Simple extraction
299
+ result = await session.call_tool(
300
+ "extract_markdown",
301
+ arguments={"file_path": "/path/to/document.pdf"}
302
+ )
303
+ print(result.content[0].text)
304
+
305
+ # Extraction with images
306
+ result = await session.call_tool(
307
+ "extract_markdown_with_images",
308
+ arguments={
309
+ "file_path": "/path/to/document.pdf",
310
+ "output_dir": "/Users/username/workdir/output"
311
+ }
312
+ )
313
+ print(result.content[0].text)
314
+
315
+ asyncio.run(extract_document())
316
+ ```
317
+
318
+ ---
319
+
320
+ ## Troubleshooting
321
+
322
+ | Error | Cause | Solution |
323
+ |-------|-------|----------|
324
+ | `Missing required environment variable: MISTRAL_API_KEY` | `MISTRAL_API_KEY` not set | Set the environment variable before running the server |
325
+ | `Missing required environment variable: MISTRAL_OCR_ALLOWED_DIR` | `MISTRAL_OCR_ALLOWED_DIR` not set | Set the environment variable to an absolute path |
326
+ | `MISTRAL_OCR_ALLOWED_DIR must be an absolute path` | Relative path provided, or `~` was not expanded (e.g., `~/documents`) | Use a fully expanded absolute path (e.g., `/Users/username/documents`) |
327
+ | `MISTRAL_OCR_ALLOWED_DIR does not exist` | Directory does not exist on filesystem | Create the directory first: `mkdir -p /path/to/dir` |
328
+ | `MISTRAL_OCR_ALLOWED_DIR is not a directory` | Path points to a file, not a directory | Ensure the path is a directory |
329
+ | `validate file_path: must be an absolute path: {path}` | Relative path provided for input file | Use an absolute path (e.g., `/Users/username/file.pdf`) |
330
+ | `validate file_path: resolve failed, path does not exist: {path}` | Input file does not exist | Check the file path and ensure the file exists |
331
+ | `validate file_path: unsupported file type '{suffix}'. Supported types: ...` | File extension not supported | Use `.pdf`, `.png`, `.jpg`, `.jpeg`, `.webp`, or `.gif` |
332
+ | `validate output_dir: resolve failed, path does not exist: {path}` | Output directory does not exist | Create the directory first: `mkdir -p {path}` |
333
+ | `validate output_dir: path is not a directory: {path}` | Path points to a file, not a directory | Ensure the path is a directory |
334
+ | `validate output_dir: writability check failed, directory not writable: {path}` | Output directory exists but is not writable | Check directory permissions: `chmod u+w {path}` |
335
+ | `output_dir must be within the allowed directory` | `output_dir` is outside `MISTRAL_OCR_ALLOWED_DIR` | Use a path within the allowed directory |
336
+ | `Mistral OCR request failed (status=401): {message}` | Invalid API key | Check your `MISTRAL_API_KEY` |
337
+ | `Mistral OCR request failed (status=429): {message}` | Rate limit exceeded | Wait and retry, or check your API quota |
338
+
339
+ ---
340
+
341
+ ## Development
342
+
343
+ ### Install Development Dependencies
344
+
345
+ ```bash
346
+ pip install -e '.[dev]'
347
+ ```
348
+
349
+ ### Run Tests
350
+
351
+ Run the full test suite:
352
+
353
+ ```bash
354
+ pytest
355
+ ```
356
+
357
+ Run tests with verbose output:
358
+
359
+ ```bash
360
+ pytest -v
361
+ ```
362
+
363
+ Run tests in quiet mode:
364
+
365
+ ```bash
366
+ pytest -q
367
+ ```
368
+
369
+ ### Project Structure
370
+
371
+ ```
372
+ mistral-ocr-mcp/
373
+ ├── src/
374
+ │ └── mistral_ocr_mcp/
375
+ │ ├── __init__.py
376
+ │ ├── __main__.py # Entry point
377
+ │ ├── server.py # MCP server and tool definitions
378
+ │ ├── config.py # Configuration loading and validation
379
+ │ ├── extraction.py # OCR orchestration logic
380
+ │ ├── mistral_client.py # Mistral API client
381
+ │ ├── images.py # Image parsing and saving
382
+ │ ├── markdown_rewrite.py # Markdown link rewriting
383
+ │ └── path_sandbox.py # Path validation and sandbox enforcement
384
+ ├── tests/ # Unit tests
385
+ ├── pyproject.toml # Package configuration
386
+ └── README.md # This file
387
+ ```
388
+
389
+ ---
390
+
391
+ ## License
392
+
393
+ MIT
394
+
395
+ ---
396
+
397
+ ## Contributing
398
+
399
+ Contributions are welcome! Please open an issue or submit a pull request.
400
+
401
+ ---
402
+
403
+ ## Links
404
+
405
+ - **GitHub Repository**: https://github.com/ORDIS-Co-Ltd/mistral-ocr-mcp
406
+ - **MCP Specification**: https://modelcontextprotocol.io
407
+ - **Mistral AI**: https://mistral.ai
@@ -0,0 +1,13 @@
1
+ mistral_ocr_mcp/__init__.py,sha256=UwK-luoFhJsyps4rpij6BGZiWyxMXX-AL2q_7tWLF5k,196
2
+ mistral_ocr_mcp/__main__.py,sha256=pDlN6SI9E51xUGgrAy6pJC_F1vEqI3vjIvDIGFOBqZ0,696
3
+ mistral_ocr_mcp/config.py,sha256=eHm-eQq2jIbuQoLDPUsi8llE0HKyd1v_rYGrknfiAhM,2658
4
+ mistral_ocr_mcp/extraction.py,sha256=wbRqjMGY86CiDPqEKYMP1rRYMJsxUDXG_PNBx9PeYRk,5408
5
+ mistral_ocr_mcp/images.py,sha256=aIb3k8bNDtdPZ0zg19YJzB_aLWrb0pCw4UOJdYa8Xxk,5357
6
+ mistral_ocr_mcp/markdown_rewrite.py,sha256=ktLnqEo1zwmMBE89N6VI557hAEsknANBC2sSs_ArauI,4245
7
+ mistral_ocr_mcp/mistral_client.py,sha256=aCo6HjmibTuID3itHqGdVfxBpie9kMteVx3GsS6l3SY,4223
8
+ mistral_ocr_mcp/path_sandbox.py,sha256=Xi5CuyTQvZ0L_9woTvkmttsbzdz6Sd8oRnlwjX79Eag,4377
9
+ mistral_ocr_mcp/server.py,sha256=425FeAvFNLRGDXZm7a4g5ADQpMWNSp1_EWkrXlPJnO0,2585
10
+ mistral_ocr_mcp-0.1.3.dist-info/METADATA,sha256=n8qinc4uKnB39GbH8XVpdKR0GNVQcqO4aYm9qyPD2tY,12059
11
+ mistral_ocr_mcp-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
12
+ mistral_ocr_mcp-0.1.3.dist-info/entry_points.txt,sha256=J-qJQ5P8-pJ6a8W7KXaesUuMFU3rFgYNyAsMCanQoqs,66
13
+ mistral_ocr_mcp-0.1.3.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ mistral-ocr-mcp = mistral_ocr_mcp.__main__:main