mistral-ocr-cli 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Rubén Fernández-Fuertes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,294 @@
1
+ Metadata-Version: 2.3
2
+ Name: mistral-ocr-cli
3
+ Version: 1.0.0
4
+ Summary: A clean command-line tool for OCR processing using Mistral AI's API
5
+ License: MIT
6
+ Keywords: ocr,mistral,pdf,cli,command-line,document-processing,text-extraction,image-processing
7
+ Author: Ruben Fernandez-Fuertes
8
+ Author-email: fernandezfuertesruben@gmail.com
9
+ Requires-Python: >=3.9,<4.0
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Text Processing
25
+ Classifier: Topic :: Utilities
26
+ Requires-Dist: Pillow (>=10.2.0,<11.0.0)
27
+ Requires-Dist: click (>=8.1.7,<9.0.0)
28
+ Requires-Dist: mistralai (>=1.0.0,<2.0.0)
29
+ Requires-Dist: pypdf (>=4.0.0,<5.0.0)
30
+ Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
31
+ Requires-Dist: rich (>=13.7.0,<14.0.0)
32
+ Project-URL: Homepage, https://github.com/rubenfernandez/mistral-ocr-cli
33
+ Project-URL: Repository, https://github.com/rubenfernandez/mistral-ocr-cli
34
+ Description-Content-Type: text/markdown
35
+
36
+ # Mistral OCR CLI
37
+
38
+ A powerful command-line tool for OCR processing using Mistral AI's state-of-the-art OCR API. Process PDFs and images to extract text, tables, equations, and images with unprecedented accuracy.
39
+
40
+ ## ⚠️ Disclaimer
41
+
42
+ **This is an unofficial, community-created tool** that uses Mistral AI's OCR API. This project is not affiliated with, officially maintained by, or endorsed by Mistral AI.
43
+
44
+ - **Official Mistral OCR**: https://mistral.ai/news/mistral-ocr
45
+ - **Official Documentation**: https://docs.mistral.ai/capabilities/OCR/
46
+ - **Mistral AI Platform**: https://console.mistral.ai/
47
+
48
+ For official tools and support, please visit [Mistral AI's website](https://mistral.ai).
49
+
50
+ ## Features
51
+
52
+ - 📄 **Multi-format Support**: Process PDFs and images (JPG, PNG, WEBP, GIF, BMP, TIFF)
53
+ - 📊 **Advanced Extraction**: Extract text, tables, equations, and images
54
+ - 📁 **Batch Processing**: Process single files or entire directories
55
+ - 🎯 **Smart Output**: Preserves document structure in markdown format
56
+ - 🖼️ **Image Extraction**: Optionally extract and save embedded images
57
+ - 📈 **Progress Tracking**: Real-time progress bars for batch operations
58
+ - 🔧 **Flexible Configuration**: Environment variables or command-line options
59
+
60
+ ## Installation
61
+
62
+ ### Prerequisites
63
+
64
+ - Python 3.9 or higher
65
+ - Mistral API key from [Mistral Console](https://console.mistral.ai/)
66
+
67
+ ### Install with Poetry (Recommended)
68
+
69
+ ```bash
70
+ # Clone the repository
71
+ git clone https://github.com/yourusername/mistral-ocr.git
72
+ cd mistral-ocr
73
+
74
+ # Install Poetry if you haven't already
75
+ curl -sSL https://install.python-poetry.org | python3 -
76
+
77
+ # Install dependencies and the package
78
+ poetry install
79
+
80
+ # Make the command available globally
81
+ poetry build
82
+ pip install dist/mistral_ocr_cli-*.whl
83
+ ```
84
+
85
+ ### Install with pip
86
+
87
+ ```bash
88
+ # Clone the repository
89
+ git clone https://github.com/yourusername/mistral-ocr.git
90
+ cd mistral-ocr
91
+
92
+ # Install in editable mode for global usage
93
+ pip install -e .
94
+ ```
95
+
96
+ ## Configuration
97
+
98
+ ### 1. Set up your Mistral API key
99
+
100
+ Create a `.env` file in your project root (or copy from `.env.example`):
101
+
102
+ ```bash
103
+ cp .env.example .env
104
+ ```
105
+
106
+ Edit `.env` and add your API key:
107
+
108
+ ```env
109
+ MISTRAL_API_KEY=your_actual_api_key_here
110
+ ```
111
+
112
+ ### 2. Alternative: Export as environment variable
113
+
114
+ ```bash
115
+ export MISTRAL_API_KEY="your_actual_api_key_here"
116
+ ```
117
+
118
+ ## Usage
119
+
120
+ ### Basic Usage
121
+
122
+ Process a single PDF file (output saved to `mistral_ocr_output/` in the same directory):
123
+
124
+ ```bash
125
+ mistral-ocr document.pdf
126
+ ```
127
+
128
+ ### Specify Output Directory
129
+
130
+ ```bash
131
+ mistral-ocr document.pdf --output-path ./results
132
+ ```
133
+
134
+ ### Process Entire Directory
135
+
136
+ ```bash
137
+ mistral-ocr ./documents --output-path ./extracted
138
+ ```
139
+
140
+ ### Command-Line Options
141
+
142
+ ```
143
+ Usage: mistral-ocr INPUT_PATH [OPTIONS]
144
+
145
+ Arguments:
146
+ INPUT_PATH Path to input file or directory (required)
147
+
148
+ Options:
149
+ -o, --output-path PATH Output directory (default: <input_dir>/mistral_ocr_output/)
150
+ --api-key TEXT Mistral API key (or set MISTRAL_API_KEY env var)
151
+ --model TEXT OCR model (default: mistral-ocr-latest)
152
+ --env-file PATH Path to .env file
153
+ --include-images/--no-images Extract images (default: True)
+ --add-timestamp/--no-timestamp Add timestamp to output folder name (default: False)
154
+ -v, --verbose Enable verbose output
155
+ --version Show version
156
+ --help Show this message
157
+ ```
158
+
159
+ ## Examples
160
+
161
+ ### Process a single image
162
+
163
+ ```bash
164
+ mistral-ocr photo.jpg
165
+ ```
166
+
167
+ ### Process multiple PDFs with custom output
168
+
169
+ ```bash
170
+ mistral-ocr ./reports --output-path ./extracted_text --verbose
171
+ ```
172
+
173
+ ### Use a different .env file
174
+
175
+ ```bash
176
+ mistral-ocr document.pdf --env-file .env.production
177
+ ```
178
+
179
+ ### Process without extracting images
180
+
181
+ ```bash
182
+ mistral-ocr document.pdf --no-images
183
+ ```
184
+
185
+ ### Pass API key directly (not recommended for production)
186
+
187
+ ```bash
188
+ mistral-ocr doc.pdf --api-key "your_api_key_here"
189
+ ```
190
+
191
+ ## Output Structure
192
+
193
+ The tool creates the following output structure:
194
+
195
+ ```
196
+ mistral_ocr_output/
197
+ ├── document1.md # Extracted text in markdown format
198
+ ├── document1_images/ # Extracted images (if enabled)
199
+ │ ├── page1_img1.png
200
+ │ └── page1_img2.png
201
+ ├── document2.md
202
+ ├── document2_images/
203
+ │ └── ...
204
+ └── metadata.json # Processing statistics and errors
205
+ ```
206
+
207
+ ### Markdown Output
208
+
209
+ Each processed document generates a markdown file containing:
210
+ - Document metadata (source, processing time)
211
+ - Extracted text with preserved formatting
212
+ - Tables rendered in markdown format
213
+ - Mathematical equations
214
+ - Image references (if image extraction is enabled)
215
+
216
+ ### Metadata File
217
+
218
+ The `metadata.json` file contains:
219
+ - List of processed files
220
+ - Processing time
221
+ - File sizes
222
+ - Output paths
223
+ - Any errors encountered
224
+
225
+ ## Limitations
226
+
227
+ - Maximum file size: 50 MB
228
+ - Maximum pages per document: 1,000
229
+ - Supported formats: PDF, JPG, JPEG, PNG, WEBP, GIF, BMP, TIFF
230
+
231
+ ## Pricing
232
+
233
+ Mistral OCR API pricing: $1 per 1,000 pages ($0.001 per page)
234
+
235
+ ## Development
236
+
237
+ ### Run tests
238
+
239
+ ```bash
240
+ poetry run pytest
241
+ ```
242
+
243
+ ### Format code
244
+
245
+ ```bash
246
+ poetry run black mistral_ocr/
247
+ poetry run ruff check mistral_ocr/
248
+ ```
249
+
250
+ ### Type checking
251
+
252
+ ```bash
253
+ poetry run mypy mistral_ocr/
254
+ ```
255
+
256
+ ## Troubleshooting
257
+
258
+ ### API Key Not Found
259
+
260
+ If you get an error about missing API key:
261
+ 1. Ensure `.env` file exists and contains `MISTRAL_API_KEY=your_key`
262
+ 2. Or export it: `export MISTRAL_API_KEY="your_key"`
263
+ 3. Or pass it directly: `mistral-ocr --api-key "your_key" ...`
264
+
265
+ ### File Size Error
266
+
267
+ If a file exceeds 50 MB:
268
+ - Consider splitting large PDFs into smaller parts
269
+ - Compress images before processing
270
+
271
+ ### Installation Issues
272
+
273
+ If the command is not found after installation:
274
+ 1. Ensure the package is installed: `pip show mistral-ocr-cli`
275
+ 2. Check your PATH includes pip's script directory
276
+ 3. Try reinstalling with: `pip install -e .`
277
+
278
+ ## License
279
+
280
+ MIT License - see LICENSE file for details
281
+
282
+ ## Contributing
283
+
284
+ Contributions are welcome! Please feel free to submit a Pull Request.
285
+
286
+ ## Support
287
+
288
+ For issues or questions about this CLI tool, please open an issue on [GitHub](https://github.com/r-uben/mistral-ocr-cli/issues).
289
+
290
+ For questions about Mistral AI's OCR API, please refer to [Mistral's official documentation](https://docs.mistral.ai) or contact their support.
291
+
292
+ ## Legal
293
+
294
+ "Mistral AI" and "Mistral" are trademarks of Mistral AI. This project is not affiliated with or endorsed by Mistral AI. The use of Mistral AI's OCR API is subject to Mistral AI's [Terms of Service](https://mistral.ai/terms/).
@@ -0,0 +1,259 @@
1
+ # Mistral OCR CLI
2
+
3
+ A powerful command-line tool for OCR processing using Mistral AI's state-of-the-art OCR API. Process PDFs and images to extract text, tables, equations, and images with unprecedented accuracy.
4
+
5
+ ## ⚠️ Disclaimer
6
+
7
+ **This is an unofficial, community-created tool** that uses Mistral AI's OCR API. This project is not affiliated with, officially maintained by, or endorsed by Mistral AI.
8
+
9
+ - **Official Mistral OCR**: https://mistral.ai/news/mistral-ocr
10
+ - **Official Documentation**: https://docs.mistral.ai/capabilities/OCR/
11
+ - **Mistral AI Platform**: https://console.mistral.ai/
12
+
13
+ For official tools and support, please visit [Mistral AI's website](https://mistral.ai).
14
+
15
+ ## Features
16
+
17
+ - 📄 **Multi-format Support**: Process PDFs and images (JPG, PNG, WEBP, GIF, BMP, TIFF)
18
+ - 📊 **Advanced Extraction**: Extract text, tables, equations, and images
19
+ - 📁 **Batch Processing**: Process single files or entire directories
20
+ - 🎯 **Smart Output**: Preserves document structure in markdown format
21
+ - 🖼️ **Image Extraction**: Optionally extract and save embedded images
22
+ - 📈 **Progress Tracking**: Real-time progress bars for batch operations
23
+ - 🔧 **Flexible Configuration**: Environment variables or command-line options
24
+
25
+ ## Installation
26
+
27
+ ### Prerequisites
28
+
29
+ - Python 3.9 or higher
30
+ - Mistral API key from [Mistral Console](https://console.mistral.ai/)
31
+
32
+ ### Install with Poetry (Recommended)
33
+
34
+ ```bash
35
+ # Clone the repository
36
+ git clone https://github.com/yourusername/mistral-ocr.git
37
+ cd mistral-ocr
38
+
39
+ # Install Poetry if you haven't already
40
+ curl -sSL https://install.python-poetry.org | python3 -
41
+
42
+ # Install dependencies and the package
43
+ poetry install
44
+
45
+ # Make the command available globally
46
+ poetry build
47
+ pip install dist/mistral_ocr_cli-*.whl
48
+ ```
49
+
50
+ ### Install with pip
51
+
52
+ ```bash
53
+ # Clone the repository
54
+ git clone https://github.com/yourusername/mistral-ocr.git
55
+ cd mistral-ocr
56
+
57
+ # Install in editable mode for global usage
58
+ pip install -e .
59
+ ```
60
+
61
+ ## Configuration
62
+
63
+ ### 1. Set up your Mistral API key
64
+
65
+ Create a `.env` file in your project root (or copy from `.env.example`):
66
+
67
+ ```bash
68
+ cp .env.example .env
69
+ ```
70
+
71
+ Edit `.env` and add your API key:
72
+
73
+ ```env
74
+ MISTRAL_API_KEY=your_actual_api_key_here
75
+ ```
76
+
77
+ ### 2. Alternative: Export as environment variable
78
+
79
+ ```bash
80
+ export MISTRAL_API_KEY="your_actual_api_key_here"
81
+ ```
82
+
83
+ ## Usage
84
+
85
+ ### Basic Usage
86
+
87
+ Process a single PDF file (output saved to `mistral_ocr_output/` in the same directory):
88
+
89
+ ```bash
90
+ mistral-ocr document.pdf
91
+ ```
92
+
93
+ ### Specify Output Directory
94
+
95
+ ```bash
96
+ mistral-ocr document.pdf --output-path ./results
97
+ ```
98
+
99
+ ### Process Entire Directory
100
+
101
+ ```bash
102
+ mistral-ocr ./documents --output-path ./extracted
103
+ ```
104
+
105
+ ### Command-Line Options
106
+
107
+ ```
108
+ Usage: mistral-ocr INPUT_PATH [OPTIONS]
109
+
110
+ Arguments:
111
+ INPUT_PATH Path to input file or directory (required)
112
+
113
+ Options:
114
+ -o, --output-path PATH Output directory (default: <input_dir>/mistral_ocr_output/)
115
+ --api-key TEXT Mistral API key (or set MISTRAL_API_KEY env var)
116
+ --model TEXT OCR model (default: mistral-ocr-latest)
117
+ --env-file PATH Path to .env file
118
+ --include-images/--no-images Extract images (default: True)
+ --add-timestamp/--no-timestamp Add timestamp to output folder name (default: False)
119
+ -v, --verbose Enable verbose output
120
+ --version Show version
121
+ --help Show this message
122
+ ```
123
+
124
+ ## Examples
125
+
126
+ ### Process a single image
127
+
128
+ ```bash
129
+ mistral-ocr photo.jpg
130
+ ```
131
+
132
+ ### Process multiple PDFs with custom output
133
+
134
+ ```bash
135
+ mistral-ocr ./reports --output-path ./extracted_text --verbose
136
+ ```
137
+
138
+ ### Use a different .env file
139
+
140
+ ```bash
141
+ mistral-ocr document.pdf --env-file .env.production
142
+ ```
143
+
144
+ ### Process without extracting images
145
+
146
+ ```bash
147
+ mistral-ocr document.pdf --no-images
148
+ ```
149
+
150
+ ### Pass API key directly (not recommended for production)
151
+
152
+ ```bash
153
+ mistral-ocr doc.pdf --api-key "your_api_key_here"
154
+ ```
155
+
156
+ ## Output Structure
157
+
158
+ The tool creates the following output structure:
159
+
160
+ ```
161
+ mistral_ocr_output/
162
+ ├── document1.md # Extracted text in markdown format
163
+ ├── document1_images/ # Extracted images (if enabled)
164
+ │ ├── page1_img1.png
165
+ │ └── page1_img2.png
166
+ ├── document2.md
167
+ ├── document2_images/
168
+ │ └── ...
169
+ └── metadata.json # Processing statistics and errors
170
+ ```
171
+
172
+ ### Markdown Output
173
+
174
+ Each processed document generates a markdown file containing:
175
+ - Document metadata (source, processing time)
176
+ - Extracted text with preserved formatting
177
+ - Tables rendered in markdown format
178
+ - Mathematical equations
179
+ - Image references (if image extraction is enabled)
180
+
181
+ ### Metadata File
182
+
183
+ The `metadata.json` file contains:
184
+ - List of processed files
185
+ - Processing time
186
+ - File sizes
187
+ - Output paths
188
+ - Any errors encountered
189
+
190
+ ## Limitations
191
+
192
+ - Maximum file size: 50 MB
193
+ - Maximum pages per document: 1,000
194
+ - Supported formats: PDF, JPG, JPEG, PNG, WEBP, GIF, BMP, TIFF
195
+
196
+ ## Pricing
197
+
198
+ Mistral OCR API pricing: $1 per 1,000 pages ($0.001 per page)
199
+
200
+ ## Development
201
+
202
+ ### Run tests
203
+
204
+ ```bash
205
+ poetry run pytest
206
+ ```
207
+
208
+ ### Format code
209
+
210
+ ```bash
211
+ poetry run black mistral_ocr/
212
+ poetry run ruff check mistral_ocr/
213
+ ```
214
+
215
+ ### Type checking
216
+
217
+ ```bash
218
+ poetry run mypy mistral_ocr/
219
+ ```
220
+
221
+ ## Troubleshooting
222
+
223
+ ### API Key Not Found
224
+
225
+ If you get an error about missing API key:
226
+ 1. Ensure `.env` file exists and contains `MISTRAL_API_KEY=your_key`
227
+ 2. Or export it: `export MISTRAL_API_KEY="your_key"`
228
+ 3. Or pass it directly: `mistral-ocr --api-key "your_key" ...`
229
+
230
+ ### File Size Error
231
+
232
+ If a file exceeds 50 MB:
233
+ - Consider splitting large PDFs into smaller parts
234
+ - Compress images before processing
235
+
236
+ ### Installation Issues
237
+
238
+ If the command is not found after installation:
239
+ 1. Ensure the package is installed: `pip show mistral-ocr-cli`
240
+ 2. Check your PATH includes pip's script directory
241
+ 3. Try reinstalling with: `pip install -e .`
242
+
243
+ ## License
244
+
245
+ MIT License - see LICENSE file for details
246
+
247
+ ## Contributing
248
+
249
+ Contributions are welcome! Please feel free to submit a Pull Request.
250
+
251
+ ## Support
252
+
253
+ For issues or questions about this CLI tool, please open an issue on [GitHub](https://github.com/r-uben/mistral-ocr-cli/issues).
254
+
255
+ For questions about Mistral AI's OCR API, please refer to [Mistral's official documentation](https://docs.mistral.ai) or contact their support.
256
+
257
+ ## Legal
258
+
259
+ "Mistral AI" and "Mistral" are trademarks of Mistral AI. This project is not affiliated with or endorsed by Mistral AI. The use of Mistral AI's OCR API is subject to Mistral AI's [Terms of Service](https://mistral.ai/terms/).
@@ -0,0 +1,9 @@
1
+ """Mistral OCR CLI - A clean command-line tool for OCR processing using Mistral AI."""
2
+
3
+ __version__ = "1.0.0"
4
+ __author__ = "Ruben Fernandez-Fuertes"
5
+
6
+ from .processor import OCRProcessor
7
+ from .config import Config
8
+
9
+ __all__ = ["OCRProcessor", "Config"]
@@ -0,0 +1,6 @@
1
+ """Entry point for running mistral_ocr as a module."""
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
@@ -0,0 +1,159 @@
1
+ """Command-line interface for Mistral OCR."""
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import click
9
+ from rich.console import Console
10
+
11
+ from .config import Config
12
+ from .processor import OCRProcessor
13
+
14
+
15
+ console = Console()
16
+
17
+ # Get the original working directory if set
18
+ ORIGINAL_CWD = os.environ.get('MISTRAL_OCR_CWD', os.getcwd())
19
+
20
+
21
# CLI entry point.
#
# Resolves the input/output paths against ORIGINAL_CWD, builds a Config from the
# environment (optionally from an explicit .env file), applies the command-line
# overrides, and hands the work to OCRProcessor.
#
# Exit codes: 1 for a ValueError or any unexpected exception, 130 when the user
# interrupts with Ctrl-C.
#
# NOTE: the function docstring below is the text click shows for --help, so it
# is user-facing output rather than developer documentation.
@click.command()
@click.argument(
    "input_path",
    type=click.Path(path_type=Path),
    required=True
)
@click.option(
    "--output-path", "-o",
    type=click.Path(path_type=Path),
    required=False,
    help="Path to output directory (default: <input_dir>/mistral_ocr_output/)"
)
@click.option(
    "--api-key",
    type=str,
    envvar="MISTRAL_API_KEY",
    help="Mistral API key (can also be set via MISTRAL_API_KEY env var)"
)
@click.option(
    "--model",
    type=str,
    default="mistral-ocr-latest",
    help="Mistral OCR model to use (default: mistral-ocr-latest)"
)
@click.option(
    "--env-file",
    type=click.Path(exists=True, path_type=Path),
    help="Path to .env file containing configuration"
)
@click.option(
    "--include-images/--no-images",
    default=True,
    help="Include extracted images in output (default: True)"
)
@click.option(
    "--add-timestamp/--no-timestamp",
    default=False,
    help="Add timestamp to output folder name (default: False)"
)
@click.option(
    "--verbose", "-v",
    is_flag=True,
    help="Enable verbose output"
)
@click.version_option(version="1.0.0", prog_name="mistral-ocr")
def main(
    input_path: Path,
    output_path: Optional[Path],
    api_key: Optional[str],
    model: str,
    env_file: Optional[Path],
    include_images: bool,
    add_timestamp: bool,
    verbose: bool
) -> None:
    """
    Mistral OCR - Process documents using Mistral AI's OCR API.

    This tool processes PDF and image files using Mistral's powerful OCR capabilities,
    extracting text, tables, equations, and images with high accuracy.

    Examples:

    # Process a single PDF file
    mistral-ocr document.pdf

    # Process all files in a directory
    mistral-ocr ./documents --output-path ./results

    # Use a specific .env file
    mistral-ocr doc.pdf --env-file .env.production
    """
    try:
        # Resolve input path relative to original working directory
        if not input_path.is_absolute():
            input_path = Path(ORIGINAL_CWD) / input_path

        # Check if input path exists
        if not input_path.exists():
            raise ValueError(f"Input path does not exist: {input_path}")

        # Resolve output path if provided
        if output_path and not output_path.is_absolute():
            output_path = Path(ORIGINAL_CWD) / output_path

        # Print header
        console.print("\n[bold blue]🔍 Mistral OCR[/bold blue]")
        console.print("[dim]Powered by Mistral AI's OCR API[/dim]\n")

        # Load configuration
        if verbose:
            console.print("[dim]Loading configuration...[/dim]")

        # Export an explicitly supplied key *before* the configuration is
        # loaded, and do so whether or not --env-file was given. Previously the
        # key was dropped whenever --env-file was used, so
        # `--api-key KEY --env-file FILE` failed if FILE had no key.
        # NOTE(review): this relies on load_dotenv() keeping already-set
        # environment variables (its default, override=False) so that the
        # command-line key wins over the file.
        if api_key:
            os.environ["MISTRAL_API_KEY"] = api_key

        # Config.from_env falls back to the default .env lookup when env_file
        # is None.
        config = Config.from_env(env_file)

        # Override config with CLI options
        config.model = model
        config.include_images = include_images
        config.verbose = verbose

        # Create processor
        processor = OCRProcessor(config)

        # Process input
        processor.process(input_path, output_path, add_timestamp=add_timestamp)

        # Print summary
        if processor.errors and verbose:
            console.print("\n[yellow]⚠ Errors encountered:[/yellow]")
            for error in processor.errors:
                console.print(f"  [red]• {error['file']}: {error['error']}[/red]")

        console.print("\n[bold green]✨ Processing complete![/bold green]\n")

    except ValueError as e:
        console.print(f"\n[red]Error: {e}[/red]\n")
        sys.exit(1)
    except KeyboardInterrupt:
        console.print("\n[yellow]Processing interrupted by user.[/yellow]\n")
        sys.exit(130)
    except Exception as e:
        console.print(f"\n[red]Unexpected error: {e}[/red]\n")
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
156
+
157
+
158
+ if __name__ == "__main__":
159
+ main()
@@ -0,0 +1,55 @@
1
+ """Configuration module for Mistral OCR."""
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from dotenv import load_dotenv
9
+
10
+
11
@dataclass
class Config:
    """Settings consumed by the Mistral OCR tool.

    Only ``api_key`` is mandatory; every other field has a default and can be
    supplied through an environment variable when built with ``from_env``.
    """

    api_key: str
    model: str = "mistral-ocr-latest"
    max_file_size_mb: int = 50
    max_pages: int = 1000
    output_format: str = "markdown"
    include_images: bool = True
    verbose: bool = False

    @classmethod
    def from_env(cls, env_file: Optional[Path] = None) -> "Config":
        """Build a ``Config`` from environment variables.

        Args:
            env_file: Optional dotenv file to load first. When it is missing or
                not given, ``load_dotenv()`` is called without arguments.

        Returns:
            A populated ``Config``.

        Raises:
            ValueError: If ``MISTRAL_API_KEY`` is unset or empty, or if
                ``MAX_FILE_SIZE_MB`` / ``MAX_PAGES`` is not an integer.
        """
        if not env_file or not env_file.exists():
            load_dotenv()
        else:
            load_dotenv(env_file)

        key = os.getenv("MISTRAL_API_KEY")
        if not key:
            raise ValueError(
                "MISTRAL_API_KEY not found in environment variables. "
                "Please set it or create a .env file."
            )

        def _flag(name: str, default: str) -> bool:
            # Only the literal "true" (any letter case) enables a flag.
            return os.getenv(name, default).lower() == "true"

        settings = {
            "api_key": key,
            "model": os.getenv("MISTRAL_MODEL", "mistral-ocr-latest"),
            "max_file_size_mb": int(os.getenv("MAX_FILE_SIZE_MB", "50")),
            "max_pages": int(os.getenv("MAX_PAGES", "1000")),
            "output_format": os.getenv("OUTPUT_FORMAT", "markdown"),
            "include_images": _flag("INCLUDE_IMAGES", "true"),
            "verbose": _flag("VERBOSE", "false"),
        }
        return cls(**settings)

    def validate_file_size(self, file_path: Path) -> None:
        """Raise ``ValueError`` when ``file_path`` is larger than ``max_file_size_mb``.

        The size is measured in MiB (bytes / 1024 / 1024).
        """
        size_mb = file_path.stat().st_size / (1024 * 1024)
        if size_mb <= self.max_file_size_mb:
            return
        raise ValueError(
            f"File size ({size_mb:.2f} MB) exceeds maximum allowed size "
            f"({self.max_file_size_mb} MB)"
        )
@@ -0,0 +1,260 @@
1
+ """Core OCR processing module using Mistral AI."""
2
+
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+ from mistralai import Mistral
8
+ from rich.console import Console
9
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
10
+
11
+ from .config import Config
12
+ from .utils import (
13
+ create_data_uri,
14
+ determine_output_path,
15
+ format_file_size,
16
+ get_supported_files,
17
+ sanitize_filename,
18
+ save_base64_image,
19
+ save_metadata,
20
+ )
21
+
22
+
23
+ console = Console()
24
+
25
+
26
class OCRProcessor:
    """Runs files through the Mistral OCR endpoint and writes the results to disk.

    Attributes:
        config: The ``Config`` the processor was created with.
        client: The ``Mistral`` client built from ``config.api_key``.
        errors: One ``{"file", "error"}`` dict per file that failed.
        processed_files: One ``{"file", "size", "output"}`` dict per file that
            was processed and saved.
    """

    def __init__(self, config: Config):
        """Create the Mistral client; re-raises any error the client constructor throws."""
        self.config = config
        try:
            self.client = Mistral(api_key=config.api_key)
        except Exception as e:
            console.print(f"[red]Failed to initialize Mistral client: {e}[/red]")
            raise
        self.errors: List[Dict] = []
        self.processed_files: List[Dict] = []

    def process_file(self, file_path: Path) -> Optional[Dict]:
        """Send one file to the OCR endpoint.

        Args:
            file_path: PDF or image to process.

        Returns:
            ``{"file_path", "response", "success": True}`` on success. On any
            failure the error is printed, appended to ``self.errors`` and
            ``None`` is returned; no exception propagates.
        """
        try:
            # Validate file size
            file_size_mb = file_path.stat().st_size / (1024 * 1024)
            if self.config.verbose:
                console.print(f"[dim]File size: {file_size_mb:.2f} MB[/dim]")
            self.config.validate_file_size(file_path)

            # Create data URI for the file
            if self.config.verbose:
                console.print(f"[dim]Creating data URI for {file_path.suffix} file...[/dim]")
            data_uri = create_data_uri(file_path)

            # PDFs are sent as a document; everything else as an image.
            if file_path.suffix.lower() == ".pdf":
                document = {
                    "type": "document_url",
                    "document_url": data_uri
                }
            else:
                document = {
                    "type": "image_url",
                    "image_url": data_uri
                }

            # Fail with an explanatory message if the client has no OCR endpoint.
            if not hasattr(self.client, 'ocr'):
                raise AttributeError(
                    "OCR endpoint not available in Mistral client. "
                    "Please ensure you have the latest mistralai package "
                    "and OCR access enabled for your API key."
                )

            if self.config.verbose:
                console.print("[dim]Sending to Mistral OCR API...[/dim]")
                console.print(f"[dim]Model: {self.config.model}[/dim]")

            response = self.client.ocr.process(
                model=self.config.model,
                document=document,
                include_image_base64=self.config.include_images
            )

            return {
                "file_path": file_path,
                "response": response,
                "success": True
            }

        except Exception as e:
            error_msg = f"Error processing {file_path.name}: {str(e)}"
            # Always show errors, not just in verbose mode
            console.print(f"[red]{error_msg}[/red]")
            if self.config.verbose:
                import traceback
                console.print(f"[dim]{traceback.format_exc()}[/dim]")
            self.errors.append({
                "file": str(file_path),
                "error": str(e)
            })
            return None

    def save_results(
        self,
        result: Dict,
        output_dir: Path,
        is_single_file: bool = False
    ) -> None:
        """Write the markdown (and optionally the images) for one processed file.

        Args:
            result: Dict returned by ``process_file``.
            output_dir: Directory that receives the output.
            is_single_file: When True the markdown is ``output.md`` and images
                go to ``images/``. Otherwise the markdown is
                ``<sanitized stem>.md`` and images go to
                ``<sanitized stem>_images/``.
        """
        file_path = result["file_path"]
        response = result["response"]

        if is_single_file:
            # For single files, use simpler naming
            base_name = "output"
            images_dir_name = "images"
        else:
            # For multiple files, use sanitized filename
            base_name = sanitize_filename(file_path.stem, max_length=40)
            # Each document needs its own image folder: image file names are
            # only unique per document (pageN_imgM.png), so a shared folder
            # lets later documents overwrite the images of earlier ones.
            images_dir_name = f"{base_name}_images"
        # NOTE(review): two inputs with the same sanitized stem (e.g. a.pdf and
        # a.png, or equal names in different sub-folders) still map to the same
        # markdown file.
        markdown_path = output_dir / f"{base_name}.md"

        markdown_content = []

        # Add file header
        markdown_content.append("# OCR Results\n\n")
        markdown_content.append(f"**Original File:** {file_path.name}\n")
        markdown_content.append(f"**Full Path:** `{file_path}`\n")
        markdown_content.append(f"**Processed:** {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        markdown_content.append("---\n\n")

        # Process each page
        if hasattr(response, 'pages'):
            for page in response.pages:
                markdown_content.append(f"## Page {page.index + 1}\n\n")

                # Add extracted text
                if hasattr(page, 'markdown'):
                    markdown_content.append(page.markdown)
                    markdown_content.append("\n\n")

                # Save images if included
                if self.config.include_images and hasattr(page, 'images') and page.images:
                    images_dir = output_dir / images_dir_name
                    images_dir.mkdir(parents=True, exist_ok=True)

                    for idx, image in enumerate(page.images):
                        # NOTE(review): the Mistral SDK is expected to expose
                        # the payload as `image_base64` (confirm against the
                        # installed version); `base64` is kept as a fallback.
                        encoded = (
                            getattr(image, "image_base64", None)
                            or getattr(image, "base64", None)
                        )
                        if not encoded:
                            # Nothing to decode for this image.
                            continue
                        # Strip a "data:<mime>;base64," header if present, so
                        # only the base64 payload is decoded.
                        if isinstance(encoded, str) and encoded.startswith("data:") and "," in encoded:
                            encoded = encoded.split(",", 1)[1]

                        image_filename = f"page{page.index + 1}_img{idx + 1}.png"
                        image_path = images_dir / image_filename
                        save_base64_image(encoded, image_path)

                        # Add image reference to markdown
                        markdown_content.append(
                            f"![Image {idx + 1}](./{images_dir_name}/{image_filename})\n\n"
                        )

        # Write markdown file
        markdown_path.parent.mkdir(parents=True, exist_ok=True)
        with open(markdown_path, "w", encoding="utf-8") as f:
            f.write("".join(markdown_content))

        if self.config.verbose:
            console.print(f"[green]✓[/green] Saved results to {markdown_path}")

    def process_directory(
        self,
        input_dir: Path,
        output_dir: Optional[Path] = None,
        add_timestamp: bool = False
    ) -> Tuple[int, int]:
        """Process every supported file found under ``input_dir``.

        Args:
            input_dir: Directory to scan with ``get_supported_files``.
            output_dir: Optional output location, passed to
                ``determine_output_path``.
            add_timestamp: Forwarded to ``determine_output_path``.

        Returns:
            ``(successful_count, total_count)``; ``(0, 0)`` when no supported
            file is found.
        """
        files = get_supported_files(input_dir)

        if not files:
            console.print("[yellow]No supported files found in the directory.[/yellow]")
            return 0, 0

        output_path = determine_output_path(input_dir, output_dir, add_timestamp=add_timestamp)
        console.print(f"[blue]Processing {len(files)} file(s)...[/blue]")
        console.print(f"[blue]Output directory: {output_path}[/blue]\n")

        start_time = time.time()
        success_count = 0

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TimeRemainingColumn(),
            console=console
        ) as progress:
            task = progress.add_task("Processing files...", total=len(files))

            for file_path in files:
                file_size = format_file_size(file_path.stat().st_size)
                progress.update(
                    task,
                    description=f"Processing {file_path.name} ({file_size})..."
                )

                result = self.process_file(file_path)
                if result:
                    self.save_results(result, output_path, is_single_file=False)
                    success_count += 1
                    # Must mirror the markdown name chosen in save_results.
                    base_name = sanitize_filename(file_path.stem, max_length=40)
                    self.processed_files.append({
                        "file": str(file_path),
                        "size": file_path.stat().st_size,
                        "output": str(output_path / f"{base_name}.md")
                    })

                progress.update(task, advance=1)

        # Save metadata
        processing_time = time.time() - start_time
        save_metadata(output_path, self.processed_files, processing_time, self.errors)

        return success_count, len(files)

    def process(
        self,
        input_path: Path,
        output_path: Optional[Path] = None,
        add_timestamp: bool = False
    ) -> None:
        """Process ``input_path``, which may be a single file or a directory.

        Args:
            input_path: File or directory to process.
            output_path: Optional output location, passed to
                ``determine_output_path``.
            add_timestamp: Forwarded to ``determine_output_path``.

        Raises:
            ValueError: If ``input_path`` is neither a file nor a directory.
        """
        if input_path.is_file():
            # Process single file
            output_dir = determine_output_path(input_path, output_path, add_timestamp=add_timestamp)
            console.print(f"[blue]Processing file: {input_path}[/blue]")
            console.print(f"[blue]Output directory: {output_dir}[/blue]\n")

            start_time = time.time()
            result = self.process_file(input_path)

            if result:
                self.save_results(result, output_dir, is_single_file=True)
                self.processed_files.append({
                    "file": str(input_path),
                    "size": input_path.stat().st_size,
                    "output": str(output_dir / "output.md")
                })

                # Save metadata
                processing_time = time.time() - start_time
                save_metadata(output_dir, self.processed_files, processing_time, self.errors)

                console.print("\n[green]✓ Successfully processed 1 file[/green]")
                console.print(f"[dim]Processing time: {processing_time:.2f} seconds[/dim]")
            else:
                console.print("\n[red]✗ Failed to process file[/red]")

        elif input_path.is_dir():
            # Process directory
            success_count, total_count = self.process_directory(input_path, output_path, add_timestamp)

            console.print(f"\n[green]✓ Successfully processed {success_count}/{total_count} files[/green]")
            if self.errors:
                console.print(f"[red]✗ {len(self.errors)} file(s) failed[/red]")

        else:
            raise ValueError(f"Input path does not exist: {input_path}")
@@ -0,0 +1,132 @@
1
+ """Utility functions for Mistral OCR."""
2
+
3
+ import base64
4
+ import json
5
+ import mimetypes
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+
10
def encode_file_to_base64(file_path: Path) -> str:
    """Read ``file_path`` in binary mode and return its content as base64 text."""
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
14
+
15
+
16
def get_mime_type(file_path: Path) -> str:
    """Return the MIME type for ``file_path``.

    The standard ``mimetypes`` guess is used when it yields a result. If it
    does not, a small extension table covering PDF, JPEG, PNG and WebP is
    consulted instead.

    Raises:
        ValueError: If neither ``mimetypes`` nor the fallback table knows the
            file's extension.
    """
    guessed = mimetypes.guess_type(str(file_path))[0]
    if guessed:
        return guessed

    fallback_by_suffix = {
        ".pdf": "application/pdf",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".webp": "image/webp",
    }
    suffix = file_path.suffix.lower()
    if suffix not in fallback_by_suffix:
        # The message reports the suffix exactly as it appears in the path.
        raise ValueError(f"Unsupported file type: {file_path.suffix}")
    return fallback_by_suffix[suffix]
31
+
32
+
33
def create_data_uri(file_path: Path) -> str:
    """Build a ``data:<mime>;base64,<payload>`` URI from the file's content.

    The MIME type is resolved first, so an unsupported file type raises before
    the file content is read.
    """
    return "data:{};base64,{}".format(
        get_mime_type(file_path),
        encode_file_to_base64(file_path),
    )
38
+
39
+
40
def save_base64_image(base64_string: str, output_path: Path) -> None:
    """Decode a base64 encoded image and write it to ``output_path``.

    Args:
        base64_string: Either a bare base64 payload or a full data URI of the
            form ``data:<mime>;base64,<payload>``.
        output_path: Destination file. Missing parent directories are created.
    """
    payload = base64_string
    # A data-URI header must be removed before decoding: b64decode silently
    # drops ':' ';' ',' but treats the header's letters as payload, which
    # would produce a corrupt image file.
    if isinstance(payload, str) and payload.startswith("data:") and "," in payload:
        payload = payload.split(",", 1)[1]

    image_data = base64.b64decode(payload)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "wb") as f:
        f.write(image_data)
46
+
47
+
48
def get_supported_files(directory: Path) -> List[Path]:
    """Recursively collect the files under ``directory`` that have a supported extension.

    Extensions are compared case-insensitively. The result is sorted.
    """
    allowed_suffixes = frozenset(
        (".pdf", ".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff")
    )
    matches = [
        candidate
        for candidate in directory.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in allowed_suffixes
    ]
    matches.sort()
    return matches
58
+
59
+
60
def determine_output_path(
    input_path: Path,
    output_path: Optional[Path] = None,
    default_folder_name: str = "mistral_ocr_output",
    add_timestamp: bool = False
) -> Path:
    """Determine (and create) the output directory for OCR results.

    Args:
        input_path: The file or directory being processed.
        output_path: Explicit output directory. When given it is returned
            unchanged and takes precedence over every other argument.
        default_folder_name: Folder name used when no ``output_path`` is given.
            It is created next to an input file, or inside an input directory.
        add_timestamp: Append ``_YYYYMMDD_HHMMSS`` to the default folder name.

    Returns:
        The output directory, which exists when this function returns.
    """
    if output_path:
        # Create the explicit directory too, so both branches return a path
        # that can be written to immediately.
        Path(output_path).mkdir(parents=True, exist_ok=True)
        return output_path

    parent_dir = input_path.parent if input_path.is_file() else input_path

    folder_name = default_folder_name
    if add_timestamp:
        import time
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        folder_name = f"{default_folder_name}_{timestamp}"

    output_dir = parent_dir / folder_name
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir
86
+
87
+
88
def save_metadata(
    output_dir: Path,
    files_processed: List[Dict],
    processing_time: float,
    errors: List[Dict]
) -> None:
    """Write a ``metadata.json`` summary of an OCR run into ``output_dir``.

    The summary holds the processed-file records, their count, the elapsed
    time in seconds, the error records and their count. Values that JSON
    cannot encode natively are written as their ``str()`` form.
    """
    summary = dict(
        files_processed=files_processed,
        total_files=len(files_processed),
        processing_time_seconds=processing_time,
        errors=errors,
        error_count=len(errors),
    )

    target = output_dir / "metadata.json"
    with open(target, "w") as out:
        json.dump(summary, out, indent=2, default=str)
106
+
107
+
108
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a two-decimal string with a B/KB/MB/GB/TB unit.

    Units step by a factor of 1024. Anything at or beyond the terabyte range
    is reported in TB.
    """
    unit_labels = ("B", "KB", "MB", "GB", "TB")
    largest = len(unit_labels) - 1

    value = size_bytes
    position = 0
    # Keep scaling down until the value fits under 1024 or TB is reached.
    while position < largest and not value < 1024.0:
        value /= 1024.0
        position += 1
    return f"{value:.2f} {unit_labels[position]}"
115
+
116
+
117
def sanitize_filename(filename: str, max_length: int = 50) -> str:
    """Sanitize a filename and cap its length.

    Each of the characters ``< > : " / \\ | ? *`` is replaced with ``_``. A
    name longer than ``max_length`` is shortened with a ``...`` marker, and
    the extension is kept whenever it fits.

    Args:
        filename: The name to clean up.
        max_length: Maximum length of the returned name.

    Returns:
        The sanitized name, never longer than ``max_length`` (for a
        non-negative ``max_length``).
    """
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        filename = filename.replace(char, "_")

    if len(filename) <= max_length:
        return filename

    marker = "..."
    if max_length < len(marker):
        # No room for the marker at all, so cut hard.
        return filename[:max(max_length, 0)]

    if "." in filename:
        name, ext = filename.rsplit(".", 1)
        # Characters of the stem that fit next to "..." + "." + ext.
        keep = max_length - len(ext) - len(marker) - 1
        # A negative budget would turn the slice below into a from-the-end
        # slice and overshoot max_length. In that case the extension cannot
        # be preserved and the plain truncation below is used.
        if keep >= 0:
            return f"{name[:keep]}{marker}.{ext}"

    return filename[:max_length - len(marker)] + marker
@@ -0,0 +1,66 @@
1
[tool.poetry]
name = "mistral-ocr-cli"
version = "1.0.0"
description = "A clean command-line tool for OCR processing using Mistral AI's API"
authors = ["Ruben Fernandez-Fuertes <fernandezfuertesruben@gmail.com>"]
readme = "README.md"
packages = [{include = "mistral_ocr"}]
license = "MIT"
homepage = "https://github.com/rubenfernandez/mistral-ocr-cli"
repository = "https://github.com/rubenfernandez/mistral-ocr-cli"
keywords = ["ocr", "mistral", "pdf", "cli", "command-line", "document-processing", "text-extraction", "image-processing"]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Environment :: Console",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Text Processing",
    "Topic :: Utilities",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]

[tool.poetry.dependencies]
python = "^3.9"
mistralai = "^1.0.0"
python-dotenv = "^1.0.0"
click = "^8.1.7"
rich = "^13.7.0"
Pillow = "^10.2.0"
pypdf = "^4.0.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8.0.0"
black = "^24.0.0"
ruff = "^0.3.0"
mypy = "^1.8.0"
pytest-cov = "^4.1.0"

[tool.poetry.scripts]
mistral-ocr = "mistral_ocr.cli:main"

[build-system]
# Lower bound guards against building with a pre-1.0 poetry-core.
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 100
target-version = ['py39']

[tool.ruff]
line-length = 100
# Same minimum Python as [tool.poetry.dependencies]; there is no [project]
# table for Ruff to read it from.
target-version = "py39"

# Rule selection lives under `lint`: the top-level `select`/`ignore` keys are
# deprecated since Ruff 0.2, and the dev dependency pins ruff ^0.3.0.
[tool.ruff.lint]
select = ["E", "F", "I", "N", "UP", "B", "C4", "SIM"]
ignore = ["E501"]

[tool.mypy]
python_version = "3.9"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true