mistral-ocr-cli 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistral_ocr_cli-1.0.0/LICENSE +21 -0
- mistral_ocr_cli-1.0.0/PKG-INFO +294 -0
- mistral_ocr_cli-1.0.0/README.md +259 -0
- mistral_ocr_cli-1.0.0/mistral_ocr/__init__.py +9 -0
- mistral_ocr_cli-1.0.0/mistral_ocr/__main__.py +6 -0
- mistral_ocr_cli-1.0.0/mistral_ocr/cli.py +159 -0
- mistral_ocr_cli-1.0.0/mistral_ocr/config.py +55 -0
- mistral_ocr_cli-1.0.0/mistral_ocr/processor.py +260 -0
- mistral_ocr_cli-1.0.0/mistral_ocr/utils.py +132 -0
- mistral_ocr_cli-1.0.0/pyproject.toml +66 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Rubén Fernández-Fuertes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: mistral-ocr-cli
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A clean command-line tool for OCR processing using Mistral AI's API
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: ocr,mistral,pdf,cli,command-line,document-processing,text-extraction,image-processing
|
|
7
|
+
Author: Ruben Fernandez-Fuertes
|
|
8
|
+
Author-email: fernandezfuertesruben@gmail.com
|
|
9
|
+
Requires-Python: >=3.9,<4.0
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Topic :: Text Processing
|
|
25
|
+
Classifier: Topic :: Utilities
|
|
26
|
+
Requires-Dist: Pillow (>=10.2.0,<11.0.0)
|
|
27
|
+
Requires-Dist: click (>=8.1.7,<9.0.0)
|
|
28
|
+
Requires-Dist: mistralai (>=1.0.0,<2.0.0)
|
|
29
|
+
Requires-Dist: pypdf (>=4.0.0,<5.0.0)
|
|
30
|
+
Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
|
|
31
|
+
Requires-Dist: rich (>=13.7.0,<14.0.0)
|
|
32
|
+
Project-URL: Homepage, https://github.com/rubenfernandez/mistral-ocr-cli
|
|
33
|
+
Project-URL: Repository, https://github.com/rubenfernandez/mistral-ocr-cli
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# Mistral OCR CLI
|
|
37
|
+
|
|
38
|
+
A powerful command-line tool for OCR processing using Mistral AI's state-of-the-art OCR API. Process PDFs and images to extract text, tables, equations, and images with unprecedented accuracy.
|
|
39
|
+
|
|
40
|
+
## ⚠️ Disclaimer
|
|
41
|
+
|
|
42
|
+
**This is an unofficial, community-created tool** that uses Mistral AI's OCR API. This project is not affiliated with, officially maintained, or endorsed by Mistral AI.
|
|
43
|
+
|
|
44
|
+
- **Official Mistral OCR**: https://mistral.ai/news/mistral-ocr
|
|
45
|
+
- **Official Documentation**: https://docs.mistral.ai/capabilities/OCR/
|
|
46
|
+
- **Mistral AI Platform**: https://console.mistral.ai/
|
|
47
|
+
|
|
48
|
+
For official tools and support, please visit [Mistral AI's website](https://mistral.ai).
|
|
49
|
+
|
|
50
|
+
## Features
|
|
51
|
+
|
|
52
|
+
- 📄 **Multi-format Support**: Process PDFs and images (JPG, PNG, WEBP, GIF, BMP, TIFF)
|
|
53
|
+
- 📊 **Advanced Extraction**: Extract text, tables, equations, and images
|
|
54
|
+
- 📁 **Batch Processing**: Process single files or entire directories
|
|
55
|
+
- 🎯 **Smart Output**: Preserves document structure in markdown format
|
|
56
|
+
- 🖼️ **Image Extraction**: Optionally extract and save embedded images
|
|
57
|
+
- 📈 **Progress Tracking**: Real-time progress bars for batch operations
|
|
58
|
+
- 🔧 **Flexible Configuration**: Environment variables or command-line options
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
### Prerequisites
|
|
63
|
+
|
|
64
|
+
- Python 3.9 or higher
|
|
65
|
+
- Mistral API key from [Mistral Console](https://console.mistral.ai/)
|
|
66
|
+
|
|
67
|
+
### Install with Poetry (Recommended)
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Clone the repository
|
|
71
|
+
git clone https://github.com/yourusername/mistral-ocr.git
|
|
72
|
+
cd mistral-ocr
|
|
73
|
+
|
|
74
|
+
# Install Poetry if you haven't already
|
|
75
|
+
curl -sSL https://install.python-poetry.org | python3 -
|
|
76
|
+
|
|
77
|
+
# Install dependencies and the package
|
|
78
|
+
poetry install
|
|
79
|
+
|
|
80
|
+
# Make the command available globally
|
|
81
|
+
poetry build
|
|
82
|
+
pip install dist/mistral_ocr_cli-*.whl
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Install with pip
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Clone the repository
|
|
89
|
+
git clone https://github.com/yourusername/mistral-ocr.git
|
|
90
|
+
cd mistral-ocr
|
|
91
|
+
|
|
92
|
+
# Install in editable mode for global usage
|
|
93
|
+
pip install -e .
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Configuration
|
|
97
|
+
|
|
98
|
+
### 1. Set up your Mistral API key
|
|
99
|
+
|
|
100
|
+
Create a `.env` file in your project root (or copy from `.env.example`):
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
cp .env.example .env
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Edit `.env` and add your API key:
|
|
107
|
+
|
|
108
|
+
```env
|
|
109
|
+
MISTRAL_API_KEY=your_actual_api_key_here
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### 2. Alternative: Export as environment variable
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
export MISTRAL_API_KEY="your_actual_api_key_here"
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Usage
|
|
119
|
+
|
|
120
|
+
### Basic Usage
|
|
121
|
+
|
|
122
|
+
Process a single PDF file (output saved to `mistral_ocr_output/` in the same directory):
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
mistral-ocr document.pdf
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Specify Output Directory
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
mistral-ocr document.pdf --output-path ./results
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Process Entire Directory
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
mistral-ocr ./documents --output-path ./extracted
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Command-Line Options
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
Usage: mistral-ocr INPUT_PATH [OPTIONS]
|
|
144
|
+
|
|
145
|
+
Arguments:
|
|
146
|
+
INPUT_PATH Path to input file or directory (required)
|
|
147
|
+
|
|
148
|
+
Options:
|
|
149
|
+
-o, --output-path PATH Output directory (default: <input_dir>/mistral_ocr_output/)
|
|
150
|
+
--api-key TEXT Mistral API key (or set MISTRAL_API_KEY env var)
|
|
151
|
+
--model TEXT OCR model (default: mistral-ocr-latest)
|
|
152
|
+
--env-file PATH Path to .env file
|
|
153
|
+
--include-images/--no-images Extract images (default: True)
--add-timestamp/--no-timestamp Add timestamp to output folder name (default: False)
|
|
154
|
+
-v, --verbose Enable verbose output
|
|
155
|
+
--version Show version
|
|
156
|
+
--help Show this message
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Examples
|
|
160
|
+
|
|
161
|
+
### Process a single image
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
mistral-ocr photo.jpg
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Process multiple PDFs with custom output
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
mistral-ocr ./reports --output-path ./extracted_text --verbose
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Use a different .env file
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
mistral-ocr document.pdf --env-file .env.production
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Process without extracting images
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
mistral-ocr document.pdf --no-images
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Pass API key directly (not recommended for production)
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
mistral-ocr doc.pdf --api-key "your_api_key_here"
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Output Structure
|
|
192
|
+
|
|
193
|
+
The tool creates the following output structure:
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
mistral_ocr_output/
|
|
197
|
+
├── document1.md # Extracted text in markdown format
|
|
198
|
+
├── document1_images/ # Extracted images (if enabled)
|
|
199
|
+
│ ├── page1_img1.png
|
|
200
|
+
│ └── page1_img2.png
|
|
201
|
+
├── document2.md
|
|
202
|
+
├── document2_images/
|
|
203
|
+
│ └── ...
|
|
204
|
+
└── metadata.json # Processing statistics and errors
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### Markdown Output
|
|
208
|
+
|
|
209
|
+
Each processed document generates a markdown file containing:
|
|
210
|
+
- Document metadata (source, processing time)
|
|
211
|
+
- Extracted text with preserved formatting
|
|
212
|
+
- Tables rendered in markdown format
|
|
213
|
+
- Mathematical equations
|
|
214
|
+
- Image references (if image extraction is enabled)
|
|
215
|
+
|
|
216
|
+
### Metadata File
|
|
217
|
+
|
|
218
|
+
The `metadata.json` file contains:
|
|
219
|
+
- List of processed files
|
|
220
|
+
- Processing time
|
|
221
|
+
- File sizes
|
|
222
|
+
- Output paths
|
|
223
|
+
- Any errors encountered
|
|
224
|
+
|
|
225
|
+
## Limitations
|
|
226
|
+
|
|
227
|
+
- Maximum file size: 50 MB
|
|
228
|
+
- Maximum pages per document: 1,000
|
|
229
|
+
- Supported formats: PDF, JPG, JPEG, PNG, WEBP, GIF, BMP, TIFF
|
|
230
|
+
|
|
231
|
+
## Pricing
|
|
232
|
+
|
|
233
|
+
Mistral OCR API pricing: $1 per 1,000 pages ($0.001 per page)
|
|
234
|
+
|
|
235
|
+
## Development
|
|
236
|
+
|
|
237
|
+
### Run tests
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
poetry run pytest
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Format code
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
poetry run black mistral_ocr/
|
|
247
|
+
poetry run ruff check mistral_ocr/
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Type checking
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
poetry run mypy mistral_ocr/
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## Troubleshooting
|
|
257
|
+
|
|
258
|
+
### API Key Not Found
|
|
259
|
+
|
|
260
|
+
If you get an error about missing API key:
|
|
261
|
+
1. Ensure `.env` file exists and contains `MISTRAL_API_KEY=your_key`
|
|
262
|
+
2. Or export it: `export MISTRAL_API_KEY="your_key"`
|
|
263
|
+
3. Or pass it directly: `mistral-ocr --api-key "your_key" ...`
|
|
264
|
+
|
|
265
|
+
### File Size Error
|
|
266
|
+
|
|
267
|
+
If a file exceeds 50 MB:
|
|
268
|
+
- Consider splitting large PDFs into smaller parts
|
|
269
|
+
- Compress images before processing
|
|
270
|
+
|
|
271
|
+
### Installation Issues
|
|
272
|
+
|
|
273
|
+
If the command is not found after installation:
|
|
274
|
+
1. Ensure the package is installed: `pip show mistral-ocr-cli`
|
|
275
|
+
2. Check your PATH includes pip's script directory
|
|
276
|
+
3. Try reinstalling with: `pip install -e .`
|
|
277
|
+
|
|
278
|
+
## License
|
|
279
|
+
|
|
280
|
+
MIT License - see LICENSE file for details
|
|
281
|
+
|
|
282
|
+
## Contributing
|
|
283
|
+
|
|
284
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
285
|
+
|
|
286
|
+
## Support
|
|
287
|
+
|
|
288
|
+
For issues or questions about this CLI tool, please open an issue on [GitHub](https://github.com/r-uben/mistral-ocr-cli/issues)
|
|
289
|
+
|
|
290
|
+
For questions about Mistral AI's OCR API, please refer to [Mistral's official documentation](https://docs.mistral.ai) or contact their support.
|
|
291
|
+
|
|
292
|
+
## Legal
|
|
293
|
+
|
|
294
|
+
"Mistral AI" and "Mistral" are trademarks of Mistral AI. This project is not affiliated with or endorsed by Mistral AI. The use of Mistral AI's OCR API is subject to Mistral AI's [Terms of Service](https://mistral.ai/terms/).
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# Mistral OCR CLI
|
|
2
|
+
|
|
3
|
+
A powerful command-line tool for OCR processing using Mistral AI's state-of-the-art OCR API. Process PDFs and images to extract text, tables, equations, and images with unprecedented accuracy.
|
|
4
|
+
|
|
5
|
+
## ⚠️ Disclaimer
|
|
6
|
+
|
|
7
|
+
**This is an unofficial, community-created tool** that uses Mistral AI's OCR API. This project is not affiliated with, officially maintained, or endorsed by Mistral AI.
|
|
8
|
+
|
|
9
|
+
- **Official Mistral OCR**: https://mistral.ai/news/mistral-ocr
|
|
10
|
+
- **Official Documentation**: https://docs.mistral.ai/capabilities/OCR/
|
|
11
|
+
- **Mistral AI Platform**: https://console.mistral.ai/
|
|
12
|
+
|
|
13
|
+
For official tools and support, please visit [Mistral AI's website](https://mistral.ai).
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- 📄 **Multi-format Support**: Process PDFs and images (JPG, PNG, WEBP, GIF, BMP, TIFF)
|
|
18
|
+
- 📊 **Advanced Extraction**: Extract text, tables, equations, and images
|
|
19
|
+
- 📁 **Batch Processing**: Process single files or entire directories
|
|
20
|
+
- 🎯 **Smart Output**: Preserves document structure in markdown format
|
|
21
|
+
- 🖼️ **Image Extraction**: Optionally extract and save embedded images
|
|
22
|
+
- 📈 **Progress Tracking**: Real-time progress bars for batch operations
|
|
23
|
+
- 🔧 **Flexible Configuration**: Environment variables or command-line options
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
### Prerequisites
|
|
28
|
+
|
|
29
|
+
- Python 3.9 or higher
|
|
30
|
+
- Mistral API key from [Mistral Console](https://console.mistral.ai/)
|
|
31
|
+
|
|
32
|
+
### Install with Poetry (Recommended)
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Clone the repository
|
|
36
|
+
git clone https://github.com/yourusername/mistral-ocr.git
|
|
37
|
+
cd mistral-ocr
|
|
38
|
+
|
|
39
|
+
# Install Poetry if you haven't already
|
|
40
|
+
curl -sSL https://install.python-poetry.org | python3 -
|
|
41
|
+
|
|
42
|
+
# Install dependencies and the package
|
|
43
|
+
poetry install
|
|
44
|
+
|
|
45
|
+
# Make the command available globally
|
|
46
|
+
poetry build
|
|
47
|
+
pip install dist/mistral_ocr_cli-*.whl
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Install with pip
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Clone the repository
|
|
54
|
+
git clone https://github.com/yourusername/mistral-ocr.git
|
|
55
|
+
cd mistral-ocr
|
|
56
|
+
|
|
57
|
+
# Install in editable mode for global usage
|
|
58
|
+
pip install -e .
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Configuration
|
|
62
|
+
|
|
63
|
+
### 1. Set up your Mistral API key
|
|
64
|
+
|
|
65
|
+
Create a `.env` file in your project root (or copy from `.env.example`):
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
cp .env.example .env
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Edit `.env` and add your API key:
|
|
72
|
+
|
|
73
|
+
```env
|
|
74
|
+
MISTRAL_API_KEY=your_actual_api_key_here
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### 2. Alternative: Export as environment variable
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
export MISTRAL_API_KEY="your_actual_api_key_here"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Usage
|
|
84
|
+
|
|
85
|
+
### Basic Usage
|
|
86
|
+
|
|
87
|
+
Process a single PDF file (output saved to `mistral_ocr_output/` in the same directory):
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
mistral-ocr document.pdf
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Specify Output Directory
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
mistral-ocr document.pdf --output-path ./results
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Process Entire Directory
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
mistral-ocr ./documents --output-path ./extracted
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Command-Line Options
|
|
106
|
+
|
|
107
|
+
```
|
|
108
|
+
Usage: mistral-ocr INPUT_PATH [OPTIONS]
|
|
109
|
+
|
|
110
|
+
Arguments:
|
|
111
|
+
INPUT_PATH Path to input file or directory (required)
|
|
112
|
+
|
|
113
|
+
Options:
|
|
114
|
+
-o, --output-path PATH Output directory (default: <input_dir>/mistral_ocr_output/)
|
|
115
|
+
--api-key TEXT Mistral API key (or set MISTRAL_API_KEY env var)
|
|
116
|
+
--model TEXT OCR model (default: mistral-ocr-latest)
|
|
117
|
+
--env-file PATH Path to .env file
|
|
118
|
+
--include-images/--no-images Extract images (default: True)
--add-timestamp/--no-timestamp Add timestamp to output folder name (default: False)
|
|
119
|
+
-v, --verbose Enable verbose output
|
|
120
|
+
--version Show version
|
|
121
|
+
--help Show this message
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Examples
|
|
125
|
+
|
|
126
|
+
### Process a single image
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
mistral-ocr photo.jpg
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Process multiple PDFs with custom output
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
mistral-ocr ./reports --output-path ./extracted_text --verbose
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Use a different .env file
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
mistral-ocr document.pdf --env-file .env.production
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Process without extracting images
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
mistral-ocr document.pdf --no-images
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Pass API key directly (not recommended for production)
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
mistral-ocr doc.pdf --api-key "your_api_key_here"
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Output Structure
|
|
157
|
+
|
|
158
|
+
The tool creates the following output structure:
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
mistral_ocr_output/
|
|
162
|
+
├── document1.md # Extracted text in markdown format
|
|
163
|
+
├── document1_images/ # Extracted images (if enabled)
|
|
164
|
+
│ ├── page1_img1.png
|
|
165
|
+
│ └── page1_img2.png
|
|
166
|
+
├── document2.md
|
|
167
|
+
├── document2_images/
|
|
168
|
+
│ └── ...
|
|
169
|
+
└── metadata.json # Processing statistics and errors
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Markdown Output
|
|
173
|
+
|
|
174
|
+
Each processed document generates a markdown file containing:
|
|
175
|
+
- Document metadata (source, processing time)
|
|
176
|
+
- Extracted text with preserved formatting
|
|
177
|
+
- Tables rendered in markdown format
|
|
178
|
+
- Mathematical equations
|
|
179
|
+
- Image references (if image extraction is enabled)
|
|
180
|
+
|
|
181
|
+
### Metadata File
|
|
182
|
+
|
|
183
|
+
The `metadata.json` file contains:
|
|
184
|
+
- List of processed files
|
|
185
|
+
- Processing time
|
|
186
|
+
- File sizes
|
|
187
|
+
- Output paths
|
|
188
|
+
- Any errors encountered
|
|
189
|
+
|
|
190
|
+
## Limitations
|
|
191
|
+
|
|
192
|
+
- Maximum file size: 50 MB
|
|
193
|
+
- Maximum pages per document: 1,000
|
|
194
|
+
- Supported formats: PDF, JPG, JPEG, PNG, WEBP, GIF, BMP, TIFF
|
|
195
|
+
|
|
196
|
+
## Pricing
|
|
197
|
+
|
|
198
|
+
Mistral OCR API pricing: $1 per 1,000 pages ($0.001 per page)
|
|
199
|
+
|
|
200
|
+
## Development
|
|
201
|
+
|
|
202
|
+
### Run tests
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
poetry run pytest
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### Format code
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
poetry run black mistral_ocr/
|
|
212
|
+
poetry run ruff check mistral_ocr/
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Type checking
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
poetry run mypy mistral_ocr/
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Troubleshooting
|
|
222
|
+
|
|
223
|
+
### API Key Not Found
|
|
224
|
+
|
|
225
|
+
If you get an error about missing API key:
|
|
226
|
+
1. Ensure `.env` file exists and contains `MISTRAL_API_KEY=your_key`
|
|
227
|
+
2. Or export it: `export MISTRAL_API_KEY="your_key"`
|
|
228
|
+
3. Or pass it directly: `mistral-ocr --api-key "your_key" ...`
|
|
229
|
+
|
|
230
|
+
### File Size Error
|
|
231
|
+
|
|
232
|
+
If a file exceeds 50 MB:
|
|
233
|
+
- Consider splitting large PDFs into smaller parts
|
|
234
|
+
- Compress images before processing
|
|
235
|
+
|
|
236
|
+
### Installation Issues
|
|
237
|
+
|
|
238
|
+
If the command is not found after installation:
|
|
239
|
+
1. Ensure the package is installed: `pip show mistral-ocr-cli`
|
|
240
|
+
2. Check your PATH includes pip's script directory
|
|
241
|
+
3. Try reinstalling with: `pip install -e .`
|
|
242
|
+
|
|
243
|
+
## License
|
|
244
|
+
|
|
245
|
+
MIT License - see LICENSE file for details
|
|
246
|
+
|
|
247
|
+
## Contributing
|
|
248
|
+
|
|
249
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
250
|
+
|
|
251
|
+
## Support
|
|
252
|
+
|
|
253
|
+
For issues or questions about this CLI tool, please open an issue on [GitHub](https://github.com/r-uben/mistral-ocr-cli/issues)
|
|
254
|
+
|
|
255
|
+
For questions about Mistral AI's OCR API, please refer to [Mistral's official documentation](https://docs.mistral.ai) or contact their support.
|
|
256
|
+
|
|
257
|
+
## Legal
|
|
258
|
+
|
|
259
|
+
"Mistral AI" and "Mistral" are trademarks of Mistral AI. This project is not affiliated with or endorsed by Mistral AI. The use of Mistral AI's OCR API is subject to Mistral AI's [Terms of Service](https://mistral.ai/terms/).
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Command-line interface for Mistral OCR."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
from .config import Config
|
|
12
|
+
from .processor import OCRProcessor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
console = Console()
|
|
16
|
+
|
|
17
|
+
# Get the original working directory if set
|
|
18
|
+
ORIGINAL_CWD = os.environ.get('MISTRAL_OCR_CWD', os.getcwd())
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Click entry point for the ``mistral-ocr`` command.
#
# Resolves the input/output paths against ORIGINAL_CWD, builds a Config from
# the environment (optionally seeded from --env-file), applies the CLI
# overrides, and hands the work to OCRProcessor.  Exit codes: 1 on any error,
# 130 when interrupted with Ctrl-C.
#
# NOTE: the function docstring below is rendered by click as the --help text,
# so it is user-facing output rather than developer documentation.
@click.command()
@click.argument(
    "input_path",
    type=click.Path(path_type=Path),
    required=True
)
@click.option(
    "--output-path", "-o",
    type=click.Path(path_type=Path),
    required=False,
    help="Path to output directory (default: <input_dir>/mistral_ocr_output/)"
)
@click.option(
    "--api-key",
    type=str,
    envvar="MISTRAL_API_KEY",
    help="Mistral API key (can also be set via MISTRAL_API_KEY env var)"
)
@click.option(
    "--model",
    type=str,
    default="mistral-ocr-latest",
    help="Mistral OCR model to use (default: mistral-ocr-latest)"
)
@click.option(
    "--env-file",
    type=click.Path(exists=True, path_type=Path),
    help="Path to .env file containing configuration"
)
@click.option(
    "--include-images/--no-images",
    default=True,
    help="Include extracted images in output (default: True)"
)
@click.option(
    "--add-timestamp/--no-timestamp",
    default=False,
    help="Add timestamp to output folder name (default: False)"
)
@click.option(
    "--verbose", "-v",
    is_flag=True,
    help="Enable verbose output"
)
@click.version_option(version="1.0.0", prog_name="mistral-ocr")
def main(
    input_path: Path,
    output_path: Optional[Path],
    api_key: Optional[str],
    model: str,
    env_file: Optional[Path],
    include_images: bool,
    add_timestamp: bool,
    verbose: bool
) -> None:
    """
    Mistral OCR - Process documents using Mistral AI's OCR API.

    This tool processes PDF and image files using Mistral's powerful OCR capabilities,
    extracting text, tables, equations, and images with high accuracy.

    Examples:

        # Process a single PDF file
        mistral-ocr document.pdf

        # Process all files in a directory
        mistral-ocr ./documents --output-path ./results

        # Use a specific .env file
        mistral-ocr doc.pdf --env-file .env.production
    """
    try:
        # Resolve input path relative to original working directory
        if not input_path.is_absolute():
            input_path = Path(ORIGINAL_CWD) / input_path

        # Check if input path exists
        if not input_path.exists():
            raise ValueError(f"Input path does not exist: {input_path}")

        # Resolve output path if provided
        if output_path and not output_path.is_absolute():
            output_path = Path(ORIGINAL_CWD) / output_path

        # Print header
        console.print("\n[bold blue]🔍 Mistral OCR[/bold blue]")
        console.print("[dim]Powered by Mistral AI's OCR API[/dim]\n")

        # Load configuration
        if verbose:
            console.print("[dim]Loading configuration...[/dim]")

        # Export a key supplied on the command line *before* building the
        # config, whether or not --env-file is given.  Previously this only
        # happened without --env-file, so `--api-key K --env-file F` silently
        # dropped K.  Config.from_env calls load_dotenv without override, which
        # (per python-dotenv's documented default) leaves variables that are
        # already set untouched, so the CLI key wins over the file.
        if api_key:
            os.environ["MISTRAL_API_KEY"] = api_key

        # env_file is None when --env-file was not passed; from_env then falls
        # back to its default .env lookup.
        config = Config.from_env(env_file)

        # Override config with CLI options
        config.model = model
        config.include_images = include_images
        config.verbose = verbose

        # Create processor
        processor = OCRProcessor(config)

        # Process input
        processor.process(input_path, output_path, add_timestamp=add_timestamp)

        # Print summary (per-file errors are only listed in verbose mode)
        if processor.errors and verbose:
            console.print("\n[yellow]⚠ Errors encountered:[/yellow]")
            for error in processor.errors:
                console.print(f"  [red]• {error['file']}: {error['error']}[/red]")

        console.print("\n[bold green]✨ Processing complete![/bold green]\n")

    except ValueError as e:
        # Expected, user-correctable problems (bad path, missing API key, ...)
        console.print(f"\n[red]Error: {e}[/red]\n")
        sys.exit(1)
    except KeyboardInterrupt:
        console.print("\n[yellow]Processing interrupted by user.[/yellow]\n")
        sys.exit(130)
    except Exception as e:
        # Top-level boundary: report, optionally with a traceback, then exit.
        console.print(f"\n[red]Unexpected error: {e}[/red]\n")
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
if __name__ == "__main__":
|
|
159
|
+
main()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Configuration module for Mistral OCR."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from dotenv import load_dotenv
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class Config:
    """Configuration for Mistral OCR.

    Instances are normally built with :meth:`from_env`, which reads the
    settings from environment variables (optionally seeded from a ``.env``
    file).
    """

    # Mistral API key (required; read from MISTRAL_API_KEY by from_env).
    api_key: str
    # OCR model identifier (MISTRAL_MODEL).
    model: str = "mistral-ocr-latest"
    # Upper bound enforced by validate_file_size, in MiB (MAX_FILE_SIZE_MB).
    max_file_size_mb: int = 50
    # Page limit setting (MAX_PAGES); not enforced inside this class.
    max_pages: int = 1000
    # Output format name (OUTPUT_FORMAT).
    output_format: str = "markdown"
    # Whether extracted images are requested/saved (INCLUDE_IMAGES).
    include_images: bool = True
    # Whether verbose diagnostics are printed (VERBOSE).
    verbose: bool = False

    @staticmethod
    def _env_int(name: str, default: int) -> int:
        """Return environment variable *name* parsed as an int.

        An unset or blank variable yields *default*.

        Raises:
            ValueError: if the value is not a valid integer; the message names
                the offending variable so the user knows what to fix.
        """
        raw = os.getenv(name)
        if raw is None or not raw.strip():
            return default
        try:
            return int(raw)
        except ValueError:
            raise ValueError(
                f"Environment variable {name} must be an integer, got {raw!r}"
            ) from None

    @staticmethod
    def _env_flag(name: str, default: bool) -> bool:
        """Return environment variable *name* parsed as a boolean.

        An unset variable yields *default*.  ``true``, ``1``, ``yes`` and
        ``on`` (case-insensitive, surrounding whitespace ignored) are truthy;
        every other value, including the empty string, is falsy.
        """
        raw = os.getenv(name)
        if raw is None:
            return default
        return raw.strip().lower() in {"1", "true", "yes", "on"}

    @classmethod
    def from_env(cls, env_file: Optional[Path] = None) -> "Config":
        """Load configuration from environment variables.

        Args:
            env_file: Optional path to a ``.env`` file.  When it is omitted or
                does not exist, ``load_dotenv()`` is called without a path
                instead.

        Returns:
            A populated :class:`Config`.

        Raises:
            ValueError: if ``MISTRAL_API_KEY`` is missing or empty, or if
                ``MAX_FILE_SIZE_MB`` / ``MAX_PAGES`` is not an integer.
        """
        if env_file and env_file.exists():
            load_dotenv(env_file)
        else:
            load_dotenv()

        api_key = os.getenv("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError(
                "MISTRAL_API_KEY not found in environment variables. "
                "Please set it or create a .env file."
            )

        return cls(
            api_key=api_key,
            model=os.getenv("MISTRAL_MODEL", "mistral-ocr-latest"),
            max_file_size_mb=cls._env_int("MAX_FILE_SIZE_MB", 50),
            max_pages=cls._env_int("MAX_PAGES", 1000),
            output_format=os.getenv("OUTPUT_FORMAT", "markdown"),
            include_images=cls._env_flag("INCLUDE_IMAGES", True),
            verbose=cls._env_flag("VERBOSE", False),
        )

    def validate_file_size(self, file_path: Path) -> None:
        """Validate that file size is within limits.

        Args:
            file_path: File to check; it must exist, since its size is read
                with ``stat()``.

        Raises:
            ValueError: if the file is larger than ``max_file_size_mb`` MiB.
        """
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        if file_size_mb > self.max_file_size_mb:
            raise ValueError(
                f"File size ({file_size_mb:.2f} MB) exceeds maximum allowed size "
                f"({self.max_file_size_mb} MB)"
            )
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
"""Core OCR processing module using Mistral AI."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
from mistralai import Mistral
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
|
|
10
|
+
|
|
11
|
+
from .config import Config
|
|
12
|
+
from .utils import (
|
|
13
|
+
create_data_uri,
|
|
14
|
+
determine_output_path,
|
|
15
|
+
format_file_size,
|
|
16
|
+
get_supported_files,
|
|
17
|
+
sanitize_filename,
|
|
18
|
+
save_base64_image,
|
|
19
|
+
save_metadata,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
console = Console()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class OCRProcessor:
|
|
27
|
+
"""OCR processor using Mistral AI API."""
|
|
28
|
+
|
|
29
|
+
def __init__(self, config: Config):
    """Store the configuration and create the Mistral API client.

    If constructing the client fails, the failure is printed to the console
    and the original exception is re-raised unchanged.
    """
    self.config = config

    try:
        mistral_client = Mistral(api_key=config.api_key)
    except Exception as exc:
        console.print(f"[red]Failed to initialize Mistral client: {exc}[/red]")
        raise
    self.client = mistral_client

    # Per-run bookkeeping, filled in while files are processed.
    self.errors: List[Dict] = []
    self.processed_files: List[Dict] = []
|
|
39
|
+
|
|
40
|
+
def process_file(self, file_path: Path) -> Optional[Dict]:
|
|
41
|
+
"""Process a single file with OCR."""
|
|
42
|
+
try:
|
|
43
|
+
# Validate file size
|
|
44
|
+
file_size_mb = file_path.stat().st_size / (1024 * 1024)
|
|
45
|
+
if self.config.verbose:
|
|
46
|
+
console.print(f"[dim]File size: {file_size_mb:.2f} MB[/dim]")
|
|
47
|
+
self.config.validate_file_size(file_path)
|
|
48
|
+
|
|
49
|
+
# Create data URI for the file
|
|
50
|
+
if self.config.verbose:
|
|
51
|
+
console.print(f"[dim]Creating data URI for {file_path.suffix} file...[/dim]")
|
|
52
|
+
data_uri = create_data_uri(file_path)
|
|
53
|
+
|
|
54
|
+
# Determine document type based on file extension
|
|
55
|
+
if file_path.suffix.lower() == ".pdf":
|
|
56
|
+
document = {
|
|
57
|
+
"type": "document_url",
|
|
58
|
+
"document_url": data_uri
|
|
59
|
+
}
|
|
60
|
+
else:
|
|
61
|
+
document = {
|
|
62
|
+
"type": "image_url",
|
|
63
|
+
"image_url": data_uri
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
# Process with Mistral OCR
|
|
67
|
+
if not hasattr(self.client, 'ocr'):
|
|
68
|
+
raise AttributeError(
|
|
69
|
+
"OCR endpoint not available in Mistral client. "
|
|
70
|
+
"Please ensure you have the latest mistralai package "
|
|
71
|
+
"and OCR access enabled for your API key."
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
if self.config.verbose:
|
|
75
|
+
console.print(f"[dim]Sending to Mistral OCR API...[/dim]")
|
|
76
|
+
console.print(f"[dim]Model: {self.config.model}[/dim]")
|
|
77
|
+
|
|
78
|
+
response = self.client.ocr.process(
|
|
79
|
+
model=self.config.model,
|
|
80
|
+
document=document,
|
|
81
|
+
include_image_base64=self.config.include_images
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
return {
|
|
85
|
+
"file_path": file_path,
|
|
86
|
+
"response": response,
|
|
87
|
+
"success": True
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
except Exception as e:
|
|
91
|
+
error_msg = f"Error processing {file_path.name}: {str(e)}"
|
|
92
|
+
# Always show errors, not just in verbose mode
|
|
93
|
+
console.print(f"[red]{error_msg}[/red]")
|
|
94
|
+
if self.config.verbose:
|
|
95
|
+
import traceback
|
|
96
|
+
console.print(f"[dim]{traceback.format_exc()}[/dim]")
|
|
97
|
+
self.errors.append({
|
|
98
|
+
"file": str(file_path),
|
|
99
|
+
"error": str(e)
|
|
100
|
+
})
|
|
101
|
+
return None
|
|
102
|
+
|
|
103
|
+
def save_results(
|
|
104
|
+
self,
|
|
105
|
+
result: Dict,
|
|
106
|
+
output_dir: Path,
|
|
107
|
+
is_single_file: bool = False
|
|
108
|
+
) -> None:
|
|
109
|
+
"""Save OCR results to files."""
|
|
110
|
+
file_path = result["file_path"]
|
|
111
|
+
response = result["response"]
|
|
112
|
+
|
|
113
|
+
# For single files, use simpler naming
|
|
114
|
+
if is_single_file:
|
|
115
|
+
base_name = "output"
|
|
116
|
+
markdown_path = output_dir / "output.md"
|
|
117
|
+
else:
|
|
118
|
+
# For multiple files, use sanitized filename
|
|
119
|
+
base_name = sanitize_filename(file_path.stem, max_length=40)
|
|
120
|
+
markdown_path = output_dir / f"{base_name}.md"
|
|
121
|
+
|
|
122
|
+
markdown_content = []
|
|
123
|
+
|
|
124
|
+
# Add file header
|
|
125
|
+
markdown_content.append(f"# OCR Results\n\n")
|
|
126
|
+
markdown_content.append(f"**Original File:** {file_path.name}\n")
|
|
127
|
+
markdown_content.append(f"**Full Path:** `{file_path}`\n")
|
|
128
|
+
markdown_content.append(f"**Processed:** {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
|
|
129
|
+
markdown_content.append("---\n\n")
|
|
130
|
+
|
|
131
|
+
# Process each page
|
|
132
|
+
if hasattr(response, 'pages'):
|
|
133
|
+
for page in response.pages:
|
|
134
|
+
markdown_content.append(f"## Page {page.index + 1}\n\n")
|
|
135
|
+
|
|
136
|
+
# Add extracted text
|
|
137
|
+
if hasattr(page, 'markdown'):
|
|
138
|
+
markdown_content.append(page.markdown)
|
|
139
|
+
markdown_content.append("\n\n")
|
|
140
|
+
|
|
141
|
+
# Save images if included
|
|
142
|
+
if self.config.include_images and hasattr(page, 'images') and page.images:
|
|
143
|
+
images_dir = output_dir / "images"
|
|
144
|
+
images_dir.mkdir(parents=True, exist_ok=True)
|
|
145
|
+
|
|
146
|
+
for idx, image in enumerate(page.images):
|
|
147
|
+
if hasattr(image, 'base64'):
|
|
148
|
+
image_filename = f"page{page.index + 1}_img{idx + 1}.png"
|
|
149
|
+
image_path = images_dir / image_filename
|
|
150
|
+
save_base64_image(image.base64, image_path)
|
|
151
|
+
|
|
152
|
+
# Add image reference to markdown
|
|
153
|
+
markdown_content.append(f"\n\n")
|
|
154
|
+
|
|
155
|
+
# Write markdown file
|
|
156
|
+
with open(markdown_path, "w", encoding="utf-8") as f:
|
|
157
|
+
f.write("".join(markdown_content))
|
|
158
|
+
|
|
159
|
+
if self.config.verbose:
|
|
160
|
+
console.print(f"[green]✓[/green] Saved results to {markdown_path}")
|
|
161
|
+
|
|
162
|
+
def process_directory(
|
|
163
|
+
self,
|
|
164
|
+
input_dir: Path,
|
|
165
|
+
output_dir: Optional[Path] = None,
|
|
166
|
+
add_timestamp: bool = False
|
|
167
|
+
) -> Tuple[int, int]:
|
|
168
|
+
"""Process all supported files in a directory."""
|
|
169
|
+
files = get_supported_files(input_dir)
|
|
170
|
+
|
|
171
|
+
if not files:
|
|
172
|
+
console.print("[yellow]No supported files found in the directory.[/yellow]")
|
|
173
|
+
return 0, 0
|
|
174
|
+
|
|
175
|
+
output_path = determine_output_path(input_dir, output_dir, add_timestamp=add_timestamp)
|
|
176
|
+
console.print(f"[blue]Processing {len(files)} file(s)...[/blue]")
|
|
177
|
+
console.print(f"[blue]Output directory: {output_path}[/blue]\n")
|
|
178
|
+
|
|
179
|
+
start_time = time.time()
|
|
180
|
+
success_count = 0
|
|
181
|
+
|
|
182
|
+
with Progress(
|
|
183
|
+
SpinnerColumn(),
|
|
184
|
+
TextColumn("[progress.description]{task.description}"),
|
|
185
|
+
BarColumn(),
|
|
186
|
+
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
|
|
187
|
+
TimeRemainingColumn(),
|
|
188
|
+
console=console
|
|
189
|
+
) as progress:
|
|
190
|
+
task = progress.add_task("Processing files...", total=len(files))
|
|
191
|
+
|
|
192
|
+
for file_path in files:
|
|
193
|
+
file_size = format_file_size(file_path.stat().st_size)
|
|
194
|
+
progress.update(
|
|
195
|
+
task,
|
|
196
|
+
description=f"Processing {file_path.name} ({file_size})..."
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
result = self.process_file(file_path)
|
|
200
|
+
if result:
|
|
201
|
+
self.save_results(result, output_path, is_single_file=False)
|
|
202
|
+
success_count += 1
|
|
203
|
+
base_name = sanitize_filename(file_path.stem, max_length=40)
|
|
204
|
+
self.processed_files.append({
|
|
205
|
+
"file": str(file_path),
|
|
206
|
+
"size": file_path.stat().st_size,
|
|
207
|
+
"output": str(output_path / f"{base_name}.md")
|
|
208
|
+
})
|
|
209
|
+
|
|
210
|
+
progress.update(task, advance=1)
|
|
211
|
+
|
|
212
|
+
# Save metadata
|
|
213
|
+
processing_time = time.time() - start_time
|
|
214
|
+
save_metadata(output_path, self.processed_files, processing_time, self.errors)
|
|
215
|
+
|
|
216
|
+
return success_count, len(files)
|
|
217
|
+
|
|
218
|
+
def process(
|
|
219
|
+
self,
|
|
220
|
+
input_path: Path,
|
|
221
|
+
output_path: Optional[Path] = None,
|
|
222
|
+
add_timestamp: bool = False
|
|
223
|
+
) -> None:
|
|
224
|
+
"""Process input path (file or directory)."""
|
|
225
|
+
if input_path.is_file():
|
|
226
|
+
# Process single file
|
|
227
|
+
output_dir = determine_output_path(input_path, output_path, add_timestamp=add_timestamp)
|
|
228
|
+
console.print(f"[blue]Processing file: {input_path}[/blue]")
|
|
229
|
+
console.print(f"[blue]Output directory: {output_dir}[/blue]\n")
|
|
230
|
+
|
|
231
|
+
start_time = time.time()
|
|
232
|
+
result = self.process_file(input_path)
|
|
233
|
+
|
|
234
|
+
if result:
|
|
235
|
+
self.save_results(result, output_dir, is_single_file=True)
|
|
236
|
+
self.processed_files.append({
|
|
237
|
+
"file": str(input_path),
|
|
238
|
+
"size": input_path.stat().st_size,
|
|
239
|
+
"output": str(output_dir / "output.md")
|
|
240
|
+
})
|
|
241
|
+
|
|
242
|
+
# Save metadata
|
|
243
|
+
processing_time = time.time() - start_time
|
|
244
|
+
save_metadata(output_dir, self.processed_files, processing_time, self.errors)
|
|
245
|
+
|
|
246
|
+
console.print(f"\n[green]✓ Successfully processed 1 file[/green]")
|
|
247
|
+
console.print(f"[dim]Processing time: {processing_time:.2f} seconds[/dim]")
|
|
248
|
+
else:
|
|
249
|
+
console.print(f"\n[red]✗ Failed to process file[/red]")
|
|
250
|
+
|
|
251
|
+
elif input_path.is_dir():
|
|
252
|
+
# Process directory
|
|
253
|
+
success_count, total_count = self.process_directory(input_path, output_path, add_timestamp)
|
|
254
|
+
|
|
255
|
+
console.print(f"\n[green]✓ Successfully processed {success_count}/{total_count} files[/green]")
|
|
256
|
+
if self.errors:
|
|
257
|
+
console.print(f"[red]✗ {len(self.errors)} file(s) failed[/red]")
|
|
258
|
+
|
|
259
|
+
else:
|
|
260
|
+
raise ValueError(f"Input path does not exist: {input_path}")
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Utility functions for Mistral OCR."""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import json
|
|
5
|
+
import mimetypes
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def encode_file_to_base64(file_path: Path) -> str:
    """Return the base64 text for the raw bytes of *file_path*.

    The file is opened in binary mode and read in full; the encoded bytes are
    decoded to ``str`` as UTF-8.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_mime_type(file_path: Path) -> str:
    """Return the MIME type for *file_path*.

    ``mimetypes.guess_type`` is consulted first. When it has no answer, a
    small suffix table (PDF, JPEG, PNG, WebP) is used instead.

    Raises:
        ValueError: If neither source knows the file's suffix.
    """
    guessed = mimetypes.guess_type(str(file_path))[0]
    if guessed:
        return guessed

    fallback_by_suffix = {
        ".pdf": "application/pdf",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".webp": "image/webp",
    }
    fallback = fallback_by_suffix.get(file_path.suffix.lower())
    if fallback is None:
        raise ValueError(f"Unsupported file type: {file_path.suffix}")
    return fallback
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def create_data_uri(file_path: Path) -> str:
    """Build a ``data:<mime>;base64,<payload>`` URI for *file_path*.

    The MIME type is resolved before the file is read, so an unsupported
    suffix raises from ``get_mime_type`` first.
    """
    media_type = get_mime_type(file_path)
    payload = encode_file_to_base64(file_path)
    return "data:" + media_type + ";base64," + payload
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def save_base64_image(base64_string: str, output_path: Path) -> None:
    """Decode a base64 image and write it to *output_path*.

    Args:
        base64_string: Either bare base64 text or a data URI of the form
            ``data:<mime>;base64,<data>``. A data-URI header is removed
            before decoding; its letters are valid base64 characters and
            would otherwise corrupt the decoded bytes.
        output_path: Destination file; missing parent directories are created.

    Raises:
        binascii.Error: If the payload is not valid base64.
    """
    payload = base64_string.strip()
    if payload.startswith("data:"):
        _header, separator, remainder = payload.partition(",")
        if separator:
            payload = remainder

    image_data = base64.b64decode(payload)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "wb") as f:
        f.write(image_data)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_supported_files(directory: Path) -> List[Path]:
    """Collect every supported file below *directory*, recursively.

    A file is supported when its lower-cased suffix is one of the PDF/image
    suffixes listed below. The result is returned in sorted order.
    """
    allowed_suffixes = {".pdf", ".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff"}
    matches = [
        candidate
        for candidate in directory.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in allowed_suffixes
    ]
    matches.sort()
    return matches
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def determine_output_path(
    input_path: Path,
    output_path: Optional[Path] = None,
    default_folder_name: str = "mistral_ocr_output",
    add_timestamp: bool = False
) -> Path:
    """Determine (and create) the output directory for OCR results.

    Args:
        input_path: The file or directory being processed.
        output_path: Explicit output directory. When given it is returned
            as-is and the remaining arguments are ignored.
        default_folder_name: Folder name used when *output_path* is not given.
        add_timestamp: Append ``_YYYYmmdd_HHMMSS`` to the default folder name.

    Returns:
        The output directory. It exists when this function returns: the
        explicit directory is created just like the default one, so callers
        can write into it immediately.
    """
    if output_path:
        Path(output_path).mkdir(parents=True, exist_ok=True)
        return output_path

    # Default location: next to an input file, or inside an input directory.
    parent_dir = input_path.parent if input_path.is_file() else input_path

    if add_timestamp:
        import time
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        folder_name = f"{default_folder_name}_{timestamp}"
    else:
        folder_name = default_folder_name

    output_dir = parent_dir / folder_name
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def save_metadata(
    output_dir: Path,
    files_processed: List[Dict],
    processing_time: float,
    errors: List[Dict]
) -> None:
    """Write a run summary to ``metadata.json`` inside *output_dir*.

    The JSON object holds the processed-file and error lists, their counts,
    and the processing time in seconds. Values that ``json`` cannot serialize
    natively are converted with ``str``.
    """
    summary = {}
    summary["files_processed"] = files_processed
    summary["total_files"] = len(files_processed)
    summary["processing_time_seconds"] = processing_time
    summary["errors"] = errors
    summary["error_count"] = len(errors)

    serialized = json.dumps(summary, indent=2, default=str)
    target = output_dir / "metadata.json"
    with open(target, "w") as handle:
        handle.write(serialized)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def format_file_size(size_bytes: int) -> str:
    """Render a byte count with two decimals and a B/KB/MB/GB/TB unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = size_bytes
    position = 0
    last = len(units) - 1
    while position < last and not value < 1024.0:
        value = value / 1024.0
        position += 1
    return f"{value:.2f} {units[position]}"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def sanitize_filename(filename: str, max_length: int = 50) -> str:
    """Sanitize a filename and cap its length at *max_length*.

    Each of the characters ``<>:"/\\|?*`` is replaced by ``_``. A name longer
    than *max_length* is shortened with a ``...`` marker:

    * if the name contains a dot and the text after the last dot fits, that
      extension is kept and the part before it is shortened;
    * otherwise the whole name is cut and ``...`` appended.

    The result is never longer than *max_length* (for ``max_length >= 0``).
    """
    invalid_chars = '<>:"/\\|?*'
    # One pass instead of one str.replace call per character.
    filename = filename.translate(str.maketrans({ch: "_" for ch in invalid_chars}))

    if len(filename) <= max_length:
        return filename

    ellipsis = "..."
    name, dot, ext = filename.rpartition(".")
    if dot:
        # Room left for the stem after reserving "...", the dot and the extension.
        keep = max_length - len(ext) - 1 - len(ellipsis)
        if keep >= 0:
            return f"{name[:keep]}{ellipsis}.{ext}"
        # The "extension" alone is too long to keep; fall through to a plain
        # cut so a negative slice bound cannot produce an over-long result.

    if max_length <= len(ellipsis):
        # No room for the marker itself; a hard cut is all that fits.
        return filename[:max(max_length, 0)]
    return filename[:max_length - len(ellipsis)] + ellipsis
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "mistral-ocr-cli"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "A clean command-line tool for OCR processing using Mistral AI's API"
|
|
5
|
+
authors = ["Ruben Fernandez-Fuertes <fernandezfuertesruben@gmail.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
packages = [{include = "mistral_ocr"}]
|
|
8
|
+
license = "MIT"
|
|
9
|
+
homepage = "https://github.com/rubenfernandez/mistral-ocr-cli"
|
|
10
|
+
repository = "https://github.com/rubenfernandez/mistral-ocr-cli"
|
|
11
|
+
keywords = ["ocr", "mistral", "pdf", "cli", "command-line", "document-processing", "text-extraction", "image-processing"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 5 - Production/Stable",
|
|
14
|
+
"Environment :: Console",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
"Topic :: Text Processing",
|
|
26
|
+
"Topic :: Utilities",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[tool.poetry.dependencies]
|
|
31
|
+
python = "^3.9"
|
|
32
|
+
mistralai = "^1.0.0"
|
|
33
|
+
python-dotenv = "^1.0.0"
|
|
34
|
+
click = "^8.1.7"
|
|
35
|
+
rich = "^13.7.0"
|
|
36
|
+
Pillow = "^10.2.0"
|
|
37
|
+
pypdf = "^4.0.0"
|
|
38
|
+
|
|
39
|
+
[tool.poetry.group.dev.dependencies]
|
|
40
|
+
pytest = "^8.0.0"
|
|
41
|
+
black = "^24.0.0"
|
|
42
|
+
ruff = "^0.3.0"
|
|
43
|
+
mypy = "^1.8.0"
|
|
44
|
+
pytest-cov = "^4.1.0"
|
|
45
|
+
|
|
46
|
+
[tool.poetry.scripts]
|
|
47
|
+
mistral-ocr = "mistral_ocr.cli:main"
|
|
48
|
+
|
|
49
|
+
[build-system]
|
|
50
|
+
requires = ["poetry-core"]
|
|
51
|
+
build-backend = "poetry.core.masonry.api"
|
|
52
|
+
|
|
53
|
+
[tool.black]
|
|
54
|
+
line-length = 100
|
|
55
|
+
target-version = ['py39']
|
|
56
|
+
|
|
57
|
+
[tool.ruff]
|
|
58
|
+
line-length = 100
|
|
59
|
+
select = ["E", "F", "I", "N", "UP", "B", "C4", "SIM"]
|
|
60
|
+
ignore = ["E501"]
|
|
61
|
+
|
|
62
|
+
[tool.mypy]
|
|
63
|
+
python_version = "3.9"
|
|
64
|
+
warn_return_any = true
|
|
65
|
+
warn_unused_configs = true
|
|
66
|
+
disallow_untyped_defs = true
|