mistral-ai-ocr 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,255 @@
1
+ #!/usr/bin/env python
2
+ import base64
3
+ from typing import List
4
+ from mistralai import Mistral, OCRImageObject
5
+ from mistralai.models import OCRResponse
6
+ from pathlib import Path
7
+ import mimetypes
8
+ from enum import Enum
9
+ import sys
10
+
11
class Modes(Enum):
    """Output layouts supported by the OCR writer.

    Each member selects a preset of options in ``construct_from_mode``:
    whether images are written, whether per-page output is generated and
    which directory/file names are used.
    """

    FULL = 0
    FULL_ALT = 1
    FULL_NO_DIR = 2
    FULL_NO_PAGES = 3
    TEXT = 4
    TEXT_NO_PAGES = 5


def get_mode_from_string(mode_str: str):
    """Return the ``Modes`` member designated by *mode_str*.

    Args:
        mode_str: Either a member name (case-insensitive, e.g. ``"full"``)
            or a member's numeric value, given as text (``"3"``) or as an
            int (``3``). A ``Modes`` member is returned unchanged.

    Returns:
        The matching ``Modes`` member.

    Raises:
        ValueError: If no member matches.
    """
    if isinstance(mode_str, Modes):
        return mode_str
    wanted = str(mode_str).strip()
    wanted_name = wanted.upper()
    for mode in Modes:
        # Member values are ints while the request is text, so the value has
        # to be compared in its string form; comparing the int directly with
        # the string can never succeed.
        if mode.name == wanted_name or str(mode.value) == wanted:
            return mode
    raise ValueError(f"Unknown mode: {mode_str}")
24
+
25
def b64encode_document(document_path: Path):
    """Return the content of *document_path* encoded as base64 text.

    Args:
        document_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file content as a ``str``, or ``None``
        when the file cannot be read.
    """
    try:
        with open(document_path, "rb") as doc_file:
            return base64.b64encode(doc_file.read()).decode("utf-8")
    except Exception as e:
        # The "None on failure" contract is kept for existing callers, but the
        # reason is reported on stderr instead of being dropped silently.
        print(f"Error reading document {document_path}: {e}", file=sys.stderr)
        return None
33
+
34
def b64decode_document(base64_data: str, output_path: Path):
    """Decode base64 text and store the resulting bytes at *output_path*.

    Args:
        base64_data: Base64 text. When it contains a comma, everything up to
            and including the first comma is discarded and only the remainder
            is decoded.
        output_path: File to write. Missing parent directories are created.

    Returns:
        ``None``. When decoding fails an error is printed to stderr and
        nothing is created on disk.
    """
    # Split once on the first comma; an empty separator means there was none.
    _, comma, remainder = base64_data.partition(',')
    payload = remainder if comma else base64_data

    try:
        raw_bytes = base64.b64decode(payload)
    except (base64.binascii.Error, ValueError) as exc:
        print(f"Error decoding base64 data: {exc}", file=sys.stderr)
        return

    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'wb') as out_file:
        out_file.write(raw_bytes)
49
+
50
class Page:
    """One page of an OCR result: its index, Markdown text and images."""

    def __init__(self, index, markdown=None, images: "Optional[List[OCRImageObject]]" = None):
        """Store the page data.

        Args:
            index: Index of the page; used by callers for naming and ordering.
            markdown: Markdown text of the page, or ``None``.
            images: Image objects of the page; ``None`` becomes an empty list.
        """
        self.index = index
        self.markdown = markdown
        self.images = images if images is not None else []

    def write_markdown(self, output_path: Path, append: bool = False, insert=None):
        """Write the page's Markdown to *output_path*.

        Nothing is written (and no directory is created) when the page has no
        Markdown text.

        Args:
            output_path: Destination file; parent directories are created.
            append: Append to the file instead of overwriting it.
            insert: Optional text written immediately before the Markdown.
        """
        if self.markdown:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            mode = 'a' if append else 'w'
            # Always write UTF-8: the platform default encoding may be unable
            # to represent characters present in the OCR text.
            with open(output_path, mode, encoding="utf-8") as md_file:
                if insert:
                    md_file.write(insert)
                md_file.write(self.markdown)

    def write_images(self, output_directory: Path):
        """Decode every image of the page into *output_directory*.

        Images without base64 data are skipped. Each file is named after the
        image's ``id``.
        """
        if not self.images:
            return

        for image in self.images:
            if image and image.image_base64:
                # NOTE(review): image.id is used verbatim as a relative file
                # name; confirm it can never contain path separators.
                image_name = image.id
                image_path = output_directory / image_name
                b64decode_document(image.image_base64, image_path)
74
+
75
class MistralOCRDocument:
    """Run Mistral OCR on a document, or reuse a saved response, and write the results.

    The OCR response is obtained either from the API (``process_document``)
    or from a previously saved JSON file (``process_json_response``). It is
    then written below ``output_directory`` as Markdown files, optional images
    and an optional JSON copy of the response.
    """

    def __init__(self,
                 document_path: Path,
                 api_key: str,
                 include_images=True,
                 output_directory: Path = None,
                 generate_pages=True,
                 full_directory_name="full",
                 page_separator="\n",
                 page_directory_name="page_<index>",
                 page_text_name="<stem>.md",
                 json_ocr_response_path=None,
                 save_json=True,
                 ):
        """Store the configuration and resolve the output directory.

        Args:
            document_path: PDF or image file to send to the OCR API.
            api_key: Mistral API key used when the API is called.
            include_images: Request image data from the API and write images.
            output_directory: Where results are written. Defaults to a
                directory next to the input file, named after its stem.
            generate_pages: Also write one Markdown file per page.
            full_directory_name: Sub-directory for the combined output.
            page_separator: Text inserted between pages in the combined file.
            page_directory_name: Per-page directory template; ``<index>`` is
                replaced by the page index.
            page_text_name: Per-page file name template; ``<stem>`` and
                ``<index>`` are replaced.
            json_ocr_response_path: Saved JSON response to use instead of
                calling the API.
            save_json: Write the OCR response as JSON in the output directory.

        Raises:
            ValueError: If no output directory is given and neither a
                document nor a JSON response path is available to derive one.
        """
        self.document_path = document_path
        self.api_key = api_key
        self.include_images = include_images
        self.generate_pages = generate_pages
        self.save_json = save_json
        self.full_directory_name = full_directory_name
        self.page_separator = page_separator
        self.page_directory_name = page_directory_name
        self.page_text_name = page_text_name
        self.json_ocr_response_path = json_ocr_response_path
        if output_directory is None:
            input_path = self.get_input_path()
            if input_path is None:
                # Without any input there is nothing to derive a directory from.
                raise ValueError("Either a document path or a JSON OCR response path is required.")
            self.output_directory = input_path.parent / input_path.stem
        else:
            self.output_directory = output_directory

    def get_ocr_response(self, mimetype, base64_document):
        """Call the OCR API and store the result in ``self.ocr_response``.

        Args:
            mimetype: MIME type of the document; must be an image type or PDF.
            base64_document: Base64 text of the document content.

        Raises:
            ValueError: If *mimetype* is neither an image type nor PDF.
        """
        client = Mistral(api_key=self.api_key)
        if mimetype.startswith("image/"):
            document_type = "image_url"
        elif mimetype.startswith("application/pdf"):
            document_type = "document_url"
        else:
            raise ValueError(f"Unsupported MIME type: {mimetype}. Only image and PDF files are supported.")
        self.ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": document_type,
                # The document is passed inline as a data URI.
                document_type: f"data:{mimetype};base64,{base64_document}"
            },
            include_image_base64=self.include_images
        )

    def process_document(self):
        """OCR ``document_path`` through the API, then write all outputs.

        Raises:
            FileNotFoundError: If the document does not exist.
            ValueError: If no document was given, the path is not a file, its
                MIME type cannot be guessed or it cannot be read.
        """
        if self.document_path is None:
            raise ValueError("No input document was provided.")
        if not self.document_path.exists():
            raise FileNotFoundError(f"The document {self.document_path} does not exist.")
        if not self.document_path.is_file():
            raise ValueError(f"The path {self.document_path} is not a valid file.")

        mimetype, _ = mimetypes.guess_type(self.document_path)
        if mimetype is None:
            raise ValueError(f"Could not determine the MIME type for {self.document_path}.")

        encoded = b64encode_document(self.document_path)
        if encoded is None:
            # b64encode_document signals a read failure with None; sending it
            # on would embed the literal text "None" in the request.
            raise ValueError(f"Could not read the document {self.document_path}.")

        self.get_ocr_response(mimetype, encoded)
        self.write_json()
        self.process_ocr_response()

    def process_json_response(self):
        """Load the saved JSON OCR response, then write all outputs.

        Raises:
            FileNotFoundError: If no JSON path is set or the file is missing.
        """
        if self.json_ocr_response_path is None or not self.json_ocr_response_path.exists():
            raise FileNotFoundError(f"The JSON OCR response {self.json_ocr_response_path} does not exist.")

        with open(self.json_ocr_response_path, "r", encoding="utf-8") as json_file:
            self.ocr_response = OCRResponse.model_validate_json(json_file.read())
        self.write_json()
        self.process_ocr_response()

    def process(self):
        """Process the saved JSON response when one is set, else the document."""
        if self.json_ocr_response_path is not None:
            self.process_json_response()
        else:
            self.process_document()

    def get_input_path(self):
        """Return the JSON response path when set, otherwise the document path."""
        if self.json_ocr_response_path is not None:
            return self.json_ocr_response_path
        return self.document_path

    def write_json(self):
        """Write the OCR response as ``<stem>.json`` in the output directory.

        Does nothing when ``save_json`` is false.
        """
        if self.save_json:
            # Append the extension instead of using with_suffix, which would
            # cut a dotted stem such as "paper.v2" down to "paper".
            output_path = self.output_directory / f"{self.get_input_path().stem}.json"
            self.output_directory.mkdir(parents=True, exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as text_file:
                text_file.write(self.ocr_response.model_dump_json(indent=2))

    def process_ocr_response(self):
        """Write the per-page output (if enabled) and the combined output.

        Per-page Markdown and images go to one directory per page. Images are
        also written to the "full" directory, which receives a single Markdown
        file with all pages ordered by index and joined by ``page_separator``.
        """
        response_pages = self.ocr_response.pages
        if not response_pages:
            print("No pages found in the OCR response.")
            return

        pages = []
        stem = self.get_input_path().stem
        full_dir = self.output_directory / self.full_directory_name

        for r_page in response_pages:
            page = Page(
                index=r_page.index,
                markdown=r_page.markdown,
                images=r_page.images
            )
            if self.generate_pages:
                page_dir = self.output_directory / self.page_directory_name.replace("<index>", str(page.index))
                page_file_name = (
                    self.page_text_name
                    .replace("<stem>", stem)
                    .replace("<index>", str(page.index))
                )
                page.write_markdown((page_dir / page_file_name).with_suffix(".md"))
                if self.include_images:
                    page.write_images(page_dir)
            if self.include_images:
                page.write_images(full_dir)
            pages.append(page)

        md_file = full_dir / f"{stem}.md"
        started = False
        for page in sorted(pages, key=lambda p: p.index):
            if not page.markdown:
                # Pages without text write nothing, so they must not count as
                # the page that starts (and truncates) the combined file.
                continue
            insert = self.page_separator if started else None
            page.write_markdown(md_file, append=started, insert=insert)
            started = True
195
+
196
def construct_from_mode(
    document_path: Path,
    api_key: str,
    output_directory: Path = None,
    json_ocr_response_path: Path = None,
    page_separator: str = "\n",
    write_json: bool = True,
    mode: Modes = Modes.FULL
):
    """Build a ``MistralOCRDocument`` configured for the given output *mode*.

    Args:
        document_path: PDF or image file to process.
        api_key: Mistral API key.
        output_directory: Output directory, or ``None`` for the default.
        json_ocr_response_path: Saved JSON OCR response to reuse, if any.
        page_separator: Text placed between pages in the combined Markdown.
        write_json: Forwarded as ``save_json``.
        mode: Layout preset to apply.

    Returns:
        The configured ``MistralOCRDocument`` (not yet processed).

    Raises:
        ValueError: If *mode* is not one of the known presets.
    """
    # Everything in the output directory itself, pages told apart by index.
    flat_names = dict(
        full_directory_name=".",
        page_directory_name=".",
        page_text_name="<stem><index>.md",
    )
    presets = (
        (Modes.FULL, dict(include_images=True, generate_pages=True)),
        (Modes.FULL_ALT, dict(include_images=True, generate_pages=True, full_directory_name=".")),
        (Modes.FULL_NO_DIR, dict(include_images=True, generate_pages=True, **flat_names)),
        (Modes.FULL_NO_PAGES, dict(include_images=True, generate_pages=False, full_directory_name=".")),
        (Modes.TEXT, dict(include_images=False, generate_pages=True, **flat_names)),
        (Modes.TEXT_NO_PAGES, dict(include_images=False, generate_pages=False, full_directory_name=".")),
    )

    for candidate, overrides in presets:
        if mode == candidate:
            break
    else:
        raise ValueError(f"Unknown mode: {mode}")

    return MistralOCRDocument(
        document_path=document_path,
        api_key=api_key,
        output_directory=output_directory,
        json_ocr_response_path=json_ocr_response_path,
        page_separator=page_separator,
        save_json=write_json,
        **overrides,
    )
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env python
2
+ from pathlib import Path
3
+ from . import Modes, construct_from_mode, get_mode_from_string
4
+ import argparse
5
+ import codecs
6
+ from os import getenv
7
+ from dotenv import load_dotenv
8
+
9
# Best-effort load of the optional per-user env file. Only ordinary errors are
# ignored: a bare ``except`` would also swallow KeyboardInterrupt and
# SystemExit raised while the module is being imported.
try:
    load_dotenv(dotenv_path=Path.home() / ".mistral_ai_ocr.env")
except Exception:
    pass

# Values accepted by -m/--mode: every mode name, then every numeric value as text.
_mode_choices = [mode.name for mode in Modes] + [str(mode.value) for mode in Modes]
15
+
16
+ def _unescape(s: str) -> str:
17
+ return codecs.decode(s, 'unicode_escape')
18
+
19
def main():
    """Command-line entry point: parse the arguments and run the OCR job.

    Usage errors and ``FileNotFoundError``/``ValueError`` raised while
    processing are reported through ``parser.error``, which prints the message
    and exits.
    """
    example_text = (
        'examples:\n\n'
        '%(prog)s paper.pdf\n'
        '%(prog)s paper.pdf --api-key jrWjJE5lFketfB2sA6vvhQK2SoHQ6R39\n'
        '%(prog)s paper.pdf -o revision\n'
        '%(prog)s paper.pdf -e\n'
        '%(prog)s paper.pdf -m FULL\n'
        '%(prog)s -j paper.json\n'
        '%(prog)s -j paper.json -m TEXT_NO_PAGES -n\n'
    )
    parser = argparse.ArgumentParser(
        description="A simple script that uses the Mistral AI OCR API to get the Markdown text from a PDF or image file.",
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("input", type=Path, nargs="?", help="input PDF or image file", default=None)
    parser.add_argument("-k", "--api-key", help="Mistral API key, can be set via the MISTRAL_API_KEY environment variable", default=None)
    # The default directory is derived from the input path's parent, not from
    # the current working directory, so the help text says so.
    parser.add_argument("-o", "--output", type=Path, help="output directory path. If not set, a directory will be created next to the input file using the same stem (filename without extension) as the input file", default=None)
    parser.add_argument("-j", "--json-ocr-response", type=Path, help="path from which to load a pre-existing JSON OCR response (any input file will be ignored)", default=None)
    parser.add_argument("-m", "--mode", type=str, choices=_mode_choices, default="FULL_NO_PAGES",
                        help="mode of operation: either the name or numerical value of the mode. Defaults to FULL_NO_PAGES")
    parser.add_argument("-s", "--page-separator", type=str, default="\n",
                        help="page separator to use when writing the Markdown file. Defaults to '\\n'")
    parser.add_argument("-n", "--no-json", action="store_false", dest="write_json",
                        help="do not write the JSON OCR response to a file. By default, the response is written")
    parser.add_argument("-e", "--load-dot-env", action="store_true",
                        help="load the .env file from the current directory using python-dotenv, to retrieve the Mistral API key")
    args = parser.parse_args()

    # Without either source there is nothing to process; fail with a usage
    # error instead of crashing later on a missing path.
    if args.input is None and args.json_ocr_response is None:
        parser.error("an input file or a JSON OCR response (-j/--json-ocr-response) is required")

    if args.load_dot_env:
        load_dotenv()

    if args.api_key is None:
        args.api_key = getenv("MISTRAL_API_KEY")
    # A saved JSON response is processed without calling the API, so the key
    # is only mandatory when a document has to be sent for OCR.
    if args.api_key is None and args.json_ocr_response is None:
        parser.error("API key is required. Set it with --api-key, via the MISTRAL_API_KEY environment variable, or load it from a .env file with -e/--load-dot-env")

    try:
        construct_from_mode(
            document_path=args.input,
            api_key=args.api_key,
            output_directory=args.output,
            json_ocr_response_path=args.json_ocr_response,
            page_separator=_unescape(args.page_separator),
            write_json=args.write_json,
            mode=get_mode_from_string(args.mode)
        ).process()
    except (FileNotFoundError, ValueError) as e:
        parser.error(str(e))


if __name__ == "__main__":
    main()
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.1
2
+ Name: mistral-ai-ocr
3
+ Version: 1.0
4
+ Description-Content-Type: text/markdown
5
+ Requires-Dist: mistralai
6
+ Requires-Dist: python-dotenv
7
+
8
+ # Mistral AI OCR
9
+ This is a simple script that uses the Mistral AI OCR API to extract text from a PDF or image file
10
+
11
+ ## Modes
12
+
13
+ | Value | Name |
14
+ |-|-|
15
+ | 0 | FULL |
16
+ | 1 | FULL_ALT |
17
+ | 2 | FULL_NO_DIR |
18
+ | 3 | FULL_NO_PAGES |
19
+ | 4 | TEXT |
20
+ | 5 | TEXT_NO_PAGES |
21
+
22
+ Given the input file `paper.pdf`, the directory structure for each mode is shown below:
23
+
24
+ ### 0 - `FULL`
25
+
26
+ Structure
27
+ ```
28
+ paper
29
+ ├── full
30
+ │ ├── image1.png
31
+ │ ├── image2.png
32
+ │ ├── image3.png
33
+ │ └── paper.md
34
+ ├── page_0
35
+ │ ├── image1.png
36
+ │ └── paper.md
37
+ ├── page_1
38
+ │ ├── image2.png
39
+ │ └── paper.md
40
+ └── page_2
41
+ ├── image3.png
42
+ └── paper.md
43
+ ```
44
+
45
+ ### 1 - `FULL_ALT`
46
+
47
+ Structure
48
+ ```
49
+ paper
50
+ ├── image1.png
51
+ ├── image2.png
52
+ ├── image3.png
53
+ ├── paper.md
54
+ ├── page_0
55
+ │ ├── image1.png
56
+ │ └── paper.md
57
+ ├── page_1
58
+ │ ├── image2.png
59
+ │ └── paper.md
60
+ └── page_2
61
+ ├── image3.png
62
+ └── paper.md
63
+ ```
64
+
65
+ ### 2 - `FULL_NO_DIR`
66
+
67
+ Structure
68
+ ```
69
+ paper
70
+ ├── image1.png
71
+ ├── image2.png
72
+ ├── image3.png
73
+ ├── paper.md
74
+ ├── paper0.md
75
+ ├── paper1.md
76
+ └── paper2.md
77
+ ```
78
+
79
+ ### 3 - `FULL_NO_PAGES` *default*
80
+
81
+ Structure
82
+ ```
83
+ paper
84
+ ├── image1.png
85
+ ├── image2.png
86
+ ├── image3.png
87
+ └── paper.md
88
+ ```
89
+
90
+ ### 4 - `TEXT`
91
+
92
+ Structure
93
+ ```
94
+ paper
95
+ ├── paper.md
96
+ ├── paper0.md
97
+ ├── paper1.md
98
+ └── paper2.md
99
+ ```
100
+
101
+ ### 5 - `TEXT_NO_PAGES`
102
+
103
+ Structure
104
+ ```
105
+ paper
106
+ └── paper.md
107
+ ```
108
+
109
+ By default, the JSON response from the Mistral AI OCR API is saved in the output directory. To disable JSON output, use the `-n` or `--no-json` argument. To experiment with a different **mode** without using additional API calls, reuse an existing JSON response instead of the original input file
110
+
111
+ # Usage
112
+
113
+ ## Install the Requirements
114
+
115
+ To install the necessary requirements, run the following command:
116
+
117
+ ```sh
118
+ pip install mistral-ai-ocr
119
+ ```
120
+
121
+ ## Typical Usage
122
+
123
+ ```sh
124
+ mistral-ai-ocr paper.pdf
125
+ mistral-ai-ocr paper.pdf --api-key jrWjJE5lFketfB2sA6vvhQK2SoHQ6R39
126
+ mistral-ai-ocr paper.pdf -o revision
127
+ mistral-ai-ocr paper.pdf -e
128
+ mistral-ai-ocr paper.pdf -m FULL
129
+ mistral-ai-ocr page74.jpg -e
130
+ mistral-ai-ocr -j paper.json
131
+ mistral-ai-ocr -j paper.json -m TEXT_NO_PAGES -n
132
+ ```
133
+
134
+ ## Arguments
135
+
136
+ | Argument || Description |
137
+ |-|-|-|
138
+ | | | input PDF or image file |
139
+ | -k API_KEY | --api-key API_KEY | Mistral API key, can be set via the **MISTRAL_API_KEY** environment variable |
140
+ | -o OUTPUT | --output OUTPUT | output directory path. If not set, a directory will be created next to the input file (in the same parent directory), using the same stem (filename without extension) as the input file |
141
+ | -j JSON_OCR_RESPONSE | --json-ocr-response JSON_OCR_RESPONSE | path from which to load a pre-existing JSON OCR response (any input file will be ignored) |
142
+ | -m MODE | --mode MODE | mode of operation: either the name or numerical value of the mode. _Defaults to FULL_NO_PAGES_ |
143
+ | -s PAGE_SEPARATOR | --page-separator PAGE_SEPARATOR | page separator to use when writing the Markdown file. _Defaults to `\n`_ |
144
+ | -n | --no-json | do not write the JSON OCR response to a file. By default, the response is written |
145
+ | -e | --load-dot-env | load the .env file from the current directory using [`python-dotenv`](https://pypi.org/project/python-dotenv/), to retrieve the Mistral API key |
146
+
147
+ ### Mistral AI API Key
148
+
149
+ To obtain an API key, you need a [Mistral AI](https://auth.mistral.ai/ui/registration) account. Then visit [https://admin.mistral.ai/organization/api-keys](https://admin.mistral.ai/organization/api-keys) and click the **Create new key** button
150
+
151
+ To avoid using `-e` to load the `.env` file, you can create one at `$HOME/.mistral_ai_ocr.env` (where `$HOME` is your home directory). It will then be automatically loaded at the start of the script
152
+
153
+ For example, for a user called `vavilov`, the path would look like this:
154
+
155
+ * **Linux**
156
+ ```
157
+ /home/vavilov/.mistral_ai_ocr.env
158
+ ```
159
+
160
+ * **macOS**
161
+ ```
162
+ /Users/vavilov/.mistral_ai_ocr.env
163
+ ```
164
+
165
+ * **Windows**
166
+ ```
167
+ C:\Users\vavilov\.mistral_ai_ocr.env
168
+ ```
169
+
170
+ and the content will be something like this:
171
+
172
+ ```
173
+ MISTRAL_API_KEY=jrWjJE5lFketfB2sA6vvhQK2SoHQ6R39
174
+ ```
@@ -0,0 +1,7 @@
1
+ mistral_ai_ocr/__init__.py,sha256=wOwicDbjQMcMWubEnPogXRyAiV6JVrJhiXZmWRkPsVw,9248
2
+ mistral_ai_ocr/__main__.py,sha256=5Jrno0r448BT2HdNrXLi2H6RLkUqP76IV4kmB3HuJ6g,3247
3
+ mistral_ai_ocr-1.0.dist-info/METADATA,sha256=hqq21JC0r4mFIy0vY7UYm9R_Mbcr4ebgYawnyGNiPyo,4401
4
+ mistral_ai_ocr-1.0.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
5
+ mistral_ai_ocr-1.0.dist-info/entry_points.txt,sha256=m-ENd87vam6706-mmfzVfBq5q028TKM-7SMLUakWd-U,64
6
+ mistral_ai_ocr-1.0.dist-info/top_level.txt,sha256=4X0WShtu4WEMtVriRP9X2Fia0ORjbAK03bRYimMvRHA,15
7
+ mistral_ai_ocr-1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.37.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ mistral-ai-ocr = mistral_ai_ocr.__main__:main
@@ -0,0 +1 @@
1
+ mistral_ai_ocr