mistocr 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.0.3"
1
+ __version__ = "0.0.4"
mistocr/_modidx.py CHANGED
@@ -13,6 +13,7 @@ d = { 'settings': { 'branch': 'main',
13
13
  'mistocr.core.get_api_key': ('core.html#get_api_key', 'mistocr/core.py'),
14
14
  'mistocr.core.ocr': ('core.html#ocr', 'mistocr/core.py'),
15
15
  'mistocr.core.prep_pdf_batch': ('core.html#prep_pdf_batch', 'mistocr/core.py'),
16
+ 'mistocr.core.read_pgs': ('core.html#read_pgs', 'mistocr/core.py'),
16
17
  'mistocr.core.save_images': ('core.html#save_images', 'mistocr/core.py'),
17
18
  'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
18
19
  'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
mistocr/core.py CHANGED
@@ -4,21 +4,17 @@
4
4
 
5
5
  # %% auto 0
6
6
  __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
7
- 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr']
7
+ 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr', 'read_pgs']
8
8
 
9
9
  # %% ../nbs/00_core.ipynb 3
10
10
  from fastcore.all import *
11
- from dotenv import load_dotenv
12
- import os, json, time, base64, tempfile
11
+ import os, re, json, time, base64, tempfile, logging
13
12
  from io import BytesIO
14
13
  from pathlib import Path
15
14
  from PIL import Image
16
15
  from mistralai import Mistral
17
16
 
18
17
  # %% ../nbs/00_core.ipynb 6
19
- load_dotenv()
20
-
21
- # %% ../nbs/00_core.ipynb 7
22
18
  def get_api_key(
23
19
  key:str=None # Mistral API key
24
20
  ):
@@ -27,11 +23,11 @@ def get_api_key(
27
23
  if not key: raise ValueError("MISTRAL_API_KEY not found")
28
24
  return key
29
25
 
30
- # %% ../nbs/00_core.ipynb 8
26
+ # %% ../nbs/00_core.ipynb 7
31
27
  ocr_model = "mistral-ocr-latest"
32
28
  ocr_endpoint = "/v1/ocr"
33
29
 
34
- # %% ../nbs/00_core.ipynb 11
30
+ # %% ../nbs/00_core.ipynb 10
35
31
  def upload_pdf(
36
32
  path:str, # Path to PDF file
37
33
  key:str=None # Mistral API key
@@ -42,11 +38,11 @@ def upload_pdf(
42
38
  uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
43
39
  return c.files.get_signed_url(file_id=uploaded.id).url, c
44
40
 
45
- # %% ../nbs/00_core.ipynb 16
41
+ # %% ../nbs/00_core.ipynb 15
46
42
  def create_batch_entry(
47
43
  path:str, # Path to PDF file,
48
44
  url:str, # Mistral signed URL
49
- cid:str=None, # Custom ID (by default using the file name without extention)
45
+ cid:str=None, # Custom ID (by default using the file name without extension)
50
46
  inc_img:bool=True # Include image in response
51
47
  ) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
52
48
  "Create a batch entry dict for OCR"
@@ -54,7 +50,7 @@ def create_batch_entry(
54
50
  if not cid: cid = path.stem
55
51
  return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
56
52
 
57
- # %% ../nbs/00_core.ipynb 18
53
+ # %% ../nbs/00_core.ipynb 17
58
54
  def prep_pdf_batch(
59
55
  path:str, # Path to PDF file,
60
56
  cid:str=None, # Custom ID (by default using the file name without extension)
@@ -65,7 +61,7 @@ def prep_pdf_batch(
65
61
  url, c = upload_pdf(path, key)
66
62
  return create_batch_entry(path, url, cid, inc_img), c
67
63
 
68
- # %% ../nbs/00_core.ipynb 22
64
+ # %% ../nbs/00_core.ipynb 21
69
65
  def submit_batch(
70
66
  entries:list[dict], # List of batch entries,
71
67
  c:Mistral=None, # Mistral client,
@@ -79,7 +75,7 @@ def submit_batch(
79
75
  batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
80
76
  return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
81
77
 
82
- # %% ../nbs/00_core.ipynb 25
78
+ # %% ../nbs/00_core.ipynb 24
83
79
  def wait_for_job(
84
80
  job:dict, # Job dict,
85
81
  c:Mistral=None, # Mistral client,
@@ -91,7 +87,7 @@ def wait_for_job(
91
87
  job = c.batch.jobs.get(job_id=job.id)
92
88
  return job
93
89
 
94
- # %% ../nbs/00_core.ipynb 27
90
+ # %% ../nbs/00_core.ipynb 26
95
91
  def download_results(
96
92
  job:dict, # Job dict,
97
93
  c:Mistral=None # Mistral client
@@ -100,7 +96,7 @@ def download_results(
100
96
  content = c.files.download(file_id=job.output_file).read().decode('utf-8')
101
97
  return [json.loads(line) for line in content.strip().split('\n') if line]
102
98
 
103
- # %% ../nbs/00_core.ipynb 32
99
+ # %% ../nbs/00_core.ipynb 31
104
100
  def save_images(
105
101
  page:dict, # Page dict,
106
102
  img_dir:str='img' # Directory to save images
@@ -111,7 +107,7 @@ def save_images(
111
107
  img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
112
108
  Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
113
109
 
114
- # %% ../nbs/00_core.ipynb 33
110
+ # %% ../nbs/00_core.ipynb 32
115
111
  def save_page(
116
112
  page:dict, # Page dict,
117
113
  out_dir:str, # Directory to save page
@@ -123,7 +119,7 @@ def save_page(
123
119
  img_dir.mkdir(exist_ok=True)
124
120
  save_images(page, img_dir)
125
121
 
126
- # %% ../nbs/00_core.ipynb 35
122
+ # %% ../nbs/00_core.ipynb 34
127
123
  def save_pages(
128
124
  ocr_resp:dict, # OCR response,
129
125
  out_dir:str, # Directory to save pages,
@@ -136,7 +132,7 @@ def save_pages(
136
132
  for page in ocr_resp['pages']: save_page(page, out_dir, img_dir)
137
133
  return out_dir
138
134
 
139
- # %% ../nbs/00_core.ipynb 41
135
+ # %% ../nbs/00_core.ipynb 40
140
136
  def _get_paths(path:str) -> list[Path]:
141
137
  "Get list of PDFs from file or folder"
142
138
  path = Path(path)
@@ -147,7 +143,7 @@ def _get_paths(path:str) -> list[Path]:
147
143
  return pdfs
148
144
  raise ValueError(f"Path not found: {path}")
149
145
 
150
- # %% ../nbs/00_core.ipynb 42
146
+ # %% ../nbs/00_core.ipynb 41
151
147
  def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
152
148
  "Prepare batch entries for list of PDFs"
153
149
  entries, c = [], None
@@ -156,7 +152,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
156
152
  entries.append(entry)
157
153
  return entries, c
158
154
 
159
- # %% ../nbs/00_core.ipynb 43
155
+ # %% ../nbs/00_core.ipynb 42
160
156
  def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
161
157
  "Submit batch, wait for completion, and download results"
162
158
  job = submit_batch(entries, c)
@@ -164,7 +160,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
164
160
  if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
165
161
  return download_results(job, c)
166
162
 
167
- # %% ../nbs/00_core.ipynb 44
163
+ # %% ../nbs/00_core.ipynb 43
168
164
  def ocr(
169
165
  path:str, # Path to PDF file or folder,
170
166
  out_dir:str='md', # Directory to save markdown pages,
@@ -177,3 +173,17 @@ def ocr(
177
173
  entries, c = _prep_batch(pdfs, inc_img, key)
178
174
  results = _run_batch(entries, c, poll_interval)
179
175
  return L([save_pages(r['response']['body'], out_dir, r['custom_id']) for r in results])
176
+
177
+ # %% ../nbs/00_core.ipynb 48
178
def read_pgs(
    path:str, # OCR output directory
    pg:int=None, # Page number; `None` reads every page
    ) -> str: # Markdown of the requested page, or of all pages joined by blank lines
    "Read specific page or all pages from OCR output directory"
    path = Path(path)
    # Test against `None` rather than truthiness: `pg=0` must be reported as a
    # missing page, not silently treated as "return the whole document".
    if pg is not None:
        pg_path = path / f'page_{pg}.md'
        if not pg_path.exists(): raise ValueError(f"Page {pg} not found")
        return pg_path.read_text()
    # Keep only files named `page_<number>.md`; a stray match such as
    # `page_notes.md` would otherwise make `int()` raise while sorting.
    numbered = []
    for p in path.glob('page_*.md'):
        suffix = p.stem.split('_', 1)[1]
        if suffix.isdecimal(): numbered.append((int(suffix), p))
    # Sort numerically so that page 10 follows page 9 instead of page 1.
    numbered.sort(key=lambda item: item[0])
    return '\n\n'.join(p.read_text() for _, p in numbered)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -54,10 +54,11 @@ for large document sets.
54
54
  **Cost savings**: Batch OCR mode reduces costs from \$1/1000 pages to
55
55
  \$0.50/1000 pages - a 50% reduction compared to synchronous processing.
56
56
 
57
- **Simplicity**: A single `ocr()` function handles everything -
58
- uploading, batch submission, polling for completion, and saving results
59
- as markdown with extracted images. Process one PDF or an entire folder
60
- with the same simple interface.
57
+ **Simplicity**: A single
58
+ [`ocr()`](https://franckalbinet.github.io/mistocr/core.html#ocr)
59
+ function handles everything - uploading, batch submission, polling for
60
+ completion, and saving results as markdown with extracted images.
61
+ Process one PDF or an entire folder with the same simple interface.
61
62
 
62
63
  **Organized output**: Each PDF is automatically saved to its own folder
63
64
  with pages as separate markdown files and images in an `img` subfolder,
@@ -80,57 +81,60 @@ $ pip install mistocr
80
81
 
81
82
  ## How to use
82
83
 
84
+ ### Basic usage
85
+
86
+ Process a single PDF:
87
+
83
88
  ``` python
84
89
  from mistocr.core import ocr
85
- ```
86
-
87
- - **Process a single PDF:**
88
90
 
89
- <!-- -->
91
+ fname = 'files/test/attention-is-all-you-need.pdf'
92
+ result = ocr(fname)
93
+ ```
90
94
 
91
- fname = 'files/test/attention-is-all-you-need.pdf'
92
- result = ocr(fname)
95
+ Or process an entire folder:
93
96
 
94
97
  ``` python
98
+ results = ocr('files/test')
95
99
  ```
96
100
 
97
- files/test/md/attention-is-all-you-need:
98
- img/ page_11.md page_14.md page_3.md page_6.md page_9.md
99
- page_1.md page_12.md page_15.md page_4.md page_7.md
100
- page_10.md page_13.md page_2.md page_5.md page_8.md
101
+ ### Output structure
101
102
 
102
- files/test/md/attention-is-all-you-need/img:
103
- img-0.jpeg img-1.jpeg img-2.jpeg img-3.jpeg img-4.jpeg
103
+ Each PDF is saved to its own folder with pages as separate markdown
104
+ files and images in an `img` subfolder:
104
105
 
105
- - **Or process an entire folder:**
106
+ files/test/md/
107
+ ├── attention-is-all-you-need/
108
+ │ ├── img/
109
+ │ │ ├── img-0.jpeg
110
+ │ │ ├── img-1.jpeg
111
+ │ │ └── ...
112
+ │ ├── page_1.md
113
+ │ ├── page_2.md
114
+ │ └── ...
115
+ └── resnet/
116
+ ├── img/
117
+ └── ...
106
118
 
107
- ``` python
108
- results = ocr('files/test')
109
- ```
119
+ ### Reading results
110
120
 
111
- ``` python
112
- ```
121
+ Read all pages from a processed PDF:
113
122
 
114
- files/test/md:
115
- attention-is-all-you-need/ resnet/
123
+ ``` python
124
+ from mistocr.core import read_pgs
116
125
 
117
- files/test/md/attention-is-all-you-need:
118
- img/ page_11.md page_14.md page_3.md page_6.md page_9.md
119
- page_1.md page_12.md page_15.md page_4.md page_7.md
120
- page_10.md page_13.md page_2.md page_5.md page_8.md
126
+ text = read_pgs('files/test/md/attention-is-all-you-need')
127
+ ```
121
128
 
122
- files/test/md/attention-is-all-you-need/img:
123
- img-0.jpeg img-1.jpeg img-2.jpeg img-3.jpeg img-4.jpeg
129
+ Or read a specific page:
124
130
 
125
- files/test/md/resnet:
126
- img/ page_10.md page_12.md page_3.md page_5.md page_7.md page_9.md
127
- page_1.md page_11.md page_2.md page_4.md page_6.md page_8.md
131
+ ``` python
132
+ text = read_pgs('files/test/md/attention-is-all-you-need', 10)
133
+ ```
128
134
 
129
- files/test/md/resnet/img:
130
- img-0.jpeg img-2.jpeg img-4.jpeg img-6.jpeg
131
- img-1.jpeg img-3.jpeg img-5.jpeg
135
+ ### Customization
132
136
 
133
- - **Customize the output:**
137
+ Customize output directory, image inclusion, and polling interval:
134
138
 
135
139
  ``` python
136
140
  results = ocr('files/test', out_dir='output', inc_img=False, poll_interval=5)
@@ -0,0 +1,9 @@
1
+ mistocr/__init__.py,sha256=1mptEzQihbdyqqzMgdns_j5ZGK9gz7hR2bsgA_TnjO4,22
2
+ mistocr/_modidx.py,sha256=zA12OvdPdNkQ7K_oQx8rzto1mWnpQa3kyz8N-az6kMw,1843
3
+ mistocr/core.py,sha256=qMV6ZFqs3PNHNUL6o6612WkWzOQiiA1jIKreAaYwORg,7239
4
+ mistocr-0.0.4.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
5
+ mistocr-0.0.4.dist-info/METADATA,sha256=01uXdXnZhKv334UNN1ZNlWCxNeozrptZpvAN9MFYIF4,4825
6
+ mistocr-0.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ mistocr-0.0.4.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
8
+ mistocr-0.0.4.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
9
+ mistocr-0.0.4.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- mistocr/__init__.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
2
- mistocr/_modidx.py,sha256=gViY05_Y4LdQXC5l2yEPG3MX-9M93xf4FJEGh3ns2Fo,1745
3
- mistocr/core.py,sha256=Ur5R8NLvHxduvSVuWNkWwt8xgkrxpnL9cmJjQ5h9thM,6778
4
- mistocr-0.0.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
5
- mistocr-0.0.3.dist-info/METADATA,sha256=aWl_wHxvy5Qrsze7JtTWMQ6FD-l-1QEM-7GZfTeem88,5076
6
- mistocr-0.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- mistocr-0.0.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
8
- mistocr-0.0.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
9
- mistocr-0.0.3.dist-info/RECORD,,