mistocr 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.4.2"
1
+ __version__ = "0.4.3"
mistocr/core.py CHANGED
@@ -52,8 +52,8 @@ def create_batch_entry(
52
52
  url:str, # Mistral signed URL
53
53
  cid:str=None, # Custom ID (by default using the file name without extension)
54
54
  inc_img:bool=True, # Include image in response
55
- extract_header:bool=False, # Extract headers from document
56
- extract_footer:bool=False # Extract footers from document
55
+ extract_header:bool=True, # Extract headers from document
56
+ extract_footer:bool=True # Extract footers from document
57
57
  ) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
58
58
  "Create a batch entry dict for OCR"
59
59
  path = Path(path)
@@ -70,7 +70,7 @@ def create_batch_entry(
70
70
  )
71
71
  )
72
72
 
73
- # %% ../nbs/00_core.ipynb 18
73
+ # %% ../nbs/00_core.ipynb 19
74
74
  def prep_pdf_batch(
75
75
  path:str, # Path to PDF file,
76
76
  cid:str=None, # Custom ID (by default using the file name without extension)
@@ -81,7 +81,7 @@ def prep_pdf_batch(
81
81
  url, c = upload_pdf(path, key)
82
82
  return create_batch_entry(path, url, cid, inc_img), c
83
83
 
84
- # %% ../nbs/00_core.ipynb 22
84
+ # %% ../nbs/00_core.ipynb 23
85
85
  def submit_batch(
86
86
  entries:list[dict], # List of batch entries,
87
87
  c:Mistral=None, # Mistral client,
@@ -95,7 +95,7 @@ def submit_batch(
95
95
  batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
96
96
  return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
97
97
 
98
- # %% ../nbs/00_core.ipynb 25
98
+ # %% ../nbs/00_core.ipynb 26
99
99
  def _check_timeout(
100
100
  queued_time:int, # Time spent in QUEUED state (seconds)
101
101
  timeout:int, # Maximum allowed QUEUED time (seconds)
@@ -104,7 +104,7 @@ def _check_timeout(
104
104
  "Raise TimeoutError if job has been queued longer than timeout"
105
105
  if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
106
106
 
107
- # %% ../nbs/00_core.ipynb 26
107
+ # %% ../nbs/00_core.ipynb 27
108
108
  def wait_for_job(
109
109
  job:dict, # Batch job from submit_batch
110
110
  c:Mistral=None, # Mistral client
@@ -123,7 +123,7 @@ def wait_for_job(
123
123
  if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
124
124
  return job
125
125
 
126
- # %% ../nbs/00_core.ipynb 28
126
+ # %% ../nbs/00_core.ipynb 29
127
127
  def download_results(
128
128
  job:dict, # Job dict,
129
129
  c:Mistral=None # Mistral client
@@ -132,7 +132,7 @@ def download_results(
132
132
  content = c.files.download(file_id=job.output_file).read().decode('utf-8')
133
133
  return [json.loads(line) for line in content.strip().split('\n') if line]
134
134
 
135
- # %% ../nbs/00_core.ipynb 33
135
+ # %% ../nbs/00_core.ipynb 34
136
136
  def save_images(
137
137
  page:dict, # Page dict,
138
138
  img_dir:str='img' # Directory to save images
@@ -143,7 +143,7 @@ def save_images(
143
143
  img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
144
144
  Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
145
145
 
146
- # %% ../nbs/00_core.ipynb 34
146
+ # %% ../nbs/00_core.ipynb 35
147
147
  def save_page(
148
148
  page:dict, # Page dict,
149
149
  dst:str, # Directory to save page
@@ -155,7 +155,7 @@ def save_page(
155
155
  img_dir.mkdir(exist_ok=True)
156
156
  save_images(page, img_dir)
157
157
 
158
- # %% ../nbs/00_core.ipynb 36
158
+ # %% ../nbs/00_core.ipynb 37
159
159
  def save_pages(
160
160
  ocr_resp:dict, # OCR response,
161
161
  dst:str, # Directory to save pages,
@@ -168,7 +168,7 @@ def save_pages(
168
168
  for page in ocr_resp['pages']: save_page(page, dst, img_dir)
169
169
  return dst
170
170
 
171
- # %% ../nbs/00_core.ipynb 42
171
+ # %% ../nbs/00_core.ipynb 43
172
172
  def _get_paths(path:str) -> list[Path]:
173
173
  "Get list of PDFs from file or folder"
174
174
  path = Path(path)
@@ -179,7 +179,7 @@ def _get_paths(path:str) -> list[Path]:
179
179
  return pdfs
180
180
  raise ValueError(f"Path not found: {path}")
181
181
 
182
- # %% ../nbs/00_core.ipynb 43
182
+ # %% ../nbs/00_core.ipynb 44
183
183
  def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
184
184
  "Prepare batch entries for list of PDFs"
185
185
  entries, c = [], None
@@ -188,7 +188,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
188
188
  entries.append(entry)
189
189
  return entries, c
190
190
 
191
- # %% ../nbs/00_core.ipynb 44
191
+ # %% ../nbs/00_core.ipynb 45
192
192
  def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
193
193
  "Submit batch, wait for completion, and download results"
194
194
  job = submit_batch(entries, c)
@@ -196,7 +196,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
196
196
  if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
197
197
  return download_results(job, c)
198
198
 
199
- # %% ../nbs/00_core.ipynb 45
199
+ # %% ../nbs/00_core.ipynb 46
200
200
  def ocr_pdf(
201
201
  path:str, # Path to PDF file or folder,
202
202
  dst:str='md', # Directory to save markdown pages,
@@ -210,7 +210,7 @@ def ocr_pdf(
210
210
  results = _run_batch(entries, c, poll_interval)
211
211
  return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
212
212
 
213
- # %% ../nbs/00_core.ipynb 52
213
+ # %% ../nbs/00_core.ipynb 53
214
214
  def read_pgs(
215
215
  path:str, # OCR output directory,
216
216
  join:bool=True # Join pages into single string
@@ -221,7 +221,7 @@ def read_pgs(
221
221
  contents = L([p.read_text() for p in pgs])
222
222
  return '\n\n'.join(contents) if join else contents
223
223
 
224
- # %% ../nbs/00_core.ipynb 59
224
+ # %% ../nbs/00_core.ipynb 60
225
225
  def subset_pdf(
226
226
  path:str, # Path to PDF file
227
227
  start:int=1, # Start page (1-based)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=Nyg0pmk5ea9-SLCAFEIF96ByFx4-TJFtrqYPN-Zn6g4,22
2
+ mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
3
+ mistocr/core.py,sha256=DQzJqbZqdSNLNvWjuXGQx9Z1pha00s5dWuBds3dHpno,9505
4
+ mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
5
+ mistocr/refine.py,sha256=Q14DhUUsT5FLMxP9oIJ2TGQ3qbxe7ulXfRMPKpsd4Wo,13232
6
+ mistocr-0.4.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.4.3.dist-info/METADATA,sha256=xflyT5mn1mEOmzBv4vvGhPazaHc6oq6jCe9A7mPCpT0,9011
8
+ mistocr-0.4.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
9
+ mistocr-0.4.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.4.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.4.3.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- mistocr/__init__.py,sha256=6hfVa12Q-nXyUEXr6SyKpqPEDJW6vlRHyPxlA27PfTs,22
2
- mistocr/_modidx.py,sha256=LyFez7ndKOXQpF0unhg1imCrW7xcUwO106W82ttVnnk,4366
3
- mistocr/core.py,sha256=nIwqszvMMvlLgYtweIq4hN1UsA_-P5K6k9YE1BIQn2g,9507
4
- mistocr/pipeline.py,sha256=n8AHIHGZBXYy_k7LtEBZSiTL6HClT84-7K4QlTjOLvo,2107
5
- mistocr/refine.py,sha256=Q14DhUUsT5FLMxP9oIJ2TGQ3qbxe7ulXfRMPKpsd4Wo,13232
6
- mistocr-0.4.2.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
- mistocr-0.4.2.dist-info/METADATA,sha256=lfBxmqi3TP9XSvTY_1qhSdMfjH0fr9IXzeo0-sgE2eo,9011
8
- mistocr-0.4.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
9
- mistocr-0.4.2.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
- mistocr-0.4.2.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
- mistocr-0.4.2.dist-info/RECORD,,