mistocr 0.4.1__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
113
113
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
114
114
  ```
115
115
 
116
- This will (as indicated by the output):
116
+ mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
117
+ mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
118
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
119
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
120
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
121
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
122
+ mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
123
+ mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
124
+ mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
125
+
126
+ Describing 12 images...
127
+
128
+ mistocr.pipeline - INFO - Done!
129
+
130
+ Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
131
+ Adding descriptions to 12 pages...
132
+ Done! Enriched pages saved to files/test/md_test
133
+
134
+ This will:
117
135
 
118
136
  1. OCR the PDF using Mistral’s batch API
119
137
  2. Fix heading hierarchy inconsistencies
@@ -72,7 +72,25 @@ from mistocr.pipeline import pdf_to_md
72
72
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
73
73
  ```
74
74
 
75
- This will (as indicated by the output):
75
+ mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
76
+ mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
77
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
78
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
79
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
80
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
81
+ mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
82
+ mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
83
+ mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
84
+
85
+ Describing 12 images...
86
+
87
+ mistocr.pipeline - INFO - Done!
88
+
89
+ Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
90
+ Adding descriptions to 12 pages...
91
+ Done! Enriched pages saved to files/test/md_test
92
+
93
+ This will:
76
94
 
77
95
  1. OCR the PDF using Mistral’s batch API
78
96
  2. Fix heading hierarchy inconsistencies
@@ -0,0 +1 @@
1
+ __version__ = "0.4.3"
@@ -51,14 +51,26 @@ def create_batch_entry(
51
51
  path:str, # Path to PDF file,
52
52
  url:str, # Mistral signed URL
53
53
  cid:str=None, # Custom ID (by default using the file name without extension)
54
- inc_img:bool=True # Include image in response
54
+ inc_img:bool=True, # Include image in response
55
+ extract_header:bool=True, # Extract headers from document
56
+ extract_footer:bool=True # Extract footers from document
55
57
  ) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
56
58
  "Create a batch entry dict for OCR"
57
59
  path = Path(path)
58
60
  if not cid: cid = path.stem
59
- return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
60
-
61
- # %% ../nbs/00_core.ipynb 18
61
+ return dict(
62
+ custom_id=cid,
63
+ body=dict(
64
+ document=dict(
65
+ type="document_url",
66
+ document_url=url),
67
+ include_image_base64=inc_img,
68
+ extract_header=extract_header,
69
+ extract_footer=extract_footer
70
+ )
71
+ )
72
+
73
+ # %% ../nbs/00_core.ipynb 19
62
74
  def prep_pdf_batch(
63
75
  path:str, # Path to PDF file,
64
76
  cid:str=None, # Custom ID (by default using the file name without extension)
@@ -69,7 +81,7 @@ def prep_pdf_batch(
69
81
  url, c = upload_pdf(path, key)
70
82
  return create_batch_entry(path, url, cid, inc_img), c
71
83
 
72
- # %% ../nbs/00_core.ipynb 22
84
+ # %% ../nbs/00_core.ipynb 23
73
85
  def submit_batch(
74
86
  entries:list[dict], # List of batch entries,
75
87
  c:Mistral=None, # Mistral client,
@@ -83,7 +95,7 @@ def submit_batch(
83
95
  batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
84
96
  return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
85
97
 
86
- # %% ../nbs/00_core.ipynb 25
98
+ # %% ../nbs/00_core.ipynb 26
87
99
  def _check_timeout(
88
100
  queued_time:int, # Time spent in QUEUED state (seconds)
89
101
  timeout:int, # Maximum allowed QUEUED time (seconds)
@@ -92,7 +104,7 @@ def _check_timeout(
92
104
  "Raise TimeoutError if job has been queued longer than timeout"
93
105
  if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
94
106
 
95
- # %% ../nbs/00_core.ipynb 26
107
+ # %% ../nbs/00_core.ipynb 27
96
108
  def wait_for_job(
97
109
  job:dict, # Batch job from submit_batch
98
110
  c:Mistral=None, # Mistral client
@@ -111,7 +123,7 @@ def wait_for_job(
111
123
  if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
112
124
  return job
113
125
 
114
- # %% ../nbs/00_core.ipynb 28
126
+ # %% ../nbs/00_core.ipynb 29
115
127
  def download_results(
116
128
  job:dict, # Job dict,
117
129
  c:Mistral=None # Mistral client
@@ -120,7 +132,7 @@ def download_results(
120
132
  content = c.files.download(file_id=job.output_file).read().decode('utf-8')
121
133
  return [json.loads(line) for line in content.strip().split('\n') if line]
122
134
 
123
- # %% ../nbs/00_core.ipynb 33
135
+ # %% ../nbs/00_core.ipynb 34
124
136
  def save_images(
125
137
  page:dict, # Page dict,
126
138
  img_dir:str='img' # Directory to save images
@@ -131,7 +143,7 @@ def save_images(
131
143
  img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
132
144
  Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
133
145
 
134
- # %% ../nbs/00_core.ipynb 34
146
+ # %% ../nbs/00_core.ipynb 35
135
147
  def save_page(
136
148
  page:dict, # Page dict,
137
149
  dst:str, # Directory to save page
@@ -143,7 +155,7 @@ def save_page(
143
155
  img_dir.mkdir(exist_ok=True)
144
156
  save_images(page, img_dir)
145
157
 
146
- # %% ../nbs/00_core.ipynb 36
158
+ # %% ../nbs/00_core.ipynb 37
147
159
  def save_pages(
148
160
  ocr_resp:dict, # OCR response,
149
161
  dst:str, # Directory to save pages,
@@ -156,7 +168,7 @@ def save_pages(
156
168
  for page in ocr_resp['pages']: save_page(page, dst, img_dir)
157
169
  return dst
158
170
 
159
- # %% ../nbs/00_core.ipynb 42
171
+ # %% ../nbs/00_core.ipynb 43
160
172
  def _get_paths(path:str) -> list[Path]:
161
173
  "Get list of PDFs from file or folder"
162
174
  path = Path(path)
@@ -167,7 +179,7 @@ def _get_paths(path:str) -> list[Path]:
167
179
  return pdfs
168
180
  raise ValueError(f"Path not found: {path}")
169
181
 
170
- # %% ../nbs/00_core.ipynb 43
182
+ # %% ../nbs/00_core.ipynb 44
171
183
  def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
172
184
  "Prepare batch entries for list of PDFs"
173
185
  entries, c = [], None
@@ -176,7 +188,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
176
188
  entries.append(entry)
177
189
  return entries, c
178
190
 
179
- # %% ../nbs/00_core.ipynb 44
191
+ # %% ../nbs/00_core.ipynb 45
180
192
  def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
181
193
  "Submit batch, wait for completion, and download results"
182
194
  job = submit_batch(entries, c)
@@ -184,7 +196,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
184
196
  if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
185
197
  return download_results(job, c)
186
198
 
187
- # %% ../nbs/00_core.ipynb 45
199
+ # %% ../nbs/00_core.ipynb 46
188
200
  def ocr_pdf(
189
201
  path:str, # Path to PDF file or folder,
190
202
  dst:str='md', # Directory to save markdown pages,
@@ -198,7 +210,7 @@ def ocr_pdf(
198
210
  results = _run_batch(entries, c, poll_interval)
199
211
  return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
200
212
 
201
- # %% ../nbs/00_core.ipynb 52
213
+ # %% ../nbs/00_core.ipynb 53
202
214
  def read_pgs(
203
215
  path:str, # OCR output directory,
204
216
  join:bool=True # Join pages into single string
@@ -209,7 +221,7 @@ def read_pgs(
209
221
  contents = L([p.read_text() for p in pgs])
210
222
  return '\n\n'.join(contents) if join else contents
211
223
 
212
- # %% ../nbs/00_core.ipynb 59
224
+ # %% ../nbs/00_core.ipynb 60
213
225
  def subset_pdf(
214
226
  path:str, # Path to PDF file
215
227
  start:int=1, # Start page (1-based)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mistocr
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
5
  Home-page: https://github.com/franckalbinet/mistocr
6
6
  Author: Solveit
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
113
113
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
114
114
  ```
115
115
 
116
- This will (as indicated by the output):
116
+ mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
117
+ mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
118
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
119
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
120
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
121
+ mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
122
+ mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
123
+ mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
124
+ mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
125
+
126
+ Describing 12 images...
127
+
128
+ mistocr.pipeline - INFO - Done!
129
+
130
+ Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
131
+ Adding descriptions to 12 pages...
132
+ Done! Enriched pages saved to files/test/md_test
133
+
134
+ This will:
117
135
 
118
136
  1. OCR the PDF using Mistral’s batch API
119
137
  2. Fix heading hierarchy inconsistencies
@@ -1,7 +1,7 @@
1
1
  [DEFAULT]
2
2
  repo = mistocr
3
3
  lib_name = mistocr
4
- version = 0.4.1
4
+ version = 0.4.3
5
5
  min_python = 3.9
6
6
  license = apache2
7
7
  black_formatting = False
@@ -1 +0,0 @@
1
- __version__ = "0.4.1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes