mistocr 0.1.3__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mistocr/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.3"
1
+ __version__ = "0.2.1"
mistocr/_modidx.py CHANGED
@@ -11,7 +11,7 @@ d = { 'settings': { 'branch': 'main',
11
11
  'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
12
12
  'mistocr.core.download_results': ('core.html#download_results', 'mistocr/core.py'),
13
13
  'mistocr.core.get_api_key': ('core.html#get_api_key', 'mistocr/core.py'),
14
- 'mistocr.core.ocr': ('core.html#ocr', 'mistocr/core.py'),
14
+ 'mistocr.core.ocr_pdf': ('core.html#ocr_pdf', 'mistocr/core.py'),
15
15
  'mistocr.core.prep_pdf_batch': ('core.html#prep_pdf_batch', 'mistocr/core.py'),
16
16
  'mistocr.core.read_pgs': ('core.html#read_pgs', 'mistocr/core.py'),
17
17
  'mistocr.core.save_images': ('core.html#save_images', 'mistocr/core.py'),
@@ -20,10 +20,22 @@ d = { 'settings': { 'branch': 'main',
20
20
  'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
21
21
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
22
22
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
23
+ 'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
23
24
  'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
25
+ 'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
26
+ 'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
27
+ 'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
28
+ 'mistocr.refine.add_img_descs': ('refine.html#add_img_descs', 'mistocr/refine.py'),
29
+ 'mistocr.refine.add_pg_hdgs': ('refine.html#add_pg_hdgs', 'mistocr/refine.py'),
24
30
  'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
31
+ 'mistocr.refine.describe_img': ('refine.html#describe_img', 'mistocr/refine.py'),
32
+ 'mistocr.refine.describe_imgs': ('refine.html#describe_imgs', 'mistocr/refine.py'),
25
33
  'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
26
- 'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
34
+ 'mistocr.refine.fix_hdgs': ('refine.html#fix_hdgs', 'mistocr/refine.py'),
27
35
  'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
28
36
  'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
29
- 'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py')}}}
37
+ 'mistocr.refine.limit': ('refine.html#limit', 'mistocr/refine.py'),
38
+ 'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py'),
39
+ 'mistocr.refine.parse_r': ('refine.html#parse_r', 'mistocr/refine.py'),
40
+ 'mistocr.refine.read_pgs_pg': ('refine.html#read_pgs_pg', 'mistocr/refine.py'),
41
+ 'mistocr.refine.save_img_descs': ('refine.html#save_img_descs', 'mistocr/refine.py')}}}
mistocr/core.py CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  # %% auto 0
6
6
  __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
7
- 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr', 'read_pgs']
7
+ 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
8
8
 
9
9
  # %% ../nbs/00_core.ipynb 3
10
10
  from fastcore.all import *
@@ -79,10 +79,11 @@ def submit_batch(
79
79
  def wait_for_job(
80
80
  job:dict, # Job dict,
81
81
  c:Mistral=None, # Mistral client,
82
- poll_interval:int=10 # Poll interval in seconds
82
+ poll_interval:int=1 # Poll interval in seconds
83
83
  ) -> dict: # Job dict (with status)
84
84
  "Poll job until completion and return final job status"
85
85
  while job.status in ["QUEUED", "RUNNING"]:
86
+ print(f'Mistral batch job status: {job.status}')
86
87
  time.sleep(poll_interval)
87
88
  job = c.batch.jobs.get(job_id=job.id)
88
89
  return job
@@ -161,7 +162,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
161
162
  return download_results(job, c)
162
163
 
163
164
  # %% ../nbs/00_core.ipynb 43
164
- def ocr(
165
+ def ocr_pdf(
165
166
  path:str, # Path to PDF file or folder,
166
167
  dst:str='md', # Directory to save markdown pages,
167
168
  inc_img:bool=True, # Include image in response,
@@ -174,7 +175,7 @@ def ocr(
174
175
  results = _run_batch(entries, c, poll_interval)
175
176
  return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
176
177
 
177
- # %% ../nbs/00_core.ipynb 48
178
+ # %% ../nbs/00_core.ipynb 47
178
179
  def read_pgs(
179
180
  path:str, # OCR output directory,
180
181
  join:bool=True # Join pages into single string
mistocr/pipeline.py ADDED
@@ -0,0 +1,37 @@
1
+ """End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
2
+
3
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
4
+
5
+ # %% auto 0
6
+ __all__ = ['pdf_to_md']
7
+
8
+ # %% ../nbs/02_pipeline.ipynb 3
9
+ from fastcore.all import *
10
+ from .core import read_pgs, ocr_pdf
11
+ from .refine import add_img_descs, fix_hdgs
12
+ from pathlib import Path
13
+ from asyncio import Semaphore, gather, sleep
14
+ import os, json, shutil
15
+
16
+ # %% ../nbs/02_pipeline.ipynb 4
17
+ @delegates(add_img_descs)
18
+ async def pdf_to_md(
19
+ pdf_path:str, # Path to input PDF file
20
+ dst:str, # Destination directory for output markdown
21
+ ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
22
+ model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
23
+ add_img_desc:bool=True, # Whether to add image descriptions
24
+ progress:bool=True, # Whether to show progress messages
25
+ **kwargs):
26
+ "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
27
+ n_steps = 3 if add_img_desc else 2
28
+ if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
29
+ ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
30
+ ocr_dir = ocr_dirs[0]
31
+ if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
32
+ fix_hdgs(ocr_dir, model=model)
33
+ if add_img_desc:
34
+ if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
35
+ await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
36
+ elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
37
+ if progress: print("Done!")
mistocr/refine.py CHANGED
@@ -1,24 +1,30 @@
1
- """Postprocess markdown files by fixing heading hierarchy and describint images"""
1
+ """Fix heading hierarchy and describe images in OCR'd markdown documents"""
2
2
 
3
3
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
4
4
 
5
5
  # %% auto 0
6
- __all__ = ['prompt_fix_hdgs', 'get_hdgs', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut',
7
- 'apply_hdg_fixes', 'fix_md_hdgs']
6
+ __all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
7
+ 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
8
+ 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
9
+ 'add_img_descs']
8
10
 
9
11
  # %% ../nbs/01_refine.ipynb 3
10
12
  from fastcore.all import *
11
13
  from .core import read_pgs
12
14
  from re import sub, findall, MULTILINE
13
15
  from pydantic import BaseModel
16
+ from lisette import *
14
17
  from lisette.core import completion
18
+ from typing import Callable
15
19
  import os
16
20
  import json
21
+ import shutil
22
+ from asyncio import Semaphore, gather, sleep
17
23
 
18
24
  # %% ../nbs/01_refine.ipynb 7
19
25
  def get_hdgs(
20
26
  md:str # Markdown file string
21
- ):
27
+ ) -> L: # L of strings
22
28
  "Return the markdown headings"
23
29
  # Sanitize removing '#' in python snippet if any
24
30
  md = sub(r'```[\s\S]*?```', '', md)
@@ -26,7 +32,25 @@ def get_hdgs(
26
32
 
27
33
 
28
34
 
29
- # %% ../nbs/01_refine.ipynb 10
35
+ # %% ../nbs/01_refine.ipynb 8
36
+ def add_pg_hdgs(
37
+ md:str, # Markdown file string,
38
+ n:int # Page number
39
+ ) -> str: # Markdown file string
40
+ "Add page number to all headings in page markdown"
41
+ md = sub(r'```[\s\S]*?```', '', md)
42
+ def repl(m): return m.group(0) + f' ... page {n}'
43
+ return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
44
+
45
+ # %% ../nbs/01_refine.ipynb 12
46
+ def read_pgs_pg(
47
+ path:str # Path to the markdown file
48
+ ) -> L: # List of markdown pages
49
+ "Read all pages of a markdown file and add page numbers to all headings"
50
+ pgs = read_pgs(path, join=False)
51
+ return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
52
+
53
+ # %% ../nbs/01_refine.ipynb 15
30
54
  def fmt_hdgs_idx(
31
55
  hdgs: list[str] # List of markdown headings
32
56
  ) -> str: # Formatted string with index
@@ -34,19 +58,21 @@ def fmt_hdgs_idx(
34
58
  return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))
35
59
 
36
60
 
37
- # %% ../nbs/01_refine.ipynb 13
61
+ # %% ../nbs/01_refine.ipynb 18
38
62
  class HeadingCorrections(BaseModel):
39
63
  corrections: dict[int, str] # index → corrected heading
40
64
 
41
- # %% ../nbs/01_refine.ipynb 15
65
+ # %% ../nbs/01_refine.ipynb 20
42
66
  prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
43
67
 
44
- INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
68
+ INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
45
69
 
46
70
  RULES - Apply these fixes in order:
47
71
 
48
- 1. **Single H1 rule**: Documents must have exactly ONE # heading (the title/main heading)
49
- - All other headings should be ## or deeper
72
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
73
+ - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
74
+ - If no H1 exists, the first major heading should be #, and all others ## or deeper
75
+ - NO exceptions: appendices, references, and all sections are ## or deeper after the title
50
76
 
51
77
  2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
52
78
  - Parent section (e.g., "1", "2", "A") should be shallower than child (e.g., "1.1", "2.a", "A.1")
@@ -60,58 +86,180 @@ RULES - Apply these fixes in order:
60
86
  4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
61
87
 
62
88
  OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
89
+ IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
63
90
  Only include entries that need changes.
64
91
 
65
92
  Headings to analyze:
66
93
  {headings_list}
67
94
  """
68
95
 
69
- # %% ../nbs/01_refine.ipynb 16
96
+
97
+ # %% ../nbs/01_refine.ipynb 22
70
98
  def fix_hdg_hierarchy(
71
99
  hdgs: list[str], # List of markdown headings
72
- prompt: str=prompt_fix_hdgs, # Prompt to use
100
+ prompt: str=None, # Prompt to use
73
101
  model: str='claude-sonnet-4-5', # Model to use
74
- api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
102
+ api_key: str=None # API key
75
103
  ) -> dict[int, str]: # Dictionary of index → corrected heading
76
104
  "Fix the heading hierarchy"
77
- r = completion(
78
- model=model,
79
- messages=[{"role": "user", "content": prompt_fix_hdgs.format(headings_list=fmt_hdgs_idx(hdgs))}],
80
- response_format=HeadingCorrections,
81
- api_key=api_key
82
- )
105
+ if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
106
+ if prompt is None: prompt = prompt_fix_hdgs
107
+ prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
108
+ r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
83
109
  return json.loads(r.choices[0].message.content)['corrections']
84
110
 
85
- # %% ../nbs/01_refine.ipynb 19
111
+
112
+ # %% ../nbs/01_refine.ipynb 25
113
+ @delegates(fix_hdg_hierarchy)
86
114
  def mk_fixes_lut(
87
115
  hdgs: list[str], # List of markdown headings
88
116
  model: str='claude-sonnet-4-5', # Model to use
89
- api_key: str=os.getenv('ANTHROPIC_API_KEY') # API key
117
+ api_key: str=None, # API key
118
+ **kwargs
90
119
  ) -> dict[str, str]: # Dictionary of old → new heading
91
120
  "Make a lookup table of fixes"
92
- fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key)
121
+ if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
122
+ fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
93
123
  return {hdgs[int(k)]:v for k,v in fixes.items()}
94
124
 
95
- # %% ../nbs/01_refine.ipynb 22
125
+ # %% ../nbs/01_refine.ipynb 28
96
126
  def apply_hdg_fixes(
97
127
  p:str, # Page to fix
98
128
  lut_fixes: dict[str, str], # Lookup table of fixes
99
- pg: int=None, # Optionnaly specify the page number to append to original heading
100
129
  ) -> str: # Page with fixes applied
101
130
  "Apply the fixes to the page"
102
- for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old) + (f' .... page {pg}' if pg else ''))
131
+ for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
103
132
  return p
104
133
 
105
- # %% ../nbs/01_refine.ipynb 25
106
- def fix_md_hdgs(
107
- src:str, # Source directory with markdown pages
108
- model:str='claude-sonnet-4-5', # Model
109
- dst:str=None, # Destination directory (None=overwrite)
110
- pg_nums:bool=True # Add page numbers
111
- ):
134
+ # %% ../nbs/01_refine.ipynb 31
135
+ @delegates(mk_fixes_lut)
136
+ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
112
137
  "Fix heading hierarchy in markdown document"
113
138
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
114
139
  if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
115
- lut = mk_fixes_lut(get_hdgs(read_pgs(src_path)), model)
116
- for i,p in enumerate(read_pgs(src_path, join=False), 1):
117
- (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut, pg=i if pg_nums else None))
140
+ src_imgs = src_path/img_folder
141
+ if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
142
+ pgs_with_pg = read_pgs_pg(src_path)
143
+ lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
144
+ for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
145
+
146
+ # %% ../nbs/01_refine.ipynb 37
147
+ class ImgDescription(BaseModel):
148
+ "Image classification and description for OCR'd documents"
149
+ is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
150
+ description:str # Detailed description of the image content for RAG and accessibility
151
+
152
+ # %% ../nbs/01_refine.ipynb 40
153
+ describe_img_prompt = """Analyze this image from an academic/technical document.
154
+
155
+ Step 1: Determine if this image is informative for understanding the document content.
156
+ - Informative: charts, diagrams, tables, technical illustrations, experimental results, architectural diagrams
157
+ - Non-informative: logos, decorative images, generic photos, page backgrounds
158
+
159
+ Step 2:
160
+ - If informative: Provide a detailed description including the type of visualization, key elements and their relationships, important data or patterns, and relevant technical details.
161
+ - If non-informative: Provide a brief label (e.g., "Company logo", "Decorative header image")
162
+
163
+ Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
164
+
165
+ # %% ../nbs/01_refine.ipynb 41
166
+ async def describe_img(
167
+ img_path: Path, # Path to the image file
168
+ model: str = 'claude-sonnet-4-5', # Model to use
169
+ prompt: str = describe_img_prompt # Prompt for description
170
+ ) -> ImgDescription:
171
+ "Describe a single image using AsyncChat"
172
+ chat = AsyncChat(model=model)
173
+ r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
174
+ return r
175
+
176
+ # %% ../nbs/01_refine.ipynb 45
177
+ async def limit(
178
+ semaphore, # Semaphore for concurrency control
179
+ coro, # Coroutine to execute
180
+ delay:float=None # Optional delay in seconds after execution
181
+ ):
182
+ "Execute coroutine with semaphore-based rate limiting and optional delay"
183
+ async with semaphore:
184
+ r = await coro
185
+ if delay: await sleep(delay)
186
+ return r
187
+
188
+ # %% ../nbs/01_refine.ipynb 47
189
+ def parse_r(
190
+ result # ModelResponse object from API call
191
+ ): # Dictionary with 'is_informative' and 'description' keys
192
+ "Extract and parse JSON content from model response"
193
+ return json.loads(result.choices[0].message.content)
194
+
195
+ # %% ../nbs/01_refine.ipynb 49
196
+ async def describe_imgs(
197
+ imgs: list[Path], # List of image file paths to describe
198
+ model: str = 'claude-sonnet-4-5', # Model to use for image description
199
+ prompt: str = describe_img_prompt, # Prompt template for description
200
+ semaphore: int = 2, # Max concurrent API requests
201
+ delay: float = 1 # Delay in seconds between requests
202
+ ) -> dict[str, dict]: # Dict mapping filename to parsed description
203
+ "Describe multiple images in parallel with rate limiting"
204
+ sem = Semaphore(semaphore)
205
+ results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
206
+ return {img.name: parse_r(r) for img, r in zip(imgs, results)}
207
+
208
+ # %% ../nbs/01_refine.ipynb 51
209
+ def save_img_descs(
210
+ descs: dict, # Dictionary of image descriptions
211
+ dst_fname: Path, # Path to save the JSON file
212
+ ) -> None:
213
+ "Save image descriptions to JSON file"
214
+ Path(dst_fname).write_text(json.dumps(descs, indent=2))
215
+
216
+ # %% ../nbs/01_refine.ipynb 56
217
+ def add_descs_to_pg(
218
+ pg:str, # Page markdown content
219
+ descs:dict # Dictionary mapping image filenames to their descriptions
220
+ ) -> str: # Page markdown with descriptions added
221
+ "Add AI-generated descriptions to images in page"
222
+ for link in re.findall(r'!\[[^\]]*\]\([^)]+\)', pg):
223
+ fname = re.findall(r'\(([^)]+)\)', link)[0]
224
+ if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
225
+ return pg
226
+
227
+ # %% ../nbs/01_refine.ipynb 61
228
+ def add_descs_to_pgs(
229
+ pgs:list, # List of page markdown strings
230
+ descs:dict # Dictionary mapping image filenames to their descriptions
231
+ ) -> list: # List of pages with descriptions added
232
+ "Add AI-generated descriptions to images in all pages"
233
+ return [add_descs_to_pg(pg, descs) for pg in pgs]
234
+
235
+ # %% ../nbs/01_refine.ipynb 64
236
+ async def add_img_descs(
237
+ src:str, # Path to source markdown directory
238
+ dst:str=None, # Destination directory (defaults to src if None)
239
+ model:str='claude-sonnet-4-5', # Vision model for image description
240
+ img_folder:str='img', # Name of folder containing images
241
+ semaphore:int=2, # Max concurrent API requests
242
+ delay:float=1, # Delay in seconds between API calls
243
+ force:bool=False, # Force regeneration even if cache exists
244
+ progress:bool=True # Print progress messages
245
+ ):
246
+ "Describe all images in markdown document and insert descriptions inline"
247
+ src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
248
+ if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
249
+ src_imgs = src_path/img_folder
250
+ if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
251
+ desc_file = src_path/'img_descriptions.json'
252
+ if desc_file.exists() and not force:
253
+ if progress: print(f"Loading existing descriptions from {desc_file}")
254
+ descs = json.loads(desc_file.read_text())
255
+ else:
256
+ imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
257
+ if progress: print(f"Describing {len(imgs)} images...")
258
+ descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
259
+ save_img_descs(descs, desc_file)
260
+ if progress: print(f"Saved descriptions to {desc_file}")
261
+ pgs = read_pgs(src_path, join=False)
262
+ if progress: print(f"Adding descriptions to {len(pgs)} pages...")
263
+ enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
264
+ for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
265
+ if progress: print(f"Done! Enriched pages saved to {dst_path}")
@@ -0,0 +1,253 @@
1
+ Metadata-Version: 2.4
2
+ Name: mistocr
3
+ Version: 0.2.1
4
+ Summary: Batch OCR for PDFs with heading restoration and visual content integration
5
+ Home-page: https://github.com/franckalbinet/mistocr
6
+ Author: Solveit
7
+ Author-email: nobody@fast.ai
8
+ License: Apache Software License 2.0
9
+ Keywords: nbdev jupyter notebook python
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Natural Language :: English
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: License :: OSI Approved :: Apache Software License
18
+ Requires-Python: >=3.9
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: fastcore
22
+ Requires-Dist: mistralai
23
+ Requires-Dist: pillow
24
+ Requires-Dist: dotenv
25
+ Requires-Dist: lisette
26
+ Provides-Extra: dev
27
+ Dynamic: author
28
+ Dynamic: author-email
29
+ Dynamic: classifier
30
+ Dynamic: description
31
+ Dynamic: description-content-type
32
+ Dynamic: home-page
33
+ Dynamic: keywords
34
+ Dynamic: license
35
+ Dynamic: license-file
36
+ Dynamic: provides-extra
37
+ Dynamic: requires-dist
38
+ Dynamic: requires-python
39
+ Dynamic: summary
40
+
41
+ # mistocr
42
+
43
+
44
+ <!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
45
+
46
+ **PDF OCR is a critical bottleneck in AI pipelines.** It’s often
47
+ mentioned in passing, as if it’s a trivial step. Practice shows it’s far
48
+ from it. Poorly converted PDFs mean garbage-in-garbage-out for
49
+ downstream AI systems (RAG, …).
50
+
51
+ When [Mistral AI](https://mistral.ai) released their [state-of-the-art
52
+ OCR model](https://mistral.ai/fr/news/mistral-ocr) in March 2025, it
53
+ opened new possibilities for large-scale document processing. While
54
+ alternatives like [datalab.to](https://www.datalab.to) and
55
+ [docling.ai](https://www.docling.ai) offer viable solutions, Mistral OCR
56
+ delivers exceptional accuracy at a compelling price point.
57
+
58
+ **mistocr** emerged from months of real-world usage across projects
59
+ requiring large-scale processing of niche-domain PDFs. It addresses three
60
+ fundamental challenges that raw OCR output leaves unsolved:
61
+
62
+ - **Heading hierarchy restoration**: Even state-of-the-art OCR sometimes
63
+ produces inconsistent heading levels in large documents—a complex task
64
+ to get right. mistocr uses LLM-based analysis to restore proper
65
+ document structure, essential for downstream AI tasks.
66
+
67
+ - **Visual content integration**: Charts, figures and diagrams are
68
+ automatically classified and described, then integrated into the
69
+ markdown. This makes visual information searchable and accessible for
70
+ downstream applications.
71
+
72
+ - **Cost-efficient batch processing**: By exclusively using Mistral’s
73
+ batch API, mistocr cuts costs by 50% (\$0.50 vs \$1.00 per 1000 pages)
74
+ while eliminating the boilerplate code typically required.
75
+
76
+ **In short**: Production-ready batch OCR with intelligent postprocessing
77
+ that ensures your documents are actually usable for AI systems.
78
+
79
+ ## Get Started
80
+
81
+ Install latest from [pypi](https://pypi.org/project/mistocr), then:
82
+
83
+ ``` sh
84
+ $ pip install mistocr
85
+ ```
86
+
87
+ Set your API keys:
88
+
89
+ ``` python
90
+ import os
91
+ os.environ['MISTRAL_API_KEY'] = 'your-key-here'
92
+ os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Advanced Usage for other LLMs)
93
+ ```
94
+
95
+ ### Complete Pipeline
96
+
97
+ Full pipeline with all features:
98
+
99
+ ``` python
100
+ from mistocr.pipeline import pdf_to_md
101
+ await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
102
+ ```
103
+
104
+ Step 1/3: Running OCR on files/test/resnet.pdf...
105
+ Mistral batch job status: QUEUED
106
+ Mistral batch job status: RUNNING
107
+ Mistral batch job status: RUNNING
108
+ Step 2/3: Fixing heading hierarchy...
109
+ Step 3/3: Adding image descriptions...
110
+ Describing 7 images...
111
+ Saved descriptions to ocr_temp/resnet/img_descriptions.json
112
+ Adding descriptions to 12 pages...
113
+ Done! Enriched pages saved to files/test/md_test
114
+ Done!
115
+
116
+ This will (as indicated by the output):
117
+
118
+ 1. OCR the PDF using Mistral’s batch API
119
+ 2. Fix heading hierarchy inconsistencies
120
+ 3. Describe images (charts, diagrams) and add those descriptions into
121
+ the markdown, then save everything to `files/test/md_test`
122
+
123
+ The output structure will be:
124
+
125
+ files/test/md_test/
126
+ ├── img/
127
+ │ ├── img-0.jpeg
128
+ │ ├── img-1.jpeg
129
+ │ └── ...
130
+ ├── page_1.md
131
+ ├── page_2.md
132
+ └── ...
133
+
134
+ Each page’s markdown will include inline image descriptions:
135
+
136
+ ```` markdown
137
+ ```markdown
138
+ ![Figure 1](img/img-0.jpeg)
139
+ AI-generated image description:
140
+ ___
141
+ A residual learning block...
142
+ ___
143
+ ```
144
+ ````
145
+
146
+ To print the processed markdown, you can use the
147
+ [`read_pgs`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
148
+ function. Here’s how:
149
+
150
+ Then to read the fully processed document:
151
+
152
+ ``` python
153
+ from mistocr.pipeline import read_pgs
154
+ md = read_pgs('files/test/md_test')
155
+ print(md[:500])
156
+ ```
157
+
158
+ # Deep Residual Learning for Image Recognition ... page 1
159
+
160
+ Kaiming He Xiangyu Zhang Shaoqing Ren Jian Sun<br>Microsoft Research<br>\{kahe, v-xiangz, v-shren, jiansun\}@microsoft.com
161
+
162
+
163
+ ## Abstract ... page 1
164
+
165
+ Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, ins
166
+
167
+ By default,
168
+ [`read_pgs()`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
169
+ joins all pages. Pass `join=False` to get a list of individual pages
170
+ instead.
171
+
172
+ ### Advanced Usage
173
+
174
+ **Batch process entire folders:**
175
+
176
+ ``` python
177
+ from mistocr.core import ocr_pdf
178
+
179
+ # Process all PDFs in a folder
180
+ output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
181
+ ```
182
+
183
+ **Custom models and prompts for heading fixes:**
184
+
185
+ ``` python
186
+ from mistocr.refine import fix_hdgs
187
+
188
+ # Use a different model or custom prompt
189
+ fix_hdgs('ocr_output/doc1',
190
+ model='gpt-4o',
191
+ prompt=your_custom_prompt)
192
+ ```
193
+
194
+ **Custom image description with rate limiting:**
195
+
196
+ ``` python
197
+ from mistocr.refine import add_img_descs
198
+
199
+ # Control API usage and customize descriptions
200
+ await add_img_descs('ocr_output/doc1',
201
+ model='claude-opus-4',
202
+ semaphore=5, # More concurrent requests
203
+ delay=0.5) # Shorter delay between calls
204
+ ```
205
+
206
+ For complete control over each pipeline step, see the
207
+ [core](https://fr.anckalbi.net/mistocr/core.html),
208
+ [refine](https://fr.anckalbi.net/mistocr/refine.html), and
209
+ [pipeline](https://fr.anckalbi.net/mistocr/pipeline.html) module
210
+ documentation.
211
+
212
+ ## Known Limitations & Future Work
213
+
214
+ `mistocr` is under active development. Current limitations include:
215
+
216
+ - **No timeout on batch jobs**: Jobs poll indefinitely until completion.
217
+ If a job stalls, manual intervention is required.
218
+ - **Limited error handling**: When batch jobs fail, error reporting and
219
+ recovery options are minimal.
220
+ - **Progress monitoring**: Currently limited to periodic status prints.
221
+ Future versions will support callbacks or streaming updates for better
222
+ real-time monitoring.
223
+
224
+ Contributions are welcome! If you encounter issues or have ideas for
225
+ improvements, please open an issue or discussion on
226
+ [GitHub](https://github.com/franckalbinet/mistocr).
227
+
228
+ ## Developer Guide
229
+
230
+ If you are new to using `nbdev`, here are some useful pointers to get you
231
+ started.
232
+
233
+ ### Install mistocr in Development mode
234
+
235
+ ``` sh
236
+ # make sure mistocr package is installed in development mode
237
+ $ pip install -e .
238
+
239
+ # make changes under nbs/ directory
240
+ # ...
241
+
242
+ # compile to have changes apply to mistocr
243
+ $ nbdev_prepare
244
+ ```
245
+
246
+ ### Documentation
247
+
248
+ Documentation can be found hosted on this GitHub
249
+ [repository](https://github.com/franckalbinet/mistocr)’s
250
+ [pages](https://franckalbinet.github.io/mistocr/). Additionally you can
251
+ find package manager specific guidelines on
252
+ [conda](https://anaconda.org/franckalbinet/mistocr) and
253
+ [pypi](https://pypi.org/project/mistocr/) respectively.
@@ -0,0 +1,11 @@
1
+ mistocr/__init__.py,sha256=HfjVOrpTnmZ-xVFCYSVmX50EXaBQeJteUHG-PD6iQs8,22
2
+ mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
3
+ mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
4
+ mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
5
+ mistocr/refine.py,sha256=zsPoxWD63bk1rzRVO9OPsevWeMNORHgT_y8H7T7CxYs,11785
6
+ mistocr-0.2.1.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
7
+ mistocr-0.2.1.dist-info/METADATA,sha256=-y9Ze92RygrKGCfHbBjlGXlv-5iRYVAOyHtC9MHnplw,7990
8
+ mistocr-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ mistocr-0.2.1.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
10
+ mistocr-0.2.1.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
11
+ mistocr-0.2.1.dist-info/RECORD,,
@@ -1,183 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: mistocr
3
- Version: 0.1.3
4
- Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
5
- Home-page: https://github.com/franckalbinet/mistocr
6
- Author: Solveit
7
- Author-email: nobody@fast.ai
8
- License: Apache Software License 2.0
9
- Keywords: nbdev jupyter notebook python
10
- Classifier: Development Status :: 4 - Beta
11
- Classifier: Intended Audience :: Developers
12
- Classifier: Natural Language :: English
13
- Classifier: Programming Language :: Python :: 3.9
14
- Classifier: Programming Language :: Python :: 3.10
15
- Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
- Classifier: License :: OSI Approved :: Apache Software License
18
- Requires-Python: >=3.9
19
- Description-Content-Type: text/markdown
20
- License-File: LICENSE
21
- Requires-Dist: fastcore
22
- Requires-Dist: mistralai
23
- Requires-Dist: pillow
24
- Requires-Dist: dotenv
25
- Requires-Dist: lisette
26
- Provides-Extra: dev
27
- Dynamic: author
28
- Dynamic: author-email
29
- Dynamic: classifier
30
- Dynamic: description
31
- Dynamic: description-content-type
32
- Dynamic: home-page
33
- Dynamic: keywords
34
- Dynamic: license
35
- Dynamic: license-file
36
- Dynamic: provides-extra
37
- Dynamic: requires-dist
38
- Dynamic: requires-python
39
- Dynamic: summary
40
-
41
- # mistocr
42
-
43
-
44
- <!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
45
-
46
- ## Why mistocr?
47
-
48
- **Performance**: Mistral’s OCR delivers state-of-the-art accuracy on
49
- complex documents including tables, charts, and multi-column layouts.
50
-
51
- **Scale**: Process entire folders of PDFs in a single batch job. Upload
52
- once, process asynchronously, and retrieve results when ready - perfect
53
- for large document sets.
54
-
55
- **Cost savings**: Batch OCR mode reduces costs from \$1/1000 pages to
56
- \$0.50/1000 pages - a 50% reduction compared to synchronous processing.
57
-
58
- **Simplicity**: A single
59
- [`ocr()`](https://franckalbinet.github.io/mistocr/core.html#ocr)
60
- function handles everything - uploading, batch submission, polling for
61
- completion, and saving results as markdown with extracted images.
62
- Process one PDF or an entire folder with the same simple interface.
63
-
64
- **Organized output**: Each PDF is automatically saved to its own folder
65
- with pages as separate markdown files and images in an `img` subfolder,
66
- making results easy to navigate and process further.
67
-
68
- ## Installation
69
-
70
- Install latest from the GitHub
71
- [repository](https://github.com/franckalbinet/mistocr):
72
-
73
- ``` sh
74
- $ pip install git+https://github.com/franckalbinet/mistocr.git
75
- ```
76
-
77
- or from [pypi](https://pypi.org/project/mistocr/)
78
-
79
- ``` sh
80
- $ pip install mistocr
81
- ```
82
-
83
- ## How to use
84
-
85
- ### Basic usage
86
-
87
- Process a single PDF:
88
-
89
- ``` python
90
- from mistocr.core import ocr
91
-
92
- fname = 'files/test/attention-is-all-you-need.pdf'
93
- result = ocr(fname)
94
- ```
95
-
96
- Or process an entire folder:
97
-
98
- ``` python
99
- results = ocr('files/test')
100
- ```
101
-
102
- ### Output structure
103
-
104
- Each PDF is saved to its own folder with pages as separate markdown
105
- files and images in an `img` subfolder:
106
-
107
- files/test/md/
108
- ├── attention-is-all-you-need/
109
- │ ├── img/
110
- │ │ ├── img-0.jpeg
111
- │ │ ├── img-1.jpeg
112
- │ │ └── ...
113
- │ ├── page_1.md
114
- │ ├── page_2.md
115
- │ └── ...
116
- └── resnet/
117
- ├── img/
118
- └── ...
119
-
120
- ### Reading results
121
-
122
- Read all pages from a processed PDF:
123
-
124
- ``` python
125
- from mistocr.core import read_pgs
126
-
127
- text = read_pgs('files/test/md/attention-is-all-you-need')
128
- ```
129
-
130
- Or read a specific page:
131
-
132
- ``` python
133
- text = read_pgs('files/test/md/attention-is-all-you-need', 10)
134
- ```
135
-
136
- ### Customization
137
-
138
- Customize output directory, image inclusion, and polling interval:
139
-
140
- ``` python
141
- results = ocr('files/test', out_dir='output', inc_img=False, poll_interval=5)
142
- ```
143
-
144
- **Parameters:**
145
-
146
- - **`path`**: A single PDF file or folder containing multiple PDFs
147
- - **`out_dir`**: Directory name for saving markdown output (default:
148
- `'md'`)
149
- - **`inc_img`**: Include extracted images in the output (default:
150
- `True`)
151
- - **`key`**: Your Mistral API key (uses `MISTRAL_API_KEY` environment
152
- variable if not provided)
153
- - **`poll_interval`**: Seconds between batch job status checks (default:
154
- `2`)
155
-
156
- **Returns:** List of paths to the generated markdown files
157
-
158
- ## Developer Guide
159
-
160
- If you are new to using `nbdev` here are some useful pointers to get you
161
- started.
162
-
163
- ### Install mistocr in Development mode
164
-
165
- ``` sh
166
- # make sure mistocr package is installed in development mode
167
- $ pip install -e .
168
-
169
- # make changes under nbs/ directory
170
- # ...
171
-
172
- # compile to have changes apply to mistocr
173
- $ nbdev_prepare
174
- ```
175
-
176
- ### Documentation
177
-
178
- Documentation can be found hosted on this GitHub
179
- [repository](https://github.com/franckalbinet/mistocr)’s
180
- [pages](https://franckalbinet.github.io/mistocr/). Additionally you can
181
- find package manager specific guidelines on
182
- [conda](https://anaconda.org/franckalbinet/mistocr) and
183
- [pypi](https://pypi.org/project/mistocr/) respectively.
@@ -1,10 +0,0 @@
1
- mistocr/__init__.py,sha256=XEqb2aiIn8fzGE68Mph4ck1FtQqsR_am0wRWvrYPffQ,22
2
- mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
3
- mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
4
- mistocr/refine.py,sha256=572SDG8vhGjNMiET5eZhgVemNpUIHNFqi0ZSSl4eKCM,4545
5
- mistocr-0.1.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
6
- mistocr-0.1.3.dist-info/METADATA,sha256=jHRc6nm_uk7V-03y6Bd268hUWmkkOFNdt4s5cH3YPu0,4848
7
- mistocr-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- mistocr-0.1.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
9
- mistocr-0.1.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
10
- mistocr-0.1.3.dist-info/RECORD,,