mistocr 0.2.7__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mistocr
- Version: 0.2.7
+ Version: 0.4.1
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
  Home-page: https://github.com/franckalbinet/mistocr
  Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
  Requires-Dist: pillow
  Requires-Dist: dotenv
  Requires-Dist: lisette
+ Requires-Dist: PyPDF2
  Provides-Extra: dev
  Dynamic: author
  Dynamic: author-email
@@ -112,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
  ```

- Step 1/3: Running OCR on files/test/resnet.pdf...
- Mistral batch job status: QUEUED
- Mistral batch job status: RUNNING
- Mistral batch job status: RUNNING
- Step 2/3: Fixing heading hierarchy...
- Step 3/3: Adding image descriptions...
- Describing 7 images...
- Saved descriptions to ocr_temp/resnet/img_descriptions.json
- Adding descriptions to 12 pages...
- Done! Enriched pages saved to files/test/md_test
- Done!
-
  This will (as indicated by the output):

  1. OCR the PDF using Mistral’s batch API
@@ -72,18 +72,6 @@ from mistocr.pipeline import pdf_to_md
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
  ```

- Step 1/3: Running OCR on files/test/resnet.pdf...
- Mistral batch job status: QUEUED
- Mistral batch job status: RUNNING
- Mistral batch job status: RUNNING
- Step 2/3: Fixing heading hierarchy...
- Step 3/3: Adding image descriptions...
- Describing 7 images...
- Saved descriptions to ocr_temp/resnet/img_descriptions.json
- Adding descriptions to 12 pages...
- Done! Enriched pages saved to files/test/md_test
- Done!
-
  This will (as indicated by the output):

  1. OCR the PDF using Mistral’s batch API
@@ -0,0 +1 @@
+ __version__ = "0.4.1"
@@ -5,7 +5,8 @@ d = { 'settings': { 'branch': 'main',
  'doc_host': 'https://franckalbinet.github.io',
  'git_url': 'https://github.com/franckalbinet/mistocr',
  'lib_path': 'mistocr'},
- 'syms': { 'mistocr.core': { 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
+ 'syms': { 'mistocr.core': { 'mistocr.core._check_timeout': ('core.html#_check_timeout', 'mistocr/core.py'),
+ 'mistocr.core._get_paths': ('core.html#_get_paths', 'mistocr/core.py'),
  'mistocr.core._prep_batch': ('core.html#_prep_batch', 'mistocr/core.py'),
  'mistocr.core._run_batch': ('core.html#_run_batch', 'mistocr/core.py'),
  'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
@@ -18,6 +19,7 @@ d = { 'settings': { 'branch': 'main',
  'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
  'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
  'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
+ 'mistocr.core.subset_pdf': ('core.html#subset_pdf', 'mistocr/core.py'),
  'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
  'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
  'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
@@ -3,8 +3,9 @@
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.

  # %% auto 0
- __all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
- 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
+ __all__ = ['logger', 'ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch',
+ 'submit_batch', 'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf',
+ 'read_pgs', 'subset_pdf']

  # %% ../nbs/00_core.ipynb 3
  from fastcore.all import *
@@ -13,8 +14,15 @@ from io import BytesIO
  from pathlib import Path
  from PIL import Image
  from mistralai import Mistral
+ import PyPDF2
+ import logging

- # %% ../nbs/00_core.ipynb 6
+ # %% ../nbs/00_core.ipynb 4
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+ logger.setLevel(logging.DEBUG)
+
+ # %% ../nbs/00_core.ipynb 7
  def get_api_key(
  key:str=None # Mistral API key
  ):
@@ -23,11 +31,11 @@
  if not key: raise ValueError("MISTRAL_API_KEY not found")
  return key

- # %% ../nbs/00_core.ipynb 7
+ # %% ../nbs/00_core.ipynb 8
  ocr_model = "mistral-ocr-latest"
  ocr_endpoint = "/v1/ocr"

- # %% ../nbs/00_core.ipynb 10
+ # %% ../nbs/00_core.ipynb 11
  def upload_pdf(
  path:str, # Path to PDF file
  key:str=None # Mistral API key
@@ -38,7 +46,7 @@
  uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
  return c.files.get_signed_url(file_id=uploaded.id).url, c

- # %% ../nbs/00_core.ipynb 15
+ # %% ../nbs/00_core.ipynb 16
  def create_batch_entry(
  path:str, # Path to PDF file,
  url:str, # Mistral signed URL
@@ -50,7 +58,7 @@
  if not cid: cid = path.stem
  return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))

- # %% ../nbs/00_core.ipynb 17
+ # %% ../nbs/00_core.ipynb 18
  def prep_pdf_batch(
  path:str, # Path to PDF file,
  cid:str=None, # Custom ID (by default using the file name without extention)
@@ -61,7 +69,7 @@
  url, c = upload_pdf(path, key)
  return create_batch_entry(path, url, cid, inc_img), c

- # %% ../nbs/00_core.ipynb 21
+ # %% ../nbs/00_core.ipynb 22
  def submit_batch(
  entries:list[dict], # List of batch entries,
  c:Mistral=None, # Mistral client,
@@ -75,20 +83,35 @@
  batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
  return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)

- # %% ../nbs/00_core.ipynb 24
+ # %% ../nbs/00_core.ipynb 25
+ def _check_timeout(
+ queued_time:int, # Time spent in QUEUED state (seconds)
+ timeout:int, # Maximum allowed QUEUED time (seconds)
+ job_id:str # Batch job ID
+ ):
+ "Raise TimeoutError if job has been queued longer than timeout"
+ if queued_time >= timeout: raise TimeoutError(f"Job {job_id} stayed in QUEUED for {queued_time}s, exceeding timeout of {timeout}s. Check your balance or Mistral Status.")
+
+ # %% ../nbs/00_core.ipynb 26
  def wait_for_job(
- job:dict, # Job dict,
- c:Mistral=None, # Mistral client,
- poll_interval:int=1 # Poll interval in seconds
- ) -> dict: # Job dict (with status)
+ job:dict, # Batch job from submit_batch
+ c:Mistral=None, # Mistral client
+ poll_interval:int=1, # Seconds between status checks
+ queued_timeout:int=300 # Max seconds in QUEUED before timeout
+ ) -> dict: # Completed job dict
  "Poll job until completion and return final job status"
+ logger.info(f"Waiting for batch job {job.id} (initial status: {job.status})")
+ queued_time = 0
  while job.status in ["QUEUED", "RUNNING"]:
- print(f'Mistral batch job status: {job.status}')
+ logger.debug(f"Job {job.id} status: {job.status} (elapsed: {queued_time}s)")
+ if job.status == "QUEUED": queued_time += poll_interval; _check_timeout(queued_time, queued_timeout, job.id)
  time.sleep(poll_interval)
  job = c.batch.jobs.get(job_id=job.id)
+ logger.info(f"Job {job.id} completed with status: {job.status}")
+ if job.status != "SUCCESS": logger.warning(f"Job {job.id} finished with non-success status: {job.status}")
  return job

- # %% ../nbs/00_core.ipynb 26
+ # %% ../nbs/00_core.ipynb 28
  def download_results(
  job:dict, # Job dict,
  c:Mistral=None # Mistral client
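Note on the queued-timeout change above: `wait_for_job` now takes a `queued_timeout` (default 300 s) and `_check_timeout` raises `TimeoutError` once a job has sat in QUEUED that long. A minimal sketch of how a caller might drive it, using only the function names and parameters shown in this diff; the PDF path is a placeholder, not from this release:

    from mistocr.core import prep_pdf_batch, submit_batch, wait_for_job, download_results

    entry, c = prep_pdf_batch('paper.pdf')  # hypothetical input file
    job = submit_batch([entry], c)
    try:
        # allow up to 10 minutes in QUEUED before giving up
        job = wait_for_job(job, c, poll_interval=2, queued_timeout=600)
        results = download_results(job, c)
    except TimeoutError as e:
        print(e)  # per the error message: check balance or Mistral status, then resubmit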
@@ -97,7 +120,7 @@
  content = c.files.download(file_id=job.output_file).read().decode('utf-8')
  return [json.loads(line) for line in content.strip().split('\n') if line]

- # %% ../nbs/00_core.ipynb 31
+ # %% ../nbs/00_core.ipynb 33
  def save_images(
  page:dict, # Page dict,
  img_dir:str='img' # Directory to save images
@@ -108,7 +131,7 @@
  img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
  Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])

- # %% ../nbs/00_core.ipynb 32
+ # %% ../nbs/00_core.ipynb 34
  def save_page(
  page:dict, # Page dict,
  dst:str, # Directory to save page
@@ -120,7 +143,7 @@
  img_dir.mkdir(exist_ok=True)
  save_images(page, img_dir)

- # %% ../nbs/00_core.ipynb 34
+ # %% ../nbs/00_core.ipynb 36
  def save_pages(
  ocr_resp:dict, # OCR response,
  dst:str, # Directory to save pages,
@@ -133,7 +156,7 @@
  for page in ocr_resp['pages']: save_page(page, dst, img_dir)
  return dst

- # %% ../nbs/00_core.ipynb 40
+ # %% ../nbs/00_core.ipynb 42
  def _get_paths(path:str) -> list[Path]:
  "Get list of PDFs from file or folder"
  path = Path(path)
@@ -144,7 +167,7 @@ def _get_paths(path:str) -> list[Path]:
  return pdfs
  raise ValueError(f"Path not found: {path}")

- # %% ../nbs/00_core.ipynb 41
+ # %% ../nbs/00_core.ipynb 43
  def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
  "Prepare batch entries for list of PDFs"
  entries, c = [], None
@@ -153,7 +176,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
  entries.append(entry)
  return entries, c

- # %% ../nbs/00_core.ipynb 42
+ # %% ../nbs/00_core.ipynb 44
  def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
  "Submit batch, wait for completion, and download results"
  job = submit_batch(entries, c)
@@ -161,7 +184,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
  if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
  return download_results(job, c)

- # %% ../nbs/00_core.ipynb 43
+ # %% ../nbs/00_core.ipynb 45
  def ocr_pdf(
  path:str, # Path to PDF file or folder,
  dst:str='md', # Directory to save markdown pages,
@@ -175,7 +198,7 @@
  results = _run_batch(entries, c, poll_interval)
  return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])

- # %% ../nbs/00_core.ipynb 47
+ # %% ../nbs/00_core.ipynb 52
  def read_pgs(
  path:str, # OCR output directory,
  join:bool=True # Join pages into single string
@@ -185,3 +208,24 @@ def read_pgs(
  pgs = sorted(path.glob('page_*.md'), key=lambda p: int(p.stem.split('_')[1]))
  contents = L([p.read_text() for p in pgs])
  return '\n\n'.join(contents) if join else contents
+
+ # %% ../nbs/00_core.ipynb 59
+ def subset_pdf(
+ path:str, # Path to PDF file
+ start:int=1, # Start page (1-based)
+ end:int=None, # End page (1-based, inclusive)
+ dst:str='.' # Output directory
+ ) -> Path: # Path to subset PDF
+ "Extract page range from PDF and save with range suffix"
+ path = Path(path)
+ writer = PyPDF2.PdfWriter()
+ with open(path, 'rb') as f:
+ reader = PyPDF2.PdfReader(f)
+ n = len(reader.pages)
+ end = end or n
+ s, e = max(0, start-1), min(n, end) - 1
+ for i in range(s, e+1): writer.add_page(reader.pages[i])
+ suffix = f"_p{s+1}-{e+1}" if s>0 or e<n-1 else ""
+ out = Path(dst) / f"{path.stem}{suffix}.pdf"
+ with open(out, 'wb') as f: writer.write(f)
+ return out
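The new `subset_pdf` helper (hence the added PyPDF2 dependency) extracts a 1-based, inclusive page range and appends a `_p{start}-{end}` suffix to the output name unless the range spans the whole document. A hedged usage sketch with an illustrative file name:

    from mistocr.core import subset_pdf

    # Keep pages 3-7 of a hypothetical report.pdf; writes ./report_p3-7.pdf
    out = subset_pdf('report.pdf', start=3, end=7, dst='.')

Note that a full-range call produces no suffix, so with the default `dst='.'` the output name matches the input name.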
@@ -0,0 +1,48 @@
+ """End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
+
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
+
+ # %% auto 0
+ __all__ = ['logger', 'pdf_to_md']
+
+ # %% ../nbs/02_pipeline.ipynb 3
+ from fastcore.all import *
+ from .core import read_pgs, ocr_pdf
+ from .refine import add_img_descs, fix_hdgs
+ from pathlib import Path
+ from asyncio import Semaphore, gather, sleep
+ import tempfile
+ import os, json, shutil
+ import logging
+
+ # %% ../nbs/02_pipeline.ipynb 4
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+ logger.setLevel(logging.INFO)
+
+ # %% ../nbs/02_pipeline.ipynb 5
+ @delegates(add_img_descs)
+ async def pdf_to_md(
+ pdf_path:str, # Path to input PDF file
+ dst:str, # Destination directory for output markdown
+ ocr_dst:str=None, # Optional OCR output directory
+ model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
+ add_img_desc:bool=True, # Whether to add image descriptions
+ progress:bool=True, # Whether to show progress messages
+ **kwargs
+ ):
+ "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
+ "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
+ cleanup = ocr_dst is None
+ if cleanup: ocr_dst = tempfile.mkdtemp()
+ n_steps = 3 if add_img_desc else 2
+ if progress: logger.info(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
+ ocr_dir = ocr_pdf(pdf_path, ocr_dst)[0]
+ if progress: logger.info(f"Step 2/{n_steps}: Fixing heading hierarchy...")
+ fix_hdgs(ocr_dir, model=model)
+ if add_img_desc:
+ if progress: logger.info(f"Step 3/{n_steps}: Adding image descriptions...")
+ await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
+ elif dst != str(ocr_dir): shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
+ if cleanup: shutil.rmtree(ocr_dst)
+ if progress: logger.info("Done!")
@@ -3,10 +3,10 @@
  # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.

  # %% auto 0
- __all__ = ['describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx', 'HeadingCorrection',
- 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
- 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
- 'add_img_descs']
+ __all__ = ['logger', 'prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
+ 'HeadingCorrection', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes',
+ 'fix_hdgs', 'ImgDescription', 'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs',
+ 'add_descs_to_pg', 'add_descs_to_pgs', 'add_img_descs']

  # %% ../nbs/01_refine.ipynb 3
  from fastcore.all import *
@@ -20,8 +20,14 @@ import os
  import json
  import shutil
  from asyncio import Semaphore, gather, sleep
+ import logging

- # %% ../nbs/01_refine.ipynb 7
+ # %% ../nbs/01_refine.ipynb 4
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.WARNING, format='%(name)s - %(levelname)s - %(message)s')
+ logger.setLevel(logging.INFO)
+
+ # %% ../nbs/01_refine.ipynb 8
  def get_hdgs(
  md:str # Markdown file string
  ) -> L: # L of strings
@@ -32,7 +38,7 @@ def get_hdgs(



- # %% ../nbs/01_refine.ipynb 8
+ # %% ../nbs/01_refine.ipynb 9
  def add_pg_hdgs(
  md:str, # Markdown file string,
  n:int # Page number
@@ -42,7 +48,7 @@
  def repl(m): return m.group(0) + f' ... page {n}'
  return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)

- # %% ../nbs/01_refine.ipynb 12
+ # %% ../nbs/01_refine.ipynb 13
  def read_pgs_pg(
  path:str # Path to the markdown file
  ) -> L: # List of markdown pages
@@ -50,7 +56,7 @@
  pgs = read_pgs(path, join=False)
  return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()

- # %% ../nbs/01_refine.ipynb 15
+ # %% ../nbs/01_refine.ipynb 16
  def fmt_hdgs_idx(
  hdgs: list[str] # List of markdown headings
  ) -> str: # Formatted string with index
@@ -58,15 +64,58 @@ def fmt_hdgs_idx(
  return '\n'.join(f"{i}. {h}" for i, h in enumerate(hdgs))


- # %% ../nbs/01_refine.ipynb 18
+ # %% ../nbs/01_refine.ipynb 19
  class HeadingCorrection(BaseModel):
+ "A single heading correction mapping an index to its corrected markdown heading"
  index: int
  corrected: str

+ # %% ../nbs/01_refine.ipynb 20
  class HeadingCorrections(BaseModel):
+ "Collection of heading corrections returned by the LLM"
  corrections: list[HeadingCorrection]

  # %% ../nbs/01_refine.ipynb 22
+ prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
+
+ INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title ... page 1")
+
+ ANALYSIS STEPS (think through these before outputting corrections):
+ 1. For each numbered heading (e.g., "4.1", "2.a", "A.1"), identify its parent heading (e.g., "4", "2", "A")
+ 2. Verify the child heading is exactly one # deeper than its parent
+ 3. If not, mark it for correction
+
+ RULES - Apply these fixes in order:
+
+ 1. **Single H1 rule**: Documents must have exactly ONE # heading (typically the document title at the top)
+ - If index 0 is already #, then all subsequent headings (index 1+) must be ## or deeper
+ - If no H1 exists, the first major heading should be #, and all others ## or deeper
+ - NO exceptions: appendices, references, and all sections are ## or deeper after the title
+
+ 2. **Infer depth from numbering patterns**: If headings contain section numbers, deeper nesting means deeper heading level
+ - Parent section (e.g., "1", "2", "A") MUST be shallower than child (e.g., "1.1", "2.a", "A.1")
+ - Child section MUST be exactly one # deeper than parent
+ - Works with any numbering: "1/1.1/1.1.1", "A/A.1/A.1.a", "I/I.A/I.A.1", etc.
+
+ 3. **Level jumps**: Headings can only increase by one # at a time when moving deeper
+ - Wrong: ## Section → ##### Subsection
+ - Fixed: ## Section → ### Subsection
+
+ 4. **Decreasing levels is OK**: Moving back up the hierarchy (### to ##) is valid for new sections
+
+ 5. **Unnumbered headings in numbered documents**: If the document uses numbered headings consistently, any unnumbered heading appearing within that structure is likely misclassified bold text and should be converted to regular text (output the heading text without any # symbols in the corrected field)
+
+ OUTPUT: Return a list of corrections, where each correction has:
+ - index: the heading's index number
+ - corrected: the fixed heading text (without the index prefix), or empty string "" to remove the heading entirely
+ IMPORTANT: Preserve the " ... page N" suffix in all corrected headings.
+ Only include headings that need changes.
+
+ Headings to analyze:
+ {headings_list}
+ """
+
+ # %% ../nbs/01_refine.ipynb 24
  def fix_hdg_hierarchy(
  hdgs: list[str], # List of markdown headings
  prompt: str=None, # Prompt to use
@@ -82,7 +131,7 @@ def fix_hdg_hierarchy(
  return {o['index']: o['corrected'] for o in fixes}


- # %% ../nbs/01_refine.ipynb 25
+ # %% ../nbs/01_refine.ipynb 27
  @delegates(fix_hdg_hierarchy)
  def mk_fixes_lut(
  hdgs: list[str], # List of markdown headings
@@ -95,7 +144,7 @@ def mk_fixes_lut(
  fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
  return {hdgs[k]:v for k,v in fixes.items()}

- # %% ../nbs/01_refine.ipynb 28
+ # %% ../nbs/01_refine.ipynb 30
  def apply_hdg_fixes(
  p:str, # Page to fix
  lut_fixes: dict[str, str], # Lookup table of fixes
@@ -104,7 +153,7 @@
  for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
  return p

- # %% ../nbs/01_refine.ipynb 31
+ # %% ../nbs/01_refine.ipynb 33
  @delegates(mk_fixes_lut)
  def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
  "Fix heading hierarchy in markdown document"
@@ -116,13 +165,13 @@ def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:st
  lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
  for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))

- # %% ../nbs/01_refine.ipynb 37
+ # %% ../nbs/01_refine.ipynb 39
  class ImgDescription(BaseModel):
  "Image classification and description for OCR'd documents"
  is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
  description:str # Detailed description of the image content for RAG and accessibility

- # %% ../nbs/01_refine.ipynb 40
+ # %% ../nbs/01_refine.ipynb 42
  describe_img_prompt = """Analyze this image from an academic/technical document.

  Step 1: Determine if this image is informative for understanding the document content.
@@ -135,7 +184,7 @@ Step 2:

  Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""

- # %% ../nbs/01_refine.ipynb 41
+ # %% ../nbs/01_refine.ipynb 43
  async def describe_img(
  img_path: Path, # Path to the image file
  model: str = 'claude-sonnet-4-5', # Model to use
@@ -146,7 +195,7 @@ async def describe_img(
  r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
  return r

- # %% ../nbs/01_refine.ipynb 45
+ # %% ../nbs/01_refine.ipynb 47
  async def limit(
  semaphore, # Semaphore for concurrency control
  coro, # Coroutine to execute
@@ -158,14 +207,14 @@ async def limit(
  if delay: await sleep(delay)
  return r

- # %% ../nbs/01_refine.ipynb 47
+ # %% ../nbs/01_refine.ipynb 49
  def parse_r(
  result # ModelResponse object from API call
  ): # Dictionary with 'is_informative' and 'description' keys
  "Extract and parse JSON content from model response"
  return json.loads(result.choices[0].message.content)

- # %% ../nbs/01_refine.ipynb 49
+ # %% ../nbs/01_refine.ipynb 51
  async def describe_imgs(
  imgs: list[Path], # List of image file paths to describe
  model: str = 'claude-sonnet-4-5', # Model to use for image description
@@ -178,7 +227,7 @@ async def describe_imgs(
  results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
  return {img.name: parse_r(r) for img, r in zip(imgs, results)}

- # %% ../nbs/01_refine.ipynb 51
+ # %% ../nbs/01_refine.ipynb 53
  def save_img_descs(
  descs: dict, # Dictionary of image descriptions
  dst_fname: Path, # Path to save the JSON file
@@ -186,7 +235,7 @@
  "Save image descriptions to JSON file"
  Path(dst_fname).write_text(json.dumps(descs, indent=2))

- # %% ../nbs/01_refine.ipynb 56
+ # %% ../nbs/01_refine.ipynb 58
  def add_descs_to_pg(
  pg:str, # Page markdown content
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -197,7 +246,7 @@
  if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
  return pg

- # %% ../nbs/01_refine.ipynb 61
+ # %% ../nbs/01_refine.ipynb 63
  def add_descs_to_pgs(
  pgs:list, # List of page markdown strings
  descs:dict # Dictionary mapping image filenames to their descriptions
@@ -205,7 +254,7 @@
  "Add AI-generated descriptions to images in all pages"
  return [add_descs_to_pg(pg, descs) for pg in pgs]

- # %% ../nbs/01_refine.ipynb 64
+ # %% ../nbs/01_refine.ipynb 66
  async def add_img_descs(
  src:str, # Path to source markdown directory
  dst:str=None, # Destination directory (defaults to src if None)
@@ -214,25 +263,32 @@
  semaphore:int=2, # Max concurrent API requests
  delay:float=1, # Delay in seconds between API calls
  force:bool=False, # Force regeneration even if cache exists
- progress:bool=True # Print progress messages
+ progress:bool=True # Log progress messages
  ):
  "Describe all images in markdown document and insert descriptions inline"
  src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
  if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
  src_imgs = src_path/img_folder
+
+ # Check if image folder exists
+ if not src_imgs.exists():
+ if progress: logger.info(f"No images to describe in the document (no '{img_folder}' folder found)")
+ return
+
  if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
  desc_file = src_path/'img_descriptions.json'
  if desc_file.exists() and not force:
- if progress: print(f"Loading existing descriptions from {desc_file}")
+ if progress: logger.info(f"Loading existing descriptions from {desc_file}")
  descs = json.loads(desc_file.read_text())
  else:
  imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
- if progress: print(f"Describing {len(imgs)} images...")
+ if progress: logger.info(f"Describing {len(imgs)} images...")
  descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
  save_img_descs(descs, desc_file)
- if progress: print(f"Saved descriptions to {desc_file}")
+ if progress: logger.info(f"Saved descriptions to {desc_file}")
  pgs = read_pgs(src_path, join=False)
- if progress: print(f"Adding descriptions to {len(pgs)} pages...")
+ if progress: logger.info(f"Adding descriptions to {len(pgs)} pages...")
  enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
  for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
- if progress: print(f"Done! Enriched pages saved to {dst_path}")
+ if progress: logger.info(f"Done! Enriched pages saved to {dst_path}")
+
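`add_img_descs` caches its results in `img_descriptions.json` under the source directory, and this release adds an early return (with an INFO message) when the image folder is missing. A hedged sketch of re-running it over an existing OCR output directory; the paths are illustrative and the keyword values follow the signature shown in this diff:

    import asyncio
    from mistocr.refine import add_img_descs

    # Reuses img_descriptions.json if present; force=True would regenerate,
    # while semaphore/delay throttle the concurrent description calls.
    asyncio.run(add_img_descs('md_out', dst='md_enriched', model='claude-sonnet-4-5',
                              semaphore=2, delay=1, force=False))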
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mistocr
- Version: 0.2.7
+ Version: 0.4.1
  Summary: Batch OCR for PDFs with heading restoration and visual content integration
  Home-page: https://github.com/franckalbinet/mistocr
  Author: Solveit
@@ -23,6 +23,7 @@ Requires-Dist: mistralai
  Requires-Dist: pillow
  Requires-Dist: dotenv
  Requires-Dist: lisette
+ Requires-Dist: PyPDF2
  Provides-Extra: dev
  Dynamic: author
  Dynamic: author-email
@@ -112,18 +113,6 @@ from mistocr.pipeline import pdf_to_md
  await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
  ```

- Step 1/3: Running OCR on files/test/resnet.pdf...
- Mistral batch job status: QUEUED
- Mistral batch job status: RUNNING
- Mistral batch job status: RUNNING
- Step 2/3: Fixing heading hierarchy...
- Step 3/3: Adding image descriptions...
- Describing 7 images...
- Saved descriptions to ocr_temp/resnet/img_descriptions.json
- Adding descriptions to 12 pages...
- Done! Enriched pages saved to files/test/md_test
- Done!
-
  This will (as indicated by the output):

  1. OCR the PDF using Mistral’s batch API
@@ -3,5 +3,6 @@ mistralai
  pillow
  dotenv
  lisette
+ PyPDF2

  [dev]
@@ -1,7 +1,7 @@
  [DEFAULT]
  repo = mistocr
  lib_name = mistocr
- version = 0.2.7
+ version = 0.4.1
  min_python = 3.9
  license = apache2
  black_formatting = False
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
  language = English
  status = 3
  user = franckalbinet
- requirements = fastcore mistralai pillow dotenv lisette
+ requirements = fastcore mistralai pillow dotenv lisette PyPDF2
  readme_nb = index.ipynb
  allowed_metadata_keys =
  allowed_cell_metadata_keys =
@@ -1 +0,0 @@
- __version__ = "0.2.7"
@@ -1,37 +0,0 @@
- """End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
-
- # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
-
- # %% auto 0
- __all__ = ['pdf_to_md']
-
- # %% ../nbs/02_pipeline.ipynb 3
- from fastcore.all import *
- from .core import read_pgs, ocr_pdf
- from .refine import add_img_descs, fix_hdgs
- from pathlib import Path
- from asyncio import Semaphore, gather, sleep
- import os, json, shutil
-
- # %% ../nbs/02_pipeline.ipynb 4
- @delegates(add_img_descs)
- async def pdf_to_md(
- pdf_path:str, # Path to input PDF file
- dst:str, # Destination directory for output markdown
- ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
- model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
- add_img_desc:bool=True, # Whether to add image descriptions
- progress:bool=True, # Whether to show progress messages
- **kwargs):
- "Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
- n_steps = 3 if add_img_desc else 2
- if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
- ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
- ocr_dir = ocr_dirs[0]
- if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
- fix_hdgs(ocr_dir, model=model)
- if add_img_desc:
- if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
- await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
- elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
- if progress: print("Done!")
5 files without changes