mistocr 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +13 -3
- mistocr/core.py +5 -4
- mistocr/pipeline.py +37 -0
- mistocr/refine.py +164 -32
- mistocr-0.2.0.dist-info/METADATA +253 -0
- mistocr-0.2.0.dist-info/RECORD +11 -0
- mistocr-0.1.5.dist-info/METADATA +0 -183
- mistocr-0.1.5.dist-info/RECORD +0 -10
- {mistocr-0.1.5.dist-info → mistocr-0.2.0.dist-info}/WHEEL +0 -0
- {mistocr-0.1.5.dist-info → mistocr-0.2.0.dist-info}/entry_points.txt +0 -0
- {mistocr-0.1.5.dist-info → mistocr-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.1.5.dist-info → mistocr-0.2.0.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.
|
|
1
|
+
__version__ = "0.2.0"
|
mistocr/_modidx.py
CHANGED
|
@@ -11,7 +11,7 @@ d = { 'settings': { 'branch': 'main',
|
|
|
11
11
|
'mistocr.core.create_batch_entry': ('core.html#create_batch_entry', 'mistocr/core.py'),
|
|
12
12
|
'mistocr.core.download_results': ('core.html#download_results', 'mistocr/core.py'),
|
|
13
13
|
'mistocr.core.get_api_key': ('core.html#get_api_key', 'mistocr/core.py'),
|
|
14
|
-
'mistocr.core.
|
|
14
|
+
'mistocr.core.ocr_pdf': ('core.html#ocr_pdf', 'mistocr/core.py'),
|
|
15
15
|
'mistocr.core.prep_pdf_batch': ('core.html#prep_pdf_batch', 'mistocr/core.py'),
|
|
16
16
|
'mistocr.core.read_pgs': ('core.html#read_pgs', 'mistocr/core.py'),
|
|
17
17
|
'mistocr.core.save_images': ('core.html#save_images', 'mistocr/core.py'),
|
|
@@ -20,12 +20,22 @@ d = { 'settings': { 'branch': 'main',
|
|
|
20
20
|
'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
|
|
21
21
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
22
22
|
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
|
+
'mistocr.pipeline': {'mistocr.pipeline.pdf_to_md': ('pipeline.html#pdf_to_md', 'mistocr/pipeline.py')},
|
|
23
24
|
'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
25
|
+
'mistocr.refine.ImgDescription': ('refine.html#imgdescription', 'mistocr/refine.py'),
|
|
26
|
+
'mistocr.refine.add_descs_to_pg': ('refine.html#add_descs_to_pg', 'mistocr/refine.py'),
|
|
27
|
+
'mistocr.refine.add_descs_to_pgs': ('refine.html#add_descs_to_pgs', 'mistocr/refine.py'),
|
|
28
|
+
'mistocr.refine.add_img_descs': ('refine.html#add_img_descs', 'mistocr/refine.py'),
|
|
24
29
|
'mistocr.refine.add_pg_hdgs': ('refine.html#add_pg_hdgs', 'mistocr/refine.py'),
|
|
25
30
|
'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
|
|
31
|
+
'mistocr.refine.describe_img': ('refine.html#describe_img', 'mistocr/refine.py'),
|
|
32
|
+
'mistocr.refine.describe_imgs': ('refine.html#describe_imgs', 'mistocr/refine.py'),
|
|
26
33
|
'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
|
|
27
|
-
'mistocr.refine.
|
|
34
|
+
'mistocr.refine.fix_hdgs': ('refine.html#fix_hdgs', 'mistocr/refine.py'),
|
|
28
35
|
'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
|
|
29
36
|
'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
|
|
37
|
+
'mistocr.refine.limit': ('refine.html#limit', 'mistocr/refine.py'),
|
|
30
38
|
'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py'),
|
|
31
|
-
'mistocr.refine.
|
|
39
|
+
'mistocr.refine.parse_r': ('refine.html#parse_r', 'mistocr/refine.py'),
|
|
40
|
+
'mistocr.refine.read_pgs_pg': ('refine.html#read_pgs_pg', 'mistocr/refine.py'),
|
|
41
|
+
'mistocr.refine.save_img_descs': ('refine.html#save_img_descs', 'mistocr/refine.py')}}}
|
mistocr/core.py
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
|
|
7
|
-
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', '
|
|
7
|
+
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr_pdf', 'read_pgs']
|
|
8
8
|
|
|
9
9
|
# %% ../nbs/00_core.ipynb 3
|
|
10
10
|
from fastcore.all import *
|
|
@@ -79,10 +79,11 @@ def submit_batch(
|
|
|
79
79
|
def wait_for_job(
|
|
80
80
|
job:dict, # Job dict,
|
|
81
81
|
c:Mistral=None, # Mistral client,
|
|
82
|
-
poll_interval:int=
|
|
82
|
+
poll_interval:int=1 # Poll interval in seconds
|
|
83
83
|
) -> dict: # Job dict (with status)
|
|
84
84
|
"Poll job until completion and return final job status"
|
|
85
85
|
while job.status in ["QUEUED", "RUNNING"]:
|
|
86
|
+
print(f'Mistral batch job status: {job.status}')
|
|
86
87
|
time.sleep(poll_interval)
|
|
87
88
|
job = c.batch.jobs.get(job_id=job.id)
|
|
88
89
|
return job
|
|
@@ -161,7 +162,7 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
|
|
|
161
162
|
return download_results(job, c)
|
|
162
163
|
|
|
163
164
|
# %% ../nbs/00_core.ipynb 43
|
|
164
|
-
def
|
|
165
|
+
def ocr_pdf(
|
|
165
166
|
path:str, # Path to PDF file or folder,
|
|
166
167
|
dst:str='md', # Directory to save markdown pages,
|
|
167
168
|
inc_img:bool=True, # Include image in response,
|
|
@@ -174,7 +175,7 @@ def ocr(
|
|
|
174
175
|
results = _run_batch(entries, c, poll_interval)
|
|
175
176
|
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
176
177
|
|
|
177
|
-
# %% ../nbs/00_core.ipynb
|
|
178
|
+
# %% ../nbs/00_core.ipynb 47
|
|
178
179
|
def read_pgs(
|
|
179
180
|
path:str, # OCR output directory,
|
|
180
181
|
join:bool=True # Join pages into single string
|
mistocr/pipeline.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""End-to-End Pipeline: PDF OCR, Markdown Heading Correction, and AI Image Descriptions"""
|
|
2
|
+
|
|
3
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_pipeline.ipynb.
|
|
4
|
+
|
|
5
|
+
# %% auto 0
|
|
6
|
+
__all__ = ['pdf_to_md']
|
|
7
|
+
|
|
8
|
+
# %% ../nbs/02_pipeline.ipynb 3
|
|
9
|
+
from fastcore.all import *
|
|
10
|
+
from .core import read_pgs, ocr_pdf
|
|
11
|
+
from .refine import add_img_descs, fix_hdgs
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from asyncio import Semaphore, gather, sleep
|
|
14
|
+
import os, json, shutil
|
|
15
|
+
|
|
16
|
+
# %% ../nbs/02_pipeline.ipynb 4
|
|
17
|
+
@delegates(add_img_descs)
|
|
18
|
+
async def pdf_to_md(
|
|
19
|
+
pdf_path:str, # Path to input PDF file
|
|
20
|
+
dst:str, # Destination directory for output markdown
|
|
21
|
+
ocr_output:str=None, # Optional OCR output directory (defaults to pdf_path stem)
|
|
22
|
+
model:str='claude-sonnet-4-5', # Model to use for heading fixes and image descriptions
|
|
23
|
+
add_img_desc:bool=True, # Whether to add image descriptions
|
|
24
|
+
progress:bool=True, # Whether to show progress messages
|
|
25
|
+
**kwargs):
|
|
26
|
+
"Convert PDF to markdown with OCR, fixed heading hierarchy, and optional image descriptions"
|
|
27
|
+
n_steps = 3 if add_img_desc else 2
|
|
28
|
+
if progress: print(f"Step 1/{n_steps}: Running OCR on {pdf_path}...")
|
|
29
|
+
ocr_dirs = ocr_pdf(pdf_path, ocr_output or 'ocr_temp')
|
|
30
|
+
ocr_dir = ocr_dirs[0]
|
|
31
|
+
if progress: print(f"Step 2/{n_steps}: Fixing heading hierarchy...")
|
|
32
|
+
fix_hdgs(ocr_dir, model=model)
|
|
33
|
+
if add_img_desc:
|
|
34
|
+
if progress: print(f"Step 3/{n_steps}: Adding image descriptions...")
|
|
35
|
+
await add_img_descs(ocr_dir, dst=dst, model=model, progress=progress, **kwargs)
|
|
36
|
+
elif dst and Path(dst) != ocr_dir: shutil.copytree(ocr_dir, dst, dirs_exist_ok=True)
|
|
37
|
+
if progress: print("Done!")
|
mistocr/refine.py
CHANGED
|
@@ -1,24 +1,30 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Fix heading hierarchy and describe images in OCR'd markdown documents"""
|
|
2
2
|
|
|
3
3
|
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
|
-
__all__ = ['prompt_fix_hdgs', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
7
|
-
'mk_fixes_lut', 'apply_hdg_fixes', '
|
|
6
|
+
__all__ = ['prompt_fix_hdgs', 'describe_img_prompt', 'get_hdgs', 'add_pg_hdgs', 'read_pgs_pg', 'fmt_hdgs_idx',
|
|
7
|
+
'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut', 'apply_hdg_fixes', 'fix_hdgs', 'ImgDescription',
|
|
8
|
+
'describe_img', 'limit', 'parse_r', 'describe_imgs', 'save_img_descs', 'add_descs_to_pg', 'add_descs_to_pgs',
|
|
9
|
+
'add_img_descs']
|
|
8
10
|
|
|
9
11
|
# %% ../nbs/01_refine.ipynb 3
|
|
10
12
|
from fastcore.all import *
|
|
11
13
|
from .core import read_pgs
|
|
12
14
|
from re import sub, findall, MULTILINE
|
|
13
15
|
from pydantic import BaseModel
|
|
16
|
+
from lisette import *
|
|
14
17
|
from lisette.core import completion
|
|
18
|
+
from typing import Callable
|
|
15
19
|
import os
|
|
16
20
|
import json
|
|
21
|
+
import shutil
|
|
22
|
+
from asyncio import Semaphore, gather, sleep
|
|
17
23
|
|
|
18
|
-
# %% ../nbs/01_refine.ipynb
|
|
24
|
+
# %% ../nbs/01_refine.ipynb 7
|
|
19
25
|
def get_hdgs(
|
|
20
26
|
md:str # Markdown file string
|
|
21
|
-
):
|
|
27
|
+
) -> L: # L of strings
|
|
22
28
|
"Return the markdown headings"
|
|
23
29
|
# Sanitize removing '#' in python snippet if any
|
|
24
30
|
md = sub(r'```[\s\S]*?```', '', md)
|
|
@@ -26,15 +32,20 @@ def get_hdgs(
|
|
|
26
32
|
|
|
27
33
|
|
|
28
34
|
|
|
29
|
-
# %% ../nbs/01_refine.ipynb
|
|
30
|
-
def add_pg_hdgs(
|
|
31
|
-
|
|
35
|
+
# %% ../nbs/01_refine.ipynb 8
|
|
36
|
+
def add_pg_hdgs(
|
|
37
|
+
md:str, # Markdown file string,
|
|
38
|
+
n:int # Page number
|
|
39
|
+
) -> str: # Markdown file string
|
|
40
|
+
"Add page number to all headings in page markdown"
|
|
32
41
|
md = sub(r'```[\s\S]*?```', '', md)
|
|
33
42
|
def repl(m): return m.group(0) + f' ... page {n}'
|
|
34
43
|
return sub(r'^#{1,6} .+$', repl, md, flags=MULTILINE)
|
|
35
44
|
|
|
36
|
-
# %% ../nbs/01_refine.ipynb
|
|
37
|
-
def read_pgs_pg(
|
|
45
|
+
# %% ../nbs/01_refine.ipynb 12
|
|
46
|
+
def read_pgs_pg(
|
|
47
|
+
path:str # Path to the markdown file
|
|
48
|
+
) -> L: # List of markdown pages
|
|
38
49
|
"Read all pages of a markdown file and add page numbers to all headings"
|
|
39
50
|
pgs = read_pgs(path, join=False)
|
|
40
51
|
return L([add_pg_hdgs(pg, n) for n, pg in enumerate(pgs, 1)]).concat()
|
|
@@ -83,33 +94,35 @@ Headings to analyze:
|
|
|
83
94
|
"""
|
|
84
95
|
|
|
85
96
|
|
|
86
|
-
# %% ../nbs/01_refine.ipynb
|
|
97
|
+
# %% ../nbs/01_refine.ipynb 22
|
|
87
98
|
def fix_hdg_hierarchy(
|
|
88
99
|
hdgs: list[str], # List of markdown headings
|
|
89
|
-
prompt: str=
|
|
100
|
+
prompt: str=None, # Prompt to use
|
|
90
101
|
model: str='claude-sonnet-4-5', # Model to use
|
|
91
|
-
api_key: str=
|
|
102
|
+
api_key: str=None # API key
|
|
92
103
|
) -> dict[int, str]: # Dictionary of index → corrected heading
|
|
93
104
|
"Fix the heading hierarchy"
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
api_key=api_key
|
|
99
|
-
)
|
|
105
|
+
if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
|
|
106
|
+
if prompt is None: prompt = prompt_fix_hdgs
|
|
107
|
+
prompt = prompt.format(headings_list=fmt_hdgs_idx(hdgs))
|
|
108
|
+
r = completion(model=model, messages=[{"role": "user", "content": prompt}], response_format=HeadingCorrections, api_key=api_key)
|
|
100
109
|
return json.loads(r.choices[0].message.content)['corrections']
|
|
101
110
|
|
|
102
|
-
|
|
111
|
+
|
|
112
|
+
# %% ../nbs/01_refine.ipynb 25
|
|
113
|
+
@delegates(fix_hdg_hierarchy)
|
|
103
114
|
def mk_fixes_lut(
|
|
104
115
|
hdgs: list[str], # List of markdown headings
|
|
105
116
|
model: str='claude-sonnet-4-5', # Model to use
|
|
106
|
-
api_key: str=
|
|
117
|
+
api_key: str=None, # API key
|
|
118
|
+
**kwargs
|
|
107
119
|
) -> dict[str, str]: # Dictionary of old → new heading
|
|
108
120
|
"Make a lookup table of fixes"
|
|
109
|
-
|
|
121
|
+
if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
|
|
122
|
+
fixes = fix_hdg_hierarchy(hdgs, model=model, api_key=api_key, **kwargs)
|
|
110
123
|
return {hdgs[int(k)]:v for k,v in fixes.items()}
|
|
111
124
|
|
|
112
|
-
# %% ../nbs/01_refine.ipynb
|
|
125
|
+
# %% ../nbs/01_refine.ipynb 28
|
|
113
126
|
def apply_hdg_fixes(
|
|
114
127
|
p:str, # Page to fix
|
|
115
128
|
lut_fixes: dict[str, str], # Lookup table of fixes
|
|
@@ -118,16 +131,135 @@ def apply_hdg_fixes(
|
|
|
118
131
|
for old in get_hdgs(p): p = p.replace(old, lut_fixes.get(old, old))
|
|
119
132
|
return p
|
|
120
133
|
|
|
121
|
-
# %% ../nbs/01_refine.ipynb
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
model:str='claude-sonnet-4-5', # Model
|
|
125
|
-
dst:str=None, # Destination directory (None=overwrite)
|
|
126
|
-
):
|
|
134
|
+
# %% ../nbs/01_refine.ipynb 31
|
|
135
|
+
@delegates(mk_fixes_lut)
|
|
136
|
+
def fix_hdgs(src:str, model:str='claude-sonnet-4-5', dst:str=None, img_folder:str='img', **kwargs):
|
|
127
137
|
"Fix heading hierarchy in markdown document"
|
|
128
138
|
src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
|
|
129
139
|
if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
|
|
140
|
+
src_imgs = src_path/img_folder
|
|
141
|
+
if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
|
|
130
142
|
pgs_with_pg = read_pgs_pg(src_path)
|
|
131
|
-
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model)
|
|
132
|
-
for i,p in enumerate(pgs_with_pg, 1):
|
|
133
|
-
|
|
143
|
+
lut = mk_fixes_lut(L([get_hdgs(pg) for pg in pgs_with_pg]).concat(), model, **kwargs)
|
|
144
|
+
for i,p in enumerate(pgs_with_pg, 1): (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut))
|
|
145
|
+
|
|
146
|
+
# %% ../nbs/01_refine.ipynb 37
|
|
147
|
+
class ImgDescription(BaseModel):
|
|
148
|
+
"Image classification and description for OCR'd documents"
|
|
149
|
+
is_informative:bool # Whether image contains informative content (charts, diagrams, tables) vs decorative (logos, backgrounds)
|
|
150
|
+
description:str # Detailed description of the image content for RAG and accessibility
|
|
151
|
+
|
|
152
|
+
# %% ../nbs/01_refine.ipynb 40
|
|
153
|
+
describe_img_prompt = """Analyze this image from an academic/technical document.
|
|
154
|
+
|
|
155
|
+
Step 1: Determine if this image is informative for understanding the document content.
|
|
156
|
+
- Informative: charts, diagrams, tables, technical illustrations, experimental results, architectural diagrams
|
|
157
|
+
- Non-informative: logos, decorative images, generic photos, page backgrounds
|
|
158
|
+
|
|
159
|
+
Step 2:
|
|
160
|
+
- If informative: Provide a detailed description including the type of visualization, key elements and their relationships, important data or patterns, and relevant technical details.
|
|
161
|
+
- If non-informative: Provide a brief label (e.g., "Company logo", "Decorative header image")
|
|
162
|
+
|
|
163
|
+
Return your response as JSON with 'is_informative' (boolean) and 'description' (string) fields."""
|
|
164
|
+
|
|
165
|
+
# %% ../nbs/01_refine.ipynb 41
|
|
166
|
+
async def describe_img(
|
|
167
|
+
img_path: Path, # Path to the image file
|
|
168
|
+
model: str = 'claude-sonnet-4-5', # Model to use
|
|
169
|
+
prompt: str = describe_img_prompt # Prompt for description
|
|
170
|
+
) -> ImgDescription:
|
|
171
|
+
"Describe a single image using AsyncChat"
|
|
172
|
+
chat = AsyncChat(model=model)
|
|
173
|
+
r = await chat([img_path.read_bytes(), prompt], response_format=ImgDescription)
|
|
174
|
+
return r
|
|
175
|
+
|
|
176
|
+
# %% ../nbs/01_refine.ipynb 45
|
|
177
|
+
async def limit(
|
|
178
|
+
semaphore, # Semaphore for concurrency control
|
|
179
|
+
coro, # Coroutine to execute
|
|
180
|
+
delay:float=None # Optional delay in seconds after execution
|
|
181
|
+
):
|
|
182
|
+
"Execute coroutine with semaphore-based rate limiting and optional delay"
|
|
183
|
+
async with semaphore:
|
|
184
|
+
r = await coro
|
|
185
|
+
if delay: await sleep(delay)
|
|
186
|
+
return r
|
|
187
|
+
|
|
188
|
+
# %% ../nbs/01_refine.ipynb 47
|
|
189
|
+
def parse_r(
|
|
190
|
+
result # ModelResponse object from API call
|
|
191
|
+
): # Dictionary with 'is_informative' and 'description' keys
|
|
192
|
+
"Extract and parse JSON content from model response"
|
|
193
|
+
return json.loads(result.choices[0].message.content)
|
|
194
|
+
|
|
195
|
+
# %% ../nbs/01_refine.ipynb 49
|
|
196
|
+
async def describe_imgs(
|
|
197
|
+
imgs: list[Path], # List of image file paths to describe
|
|
198
|
+
model: str = 'claude-sonnet-4-5', # Model to use for image description
|
|
199
|
+
prompt: str = describe_img_prompt, # Prompt template for description
|
|
200
|
+
semaphore: int = 2, # Max concurrent API requests
|
|
201
|
+
delay: float = 1 # Delay in seconds between requests
|
|
202
|
+
) -> dict[str, dict]: # Dict mapping filename to parsed description
|
|
203
|
+
"Describe multiple images in parallel with rate limiting"
|
|
204
|
+
sem = Semaphore(semaphore)
|
|
205
|
+
results = await gather(*[limit(sem, describe_img(img, model, prompt), delay) for img in imgs])
|
|
206
|
+
return {img.name: parse_r(r) for img, r in zip(imgs, results)}
|
|
207
|
+
|
|
208
|
+
# %% ../nbs/01_refine.ipynb 51
|
|
209
|
+
def save_img_descs(
|
|
210
|
+
descs: dict, # Dictionary of image descriptions
|
|
211
|
+
dst_fname: Path, # Path to save the JSON file
|
|
212
|
+
) -> None:
|
|
213
|
+
"Save image descriptions to JSON file"
|
|
214
|
+
Path(dst_fname).write_text(json.dumps(descs, indent=2))
|
|
215
|
+
|
|
216
|
+
# %% ../nbs/01_refine.ipynb 56
|
|
217
|
+
def add_descs_to_pg(
|
|
218
|
+
pg:str, # Page markdown content
|
|
219
|
+
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
220
|
+
) -> str: # Page markdown with descriptions added
|
|
221
|
+
"Add AI-generated descriptions to images in page"
|
|
222
|
+
for link in re.findall(r'!\[[^\]]*\]\([^)]+\)', pg):
|
|
223
|
+
fname = re.findall(r'\(([^)]+)\)', link)[0]
|
|
224
|
+
if fname in descs: pg = pg.replace(link, f"{link}\nAI-generated image description:\n___\n{descs[fname]['description']}\n___")
|
|
225
|
+
return pg
|
|
226
|
+
|
|
227
|
+
# %% ../nbs/01_refine.ipynb 61
|
|
228
|
+
def add_descs_to_pgs(
|
|
229
|
+
pgs:list, # List of page markdown strings
|
|
230
|
+
descs:dict # Dictionary mapping image filenames to their descriptions
|
|
231
|
+
) -> list: # List of pages with descriptions added
|
|
232
|
+
"Add AI-generated descriptions to images in all pages"
|
|
233
|
+
return [add_descs_to_pg(pg, descs) for pg in pgs]
|
|
234
|
+
|
|
235
|
+
# %% ../nbs/01_refine.ipynb 64
|
|
236
|
+
async def add_img_descs(
|
|
237
|
+
src:str, # Path to source markdown directory
|
|
238
|
+
dst:str=None, # Destination directory (defaults to src if None)
|
|
239
|
+
model:str='claude-sonnet-4-5', # Vision model for image description
|
|
240
|
+
img_folder:str='img', # Name of folder containing images
|
|
241
|
+
semaphore:int=2, # Max concurrent API requests
|
|
242
|
+
delay:float=1, # Delay in seconds between API calls
|
|
243
|
+
force:bool=False, # Force regeneration even if cache exists
|
|
244
|
+
progress:bool=True # Print progress messages
|
|
245
|
+
):
|
|
246
|
+
"Describe all images in markdown document and insert descriptions inline"
|
|
247
|
+
src_path,dst_path = Path(src),Path(dst) if dst else Path(src)
|
|
248
|
+
if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
|
|
249
|
+
src_imgs = src_path/img_folder
|
|
250
|
+
if src_imgs.exists() and dst_path != src_path: shutil.copytree(src_imgs, dst_path/img_folder, dirs_exist_ok=True)
|
|
251
|
+
desc_file = src_path/'img_descriptions.json'
|
|
252
|
+
if desc_file.exists() and not force:
|
|
253
|
+
if progress: print(f"Loading existing descriptions from {desc_file}")
|
|
254
|
+
descs = json.loads(desc_file.read_text())
|
|
255
|
+
else:
|
|
256
|
+
imgs = (src_path/img_folder).ls(file_exts=['.jpeg', '.jpg', '.png'])
|
|
257
|
+
if progress: print(f"Describing {len(imgs)} images...")
|
|
258
|
+
descs = await describe_imgs(imgs, model, semaphore=semaphore, delay=delay)
|
|
259
|
+
save_img_descs(descs, desc_file)
|
|
260
|
+
if progress: print(f"Saved descriptions to {desc_file}")
|
|
261
|
+
pgs = read_pgs(src_path, join=False)
|
|
262
|
+
if progress: print(f"Adding descriptions to {len(pgs)} pages...")
|
|
263
|
+
enriched = [add_descs_to_pg(pg, descs) for pg in pgs]
|
|
264
|
+
for i,pg in enumerate(enriched, 1): (dst_path/f'page_{i}.md').write_text(pg)
|
|
265
|
+
if progress: print(f"Done! Enriched pages saved to {dst_path}")
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mistocr
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
|
|
5
|
+
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
|
+
Author: Solveit
|
|
7
|
+
Author-email: nobody@fast.ai
|
|
8
|
+
License: Apache Software License 2.0
|
|
9
|
+
Keywords: nbdev jupyter notebook python
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Natural Language :: English
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: fastcore
|
|
22
|
+
Requires-Dist: mistralai
|
|
23
|
+
Requires-Dist: pillow
|
|
24
|
+
Requires-Dist: dotenv
|
|
25
|
+
Requires-Dist: lisette
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Dynamic: author
|
|
28
|
+
Dynamic: author-email
|
|
29
|
+
Dynamic: classifier
|
|
30
|
+
Dynamic: description
|
|
31
|
+
Dynamic: description-content-type
|
|
32
|
+
Dynamic: home-page
|
|
33
|
+
Dynamic: keywords
|
|
34
|
+
Dynamic: license
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
Dynamic: provides-extra
|
|
37
|
+
Dynamic: requires-dist
|
|
38
|
+
Dynamic: requires-python
|
|
39
|
+
Dynamic: summary
|
|
40
|
+
|
|
41
|
+
# mistocr
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
|
|
45
|
+
|
|
46
|
+
**PDF OCR is a critical bottleneck in AI pipelines.** It’s often
|
|
47
|
+
mentioned in passing, as if it’s a trivial step. Practice shows it’s far
|
|
48
|
+
from it. Poorly converted PDFs mean garbage-in-garbage-out for
|
|
49
|
+
downstream AI systems (RAG, …).
|
|
50
|
+
|
|
51
|
+
When [Mistral AI](https://mistral.ai) released their [state-of-the-art
|
|
52
|
+
OCR model](https://mistral.ai/fr/news/mistral-ocr) in March 2025, it
|
|
53
|
+
opened new possibilities for large-scale document processing. While
|
|
54
|
+
alternatives like [datalab.to](https://www.datalab.to) and
|
|
55
|
+
[docling.ai](https://www.docling.ai) offer viable solutions, Mistral OCR
|
|
56
|
+
delivers exceptional accuracy at a compelling price point.
|
|
57
|
+
|
|
58
|
+
**mistocr** emerged from months of real-world usage across projects
|
|
59
|
+
requiring large-scale processing of niche-domain PDFs. It addresses three
|
|
60
|
+
fundamental challenges that raw OCR output leaves unsolved:
|
|
61
|
+
|
|
62
|
+
- **Heading hierarchy restoration**: Even state-of-the-art OCR sometimes
|
|
63
|
+
produces inconsistent heading levels in large documents—a complex task
|
|
64
|
+
to get right. mistocr uses LLM-based analysis to restore proper
|
|
65
|
+
document structure, essential for downstream AI tasks.
|
|
66
|
+
|
|
67
|
+
- **Visual content integration**: Charts, figures and diagrams are
|
|
68
|
+
automatically classified and described, then integrated into the
|
|
69
|
+
markdown. This makes visual information searchable and accessible for
|
|
70
|
+
downstream applications.
|
|
71
|
+
|
|
72
|
+
- **Cost-efficient batch processing**: By exclusively using Mistral’s
|
|
73
|
+
batch API, mistocr cuts costs by 50% (\$0.50 vs \$1.00 per 1000 pages)
|
|
74
|
+
while eliminating the boilerplate code typically required.
|
|
75
|
+
|
|
76
|
+
**In short**: Production-ready batch OCR with intelligent postprocessing
|
|
77
|
+
that ensures your documents are actually usable for AI systems.
|
|
78
|
+
|
|
79
|
+
## Get Started
|
|
80
|
+
|
|
81
|
+
Install latest from [pypi](https://pypi.org/project/mistocr), then:
|
|
82
|
+
|
|
83
|
+
``` sh
|
|
84
|
+
$ pip install mistocr
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Set your API keys:
|
|
88
|
+
|
|
89
|
+
``` python
|
|
90
|
+
import os
|
|
91
|
+
os.environ['MISTRAL_API_KEY'] = 'your-key-here'
|
|
92
|
+
os.environ['ANTHROPIC_API_KEY'] = 'your-key-here' # for refine features (see Advanced Usage for other LLMs)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Complete Pipeline
|
|
96
|
+
|
|
97
|
+
Full pipeline with all features:
|
|
98
|
+
|
|
99
|
+
``` python
|
|
100
|
+
from mistocr.pipeline import pdf_to_md
|
|
101
|
+
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
105
|
+
Mistral batch job status: QUEUED
|
|
106
|
+
Mistral batch job status: RUNNING
|
|
107
|
+
Mistral batch job status: RUNNING
|
|
108
|
+
Step 2/3: Fixing heading hierarchy...
|
|
109
|
+
Step 3/3: Adding image descriptions...
|
|
110
|
+
Describing 7 images...
|
|
111
|
+
Saved descriptions to ocr_temp/resnet/img_descriptions.json
|
|
112
|
+
Adding descriptions to 12 pages...
|
|
113
|
+
Done! Enriched pages saved to files/test/md_test
|
|
114
|
+
Done!
|
|
115
|
+
|
|
116
|
+
This will (as indicated by the output):
|
|
117
|
+
|
|
118
|
+
1. OCR the PDF using Mistral’s batch API
|
|
119
|
+
2. Fix heading hierarchy inconsistencies
|
|
120
|
+
3. Describe images (charts, diagrams) and add those descriptions into
|
|
121
|
+
the markdown, then save everything to `files/test/md_test`
|
|
122
|
+
|
|
123
|
+
The output structure will be:
|
|
124
|
+
|
|
125
|
+
files/test/md_test/
|
|
126
|
+
├── img/
|
|
127
|
+
│ ├── img-0.jpeg
|
|
128
|
+
│ ├── img-1.jpeg
|
|
129
|
+
│ └── ...
|
|
130
|
+
├── page_1.md
|
|
131
|
+
├── page_2.md
|
|
132
|
+
└── ...
|
|
133
|
+
|
|
134
|
+
Each page’s markdown will include inline image descriptions:
|
|
135
|
+
|
|
136
|
+
```` markdown
|
|
137
|
+
```markdown
|
|
138
|
+

|
|
139
|
+
AI-generated image description:
|
|
140
|
+
___
|
|
141
|
+
A residual learning block...
|
|
142
|
+
___
|
|
143
|
+
```
|
|
144
|
+
````
|
|
145
|
+
|
|
146
|
+
To print the processed markdown, you can use the
|
|
147
|
+
[`read_pgs`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
|
|
148
|
+
function. Here’s how:
|
|
149
|
+
|
|
150
|
+
Then to read the fully processed document:
|
|
151
|
+
|
|
152
|
+
``` python
|
|
153
|
+
from mistocr.core import read_pgs
|
|
154
|
+
md = read_pgs('files/test/md_test')
|
|
155
|
+
print(md[:500])
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
# Deep Residual Learning for Image Recognition ... page 1
|
|
159
|
+
|
|
160
|
+
Kaiming He Xiangyu Zhang Shaoqing Ren Jian Sun<br>Microsoft Research<br>\{kahe, v-xiangz, v-shren, jiansun\}@microsoft.com
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
## Abstract ... page 1
|
|
164
|
+
|
|
165
|
+
Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, ins
|
|
166
|
+
|
|
167
|
+
By default,
|
|
168
|
+
[`read_pgs()`](https://franckalbinet.github.io/mistocr/core.html#read_pgs)
|
|
169
|
+
joins all pages. Pass `join=False` to get a list of individual pages
|
|
170
|
+
instead.
|
|
171
|
+
|
|
172
|
+
### Advanced Usage
|
|
173
|
+
|
|
174
|
+
**Batch process entire folders:**
|
|
175
|
+
|
|
176
|
+
``` python
|
|
177
|
+
from mistocr.core import ocr_pdf
|
|
178
|
+
|
|
179
|
+
# Process all PDFs in a folder
|
|
180
|
+
output_dirs = ocr_pdf('path/to/pdf_folder', dst='output_folder')
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Custom models and prompts for heading fixes:**
|
|
184
|
+
|
|
185
|
+
``` python
|
|
186
|
+
from mistocr.refine import fix_hdgs
|
|
187
|
+
|
|
188
|
+
# Use a different model or custom prompt
|
|
189
|
+
fix_hdgs('ocr_output/doc1',
|
|
190
|
+
model='gpt-4o',
|
|
191
|
+
prompt=your_custom_prompt)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
**Custom image description with rate limiting:**
|
|
195
|
+
|
|
196
|
+
``` python
|
|
197
|
+
from mistocr.refine import add_img_descs
|
|
198
|
+
|
|
199
|
+
# Control API usage and customize descriptions
|
|
200
|
+
await add_img_descs('ocr_output/doc1',
|
|
201
|
+
model='claude-opus-4',
|
|
202
|
+
semaphore=5, # More concurrent requests
|
|
203
|
+
delay=0.5) # Shorter delay between calls
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
For complete control over each pipeline step, see the
|
|
207
|
+
[core](https://fr.anckalbi.net/mistocr/core.html),
|
|
208
|
+
[refine](https://fr.anckalbi.net/mistocr/refine.html), and
|
|
209
|
+
[pipeline](https://fr.anckalbi.net/mistocr/pipeline.html) module
|
|
210
|
+
documentation.
|
|
211
|
+
|
|
212
|
+
## Known Limitations & Future Work
|
|
213
|
+
|
|
214
|
+
`mistocr` is under active development. Current limitations include:
|
|
215
|
+
|
|
216
|
+
- **No timeout on batch jobs**: Jobs poll indefinitely until completion.
|
|
217
|
+
If a job stalls, manual intervention is required.
|
|
218
|
+
- **Limited error handling**: When batch jobs fail, error reporting and
|
|
219
|
+
recovery options are minimal.
|
|
220
|
+
- **Progress monitoring**: Currently limited to periodic status prints.
|
|
221
|
+
Future versions will support callbacks or streaming updates for better
|
|
222
|
+
real-time monitoring.
|
|
223
|
+
|
|
224
|
+
Contributions are welcome! If you encounter issues or have ideas for
|
|
225
|
+
improvements, please open an issue or discussion on
|
|
226
|
+
[GitHub](https://github.com/franckalbinet/mistocr).
|
|
227
|
+
|
|
228
|
+
## Developer Guide
|
|
229
|
+
|
|
230
|
+
If you are new to using `nbdev` here are some useful pointers to get you
|
|
231
|
+
started.
|
|
232
|
+
|
|
233
|
+
### Install mistocr in Development mode
|
|
234
|
+
|
|
235
|
+
``` sh
|
|
236
|
+
# make sure mistocr package is installed in development mode
|
|
237
|
+
$ pip install -e .
|
|
238
|
+
|
|
239
|
+
# make changes under nbs/ directory
|
|
240
|
+
# ...
|
|
241
|
+
|
|
242
|
+
# compile to have changes apply to mistocr
|
|
243
|
+
$ nbdev_prepare
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### Documentation
|
|
247
|
+
|
|
248
|
+
Documentation can be found hosted on this GitHub
|
|
249
|
+
[repository](https://github.com/franckalbinet/mistocr)’s
|
|
250
|
+
[pages](https://franckalbinet.github.io/mistocr/). Additionally you can
|
|
251
|
+
find package manager specific guidelines on
|
|
252
|
+
[conda](https://anaconda.org/franckalbinet/mistocr) and
|
|
253
|
+
[pypi](https://pypi.org/project/mistocr/) respectively.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
|
|
2
|
+
mistocr/_modidx.py,sha256=JtXJb6ErVwyGFCWgeVz7N9JvybBp8Zyao4Ahn3smr90,4030
|
|
3
|
+
mistocr/core.py,sha256=KH5ND6ZMmETj9m9A-Ivtw5N2L-qm9vGwYgvHAzg-BsA,7218
|
|
4
|
+
mistocr/pipeline.py,sha256=k5rtr9ccxTgwQAzw533iMIL9qwiCofugWv0fhotLzaI,1639
|
|
5
|
+
mistocr/refine.py,sha256=zsPoxWD63bk1rzRVO9OPsevWeMNORHgT_y8H7T7CxYs,11785
|
|
6
|
+
mistocr-0.2.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
7
|
+
mistocr-0.2.0.dist-info/METADATA,sha256=CiFsDl_zMSvai9aW_jprae7xwM2qC3JSTUfJTk-x41g,7987
|
|
8
|
+
mistocr-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
9
|
+
mistocr-0.2.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
10
|
+
mistocr-0.2.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
11
|
+
mistocr-0.2.0.dist-info/RECORD,,
|
mistocr-0.1.5.dist-info/METADATA
DELETED
|
@@ -1,183 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: mistocr
|
|
3
|
-
Version: 0.1.5
|
|
4
|
-
Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
|
|
5
|
-
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
|
-
Author: Solveit
|
|
7
|
-
Author-email: nobody@fast.ai
|
|
8
|
-
License: Apache Software License 2.0
|
|
9
|
-
Keywords: nbdev jupyter notebook python
|
|
10
|
-
Classifier: Development Status :: 4 - Beta
|
|
11
|
-
Classifier: Intended Audience :: Developers
|
|
12
|
-
Classifier: Natural Language :: English
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
18
|
-
Requires-Python: >=3.9
|
|
19
|
-
Description-Content-Type: text/markdown
|
|
20
|
-
License-File: LICENSE
|
|
21
|
-
Requires-Dist: fastcore
|
|
22
|
-
Requires-Dist: mistralai
|
|
23
|
-
Requires-Dist: pillow
|
|
24
|
-
Requires-Dist: dotenv
|
|
25
|
-
Requires-Dist: lisette
|
|
26
|
-
Provides-Extra: dev
|
|
27
|
-
Dynamic: author
|
|
28
|
-
Dynamic: author-email
|
|
29
|
-
Dynamic: classifier
|
|
30
|
-
Dynamic: description
|
|
31
|
-
Dynamic: description-content-type
|
|
32
|
-
Dynamic: home-page
|
|
33
|
-
Dynamic: keywords
|
|
34
|
-
Dynamic: license
|
|
35
|
-
Dynamic: license-file
|
|
36
|
-
Dynamic: provides-extra
|
|
37
|
-
Dynamic: requires-dist
|
|
38
|
-
Dynamic: requires-python
|
|
39
|
-
Dynamic: summary
|
|
40
|
-
|
|
41
|
-
# mistocr
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
|
|
45
|
-
|
|
46
|
-
## Why mistocr?
|
|
47
|
-
|
|
48
|
-
**Performance**: Mistral’s OCR delivers state-of-the-art accuracy on
|
|
49
|
-
complex documents including tables, charts, and multi-column layouts.
|
|
50
|
-
|
|
51
|
-
**Scale**: Process entire folders of PDFs in a single batch job. Upload
|
|
52
|
-
once, process asynchronously, and retrieve results when ready - perfect
|
|
53
|
-
for large document sets.
|
|
54
|
-
|
|
55
|
-
**Cost savings**: Batch OCR mode reduces costs from \$1/1000 pages to
|
|
56
|
-
\$0.50/1000 pages - a 50% reduction compared to synchronous processing.
|
|
57
|
-
|
|
58
|
-
**Simplicity**: A single
|
|
59
|
-
[`ocr()`](https://franckalbinet.github.io/mistocr/core.html#ocr)
|
|
60
|
-
function handles everything - uploading, batch submission, polling for
|
|
61
|
-
completion, and saving results as markdown with extracted images.
|
|
62
|
-
Process one PDF or an entire folder with the same simple interface.
|
|
63
|
-
|
|
64
|
-
**Organized output**: Each PDF is automatically saved to its own folder
|
|
65
|
-
with pages as separate markdown files and images in an `img` subfolder,
|
|
66
|
-
making results easy to navigate and process further.
|
|
67
|
-
|
|
68
|
-
## Installation
|
|
69
|
-
|
|
70
|
-
Install latest from the GitHub
|
|
71
|
-
[repository](https://github.com/franckalbinet/mistocr):
|
|
72
|
-
|
|
73
|
-
``` sh
|
|
74
|
-
$ pip install git+https://github.com/franckalbinet/mistocr.git
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
or from [pypi](https://pypi.org/project/mistocr/)
|
|
78
|
-
|
|
79
|
-
``` sh
|
|
80
|
-
$ pip install mistocr
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
## How to use
|
|
84
|
-
|
|
85
|
-
### Basic usage
|
|
86
|
-
|
|
87
|
-
Process a single PDF:
|
|
88
|
-
|
|
89
|
-
``` python
|
|
90
|
-
from mistocr.core import ocr
|
|
91
|
-
|
|
92
|
-
fname = 'files/test/attention-is-all-you-need.pdf'
|
|
93
|
-
result = ocr(fname)
|
|
94
|
-
```
|
|
95
|
-
|
|
96
|
-
Or process an entire folder:
|
|
97
|
-
|
|
98
|
-
``` python
|
|
99
|
-
results = ocr('files/test')
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
### Output structure
|
|
103
|
-
|
|
104
|
-
Each PDF is saved to its own folder with pages as separate markdown
|
|
105
|
-
files and images in an `img` subfolder:
|
|
106
|
-
|
|
107
|
-
files/test/md/
|
|
108
|
-
├── attention-is-all-you-need/
|
|
109
|
-
│ ├── img/
|
|
110
|
-
│ │ ├── img-0.jpeg
|
|
111
|
-
│ │ ├── img-1.jpeg
|
|
112
|
-
│ │ └── ...
|
|
113
|
-
│ ├── page_1.md
|
|
114
|
-
│ ├── page_2.md
|
|
115
|
-
│ └── ...
|
|
116
|
-
└── resnet/
|
|
117
|
-
├── img/
|
|
118
|
-
└── ...
|
|
119
|
-
|
|
120
|
-
### Reading results
|
|
121
|
-
|
|
122
|
-
Read all pages from a processed PDF:
|
|
123
|
-
|
|
124
|
-
``` python
|
|
125
|
-
from mistocr.core import read_pgs
|
|
126
|
-
|
|
127
|
-
text = read_pgs('files/test/md/attention-is-all-you-need')
|
|
128
|
-
```
|
|
129
|
-
|
|
130
|
-
Or read a specific page:
|
|
131
|
-
|
|
132
|
-
``` python
|
|
133
|
-
text = read_pgs('files/test/md/attention-is-all-you-need', 10)
|
|
134
|
-
```
|
|
135
|
-
|
|
136
|
-
### Customization
|
|
137
|
-
|
|
138
|
-
Customize output directory, image inclusion, and polling interval:
|
|
139
|
-
|
|
140
|
-
``` python
|
|
141
|
-
results = ocr('files/test', out_dir='output', inc_img=False, poll_interval=5)
|
|
142
|
-
```
|
|
143
|
-
|
|
144
|
-
**Parameters:**
|
|
145
|
-
|
|
146
|
-
- **`path`**: A single PDF file or folder containing multiple PDFs
|
|
147
|
-
- **`out_dir`**: Directory name for saving markdown output (default:
|
|
148
|
-
`'md'`)
|
|
149
|
-
- **`inc_img`**: Include extracted images in the output (default:
|
|
150
|
-
`True`)
|
|
151
|
-
- **`key`**: Your Mistral API key (uses `MISTRAL_API_KEY` environment
|
|
152
|
-
variable if not provided)
|
|
153
|
-
- **`poll_interval`**: Seconds between batch job status checks (default:
|
|
154
|
-
`2`)
|
|
155
|
-
|
|
156
|
-
**Returns:** List of paths to the generated markdown files
|
|
157
|
-
|
|
158
|
-
## Developer Guide
|
|
159
|
-
|
|
160
|
-
If you are new to using `nbdev`, here are some useful pointers to get you
|
|
161
|
-
started.
|
|
162
|
-
|
|
163
|
-
### Install mistocr in Development mode
|
|
164
|
-
|
|
165
|
-
``` sh
|
|
166
|
-
# make sure mistocr package is installed in development mode
|
|
167
|
-
$ pip install -e .
|
|
168
|
-
|
|
169
|
-
# make changes under nbs/ directory
|
|
170
|
-
# ...
|
|
171
|
-
|
|
172
|
-
# compile to have changes apply to mistocr
|
|
173
|
-
$ nbdev_prepare
|
|
174
|
-
```
|
|
175
|
-
|
|
176
|
-
### Documentation
|
|
177
|
-
|
|
178
|
-
Documentation is hosted on this GitHub
|
|
179
|
-
[repository](https://github.com/franckalbinet/mistocr)’s
|
|
180
|
-
[pages](https://franckalbinet.github.io/mistocr/). Additionally, you can
|
|
181
|
-
find package-manager-specific guidelines on
|
|
182
|
-
[conda](https://anaconda.org/franckalbinet/mistocr) and
|
|
183
|
-
[pypi](https://pypi.org/project/mistocr/) respectively.
|
mistocr-0.1.5.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=rPSfWgIeq2YWVPyESOAwCBt8vftsTpIkuLAGDEzyRQc,22
|
|
2
|
-
mistocr/_modidx.py,sha256=sZ3ISGF-2f7VEOD9MVgqMVs5SifUNe-1YP0wy8Ey0cU,2884
|
|
3
|
-
mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
|
|
4
|
-
mistocr/refine.py,sha256=314r4MBZRIvUmu6B_dvvq9P4d4a_japKBpsg4wnU9oU,5253
|
|
5
|
-
mistocr-0.1.5.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
6
|
-
mistocr-0.1.5.dist-info/METADATA,sha256=kcrDK0kJadP5Sze0tVzRo-pLWWTJQiEnvwlLpWJZz2o,4848
|
|
7
|
-
mistocr-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
-
mistocr-0.1.5.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
9
|
-
mistocr-0.1.5.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
10
|
-
mistocr-0.1.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|