mistocr 0.0.3__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mistocr/__init__.py +1 -1
- mistocr/_modidx.py +9 -1
- mistocr/core.py +38 -31
- mistocr/refine.py +113 -0
- {mistocr-0.0.3.dist-info → mistocr-0.1.0.dist-info}/METADATA +43 -38
- mistocr-0.1.0.dist-info/RECORD +10 -0
- mistocr-0.0.3.dist-info/RECORD +0 -9
- {mistocr-0.0.3.dist-info → mistocr-0.1.0.dist-info}/WHEEL +0 -0
- {mistocr-0.0.3.dist-info → mistocr-0.1.0.dist-info}/entry_points.txt +0 -0
- {mistocr-0.0.3.dist-info → mistocr-0.1.0.dist-info}/licenses/LICENSE +0 -0
- {mistocr-0.0.3.dist-info → mistocr-0.1.0.dist-info}/top_level.txt +0 -0
mistocr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.3"
|
|
1
|
+
__version__ = "0.1.0"
|
mistocr/_modidx.py
CHANGED
|
@@ -13,9 +13,17 @@ d = { 'settings': { 'branch': 'main',
|
|
|
13
13
|
'mistocr.core.get_api_key': ('core.html#get_api_key', 'mistocr/core.py'),
|
|
14
14
|
'mistocr.core.ocr': ('core.html#ocr', 'mistocr/core.py'),
|
|
15
15
|
'mistocr.core.prep_pdf_batch': ('core.html#prep_pdf_batch', 'mistocr/core.py'),
|
|
16
|
+
'mistocr.core.read_pgs': ('core.html#read_pgs', 'mistocr/core.py'),
|
|
16
17
|
'mistocr.core.save_images': ('core.html#save_images', 'mistocr/core.py'),
|
|
17
18
|
'mistocr.core.save_page': ('core.html#save_page', 'mistocr/core.py'),
|
|
18
19
|
'mistocr.core.save_pages': ('core.html#save_pages', 'mistocr/core.py'),
|
|
19
20
|
'mistocr.core.submit_batch': ('core.html#submit_batch', 'mistocr/core.py'),
|
|
20
21
|
'mistocr.core.upload_pdf': ('core.html#upload_pdf', 'mistocr/core.py'),
|
|
21
|
-
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')}
|
|
22
|
+
'mistocr.core.wait_for_job': ('core.html#wait_for_job', 'mistocr/core.py')},
|
|
23
|
+
'mistocr.refine': { 'mistocr.refine.HeadingCorrections': ('refine.html#headingcorrections', 'mistocr/refine.py'),
|
|
24
|
+
'mistocr.refine.apply_hdg_fixes': ('refine.html#apply_hdg_fixes', 'mistocr/refine.py'),
|
|
25
|
+
'mistocr.refine.fix_hdg_hierarchy': ('refine.html#fix_hdg_hierarchy', 'mistocr/refine.py'),
|
|
26
|
+
'mistocr.refine.fix_md_hdgs': ('refine.html#fix_md_hdgs', 'mistocr/refine.py'),
|
|
27
|
+
'mistocr.refine.fmt_hdgs_idx': ('refine.html#fmt_hdgs_idx', 'mistocr/refine.py'),
|
|
28
|
+
'mistocr.refine.get_hdgs': ('refine.html#get_hdgs', 'mistocr/refine.py'),
|
|
29
|
+
'mistocr.refine.mk_fixes_lut': ('refine.html#mk_fixes_lut', 'mistocr/refine.py')}}}
|
mistocr/core.py
CHANGED
|
@@ -4,21 +4,17 @@
|
|
|
4
4
|
|
|
5
5
|
# %% auto 0
|
|
6
6
|
__all__ = ['ocr_model', 'ocr_endpoint', 'get_api_key', 'upload_pdf', 'create_batch_entry', 'prep_pdf_batch', 'submit_batch',
|
|
7
|
-
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr']
|
|
7
|
+
'wait_for_job', 'download_results', 'save_images', 'save_page', 'save_pages', 'ocr', 'read_pgs']
|
|
8
8
|
|
|
9
9
|
# %% ../nbs/00_core.ipynb 3
|
|
10
10
|
from fastcore.all import *
|
|
11
|
-
|
|
12
|
-
import os, json, time, base64, tempfile
|
|
11
|
+
import os, re, json, time, base64, tempfile, logging
|
|
13
12
|
from io import BytesIO
|
|
14
13
|
from pathlib import Path
|
|
15
14
|
from PIL import Image
|
|
16
15
|
from mistralai import Mistral
|
|
17
16
|
|
|
18
17
|
# %% ../nbs/00_core.ipynb 6
|
|
19
|
-
load_dotenv()
|
|
20
|
-
|
|
21
|
-
# %% ../nbs/00_core.ipynb 7
|
|
22
18
|
def get_api_key(
|
|
23
19
|
key:str=None # Mistral API key
|
|
24
20
|
):
|
|
@@ -27,11 +23,11 @@ def get_api_key(
|
|
|
27
23
|
if not key: raise ValueError("MISTRAL_API_KEY not found")
|
|
28
24
|
return key
|
|
29
25
|
|
|
30
|
-
# %% ../nbs/00_core.ipynb
|
|
26
|
+
# %% ../nbs/00_core.ipynb 7
|
|
31
27
|
ocr_model = "mistral-ocr-latest"
|
|
32
28
|
ocr_endpoint = "/v1/ocr"
|
|
33
29
|
|
|
34
|
-
# %% ../nbs/00_core.ipynb
|
|
30
|
+
# %% ../nbs/00_core.ipynb 10
|
|
35
31
|
def upload_pdf(
|
|
36
32
|
path:str, # Path to PDF file
|
|
37
33
|
key:str=None # Mistral API key
|
|
@@ -42,11 +38,11 @@ def upload_pdf(
|
|
|
42
38
|
uploaded = c.files.upload(file=dict(file_name=path.stem, content=path.read_bytes()), purpose="ocr")
|
|
43
39
|
return c.files.get_signed_url(file_id=uploaded.id).url, c
|
|
44
40
|
|
|
45
|
-
# %% ../nbs/00_core.ipynb
|
|
41
|
+
# %% ../nbs/00_core.ipynb 15
|
|
46
42
|
def create_batch_entry(
|
|
47
43
|
path:str, # Path to PDF file,
|
|
48
44
|
url:str, # Mistral signed URL
|
|
49
|
-
cid:str=None, # Custom ID (by default using the file name without
|
|
45
|
+
cid:str=None, # Custom ID (by default using the file name without extension)
|
|
50
46
|
inc_img:bool=True # Include image in response
|
|
51
47
|
) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
|
|
52
48
|
"Create a batch entry dict for OCR"
|
|
@@ -54,7 +50,7 @@ def create_batch_entry(
|
|
|
54
50
|
if not cid: cid = path.stem
|
|
55
51
|
return dict(custom_id=cid, body=dict(document=dict(type="document_url", document_url=url), include_image_base64=inc_img))
|
|
56
52
|
|
|
57
|
-
# %% ../nbs/00_core.ipynb
|
|
53
|
+
# %% ../nbs/00_core.ipynb 17
|
|
58
54
|
def prep_pdf_batch(
|
|
59
55
|
path:str, # Path to PDF file,
|
|
60
56
|
cid:str=None, # Custom ID (by default using the file name without extention)
|
|
@@ -65,7 +61,7 @@ def prep_pdf_batch(
|
|
|
65
61
|
url, c = upload_pdf(path, key)
|
|
66
62
|
return create_batch_entry(path, url, cid, inc_img), c
|
|
67
63
|
|
|
68
|
-
# %% ../nbs/00_core.ipynb
|
|
64
|
+
# %% ../nbs/00_core.ipynb 21
|
|
69
65
|
def submit_batch(
|
|
70
66
|
entries:list[dict], # List of batch entries,
|
|
71
67
|
c:Mistral=None, # Mistral client,
|
|
@@ -79,7 +75,7 @@ def submit_batch(
|
|
|
79
75
|
batch_data = c.files.upload(file=dict(file_name="batch.jsonl", content=open(f.name, "rb")), purpose="batch")
|
|
80
76
|
return c.batch.jobs.create(input_files=[batch_data.id], model=model, endpoint=endpoint)
|
|
81
77
|
|
|
82
|
-
# %% ../nbs/00_core.ipynb
|
|
78
|
+
# %% ../nbs/00_core.ipynb 24
|
|
83
79
|
def wait_for_job(
|
|
84
80
|
job:dict, # Job dict,
|
|
85
81
|
c:Mistral=None, # Mistral client,
|
|
@@ -91,7 +87,7 @@ def wait_for_job(
|
|
|
91
87
|
job = c.batch.jobs.get(job_id=job.id)
|
|
92
88
|
return job
|
|
93
89
|
|
|
94
|
-
# %% ../nbs/00_core.ipynb
|
|
90
|
+
# %% ../nbs/00_core.ipynb 26
|
|
95
91
|
def download_results(
|
|
96
92
|
job:dict, # Job dict,
|
|
97
93
|
c:Mistral=None # Mistral client
|
|
@@ -100,7 +96,7 @@ def download_results(
|
|
|
100
96
|
content = c.files.download(file_id=job.output_file).read().decode('utf-8')
|
|
101
97
|
return [json.loads(line) for line in content.strip().split('\n') if line]
|
|
102
98
|
|
|
103
|
-
# %% ../nbs/00_core.ipynb
|
|
99
|
+
# %% ../nbs/00_core.ipynb 31
|
|
104
100
|
def save_images(
|
|
105
101
|
page:dict, # Page dict,
|
|
106
102
|
img_dir:str='img' # Directory to save images
|
|
@@ -111,32 +107,32 @@ def save_images(
|
|
|
111
107
|
img_bytes = base64.b64decode(img['image_base64'].split(',')[1])
|
|
112
108
|
Image.open(BytesIO(img_bytes)).save(img_dir / img['id'])
|
|
113
109
|
|
|
114
|
-
# %% ../nbs/00_core.ipynb
|
|
110
|
+
# %% ../nbs/00_core.ipynb 32
|
|
115
111
|
def save_page(
    page:dict, # Page dict,
    dst:str, # Directory to save page
    img_dir:str='img' # Directory to save images
) -> None:
    """Save single page markdown and images.

    Writes `page['markdown']` to `dst/page_<index+1>.md`. When the page carries
    images, `img_dir` is created if needed and the images are written via `save_images`.
    """
    # The annotations (and the `'img'` default) promise plain strings, but the `/`
    # operator and `.mkdir` below only exist on `Path` objects, so coerce first.
    dst, img_dir = Path(dst), Path(img_dir)
    # Page indices in the OCR response are 0-based; file names are 1-based.
    (dst / f"page_{page['index']+1}.md").write_text(page['markdown'])
    if page.get('images'):
        img_dir.mkdir(exist_ok=True)
        save_images(page, img_dir)
|
|
125
121
|
|
|
126
|
-
# %% ../nbs/00_core.ipynb
|
|
122
|
+
# %% ../nbs/00_core.ipynb 34
|
|
127
123
|
def save_pages(
    ocr_resp:dict, # OCR response,
    dst:str, # Directory to save pages,
    cid:str # Custom ID
) -> Path: # Output directory
    "Write every page of `ocr_resp` (markdown and images) under `dst/cid` and return that directory"
    out_dir = Path(dst) / cid
    out_dir.mkdir(parents=True, exist_ok=True)
    # All images of one document share a single `img` folder next to its pages
    images_dir = out_dir / 'img'
    for pg in ocr_resp['pages']:
        save_page(pg, out_dir, images_dir)
    return out_dir
|
|
138
134
|
|
|
139
|
-
# %% ../nbs/00_core.ipynb
|
|
135
|
+
# %% ../nbs/00_core.ipynb 40
|
|
140
136
|
def _get_paths(path:str) -> list[Path]:
|
|
141
137
|
"Get list of PDFs from file or folder"
|
|
142
138
|
path = Path(path)
|
|
@@ -147,7 +143,7 @@ def _get_paths(path:str) -> list[Path]:
|
|
|
147
143
|
return pdfs
|
|
148
144
|
raise ValueError(f"Path not found: {path}")
|
|
149
145
|
|
|
150
|
-
# %% ../nbs/00_core.ipynb
|
|
146
|
+
# %% ../nbs/00_core.ipynb 41
|
|
151
147
|
def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[dict], Mistral]:
|
|
152
148
|
"Prepare batch entries for list of PDFs"
|
|
153
149
|
entries, c = [], None
|
|
@@ -156,7 +152,7 @@ def _prep_batch(pdfs:list[Path], inc_img:bool=True, key:str=None) -> tuple[list[
|
|
|
156
152
|
entries.append(entry)
|
|
157
153
|
return entries, c
|
|
158
154
|
|
|
159
|
-
# %% ../nbs/00_core.ipynb
|
|
155
|
+
# %% ../nbs/00_core.ipynb 42
|
|
160
156
|
def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]:
|
|
161
157
|
"Submit batch, wait for completion, and download results"
|
|
162
158
|
job = submit_batch(entries, c)
|
|
@@ -164,10 +160,10 @@ def _run_batch(entries:list[dict], c:Mistral, poll_interval:int=2) -> list[dict]
|
|
|
164
160
|
if job.status != 'SUCCESS': raise Exception(f"Job failed with status: {job.status}")
|
|
165
161
|
return download_results(job, c)
|
|
166
162
|
|
|
167
|
-
# %% ../nbs/00_core.ipynb
|
|
163
|
+
# %% ../nbs/00_core.ipynb 43
|
|
168
164
|
def ocr(
|
|
169
165
|
path:str, # Path to PDF file or folder,
|
|
170
|
-
|
|
166
|
+
dst:str='md', # Directory to save markdown pages,
|
|
171
167
|
inc_img:bool=True, # Include image in response,
|
|
172
168
|
key:str=None, # API key,
|
|
173
169
|
poll_interval:int=2 # Poll interval in seconds
|
|
@@ -176,4 +172,15 @@ def ocr(
|
|
|
176
172
|
pdfs = _get_paths(path)
|
|
177
173
|
entries, c = _prep_batch(pdfs, inc_img, key)
|
|
178
174
|
results = _run_batch(entries, c, poll_interval)
|
|
179
|
-
return L([save_pages(r['response']['body'],
|
|
175
|
+
return L([save_pages(r['response']['body'], dst, r['custom_id']) for r in results])
|
|
176
|
+
|
|
177
|
+
# %% ../nbs/00_core.ipynb 48
|
|
178
|
+
def read_pgs(
    path:str, # OCR output directory,
    join:bool=True # Join pages into single string
) -> str|list[str]: # Joined string or list of page contents
    """Read all pages from OCR output directory, in page-number order.

    Pages are the `page_<number>.md` files directly inside `path`. With `join=True`
    their contents are concatenated with a blank line between pages; otherwise the
    per-page contents are returned as a list (index 0 is the lowest-numbered page).
    """
    path = Path(path)
    # Keep only files whose suffix after `page_` is a number, so a stray match of the
    # glob (e.g. `page_notes.md`) cannot make the numeric sort key raise `ValueError`.
    pgs = [p for p in path.glob('page_*.md') if p.stem.split('_')[1].isdecimal()]
    # Numeric sort: a plain string sort would put `page_10` before `page_2`.
    pgs.sort(key=lambda p: int(p.stem.split('_')[1]))
    contents = L([p.read_text() for p in pgs])
    return '\n\n'.join(contents) if join else contents
|
mistocr/refine.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Postprocess markdown files by fixing heading hierarchy and describing images"""
|
|
2
|
+
|
|
3
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_refine.ipynb.
|
|
4
|
+
|
|
5
|
+
# %% auto 0
|
|
6
|
+
__all__ = ['prompt_fix_hdgs', 'get_hdgs', 'fmt_hdgs_idx', 'HeadingCorrections', 'fix_hdg_hierarchy', 'mk_fixes_lut',
|
|
7
|
+
'apply_hdg_fixes', 'fix_md_hdgs']
|
|
8
|
+
|
|
9
|
+
# %% ../nbs/01_refine.ipynb 3
|
|
10
|
+
from fastcore.all import *
|
|
11
|
+
from .core import read_pgs
|
|
12
|
+
from re import sub, findall, MULTILINE
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
from lisette.core import completion
|
|
15
|
+
import os
|
|
16
|
+
import json
|
|
17
|
+
|
|
18
|
+
# %% ../nbs/01_refine.ipynb 7
|
|
19
|
+
def get_hdgs(
    md:str # Markdown file string
):
    "Collect every heading line (`#` to `######` followed by a space) found outside ``` fenced blocks"
    # Drop fenced snippets first so `#` comments inside them are not taken for headings
    without_code = sub(r'```[\s\S]*?```', '', md)
    heading_lines = findall(r'^#{1,6} .+$', without_code, MULTILINE)
    return L(heading_lines)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# %% ../nbs/01_refine.ipynb 10
|
|
30
|
+
def fmt_hdgs_idx(
    hdgs: list[str] # List of markdown headings
) -> str: # Formatted string with index
    "Number each heading from 0 as `<index>. <heading>`, one heading per line"
    numbered = []
    for pos, heading in enumerate(hdgs):
        numbered.append(f"{pos}. {heading}")
    return '\n'.join(numbered)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# %% ../nbs/01_refine.ipynb 13
|
|
38
|
+
class HeadingCorrections(BaseModel):
|
|
39
|
+
corrections: dict[int, str] # index → corrected heading
|
|
40
|
+
|
|
41
|
+
# %% ../nbs/01_refine.ipynb 15
|
|
42
|
+
prompt_fix_hdgs = """Fix markdown heading hierarchy errors while preserving the document's intended structure.
|
|
43
|
+
|
|
44
|
+
INPUT FORMAT: Each heading is prefixed with its index number (e.g., "0. # Title")
|
|
45
|
+
|
|
46
|
+
RULES - Only fix these errors:
|
|
47
|
+
1. **Level jumps**: Headings can only increase by one # at a time
|
|
48
|
+
- Wrong: 0. # Title → 1. #### Abstract
|
|
49
|
+
- Fixed: 0. # Title → 1. ## Abstract
|
|
50
|
+
|
|
51
|
+
2. **Numbering inconsistency**: Subsection numbers must be one level deeper
|
|
52
|
+
- Wrong: 4. ## 3. Section → 5. ## 3.1 Subsection
|
|
53
|
+
- Fixed: 4. ## 3. Section → 5. ### 3.1 Subsection
|
|
54
|
+
|
|
55
|
+
3. **Preserve working structure**: If sections are consistently marked, keep it
|
|
56
|
+
|
|
57
|
+
4. **Decreasing levels is OK**: Going from ### to ## is valid for new sections
|
|
58
|
+
|
|
59
|
+
OUTPUT: Return a Python dictionary mapping index to corrected heading (without the index prefix).
|
|
60
|
+
Only include entries that need changes. Example: {{1: '## Abstract', 15: '### PASCAL VOC'}}
|
|
61
|
+
|
|
62
|
+
Headings to analyze:
|
|
63
|
+
{headings_list}
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
# %% ../nbs/01_refine.ipynb 16
|
|
67
|
+
def fix_hdg_hierarchy(
    hdgs: list[str], # List of markdown headings
    model: str='claude-sonnet-4-5', # Model to use
    api_key: str=None # API key (defaults to the `ANTHROPIC_API_KEY` environment variable)
) -> dict[int, str]: # Dictionary of index → corrected heading
    """Fix the heading hierarchy.

    Sends the indexed headings to `model` using `prompt_fix_hdgs` and returns the
    `corrections` mapping parsed from the JSON response.
    """
    # Look the key up at call time: a default evaluated in the signature is frozen at
    # import, so a variable exported after importing this module would be ignored.
    if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
    r = completion(
        model=model,
        messages=[{"role": "user", "content": prompt_fix_hdgs.format(headings_list=fmt_hdgs_idx(hdgs))}],
        response_format=HeadingCorrections,
        api_key=api_key
    )
    corrections = json.loads(r.choices[0].message.content)['corrections']
    # JSON object keys are always strings; convert them so the result matches the
    # declared `dict[int, str]` return type.
    return {int(k): v for k, v in corrections.items()}
|
|
80
|
+
|
|
81
|
+
# %% ../nbs/01_refine.ipynb 19
|
|
82
|
+
def mk_fixes_lut(
    hdgs: list[str], # List of markdown headings
    model: str='claude-sonnet-4-5', # Model to use
    api_key: str=None # API key (defaults to the `ANTHROPIC_API_KEY` environment variable)
) -> dict[str, str]: # Dictionary of old → new heading
    """Make a lookup table of fixes.

    Asks `fix_hdg_hierarchy` for index → corrected heading and re-keys the result by
    the original heading text found at each index of `hdgs`.
    """
    # Look the key up at call time rather than freezing it when the module is imported.
    if api_key is None: api_key = os.getenv('ANTHROPIC_API_KEY')
    fixes = fix_hdg_hierarchy(hdgs, model, api_key)
    lut = {}
    for k, v in fixes.items():
        i = int(k)
        # The indexes come from a model response: skip any that is not a position in
        # `hdgs` (a negative one would otherwise silently wrap to the end of the list).
        if 0 <= i < len(hdgs): lut[hdgs[i]] = v
    return lut
|
|
90
|
+
|
|
91
|
+
# %% ../nbs/01_refine.ipynb 22
|
|
92
|
+
def apply_hdg_fixes(
    p:str, # Page to fix
    lut_fixes: dict[str, str], # Lookup table of fixes
    pg: int=None, # Optionally specify the page number to append to original heading
) -> str: # Page with fixes applied
    """Apply the fixes to the page.

    Every heading line outside ``` fenced blocks is replaced by its entry in
    `lut_fixes` (or kept when it has none) and, when `pg` is given, gets
    ` .... page <pg>` appended. Each heading line is rewritten exactly once.
    """
    import re  # local import: only selected names of `re` are imported at module level
    suffix = f' .... page {pg}' if pg else ''

    def _fix(m):
        "Return the corrected heading (plus page suffix) for one matched heading line"
        old = m.group(0)
        return lut_fixes.get(old, old) + suffix

    # Split on fenced blocks; thanks to the capturing group the fences are kept at the
    # odd positions and are left untouched.
    parts = re.split(r'(```[\s\S]*?```)', p)
    # Substitute whole heading lines instead of chaining `str.replace`: replacing the
    # text `# X` would also rewrite it inside `## X` and inside repeated headings,
    # applying the fix and the page suffix more than once.
    parts[::2] = [re.sub(r'^#{1,6} .+$', _fix, s, flags=re.MULTILINE) for s in parts[::2]]
    return ''.join(parts)
|
|
100
|
+
|
|
101
|
+
# %% ../nbs/01_refine.ipynb 25
|
|
102
|
+
def fix_md_hdgs(
    src:str, # Source directory with markdown pages
    model:str='claude-sonnet-4-5', # Model
    dst:str=None, # Destination directory (None=overwrite)
    pg_nums:bool=True # Add page numbers
):
    """Fix heading hierarchy in markdown document.

    Collects the headings of all pages in `src`, asks `model` for corrections, then
    writes each corrected page as `page_<n>.md` (numbered from 1 in reading order)
    into `dst`, or back into `src` when `dst` is not given.
    """
    src_path = Path(src)
    dst_path = Path(dst) if dst else src_path
    if dst_path != src_path: dst_path.mkdir(parents=True, exist_ok=True)
    # Read the pages once and reuse them for both heading extraction and rewriting,
    # instead of loading every file from disk twice.
    pgs = read_pgs(src_path, join=False)
    # Joined with a blank line between pages, the same text `read_pgs(join=True)` returns
    lut = mk_fixes_lut(get_hdgs('\n\n'.join(pgs)), model)
    for i,p in enumerate(pgs, 1):
        (dst_path/f'page_{i}.md').write_text(apply_hdg_fixes(p, lut, pg=i if pg_nums else None))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.1.0
|
|
4
4
|
Summary: Simple batch OCR for PDFs using Mistral's state-of-the-art vision model
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -22,6 +22,7 @@ Requires-Dist: fastcore
|
|
|
22
22
|
Requires-Dist: mistralai
|
|
23
23
|
Requires-Dist: pillow
|
|
24
24
|
Requires-Dist: dotenv
|
|
25
|
+
Requires-Dist: lisette
|
|
25
26
|
Provides-Extra: dev
|
|
26
27
|
Dynamic: author
|
|
27
28
|
Dynamic: author-email
|
|
@@ -54,10 +55,11 @@ for large document sets.
|
|
|
54
55
|
**Cost savings**: Batch OCR mode reduces costs from \$1/1000 pages to
|
|
55
56
|
\$0.50/1000 pages - a 50% reduction compared to synchronous processing.
|
|
56
57
|
|
|
57
|
-
**Simplicity**: A single
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
**Simplicity**: A single
|
|
59
|
+
[`ocr()`](https://franckalbinet.github.io/mistocr/core.html#ocr)
|
|
60
|
+
function handles everything - uploading, batch submission, polling for
|
|
61
|
+
completion, and saving results as markdown with extracted images.
|
|
62
|
+
Process one PDF or an entire folder with the same simple interface.
|
|
61
63
|
|
|
62
64
|
**Organized output**: Each PDF is automatically saved to its own folder
|
|
63
65
|
with pages as separate markdown files and images in an `img` subfolder,
|
|
@@ -80,57 +82,60 @@ $ pip install mistocr
|
|
|
80
82
|
|
|
81
83
|
## How to use
|
|
82
84
|
|
|
85
|
+
### Basic usage
|
|
86
|
+
|
|
87
|
+
Process a single PDF:
|
|
88
|
+
|
|
83
89
|
``` python
|
|
84
90
|
from mistocr.core import ocr
|
|
85
|
-
```
|
|
86
|
-
|
|
87
|
-
- **Process a single PDF:**
|
|
88
91
|
|
|
89
|
-
|
|
92
|
+
fname = 'files/test/attention-is-all-you-need.pdf'
|
|
93
|
+
result = ocr(fname)
|
|
94
|
+
```
|
|
90
95
|
|
|
91
|
-
|
|
92
|
-
result = ocr(fname)
|
|
96
|
+
Or process an entire folder:
|
|
93
97
|
|
|
94
98
|
``` python
|
|
99
|
+
results = ocr('files/test')
|
|
95
100
|
```
|
|
96
101
|
|
|
97
|
-
|
|
98
|
-
img/ page_11.md page_14.md page_3.md page_6.md page_9.md
|
|
99
|
-
page_1.md page_12.md page_15.md page_4.md page_7.md
|
|
100
|
-
page_10.md page_13.md page_2.md page_5.md page_8.md
|
|
102
|
+
### Output structure
|
|
101
103
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
+
Each PDF is saved to its own folder with pages as separate markdown
|
|
105
|
+
files and images in an `img` subfolder:
|
|
104
106
|
|
|
105
|
-
|
|
107
|
+
files/test/md/
|
|
108
|
+
├── attention-is-all-you-need/
|
|
109
|
+
│ ├── img/
|
|
110
|
+
│ │ ├── img-0.jpeg
|
|
111
|
+
│ │ ├── img-1.jpeg
|
|
112
|
+
│ │ └── ...
|
|
113
|
+
│ ├── page_1.md
|
|
114
|
+
│ ├── page_2.md
|
|
115
|
+
│ └── ...
|
|
116
|
+
└── resnet/
|
|
117
|
+
├── img/
|
|
118
|
+
└── ...
|
|
106
119
|
|
|
107
|
-
|
|
108
|
-
results = ocr('files/test')
|
|
109
|
-
```
|
|
120
|
+
### Reading results
|
|
110
121
|
|
|
111
|
-
|
|
112
|
-
```
|
|
122
|
+
Read all pages from a processed PDF:
|
|
113
123
|
|
|
114
|
-
|
|
115
|
-
|
|
124
|
+
``` python
|
|
125
|
+
from mistocr.core import read_pgs
|
|
116
126
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
page_1.md page_12.md page_15.md page_4.md page_7.md
|
|
120
|
-
page_10.md page_13.md page_2.md page_5.md page_8.md
|
|
127
|
+
text = read_pgs('files/test/md/attention-is-all-you-need')
|
|
128
|
+
```
|
|
121
129
|
|
|
122
|
-
|
|
123
|
-
img-0.jpeg img-1.jpeg img-2.jpeg img-3.jpeg img-4.jpeg
|
|
130
|
+
Or read a specific page:
|
|
124
131
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
132
|
+
``` python
|
|
133
|
+
text = read_pgs('files/test/md/attention-is-all-you-need', join=False)[9]  # page 10
|
|
134
|
+
```
|
|
128
135
|
|
|
129
|
-
|
|
130
|
-
img-0.jpeg img-2.jpeg img-4.jpeg img-6.jpeg
|
|
131
|
-
img-1.jpeg img-3.jpeg img-5.jpeg
|
|
136
|
+
### Customization
|
|
132
137
|
|
|
133
|
-
|
|
138
|
+
Customize output directory, image inclusion, and polling interval:
|
|
134
139
|
|
|
135
140
|
``` python
|
|
136
141
|
results = ocr('files/test', dst='output', inc_img=False, poll_interval=5)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
mistocr/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
2
|
+
mistocr/_modidx.py,sha256=R9zVMv4dKz2sLStoB5wBoKRqjza216_z8xPXszoplU4,2660
|
|
3
|
+
mistocr/core.py,sha256=wtaYZ_Fz0dXIb1cYLAtymwR9Z7_KBI4ULy-UpM3YTX4,7154
|
|
4
|
+
mistocr/refine.py,sha256=gWup79LGjmvKW5RyY1dRKUeAEt94mUJIeTZB3V4D-JE,4258
|
|
5
|
+
mistocr-0.1.0.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
6
|
+
mistocr-0.1.0.dist-info/METADATA,sha256=JOyUQONpYUmmGk2kFzMkxaIBrHwjC9CfmI7fc9qa6ms,4848
|
|
7
|
+
mistocr-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
mistocr-0.1.0.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
9
|
+
mistocr-0.1.0.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
10
|
+
mistocr-0.1.0.dist-info/RECORD,,
|
mistocr-0.0.3.dist-info/RECORD
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
mistocr/__init__.py,sha256=4GZKi13lDTD25YBkGakhZyEQZWTER_OWQMNPoH_UM2c,22
|
|
2
|
-
mistocr/_modidx.py,sha256=gViY05_Y4LdQXC5l2yEPG3MX-9M93xf4FJEGh3ns2Fo,1745
|
|
3
|
-
mistocr/core.py,sha256=Ur5R8NLvHxduvSVuWNkWwt8xgkrxpnL9cmJjQ5h9thM,6778
|
|
4
|
-
mistocr-0.0.3.dist-info/licenses/LICENSE,sha256=xV8xoN4VOL0uw9X8RSs2IMuD_Ss_a9yAbtGNeBWZwnw,11337
|
|
5
|
-
mistocr-0.0.3.dist-info/METADATA,sha256=aWl_wHxvy5Qrsze7JtTWMQ6FD-l-1QEM-7GZfTeem88,5076
|
|
6
|
-
mistocr-0.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
-
mistocr-0.0.3.dist-info/entry_points.txt,sha256=HjAiHozobM-alm_6bTF-ehRr2DD3KYE9PgRngelONOY,36
|
|
8
|
-
mistocr-0.0.3.dist-info/top_level.txt,sha256=LelTYnSpSXLy1Hb1m2YP3gt8luwP-I8KV0NjP_ucdSs,8
|
|
9
|
-
mistocr-0.0.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|