mistocr 0.4.1__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mistocr-0.4.1/mistocr.egg-info → mistocr-0.4.2}/PKG-INFO +20 -2
- {mistocr-0.4.1 → mistocr-0.4.2}/README.md +19 -1
- mistocr-0.4.2/mistocr/__init__.py +1 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr/core.py +14 -2
- {mistocr-0.4.1 → mistocr-0.4.2/mistocr.egg-info}/PKG-INFO +20 -2
- {mistocr-0.4.1 → mistocr-0.4.2}/settings.ini +1 -1
- mistocr-0.4.1/mistocr/__init__.py +0 -1
- {mistocr-0.4.1 → mistocr-0.4.2}/LICENSE +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/MANIFEST.in +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr/_modidx.py +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr/pipeline.py +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr/refine.py +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr.egg-info/SOURCES.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr.egg-info/dependency_links.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr.egg-info/entry_points.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr.egg-info/not-zip-safe +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr.egg-info/requires.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/mistocr.egg-info/top_level.txt +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/pyproject.toml +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/setup.cfg +0 -0
- {mistocr-0.4.1 → mistocr-0.4.2}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
|
|
|
113
113
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
114
114
|
```
|
|
115
115
|
|
|
116
|
-
|
|
116
|
+
mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
117
|
+
mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
|
|
118
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
|
|
119
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
120
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
121
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
122
|
+
mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
|
|
123
|
+
mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
|
|
124
|
+
mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
|
|
125
|
+
|
|
126
|
+
Describing 12 images...
|
|
127
|
+
|
|
128
|
+
mistocr.pipeline - INFO - Done!
|
|
129
|
+
|
|
130
|
+
Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
|
|
131
|
+
Adding descriptions to 12 pages...
|
|
132
|
+
Done! Enriched pages saved to files/test/md_test
|
|
133
|
+
|
|
134
|
+
This will:
|
|
117
135
|
|
|
118
136
|
1. OCR the PDF using Mistral’s batch API
|
|
119
137
|
2. Fix heading hierarchy inconsistencies
|
|
@@ -72,7 +72,25 @@ from mistocr.pipeline import pdf_to_md
|
|
|
72
72
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
73
73
|
```
|
|
74
74
|
|
|
75
|
-
|
|
75
|
+
mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
76
|
+
mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
|
|
77
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
|
|
78
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
79
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
80
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
81
|
+
mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
|
|
82
|
+
mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
|
|
83
|
+
mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
|
|
84
|
+
|
|
85
|
+
Describing 12 images...
|
|
86
|
+
|
|
87
|
+
mistocr.pipeline - INFO - Done!
|
|
88
|
+
|
|
89
|
+
Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
|
|
90
|
+
Adding descriptions to 12 pages...
|
|
91
|
+
Done! Enriched pages saved to files/test/md_test
|
|
92
|
+
|
|
93
|
+
This will:
|
|
76
94
|
|
|
77
95
|
1. OCR the PDF using Mistral’s batch API
|
|
78
96
|
2. Fix heading hierarchy inconsistencies
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.4.2"
|
|
@@ -51,12 +51,24 @@ def create_batch_entry(
|
|
|
51
51
|
path:str, # Path to PDF file,
|
|
52
52
|
url:str, # Mistral signed URL
|
|
53
53
|
cid:str=None, # Custom ID (by default using the file name without extension)
|
|
54
|
-
inc_img:bool=True # Include image in response
|
|
54
|
+
inc_img:bool=True, # Include image in response
|
|
55
|
+
extract_header:bool=False, # Extract headers from document
|
|
56
|
+
extract_footer:bool=False # Extract footers from document
|
|
55
57
|
) -> dict[str, str | dict[str, str | bool]]: # Batch entry dict
|
|
56
58
|
"Create a batch entry dict for OCR"
|
|
57
59
|
path = Path(path)
|
|
58
60
|
if not cid: cid = path.stem
|
|
59
|
-
return dict(
|
|
61
|
+
return dict(
|
|
62
|
+
custom_id=cid,
|
|
63
|
+
body=dict(
|
|
64
|
+
document=dict(
|
|
65
|
+
type="document_url",
|
|
66
|
+
document_url=url),
|
|
67
|
+
include_image_base64=inc_img,
|
|
68
|
+
extract_header=extract_header,
|
|
69
|
+
extract_footer=extract_footer
|
|
70
|
+
)
|
|
71
|
+
)
|
|
60
72
|
|
|
61
73
|
# %% ../nbs/00_core.ipynb 18
|
|
62
74
|
def prep_pdf_batch(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mistocr
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Batch OCR for PDFs with heading restoration and visual content integration
|
|
5
5
|
Home-page: https://github.com/franckalbinet/mistocr
|
|
6
6
|
Author: Solveit
|
|
@@ -113,7 +113,25 @@ from mistocr.pipeline import pdf_to_md
|
|
|
113
113
|
await pdf_to_md('files/test/resnet.pdf', 'files/test/md_test')
|
|
114
114
|
```
|
|
115
115
|
|
|
116
|
-
|
|
116
|
+
mistocr.pipeline - INFO - Step 1/3: Running OCR on files/test/resnet.pdf...
|
|
117
|
+
mistocr.core - INFO - Waiting for batch job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 (initial status: QUEUED)
|
|
118
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: QUEUED (elapsed: 0s)
|
|
119
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
120
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
121
|
+
mistocr.core - DEBUG - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 status: RUNNING (elapsed: 2s)
|
|
122
|
+
mistocr.core - INFO - Job 4ec899ca-ada8-4fa7-8894-0191ff6ac4e5 completed with status: SUCCESS
|
|
123
|
+
mistocr.pipeline - INFO - Step 2/3: Fixing heading hierarchy...
|
|
124
|
+
mistocr.pipeline - INFO - Step 3/3: Adding image descriptions...
|
|
125
|
+
|
|
126
|
+
Describing 12 images...
|
|
127
|
+
|
|
128
|
+
mistocr.pipeline - INFO - Done!
|
|
129
|
+
|
|
130
|
+
Saved descriptions to /tmp/tmp62c7_ac1/resnet/img_descriptions.json
|
|
131
|
+
Adding descriptions to 12 pages...
|
|
132
|
+
Done! Enriched pages saved to files/test/md_test
|
|
133
|
+
|
|
134
|
+
This will:
|
|
117
135
|
|
|
118
136
|
1. OCR the PDF using Mistral’s batch API
|
|
119
137
|
2. Fix heading hierarchy inconsistencies
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.4.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|