vlm4ocr 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlm4ocr/__init__.py +3 -1
- vlm4ocr/assets/default_prompt_templates/ocr_JSON_system_prompt.txt +1 -0
- vlm4ocr/cli.py +276 -287
- vlm4ocr/data_types.py +109 -0
- vlm4ocr/ocr_engines.py +363 -195
- vlm4ocr/utils.py +386 -39
- vlm4ocr/vlm_engines.py +316 -190
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/METADATA +5 -1
- vlm4ocr-0.3.0.dist-info/RECORD +17 -0
- vlm4ocr-0.1.0.dist-info/RECORD +0 -15
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/WHEEL +0 -0
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/entry_points.txt +0 -0
vlm4ocr/utils.py
CHANGED
|
@@ -1,49 +1,218 @@
|
|
|
1
|
+
import abc
|
|
1
2
|
import os
|
|
2
3
|
import io
|
|
3
4
|
import base64
|
|
4
|
-
from typing import List
|
|
5
|
-
|
|
5
|
+
from typing import Dict, List, Tuple
|
|
6
|
+
import json
|
|
7
|
+
import json_repair
|
|
8
|
+
import importlib.util
|
|
9
|
+
from pdf2image import convert_from_path, pdfinfo_from_path
|
|
6
10
|
from PIL import Image
|
|
11
|
+
import asyncio
|
|
12
|
+
import warnings
|
|
7
13
|
|
|
8
|
-
def get_images_from_pdf(file_path: str) -> List[Image.Image]:
|
|
9
|
-
""" Extracts images from a PDF file. """
|
|
10
|
-
try:
|
|
11
|
-
images = convert_from_path(file_path)
|
|
12
|
-
if not images:
|
|
13
|
-
print(f"Warning: No images extracted from PDF: {file_path}")
|
|
14
|
-
return images
|
|
15
|
-
except Exception as e:
|
|
16
|
-
print(f"Error converting PDF to images: {e}")
|
|
17
|
-
raise ValueError(f"Failed to process PDF file '{os.path.basename(file_path)}'. Ensure poppler is installed and the file is valid.") from e
|
|
18
14
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
for i in range(img.n_frames):
|
|
25
|
-
img.seek(i)
|
|
26
|
-
images.append(img.copy())
|
|
27
|
-
if not images:
|
|
28
|
-
print(f"Warning: No images extracted from TIFF: {file_path}")
|
|
29
|
-
return images
|
|
30
|
-
except FileNotFoundError:
|
|
31
|
-
raise FileNotFoundError(f"TIFF file not found: {file_path}")
|
|
32
|
-
except Exception as e:
|
|
33
|
-
print(f"Error processing TIFF file: {e}")
|
|
34
|
-
raise ValueError(f"Failed to process TIFF file '{os.path.basename(file_path)}'. Ensure the file is a valid TIFF.") from e
|
|
15
|
+
class DataLoader(abc.ABC):
    """
    Abstract interface for loaders that expose a file as a sequence of page images.

    Parameters:
    ----------
    file_path : str
        Path to the file to load.

    Raises:
    ------
    FileNotFoundError
        If file_path does not exist when the loader is constructed.
    """
    def __init__(self, file_path: str):
        self.file_path = file_path
        file_exists = os.path.exists(file_path)
        if not file_exists:
            raise FileNotFoundError(f"File not found: {file_path}")

    @abc.abstractmethod
    def get_all_pages(self) -> List[Image.Image]:
        """
        Returns every page of the file as a list of images.
        """

    @abc.abstractmethod
    def get_page(self, page_index:int) -> Image.Image:
        """
        Returns a single page of the file as an image.

        Parameters:
        ----------
        page_index : int
            Index of the page to retrieve.
        """

    @abc.abstractmethod
    async def get_page_async(self, page_index:int) -> Image.Image:
        """
        Coroutine counterpart of get_page.

        Parameters:
        ----------
        page_index : int
            Index of the page to retrieve.
        """

    @abc.abstractmethod
    def get_page_count(self) -> int:
        """ Returns the number of pages in the file. """
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class PDFDataLoader(DataLoader):
    """
    DataLoader for PDF files, rendering pages to images with pdf2image.

    Parameters:
    ----------
    file_path : str
        Path to the PDF file.
    """
    def __init__(self, file_path: str):
        super().__init__(file_path)
        # Metadata is read once at construction; get_page_count relies on it.
        self.info = pdfinfo_from_path(self.file_path, userpw=None, poppler_path=None)

    def _out_of_range_error(self, page_index:int) -> ValueError:
        """ Builds the error raised when page_index does not address a page. """
        return ValueError(f"Page index {page_index} out of range for PDF file '{os.path.basename(self.file_path)}'.")

    def get_all_pages(self) -> List[Image.Image]:
        """
        Extracts all pages from the PDF file.

        Returns:
        -------
        List[Image.Image]
            One image per page.

        Raises:
        ------
        ValueError
            If the PDF cannot be converted.
        """
        try:
            return convert_from_path(self.file_path)

        except Exception as e:
            print(f"Error converting PDF to images: {e}")
            raise ValueError(f"Failed to process PDF file '{os.path.basename(self.file_path)}'. Ensure poppler is installed and the file is valid.") from e

    def get_page(self, page_index:int) -> Image.Image:
        """
        Extracts a single page from the PDF file.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the page to retrieve.

        Returns:
        -------
        Image.Image
            The rendered page.

        Raises:
        ------
        ValueError
            If page_index is out of range or the PDF cannot be converted.
        """
        # Reject bad indexes up front so the caller gets an accurate message
        # instead of the generic "ensure poppler is installed" error.
        page_count = self.get_page_count()
        if page_index < 0 or (page_count and page_index >= page_count):
            raise self._out_of_range_error(page_index)

        try:
            # first_page/last_page are passed as page_index + 1 (one-based page numbers).
            pages = convert_from_path(self.file_path, first_page=page_index + 1, last_page=page_index + 1)
        except Exception as e:
            print(f"Error converting PDF to images: {e}")
            raise ValueError(f"Failed to process PDF file '{os.path.basename(self.file_path)}'. Ensure poppler is installed and the file is valid.") from e

        # The page count may be missing from the metadata (reported as 0), in
        # which case the range check above is skipped; guard the indexing too.
        if not pages:
            raise self._out_of_range_error(page_index)
        return pages[0]

    async def get_page_async(self, page_index:int) -> Image.Image:
        """
        Asynchronously extracts a page from the PDF file.

        Runs get_page in the running event loop's default executor.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the page to retrieve.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """ Returns the number of pages in the PDF file, or 0 if the metadata has no 'Pages' entry. """
        return self.info.get('Pages', 0)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class TIFFDataLoader(DataLoader):
    """
    DataLoader for (possibly multi-frame) TIFF files.

    Every method opens the file in a `with` block so the file handle is
    released as soon as the requested frames have been copied.

    Parameters:
    ----------
    file_path : str
        Path to the TIFF file.
    """
    def __init__(self, file_path: str):
        super().__init__(file_path)

    def get_all_pages(self) -> List[Image.Image]:
        """
        Extracts all frames from the TIFF file.

        Returns:
        -------
        List[Image.Image]
            A copy of each frame, in file order.

        Raises:
        ------
        ValueError
            If the file cannot be read.
        """
        try:
            with Image.open(self.file_path) as img:
                images = []
                for i in range(img.n_frames):
                    img.seek(i)
                    # copy() detaches the frame from the file handle being closed
                    images.append(img.copy())
                return images
        except Exception as e:
            print(f"Error extracting images from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e

    def get_page(self, page_index:int) -> Image.Image:
        """
        Extracts a single frame from the TIFF file.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the frame to retrieve.

        Returns:
        -------
        Image.Image
            A copy of the requested frame.

        Raises:
        ------
        ValueError
            If page_index is out of range or the file cannot be read.
        """
        try:
            with Image.open(self.file_path) as img:
                img.seek(page_index)
                return img.copy()
        except (EOFError, IndexError):
            # Pillow signals a seek outside the frame sequence with EOFError;
            # IndexError is kept for backward compatibility.
            raise ValueError(f"Page index {page_index} out of range for TIFF file '{os.path.basename(self.file_path)}'.") from None
        except Exception as e:
            print(f"Error extracting page {page_index} from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e

    async def get_page_async(self, page_index:int) -> Image.Image:
        """
        Asynchronously extracts a frame from the TIFF file.

        Runs get_page in the running event loop's default executor.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the frame to retrieve.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """
        Returns the number of frames (pages) in the TIFF file.

        Raises:
        ------
        ValueError
            If the file cannot be read.
        """
        try:
            with Image.open(self.file_path) as img:
                return img.n_frames
        except Exception as e:
            print(f"Error getting page count from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class ImageDataLoader(DataLoader):
    """
    DataLoader for a single image file, treated as a one-page document.
    """
    def _load_image(self) -> Image.Image:
        """
        Opens the image at self.file_path and forces its pixel data to be read.

        Raises:
        ------
        FileNotFoundError
            If the file no longer exists.
        ValueError
            If the file cannot be loaded as an image.
        """
        try:
            loaded = Image.open(self.file_path)
            loaded.load()
        except FileNotFoundError:
            raise FileNotFoundError(f"Image file not found: {self.file_path}")
        except Exception as e:
            raise ValueError(f"Failed to load image file '{os.path.basename(self.file_path)}': {e}") from e
        return loaded

    def get_all_pages(self) -> List[Image.Image]:
        """
        Returns a one-element list containing the loaded image.
        """
        return [self._load_image()]

    def get_page(self, page_index:int) -> Image.Image:
        """
        Returns the loaded image.

        Parameters:
        ----------
        page_index : int
            Index of the page to retrieve. Ignored: a single image file has only one page.
        """
        return self._load_image()

    async def get_page_async(self, page_index:int) -> Image.Image:
        """
        Coroutine counterpart of get_page; runs it in the running loop's default executor.

        Parameters:
        ----------
        page_index : int
            Index of the page to retrieve. Ignored: a single image file has only one page.
        """
        event_loop = asyncio.get_running_loop()
        return await event_loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """ Always returns 1: a single image file holds exactly one page. """
        return 1
|
|
47
216
|
|
|
48
217
|
|
|
49
218
|
def image_to_base64(image:Image.Image, format:str="png") -> str:
|
|
@@ -61,4 +230,182 @@ def image_to_base64(image:Image.Image, format:str="png") -> str:
|
|
|
61
230
|
|
|
62
231
|
def clean_markdown(text:str) -> str:
    """
    Removes Markdown code-fence markers from the text.

    Every "```markdown" opener is stripped first, then any remaining "```";
    the content between the fences is left untouched.
    """
    for fence in ("```markdown", "```"):
        text = text.replace(fence, "")
    return text
|
|
234
|
+
|
|
235
|
+
def _find_dict_strings( text: str) -> List[str]:
|
|
236
|
+
"""
|
|
237
|
+
Extracts balanced JSON-like dictionaries from a string, even if nested.
|
|
238
|
+
|
|
239
|
+
Parameters:
|
|
240
|
+
-----------
|
|
241
|
+
text : str
|
|
242
|
+
the input text containing JSON-like structures.
|
|
243
|
+
|
|
244
|
+
Returns : List[str]
|
|
245
|
+
A list of valid JSON-like strings representing dictionaries.
|
|
246
|
+
"""
|
|
247
|
+
open_brace = 0
|
|
248
|
+
start = -1
|
|
249
|
+
json_objects = []
|
|
250
|
+
|
|
251
|
+
for i, char in enumerate(text):
|
|
252
|
+
if char == '{':
|
|
253
|
+
if open_brace == 0:
|
|
254
|
+
# start of a new JSON object
|
|
255
|
+
start = i
|
|
256
|
+
open_brace += 1
|
|
257
|
+
elif char == '}':
|
|
258
|
+
open_brace -= 1
|
|
259
|
+
if open_brace == 0 and start != -1:
|
|
260
|
+
json_objects.append(text[start:i + 1])
|
|
261
|
+
start = -1
|
|
262
|
+
|
|
263
|
+
return json_objects
|
|
264
|
+
|
|
265
|
+
def extract_json(gen_text:str) -> List[Dict[str, str]]:
    """
    Parses every dictionary-like substring found in the generated text.

    Each candidate returned by _find_dict_strings is parsed with json.loads.
    If that fails, json_repair.repair_json is tried; a RuntimeWarning reports
    whether the repair produced a usable object. Candidates that cannot be
    repaired are skipped.

    Parameters:
    ----------
    gen_text : str
        The generated text to scan.

    Returns:
    -------
    List[Dict[str, str]]
        The parsed (or repaired) objects, in order of appearance.
    """
    results = []
    for candidate in _find_dict_strings(gen_text):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            repaired = json_repair.repair_json(candidate, skip_json_loads=True, return_objects=True)
            if not repaired:
                warnings.warn(f'JSONDecodeError could not be fixed:\n{candidate}', RuntimeWarning)
                continue
            warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{candidate}', RuntimeWarning)
            results.append(repaired)
        else:
            results.append(parsed)
    return results
|
|
283
|
+
|
|
284
|
+
def get_default_page_delimiter(output_mode:str) -> str:
    """
    Returns the default page delimiter for the given output mode.

    Parameters:
    ----------
    output_mode : str
        The output mode: "markdown", "HTML", "text", or "JSON".

    Returns:
    -------
    str
        "<br><br>" for "HTML"; "\\n\\n---\\n\\n" for every other supported mode.

    Raises:
    ------
    ValueError
        If output_mode is not one of the supported modes.
    """
    supported_modes = ("markdown", "HTML", "text", "JSON")
    if output_mode not in supported_modes:
        raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'")

    # HTML is the only mode with its own separator.
    return "<br><br>" if output_mode == "HTML" else "\n\n---\n\n"
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
class ImageProcessor:
    """
    Image pre-processing helpers: rotation correction and down-scaling.
    """
    def __init__(self):
        # True when the optional pytesseract package can be imported.
        self.has_tesseract = importlib.util.find_spec("pytesseract") is not None

    def rotate_correction(self, image: Image.Image) -> Tuple[Image.Image, int]:
        """
        Uses Tesseract OSD (via pytesseract) to correct the rotation of the image.

        Parameters:
        ----------
        image : Image.Image
            The image to be corrected.

        Returns:
        -------
        Image.Image
            The corrected image (the input image itself when no rotation is needed).
        int
            The rotation angle in degrees reported by OSD (0 when no rotation is needed).

        Raises:
        ------
        ImportError
            If pytesseract is not installed.
        ValueError
            If OSD detection or the rotation fails.
        """
        if importlib.util.find_spec("pytesseract") is None:
            raise ImportError("pytesseract is not installed. Please install it to use this feature.")

        # Imported lazily because pytesseract is an optional dependency.
        import pytesseract

        try:
            osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)
            rotation_angle = osd['rotate']
            if rotation_angle != 0:
                # NOTE(review): PIL's rotate() turns counter-clockwise for positive
                # angles; confirm this matches the direction of OSD's 'rotate' value.
                return image.rotate(rotation_angle, expand=True), rotation_angle

            return image, 0
        except Exception as e:
            print(f"Error correcting image rotation: {e}")
            raise ValueError(f"Failed to correct image rotation: {e}") from e

    async def rotate_correction_async(self, image: Image.Image) -> Tuple[Image.Image, int]:
        """
        Asynchronous version of rotate_correction; runs it in the running loop's default executor.

        Parameters:
        ----------
        image : Image.Image
            The image to be corrected.

        Returns:
        -------
        Image.Image
            The corrected image.
        int
            The rotation angle in degrees.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.rotate_correction, image)

    def resize(self, image: Image.Image, max_dimension_pixels:int=4000) -> Tuple[Image.Image, bool]:
        """
        Resizes the image to fit within the specified maximum dimension while maintaining aspect ratio.

        Parameters:
        ----------
        image : Image.Image
            The image to resize.
        max_dimension_pixels : int
            The maximum dimension (width or height) in pixels. Must be positive.

        Returns:
        -------
        Image.Image
            The resized image (the input image itself when no resizing is needed).
        bool
            True if the image was resized, False otherwise.

        Raises:
        ------
        ValueError
            If max_dimension_pixels is not positive.
        """
        if max_dimension_pixels <= 0:
            raise ValueError("max_dimension_pixels must be a positive integer")

        width, height = image.size
        if width <= max_dimension_pixels and height <= max_dimension_pixels:
            return image, False  # No resizing needed

        # Pin the longer side to the limit and scale the shorter side. The
        # shorter side is clamped to at least 1 px: for extreme aspect ratios
        # int() would truncate it to 0, a size image.resize cannot produce.
        if width > height:
            new_width = max_dimension_pixels
            new_height = max(1, int((max_dimension_pixels / width) * height))
        else:
            new_height = max_dimension_pixels
            new_width = max(1, int((max_dimension_pixels / height) * width))
        return image.resize((new_width, new_height), resample=Image.Resampling.LANCZOS), True  # Resizing was done

    async def resize_async(self, image: Image.Image, max_dimension_pixels:int=4000) -> Tuple[Image.Image, bool]:
        """
        Asynchronous version of resize; runs it in the running loop's default executor.

        Parameters:
        ----------
        image : Image.Image
            The image to resize.
        max_dimension_pixels : int
            The maximum dimension (width or height) in pixels.

        Returns:
        -------
        Image.Image
            The resized image.
        bool
            True if the image was resized, False otherwise.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.resize, image, max_dimension_pixels)
|