vlm4ocr 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vlm4ocr/utils.py CHANGED
@@ -1,49 +1,218 @@
1
+ import abc
1
2
  import os
2
3
  import io
3
4
  import base64
4
- from typing import List
5
- from pdf2image import convert_from_path
5
+ from typing import Dict, List, Tuple
6
+ import json
7
+ import json_repair
8
+ import importlib.util
9
+ from pdf2image import convert_from_path, pdfinfo_from_path
6
10
  from PIL import Image
11
+ import asyncio
12
+ import warnings
7
13
 
8
- def get_images_from_pdf(file_path: str) -> List[Image.Image]:
9
- """ Extracts images from a PDF file. """
10
- try:
11
- images = convert_from_path(file_path)
12
- if not images:
13
- print(f"Warning: No images extracted from PDF: {file_path}")
14
- return images
15
- except Exception as e:
16
- print(f"Error converting PDF to images: {e}")
17
- raise ValueError(f"Failed to process PDF file '{os.path.basename(file_path)}'. Ensure poppler is installed and the file is valid.") from e
18
14
 
19
- def get_images_from_tiff(file_path: str) -> List[Image.Image]:
20
- """ Extracts images from a TIFF file. """
21
- images = []
22
- try:
23
- img = Image.open(file_path)
24
- for i in range(img.n_frames):
25
- img.seek(i)
26
- images.append(img.copy())
27
- if not images:
28
- print(f"Warning: No images extracted from TIFF: {file_path}")
29
- return images
30
- except FileNotFoundError:
31
- raise FileNotFoundError(f"TIFF file not found: {file_path}")
32
- except Exception as e:
33
- print(f"Error processing TIFF file: {e}")
34
- raise ValueError(f"Failed to process TIFF file '{os.path.basename(file_path)}'. Ensure the file is a valid TIFF.") from e
15
class DataLoader(abc.ABC):
    """
    Abstract base class for loaders that expose a file as a sequence of page images.

    Subclasses implement get_all_pages, get_page and get_page_count.
    get_page_async has a default implementation that subclasses may override.

    Parameters:
    ----------
    file_path : str
        Path to the input file. The path must exist when the loader is created.

    Raises:
    ------
    FileNotFoundError
        If file_path does not exist.
    """
    def __init__(self, file_path: str):
        self.file_path = file_path
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

    @abc.abstractmethod
    def get_all_pages(self) -> List[Image.Image]:
        """
        Abstract method to get all pages from the file, in page order.
        """
        pass

    @abc.abstractmethod
    def get_page(self, page_index:int) -> Image.Image:
        """
        Abstract method to get a single page from the file.

        Parameters:
        ----------
        page_index : int
            Index of the page to retrieve.
        """
        pass

    async def get_page_async(self, page_index:int) -> Image.Image:
        """
        Asynchronously gets a single page from the file.

        The default implementation runs the synchronous get_page in the running
        event loop's default executor, so the blocking file/decoding work does
        not stall the loop. Subclasses may override it.

        Parameters:
        ----------
        page_index : int
            Index of the page to retrieve.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.get_page, page_index)

    @abc.abstractmethod
    def get_page_count(self) -> int:
        """ Returns the number of pages in the file. """
        pass
56
+
57
+
58
class PDFDataLoader(DataLoader):
    """
    DataLoader for PDF files, rendered to images through pdf2image.

    Parameters:
    ----------
    file_path : str
        Path to the PDF file.
    """
    def __init__(self, file_path: str):
        super().__init__(file_path)
        # Document metadata is read once here; get_page_count() looks up "Pages" in it.
        self.info = pdfinfo_from_path(self.file_path, userpw=None, poppler_path=None)

    def get_all_pages(self) -> List[Image.Image]:
        """
        Extracts all pages from a PDF file.

        Raises:
        ------
        ValueError
            If the PDF cannot be converted to images.
        """
        try:
            return convert_from_path(self.file_path)

        except Exception as e:
            print(f"Error converting PDF to images: {e}")
            raise ValueError(f"Failed to process PDF file '{os.path.basename(self.file_path)}'. Ensure poppler is installed and the file is valid.") from e

    def get_page(self, page_index:int) -> Image.Image:
        """
        Extracts a page from a PDF file.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the page to retrieve.

        Raises:
        ------
        ValueError
            If page_index is out of range, or if the PDF cannot be converted.
        """
        try:
            # pdf2image page numbers are 1-based.
            pages = convert_from_path(self.file_path, first_page=page_index + 1, last_page=page_index + 1)
        except Exception as e:
            print(f"Error converting PDF to images: {e}")
            raise ValueError(f"Failed to process PDF file '{os.path.basename(self.file_path)}'. Ensure poppler is installed and the file is valid.") from e

        # An empty result means the requested page does not exist. Report that
        # explicitly instead of letting pages[0] fail and be reported as a
        # poppler/installation problem.
        if page_index < 0 or not pages:
            raise ValueError(f"Page index {page_index} out of range for PDF file '{os.path.basename(self.file_path)}'.")
        return pages[0]

    async def get_page_async(self, page_index:int) -> Image.Image:
        """
        Asynchronously extracts a page from a PDF file.

        Runs get_page in the running event loop's default executor.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the page to retrieve.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """ Returns the number of pages in the PDF file (0 if the metadata has no 'Pages' entry). """
        return self.info.get('Pages', 0)
106
+
107
+
108
class TIFFDataLoader(DataLoader):
    """
    DataLoader for (multi-frame) TIFF files; every frame is treated as one page.

    Each method opens the file in a `with` block so the underlying file handle
    is closed before returning. Returned pages are copies and stay usable.
    """

    def get_all_pages(self) -> List[Image.Image]:
        """
        Extracts all frames from a TIFF file.

        Raises:
        ------
        ValueError
            If the file cannot be read as a TIFF image.
        """
        try:
            with Image.open(self.file_path) as img:
                images = []
                for i in range(img.n_frames):
                    img.seek(i)
                    # copy() detaches the frame from the file-backed image.
                    images.append(img.copy())
                return images
        except Exception as e:
            print(f"Error extracting images from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e

    def get_page(self, page_index:int) -> Image.Image:
        """
        Extracts a page from a TIFF file.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the frame to retrieve.

        Raises:
        ------
        ValueError
            If page_index is out of range, or if the file cannot be read.
        """
        try:
            with Image.open(self.file_path) as img:
                img.seek(page_index)
                return img.copy()
        except (EOFError, IndexError):
            # Pillow signals a seek outside the frame sequence with EOFError.
            raise ValueError(f"Page index {page_index} out of range for TIFF file '{os.path.basename(self.file_path)}'.") from None
        except Exception as e:
            print(f"Error extracting page {page_index} from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e

    async def get_page_async(self, page_index:int) -> Image.Image:
        """
        Asynchronously extracts a page from a TIFF file.

        Runs get_page in the running event loop's default executor.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the frame to retrieve.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """ Returns the number of images (pages) in the TIFF file. """
        try:
            with Image.open(self.file_path) as img:
                return img.n_frames
        except Exception as e:
            print(f"Error getting page count from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e
167
+
168
+
169
class ImageDataLoader(DataLoader):
    """
    DataLoader for a single image file, presented as a one-page document.
    """

    def _load_image(self) -> Image.Image:
        """
        Opens the image at self.file_path and forces its pixel data to be read.

        Raises:
        ------
        FileNotFoundError
            If the file no longer exists.
        ValueError
            If the file cannot be loaded as an image.
        """
        try:
            loaded = Image.open(self.file_path)
            loaded.load()
        except FileNotFoundError:
            raise FileNotFoundError(f"Image file not found: {self.file_path}")
        except Exception as e:
            raise ValueError(f"Failed to load image file '{os.path.basename(self.file_path)}': {e}") from e
        return loaded

    def get_all_pages(self) -> List[Image.Image]:
        """
        Loads the image file and returns it as a one-element list.
        """
        return [self._load_image()]

    def get_page(self, page_index:int) -> Image.Image:
        """
        Loads the image file.

        Parameters:
        ----------
        page_index : int
            Accepted for interface compatibility; it is not used because the
            file holds a single image.
        """
        return self._load_image()

    async def get_page_async(self, page_index:int) -> Image.Image:
        """
        Asynchronously loads the image file by running get_page in the running
        event loop's default executor.

        Parameters:
        ----------
        page_index : int
            Accepted for interface compatibility; not used for single image files.
        """
        running_loop = asyncio.get_running_loop()
        return await running_loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """ Always returns 1: a single image file has exactly one page. """
        return 1
47
216
 
48
217
 
49
218
  def image_to_base64(image:Image.Image, format:str="png") -> str:
@@ -61,4 +230,182 @@ def image_to_base64(image:Image.Image, format:str="png") -> str:
61
230
 
62
231
def clean_markdown(text:str) -> str:
    """
    Removes Markdown code-fence markers from a string.

    The opening "```markdown" marker is removed first, then every remaining
    "```" marker. All other text is left untouched.

    Parameters:
    ----------
    text : str
        The text to clean.

    Returns:
    -------
    str
        The text without the fence markers.
    """
    without_opening_fence = text.replace("```markdown", "")
    return without_opening_fence.replace("```", "")
234
+
235
+ def _find_dict_strings( text: str) -> List[str]:
236
+ """
237
+ Extracts balanced JSON-like dictionaries from a string, even if nested.
238
+
239
+ Parameters:
240
+ -----------
241
+ text : str
242
+ the input text containing JSON-like structures.
243
+
244
+ Returns : List[str]
245
+ A list of valid JSON-like strings representing dictionaries.
246
+ """
247
+ open_brace = 0
248
+ start = -1
249
+ json_objects = []
250
+
251
+ for i, char in enumerate(text):
252
+ if char == '{':
253
+ if open_brace == 0:
254
+ # start of a new JSON object
255
+ start = i
256
+ open_brace += 1
257
+ elif char == '}':
258
+ open_brace -= 1
259
+ if open_brace == 0 and start != -1:
260
+ json_objects.append(text[start:i + 1])
261
+ start = -1
262
+
263
+ return json_objects
264
+
265
def extract_json(gen_text:str) -> List[Dict[str, str]]:
    """
    Parses every JSON-like dictionary found in a generated text.

    Candidate substrings come from _find_dict_strings. Each one is parsed with
    json.loads. If that fails, json_repair.repair_json is tried. A RuntimeWarning
    is issued when a repair was needed, and also when the repair produced
    nothing usable, in which case the candidate is skipped.

    Parameters:
    ----------
    gen_text : str
        The generated text to search.

    Returns:
    -------
    List[Dict[str, str]]
        The parsed (or repaired) objects, in order of appearance.
    """
    parsed_objects = []
    for candidate in _find_dict_strings(gen_text):
        try:
            obj = json.loads(candidate)
        except json.JSONDecodeError:
            obj = json_repair.repair_json(candidate, skip_json_loads=True, return_objects=True)
            if not obj:
                warnings.warn(f'JSONDecodeError could not be fixed:\n{candidate}', RuntimeWarning)
                continue
            warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{candidate}', RuntimeWarning)
        parsed_objects.append(obj)
    return parsed_objects
283
+
284
def get_default_page_delimiter(output_mode:str) -> str:
    """
    Returns the default page delimiter for an output mode.

    Parameters:
    ----------
    output_mode : str
        The output mode, which can be "markdown", "HTML", "text", or "JSON".

    Returns:
    -------
    str
        The default page delimiter: "<br><br>" for "HTML" and
        "\\n\\n---\\n\\n" for the other modes.

    Raises:
    ------
    ValueError
        If output_mode is not one of the supported modes.
    """
    rule_delimiter = "\n\n---\n\n"
    delimiter_by_mode = (
        ("markdown", rule_delimiter),
        ("HTML", "<br><br>"),
        ("text", rule_delimiter),
        ("JSON", rule_delimiter),
    )
    for mode, delimiter in delimiter_by_mode:
        if output_mode == mode:
            return delimiter

    raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'")
309
+
310
+
311
class ImageProcessor:
    """
    Image pre-processing helpers: orientation correction (via Tesseract OSD)
    and down-scaling, each with a synchronous and an asynchronous variant.
    """
    def __init__(self):
        # True when the optional pytesseract package can be found.
        self.has_tesseract = importlib.util.find_spec("pytesseract") is not None

    def rotate_correction(self, image: Image.Image) -> Tuple[Image.Image, int]:
        """
        This method uses Tesseract OSD to correct the rotation of the image.

        Parameters:
        ----------
        image : Image.Image
            The image to be corrected.

        Returns:
        -------
        Image.Image
            The corrected image (the input image itself when no rotation is needed).
        int
            The rotation angle in degrees, as reported by Tesseract OSD.

        Raises:
        ------
        ImportError
            If pytesseract is not installed.
        ValueError
            If orientation detection or the rotation fails.
        """
        if importlib.util.find_spec("pytesseract") is None:
            raise ImportError("pytesseract is not installed. Please install it to use this feature.")

        # Imported lazily because pytesseract is an optional dependency.
        import pytesseract

        try:
            osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)
            rotation_angle = osd['rotate']
            if rotation_angle != 0:
                # Tesseract's "rotate" value is the clockwise rotation needed to
                # make the text upright, while PIL's Image.rotate() turns the
                # image counter-clockwise. The angle is negated so that 90 and
                # 270 degree pages are turned the right way.
                return image.rotate(-rotation_angle, expand=True), rotation_angle

            return image, 0
        except Exception as e:
            print(f"Error correcting image rotation: {e}")
            raise ValueError(f"Failed to correct image rotation: {e}") from e

    async def rotate_correction_async(self, image: Image.Image) -> Tuple[Image.Image, int]:
        """
        Asynchronous version of rotate_correction method.

        Runs rotate_correction in the running event loop's default executor.

        Parameters:
        ----------
        image : Image.Image
            The image to be corrected.

        Returns:
        -------
        Image.Image
            The corrected image.
        int
            The rotation angle in degrees.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.rotate_correction, image)

    def resize(self, image: Image.Image, max_dimension_pixels:int=4000) -> Tuple[Image.Image, bool]:
        """
        Resizes the image to fit within the specified maximum dimension while maintaining aspect ratio.

        Parameters:
        ----------
        image : Image.Image
            The image to resize.
        max_dimension_pixels : int
            The maximum dimension (width or height) in pixels. Must be at least 1.

        Returns:
        -------
        Image.Image
            The resized image (the input image itself when no resizing is needed).
        bool
            True if the image was resized, False otherwise.

        Raises:
        ------
        ValueError
            If max_dimension_pixels is smaller than 1.
        """
        if max_dimension_pixels < 1:
            raise ValueError("max_dimension_pixels must be at least 1")

        width, height = image.size
        if width <= max_dimension_pixels and height <= max_dimension_pixels:
            return image, False # No resizing needed

        # The longer side becomes max_dimension_pixels; the shorter side is
        # scaled by the same factor. It is clamped to 1 pixel because very
        # elongated images would otherwise truncate to 0, which PIL rejects.
        if width > height:
            new_width = max_dimension_pixels
            new_height = max(1, int((max_dimension_pixels / width) * height))
        else:
            new_height = max_dimension_pixels
            new_width = max(1, int((max_dimension_pixels / height) * width))
        return image.resize((new_width, new_height), resample=Image.Resampling.LANCZOS), True # Resizing was done

    async def resize_async(self, image: Image.Image, max_dimension_pixels:int=4000) -> Tuple[Image.Image, bool]:
        """
        Asynchronous version of resize method.

        Runs resize in the running event loop's default executor.

        Parameters:
        ----------
        image : Image.Image
            The image to resize.
        max_dimension_pixels : int
            The maximum dimension (width or height) in pixels.

        Returns:
        -------
        Image.Image
            The resized image.
        bool
            True if the image was resized, False otherwise.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.resize, image, max_dimension_pixels)