vlm4ocr 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vlm4ocr/utils.py CHANGED
@@ -1,49 +1,215 @@
1
+ import abc
1
2
  import os
2
3
  import io
3
4
  import base64
4
- from typing import List
5
- from pdf2image import convert_from_path
5
+ from typing import Union, List, Tuple
6
+ import importlib.util
7
+ from pdf2image import convert_from_path, pdfinfo_from_path
6
8
  from PIL import Image
9
+ import asyncio
7
10
 
8
- def get_images_from_pdf(file_path: str) -> List[Image.Image]:
9
- """ Extracts images from a PDF file. """
10
- try:
11
- images = convert_from_path(file_path)
12
- if not images:
13
- print(f"Warning: No images extracted from PDF: {file_path}")
14
- return images
15
- except Exception as e:
16
- print(f"Error converting PDF to images: {e}")
17
- raise ValueError(f"Failed to process PDF file '{os.path.basename(file_path)}'. Ensure poppler is installed and the file is valid.") from e
18
11
 
19
- def get_images_from_tiff(file_path: str) -> List[Image.Image]:
20
- """ Extracts images from a TIFF file. """
21
- images = []
22
- try:
23
- img = Image.open(file_path)
24
- for i in range(img.n_frames):
25
- img.seek(i)
26
- images.append(img.copy())
27
- if not images:
28
- print(f"Warning: No images extracted from TIFF: {file_path}")
29
- return images
30
- except FileNotFoundError:
31
- raise FileNotFoundError(f"TIFF file not found: {file_path}")
32
- except Exception as e:
33
- print(f"Error processing TIFF file: {e}")
34
- raise ValueError(f"Failed to process TIFF file '{os.path.basename(file_path)}'. Ensure the file is a valid TIFF.") from e
12
class DataLoader(abc.ABC):
    """
    Abstract base class for loaders that expose a file as page images.

    Subclasses implement page access for one file format. The constructor
    records the path and checks that it exists.
    """

    def __init__(self, file_path: str):
        """
        Parameters:
        ----------
        file_path : str
            Path to the input file.

        Raises:
        ------
        FileNotFoundError
            If file_path does not exist.
        """
        self.file_path = file_path
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

    @abc.abstractmethod
    def get_all_pages(self) -> List[Image.Image]:
        """
        Abstract method to get all pages from the file.
        """
        pass

    @abc.abstractmethod
    def get_page(self, page_index: int) -> Image.Image:
        """
        Abstract method to get a single page from the file.

        Parameters:
        ----------
        page_index : int
            Index of the page to retrieve.
        """
        pass

    @abc.abstractmethod
    async def get_page_async(self, page_index: int) -> Image.Image:
        """
        Abstract asynchronous method to get a single page from the file.

        Parameters:
        ----------
        page_index : int
            Index of the page to retrieve.
        """
        pass

    @abc.abstractmethod
    def get_page_count(self) -> int:
        """ Returns the number of pages in the file. """
        pass
53
+
54
+
55
class PDFDataLoader(DataLoader):
    """
    DataLoader for PDF files, rendering pages to images through pdf2image.
    """

    def __init__(self, file_path: str):
        """
        Parameters:
        ----------
        file_path : str
            Path to the PDF file.

        Raises:
        ------
        FileNotFoundError
            If file_path does not exist (raised by the base class).
        """
        super().__init__(file_path)
        # Document metadata is read once; get_page_count() looks up 'Pages' in it.
        self.info = pdfinfo_from_path(self.file_path, userpw=None, poppler_path=None)

    def get_all_pages(self) -> List[Image.Image]:
        """
        Extracts all pages from the PDF file.

        Raises:
        ------
        ValueError
            If the conversion fails for any reason.
        """
        try:
            return convert_from_path(self.file_path)
        except Exception as e:
            print(f"Error converting PDF to images: {e}")
            raise ValueError(f"Failed to process PDF file '{os.path.basename(self.file_path)}'. Ensure poppler is installed and the file is valid.") from e

    def get_page(self, page_index: int) -> Image.Image:
        """
        Extracts a single page from the PDF file.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the page to retrieve.

        Raises:
        ------
        ValueError
            If page_index is out of range, or if the conversion fails.
        """
        file_name = os.path.basename(self.file_path)
        out_of_range = f"Page index {page_index} out of range for PDF file '{file_name}'."

        # Reject bad indexes up front so the caller gets an accurate message
        # rather than the generic conversion failure. The upper bound is only
        # enforced when the metadata actually reports a page count.
        if page_index < 0 or ('Pages' in self.info and page_index >= self.info['Pages']):
            raise ValueError(out_of_range)

        try:
            # first_page/last_page are passed as page_index + 1 (one-based).
            pages = convert_from_path(self.file_path, first_page=page_index + 1, last_page=page_index + 1)
        except Exception as e:
            print(f"Error converting PDF to images: {e}")
            raise ValueError(f"Failed to process PDF file '{file_name}'. Ensure poppler is installed and the file is valid.") from e

        # Guard against an empty result when no page count was available above.
        if not pages:
            raise ValueError(out_of_range)
        return pages[0]

    async def get_page_async(self, page_index: int) -> Image.Image:
        """
        Asynchronously extracts a page from the PDF file.

        Runs get_page in the running loop's default executor.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the page to retrieve.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """ Returns the number of pages in the PDF file (0 if the metadata has no 'Pages' entry). """
        return self.info.get('Pages', 0)
103
+
104
+
105
class TIFFDataLoader(DataLoader):
    """
    DataLoader for (multi-frame) TIFF files, treating each frame as a page.
    """

    def get_all_pages(self) -> List[Image.Image]:
        """
        Extracts all frames from the TIFF file.

        Raises:
        ------
        ValueError
            If the file cannot be opened or read.
        """
        try:
            # The context manager closes the file; the copies are returned
            # instead of the opened image.
            with Image.open(self.file_path) as img:
                pages = []
                for frame_index in range(img.n_frames):
                    img.seek(frame_index)
                    pages.append(img.copy())
                return pages
        except Exception as e:
            print(f"Error extracting images from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e

    def get_page(self, page_index: int) -> Image.Image:
        """
        Extracts a single frame from the TIFF file.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the page to retrieve.

        Raises:
        ------
        ValueError
            If page_index is out of range, or if the file cannot be read.
        """
        try:
            with Image.open(self.file_path) as img:
                img.seek(page_index)
                return img.copy()
        except (EOFError, IndexError):
            # Pillow reports a seek outside the frame sequence as EOFError.
            raise ValueError(f"Page index {page_index} out of range for TIFF file '{os.path.basename(self.file_path)}'.") from None
        except Exception as e:
            print(f"Error extracting page {page_index} from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e

    async def get_page_async(self, page_index: int) -> Image.Image:
        """
        Asynchronously extracts a single frame from the TIFF file.

        Runs get_page in the running loop's default executor.

        Parameters:
        ----------
        page_index : int
            Zero-based index of the page to retrieve.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """ Returns the number of images (pages) in the TIFF file. """
        try:
            with Image.open(self.file_path) as img:
                return img.n_frames
        except Exception as e:
            print(f"Error getting page count from TIFF: {e}")
            raise ValueError(f"Failed to process TIFF file '{os.path.basename(self.file_path)}'. Ensure the file is valid.") from e
164
+
165
+
166
class ImageDataLoader(DataLoader):
    """
    DataLoader for single-image files; the file is exposed as one page.
    """

    def get_all_pages(self) -> List[Image.Image]:
        """
        Loads the image file and returns it as a one-element list.

        Raises:
        ------
        FileNotFoundError
            If the file no longer exists when it is opened.
        ValueError
            If the image cannot be loaded for any other reason.
        """
        # Same loading and error handling as get_page; the index is unused there.
        return [self.get_page(0)]

    def get_page(self, page_index: int) -> Image.Image:
        """
        Loads the image file.

        Parameters:
        ----------
        page_index : int
            Ignored. Not applicable for single image files; the same image is
            returned for any value.

        Raises:
        ------
        FileNotFoundError
            If the file no longer exists when it is opened.
        ValueError
            If the image cannot be loaded for any other reason.
        """
        try:
            image = Image.open(self.file_path)
            # Load explicitly so that decoding errors surface inside this try block.
            image.load()
            return image
        except FileNotFoundError:
            raise FileNotFoundError(f"Image file not found: {self.file_path}")
        except Exception as e:
            raise ValueError(f"Failed to load image file '{os.path.basename(self.file_path)}': {e}") from e

    async def get_page_async(self, page_index: int) -> Image.Image:
        """
        Asynchronously loads the image file.

        Runs get_page in the running loop's default executor.

        Parameters:
        ----------
        page_index : int
            Ignored. Not applicable for single image files.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.get_page, page_index)

    def get_page_count(self) -> int:
        """ Returns 1 as there is only one image in a single image file. """
        return 1
47
213
 
48
214
 
49
215
  def image_to_base64(image:Image.Image, format:str="png") -> str:
@@ -61,4 +227,131 @@ def image_to_base64(image:Image.Image, format:str="png") -> str:
61
227
 
62
228
def clean_markdown(text:str) -> str:
    """
    Removes Markdown code-fence markers from the text.

    Every "```markdown" opener is removed first, then every remaining "```".
    All other characters, including surrounding newlines, are left untouched.

    Parameters:
    ----------
    text : str
        The text to clean.

    Returns:
    -------
    str
        The text without code-fence markers.
    """
    cleaned_text = text.replace("```markdown", "").replace("```", "")
    return cleaned_text
231
+
232
def get_default_page_delimiter(output_mode:str) -> str:
    """
    Returns the default page delimiter for the given output mode.

    Parameters:
    ----------
    output_mode : str
        The output mode, which can be "markdown", "HTML", or "text".
        Matching is case-sensitive.

    Returns:
    -------
    str
        The default page delimiter.

    Raises:
    ------
    ValueError
        If output_mode is not one of the supported modes.
    """
    delimiters = {
        "markdown": "\n\n---\n\n",
        "HTML": "<br><br>",
        "text": "\n\n---\n\n",
    }
    try:
        return delimiters[output_mode]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, so they get the same error too.
        raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'") from None
255
+
256
+
257
class ImageProcessor:
    """
    Image pre-processing helpers: rotation correction and downscaling.
    """

    def __init__(self):
        # Records whether pytesseract can be imported.
        self.has_tesseract = importlib.util.find_spec("pytesseract") is not None

    def rotate_correction(self, image: Image.Image) -> Tuple[Image.Image, int]:
        """
        This method uses Tesseract OSD to correct the rotation of the image.

        Parameters:
        ----------
        image : Image.Image
            The image to be corrected.

        Returns:
        -------
        Image.Image
            The corrected image (the input image itself if no rotation is needed).
        int
            The rotation angle in degrees.

        Raises:
        ------
        ImportError
            If pytesseract is not installed.
        ValueError
            If orientation detection or rotation fails.
        """
        # Checked on every call, not via self.has_tesseract.
        if importlib.util.find_spec("pytesseract") is None:
            raise ImportError("pytesseract is not installed. Please install it to use this feature.")

        import pytesseract

        try:
            osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)
            rotation_angle = osd['rotate']
            if rotation_angle != 0:
                # NOTE(review): PIL's rotate() turns counter-clockwise for positive
                # angles; confirm this matches the direction convention of the OSD
                # 'rotate' value for 90/270 degree results.
                return image.rotate(rotation_angle, expand=True), rotation_angle

            return image, 0
        except Exception as e:
            print(f"Error correcting image rotation: {e}")
            raise ValueError(f"Failed to correct image rotation: {e}") from e

    async def rotate_correction_async(self, image: Image.Image) -> Tuple[Image.Image, int]:
        """
        Asynchronous version of rotate_correction method.

        Runs rotate_correction in the running loop's default executor.

        Parameters:
        ----------
        image : Image.Image
            The image to be corrected.

        Returns:
        -------
        Image.Image
            The corrected image.
        int
            The rotation angle in degrees.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.rotate_correction, image)

    def resize(self, image: Image.Image, max_dimension_pixels:int=4000) -> Tuple[Image.Image, bool]:
        """
        Resizes the image to fit within the specified maximum dimension while maintaining aspect ratio.

        Parameters:
        ----------
        image : Image.Image
            The image to be resized.
        max_dimension_pixels : int
            The maximum dimension (width or height) in pixels. Must be positive.

        Returns:
        -------
        Image.Image
            The resized image (the input image itself if no resizing is needed).
        bool
            True if the image was resized, False otherwise.

        Raises:
        ------
        ValueError
            If max_dimension_pixels is not positive.
        """
        if max_dimension_pixels <= 0:
            raise ValueError("max_dimension_pixels must be a positive integer")

        width, height = image.size
        if width <= max_dimension_pixels and height <= max_dimension_pixels:
            return image, False  # No resizing needed

        # Scale the longer side to the limit. The shorter side is truncated to
        # an int and clamped to at least 1 pixel so that extreme aspect ratios
        # cannot produce a zero-sized target.
        if width > height:
            new_width = max_dimension_pixels
            new_height = max(1, int((max_dimension_pixels / width) * height))
        else:
            new_height = max_dimension_pixels
            new_width = max(1, int((max_dimension_pixels / height) * width))
        return image.resize((new_width, new_height), resample=Image.Resampling.LANCZOS), True  # Resizing was done

    async def resize_async(self, image: Image.Image, max_dimension_pixels:int=4000) -> Tuple[Image.Image, bool]:
        """
        Asynchronous version of resize method.

        Runs resize in the running loop's default executor.

        Parameters:
        ----------
        image : Image.Image
            The image to be resized.
        max_dimension_pixels : int
            The maximum dimension (width or height) in pixels.

        Returns:
        -------
        Image.Image
            The resized image.
        bool
            True if the image was resized, False otherwise.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.resize, image, max_dimension_pixels)