vlm4ocr 0.0.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlm4ocr/__init__.py +3 -1
- vlm4ocr/assets/default_prompt_templates/ocr_HTML_system_prompt.txt +1 -0
- vlm4ocr/assets/default_prompt_templates/ocr_HTML_user_prompt.txt +1 -0
- vlm4ocr/assets/default_prompt_templates/ocr_text_user_prompt.txt +1 -0
- vlm4ocr/cli.py +367 -0
- vlm4ocr/data_types.py +109 -0
- vlm4ocr/ocr_engines.py +359 -195
- vlm4ocr/utils.py +328 -18
- vlm4ocr/vlm_engines.py +317 -191
- {vlm4ocr-0.0.1.dist-info → vlm4ocr-0.2.0.dist-info}/METADATA +4 -2
- vlm4ocr-0.2.0.dist-info/RECORD +16 -0
- vlm4ocr-0.2.0.dist-info/entry_points.txt +3 -0
- vlm4ocr-0.0.1.dist-info/RECORD +0 -10
- /vlm4ocr/assets/default_prompt_templates/{ocr_user_prompt.txt → ocr_markdown_user_prompt.txt} +0 -0
- {vlm4ocr-0.0.1.dist-info → vlm4ocr-0.2.0.dist-info}/WHEEL +0 -0
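The bulk of the change is in `vlm4ocr/ocr_engines.py`, shown below: `OCREngine` gains an 'HTML' output mode, PDF/TIFF multi-page support, optional rotation correction and resizing, and replaces the old text-returning API with `stream_ocr`, `sequential_ocr`, and `concurrent_ocr`, all built around the new `OCRResult` type. A minimal usage sketch of the 0.2.0 surface follows; it assumes an already-constructed `vlm_engine` (the `VLMEngine` classes live in `vlm_engines.py`, whose contents are not reproduced here), and the file names are placeholders.

```python
from vlm4ocr.ocr_engines import OCREngine

# `vlm_engine` is a placeholder for a configured VLMEngine instance (not shown in this diff).
engine = OCREngine(vlm_engine=vlm_engine, output_mode="markdown")

# New in 0.2.0: sequential_ocr returns a list of OCRResult objects instead of plain strings.
results = engine.sequential_ocr(["scan.pdf", "form.tiff"], verbose=True)
for result in results:
    print(result.input_dir, result.status)  # status is "success" or "error"
    for page in result:                      # an OCRResult iterates over its per-page text
        print(page)
```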
vlm4ocr/ocr_engines.py
CHANGED
@@ -1,14 +1,18 @@
 import os
-from typing import List, Dict, Union, Generator, Iterable
+from typing import Tuple, List, Dict, Union, Generator, AsyncGenerator, Iterable
 import importlib
 import asyncio
-from
+from colorama import Fore, Style
+from PIL import Image
+from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, get_default_page_delimiter
+from vlm4ocr.data_types import OCRResult
 from vlm4ocr.vlm_engines import VLMEngine
 
-SUPPORTED_IMAGE_EXTS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
+SUPPORTED_IMAGE_EXTS = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
+
 
 class OCREngine:
-    def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None
+    def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None):
         """
         This class inputs a image or PDF file path and processes them using a VLM inference engine. Outputs plain text or markdown.
 
@@ -17,13 +21,11 @@ class OCREngine:
         inference_engine : InferenceEngine
             The inference engine to use for OCR.
         output_mode : str, Optional
-            The output format.
+            The output format. Must be 'markdown', 'HTML', or 'text'.
         system_prompt : str, Optional
             Custom system prompt. We recommend use a default system prompt by leaving this blank.
         user_prompt : str, Optional
             Custom user prompt. It is good to include some information regarding the document. If not specified, a default will be used.
-        page_delimiter : str, Optional
-            The delimiter to use between PDF pages.
         """
         # Check inference engine
         if not isinstance(vlm_engine, VLMEngine):
@@ -31,50 +33,51 @@ class OCREngine:
         self.vlm_engine = vlm_engine
 
         # Check output mode
-        if output_mode not in ["markdown", "text"]:
-            raise ValueError("output_mode must be 'markdown' or 'text'")
+        if output_mode not in ["markdown", "HTML", "text"]:
+            raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'")
         self.output_mode = output_mode
 
         # System prompt
         if isinstance(system_prompt, str) and system_prompt:
             self.system_prompt = system_prompt
         else:
-
-            with open(
+            prompt_template_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_system_prompt.txt')
+            with prompt_template_path.open('r', encoding='utf-8') as f:
                 self.system_prompt = f.read()
 
         # User prompt
         if isinstance(user_prompt, str) and user_prompt:
             self.user_prompt = user_prompt
         else:
-
-            with open(
+            prompt_template_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_user_prompt.txt')
+            with prompt_template_path.open('r', encoding='utf-8') as f:
                 self.user_prompt = f.read()
 
-        #
-
-            self.page_delimiter = page_delimiter
-        else:
-            raise ValueError("page_delimiter must be a string")
-
+        # Image processor
+        self.image_processor = ImageProcessor()
 
-
+
+    def stream_ocr(self, file_path: str, rotate_correction:bool=False, max_dimension_pixels:int=None) -> Generator[Dict[str, str], None, None]:
         """
         This method inputs a file path (image or PDF) and stream OCR results in real-time. This is useful for frontend applications.
+        Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.
 
         Parameters:
         -----------
         file_path : str
-            The path to the image or PDF file. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
-
-
-
-            The
-
+            The path to the image or PDF file. Must be one of '.pdf', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+        rotate_correction : bool, Optional
+            If True, applies rotate correction to the images using pytesseract.
+        max_dimension_pixels : int, Optional
+            The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.
+
         Returns:
         --------
-        Generator[str, None, None]
-            A generator that yields the output
+        Generator[Dict[str, str], None, None]
+            A generator that yields the output:
+            {"type": "info", "data": msg}
+            {"type": "ocr_chunk", "data": chunk}
+            {"type": "page_delimiter", "data": page_delimiter}
         """
         # Check file path
         if not isinstance(file_path, str):
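The hunk above rewrites `stream_ocr` so that it yields typed dictionaries rather than raw text chunks (the method body follows in the next hunk). A minimal consumer sketch, assuming an `engine` constructed as in the example further up and a placeholder input path:

```python
# Sketch only: `engine` is an OCREngine instance; "scan.pdf" is a placeholder path.
pages = []
for event in engine.stream_ocr("scan.pdf", max_dimension_pixels=2048):
    if event["type"] == "ocr_chunk":
        pages.append(event["data"])        # streamed OCR text
    elif event["type"] == "page_delimiter":
        pages.append(event["data"])        # separator between PDF/TIFF pages
    elif event["type"] == "info":
        print("info:", event["data"])      # e.g. preprocessing warnings
print("".join(pages))
```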
@@ -82,230 +85,391 @@ class OCREngine:
 
         # Check file extension
         file_ext = os.path.splitext(file_path)[1].lower()
-        if file_ext not in SUPPORTED_IMAGE_EXTS
-            raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS
+        if file_ext not in SUPPORTED_IMAGE_EXTS:
+            raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
+
+        # Check if image preprocessing can be applied
+        if self.image_processor.has_tesseract==False and rotate_correction:
+            raise ImportError("pytesseract is not installed. Please install it to use rotate correction.")
 
-        # PDF
-        if file_ext
-
+        # PDF or TIFF
+        if file_ext in ['.pdf', '.tif', '.tiff']:
+            data_loader = PDFDataLoader(file_path) if file_ext == '.pdf' else TIFFDataLoader(file_path)
+            images = data_loader.get_all_pages()
+            # Check if images were extracted
             if not images:
-                raise ValueError(f"No images extracted from
+                raise ValueError(f"No images extracted from file: {file_path}")
+
+            # OCR each image
             for i, image in enumerate(images):
+                # Apply rotate correction if specified and tesseract is available
+                if rotate_correction and self.image_processor.has_tesseract:
+                    try:
+                        image, _ = self.image_processor.rotate_correction(image)
+
+                    except Exception as e:
+                        yield {"type": "info", "data": f"Error during rotate correction: {str(e)}"}
+
+                # Resize the image if max_dimension_pixels is specified
+                if max_dimension_pixels is not None:
+                    try:
+                        image, _ = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
+                    except Exception as e:
+                        yield {"type": "info", "data": f"Error resizing image: {str(e)}"}
+
                 messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
                 response_stream = self.vlm_engine.chat(
                     messages,
-
-                    temperature=temperature,
-                    stream=True,
-                    **kwrs
+                    stream=True
                 )
                 for chunk in response_stream:
-                    yield chunk
+                    yield {"type": "ocr_chunk", "data": chunk}
 
                 if i < len(images) - 1:
-                    yield self.
+                    yield {"type": "page_delimiter", "data": get_default_page_delimiter(self.output_mode)}
 
         # Image
         else:
-
+            data_loader = ImageDataLoader(file_path)
+            image = data_loader.get_page(0)
+
+            # Apply rotate correction if specified and tesseract is available
+            if rotate_correction and self.image_processor.has_tesseract:
+                try:
+                    image, _ = self.image_processor.rotate_correction(image)
+
+                except Exception as e:
+                    yield {"type": "info", "data": f"Error during rotate correction: {str(e)}"}
+
+            # Resize the image if max_dimension_pixels is specified
+            if max_dimension_pixels is not None:
+                try:
+                    image, _ = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
+                except Exception as e:
+                    yield {"type": "info", "data": f"Error resizing image: {str(e)}"}
+
             messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
             response_stream = self.vlm_engine.chat(
                 messages,
-
-                temperature=temperature,
-                stream=True,
-                **kwrs
+                stream=True
            )
             for chunk in response_stream:
-                yield chunk
+                yield {"type": "ocr_chunk", "data": chunk}
 
 
-    def
-
+    def sequential_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
+                       max_dimension_pixels:int=None, verbose:bool=False) -> List[OCRResult]:
         """
-        This method
+        This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
 
         Parameters:
         -----------
         file_paths : Union[str, Iterable[str]]
-            A file path or a list of file paths to process. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
-
-
-
-            The
+            A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+        rotate_correction : bool, Optional
+            If True, applies rotate correction to the images using pytesseract.
+        max_dimension_pixels : int, Optional
+            The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.
         verbose : bool, Optional
-            If True, the function will print the output in terminal.
-
-
-
-
+            If True, the function will print the output in terminal.
+
+        Returns:
+        --------
+        List[OCRResult]
+            A list of OCR result objects.
         """
-        # if file_paths is a string, convert it to a list
         if isinstance(file_paths, str):
             file_paths = [file_paths]
-
-
-            raise TypeError("file_paths must be a string or an iterable of strings")
-
-        # check if all file paths are valid
+
+        ocr_results = []
         for file_path in file_paths:
-
-
+            # Define OCRResult object
+            ocr_result = OCRResult(input_dir=file_path, output_mode=self.output_mode)
+            # get file extension
             file_ext = os.path.splitext(file_path)[1].lower()
-
-
+            # Check file extension
+            if file_ext not in SUPPORTED_IMAGE_EXTS:
+                if verbose:
+                    print(f"{Fore.RED}Unsupported file type:{Style.RESET_ALL} {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
+                ocr_result.status = "error"
+                ocr_result.add_page(text=f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}",
+                                    image_processing_status={})
+                ocr_results.append(ocr_result)
+                continue
+
+            filename = os.path.basename(file_path)
+
+            try:
+                # Load images from file
+                if file_ext == '.pdf':
+                    data_loader = PDFDataLoader(file_path)
+                elif file_ext in ['.tif', '.tiff']:
+                    data_loader = TIFFDataLoader(file_path)
+                else:
+                    data_loader = ImageDataLoader(file_path)
+
+                images = data_loader.get_all_pages()
+            except Exception as e:
+                if verbose:
+                    print(f"{Fore.RED}Error processing file {filename}:{Style.RESET_ALL} {str(e)}")
+                ocr_result.status = "error"
+                ocr_result.add_page(text=f"Error processing file {filename}: {str(e)}", image_processing_status={})
+                ocr_results.append(ocr_result)
+                continue
+
+            # Check if images were extracted
+            if not images:
+                if verbose:
+                    print(f"{Fore.RED}No images extracted from file:{Style.RESET_ALL} {filename}. It might be empty or corrupted.")
+                ocr_result.status = "error"
+                ocr_result.add_page(text=f"No images extracted from file: {filename}. It might be empty or corrupted.",
+                                    image_processing_status={})
+                ocr_results.append(ocr_result)
+                continue
+
+            # OCR images
+            for i, image in enumerate(images):
+                image_processing_status = {}
+                # Apply rotate correction if specified and tesseract is available
+                if rotate_correction and self.image_processor.has_tesseract:
+                    try:
+                        image, rotation_angle = self.image_processor.rotate_correction(image)
+                        image_processing_status["rotate_correction"] = {
+                            "status": "success",
+                            "rotation_angle": rotation_angle
+                        }
+                        if verbose:
+                            print(f"{Fore.GREEN}Rotate correction applied for {filename} page {i} with angle {rotation_angle} degrees.{Style.RESET_ALL}")
+                    except Exception as e:
+                        image_processing_status["rotate_correction"] = {
+                            "status": "error",
+                            "error": str(e)
+                        }
+                        if verbose:
+                            print(f"{Fore.RED}Error during rotate correction for {filename}:{Style.RESET_ALL} {rotation_angle['error']}. OCR continues without rotate correction.")
+
+                # Resize the image if max_dimension_pixels is specified
+                if max_dimension_pixels is not None:
+                    try:
+                        image, resized = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
+                        image_processing_status["resize"] = {
+                            "status": "success",
+                            "resized": resized
+                        }
+                        if verbose and resized:
+                            print(f"{Fore.GREEN}Image resized for {filename} page {i} to fit within {max_dimension_pixels} pixels.{Style.RESET_ALL}")
+                    except Exception as e:
+                        image_processing_status["resize"] = {
+                            "status": "error",
+                            "error": str(e)
+                        }
+                        if verbose:
+                            print(f"{Fore.RED}Error resizing image for {filename}:{Style.RESET_ALL} {resized['error']}. OCR continues without resizing.")
 
+                try:
+                    messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+                    response = self.vlm_engine.chat(
+                        messages,
+                        verbose=verbose,
+                        stream=False
+                    )
+                    # Clean the response if output mode is markdown
+                    if self.output_mode == "markdown":
+                        response = clean_markdown(response)
+
+                    # Add the page to the OCR result
+                    ocr_result.add_page(text=response,
+                                        image_processing_status=image_processing_status)
+
+                except Exception as page_e:
+                    ocr_result.status = "error"
+                    ocr_result.add_page(text=f"Error during OCR for a page in {filename}: {str(page_e)}",
+                                        image_processing_status={})
+                    if verbose:
+                        print(f"{Fore.RED}Error during OCR for a page in {filename}:{Style.RESET_ALL} {page_e}")
 
-
-
-
-        if concurrent_batch_size <= 0:
-            raise ValueError("concurrent_batch_size must be greater than 0")
+            # Add the OCR result to the list
+            ocr_result.status = "success"
+            ocr_results.append(ocr_result)
 
             if verbose:
-
+                print(f"{Fore.BLUE}Successfully processed {filename} with {len(ocr_result)} pages.{Style.RESET_ALL}")
+                for page in ocr_result:
+                    print(page)
+                    print("-" * 80)
 
-
-                                                 max_new_tokens=max_new_tokens,
-                                                 temperature=temperature,
-                                                 concurrent_batch_size=concurrent_batch_size,
-                                                 **kwrs))
-
-        # Sync processing
-        return self._run_ocr(file_paths, max_new_tokens=max_new_tokens, temperature=temperature, verbose=verbose, **kwrs)
-
+        return ocr_results
 
-
-
+
+    def concurrent_ocr(self, file_paths: Union[str, Iterable[str]], rotate_correction:bool=False,
+                       max_dimension_pixels:int=None, concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
         """
-
+        First complete first out. Input and output order not guaranteed.
+        This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
+        Results are processed concurrently using asyncio.
 
         Parameters:
         -----------
         file_paths : Union[str, Iterable[str]]
-            A file path or a list of file paths to process. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
-
-
-
-            The
-
-
+            A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+        rotate_correction : bool, Optional
+            If True, applies rotate correction to the images using pytesseract.
+        max_dimension_pixels : int, Optional
+            The maximum dimension of the image in pixels. Origianl dimensions will be resized to fit in. If None, no resizing is applied.
+        concurrent_batch_size : int, Optional
+            The number of concurrent VLM calls to make.
+        max_file_load : int, Optional
+            The maximum number of files to load concurrently. If None, defaults to 2 times of concurrent_batch_size.
 
         Returns:
         --------
-
-            A
+        AsyncGenerator[OCRResult, None]
+            A generator that yields OCR result objects as they complete.
         """
-
-
-
-
-
-            images = get_images_from_pdf(file_path)
-            if not images:
-                raise ValueError(f"No images extracted from PDF: {file_path}")
-            pdf_results = []
-            for image in images:
-                messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
-                response = self.vlm_engine.chat(
-                    messages,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    verbose=verbose,
-                    stream=False,
-                    **kwrs
-                )
-                pdf_results.append(response)
-
-            ocr_text = self.page_delimiter.join(pdf_results)
-        # Image
-        else:
-            image = get_image_from_file(file_path)
-            messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
-            ocr_text = self.vlm_engine.chat(
-                messages,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                verbose=verbose,
-                stream=False,
-                **kwrs
-            )
-
-            # Clean markdown
-            if self.output_mode == "markdown":
-                ocr_text = clean_markdown(ocr_text)
-            ocr_results.append(ocr_text)
+        if isinstance(file_paths, str):
+            file_paths = [file_paths]
+
+        if max_file_load is None:
+            max_file_load = concurrent_batch_size * 2
 
-
+        if not isinstance(max_file_load, int) or max_file_load <= 0:
+            raise ValueError("max_file_load must be a positive integer")
+
+        if self.image_processor.has_tesseract==False and rotate_correction:
+            raise ImportError("pytesseract is not installed. Please install it to use rotate correction.")
 
+        return self._ocr_async(file_paths=file_paths,
+                               rotate_correction=rotate_correction,
+                               max_dimension_pixels=max_dimension_pixels,
+                               concurrent_batch_size=concurrent_batch_size,
+                               max_file_load=max_file_load)
+
 
-    async def
-
+    async def _ocr_async(self, file_paths: Iterable[str], rotate_correction:bool=False, max_dimension_pixels:int=None,
+                         concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
         """
-
+        Internal method to asynchronously process an iterable of file paths.
+        Yields OCRResult objects as they complete. Order not guaranteed.
+        concurrent_batch_size controls how many VLM calls are made concurrently.
         """
-
-
+        vlm_call_semaphore = asyncio.Semaphore(concurrent_batch_size)
+        file_load_semaphore = asyncio.Semaphore(max_file_load)
+
+        tasks = []
         for file_path in file_paths:
+            task = self._ocr_file_with_semaphore(file_load_semaphore=file_load_semaphore,
+                                                 vlm_call_semaphore=vlm_call_semaphore,
+                                                 file_path=file_path,
+                                                 rotate_correction=rotate_correction,
+                                                 max_dimension_pixels=max_dimension_pixels)
+            tasks.append(task)
+
+
+        for future in asyncio.as_completed(tasks):
+            result: OCRResult = await future
+            yield result
+
+    async def _ocr_file_with_semaphore(self, file_load_semaphore:asyncio.Semaphore, vlm_call_semaphore:asyncio.Semaphore,
+                                       file_path:str, rotate_correction:bool=False, max_dimension_pixels:int=None) -> OCRResult:
+        """
+        This internal method takes a semaphore and OCR a single file using the VLM inference engine.
+        """
+        async with file_load_semaphore:
+            filename = os.path.basename(file_path)
             file_ext = os.path.splitext(file_path)[1].lower()
-
-
-
-
-
-
-
-
-            else:
-                image = get_image_from_file(file_path)
-                flat_page_list.append({'file_path': file_path, 'file_type': "image", "image": image})
-
-        # Process images with asyncio.Semaphore
-        semaphore = asyncio.Semaphore(concurrent_batch_size)
-        async def semaphore_helper(page:List[Dict[str,str]], max_new_tokens:int, temperature:float, **kwrs):
+            result = OCRResult(input_dir=file_path, output_mode=self.output_mode)
+            # check file extension
+            if file_ext not in SUPPORTED_IMAGE_EXTS:
+                result.status = "error"
+                result.add_page(text=f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}",
+                                image_processing_status={})
+                return result
+
             try:
-
-
-
-
-
-
-
+                # Load images from file
+                if file_ext == '.pdf':
+                    data_loader = PDFDataLoader(file_path)
+                elif file_ext in ['.tif', '.tiff']:
+                    data_loader = TIFFDataLoader(file_path)
+                else:
+                    data_loader = ImageDataLoader(file_path)
+
+            except Exception as e:
+                result.status = "error"
+                result.add_page(text=f"Error processing file {filename}: {str(e)}", image_processing_status={})
+                return result
+
+            try:
+                page_processing_tasks = []
+                for page_index in range(data_loader.get_page_count()):
+                    task = self._ocr_page_with_semaphore(
+                        vlm_call_semaphore=vlm_call_semaphore,
+                        data_loader=data_loader,
+                        page_index=page_index,
+                        rotate_correction=rotate_correction,
+                        max_dimension_pixels=max_dimension_pixels
                     )
-
+                    page_processing_tasks.append(task)
+
+                if page_processing_tasks:
+                    processed_page_results = await asyncio.gather(*page_processing_tasks)
+                    for text, image_processing_status in processed_page_results:
+                        result.add_page(text=text, image_processing_status=image_processing_status)
+
             except Exception as e:
-
-
+                result.status = "error"
+                result.add_page(text=f"Error during OCR for {filename}: {str(e)}", image_processing_status={})
+                return result
 
-
-
-
-
-
-
-
-
-            tasks.append(asyncio.create_task(async_task))
+            # Set status to success if no errors occurred
+            result.status = "success"
+            return result
+
+    async def _ocr_page_with_semaphore(self, vlm_call_semaphore: asyncio.Semaphore, data_loader: DataLoader,
+                                       page_index:int, rotate_correction:bool=False, max_dimension_pixels:int=None) -> Tuple[str, Dict[str, str]]:
+        """
+        This internal method takes a semaphore and OCR a single image/page using the VLM inference engine.
 
-
+        Returns:
+        -------
+        Tuple[str, Dict[str, str]]
+            A tuple containing the OCR text and a dictionary with image processing status.
+        """
+        async with vlm_call_semaphore:
+            image = await data_loader.get_page_async(page_index)
+            image_processing_status = {}
+            # Apply rotate correction if specified and tesseract is available
+            if rotate_correction and self.image_processor.has_tesseract:
+                try:
+                    image, rotation_angle = await self.image_processor.rotate_correction_async(image)
+                    image_processing_status["rotate_correction"] = {
+                        "status": "success",
+                        "rotation_angle": rotation_angle
+                    }
+                except Exception as e:
+                    image_processing_status["rotate_correction"] = {
+                        "status": "error",
+                        "error": str(e)
+                    }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if page['file_type'] == "image":
-                if self.output_mode == "markdown":
-                    ocr_text = clean_markdown(ocr_text)
-                ocr_results.append(ocr_text)
-
-        return ocr_results
+            # Resize the image if max_dimension_pixels is specified
+            if max_dimension_pixels is not None:
+                try:
+                    image, resized = await self.image_processor.resize_async(image, max_dimension_pixels=max_dimension_pixels)
+                    image_processing_status["resize"] = {
+                        "status": "success",
+                        "resized": resized
+                    }
+                except Exception as e:
+                    image_processing_status["resize"] = {
+                        "status": "error",
+                        "error": str(e)
+                    }
 
+            messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
+            ocr_text = await self.vlm_engine.chat_async(
+                messages,
+            )
+            if self.output_mode == "markdown":
+                ocr_text = clean_markdown(ocr_text)
+            return ocr_text, image_processing_status
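The new `concurrent_ocr` path returns an async generator that yields `OCRResult` objects as individual files finish, with no ordering guarantee. A minimal asyncio consumer sketch, again assuming an existing `engine` and placeholder file paths:

```python
import asyncio

async def run(paths):
    # Results arrive first-complete-first-out; concurrent_batch_size caps parallel VLM calls.
    async for result in engine.concurrent_ocr(paths, concurrent_batch_size=8):
        print(result.input_dir, result.status, len(result), "pages")

asyncio.run(run(["a.pdf", "b.tif", "c.png"]))
```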