vlm4ocr 0.0.1__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vlm4ocr
3
- Version: 0.0.1
4
- Summary: OCR with vision language models.
3
+ Version: 0.1.0
4
+ Summary: Python package and Web App for OCR with vision language models.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
7
7
  Requires-Python: >=3.11,<4.0
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "vlm4ocr"
3
- version = "0.0.1"
4
- description = "OCR with vision language models."
3
+ version = "0.1.0"
4
+ description = "Python package and Web App for OCR with vision language models."
5
5
  authors = ["Enshuo (David) Hsu"]
6
6
  license = "MIT"
7
7
  readme = "README.md"
@@ -17,6 +17,8 @@ python = "^3.11"
17
17
  pdf2image = ">=1.16.0"
18
18
  pillow = ">=10.0.0"
19
19
 
20
+ [tool.poetry.scripts]
21
+ vlm4ocr = "vlm4ocr.cli:main"
20
22
 
21
23
  [build-system]
22
24
  requires = ["poetry-core"]
@@ -0,0 +1 @@
1
+ You are a helpful assistant that can convert scanned documents into functional HTML. Your output is accurate and well-formatted, starting with <html> and ending with </html>. You will only output the HTML without any additional explanations or comments. The HTML should include all text, tables, and lists with appropriate tags (e.g., "table", "tbody", "tr", "li") and styles (e.g., "font-family", "color", "font-size") that represent the text contents in the input. You will ignore images, icons, or anything that cannot be converted into text.
@@ -0,0 +1 @@
1
+ Convert contents in this image into HTML.
@@ -0,0 +1 @@
1
+ Convert contents in this image into plain text.
@@ -0,0 +1,378 @@
1
# vlm4ocr/cli.py

import argparse
import os
import sys
import logging

# Try the relative import first so the module works when executed from inside
# the package during development; fall back to the absolute import path used
# by an installed distribution.
try:
    from .ocr_engines import OCREngine
    from .vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine
except ImportError:
    from vlm4ocr.ocr_engines import OCREngine
    from vlm4ocr.vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine

# Configure basic logging for the CLI.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# File types the CLI accepts. NOTE(review): ideally this should be sourced
# from ocr_engines.SUPPORTED_IMAGE_EXTS to avoid drift between the two lists.
SUPPORTED_IMAGE_EXTS_CLI = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']

# Output filename extension per output mode.
# FIX: 'text' previously mapped to 'txt' (no leading dot), which produced
# output names like 'scan_ocrtxt' instead of 'scan_ocr.txt'.
OUTPUT_EXTENSIONS = {'markdown': '.md', 'HTML': '.html', 'text': '.txt'}
27
def _build_parser():
    """Build and return the argparse parser for the vlm4ocr CLI."""
    parser = argparse.ArgumentParser(
        description="VLM4OCR: Perform OCR on images, PDFs, or TIFF files using Vision Language Models.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # --- Input/Output Arguments ---
    io_group = parser.add_argument_group("Input/Output Options")
    io_group.add_argument(
        "--input_path",
        required=True,
        help="Path to the input image, PDF, or TIFF file, or a directory containing these files. "
             "If a directory is provided, all supported files within will be processed."
    )
    io_group.add_argument(
        "--output_mode",
        choices=["markdown", "HTML", "text"],
        default="markdown",
        help="Desired output format for the OCR results."
    )
    io_group.add_argument(
        "--output_file",
        help="Optional: Path to a file to save the output. "
             "If input_path is a directory, this should be a directory where results will be saved "
             "(one file per input, with original name and new extension). "
             "If not provided, output is written to files in the current working directory "
             "(e.g., 'input_name_ocr.output_mode')."
    )

    # --- VLM Engine Selection ---
    vlm_engine_group = parser.add_argument_group("VLM Engine Selection")
    vlm_engine_group.add_argument(
        "--vlm_engine",
        choices=["openai", "azure_openai", "ollama", "openai_compatible"],
        required=True,
        help="Specify the VLM engine to use."
    )
    vlm_engine_group.add_argument(
        "--model",
        required=True,
        help="The specific model identifier for the chosen VLM engine. "
             "E.g., 'gpt-4o' for OpenAI, 'deployment-name' for Azure, "
             "'Qwen/Qwen2.5-VL-7B-Instruct' for OpenAI-compatible, "
             "or 'llava:latest' for Ollama."
    )

    # --- OpenAI Engine Arguments ---
    openai_group = parser.add_argument_group("OpenAI & OpenAI-Compatible Options")
    openai_group.add_argument(
        "--api_key",
        default=os.environ.get("OPENAI_API_KEY"),
        help="API key for OpenAI or OpenAI-compatible service. "
             "Can also be set via OPENAI_API_KEY environment variable."
    )
    openai_group.add_argument(
        "--base_url",
        help="Base URL for OpenAI-compatible services (e.g., vLLM endpoint like 'http://localhost:8000/v1'). "
             "Not used for official OpenAI API."
    )

    # --- Azure OpenAI Engine Arguments ---
    azure_group = parser.add_argument_group("Azure OpenAI Options")
    azure_group.add_argument(
        "--azure_api_key",
        default=os.environ.get("AZURE_OPENAI_API_KEY"),
        help="API key for Azure OpenAI service. "
             "Can also be set via AZURE_OPENAI_API_KEY environment variable."
    )
    azure_group.add_argument(
        "--azure_endpoint",
        default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        help="Endpoint URL for Azure OpenAI service. "
             "Can also be set via AZURE_OPENAI_ENDPOINT environment variable."
    )
    azure_group.add_argument(
        "--azure_api_version",
        default=os.environ.get("AZURE_OPENAI_API_VERSION"),
        help="API version for Azure OpenAI service (e.g., '2024-02-01'). "
             "Can also be set via AZURE_OPENAI_API_VERSION environment variable."
    )

    # --- Ollama Engine Arguments ---
    ollama_group = parser.add_argument_group("Ollama Options")
    ollama_group.add_argument(
        "--ollama_host",
        default="http://localhost:11434",
        help="Host URL for the Ollama server."
    )
    ollama_group.add_argument(
        "--ollama_num_ctx",
        type=int,
        default=4096,
        help="Context length for Ollama models."
    )
    ollama_group.add_argument(
        "--ollama_keep_alive",
        type=int,
        default=300,  # Default from OllamaVLMEngine
        help="Seconds to keep the Ollama model loaded after the last call."
    )

    # --- OCR Engine Parameters ---
    ocr_params_group = parser.add_argument_group("OCR Engine Parameters")
    ocr_params_group.add_argument(
        "--user_prompt",
        help="Optional: Custom user prompt to provide context about the image/PDF/TIFF."
    )
    ocr_params_group.add_argument(
        "--max_new_tokens",
        type=int,
        default=4096,  # Default from OCREngine
        help="Maximum number of new tokens the VLM can generate."
    )
    ocr_params_group.add_argument(
        "--temperature",
        type=float,
        default=0.0,  # Default from OCREngine
        help="Temperature for token sampling (0.0 for deterministic output)."
    )

    # --- Processing Options ---
    processing_group = parser.add_argument_group("Processing Options")
    processing_group.add_argument(
        "--concurrent",
        action="store_true",
        help="Enable concurrent processing for multiple files or PDF/TIFF pages."
    )
    processing_group.add_argument(
        "--concurrent_batch_size",
        type=int,
        default=32,
        help="Batch size for concurrent processing."
    )
    processing_group.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose output from the OCR engine during processing. CLI will also log more info."
    )
    processing_group.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug level logging for more detailed information."
    )
    return parser


def _create_vlm_engine(args, parser):
    """Instantiate the VLM engine selected by --vlm_engine.

    Calls parser.error() (which exits with status 2) when a credential or
    endpoint argument required by the chosen engine is missing.
    """
    if args.vlm_engine == "openai":
        if not args.api_key:
            parser.error("--api_key (or OPENAI_API_KEY env var) is required for OpenAI engine.")
        return OpenAIVLMEngine(
            model=args.model,
            api_key=args.api_key
        )
    if args.vlm_engine == "openai_compatible":
        if not args.api_key:
            # Some self-hosted servers accept a placeholder key, so warn only.
            logger.warning("API key not provided or empty for openai_compatible. This might be acceptable for some servers (e.g. if 'EMPTY' is expected).")
        if not args.base_url:
            parser.error("--base_url is required for openai_compatible engine.")
        return OpenAIVLMEngine(
            model=args.model,
            api_key=args.api_key,
            base_url=args.base_url
        )
    if args.vlm_engine == "azure_openai":
        if not args.azure_api_key:
            parser.error("--azure_api_key (or AZURE_OPENAI_API_KEY env var) is required for Azure OpenAI engine.")
        if not args.azure_endpoint:
            parser.error("--azure_endpoint (or AZURE_OPENAI_ENDPOINT env var) is required for Azure OpenAI engine.")
        if not args.azure_api_version:
            parser.error("--azure_api_version (or AZURE_OPENAI_API_VERSION env var) is required for Azure OpenAI engine.")
        return AzureOpenAIVLMEngine(
            model=args.model,
            api_key=args.azure_api_key,
            azure_endpoint=args.azure_endpoint,
            api_version=args.azure_api_version
        )
    if args.vlm_engine == "ollama":
        return OllamaVLMEngine(
            model_name=args.model,  # OllamaVLMEngine expects model_name
            host=args.ollama_host,
            num_ctx=args.ollama_num_ctx,
            keep_alive=args.ollama_keep_alive
        )
    # This case should be caught by argparse choices, but as a safeguard:
    logger.error(f"Invalid VLM engine specified: {args.vlm_engine}")
    sys.exit(1)


def _collect_input_files(input_path):
    """Return the list of supported input files for *input_path*.

    Accepts either a single supported file or a directory that is scanned
    (non-recursively) for supported files. Exits with status 1 on error.
    """
    if os.path.isdir(input_path):
        logger.info(f"Input path is a directory: {input_path}. Scanning for supported files...")
        files = []
        for item in os.listdir(input_path):
            item_path = os.path.join(input_path, item)
            if os.path.isfile(item_path) and os.path.splitext(item)[1].lower() in SUPPORTED_IMAGE_EXTS_CLI:
                files.append(item_path)
        if not files:
            logger.error(f"No supported files (PDF, TIFF, PNG, JPG, etc.) found in directory: {input_path}")
            sys.exit(1)
        logger.info(f"Found {len(files)} supported files to process.")
        return files
    if os.path.isfile(input_path):
        if os.path.splitext(input_path)[1].lower() not in SUPPORTED_IMAGE_EXTS_CLI:
            logger.error(f"Input file '{input_path}' is not a supported file type. Supported: {SUPPORTED_IMAGE_EXTS_CLI}")
            sys.exit(1)
        logger.info(f"Processing single input file: {input_path}")
        return [input_path]
    logger.error(f"Input path is not a valid file or directory: {input_path}")
    sys.exit(1)


def _write_result_file(target_path, text, input_file_path):
    """Write one OCR result to *target_path*, logging success or failure."""
    try:
        with open(target_path, "w", encoding="utf-8") as f:
            f.write(text)
        logger.info(f"OCR result for '{input_file_path}' saved to: {target_path}")
    except Exception as e:
        logger.error(f"Error writing output for '{input_file_path}' to '{target_path}': {e}")


def _write_results(args, input_files, results):
    """Persist OCR results according to --output_file (or CWD defaults).

    Multiple inputs + --output_file: treat --output_file as a directory and
    write one '<name>_ocr<ext>' file per input. Single input + --output_file:
    write directly to that path. No --output_file: write '<name>_ocr<ext>'
    files into the current working directory.
    """
    ext = OUTPUT_EXTENSIONS[args.output_mode]
    multiple = os.path.isdir(args.input_path) and len(input_files) > 1

    if args.output_file:
        if multiple:
            if not os.path.exists(args.output_file):
                logger.info(f"Creating output directory: {args.output_file}")
                os.makedirs(args.output_file, exist_ok=True)
            elif not os.path.isdir(args.output_file):
                logger.error(f"Output path '{args.output_file}' exists and is not a directory, "
                             "but multiple input files were processed. Please specify a directory for --output_file.")
                sys.exit(1)
        else:
            # Single input: --output_file is a direct file path; ensure its
            # parent directory exists.
            parent = os.path.dirname(args.output_file)
            if parent and not os.path.exists(parent):
                logger.info(f"Creating output directory: {parent}")
                os.makedirs(parent, exist_ok=True)

        for input_file_path, text in zip(input_files, results):
            if multiple:
                name_part, _ = os.path.splitext(os.path.basename(input_file_path))
                target = os.path.join(args.output_file, f"{name_part}_ocr{ext}")
            else:
                target = args.output_file
            _write_result_file(target, text, input_file_path)
    else:
        current_dir = os.getcwd()
        logger.info(f"No --output_file specified. Results will be saved to the current working directory: {current_dir}")
        for input_file_path, text in zip(input_files, results):
            name_part, _ = os.path.splitext(os.path.basename(input_file_path))
            target = os.path.join(current_dir, f"{name_part}_ocr{ext}")
            _write_result_file(target, text, input_file_path)


def main():
    """
    Main function for the vlm4ocr CLI.
    Parses arguments, initializes engines, runs OCR, and handles output.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.debug("Debug mode enabled.")
        logger.debug(f"Parsed arguments: {args}")
    elif args.verbose:
        logger.setLevel(logging.INFO)  # Ensure logger level is at least INFO for verbose CLI output

    # --- Validate Arguments ---
    # verbose streaming output is not supported with concurrent processing
    if args.verbose and args.concurrent:
        logger.warning("Verbose output is not supported with concurrent processing. "
                       "Verbose mode will be ignored.")
        args.verbose = False

    # --- Initialize VLM Engine ---
    try:
        logger.info(f"Initializing VLM engine: {args.vlm_engine} with model: {args.model}")
        vlm_engine_instance = _create_vlm_engine(args, parser)
        logger.info("VLM engine initialized successfully.")
    except ImportError as e:
        logger.error(f"Failed to import a required library for {args.vlm_engine}: {e}. "
                     "Please ensure the necessary dependencies (e.g., 'openai', 'ollama') are installed.")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Error initializing VLM engine '{args.vlm_engine}': {e}")
        if args.debug:
            logger.exception("Traceback for VLM engine initialization error:")
        sys.exit(1)

    # --- Initialize OCR Engine ---
    try:
        logger.info(f"Initializing OCR engine with output mode: {args.output_mode}")
        ocr_engine_instance = OCREngine(
            vlm_engine=vlm_engine_instance,
            output_mode=args.output_mode,
            # No system_prompt passed: OCREngine supplies its own default.
            user_prompt=args.user_prompt
        )
        logger.info("OCR engine initialized successfully.")
    except Exception as e:
        logger.error(f"Error initializing OCR engine: {e}")
        if args.debug:
            logger.exception("Traceback for OCR engine initialization error:")
        sys.exit(1)

    # --- Prepare input file paths ---
    input_files_to_process = _collect_input_files(args.input_path)

    # --- Run OCR and write output ---
    try:
        logger.info("Starting OCR processing...")
        ocr_results_list = ocr_engine_instance.run_ocr(
            file_paths=input_files_to_process,
            max_new_tokens=args.max_new_tokens,
            temperature=args.temperature,
            verbose=args.verbose,
            concurrent=args.concurrent,
            concurrent_batch_size=args.concurrent_batch_size
        )
        logger.info("OCR processing completed.")
        _write_results(args, input_files_to_process, ocr_results_list)
    except FileNotFoundError as e:
        logger.error(f"File not found during OCR processing: {e}")
        sys.exit(1)
    except ValueError as e:
        logger.error(f"Input Error or Value Error during processing: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"An unexpected error occurred during OCR processing: {e}")
        if args.debug:
            logger.exception("Traceback for OCR processing error:")
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -2,13 +2,14 @@ import os
2
2
  from typing import List, Dict, Union, Generator, Iterable
3
3
  import importlib
4
4
  import asyncio
5
- from vlm4ocr.utils import get_images_from_pdf, get_image_from_file, clean_markdown
5
+ from vlm4ocr.utils import get_images_from_pdf, get_images_from_tiff, get_image_from_file, clean_markdown
6
6
  from vlm4ocr.vlm_engines import VLMEngine
7
7
 
8
- SUPPORTED_IMAGE_EXTS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
8
+ SUPPORTED_IMAGE_EXTS = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
9
+
9
10
 
10
11
  class OCREngine:
11
- def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None, page_delimiter:str="\n\n---\n\n"):
12
+ def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None, page_delimiter:str="auto"):
12
13
  """
13
14
  This class inputs a image or PDF file path and processes them using a VLM inference engine. Outputs plain text or markdown.
14
15
 
@@ -17,13 +18,17 @@ class OCREngine:
17
18
  inference_engine : InferenceEngine
18
19
  The inference engine to use for OCR.
19
20
  output_mode : str, Optional
20
- The output format. Can be 'markdown' or 'text'.
21
+ The output format. Must be 'markdown', 'HTML', or 'text'.
21
22
  system_prompt : str, Optional
22
23
  Custom system prompt. We recommend use a default system prompt by leaving this blank.
23
24
  user_prompt : str, Optional
24
25
  Custom user prompt. It is good to include some information regarding the document. If not specified, a default will be used.
25
26
  page_delimiter : str, Optional
26
27
  The delimiter to use between PDF pages.
28
+ if 'auto', it will be set to the default page delimiter for the output mode:
29
+ 'markdown' -> '\n\n---\n\n'
30
+ 'HTML' -> '<br><br>'
31
+ 'text' -> '\n\n---\n\n'
27
32
  """
28
33
  # Check inference engine
29
34
  if not isinstance(vlm_engine, VLMEngine):
@@ -31,8 +36,8 @@ class OCREngine:
31
36
  self.vlm_engine = vlm_engine
32
37
 
33
38
  # Check output mode
34
- if output_mode not in ["markdown", "text"]:
35
- raise ValueError("output_mode must be 'markdown' or 'text'")
39
+ if output_mode not in ["markdown", "HTML", "text"]:
40
+ raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'")
36
41
  self.output_mode = output_mode
37
42
 
38
43
  # System prompt
@@ -47,13 +52,21 @@ class OCREngine:
47
52
  if isinstance(user_prompt, str) and user_prompt:
48
53
  self.user_prompt = user_prompt
49
54
  else:
50
- file_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath('ocr_user_prompt.txt')
55
+ file_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_user_prompt.txt')
51
56
  with open(file_path, 'r', encoding='utf-8') as f:
52
57
  self.user_prompt = f.read()
53
58
 
54
59
  # Page delimiter
55
60
  if isinstance(page_delimiter, str):
56
- self.page_delimiter = page_delimiter
61
+ if page_delimiter == "auto":
62
+ if self.output_mode == "markdown":
63
+ self.page_delimiter = "\n\n---\n\n"
64
+ elif self.output_mode == "HTML":
65
+ self.page_delimiter = "<br><br>"
66
+ else:
67
+ self.page_delimiter = "\n\n---\n\n"
68
+ else:
69
+ self.page_delimiter = page_delimiter
57
70
  else:
58
71
  raise ValueError("page_delimiter must be a string")
59
72
 
@@ -61,16 +74,17 @@ class OCREngine:
61
74
  def stream_ocr(self, file_path: str, max_new_tokens:int=4096, temperature:float=0.0, **kwrs) -> Generator[str, None, None]:
62
75
  """
63
76
  This method inputs a file path (image or PDF) and stream OCR results in real-time. This is useful for frontend applications.
77
+ Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.
64
78
 
65
79
  Parameters:
66
80
  -----------
67
81
  file_path : str
68
- The path to the image or PDF file. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
82
+ The path to the image or PDF file. Must be one of '.pdf', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
69
83
  max_new_tokens : int, Optional
70
84
  The maximum number of tokens to generate.
71
85
  temperature : float, Optional
72
86
  The temperature to use for sampling.
73
-
87
+
74
88
  Returns:
75
89
  --------
76
90
  Generator[str, None, None]
@@ -82,14 +96,14 @@ class OCREngine:
82
96
 
83
97
  # Check file extension
84
98
  file_ext = os.path.splitext(file_path)[1].lower()
85
- if file_ext not in SUPPORTED_IMAGE_EXTS and file_ext != '.pdf':
86
- raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS + ['.pdf']}")
99
+ if file_ext not in SUPPORTED_IMAGE_EXTS:
100
+ raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
87
101
 
88
- # PDF
89
- if file_ext == '.pdf':
90
- images = get_images_from_pdf(file_path)
102
+ # PDF or TIFF
103
+ if file_ext in ['.pdf', '.tif', '.tiff']:
104
+ images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
91
105
  if not images:
92
- raise ValueError(f"No images extracted from PDF: {file_path}")
106
+ raise ValueError(f"No images extracted from file: {file_path}")
93
107
  for i, image in enumerate(images):
94
108
  messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
95
109
  response_stream = self.vlm_engine.chat(
@@ -100,10 +114,10 @@ class OCREngine:
100
114
  **kwrs
101
115
  )
102
116
  for chunk in response_stream:
103
- yield chunk
117
+ yield {"type": "ocr_chunk", "data": chunk}
104
118
 
105
119
  if i < len(images) - 1:
106
- yield self.page_delimiter
120
+ yield {"type": "page_delimiter", "data": self.page_delimiter}
107
121
 
108
122
  # Image
109
123
  else:
@@ -117,18 +131,18 @@ class OCREngine:
117
131
  **kwrs
118
132
  )
119
133
  for chunk in response_stream:
120
- yield chunk
134
+ yield {"type": "ocr_chunk", "data": chunk}
121
135
 
122
136
 
123
137
  def run_ocr(self, file_paths: Union[str, Iterable[str]], max_new_tokens:int=4096, temperature:float=0.0,
124
138
  verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32, **kwrs) -> Union[str, Generator[str, None, None]]:
125
139
  """
126
- This method takes a list of file paths (image or PDF) and perform OCR using the VLM inference engine.
140
+ This method takes a list of file paths (image, PDF, TIFF) and perform OCR using the VLM inference engine.
127
141
 
128
142
  Parameters:
129
143
  -----------
130
144
  file_paths : Union[str, Iterable[str]]
131
- A file path or a list of file paths to process. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
145
+ A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
132
146
  max_new_tokens : int, Optional
133
147
  The maximum number of tokens to generate.
134
148
  temperature : float, Optional
@@ -152,9 +166,8 @@ class OCREngine:
152
166
  if not isinstance(file_path, str):
153
167
  raise TypeError("file_paths must be a string or an iterable of strings")
154
168
  file_ext = os.path.splitext(file_path)[1].lower()
155
- if file_ext not in SUPPORTED_IMAGE_EXTS and file_ext != '.pdf':
156
- raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS + ['.pdf']}")
157
-
169
+ if file_ext not in SUPPORTED_IMAGE_EXTS:
170
+ raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
158
171
 
159
172
  # Concurrent processing
160
173
  if concurrent:
@@ -178,12 +191,12 @@ class OCREngine:
178
191
  def _run_ocr(self, file_paths: Union[str, Iterable[str]], max_new_tokens:int=4096,
179
192
  temperature:float=0.0, verbose:bool=False, **kwrs) -> Iterable[str]:
180
193
  """
181
- This method inputs a file path or a list of file paths (image or PDF) and performs OCR using the VLM inference engine.
194
+ This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
182
195
 
183
196
  Parameters:
184
197
  -----------
185
198
  file_paths : Union[str, Iterable[str]]
186
- A file path or a list of file paths to process. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
199
+ A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
187
200
  max_new_tokens : int, Optional
188
201
  The maximum number of tokens to generate.
189
202
  temperature : float, Optional
@@ -199,12 +212,12 @@ class OCREngine:
199
212
  ocr_results = []
200
213
  for file_path in file_paths:
201
214
  file_ext = os.path.splitext(file_path)[1].lower()
202
- # PDF
203
- if file_ext == '.pdf':
204
- images = get_images_from_pdf(file_path)
215
+ # PDF or TIFF
216
+ if file_ext in ['.pdf', '.tif', '.tiff']:
217
+ images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
205
218
  if not images:
206
- raise ValueError(f"No images extracted from PDF: {file_path}")
207
- pdf_results = []
219
+ raise ValueError(f"No images extracted from file: {file_path}")
220
+ results = []
208
221
  for image in images:
209
222
  messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
210
223
  response = self.vlm_engine.chat(
@@ -215,9 +228,9 @@ class OCREngine:
215
228
  stream=False,
216
229
  **kwrs
217
230
  )
218
- pdf_results.append(response)
231
+ results.append(response)
219
232
 
220
- ocr_text = self.page_delimiter.join(pdf_results)
233
+ ocr_text = self.page_delimiter.join(results)
221
234
  # Image
222
235
  else:
223
236
  image = get_image_from_file(file_path)
@@ -248,13 +261,13 @@ class OCREngine:
248
261
  flat_page_list = []
249
262
  for file_path in file_paths:
250
263
  file_ext = os.path.splitext(file_path)[1].lower()
251
- # PDF
252
- if file_ext == '.pdf':
253
- images = get_images_from_pdf(file_path)
264
+ # PDF or TIFF
265
+ if file_ext in ['.pdf', '.tif', '.tiff']:
266
+ images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
254
267
  if not images:
255
- flat_page_list.append({'file_path': file_path, 'file_type': "PDF", "image": image, "page_num": 0, "total_page_count": 0})
268
+ flat_page_list.append({'file_path': file_path, 'file_type': "PDF/TIFF", "image": image, "page_num": 0, "total_page_count": 0})
256
269
  for page_num, image in enumerate(images):
257
- flat_page_list.append({'file_path': file_path, 'file_type': "PDF", "image": image, "page_num": page_num, "total_page_count": len(images)})
270
+ flat_page_list.append({'file_path': file_path, 'file_type': "PDF/TIFF", "image": image, "page_num": page_num, "total_page_count": len(images)})
258
271
  # Image
259
272
  else:
260
273
  image = get_image_from_file(file_path)
@@ -291,16 +304,16 @@ class OCREngine:
291
304
 
292
305
  # Restructure the results
293
306
  ocr_results = []
294
- pdf_page_text_buffer = ""
307
+ page_text_buffer = ""
295
308
  for page, ocr_text in zip(flat_page_list, responses):
296
- # PDF
297
- if page['file_type'] == "PDF":
298
- pdf_page_text_buffer += ocr_text + self.page_delimiter
309
+ # PDF or TIFF
310
+ if page['file_type'] == "PDF/TIFF":
311
+ page_text_buffer += ocr_text + self.page_delimiter
299
312
  if page['page_num'] == page['total_page_count'] - 1:
300
313
  if self.output_mode == "markdown":
301
- pdf_page_text_buffer = clean_markdown(pdf_page_text_buffer)
302
- ocr_results.append(pdf_page_text_buffer)
303
- pdf_page_text_buffer = ""
314
+ page_text_buffer = clean_markdown(page_text_buffer)
315
+ ocr_results.append(page_text_buffer)
316
+ page_text_buffer = ""
304
317
  # Image
305
318
  if page['file_type'] == "image":
306
319
  if self.output_mode == "markdown":
@@ -16,17 +16,34 @@ def get_images_from_pdf(file_path: str) -> List[Image.Image]:
16
16
  print(f"Error converting PDF to images: {e}")
17
17
  raise ValueError(f"Failed to process PDF file '{os.path.basename(file_path)}'. Ensure poppler is installed and the file is valid.") from e
18
18
 
19
def get_images_from_tiff(file_path: str) -> List[Image.Image]:
    """Extract every frame of a (possibly multi-page) TIFF file.

    Parameters
    ----------
    file_path : str
        Path to the TIFF file.

    Returns
    -------
    List[Image.Image]
        One independent copy per frame. May be empty (a warning is printed).

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    ValueError
        If the file cannot be processed as a TIFF.
    """
    images = []
    try:
        # FIX: use a context manager so the underlying file handle is closed
        # (the original left it open). Frames are copied so they remain valid
        # after the handle is released.
        with Image.open(file_path) as img:
            # getattr default covers single-frame images lacking n_frames.
            for i in range(getattr(img, "n_frames", 1)):
                img.seek(i)
                images.append(img.copy())
        if not images:
            print(f"Warning: No images extracted from TIFF: {file_path}")
        return images
    except FileNotFoundError:
        raise FileNotFoundError(f"TIFF file not found: {file_path}")
    except Exception as e:
        print(f"Error processing TIFF file: {e}")
        raise ValueError(f"Failed to process TIFF file '{os.path.basename(file_path)}'. Ensure the file is a valid TIFF.") from e
19
36
 
20
37
def get_image_from_file(file_path: str) -> Image.Image:
    """Open a single image file and eagerly decode its pixel data.

    Raises FileNotFoundError when the path does not exist, and ValueError
    when the file cannot be decoded as an image.
    """
    try:
        loaded = Image.open(file_path)
        loaded.load()  # force decoding now so errors surface here, not later
    except FileNotFoundError:
        raise FileNotFoundError(f"Image file not found: {file_path}")
    except Exception as e:
        raise ValueError(f"Failed to load image file '{os.path.basename(file_path)}': {e}") from e
    return loaded
30
47
 
31
48
 
32
49
  def image_to_base64(image:Image.Image, format:str="png") -> str:
@@ -1,5 +1,5 @@
1
1
  import abc
2
- import importlib
2
+ import importlib.util
3
3
  from typing import List, Dict, Union, Generator
4
4
  import warnings
5
5
  from PIL import Image
File without changes
File without changes