vlm4ocr 0.0.1__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/PKG-INFO +2 -2
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/pyproject.toml +4 -2
- vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_HTML_system_prompt.txt +1 -0
- vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_HTML_user_prompt.txt +1 -0
- vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_text_user_prompt.txt +1 -0
- vlm4ocr-0.1.0/vlm4ocr/cli.py +378 -0
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/ocr_engines.py +58 -45
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/utils.py +26 -9
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/vlm_engines.py +1 -1
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/README.md +0 -0
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/__init__.py +0 -0
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/assets/default_prompt_templates/ocr_markdown_system_prompt.txt +0 -0
- /vlm4ocr-0.0.1/vlm4ocr/assets/default_prompt_templates/ocr_user_prompt.txt → /vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_markdown_user_prompt.txt +0 -0
- {vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/assets/default_prompt_templates/ocr_text_system_prompt.txt +0 -0

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: vlm4ocr
-Version: 0.0.1
-Summary: OCR with vision language models.
+Version: 0.1.0
+Summary: Python package and Web App for OCR with vision language models.
 License: MIT
 Author: Enshuo (David) Hsu
 Requires-Python: >=3.11,<4.0

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/pyproject.toml

@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "vlm4ocr"
-version = "0.0.1"
-description = "OCR with vision language models."
+version = "0.1.0"
+description = "Python package and Web App for OCR with vision language models."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
 readme = "README.md"
@@ -17,6 +17,8 @@ python = "^3.11"
 pdf2image = ">=1.16.0"
 pillow = ">=10.0.0"
 
+[tool.poetry.scripts]
+vlm4ocr = "vlm4ocr.cli:main"
 
 [build-system]
 requires = ["poetry-core"]
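The new [tool.poetry.scripts] table above registers a vlm4ocr console command that Poetry wires to vlm4ocr.cli:main. As a minimal sketch, the same entry point can be driven from Python; the input file name here is a hypothetical placeholder, while the flags come from the cli.py added below:

    import sys
    from vlm4ocr.cli import main

    # Equivalent to: vlm4ocr --input_path scan.pdf --vlm_engine ollama --model llava:latest
    sys.argv = [
        "vlm4ocr",
        "--input_path", "scan.pdf",   # hypothetical input file
        "--vlm_engine", "ollama",
        "--model", "llava:latest",
        "--output_mode", "markdown",
    ]
    main()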
vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_HTML_system_prompt.txt (new file)

@@ -0,0 +1 @@
+You are a helpful assistant that can convert scanned documents into functional HTML. Your output is accurate and well-formatted, starting with <html> and ending with </html>. You will only output the HTML without any additional explanations or comments. The HTML should include all text, tables, and lists with appropriate tags (e.g., "table", "tbody", "tr", "li") and styles (e.g., "font-family", "color", "font-size") that represent the text contents in the input. You will ignore images, icons, or anything that cannot be converted into text.
vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_HTML_user_prompt.txt (new file)

@@ -0,0 +1 @@
+Convert contents in this image into HTML.
vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_text_user_prompt.txt (new file)

@@ -0,0 +1 @@
+Convert contents in this image into plain text.
vlm4ocr-0.1.0/vlm4ocr/cli.py (new file)

@@ -0,0 +1,378 @@
+# vlm4ocr/cli.py
+
+import argparse
+import os
+import sys
+import logging
+
+# Attempt to import from the local package structure
+# This allows running the script directly for development,
+# assuming the script is in vlm4ocr/vlm4ocr/cli.py and the package root is vlm4ocr/vlm4ocr
+try:
+    from .ocr_engines import OCREngine
+    from .vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine
+except ImportError:
+    # Fallback for when the package is installed and cli.py is run as part of it
+    from vlm4ocr.ocr_engines import OCREngine
+    from vlm4ocr.vlm_engines import OpenAIVLMEngine, AzureOpenAIVLMEngine, OllamaVLMEngine
+
+# Configure basic logging
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+# Define supported extensions here; ideally this should be sourced from ocr_engines.py
+SUPPORTED_IMAGE_EXTS_CLI = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
+OUTPUT_EXTENSIONS = {'markdown': '.md', 'HTML': '.html', 'text': '.txt'}
+
+def main():
+    """
+    Main function for the vlm4ocr CLI.
+    Parses arguments, initializes engines, runs OCR, and handles output.
+    """
+    parser = argparse.ArgumentParser(
+        description="VLM4OCR: Perform OCR on images, PDFs, or TIFF files using Vision Language Models.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    # --- Input/Output Arguments ---
+    io_group = parser.add_argument_group("Input/Output Options")
+    io_group.add_argument(
+        "--input_path",
+        required=True,
+        help="Path to the input image, PDF, or TIFF file, or a directory containing these files. "
+             "If a directory is provided, all supported files within will be processed."
+    )
+    io_group.add_argument(
+        "--output_mode",
+        choices=["markdown", "HTML", "text"],
+        default="markdown",
+        help="Desired output format for the OCR results."
+    )
+    io_group.add_argument(
+        "--output_file",
+        help="Optional: Path to a file to save the output. "
+             "If input_path is a directory, this should be a directory where results will be saved "
+             "(one file per input, with original name and new extension). "
+             "If not provided, output is written to files in the current working directory "
+             "(e.g., 'input_name_ocr.output_mode')."
+    )
+
+    # --- VLM Engine Selection ---
+    vlm_engine_group = parser.add_argument_group("VLM Engine Selection")
+    vlm_engine_group.add_argument(
+        "--vlm_engine",
+        choices=["openai", "azure_openai", "ollama", "openai_compatible"],
+        required=True,
+        help="Specify the VLM engine to use."
+    )
+    vlm_engine_group.add_argument(
+        "--model",
+        required=True,
+        help="The specific model identifier for the chosen VLM engine. "
+             "E.g., 'gpt-4o' for OpenAI, 'deployment-name' for Azure, "
+             "'Qwen/Qwen2.5-VL-7B-Instruct' for OpenAI-compatible, "
+             "or 'llava:latest' for Ollama."
+    )
+
+    # --- OpenAI Engine Arguments ---
+    openai_group = parser.add_argument_group("OpenAI & OpenAI-Compatible Options")
+    openai_group.add_argument(
+        "--api_key",
+        default=os.environ.get("OPENAI_API_KEY"),
+        help="API key for OpenAI or OpenAI-compatible service. "
+             "Can also be set via OPENAI_API_KEY environment variable."
+    )
+    openai_group.add_argument(
+        "--base_url",
+        help="Base URL for OpenAI-compatible services (e.g., vLLM endpoint like 'http://localhost:8000/v1'). "
+             "Not used for official OpenAI API."
+    )
+
+    # --- Azure OpenAI Engine Arguments ---
+    azure_group = parser.add_argument_group("Azure OpenAI Options")
+    azure_group.add_argument(
+        "--azure_api_key",
+        default=os.environ.get("AZURE_OPENAI_API_KEY"),
+        help="API key for Azure OpenAI service. "
+             "Can also be set via AZURE_OPENAI_API_KEY environment variable."
+    )
+    azure_group.add_argument(
+        "--azure_endpoint",
+        default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
+        help="Endpoint URL for Azure OpenAI service. "
+             "Can also be set via AZURE_OPENAI_ENDPOINT environment variable."
+    )
+    azure_group.add_argument(
+        "--azure_api_version",
+        default=os.environ.get("AZURE_OPENAI_API_VERSION"),
+        help="API version for Azure OpenAI service (e.g., '2024-02-01'). "
+             "Can also be set via AZURE_OPENAI_API_VERSION environment variable."
+    )
+
+    # --- Ollama Engine Arguments ---
+    ollama_group = parser.add_argument_group("Ollama Options")
+    ollama_group.add_argument(
+        "--ollama_host",
+        default="http://localhost:11434",
+        help="Host URL for the Ollama server."
+    )
+    ollama_group.add_argument(
+        "--ollama_num_ctx",
+        type=int,
+        default=4096,
+        help="Context length for Ollama models."
+    )
+    ollama_group.add_argument(
+        "--ollama_keep_alive",
+        type=int,
+        default=300,  # Default from OllamaVLMEngine
+        help="Seconds to keep the Ollama model loaded after the last call."
+    )
+
+
+    # --- OCR Engine Parameters ---
+    ocr_params_group = parser.add_argument_group("OCR Engine Parameters")
+    ocr_params_group.add_argument(
+        "--user_prompt",
+        help="Optional: Custom user prompt to provide context about the image/PDF/TIFF."
+    )
+    # REMOVED --system_prompt argument
+    ocr_params_group.add_argument(
+        "--max_new_tokens",
+        type=int,
+        default=4096,  # Default from OCREngine
+        help="Maximum number of new tokens the VLM can generate."
+    )
+    ocr_params_group.add_argument(
+        "--temperature",
+        type=float,
+        default=0.0,  # Default from OCREngine
+        help="Temperature for token sampling (0.0 for deterministic output)."
+    )
+
+    # --- Processing Options ---
+    processing_group = parser.add_argument_group("Processing Options")
+    processing_group.add_argument(
+        "--concurrent",
+        action="store_true",
+        help="Enable concurrent processing for multiple files or PDF/TIFF pages."
+    )
+    processing_group.add_argument(
+        "--concurrent_batch_size",
+        type=int,
+        default=32,
+        help="Batch size for concurrent processing."
+    )
+    processing_group.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable verbose output from the OCR engine during processing. CLI will also log more info."
+    )
+    processing_group.add_argument(
+        "--debug",
+        action="store_true",
+        help="Enable debug level logging for more detailed information."
+    )
+
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)
+        logger.debug("Debug mode enabled.")
+        logger.debug(f"Parsed arguments: {args}")
+    elif args.verbose:
+        logger.setLevel(logging.INFO)  # Ensure logger level is at least INFO for verbose CLI output
+
+    # --- Validate Arguments ---
+    # verbose is not supported with concurrent processing
+    if args.verbose and args.concurrent:
+        logger.warning("Verbose output is not supported with concurrent processing. "
+                       "Verbose mode will be ignored.")
+        args.verbose = False
+
+    # --- Initialize VLM Engine ---
+    vlm_engine_instance = None
+    try:
+        logger.info(f"Initializing VLM engine: {args.vlm_engine} with model: {args.model}")
+        if args.vlm_engine == "openai":
+            if not args.api_key:
+                parser.error("--api_key (or OPENAI_API_KEY env var) is required for OpenAI engine.")
+            vlm_engine_instance = OpenAIVLMEngine(
+                model=args.model,
+                api_key=args.api_key
+                # reasoning_model removed
+            )
+        elif args.vlm_engine == "openai_compatible":
+            if not args.api_key:
+                logger.warning("API key not provided or empty for openai_compatible. This might be acceptable for some servers (e.g. if 'EMPTY' is expected).")
+            if not args.base_url:
+                parser.error("--base_url is required for openai_compatible engine.")
+            vlm_engine_instance = OpenAIVLMEngine(
+                model=args.model,
+                api_key=args.api_key,
+                base_url=args.base_url
+                # reasoning_model removed
+            )
+        elif args.vlm_engine == "azure_openai":
+            if not args.azure_api_key:
+                parser.error("--azure_api_key (or AZURE_OPENAI_API_KEY env var) is required for Azure OpenAI engine.")
+            if not args.azure_endpoint:
+                parser.error("--azure_endpoint (or AZURE_OPENAI_ENDPOINT env var) is required for Azure OpenAI engine.")
+            if not args.azure_api_version:
+                parser.error("--azure_api_version (or AZURE_OPENAI_API_VERSION env var) is required for Azure OpenAI engine.")
+            vlm_engine_instance = AzureOpenAIVLMEngine(
+                model=args.model,
+                api_key=args.azure_api_key,
+                azure_endpoint=args.azure_endpoint,
+                api_version=args.azure_api_version
+                # reasoning_model removed
+            )
+        elif args.vlm_engine == "ollama":
+            vlm_engine_instance = OllamaVLMEngine(
+                model_name=args.model,  # OllamaVLMEngine expects model_name
+                host=args.ollama_host,
+                num_ctx=args.ollama_num_ctx,
+                keep_alive=args.ollama_keep_alive
+            )
+        else:
+            # This case should be caught by argparse choices, but as a safeguard:
+            logger.error(f"Invalid VLM engine specified: {args.vlm_engine}")
+            sys.exit(1)
+        logger.info("VLM engine initialized successfully.")
+
+    except ImportError as e:
+        logger.error(f"Failed to import a required library for {args.vlm_engine}: {e}. "
+                     "Please ensure the necessary dependencies (e.g., 'openai', 'ollama') are installed.")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"Error initializing VLM engine '{args.vlm_engine}': {e}")
+        if args.debug:
+            logger.exception("Traceback for VLM engine initialization error:")
+        sys.exit(1)
+
+    # --- Initialize OCR Engine ---
+    try:
+        logger.info(f"Initializing OCR engine with output mode: {args.output_mode}")
+        ocr_engine_instance = OCREngine(
+            vlm_engine=vlm_engine_instance,
+            output_mode=args.output_mode,
+            # system_prompt removed, OCREngine will use its default
+            user_prompt=args.user_prompt
+        )
+        logger.info("OCR engine initialized successfully.")
+    except Exception as e:
+        logger.error(f"Error initializing OCR engine: {e}")
+        if args.debug:
+            logger.exception("Traceback for OCR engine initialization error:")
+        sys.exit(1)
+
+    # --- Prepare input file paths ---
+    input_files_to_process = []
+    if os.path.isdir(args.input_path):
+        logger.info(f"Input path is a directory: {args.input_path}. Scanning for supported files...")
+        for item in os.listdir(args.input_path):
+            item_path = os.path.join(args.input_path, item)
+            if os.path.isfile(item_path):
+                file_ext = os.path.splitext(item)[1].lower()
+                if file_ext in SUPPORTED_IMAGE_EXTS_CLI:
+                    input_files_to_process.append(item_path)
+        if not input_files_to_process:
+            logger.error(f"No supported files (PDF, TIFF, PNG, JPG, etc.) found in directory: {args.input_path}")
+            sys.exit(1)
+        logger.info(f"Found {len(input_files_to_process)} supported files to process.")
+    elif os.path.isfile(args.input_path):
+        file_ext = os.path.splitext(args.input_path)[1].lower()
+        if file_ext not in SUPPORTED_IMAGE_EXTS_CLI:
+            logger.error(f"Input file '{args.input_path}' is not a supported file type. Supported: {SUPPORTED_IMAGE_EXTS_CLI}")
+            sys.exit(1)
+        input_files_to_process = [args.input_path]
+        logger.info(f"Processing single input file: {args.input_path}")
+    else:
+        logger.error(f"Input path is not a valid file or directory: {args.input_path}")
+        sys.exit(1)
+
+
+    # --- Run OCR ---
+    try:
+        logger.info("Starting OCR processing...")
+        ocr_results_list = ocr_engine_instance.run_ocr(
+            file_paths=input_files_to_process,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            verbose=args.verbose,
+            concurrent=args.concurrent,
+            concurrent_batch_size=args.concurrent_batch_size
+        )
+        logger.info("OCR processing completed.")
+
+        # --- Handle Output ---
+        if args.output_file:
+            if os.path.isdir(args.input_path) and len(input_files_to_process) > 1:
+                if not os.path.exists(args.output_file):
+                    logger.info(f"Creating output directory: {args.output_file}")
+                    os.makedirs(args.output_file, exist_ok=True)
+                elif not os.path.isdir(args.output_file):
+                    logger.error(f"Output path '{args.output_file}' exists and is not a directory, "
+                                 "but multiple input files were processed. Please specify a directory for --output_file.")
+                    sys.exit(1)
+
+                output_target_dir = args.output_file
+            elif not (os.path.isdir(args.input_path) and len(input_files_to_process) > 1):
+                # Single input file, or directory with one file. output_file is a direct file path.
+                # Ensure its directory exists.
+                output_target_dir = os.path.dirname(args.output_file)
+                if output_target_dir and not os.path.exists(output_target_dir):
+                    logger.info(f"Creating output directory: {output_target_dir}")
+                    os.makedirs(output_target_dir, exist_ok=True)
+            else:  # Should not happen if logic above is correct
+                output_target_dir = os.getcwd()
+
+
+            for i, input_file_path in enumerate(input_files_to_process):
+                if os.path.isdir(args.input_path) and len(input_files_to_process) > 1:
+                    # Multiple inputs, save into the directory specified by args.output_file
+                    base_name = os.path.basename(input_file_path)
+                    name_part, _ = os.path.splitext(base_name)
+                    output_filename = f"{name_part}_ocr{OUTPUT_EXTENSIONS[args.output_mode]}"
+                    full_output_path = os.path.join(args.output_file, output_filename)
+                else:
+                    # Single input, args.output_file is the exact path
+                    full_output_path = args.output_file
+
+                try:
+                    with open(full_output_path, "w", encoding="utf-8") as f:
+                        f.write(ocr_results_list[i])
+                    logger.info(f"OCR result for '{input_file_path}' saved to: {full_output_path}")
+                except Exception as e:
+                    logger.error(f"Error writing output for '{input_file_path}' to '{full_output_path}': {e}")
+        else:
+            # No --output_file specified, save to current working directory
+            current_dir = os.getcwd()
+            logger.info(f"No --output_file specified. Results will be saved to the current working directory: {current_dir}")
+            for i, input_file_path in enumerate(input_files_to_process):
+                base_name = os.path.basename(input_file_path)
+                name_part, _ = os.path.splitext(base_name)
+                output_filename = f"{name_part}_ocr{OUTPUT_EXTENSIONS[args.output_mode]}"
+                full_output_path = os.path.join(current_dir, output_filename)
+                try:
+                    with open(full_output_path, "w", encoding="utf-8") as f:
+                        f.write(ocr_results_list[i])
+                    logger.info(f"OCR result for '{input_file_path}' saved to: {full_output_path}")
+                except Exception as e:
+                    logger.error(f"Error writing output for '{input_file_path}' to '{full_output_path}': {e}")

+    except FileNotFoundError as e:
+        logger.error(f"File not found during OCR processing: {e}")
+        sys.exit(1)
+    except ValueError as e:
+        logger.error(f"Input Error or Value Error during processing: {e}")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"An unexpected error occurred during OCR processing: {e}")
+        if args.debug:
+            logger.exception("Traceback for OCR processing error:")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
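For reference, the pieces cli.py wires together can also be used as a library. A minimal sketch based only on the signatures visible in this diff; the file names are hypothetical placeholders and the keyword values mirror the CLI defaults:

    from vlm4ocr.ocr_engines import OCREngine
    from vlm4ocr.vlm_engines import OllamaVLMEngine

    # Build a VLM engine, then an OCR engine, the same way main() does
    vlm = OllamaVLMEngine(model_name="llava:latest",
                          host="http://localhost:11434",
                          num_ctx=4096, keep_alive=300)
    ocr = OCREngine(vlm_engine=vlm, output_mode="markdown")

    # run_ocr accepts a single path or an iterable of paths
    results = ocr.run_ocr(file_paths=["scan_1.png", "report.pdf"],
                          max_new_tokens=4096, temperature=0.0)
    for text in results:
        print(text)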
{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/ocr_engines.py

@@ -2,13 +2,14 @@ import os
 from typing import List, Dict, Union, Generator, Iterable
 import importlib
 import asyncio
-from vlm4ocr.utils import get_images_from_pdf, get_image_from_file, clean_markdown
+from vlm4ocr.utils import get_images_from_pdf, get_images_from_tiff, get_image_from_file, clean_markdown
 from vlm4ocr.vlm_engines import VLMEngine
 
-SUPPORTED_IMAGE_EXTS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
+SUPPORTED_IMAGE_EXTS = ['.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp']
+
 
 class OCREngine:
-    def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None, page_delimiter:str="\n\n---\n\n"):
+    def __init__(self, vlm_engine:VLMEngine, output_mode:str="markdown", system_prompt:str=None, user_prompt:str=None, page_delimiter:str="auto"):
         """
         This class inputs an image or PDF file path and processes them using a VLM inference engine. Outputs plain text or markdown.
 
@@ -17,13 +18,17 @@ class OCREngine:
         inference_engine : InferenceEngine
             The inference engine to use for OCR.
         output_mode : str, Optional
-            The output format.
+            The output format. Must be 'markdown', 'HTML', or 'text'.
         system_prompt : str, Optional
             Custom system prompt. We recommend using the default system prompt by leaving this blank.
         user_prompt : str, Optional
             Custom user prompt. It is good to include some information regarding the document. If not specified, a default will be used.
         page_delimiter : str, Optional
             The delimiter to use between PDF pages.
+            If 'auto', it will be set to the default page delimiter for the output mode:
+                'markdown' -> '\n\n---\n\n'
+                'HTML' -> '<br><br>'
+                'text' -> '\n\n---\n\n'
         """
         # Check inference engine
         if not isinstance(vlm_engine, VLMEngine):
@@ -31,8 +36,8 @@ class OCREngine:
         self.vlm_engine = vlm_engine
 
         # Check output mode
-        if output_mode not in ["markdown", "text"]:
-            raise ValueError("output_mode must be 'markdown' or 'text'")
+        if output_mode not in ["markdown", "HTML", "text"]:
+            raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'")
         self.output_mode = output_mode
 
         # System prompt
@@ -47,13 +52,21 @@ class OCREngine:
         if isinstance(user_prompt, str) and user_prompt:
             self.user_prompt = user_prompt
         else:
-            file_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath('ocr_user_prompt.txt')
+            file_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_user_prompt.txt')
             with open(file_path, 'r', encoding='utf-8') as f:
                 self.user_prompt = f.read()
 
         # Page delimiter
         if isinstance(page_delimiter, str):
-            self.page_delimiter = page_delimiter
+            if page_delimiter == "auto":
+                if self.output_mode == "markdown":
+                    self.page_delimiter = "\n\n---\n\n"
+                elif self.output_mode == "HTML":
+                    self.page_delimiter = "<br><br>"
+                else:
+                    self.page_delimiter = "\n\n---\n\n"
+            else:
+                self.page_delimiter = page_delimiter
         else:
             raise ValueError("page_delimiter must be a string")
 
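With the new 'auto' default, the page delimiter follows the output mode, while any other string is still used verbatim. A brief sketch, reusing the vlm engine from the sketch above; the form-feed delimiter is a hypothetical choice:

    # 'auto' resolves to '\n\n---\n\n' for markdown and text, '<br><br>' for HTML
    ocr_html = OCREngine(vlm_engine=vlm, output_mode="HTML")
    # An explicit string keeps the old behavior of being used as-is
    ocr_custom = OCREngine(vlm_engine=vlm, output_mode="text", page_delimiter="\f")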
@@ -61,16 +74,17 @@ class OCREngine:
     def stream_ocr(self, file_path: str, max_new_tokens:int=4096, temperature:float=0.0, **kwrs) -> Generator[str, None, None]:
         """
         This method inputs a file path (image or PDF) and streams OCR results in real-time. This is useful for frontend applications.
+        Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.
 
         Parameters:
         -----------
         file_path : str
-            The path to the image or PDF file. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+            The path to the image or PDF file. Must be one of '.pdf', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
         max_new_tokens : int, Optional
             The maximum number of tokens to generate.
         temperature : float, Optional
             The temperature to use for sampling.
-
+
         Returns:
         --------
         Generator[str, None, None]
@@ -82,14 +96,14 @@ class OCREngine:
 
         # Check file extension
         file_ext = os.path.splitext(file_path)[1].lower()
-        if file_ext not in SUPPORTED_IMAGE_EXTS + ['.pdf']:
-            raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS + ['.pdf']}")
+        if file_ext not in SUPPORTED_IMAGE_EXTS:
+            raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
 
-        # PDF
-        if file_ext == '.pdf':
-            images = get_images_from_pdf(file_path)
+        # PDF or TIFF
+        if file_ext in ['.pdf', '.tif', '.tiff']:
+            images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
             if not images:
-                raise ValueError(f"No images extracted from PDF: {file_path}")
+                raise ValueError(f"No images extracted from file: {file_path}")
             for i, image in enumerate(images):
                 messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
                 response_stream = self.vlm_engine.chat(
@@ -100,10 +114,10 @@ class OCREngine:
                     **kwrs
                 )
                 for chunk in response_stream:
-                    yield chunk
+                    yield {"type": "ocr_chunk", "data": chunk}
 
                 if i < len(images) - 1:
-                    yield self.page_delimiter
+                    yield {"type": "page_delimiter", "data": self.page_delimiter}
 
         # Image
         else:
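Because stream_ocr now yields typed dictionaries rather than bare strings, a consumer can render page boundaries separately from OCR text. A small consumption sketch, assuming an OCREngine instance ocr as above; the input path is hypothetical:

    for event in ocr.stream_ocr("report.pdf"):
        if event["type"] == "ocr_chunk":
            print(event["data"], end="", flush=True)  # OCR text as it streams in
        elif event["type"] == "page_delimiter":
            print(event["data"])                      # marks a page boundary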
@@ -117,18 +131,18 @@ class OCREngine:
                 **kwrs
             )
             for chunk in response_stream:
-                yield chunk
+                yield {"type": "ocr_chunk", "data": chunk}
 
 
     def run_ocr(self, file_paths: Union[str, Iterable[str]], max_new_tokens:int=4096, temperature:float=0.0,
                 verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32, **kwrs) -> Union[str, Generator[str, None, None]]:
         """
-        This method takes a list of file paths (image or PDF) and performs OCR using the VLM inference engine.
+        This method takes a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
 
         Parameters:
         -----------
         file_paths : Union[str, Iterable[str]]
-            A file path or a list of file paths to process. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+            A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
         max_new_tokens : int, Optional
             The maximum number of tokens to generate.
         temperature : float, Optional
@@ -152,9 +166,8 @@ class OCREngine:
             if not isinstance(file_path, str):
                 raise TypeError("file_paths must be a string or an iterable of strings")
             file_ext = os.path.splitext(file_path)[1].lower()
-            if file_ext not in SUPPORTED_IMAGE_EXTS + ['.pdf']:
-                raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS + ['.pdf']}")
-
+            if file_ext not in SUPPORTED_IMAGE_EXTS:
+                raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
 
         # Concurrent processing
         if concurrent:
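When concurrent=True, run_ocr takes the concurrent path, which flattens every PDF/TIFF page into a single work list (see the @@ -248,13 +261,13 @@ hunk below) and processes it in batches. A sketch of enabling it, with hypothetical file names:

    texts = ocr.run_ocr(
        file_paths=["a.pdf", "b.tiff", "c.png"],
        concurrent=True,
        concurrent_batch_size=8,  # pages processed per batch
    )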
@@ -178,12 +191,12 @@ class OCREngine:
     def _run_ocr(self, file_paths: Union[str, Iterable[str]], max_new_tokens:int=4096,
                  temperature:float=0.0, verbose:bool=False, **kwrs) -> Iterable[str]:
         """
-        This method inputs a file path or a list of file paths (image or PDF) and performs OCR using the VLM inference engine.
+        This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
 
         Parameters:
         -----------
         file_paths : Union[str, Iterable[str]]
-            A file path or a list of file paths to process. Must be one of '.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
+            A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
         max_new_tokens : int, Optional
             The maximum number of tokens to generate.
         temperature : float, Optional
@@ -199,12 +212,12 @@ class OCREngine:
         ocr_results = []
         for file_path in file_paths:
             file_ext = os.path.splitext(file_path)[1].lower()
-            # PDF
-            if file_ext == '.pdf':
-                images = get_images_from_pdf(file_path)
+            # PDF or TIFF
+            if file_ext in ['.pdf', '.tif', '.tiff']:
+                images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
                 if not images:
-                    raise ValueError(f"No images extracted from PDF: {file_path}")
-
+                    raise ValueError(f"No images extracted from file: {file_path}")
+                results = []
                 for image in images:
                     messages = self.vlm_engine.get_ocr_messages(self.system_prompt, self.user_prompt, image)
                     response = self.vlm_engine.chat(
@@ -215,9 +228,9 @@ class OCREngine:
                         stream=False,
                         **kwrs
                     )
-
+                    results.append(response)
 
-                ocr_text = self.page_delimiter.join(
+                ocr_text = self.page_delimiter.join(results)
             # Image
             else:
                 image = get_image_from_file(file_path)
@@ -248,13 +261,13 @@ class OCREngine:
         flat_page_list = []
         for file_path in file_paths:
             file_ext = os.path.splitext(file_path)[1].lower()
-            # PDF
-            if file_ext == '.pdf':
-                images = get_images_from_pdf(file_path)
+            # PDF or TIFF
+            if file_ext in ['.pdf', '.tif', '.tiff']:
+                images = get_images_from_pdf(file_path) if file_ext == '.pdf' else get_images_from_tiff(file_path)
                 if not images:
-                    flat_page_list.append({'file_path': file_path, 'file_type': "PDF", "image": image, "page_num": 0, "total_page_count": 0})
+                    flat_page_list.append({'file_path': file_path, 'file_type': "PDF/TIFF", "image": image, "page_num": 0, "total_page_count": 0})
                 for page_num, image in enumerate(images):
-                    flat_page_list.append({'file_path': file_path, 'file_type': "PDF", "image": image, "page_num": page_num, "total_page_count": len(images)})
+                    flat_page_list.append({'file_path': file_path, 'file_type': "PDF/TIFF", "image": image, "page_num": page_num, "total_page_count": len(images)})
             # Image
             else:
                 image = get_image_from_file(file_path)
@@ -291,16 +304,16 @@ class OCREngine:
 
         # Restructure the results
         ocr_results = []
-
+        page_text_buffer = ""
         for page, ocr_text in zip(flat_page_list, responses):
-            # PDF
-            if page['file_type'] == "PDF":
-
+            # PDF or TIFF
+            if page['file_type'] == "PDF/TIFF":
+                page_text_buffer += ocr_text + self.page_delimiter
                 if page['page_num'] == page['total_page_count'] - 1:
                     if self.output_mode == "markdown":
-
-                        ocr_results.append(
-
+                        page_text_buffer = clean_markdown(page_text_buffer)
+                        ocr_results.append(page_text_buffer)
+                        page_text_buffer = ""
             # Image
             if page['file_type'] == "image":
                 if self.output_mode == "markdown":
{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/utils.py

@@ -16,17 +16,34 @@ def get_images_from_pdf(file_path: str) -> List[Image.Image]:
         print(f"Error converting PDF to images: {e}")
         raise ValueError(f"Failed to process PDF file '{os.path.basename(file_path)}'. Ensure poppler is installed and the file is valid.") from e
 
+def get_images_from_tiff(file_path: str) -> List[Image.Image]:
+    """ Extracts images from a TIFF file. """
+    images = []
+    try:
+        img = Image.open(file_path)
+        for i in range(img.n_frames):
+            img.seek(i)
+            images.append(img.copy())
+        if not images:
+            print(f"Warning: No images extracted from TIFF: {file_path}")
+        return images
+    except FileNotFoundError:
+        raise FileNotFoundError(f"TIFF file not found: {file_path}")
+    except Exception as e:
+        print(f"Error processing TIFF file: {e}")
+        raise ValueError(f"Failed to process TIFF file '{os.path.basename(file_path)}'. Ensure the file is a valid TIFF.") from e
+
 
 def get_image_from_file(file_path: str) -> Image.Image:
-
-
-
-
-
-
-
-
-
+    """ Loads a single image file. """
+    try:
+        image = Image.open(file_path)
+        image.load()
+        return image
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Image file not found: {file_path}")
+    except Exception as e:
+        raise ValueError(f"Failed to load image file '{os.path.basename(file_path)}': {e}") from e
 
 
 def image_to_base64(image:Image.Image, format:str="png") -> str:
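The new get_images_from_tiff walks a multi-frame TIFF with Image.seek and Image.copy, returning one PIL image per page. A quick usage sketch; the file name is a hypothetical placeholder:

    from vlm4ocr.utils import get_images_from_tiff

    pages = get_images_from_tiff("multipage_scan.tiff")
    print(f"Extracted {len(pages)} page(s)")
    for i, page in enumerate(pages):
        page.save(f"page_{i}.png")  # e.g., inspect the extracted pages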
{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/README.md
File without changes

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/__init__.py
File without changes

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/assets/default_prompt_templates/ocr_markdown_system_prompt.txt
File without changes

/vlm4ocr-0.0.1/vlm4ocr/assets/default_prompt_templates/ocr_user_prompt.txt → /vlm4ocr-0.1.0/vlm4ocr/assets/default_prompt_templates/ocr_markdown_user_prompt.txt
RENAMED
File without changes

{vlm4ocr-0.0.1 → vlm4ocr-0.1.0}/vlm4ocr/assets/default_prompt_templates/ocr_text_system_prompt.txt
RENAMED
File without changes