openocr-python 0.0.9__py3-none-any.whl → 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openocr/__init__.py +35 -1
- openocr/configs/dataset/rec/evaluation.yaml +41 -0
- openocr/configs/dataset/rec/ltb.yaml +9 -0
- openocr/configs/dataset/rec/mjsynth.yaml +11 -0
- openocr/configs/dataset/rec/openvino.yaml +25 -0
- openocr/configs/dataset/rec/ost.yaml +17 -0
- openocr/configs/dataset/rec/synthtext.yaml +7 -0
- openocr/configs/dataset/rec/test.yaml +77 -0
- openocr/configs/dataset/rec/textocr.yaml +13 -0
- openocr/configs/dataset/rec/textocr_horizontal.yaml +13 -0
- openocr/configs/dataset/rec/union14m_b.yaml +47 -0
- openocr/configs/dataset/rec/union14m_l_filtered.yaml +35 -0
- openocr/configs/rec/cmer/cmer.yml +127 -0
- openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_base.yml +152 -0
- openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_small.yml +152 -0
- openocr/configs/rec/unirec/focalsvtr_ardecoder_unirec.yml +114 -0
- openocr/configs/rec/unirec/opendoc_pipeline.yml +105 -0
- openocr/demo_gradio.py +28 -8
- openocr/demo_opendoc.py +572 -0
- openocr/demo_unirec.py +392 -0
- openocr/opendet/losses/__init__.py +5 -7
- openocr/opendet/preprocess/crop_resize.py +2 -1
- openocr/openocr.py +685 -0
- openocr/openrec/losses/__init__.py +8 -3
- openocr/openrec/losses/cmer_loss.py +12 -0
- openocr/openrec/losses/mdiff_loss.py +11 -0
- openocr/openrec/losses/unirec_loss.py +12 -0
- openocr/openrec/metrics/__init__.py +4 -1
- openocr/openrec/metrics/rec_metric_cmer.py +328 -0
- openocr/openrec/modeling/cmer_modeling/modeling_cmer.py +643 -0
- openocr/openrec/modeling/decoders/__init__.py +1 -0
- openocr/openrec/modeling/decoders/ctc_decoder.py +1 -1
- openocr/openrec/modeling/decoders/dan_decoder.py +4 -4
- openocr/openrec/modeling/decoders/dptr_parseq_clip_b_decoder.py +1563 -1398
- openocr/openrec/modeling/decoders/mdiff_decoder.py +587 -0
- openocr/openrec/modeling/decoders/smtr_decoder.py +99 -48
- openocr/openrec/modeling/unirec_modeling/configuration_unirec.py +166 -0
- openocr/openrec/modeling/unirec_modeling/modeling_unirec.py +433 -0
- openocr/openrec/optimizer/__init__.py +4 -3
- openocr/openrec/optimizer/lr.py +49 -0
- openocr/openrec/postprocess/__init__.py +2 -0
- openocr/openrec/postprocess/abinet_postprocess.py +1 -1
- openocr/openrec/postprocess/ar_postprocess.py +1 -1
- openocr/openrec/postprocess/cmer_postprocess.py +86 -0
- openocr/openrec/postprocess/cppd_postprocess.py +1 -1
- openocr/openrec/postprocess/igtr_postprocess.py +1 -1
- openocr/openrec/postprocess/lister_postprocess.py +1 -1
- openocr/openrec/postprocess/mgp_postprocess.py +1 -1
- openocr/openrec/postprocess/nrtr_postprocess.py +2 -2
- openocr/openrec/postprocess/smtr_postprocess.py +1 -1
- openocr/openrec/postprocess/srn_postprocess.py +1 -1
- openocr/openrec/postprocess/unirec_postprocess.py +58 -0
- openocr/openrec/postprocess/visionlan_postprocess.py +1 -1
- openocr/openrec/preprocess/__init__.py +5 -0
- openocr/openrec/preprocess/ce_label_encode.py +1 -1
- openocr/openrec/preprocess/cmer_label_encode.py +1025 -0
- openocr/openrec/preprocess/ctc_label_encode.py +1 -1
- openocr/openrec/preprocess/dptr_label_encode.py +177 -157
- openocr/openrec/preprocess/igtr_label_encode.py +4 -2
- openocr/openrec/preprocess/mdiff_label_encode.py +312 -0
- openocr/openrec/preprocess/rec_aug.py +128 -2
- openocr/openrec/preprocess/resize.py +57 -0
- openocr/openrec/preprocess/unirec_label_encode.py +62 -0
- openocr/tools/data/__init__.py +78 -55
- openocr/tools/data/cmer_web_dataset.py +310 -0
- openocr/tools/data/native_size_dataset.py +753 -0
- openocr/tools/data/native_size_sampler.py +158 -0
- openocr/tools/data/ratio_dataset_tvresize.py +2 -0
- openocr/tools/data/ratio_sampler.py +2 -1
- openocr/tools/download/download_dataset.py +38 -0
- openocr/tools/download/utils.py +28 -0
- openocr/tools/download_example_images.py +236 -0
- openocr/tools/engine/trainer.py +155 -39
- openocr/tools/eval_rec_all_ch.py +2 -2
- openocr/tools/infer_det.py +20 -2
- openocr/tools/infer_doc.py +898 -0
- openocr/tools/infer_doc_onnx.py +1172 -0
- openocr/tools/infer_e2e.py +27 -10
- openocr/tools/infer_rec.py +64 -15
- openocr/tools/infer_unirec_onnx.py +730 -0
- openocr/tools/to_markdown.py +468 -0
- openocr/tools/utils/ckpt.py +17 -5
- openocr/tools/utils/opendoc_onnx_utils/utils.py +1052 -0
- openocr_python-0.1.0.dev0.dist-info/METADATA +324 -0
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/RECORD +89 -45
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/WHEEL +1 -1
- openocr_python-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- openocr_python-0.0.9.dist-info/METADATA +0 -149
- /openocr_python-0.0.9.dist-info/LICENCE → /openocr_python-0.1.0.dev0.dist-info/licenses/LICENSE +0 -0
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/top_level.txt +0 -0
openocr/openocr.py
ADDED
|
@@ -0,0 +1,685 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OpenOCR Unified Interface
|
|
3
|
+
Provides a single entry point for all OCR tasks with task-based dispatching.
|
|
4
|
+
|
|
5
|
+
Supported tasks:
|
|
6
|
+
- 'ocr': End-to-end OCR (detection + recognition)
|
|
7
|
+
- 'det': Text detection only
|
|
8
|
+
- 'rec': Text recognition only
|
|
9
|
+
- 'unirec': Universal recognition with VLM
|
|
10
|
+
- 'doc': Document OCR with layout analysis
|
|
11
|
+
- 'launch_openocr_demo': Launch OpenOCR Gradio demo
|
|
12
|
+
- 'launch_unirec_demo': Launch UniRec Gradio demo
|
|
13
|
+
- 'launch_opendoc_demo': Launch OpenDoc Gradio demo
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import absolute_import
|
|
17
|
+
from __future__ import division
|
|
18
|
+
from __future__ import print_function
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import sys
|
|
22
|
+
import argparse
|
|
23
|
+
from typing import Optional, Dict
|
|
24
|
+
|
|
25
|
+
# Directory that contains this module. It is deliberately NOT stored under the
# name ``__dir__``: a module-level ``__dir__`` is the PEP 562 hook that
# ``dir(module)`` calls, so binding a string to it makes ``dir()`` on this
# module raise ``TypeError: 'str' object is not callable``.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))

# Make sibling modules (``tools``, ``demo_gradio``, ...) importable by their
# bare names: this directory is appended, its parent is given top priority.
sys.path.append(_MODULE_DIR)
sys.path.insert(0, os.path.abspath(os.path.join(_MODULE_DIR, '..')))
|
|
28
|
+
|
|
29
|
+
from tools.utils.logging import get_logger
|
|
30
|
+
|
|
31
|
+
# Module-level logger used by both the OpenOCR class and the CLI entry point.
logger = get_logger(name='openocr_unified')
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class OpenOCR:
    """Unified OpenOCR interface that dispatches to task-specific backends.

    Supported tasks:
        - 'det': Text detection only
        - 'rec': Text recognition only
        - 'ocr': End-to-end OCR (text detection + recognition)
        - 'unirec': Universal recognition with Vision-Language Model
        - 'doc': Document OCR with layout analysis (tables, formulas, etc.)
        - 'launch_openocr_demo': Launch OpenOCR Gradio demo
        - 'launch_unirec_demo': Launch UniRec Gradio demo
        - 'launch_opendoc_demo': Launch OpenDoc Gradio demo

    The ``launch_*_demo`` tasks are accepted for validation purposes only:
    no model is built for them and such an instance cannot be called.
    """

    # Tasks for which a backend model is created in ``__init__``.
    _MODEL_TASKS = ('det', 'rec', 'ocr', 'unirec', 'doc')
    # Tasks that are only meaningful from the command line (see ``main``).
    _DEMO_TASKS = (
        'launch_openocr_demo',
        'launch_unirec_demo',
        'launch_opendoc_demo',
    )

    def __init__(
        self,
        task: str = 'ocr',
        # Common parameters
        use_gpu: str = 'auto',
        # OCR task parameters
        mode: str = 'mobile',
        backend: str = 'onnx',
        onnx_det_model_path: Optional[str] = None,
        onnx_rec_model_path: Optional[str] = None,
        drop_score: float = 0.5,
        det_box_type: str = 'quad',
        # UniRec task parameters
        unirec_encoder_path: Optional[str] = None,
        unirec_decoder_path: Optional[str] = None,
        tokenizer_mapping_path: Optional[str] = None,
        max_length: int = 2048,
        # Doc task parameters
        layout_model_path: Optional[str] = None,
        layout_threshold: float = 0.5,
        use_layout_detection: bool = True,
        use_chart_recognition: bool = True,
        auto_download: bool = True,
    ):
        """Validate ``task`` and build the matching backend model.

        Only the arguments relevant to the selected task are forwarded to
        the backend; the others are ignored.

        Args:
            task: Task type ('ocr', 'det', 'rec', 'unirec', 'doc',
                'launch_openocr_demo', 'launch_unirec_demo',
                'launch_opendoc_demo'). Case-insensitive.

            # Common parameters
            use_gpu: GPU usage strategy ('auto', 'true', or 'false')

            # OCR task parameters
            mode: Model mode ('mobile' or 'server'); used by 'rec' and 'ocr'
            backend: Backend type ('onnx'); used by 'det', 'rec' and 'ocr'
            onnx_det_model_path: Path to detection ONNX model
            onnx_rec_model_path: Path to recognition ONNX model
            drop_score: Score threshold for filtering results ('ocr' only)
            det_box_type: Detection box type ('quad' or 'poly'; 'ocr' only)

            # UniRec task parameters
            unirec_encoder_path: Path to UniRec encoder ONNX model
            unirec_decoder_path: Path to UniRec decoder ONNX model
            tokenizer_mapping_path: Path to tokenizer mapping JSON
            max_length: Maximum generation length. Stored on the instance
                and used as the default for 'unirec' calls that do not pass
                ``max_length`` explicitly.

            # Doc task parameters
            layout_model_path: Path to layout detection model
            layout_threshold: Layout detection threshold
            use_layout_detection: Whether to use layout detection
            use_chart_recognition: Whether to recognize charts
            auto_download: Whether to auto-download missing models
                ('unirec' and 'doc')

        Raises:
            ValueError: If ``task`` is not one of the supported tasks.
        """
        self.task = task.lower()
        self.model = None
        # Kept so that the constructor value is honoured at call time
        # instead of being silently discarded.
        self.max_length = max_length

        # Validate task
        valid_tasks = [*self._MODEL_TASKS, *self._DEMO_TASKS]
        if self.task not in valid_tasks:
            raise ValueError(f"Invalid task '{task}'. Must be one of {valid_tasks}")

        logger.info(f"Initializing OpenOCR with task: {self.task}")

        # Demo tasks don't need model initialization
        if self.task in self._DEMO_TASKS:
            logger.info(f"Demo task '{self.task}' will be launched via command line")
            return

        # Initialize task-specific model
        if self.task == 'det':
            self._init_det_task(
                backend=backend,
                onnx_model_path=onnx_det_model_path,
                use_gpu=use_gpu,
            )
        elif self.task == 'rec':
            self._init_rec_task(
                mode=mode,
                backend=backend,
                onnx_model_path=onnx_rec_model_path,
                use_gpu=use_gpu,
            )
        elif self.task == 'ocr':
            self._init_ocr_task(
                mode=mode,
                backend=backend,
                onnx_det_model_path=onnx_det_model_path,
                onnx_rec_model_path=onnx_rec_model_path,
                drop_score=drop_score,
                det_box_type=det_box_type,
                use_gpu=use_gpu,
            )
        elif self.task == 'unirec':
            self._init_unirec_task(
                encoder_path=unirec_encoder_path,
                decoder_path=unirec_decoder_path,
                mapping_path=tokenizer_mapping_path,
                use_gpu=use_gpu,
                auto_download=auto_download,
            )
        elif self.task == 'doc':
            self._init_doc_task(
                layout_model_path=layout_model_path,
                unirec_encoder_path=unirec_encoder_path,
                unirec_decoder_path=unirec_decoder_path,
                tokenizer_mapping_path=tokenizer_mapping_path,
                use_gpu=use_gpu,
                layout_threshold=layout_threshold,
                use_layout_detection=use_layout_detection,
                use_chart_recognition=use_chart_recognition,
                auto_download=auto_download,
            )

        logger.info(f"✅ OpenOCR initialized successfully for task: {self.task}")

    # The backend modules are imported lazily inside each initializer so that
    # only the module for the selected task is loaded.

    def _init_det_task(self, **kwargs):
        """Initialize detection task"""
        from tools.infer_det import OpenDetector
        self.model = OpenDetector(**kwargs)

    def _init_rec_task(self, **kwargs):
        """Initialize recognition task"""
        from tools.infer_rec import OpenRecognizer
        self.model = OpenRecognizer(**kwargs)

    def _init_ocr_task(self, **kwargs):
        """Initialize OCR task (detection + recognition)"""
        from tools.infer_e2e import OpenOCRE2E
        self.model = OpenOCRE2E(**kwargs)

    def _init_unirec_task(self, **kwargs):
        """Initialize UniRec task (universal recognition)"""
        from tools.infer_unirec_onnx import UniRecONNX
        self.model = UniRecONNX(**kwargs)

    def _init_doc_task(self, **kwargs):
        """Initialize Doc task (document OCR with layout)"""
        from tools.infer_doc_onnx import OpenDocONNX
        self.model = OpenDocONNX(**kwargs)

    def __call__(self, *args, **kwargs):
        """Execute the task with appropriate parameters.

        The first positional argument (or ``image_path=``) is forwarded to
        the backend as ``img_path``; every other keyword argument is passed
        through unchanged.

        For 'det' task:
            Args:
                image_path: Path to image or directory
                return_mask: Whether to return detection mask

        For 'rec' task:
            Args:
                image_path: Path to image or directory
                batch_num: Batch size for recognition (default 1)

        For 'ocr' task:
            Args:
                image_path: Path to image or directory
                is_visualize: Whether to visualize results
                rec_batch_num: Batch size for recognition
                crop_infer: Whether to use crop inference
                return_mask: Whether to return detection mask

        For 'unirec' task:
            Args:
                image_path: Path to image
                max_length: Maximum generation length (defaults to the
                    value given to the constructor)

        For 'doc' task:
            Args:
                image_path: Path to image
                layout_threshold: Layout detection threshold
                max_length: Maximum generation length
                merge_layout_blocks: Whether to merge layout blocks

        Returns:
            Task-specific results

        Raises:
            RuntimeError: If no model is available, which is always the
                case for the ``launch_*_demo`` tasks.
        """
        if self.model is None:
            if self.task in self._DEMO_TASKS:
                raise RuntimeError(
                    f"Task '{self.task}' only launches a demo from the "
                    'command line and cannot be called')
            raise RuntimeError('Model not initialized')

        # Dispatch to appropriate task
        if self.task == 'det':
            return self._call_det(*args, **kwargs)
        elif self.task == 'rec':
            return self._call_rec(*args, **kwargs)
        elif self.task == 'ocr':
            return self._call_ocr(*args, **kwargs)
        elif self.task == 'unirec':
            return self._call_unirec(*args, **kwargs)
        elif self.task == 'doc':
            return self._call_doc(*args, **kwargs)

    def _call_det(self, image_path, **kwargs):
        """Call detection task"""
        return self.model(img_path=image_path, **kwargs)

    def _call_rec(self, image_path, batch_num=1, **kwargs):
        """Call recognition task"""
        return self.model(img_path=image_path, batch_num=batch_num, **kwargs)

    def _call_ocr(self, image_path, **kwargs):
        """Call OCR task"""
        return self.model(img_path=image_path, **kwargs)

    def _call_unirec(self, image_path, max_length=None, **kwargs):
        """Call UniRec task.

        ``max_length`` falls back to the value supplied to the constructor
        (2048 unless overridden) when it is not given here.
        """
        if max_length is None:
            max_length = self.max_length
        return self.model(img_path=image_path, max_length=max_length, **kwargs)

    def _call_doc(self, image_path, **kwargs):
        """Call Doc task"""
        return self.model(img_path=image_path, **kwargs)

    # Additional methods for doc task
    def save_to_json(self, result: Dict, output_path: str):
        """Save doc task results to JSON (only for doc task)"""
        if self.task != 'doc':
            raise RuntimeError("save_to_json is only available for 'doc' task")
        return self.model.save_to_json(result, output_path)

    def save_to_markdown(self, result: Dict, output_path: str):
        """Save doc task results to Markdown (only for doc task)"""
        if self.task != 'doc':
            raise RuntimeError("save_to_markdown is only available for 'doc' task")
        return self.model.save_to_markdown(result, output_path)

    def save_visualization(self, result: Dict, output_path: str):
        """Save doc task visualization (only for doc task)"""
        if self.task != 'doc':
            raise RuntimeError("save_visualization is only available for 'doc' task")
        return self.model.save_visualization(result, output_path)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _cli_build_parser():
    """Build the argument parser of the unified OpenOCR command line."""
    parser = argparse.ArgumentParser(
        description='OpenOCR Unified Interface - Single entry point for all OCR tasks',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Command-line Usage:
  After installation, you can use OpenOCR in three ways:

  1. Using the 'openocr' command (recommended):
     openocr --task ocr --input_path image.jpg

  2. Using 'python -m openocr':
     python -m openocr --task ocr --input_path image.jpg

  3. Running the script directly:
     python openocr.py --task ocr --input_path image.jpg

Examples:
  # Detection task
  openocr --task det --input_path image.jpg

  # Recognition task
  openocr --task rec --input_path image.jpg --mode server

  # OCR task (detection + recognition)
  openocr --task ocr --input_path image.jpg --is_vis

  # OCR with custom output path
  openocr --task ocr --input_path ./images --output_path ./results

  # UniRec task (universal recognition)
  openocr --task unirec --input_path image.jpg --max_length 2048

  # Doc task (document OCR with layout)
  openocr --task doc --input_path document.jpg --save_markdown --save_json

  # Doc task with PDF input
  openocr --task doc --input_path document.pdf --save_markdown --save_json

  # Doc task with custom models
  openocr --task doc --input_path doc.jpg --layout_model path/to/layout.onnx \\
      --encoder_model path/to/encoder.onnx --decoder_model path/to/decoder.onnx

  # Launch OpenOCR Gradio demo
  openocr --task launch_openocr_demo --share

  # Launch UniRec Gradio demo
  openocr --task launch_unirec_demo --server_port 7861

  # Launch OpenDoc Gradio demo
  openocr --task launch_opendoc_demo --share --server_port 7862

For more information, visit: https://github.com/Topdu/OpenOCR
"""
    )

    # Task selection. The option is required, so no default is declared
    # (a default on a required option can never take effect).
    parser.add_argument(
        '--task',
        type=str,
        required=True,
        choices=['det', 'rec', 'ocr', 'unirec', 'doc', 'launch_openocr_demo', 'launch_unirec_demo', 'launch_opendoc_demo'],
        help='Task type: det (detection), rec (recognition), ocr (detection+recognition), unirec (universal recognition), doc (document OCR), launch_*_demo (launch Gradio demo)'
    )

    # Unified input/output parameters
    parser.add_argument('--input_path', type=str, help='Input image/PDF path or directory (unified for all tasks, not required for demo tasks)')
    parser.add_argument('--output_path', type=str, help='Output directory (auto-generated as openocr_output/{task} if not specified)')

    # Demo launch parameters
    parser.add_argument('--share', action='store_true', help='[Demo] Create a public share link')
    parser.add_argument('--server_port', type=int, default=7860, help='[Demo] Server port (default: 7860)')
    parser.add_argument('--server_name', type=str, default='0.0.0.0', help='[Demo] Server name (default: 0.0.0.0)')

    # Common parameters
    parser.add_argument(
        '--use_gpu',
        type=str,
        default='auto',
        choices=['auto', 'true', 'false'],
        help='GPU usage strategy: auto (detect automatically), true (force GPU), false (force CPU)'
    )

    # OCR/Det/Rec task parameters
    parser.add_argument('--mode', type=str, default='mobile', choices=['mobile', 'server'], help='[OCR/Rec] Model mode')
    parser.add_argument('--backend', type=str, default='onnx', choices=['torch', 'onnx'], help='[OCR] Backend type')
    parser.add_argument('--onnx_det_model_path', type=str, help='[OCR] Detection ONNX model path')
    parser.add_argument('--onnx_rec_model_path', type=str, help='[OCR] Recognition ONNX model path')
    parser.add_argument('--drop_score', type=float, default=0.5, help='[OCR] Score threshold')
    parser.add_argument('--det_box_type', type=str, default='quad', choices=['quad', 'poly'], help='[Det/OCR] Box type')
    parser.add_argument('--is_vis', action='store_true', help='[Det/OCR] Visualize results')
    parser.add_argument('--rec_batch_num', type=int, default=6, help='[Rec/OCR] Recognition batch size')
    parser.add_argument('--return_mask', action='store_true', help='[Det] Return detection mask')

    # UniRec task parameters
    parser.add_argument('--encoder_model', type=str, help='[Doc/UniRec] Encoder ONNX model path')
    parser.add_argument('--decoder_model', type=str, help='[Doc/UniRec] Decoder ONNX model path')
    parser.add_argument('--mapping', type=str, help='[UniRec] Tokenizer mapping JSON path')
    parser.add_argument('--max_length', type=int, default=2048, help='[UniRec/Doc] Max generation length')

    # Doc task parameters
    parser.add_argument('--layout_model', type=str, help='[Doc] Layout detection model path')
    parser.add_argument('--tokenizer_mapping', type=str, help='[Doc] Tokenizer mapping path')
    parser.add_argument('--layout_threshold', type=float, default=0.4, help='[Doc] Layout detection threshold')
    # Both flags below write to the same destination and argparse uses the
    # default of the FIRST action registered for a destination. Without an
    # explicit default=True layout detection would be off unless requested
    # and --no_layout_detection would have no effect.
    parser.add_argument('--use_layout_detection', action='store_true', default=True, help='[Doc] Use layout detection (enabled by default)')
    parser.add_argument('--no_layout_detection', dest='use_layout_detection', action='store_false', help='[Doc] Disable layout detection')
    # NOTE(review): this defaults to False while OpenOCR.__init__ defaults
    # use_chart_recognition to True - confirm which default is intended.
    parser.add_argument('--use_chart_recognition', action='store_true', help='[Doc] Recognize charts')
    parser.add_argument('--save_vis', action='store_true', help='[Doc] Save visualization')
    parser.add_argument('--save_json', action='store_true', help='[Doc] Save JSON results')
    parser.add_argument('--save_markdown', action='store_true', help='[Doc] Save Markdown results')
    parser.add_argument('--no_auto_download', action='store_true', help='Disable automatic model download')

    return parser


def _cli_launch_demo(args):
    """Launch the Gradio demo selected by ``args.task``."""
    if args.task == 'launch_openocr_demo':
        logger.info('Launching OpenOCR Gradio demo...')
        from demo_gradio import launch_demo
        launch_demo(
            share=args.share,
            server_port=args.server_port,
            server_name=args.server_name
        )
    elif args.task == 'launch_unirec_demo':
        logger.info('Launching UniRec Gradio demo...')
        from demo_unirec import launch_demo
        launch_demo(
            encoder_path=args.encoder_model,
            decoder_path=args.decoder_model,
            mapping_path=args.mapping,
            use_gpu=args.use_gpu,
            auto_download=not args.no_auto_download,
            share=args.share,
            server_port=args.server_port,
            server_name=args.server_name
        )
    elif args.task == 'launch_opendoc_demo':
        logger.info('Launching OpenDoc Gradio demo...')
        from demo_opendoc import launch_demo
        launch_demo(
            layout_model_path=args.layout_model,
            unirec_encoder_path=args.encoder_model,
            unirec_decoder_path=args.decoder_model,
            tokenizer_mapping_path=args.tokenizer_mapping,
            use_gpu=args.use_gpu,
            auto_download=not args.no_auto_download,
            share=args.share,
            server_port=args.server_port,
            server_name=args.server_name
        )


def _cli_collect_inputs(args, noun='images'):
    """List the input files, log a summary and create the output directory."""
    from tools.utils.utility import get_image_file_list
    img_list = get_image_file_list(args.input_path)

    logger.info(f'\nFound {len(img_list)} {noun} in {args.input_path}')
    logger.info(f'Output will be saved to: {args.output_path}')
    logger.info('=' * 80)

    os.makedirs(args.output_path, exist_ok=True)
    return img_list


def _cli_log_progress(idx, img_list, img_path):
    """Log the 1-based position of ``img_path`` within ``img_list``."""
    logger.info(f"\n[{idx + 1}/{len(img_list)}] Processing: {os.path.basename(img_path)}")


def _cli_log_failure(img_path, exc):
    """Report a per-file failure; must be called from an ``except`` block."""
    import traceback
    logger.error(f"Error processing {img_path}: {str(exc)}")
    traceback.print_exc()


def _cli_log_done(label, output_path):
    """Log the closing banner of a batch task."""
    logger.info('\n' + '=' * 80)
    logger.info(f'✅ {label} task completed. Results saved to {output_path}')
    logger.info('=' * 80)


def _cli_run_det(args):
    """Run text detection on every input image and write det_results.txt."""
    import json

    openocr = OpenOCR(
        task='det',
        backend=args.backend,
        onnx_det_model_path=args.onnx_det_model_path,
        use_gpu=args.use_gpu
    )
    img_list = _cli_collect_inputs(args)

    # UTF-8 is forced so that non-ASCII file names never depend on the
    # platform's default encoding.
    results_file = os.path.join(args.output_path, 'det_results.txt')
    with open(results_file, 'w', encoding='utf-8') as fout:
        for idx, img_path in enumerate(img_list):
            _cli_log_progress(idx, img_list, img_path)

            try:
                results = openocr(
                    image_path=img_path,
                    return_mask=args.return_mask
                )

                boxes = results[0]['boxes']
                elapse = results[0]['elapse']

                logger.info(f" Found {len(boxes)} text regions, time: {elapse:.3f}s")

                # Save results
                dt_boxes_json = [{'points': box.tolist()} for box in boxes]
                fout.write(f"{img_path}\t{json.dumps(dt_boxes_json)}\n")

                # Visualize if requested
                if args.is_vis:
                    import cv2
                    import numpy as np
                    src_img = cv2.imread(img_path)
                    for box in boxes:
                        box = np.array(box).astype(np.int32).reshape((-1, 1, 2))
                        cv2.polylines(src_img, [box], True, color=(255, 255, 0), thickness=2)
                    vis_path = os.path.join(args.output_path, os.path.basename(img_path))
                    cv2.imwrite(vis_path, src_img)

            except Exception as e:
                # Best effort: one bad image must not abort the whole batch.
                _cli_log_failure(img_path, e)
                continue

    _cli_log_done('Detection', args.output_path)


def _cli_run_rec(args):
    """Run text recognition on every input image and write rec_results.txt."""
    openocr = OpenOCR(
        task='rec',
        mode=args.mode,
        backend=args.backend,
        onnx_rec_model_path=args.onnx_rec_model_path,
        use_gpu=args.use_gpu
    )
    img_list = _cli_collect_inputs(args)

    # Recognised text is frequently non-ASCII; write it as UTF-8 instead of
    # the platform default, which may be unable to encode it.
    results_file = os.path.join(args.output_path, 'rec_results.txt')
    with open(results_file, 'w', encoding='utf-8') as fout:
        for idx, img_path in enumerate(img_list):
            _cli_log_progress(idx, img_list, img_path)

            try:
                results = openocr(
                    image_path=img_path,
                    batch_num=args.rec_batch_num
                )

                text = results[0]['text']
                score = results[0]['score']
                elapse = results[0]['elapse']

                logger.info(f" Text: {text}, Score: {score:.3f}, Time: {elapse:.3f}s")

                fout.write(f"{img_path}\t{text}\t{score:.3f}\n")

            except Exception as e:
                _cli_log_failure(img_path, e)
                continue

    _cli_log_done('Recognition', args.output_path)


def _cli_run_ocr(args):
    """Run end-to-end OCR; the backend iterates the inputs and saves results."""
    openocr = OpenOCR(
        task='ocr',
        mode=args.mode,
        backend=args.backend,
        onnx_det_model_path=args.onnx_det_model_path,
        onnx_rec_model_path=args.onnx_rec_model_path,
        drop_score=args.drop_score,
        det_box_type=args.det_box_type,
        use_gpu=args.use_gpu
    )

    # The return value is not used here, but it is still unpacked so that an
    # unexpected result shape is reported as an error exactly as before.
    _results, _time_dicts = openocr(
        image_path=args.input_path,
        save_dir=args.output_path,
        is_visualize=args.is_vis,
        rec_batch_num=args.rec_batch_num
    )

    logger.info(f"✅ OCR task completed. Results saved to {args.output_path}")


def _cli_run_unirec(args):
    """Run UniRec on every input image and write unirec_results.txt."""
    import json

    openocr = OpenOCR(
        task='unirec',
        unirec_encoder_path=args.encoder_model,
        unirec_decoder_path=args.decoder_model,
        tokenizer_mapping_path=args.mapping,
        use_gpu=args.use_gpu,
        max_length=args.max_length,
        auto_download=not args.no_auto_download
    )
    img_list = _cli_collect_inputs(args)

    # The JSON is dumped with ensure_ascii=False, so the file has to be
    # opened with an encoding that can represent any character.
    results_file = os.path.join(args.output_path, 'unirec_results.txt')
    with open(results_file, 'w', encoding='utf-8') as fout:
        for idx, img_path in enumerate(img_list):
            _cli_log_progress(idx, img_list, img_path)

            try:
                result_text, generated_ids = openocr(
                    image_path=img_path,
                    max_length=args.max_length
                )

                logger.info(f" Generated {len(generated_ids)} tokens")
                # Only the first 100 characters are logged for long outputs.
                logger.info(f" Text: {result_text[:100]}..." if len(result_text) > 100 else f" Text: {result_text}")

                image_name = os.path.basename(img_path)
                result_dict = {'text': result_text}
                fout.write(f"{image_name}\t{json.dumps(result_dict, ensure_ascii=False)}\n")

            except Exception as e:
                _cli_log_failure(img_path, e)
                continue

    _cli_log_done('UniRec', args.output_path)


def _cli_run_doc(args):
    """Run document OCR on every input and save the requested artefacts."""
    openocr = OpenOCR(
        task='doc',
        layout_model_path=args.layout_model,
        unirec_encoder_path=args.encoder_model,
        unirec_decoder_path=args.decoder_model,
        tokenizer_mapping_path=args.tokenizer_mapping,
        use_gpu=args.use_gpu,
        layout_threshold=args.layout_threshold,
        use_layout_detection=args.use_layout_detection,
        use_chart_recognition=args.use_chart_recognition,
        auto_download=not args.no_auto_download
    )
    img_list = _cli_collect_inputs(args, noun='images/PDFs')

    for idx, img_path in enumerate(img_list):
        _cli_log_progress(idx, img_list, img_path)

        try:
            result = openocr(
                image_path=img_path,
                layout_threshold=args.layout_threshold,
                max_length=args.max_length
            )

            if args.save_vis:
                openocr.save_visualization(result, args.output_path)

            if args.save_json:
                openocr.save_to_json(result, args.output_path)

            if args.save_markdown:
                openocr.save_to_markdown(result, args.output_path)

        except Exception as e:
            _cli_log_failure(img_path, e)
            continue

    _cli_log_done('Doc', args.output_path)


def main(argv=None):
    """Command-line interface for OpenOCR unified interface.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``.

    Demo tasks launch the matching Gradio demo and return. Every other task
    requires ``--input_path``; results go to ``--output_path`` (default
    ``openocr_output/{task}``). A failure on a single input is logged and
    skipped; any other failure is logged and the process exits with status 1.
    """
    parser = _cli_build_parser()
    args = parser.parse_args(argv)

    # use_gpu is already a string from argparse choices

    # Handle demo tasks
    if args.task in ('launch_openocr_demo', 'launch_unirec_demo', 'launch_opendoc_demo'):
        _cli_launch_demo(args)
        return

    # Set default output directory if not specified
    if not args.output_path:
        args.output_path = f'openocr_output/{args.task}'

    # Use input_path as unified input
    if not args.input_path:
        parser.error('--input_path is required for all tasks')

    runners = {
        'det': _cli_run_det,
        'rec': _cli_run_rec,
        'ocr': _cli_run_ocr,
        'unirec': _cli_run_unirec,
        'doc': _cli_run_doc,
    }

    # Initialize unified interface and run the selected task
    try:
        runners[args.task](args)
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the shell.
        logger.error(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
if __name__ == '__main__':
|
|
685
|
+
main()
|