openocr-python 0.0.9__py3-none-any.whl → 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. openocr/__init__.py +35 -1
  2. openocr/configs/dataset/rec/evaluation.yaml +41 -0
  3. openocr/configs/dataset/rec/ltb.yaml +9 -0
  4. openocr/configs/dataset/rec/mjsynth.yaml +11 -0
  5. openocr/configs/dataset/rec/openvino.yaml +25 -0
  6. openocr/configs/dataset/rec/ost.yaml +17 -0
  7. openocr/configs/dataset/rec/synthtext.yaml +7 -0
  8. openocr/configs/dataset/rec/test.yaml +77 -0
  9. openocr/configs/dataset/rec/textocr.yaml +13 -0
  10. openocr/configs/dataset/rec/textocr_horizontal.yaml +13 -0
  11. openocr/configs/dataset/rec/union14m_b.yaml +47 -0
  12. openocr/configs/dataset/rec/union14m_l_filtered.yaml +35 -0
  13. openocr/configs/rec/cmer/cmer.yml +127 -0
  14. openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_base.yml +152 -0
  15. openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_small.yml +152 -0
  16. openocr/configs/rec/unirec/focalsvtr_ardecoder_unirec.yml +114 -0
  17. openocr/configs/rec/unirec/opendoc_pipeline.yml +105 -0
  18. openocr/demo_gradio.py +28 -8
  19. openocr/demo_opendoc.py +572 -0
  20. openocr/demo_unirec.py +392 -0
  21. openocr/opendet/losses/__init__.py +5 -7
  22. openocr/opendet/preprocess/crop_resize.py +2 -1
  23. openocr/openocr.py +685 -0
  24. openocr/openrec/losses/__init__.py +8 -3
  25. openocr/openrec/losses/cmer_loss.py +12 -0
  26. openocr/openrec/losses/mdiff_loss.py +11 -0
  27. openocr/openrec/losses/unirec_loss.py +12 -0
  28. openocr/openrec/metrics/__init__.py +4 -1
  29. openocr/openrec/metrics/rec_metric_cmer.py +328 -0
  30. openocr/openrec/modeling/cmer_modeling/modeling_cmer.py +643 -0
  31. openocr/openrec/modeling/decoders/__init__.py +1 -0
  32. openocr/openrec/modeling/decoders/ctc_decoder.py +1 -1
  33. openocr/openrec/modeling/decoders/dan_decoder.py +4 -4
  34. openocr/openrec/modeling/decoders/dptr_parseq_clip_b_decoder.py +1563 -1398
  35. openocr/openrec/modeling/decoders/mdiff_decoder.py +587 -0
  36. openocr/openrec/modeling/decoders/smtr_decoder.py +99 -48
  37. openocr/openrec/modeling/unirec_modeling/configuration_unirec.py +166 -0
  38. openocr/openrec/modeling/unirec_modeling/modeling_unirec.py +433 -0
  39. openocr/openrec/optimizer/__init__.py +4 -3
  40. openocr/openrec/optimizer/lr.py +49 -0
  41. openocr/openrec/postprocess/__init__.py +2 -0
  42. openocr/openrec/postprocess/abinet_postprocess.py +1 -1
  43. openocr/openrec/postprocess/ar_postprocess.py +1 -1
  44. openocr/openrec/postprocess/cmer_postprocess.py +86 -0
  45. openocr/openrec/postprocess/cppd_postprocess.py +1 -1
  46. openocr/openrec/postprocess/igtr_postprocess.py +1 -1
  47. openocr/openrec/postprocess/lister_postprocess.py +1 -1
  48. openocr/openrec/postprocess/mgp_postprocess.py +1 -1
  49. openocr/openrec/postprocess/nrtr_postprocess.py +2 -2
  50. openocr/openrec/postprocess/smtr_postprocess.py +1 -1
  51. openocr/openrec/postprocess/srn_postprocess.py +1 -1
  52. openocr/openrec/postprocess/unirec_postprocess.py +58 -0
  53. openocr/openrec/postprocess/visionlan_postprocess.py +1 -1
  54. openocr/openrec/preprocess/__init__.py +5 -0
  55. openocr/openrec/preprocess/ce_label_encode.py +1 -1
  56. openocr/openrec/preprocess/cmer_label_encode.py +1025 -0
  57. openocr/openrec/preprocess/ctc_label_encode.py +1 -1
  58. openocr/openrec/preprocess/dptr_label_encode.py +177 -157
  59. openocr/openrec/preprocess/igtr_label_encode.py +4 -2
  60. openocr/openrec/preprocess/mdiff_label_encode.py +312 -0
  61. openocr/openrec/preprocess/rec_aug.py +128 -2
  62. openocr/openrec/preprocess/resize.py +57 -0
  63. openocr/openrec/preprocess/unirec_label_encode.py +62 -0
  64. openocr/tools/data/__init__.py +78 -55
  65. openocr/tools/data/cmer_web_dataset.py +310 -0
  66. openocr/tools/data/native_size_dataset.py +753 -0
  67. openocr/tools/data/native_size_sampler.py +158 -0
  68. openocr/tools/data/ratio_dataset_tvresize.py +2 -0
  69. openocr/tools/data/ratio_sampler.py +2 -1
  70. openocr/tools/download/download_dataset.py +38 -0
  71. openocr/tools/download/utils.py +28 -0
  72. openocr/tools/download_example_images.py +236 -0
  73. openocr/tools/engine/trainer.py +155 -39
  74. openocr/tools/eval_rec_all_ch.py +2 -2
  75. openocr/tools/infer_det.py +20 -2
  76. openocr/tools/infer_doc.py +898 -0
  77. openocr/tools/infer_doc_onnx.py +1172 -0
  78. openocr/tools/infer_e2e.py +27 -10
  79. openocr/tools/infer_rec.py +64 -15
  80. openocr/tools/infer_unirec_onnx.py +730 -0
  81. openocr/tools/to_markdown.py +468 -0
  82. openocr/tools/utils/ckpt.py +17 -5
  83. openocr/tools/utils/opendoc_onnx_utils/utils.py +1052 -0
  84. openocr_python-0.1.0.dev0.dist-info/METADATA +324 -0
  85. {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/RECORD +89 -45
  86. {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/WHEEL +1 -1
  87. openocr_python-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  88. openocr_python-0.0.9.dist-info/METADATA +0 -149
  89. /openocr_python-0.0.9.dist-info/LICENCE → /openocr_python-0.1.0.dev0.dist-info/licenses/LICENSE +0 -0
  90. {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/top_level.txt +0 -0
openocr/openocr.py ADDED
@@ -0,0 +1,685 @@
1
+ """
2
+ OpenOCR Unified Interface
3
+ Provides a single entry point for all OCR tasks with task-based dispatching.
4
+
5
+ Supported tasks:
6
+ - 'ocr': End-to-end OCR (detection + recognition)
7
+ - 'det': Text detection only
8
+ - 'rec': Text recognition only
9
+ - 'unirec': Universal recognition with VLM
10
+ - 'doc': Document OCR with layout analysis
11
+ - 'launch_openocr_demo': Launch OpenOCR Gradio demo
12
+ - 'launch_unirec_demo': Launch UniRec Gradio demo
13
+ - 'launch_opendoc_demo': Launch OpenDoc Gradio demo
14
+ """
15
+
16
+ from __future__ import absolute_import
17
+ from __future__ import division
18
+ from __future__ import print_function
19
+
20
+ import os
21
+ import sys
22
+ import argparse
23
+ from typing import Optional, Dict
24
+
25
# Absolute directory containing this module.
#
# NOTE: this value is deliberately NOT stored in a module-level variable named
# `__dir__`. Since Python 3.7 (PEP 562) a module attribute with that name is
# the hook invoked by `dir(module)`; binding it to a string makes `dir()` on
# this module raise "TypeError: 'str' object is not callable".
_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

# The task backends and demos below are imported by their top-level names
# (`tools.*`, `demo_gradio`, ...), so both this directory and its parent have
# to be on the import path.
sys.path.append(_CURRENT_DIR)
sys.path.insert(0, os.path.abspath(os.path.join(_CURRENT_DIR, '..')))

from tools.utils.logging import get_logger

logger = get_logger(name='openocr_unified')
32
+
33
+
34
class OpenOCR:
    """Unified OpenOCR interface that dispatches to task-specific backends.

    Supported tasks:
        - 'det': Text detection only
        - 'rec': Text recognition only
        - 'ocr': End-to-end OCR (text detection + recognition)
        - 'unirec': Universal recognition with Vision-Language Model
        - 'doc': Document OCR with layout analysis (tables, formulas, etc.)
        - 'launch_openocr_demo': Launch OpenOCR Gradio demo
        - 'launch_unirec_demo': Launch UniRec Gradio demo
        - 'launch_opendoc_demo': Launch OpenDoc Gradio demo

    The ``launch_*_demo`` tasks are accepted by the constructor but build no
    model; calling such an instance raises ``RuntimeError``.
    """

    # Tasks that build a backend model in ``__init__``.
    _MODEL_TASKS = ('det', 'rec', 'ocr', 'unirec', 'doc')
    # Tasks that are only meaningful from the command line.
    _DEMO_TASKS = (
        'launch_openocr_demo',
        'launch_unirec_demo',
        'launch_opendoc_demo',
    )

    def __init__(
        self,
        task: str = 'ocr',
        # Common parameters
        use_gpu: str = 'auto',
        # OCR task parameters
        mode: str = 'mobile',
        backend: str = 'onnx',
        onnx_det_model_path: Optional[str] = None,
        onnx_rec_model_path: Optional[str] = None,
        drop_score: float = 0.5,
        det_box_type: str = 'quad',
        # UniRec task parameters
        unirec_encoder_path: Optional[str] = None,
        unirec_decoder_path: Optional[str] = None,
        tokenizer_mapping_path: Optional[str] = None,
        max_length: int = 2048,
        # Doc task parameters
        layout_model_path: Optional[str] = None,
        layout_threshold: float = 0.5,
        use_layout_detection: bool = True,
        use_chart_recognition: bool = True,
        auto_download: bool = True,
    ):
        """Initialize the OpenOCR unified interface.

        Args:
            task: Task type ('ocr', 'det', 'rec', 'unirec', 'doc',
                'launch_openocr_demo', 'launch_unirec_demo',
                'launch_opendoc_demo'). Matched case-insensitively.

            # Common parameters
            use_gpu: GPU usage strategy ('auto', 'true', or 'false')

            # OCR task parameters
            mode: Model mode ('mobile' or 'server')
            backend: Backend type ('onnx')
            onnx_det_model_path: Path to detection ONNX model
            onnx_rec_model_path: Path to recognition ONNX model
            drop_score: Score threshold for filtering results
            det_box_type: Detection box type ('quad' or 'poly')

            # UniRec task parameters
            unirec_encoder_path: Path to UniRec encoder ONNX model
            unirec_decoder_path: Path to UniRec decoder ONNX model
            tokenizer_mapping_path: Path to tokenizer mapping JSON
            max_length: Maximum generation length. Stored on the instance
                and used as the default for 'unirec' calls that do not pass
                ``max_length`` explicitly.

            # Doc task parameters
            layout_model_path: Path to layout detection model
            layout_threshold: Layout detection threshold
            use_layout_detection: Whether to use layout detection
            use_chart_recognition: Whether to recognize charts
            auto_download: Whether to auto-download missing models

        Raises:
            ValueError: If ``task`` is not one of the supported task names.
        """
        self.task = task.lower()
        self.model = None
        # Previously this argument was accepted and then dropped; keep it so
        # the per-call default for 'unirec' honours what the caller configured.
        self.max_length = max_length

        # Validate task
        valid_tasks = list(self._MODEL_TASKS + self._DEMO_TASKS)
        if self.task not in valid_tasks:
            raise ValueError(f"Invalid task '{task}'. Must be one of {valid_tasks}")

        logger.info(f"Initializing OpenOCR with task: {self.task}")

        # Demo tasks don't need model initialization
        if self.task in self._DEMO_TASKS:
            logger.info(f"Demo task '{self.task}' will be launched via command line")
            return

        # Initialize task-specific model. Each backend takes its own keyword
        # names, so the mapping from constructor arguments is spelled out.
        if self.task == 'det':
            self._init_det_task(
                backend=backend,
                onnx_model_path=onnx_det_model_path,
                use_gpu=use_gpu,
            )
        elif self.task == 'rec':
            self._init_rec_task(
                mode=mode,
                backend=backend,
                onnx_model_path=onnx_rec_model_path,
                use_gpu=use_gpu,
            )
        elif self.task == 'ocr':
            self._init_ocr_task(
                mode=mode,
                backend=backend,
                onnx_det_model_path=onnx_det_model_path,
                onnx_rec_model_path=onnx_rec_model_path,
                drop_score=drop_score,
                det_box_type=det_box_type,
                use_gpu=use_gpu,
            )
        elif self.task == 'unirec':
            self._init_unirec_task(
                encoder_path=unirec_encoder_path,
                decoder_path=unirec_decoder_path,
                mapping_path=tokenizer_mapping_path,
                use_gpu=use_gpu,
                auto_download=auto_download,
            )
        elif self.task == 'doc':
            self._init_doc_task(
                layout_model_path=layout_model_path,
                unirec_encoder_path=unirec_encoder_path,
                unirec_decoder_path=unirec_decoder_path,
                tokenizer_mapping_path=tokenizer_mapping_path,
                use_gpu=use_gpu,
                layout_threshold=layout_threshold,
                use_layout_detection=use_layout_detection,
                use_chart_recognition=use_chart_recognition,
                auto_download=auto_download,
            )

        logger.info(f"✅ OpenOCR initialized successfully for task: {self.task}")

    # The backend imports below are deferred so that only the selected task's
    # module is imported.

    def _init_det_task(self, **kwargs):
        """Initialize detection task"""
        from tools.infer_det import OpenDetector
        self.model = OpenDetector(**kwargs)

    def _init_rec_task(self, **kwargs):
        """Initialize recognition task"""
        from tools.infer_rec import OpenRecognizer
        self.model = OpenRecognizer(**kwargs)

    def _init_ocr_task(self, **kwargs):
        """Initialize OCR task (detection + recognition)"""
        from tools.infer_e2e import OpenOCRE2E
        self.model = OpenOCRE2E(**kwargs)

    def _init_unirec_task(self, **kwargs):
        """Initialize UniRec task (universal recognition)"""
        from tools.infer_unirec_onnx import UniRecONNX
        self.model = UniRecONNX(**kwargs)

    def _init_doc_task(self, **kwargs):
        """Initialize Doc task (document OCR with layout)"""
        from tools.infer_doc_onnx import OpenDocONNX
        self.model = OpenDocONNX(**kwargs)

    def __call__(self, *args, **kwargs):
        """Execute the configured task.

        The first argument is always ``image_path``; it is forwarded to the
        backend model as ``img_path``. Remaining keyword arguments are passed
        through to the backend unchanged.

        For 'det' task:
            Args:
                image_path: Path to image or directory
                return_mask: Whether to return detection mask

        For 'rec' task:
            Args:
                image_path: Path to image or directory
                batch_num: Batch size for recognition (default 1)

        For 'ocr' task:
            Args:
                image_path: Path to image or directory
                is_visualize: Whether to visualize results
                rec_batch_num: Batch size for recognition
                crop_infer: Whether to use crop inference
                return_mask: Whether to return detection mask

        For 'unirec' task:
            Args:
                image_path: Path to image
                max_length: Maximum generation length (defaults to the value
                    given to the constructor)

        For 'doc' task:
            Args:
                image_path: Path to image
                layout_threshold: Layout detection threshold
                max_length: Maximum generation length
                merge_layout_blocks: Whether to merge layout blocks

        Returns:
            Task-specific results

        Raises:
            RuntimeError: If no model was initialized (e.g. demo tasks).
        """
        if self.model is None:
            raise RuntimeError('Model not initialized')

        # Dispatch to appropriate task
        handlers = {
            'det': self._call_det,
            'rec': self._call_rec,
            'ocr': self._call_ocr,
            'unirec': self._call_unirec,
            'doc': self._call_doc,
        }
        handler = handlers.get(self.task)
        if handler is None:
            return None
        return handler(*args, **kwargs)

    def _call_det(self, image_path, **kwargs):
        """Call detection task"""
        return self.model(img_path=image_path, **kwargs)

    def _call_rec(self, image_path, batch_num=1, **kwargs):
        """Call recognition task"""
        return self.model(img_path=image_path, batch_num=batch_num, **kwargs)

    def _call_ocr(self, image_path, **kwargs):
        """Call OCR task"""
        return self.model(img_path=image_path, **kwargs)

    def _call_unirec(self, image_path, max_length: Optional[int] = None, **kwargs):
        """Call UniRec task.

        When ``max_length`` is omitted, the value passed to the constructor
        is used (2048 if the instance was built without one).
        """
        if max_length is None:
            max_length = getattr(self, 'max_length', 2048)
        return self.model(img_path=image_path, max_length=max_length, **kwargs)

    def _call_doc(self, image_path, **kwargs):
        """Call Doc task"""
        return self.model(img_path=image_path, **kwargs)

    # Additional methods for doc task

    def save_to_json(self, result: Dict, output_path: str):
        """Save doc task results to JSON (only for doc task)"""
        if self.task != 'doc':
            raise RuntimeError("save_to_json is only available for 'doc' task")
        return self.model.save_to_json(result, output_path)

    def save_to_markdown(self, result: Dict, output_path: str):
        """Save doc task results to Markdown (only for doc task)"""
        if self.task != 'doc':
            raise RuntimeError("save_to_markdown is only available for 'doc' task")
        return self.model.save_to_markdown(result, output_path)

    def save_visualization(self, result: Dict, output_path: str):
        """Save doc task visualization (only for doc task)"""
        if self.task != 'doc':
            raise RuntimeError("save_visualization is only available for 'doc' task")
        return self.model.save_visualization(result, output_path)
280
+
281
+
282
# Help epilog shown by ``--help``; printed verbatim because the parser uses
# RawDescriptionHelpFormatter.
_CLI_EPILOG = """
Command-line Usage:
After installation, you can use OpenOCR in three ways:

1. Using the 'openocr' command (recommended):
openocr --task ocr --input_path image.jpg

2. Using 'python -m openocr':
python -m openocr --task ocr --input_path image.jpg

3. Running the script directly:
python openocr.py --task ocr --input_path image.jpg

Examples:
# Detection task
openocr --task det --input_path image.jpg

# Recognition task
openocr --task rec --input_path image.jpg --mode server

# OCR task (detection + recognition)
openocr --task ocr --input_path image.jpg --is_vis

# OCR with custom output path
openocr --task ocr --input_path ./images --output_path ./results

# UniRec task (universal recognition)
openocr --task unirec --input_path image.jpg --max_length 2048

# Doc task (document OCR with layout)
openocr --task doc --input_path document.jpg --save_markdown --save_json

# Doc task with PDF input
openocr --task doc --input_path document.pdf --save_markdown --save_json

# Doc task with custom models
openocr --task doc --input_path doc.jpg --layout_model path/to/layout.onnx \\
--encoder_model path/to/encoder.onnx --decoder_model path/to/decoder.onnx

# Launch OpenOCR Gradio demo
openocr --task launch_openocr_demo --share

# Launch UniRec Gradio demo
openocr --task launch_unirec_demo --server_port 7861

# Launch OpenDoc Gradio demo
openocr --task launch_opendoc_demo --share --server_port 7862

For more information, visit: https://github.com/Topdu/OpenOCR
"""


def _build_parser():
    """Build the argument parser for the unified command-line interface."""
    parser = argparse.ArgumentParser(
        description='OpenOCR Unified Interface - Single entry point for all OCR tasks',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_CLI_EPILOG,
    )

    # Task selection. The option is required, so no default is given: argparse
    # never applies a default to a required option.
    parser.add_argument(
        '--task',
        type=str,
        required=True,
        choices=['det', 'rec', 'ocr', 'unirec', 'doc', 'launch_openocr_demo', 'launch_unirec_demo', 'launch_opendoc_demo'],
        help='Task type: det (detection), rec (recognition), ocr (detection+recognition), unirec (universal recognition), doc (document OCR), launch_*_demo (launch Gradio demo)'
    )

    # Unified input/output parameters
    parser.add_argument('--input_path', type=str, help='Input image/PDF path or directory (unified for all tasks, not required for demo tasks)')
    parser.add_argument('--output_path', type=str, help='Output directory (auto-generated as openocr_output/{task} if not specified)')

    # Demo launch parameters
    parser.add_argument('--share', action='store_true', help='[Demo] Create a public share link')
    parser.add_argument('--server_port', type=int, default=7860, help='[Demo] Server port (default: 7860)')
    parser.add_argument('--server_name', type=str, default='0.0.0.0', help='[Demo] Server name (default: 0.0.0.0)')

    # Common parameters
    parser.add_argument(
        '--use_gpu',
        type=str,
        default='auto',
        choices=['auto', 'true', 'false'],
        help='GPU usage strategy: auto (detect automatically), true (force GPU), false (force CPU)'
    )

    # OCR/Det/Rec task parameters
    parser.add_argument('--mode', type=str, default='mobile', choices=['mobile', 'server'], help='[OCR/Rec] Model mode')
    parser.add_argument('--backend', type=str, default='onnx', choices=['torch', 'onnx'], help='[OCR] Backend type')
    parser.add_argument('--onnx_det_model_path', type=str, help='[OCR] Detection ONNX model path')
    parser.add_argument('--onnx_rec_model_path', type=str, help='[OCR] Recognition ONNX model path')
    parser.add_argument('--drop_score', type=float, default=0.5, help='[OCR] Score threshold')
    parser.add_argument('--det_box_type', type=str, default='quad', choices=['quad', 'poly'], help='[Det/OCR] Box type')
    parser.add_argument('--is_vis', action='store_true', help='[Det/OCR] Visualize results')
    parser.add_argument('--rec_batch_num', type=int, default=6, help='[Rec/OCR] Recognition batch size')
    parser.add_argument('--return_mask', action='store_true', help='[Det] Return detection mask')

    # UniRec task parameters
    parser.add_argument('--encoder_model', type=str, help='[Doc/UniRec] Encoder ONNX model path')
    parser.add_argument('--decoder_model', type=str, help='[Doc/UniRec] Decoder ONNX model path')
    parser.add_argument('--mapping', type=str, help='[UniRec] Tokenizer mapping JSON path')
    parser.add_argument('--max_length', type=int, default=2048, help='[UniRec/Doc] Max generation length')

    # Doc task parameters
    parser.add_argument('--layout_model', type=str, help='[Doc] Layout detection model path')
    parser.add_argument('--tokenizer_mapping', type=str, help='[Doc] Tokenizer mapping path')
    parser.add_argument('--layout_threshold', type=float, default=0.4, help='[Doc] Layout detection threshold')
    # Both flags write to `use_layout_detection`. Without an explicit default
    # the store_true action makes the value False, which turned
    # --no_layout_detection into a no-op and disabled layout detection unless
    # requested. Default to True to match OpenOCR's constructor default.
    parser.add_argument('--use_layout_detection', action='store_true', default=True, help='[Doc] Use layout detection')
    parser.add_argument('--no_layout_detection', dest='use_layout_detection', action='store_false', help='[Doc] Disable layout detection')
    parser.add_argument('--use_chart_recognition', action='store_true', help='[Doc] Recognize charts')
    parser.add_argument('--save_vis', action='store_true', help='[Doc] Save visualization')
    parser.add_argument('--save_json', action='store_true', help='[Doc] Save JSON results')
    parser.add_argument('--save_markdown', action='store_true', help='[Doc] Save Markdown results')
    parser.add_argument('--no_auto_download', action='store_true', help='Disable automatic model download')

    return parser


def _launch_demo_task(args):
    """Start the Gradio demo selected by ``args.task``."""
    if args.task == 'launch_openocr_demo':
        logger.info('Launching OpenOCR Gradio demo...')
        from demo_gradio import launch_demo
        launch_demo(
            share=args.share,
            server_port=args.server_port,
            server_name=args.server_name,
        )
    elif args.task == 'launch_unirec_demo':
        logger.info('Launching UniRec Gradio demo...')
        from demo_unirec import launch_demo
        launch_demo(
            encoder_path=args.encoder_model,
            decoder_path=args.decoder_model,
            mapping_path=args.mapping,
            use_gpu=args.use_gpu,
            auto_download=not args.no_auto_download,
            share=args.share,
            server_port=args.server_port,
            server_name=args.server_name,
        )
    elif args.task == 'launch_opendoc_demo':
        logger.info('Launching OpenDoc Gradio demo...')
        from demo_opendoc import launch_demo
        launch_demo(
            layout_model_path=args.layout_model,
            unirec_encoder_path=args.encoder_model,
            unirec_decoder_path=args.decoder_model,
            tokenizer_mapping_path=args.tokenizer_mapping,
            use_gpu=args.use_gpu,
            auto_download=not args.no_auto_download,
            share=args.share,
            server_port=args.server_port,
            server_name=args.server_name,
        )


def _list_inputs(args, noun='images'):
    """Expand ``args.input_path`` to a file list, log it, and create the output directory."""
    from tools.utils.utility import get_image_file_list
    img_list = get_image_file_list(args.input_path)

    logger.info(f'\nFound {len(img_list)} {noun} in {args.input_path}')
    logger.info(f'Output will be saved to: {args.output_path}')
    logger.info('=' * 80)

    os.makedirs(args.output_path, exist_ok=True)
    return img_list


def _log_progress(idx, total, img_path):
    """Log the '[i/N] Processing: name' line for one input file."""
    logger.info(f"\n[{idx + 1}/{total}] Processing: {os.path.basename(img_path)}")


def _log_item_failure(img_path, exc):
    """Report a per-file failure; must be called from inside the ``except`` block."""
    import traceback
    logger.error(f"Error processing {img_path}: {str(exc)}")
    traceback.print_exc()


def _log_completed(message):
    """Log the closing banner for a finished task."""
    logger.info('\n' + '=' * 80)
    logger.info(message)
    logger.info('=' * 80)


def _run_det(args):
    """Run text detection over every input and write ``det_results.txt``."""
    import json

    engine = OpenOCR(
        task='det',
        backend=args.backend,
        onnx_det_model_path=args.onnx_det_model_path,
        use_gpu=args.use_gpu,
    )
    img_list = _list_inputs(args)

    results_file = os.path.join(args.output_path, 'det_results.txt')
    # Explicit UTF-8: input paths may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to represent them.
    with open(results_file, 'w', encoding='utf-8') as fout:
        for idx, img_path in enumerate(img_list):
            _log_progress(idx, len(img_list), img_path)

            # A failure on one file is logged and the run moves on.
            try:
                results = engine(
                    image_path=img_path,
                    return_mask=args.return_mask,
                )

                boxes = results[0]['boxes']
                elapse = results[0]['elapse']

                logger.info(f" Found {len(boxes)} text regions, time: {elapse:.3f}s")

                # Save results
                dt_boxes_json = [{'points': box.tolist()} for box in boxes]
                fout.write(f"{img_path}\t{json.dumps(dt_boxes_json)}\n")

                # Visualize if requested
                if args.is_vis:
                    import cv2
                    import numpy as np
                    src_img = cv2.imread(img_path)
                    for box in boxes:
                        box = np.array(box).astype(np.int32).reshape((-1, 1, 2))
                        cv2.polylines(src_img, [box], True, color=(255, 255, 0), thickness=2)
                    vis_path = os.path.join(args.output_path, os.path.basename(img_path))
                    cv2.imwrite(vis_path, src_img)

            except Exception as e:
                _log_item_failure(img_path, e)

    _log_completed(f'✅ Detection task completed. Results saved to {args.output_path}')


def _run_rec(args):
    """Run text recognition over every input and write ``rec_results.txt``."""
    engine = OpenOCR(
        task='rec',
        mode=args.mode,
        backend=args.backend,
        onnx_rec_model_path=args.onnx_rec_model_path,
        use_gpu=args.use_gpu,
    )
    img_list = _list_inputs(args)

    results_file = os.path.join(args.output_path, 'rec_results.txt')
    # Explicit UTF-8 so recognized text outside the locale's code page can be
    # written without a UnicodeEncodeError.
    with open(results_file, 'w', encoding='utf-8') as fout:
        for idx, img_path in enumerate(img_list):
            _log_progress(idx, len(img_list), img_path)

            try:
                results = engine(
                    image_path=img_path,
                    batch_num=args.rec_batch_num,
                )

                text = results[0]['text']
                score = results[0]['score']
                elapse = results[0]['elapse']

                logger.info(f" Text: {text}, Score: {score:.3f}, Time: {elapse:.3f}s")

                fout.write(f"{img_path}\t{text}\t{score:.3f}\n")

            except Exception as e:
                _log_item_failure(img_path, e)

    _log_completed(f'✅ Recognition task completed. Results saved to {args.output_path}')


def _run_ocr(args):
    """Run end-to-end OCR; the backend handles directory input and saving itself."""
    engine = OpenOCR(
        task='ocr',
        mode=args.mode,
        backend=args.backend,
        onnx_det_model_path=args.onnx_det_model_path,
        onnx_rec_model_path=args.onnx_rec_model_path,
        drop_score=args.drop_score,
        det_box_type=args.det_box_type,
        use_gpu=args.use_gpu,
    )

    # The call returns a (results, time_dicts) pair; neither is used here
    # because the results are written under `save_dir`.
    _results, _time_dicts = engine(
        image_path=args.input_path,
        save_dir=args.output_path,
        is_visualize=args.is_vis,
        rec_batch_num=args.rec_batch_num,
    )

    logger.info(f"✅ OCR task completed. Results saved to {args.output_path}")


def _run_unirec(args):
    """Run UniRec over every input and write ``unirec_results.txt``."""
    import json

    engine = OpenOCR(
        task='unirec',
        unirec_encoder_path=args.encoder_model,
        unirec_decoder_path=args.decoder_model,
        tokenizer_mapping_path=args.mapping,
        use_gpu=args.use_gpu,
        max_length=args.max_length,
        auto_download=not args.no_auto_download,
    )
    img_list = _list_inputs(args)

    results_file = os.path.join(args.output_path, 'unirec_results.txt')
    # The JSON is dumped with ensure_ascii=False, so the file must be opened
    # with an encoding that can represent any recognized character.
    with open(results_file, 'w', encoding='utf-8') as fout:
        for idx, img_path in enumerate(img_list):
            _log_progress(idx, len(img_list), img_path)

            try:
                result_text, generated_ids = engine(
                    image_path=img_path,
                    max_length=args.max_length,
                )

                logger.info(f" Generated {len(generated_ids)} tokens")
                # Long outputs are truncated in the log only, not in the file.
                logger.info(f" Text: {result_text[:100]}..." if len(result_text) > 100 else f" Text: {result_text}")

                image_name = os.path.basename(img_path)
                result_dict = {'text': result_text}
                fout.write(f"{image_name}\t{json.dumps(result_dict, ensure_ascii=False)}\n")

            except Exception as e:
                _log_item_failure(img_path, e)

    _log_completed(f'✅ UniRec task completed. Results saved to {args.output_path}')


def _run_doc(args):
    """Run document OCR over every input and save the requested artifacts."""
    engine = OpenOCR(
        task='doc',
        layout_model_path=args.layout_model,
        unirec_encoder_path=args.encoder_model,
        unirec_decoder_path=args.decoder_model,
        tokenizer_mapping_path=args.tokenizer_mapping,
        use_gpu=args.use_gpu,
        layout_threshold=args.layout_threshold,
        use_layout_detection=args.use_layout_detection,
        use_chart_recognition=args.use_chart_recognition,
        auto_download=not args.no_auto_download,
    )
    img_list = _list_inputs(args, noun='images/PDFs')

    for idx, img_path in enumerate(img_list):
        _log_progress(idx, len(img_list), img_path)

        try:
            result = engine(
                image_path=img_path,
                layout_threshold=args.layout_threshold,
                max_length=args.max_length,
            )

            if args.save_vis:
                engine.save_visualization(result, args.output_path)

            if args.save_json:
                engine.save_to_json(result, args.output_path)

            if args.save_markdown:
                engine.save_to_markdown(result, args.output_path)

        except Exception as e:
            _log_item_failure(img_path, e)

    _log_completed(f'✅ Doc task completed. Results saved to {args.output_path}')


# Maps each non-demo ``--task`` value to the function that runs it.
_TASK_RUNNERS = {
    'det': _run_det,
    'rec': _run_rec,
    'ocr': _run_ocr,
    'unirec': _run_unirec,
    'doc': _run_doc,
}

_DEMO_TASK_NAMES = ('launch_openocr_demo', 'launch_unirec_demo', 'launch_opendoc_demo')


def main():
    """Command-line interface for OpenOCR unified interface.

    Parses ``sys.argv``, then either launches a Gradio demo or runs the
    selected task over ``--input_path``.

    Exits with status 2 (via ``parser.error``) on invalid arguments or when
    ``--input_path`` is missing for a non-demo task, and with status 1 if the
    task raises an exception outside the per-file handling.
    """
    import traceback

    parser = _build_parser()
    args = parser.parse_args()

    # use_gpu is already a string from argparse choices

    # Handle demo tasks
    if args.task in _DEMO_TASK_NAMES:
        _launch_demo_task(args)
        return

    # Set default output directory if not specified
    if not args.output_path:
        args.output_path = f'openocr_output/{args.task}'

    # Use input_path as unified input
    if not args.input_path:
        parser.error('--input_path is required for all tasks')

    # Initialize unified interface and run the task
    try:
        _TASK_RUNNERS[args.task](args)
    except Exception as e:
        logger.error(f"Error: {str(e)}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()