magic-pdf 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. magic_pdf/data/batch_build_dataset.py +156 -0
  2. magic_pdf/data/dataset.py +44 -24
  3. magic_pdf/data/utils.py +108 -9
  4. magic_pdf/dict2md/ocr_mkcontent.py +4 -3
  5. magic_pdf/libs/pdf_image_tools.py +11 -6
  6. magic_pdf/libs/performance_stats.py +12 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/batch_analyze.py +175 -201
  9. magic_pdf/model/doc_analyze_by_custom_model.py +137 -92
  10. magic_pdf/model/pdf_extract_kit.py +5 -38
  11. magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
  12. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
  13. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
  14. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
  15. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
  16. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
  17. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
  18. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
  19. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
  20. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
  21. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
  22. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
  25. magic_pdf/model/sub_modules/model_init.py +50 -37
  26. magic_pdf/model/sub_modules/model_utils.py +17 -11
  27. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
  29. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
  32. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
  33. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
  34. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
  35. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
  36. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
  37. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
  38. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
  39. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
  40. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
  41. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
  42. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
  43. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
  44. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
  45. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
  46. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
  47. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
  48. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
  49. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
  50. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
  51. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
  52. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
  53. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
  54. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
  55. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
  56. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
  57. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
  58. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
  59. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
  60. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
  61. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
  62. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
  63. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
  64. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
  65. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
  66. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
  67. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
  68. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
  69. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
  70. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
  71. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
  72. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
  73. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
  74. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
  75. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
  76. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
  77. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
  78. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
  79. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +10 -18
  80. magic_pdf/pdf_parse_union_core_v2.py +112 -74
  81. magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
  82. magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
  83. magic_pdf/resources/model_config/model_configs.yaml +1 -1
  84. magic_pdf/tools/cli.py +30 -12
  85. magic_pdf/tools/common.py +90 -12
  86. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/METADATA +50 -40
  87. magic_pdf-1.3.0.dist-info/RECORD +202 -0
  88. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
  89. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
  90. magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
  91. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
  92. magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
  93. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
  94. magic_pdf-1.2.2.dist-info/RECORD +0 -147
  95. /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
  96. /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
  97. /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
  98. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/LICENSE.md +0 -0
  99. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/WHEEL +0 -0
  100. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/entry_points.txt +0 -0
  101. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/top_level.txt +0 -0
@@ -1,27 +1,19 @@
1
1
  import os
2
2
  import time
3
+
4
+ import numpy as np
3
5
  import torch
4
6
 
5
7
  os.environ['FLAGS_npu_jit_compile'] = '0' # 关闭paddle的jit编译
6
8
  os.environ['FLAGS_use_stride_kernel'] = '0'
7
9
  os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
8
10
  os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
9
- # 关闭paddle的信号处理
10
- import paddle
11
- paddle.disable_signal_handler()
11
+
12
12
 
13
13
  from loguru import logger
14
14
 
15
- from magic_pdf.model.batch_analyze import BatchAnalyze
16
15
  from magic_pdf.model.sub_modules.model_utils import get_vram
17
-
18
- try:
19
- import torchtext
20
- if torchtext.__version__ >= '0.18.0':
21
- torchtext.disable_torchtext_deprecation_warning()
22
- except ImportError:
23
- pass
24
-
16
+ from magic_pdf.config.enums import SupportedPdfParseMethod
25
17
  import magic_pdf.model as model_config
26
18
  from magic_pdf.data.dataset import Dataset
27
19
  from magic_pdf.libs.clean_memory import clean_memory
@@ -30,8 +22,6 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
30
22
  get_local_models_dir,
31
23
  get_table_recog_config)
32
24
  from magic_pdf.model.model_list import MODEL
33
- from magic_pdf.operators.models import InferenceResult
34
-
35
25
 
36
26
  class ModelSingleton:
37
27
  _instance = None
@@ -72,9 +62,7 @@ def custom_model_init(
72
62
  formula_enable=None,
73
63
  table_enable=None,
74
64
  ):
75
-
76
65
  model = None
77
-
78
66
  if model_config.__model_mode__ == 'lite':
79
67
  logger.warning(
80
68
  'The Lite mode is provided for developers to conduct testing only, and the output quality is '
@@ -132,7 +120,6 @@ def custom_model_init(
132
120
 
133
121
  return custom_model
134
122
 
135
-
136
123
  def doc_analyze(
137
124
  dataset: Dataset,
138
125
  ocr: bool = False,
@@ -143,102 +130,160 @@ def doc_analyze(
143
130
  layout_model=None,
144
131
  formula_enable=None,
145
132
  table_enable=None,
146
- ) -> InferenceResult:
147
-
133
+ ):
148
134
  end_page_id = (
149
135
  end_page_id
150
136
  if end_page_id is not None and end_page_id >= 0
151
137
  else len(dataset) - 1
152
138
  )
153
139
 
154
- model_manager = ModelSingleton()
155
- custom_model = model_manager.get_model(
156
- ocr, show_log, lang, layout_model, formula_enable, table_enable
157
- )
158
-
159
- batch_analyze = False
160
- batch_ratio = 1
161
- device = get_device()
140
+ MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
141
+ images = []
142
+ page_wh_list = []
143
+ for index in range(len(dataset)):
144
+ if start_page_id <= index <= end_page_id:
145
+ page_data = dataset.get_page(index)
146
+ img_dict = page_data.get_image()
147
+ images.append(img_dict['img'])
148
+ page_wh_list.append((img_dict['width'], img_dict['height']))
149
+ if lang is None or lang == 'auto':
150
+ images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(dataset))]
151
+ else:
152
+ images_with_extra_info = [(images[index], ocr, lang) for index in range(len(dataset))]
162
153
 
163
- npu_support = False
164
- if str(device).startswith("npu"):
165
- import torch_npu
166
- if torch_npu.npu.is_available():
167
- npu_support = True
154
+ if len(images) >= MIN_BATCH_INFERENCE_SIZE:
155
+ batch_size = MIN_BATCH_INFERENCE_SIZE
156
+ batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
157
+ else:
158
+ batch_images = [images_with_extra_info]
168
159
 
169
- if torch.cuda.is_available() and device != 'cpu' or npu_support:
170
- gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
171
- if gpu_memory is not None and gpu_memory >= 8:
160
+ results = []
161
+ for sn, batch_image in enumerate(batch_images):
162
+ _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log,layout_model, formula_enable, table_enable)
163
+ results.extend(result)
172
164
 
173
- if gpu_memory >= 16:
174
- batch_ratio = 8
175
- elif gpu_memory >= 10:
176
- batch_ratio = 4
177
- else:
178
- batch_ratio = 2
165
+ model_json = []
166
+ for index in range(len(dataset)):
167
+ if start_page_id <= index <= end_page_id:
168
+ result = results.pop(0)
169
+ page_width, page_height = page_wh_list.pop(0)
170
+ else:
171
+ result = []
172
+ page_height = 0
173
+ page_width = 0
179
174
 
180
- logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
181
- batch_analyze = True
175
+ page_info = {'page_no': index, 'width': page_width, 'height': page_height}
176
+ page_dict = {'layout_dets': result, 'page_info': page_info}
177
+ model_json.append(page_dict)
182
178
 
183
- model_json = []
184
- doc_analyze_start = time.time()
179
+ from magic_pdf.operators.models import InferenceResult
180
+ return InferenceResult(model_json, dataset)
185
181
 
186
- if batch_analyze:
187
- # batch analyze
188
- images = []
189
- page_wh_list = []
190
- for index in range(len(dataset)):
191
- if start_page_id <= index <= end_page_id:
192
- page_data = dataset.get_page(index)
193
- img_dict = page_data.get_image()
194
- images.append(img_dict['img'])
195
- page_wh_list.append((img_dict['width'], img_dict['height']))
196
- batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
197
- analyze_result = batch_model(images)
182
+ def batch_doc_analyze(
183
+ datasets: list[Dataset],
184
+ parse_method: str,
185
+ show_log: bool = False,
186
+ lang=None,
187
+ layout_model=None,
188
+ formula_enable=None,
189
+ table_enable=None,
190
+ ):
191
+ MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
192
+ batch_size = MIN_BATCH_INFERENCE_SIZE
193
+ images = []
194
+ page_wh_list = []
198
195
 
196
+ images_with_extra_info = []
197
+ for dataset in datasets:
199
198
  for index in range(len(dataset)):
200
- if start_page_id <= index <= end_page_id:
201
- result = analyze_result.pop(0)
202
- page_width, page_height = page_wh_list.pop(0)
199
+ if lang is None or lang == 'auto':
200
+ _lang = dataset._lang
203
201
  else:
204
- result = []
205
- page_height = 0
206
- page_width = 0
202
+ _lang = lang
207
203
 
208
- page_info = {'page_no': index, 'width': page_width, 'height': page_height}
204
+ page_data = dataset.get_page(index)
205
+ img_dict = page_data.get_image()
206
+ images.append(img_dict['img'])
207
+ page_wh_list.append((img_dict['width'], img_dict['height']))
208
+ if parse_method == 'auto':
209
+ images_with_extra_info.append((images[-1], dataset.classify() == SupportedPdfParseMethod.OCR, _lang))
210
+ else:
211
+ images_with_extra_info.append((images[-1], parse_method == 'ocr', _lang))
212
+
213
+ batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
214
+ results = []
215
+ for sn, batch_image in enumerate(batch_images):
216
+ _, result = may_batch_image_analyze(batch_image, sn, True, show_log, layout_model, formula_enable, table_enable)
217
+ results.extend(result)
218
+
219
+ infer_results = []
220
+ from magic_pdf.operators.models import InferenceResult
221
+ for index in range(len(datasets)):
222
+ dataset = datasets[index]
223
+ model_json = []
224
+ for i in range(len(dataset)):
225
+ result = results.pop(0)
226
+ page_width, page_height = page_wh_list.pop(0)
227
+ page_info = {'page_no': i, 'width': page_width, 'height': page_height}
209
228
  page_dict = {'layout_dets': result, 'page_info': page_info}
210
229
  model_json.append(page_dict)
230
+ infer_results.append(InferenceResult(model_json, dataset))
231
+ return infer_results
211
232
 
212
- else:
213
- # single analyze
214
233
 
215
- for index in range(len(dataset)):
216
- page_data = dataset.get_page(index)
217
- img_dict = page_data.get_image()
218
- img = img_dict['img']
219
- page_width = img_dict['width']
220
- page_height = img_dict['height']
221
- if start_page_id <= index <= end_page_id:
222
- page_start = time.time()
223
- result = custom_model(img)
224
- logger.info(f'-----page_id : {index}, page total time: {round(time.time() - page_start, 2)}-----')
234
+ def may_batch_image_analyze(
235
+ images_with_extra_info: list[(np.ndarray, bool, str)],
236
+ idx: int,
237
+ ocr: bool,
238
+ show_log: bool = False,
239
+ layout_model=None,
240
+ formula_enable=None,
241
+ table_enable=None):
242
+ # os.environ['CUDA_VISIBLE_DEVICES'] = str(idx)
243
+
244
+ from magic_pdf.model.batch_analyze import BatchAnalyze
245
+
246
+ model_manager = ModelSingleton()
247
+
248
+ # images = [image for image, _, _ in images_with_extra_info]
249
+ batch_ratio = 1
250
+ device = get_device()
251
+
252
+ if str(device).startswith('npu'):
253
+ import torch_npu
254
+ if torch_npu.npu.is_available():
255
+ torch.npu.set_compile_mode(jit_compile=False)
256
+
257
+ if str(device).startswith('npu') or str(device).startswith('cuda'):
258
+ gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(get_vram(device))))
259
+ if gpu_memory is not None:
260
+ if gpu_memory >= 16:
261
+ batch_ratio = 16
262
+ elif gpu_memory >= 12:
263
+ batch_ratio = 8
264
+ elif gpu_memory >= 8:
265
+ batch_ratio = 4
266
+ elif gpu_memory >= 6:
267
+ batch_ratio = 2
225
268
  else:
226
- result = []
269
+ batch_ratio = 1
270
+ logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
227
271
 
228
- page_info = {'page_no': index, 'width': page_width, 'height': page_height}
229
- page_dict = {'layout_dets': result, 'page_info': page_info}
230
- model_json.append(page_dict)
231
272
 
232
- gc_start = time.time()
233
- clean_memory(get_device())
234
- gc_time = round(time.time() - gc_start, 2)
235
- logger.info(f'gc time: {gc_time}')
236
-
237
- doc_analyze_time = round(time.time() - doc_analyze_start, 2)
238
- doc_analyze_speed = round((end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
239
- logger.info(
240
- f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
241
- f' speed: {doc_analyze_speed} pages/second'
242
- )
273
+ # doc_analyze_start = time.time()
243
274
 
244
- return InferenceResult(model_json, dataset)
275
+ batch_model = BatchAnalyze(model_manager, batch_ratio, show_log, layout_model, formula_enable, table_enable)
276
+ results = batch_model(images_with_extra_info)
277
+
278
+ # gc_start = time.time()
279
+ clean_memory(get_device())
280
+ # gc_time = round(time.time() - gc_start, 2)
281
+ # logger.debug(f'gc time: {gc_time}')
282
+
283
+ # doc_analyze_time = round(time.time() - doc_analyze_start, 2)
284
+ # doc_analyze_speed = round(len(images) / doc_analyze_time, 2)
285
+ # logger.debug(
286
+ # f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
287
+ # f' speed: {doc_analyze_speed} pages/second'
288
+ # )
289
+ return idx, results
@@ -3,28 +3,18 @@ import os
3
3
  import time
4
4
 
5
5
  import cv2
6
- import numpy as np
7
6
  import torch
8
7
  import yaml
9
8
  from loguru import logger
10
- from PIL import Image
11
9
 
12
10
  os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
13
11
 
14
- try:
15
- import torchtext
16
-
17
- if torchtext.__version__ >= '0.18.0':
18
- torchtext.disable_torchtext_deprecation_warning()
19
- except ImportError:
20
- pass
21
-
22
12
  from magic_pdf.config.constants import *
23
13
  from magic_pdf.model.model_list import AtomicModel
24
14
  from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
25
15
  from magic_pdf.model.sub_modules.model_utils import (
26
16
  clean_vram, crop_img, get_res_list_from_layout_res)
27
- from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import (
17
+ from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
28
18
  get_adjusted_mfdetrec_res, get_ocr_result_list)
29
19
 
30
20
 
@@ -120,7 +110,7 @@ class CustomPEKModel:
120
110
  atom_model_name=AtomicModel.MFR,
121
111
  mfr_weight_dir=mfr_weight_dir,
122
112
  mfr_cfg_path=mfr_cfg_path,
123
- device='cpu' if str(self.device).startswith("mps") else self.device,
113
+ device=self.device,
124
114
  )
125
115
 
126
116
  # 初始化layout模型
@@ -174,11 +164,6 @@ class CustomPEKModel:
174
164
  logger.info('DocAnalysis init done!')
175
165
 
176
166
  def __call__(self, image):
177
-
178
- pil_img = Image.fromarray(image)
179
- width, height = pil_img.size
180
- # logger.info(f'width: {width}, height: {height}')
181
-
182
167
  # layout检测
183
168
  layout_start = time.time()
184
169
  layout_res = []
@@ -186,24 +171,6 @@ class CustomPEKModel:
186
171
  # layoutlmv3
187
172
  layout_res = self.layout_model(image, ignore_catids=[])
188
173
  elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
189
- # doclayout_yolo
190
- # if height > width:
191
- # input_res = {"poly":[0,0,width,0,width,height,0,height]}
192
- # new_image, useful_list = crop_img(input_res, pil_img, crop_paste_x=width//2, crop_paste_y=0)
193
- # paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
194
- # layout_res = self.layout_model.predict(new_image)
195
- # for res in layout_res:
196
- # p1, p2, p3, p4, p5, p6, p7, p8 = res['poly']
197
- # p1 = p1 - paste_x + xmin
198
- # p2 = p2 - paste_y + ymin
199
- # p3 = p3 - paste_x + xmin
200
- # p4 = p4 - paste_y + ymin
201
- # p5 = p5 - paste_x + xmin
202
- # p6 = p6 - paste_y + ymin
203
- # p7 = p7 - paste_x + xmin
204
- # p8 = p8 - paste_y + ymin
205
- # res['poly'] = [p1, p2, p3, p4, p5, p6, p7, p8]
206
- # else:
207
174
  layout_res = self.layout_model.predict(image)
208
175
 
209
176
  layout_cost = round(time.time() - layout_start, 2)
@@ -234,11 +201,11 @@ class CustomPEKModel:
234
201
  ocr_start = time.time()
235
202
  # Process each area that requires OCR processing
236
203
  for res in ocr_res_list:
237
- new_image, useful_list = crop_img(res, pil_img, crop_paste_x=50, crop_paste_y=50)
204
+ new_image, useful_list = crop_img(res, image, crop_paste_x=50, crop_paste_y=50)
238
205
  adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list)
239
206
 
240
207
  # OCR recognition
241
- new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
208
+ new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
242
209
 
243
210
  if self.apply_ocr:
244
211
  ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
@@ -260,7 +227,7 @@ class CustomPEKModel:
260
227
  if self.apply_table:
261
228
  table_start = time.time()
262
229
  for res in table_res_list:
263
- new_image, _ = crop_img(res, pil_img)
230
+ new_image, _ = crop_img(res, image)
264
231
  single_table_start_time = time.time()
265
232
  html_code = None
266
233
  if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
@@ -3,8 +3,6 @@ import os
3
3
  from pathlib import Path
4
4
 
5
5
  import yaml
6
- from PIL import Image
7
-
8
6
  os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
9
7
 
10
8
  from magic_pdf.config.constants import MODEL_NAME
@@ -42,7 +40,7 @@ def get_text_images(simple_images):
42
40
  )
43
41
  text_images = []
44
42
  for simple_image in simple_images:
45
- image = Image.fromarray(simple_image['img'])
43
+ image = simple_image['img']
46
44
  layout_res = temp_layout_model.predict(image)
47
45
  # 给textblock截图
48
46
  for res in layout_res:
@@ -51,7 +49,7 @@ def get_text_images(simple_images):
51
49
  # 初步清洗(宽和高都小于100)
52
50
  if x2 - x1 < 100 and y2 - y1 < 100:
53
51
  continue
54
- text_images.append(image.crop((x1, y1, x2, y2)))
52
+ text_images.append(image[y1:y2, x1:x2])
55
53
  return text_images
56
54
 
57
55
 
@@ -2,9 +2,9 @@
2
2
  import time
3
3
  from collections import Counter
4
4
  from uuid import uuid4
5
-
5
+ import cv2
6
+ import numpy as np
6
7
  import torch
7
- from PIL import Image
8
8
  from loguru import logger
9
9
  from ultralytics import YOLO
10
10
 
@@ -29,7 +29,7 @@ def split_images(image, result_images=None):
29
29
  if result_images is None:
30
30
  result_images = []
31
31
 
32
- width, height = image.size
32
+ height, width = image.shape[:2]
33
33
  long_side = max(width, height) # 获取较长边长度
34
34
 
35
35
  if long_side <= 400:
@@ -44,16 +44,14 @@ def split_images(image, result_images=None):
44
44
  # 判断裁剪区域是否超出图片范围,如果超出则不进行裁剪保存操作
45
45
  if x + new_long_side > width:
46
46
  continue
47
- box = (x, 0, x + new_long_side, height)
48
- sub_image = image.crop(box)
47
+ sub_image = image[0:height, x:x + new_long_side]
49
48
  sub_images.append(sub_image)
50
49
  else: # 如果高度是较长边
51
50
  for y in range(0, height, new_long_side):
52
51
  # 判断裁剪区域是否超出图片范围,如果超出则不进行裁剪保存操作
53
52
  if y + new_long_side > height:
54
53
  continue
55
- box = (0, y, width, y + new_long_side)
56
- sub_image = image.crop(box)
54
+ sub_image = image[y:y + new_long_side, 0:width]
57
55
  sub_images.append(sub_image)
58
56
 
59
57
  for sub_image in sub_images:
@@ -64,24 +62,32 @@ def split_images(image, result_images=None):
64
62
 
65
63
  def resize_images_to_224(image):
66
64
  """
67
- 若分辨率小于224则用黑色背景补齐到224*224大小,若大于等于224则调整为224*224大小,并保存到输出文件夹中。
65
+ 若分辨率小于224则用黑色背景补齐到224*224大小,若大于等于224则调整为224*224大小。
66
+ Works directly with NumPy arrays.
68
67
  """
69
68
  try:
70
- width, height = image.size
69
+ height, width = image.shape[:2]
70
+
71
71
  if width < 224 or height < 224:
72
- new_image = Image.new('RGB', (224, 224), (0, 0, 0))
73
- paste_x = (224 - width) // 2
74
- paste_y = (224 - height) // 2
75
- new_image.paste(image, (paste_x, paste_y))
72
+ # Create black background
73
+ new_image = np.zeros((224, 224, 3), dtype=np.uint8)
74
+ # Calculate paste position (ensure they're not negative)
75
+ paste_x = max(0, (224 - width) // 2)
76
+ paste_y = max(0, (224 - height) // 2)
77
+ # Make sure we don't exceed the boundaries of new_image
78
+ paste_width = min(width, 224)
79
+ paste_height = min(height, 224)
80
+ # Paste original image onto black background
81
+ new_image[paste_y:paste_y + paste_height, paste_x:paste_x + paste_width] = image[:paste_height, :paste_width]
76
82
  image = new_image
77
83
  else:
78
- image = image.resize((224, 224), Image.Resampling.LANCZOS)
84
+ # Resize using cv2
85
+ image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_LANCZOS4)
79
86
 
80
- # uuid = str(uuid4())
81
- # image.save(f"/tmp/{uuid}.jpg")
82
87
  return image
83
88
  except Exception as e:
84
- logger.exception(e)
89
+ logger.exception(f"Error in resize_images_to_224: {e}")
90
+ return None
85
91
 
86
92
 
87
93
  class YOLOv11LangDetModel(object):
@@ -96,8 +102,7 @@ class YOLOv11LangDetModel(object):
96
102
  def do_detect(self, images: list):
97
103
  all_images = []
98
104
  for image in images:
99
- width, height = image.size
100
- # logger.info(f"image size: {width} x {height}")
105
+ height, width = image.shape[:2]
101
106
  if width < 100 and height < 100:
102
107
  continue
103
108
  temp_images = split_images(image)
@@ -1,4 +1,5 @@
1
1
  from doclayout_yolo import YOLOv10
2
+ from tqdm import tqdm
2
3
 
3
4
 
4
5
  class DocLayoutYOLOModel(object):
@@ -31,7 +32,8 @@ class DocLayoutYOLOModel(object):
31
32
 
32
33
  def batch_predict(self, images: list, batch_size: int) -> list:
33
34
  images_layout_res = []
34
- for index in range(0, len(images), batch_size):
35
+ # for index in range(0, len(images), batch_size):
36
+ for index in tqdm(range(0, len(images), batch_size), desc="Layout Predict"):
35
37
  doclayout_yolo_res = [
36
38
  image_res.cpu()
37
39
  for image_res in self.model.predict(
@@ -1,3 +1,4 @@
1
+ from tqdm import tqdm
1
2
  from ultralytics import YOLO
2
3
 
3
4
 
@@ -14,7 +15,8 @@ class YOLOv8MFDModel(object):
14
15
 
15
16
  def batch_predict(self, images: list, batch_size: int) -> list:
16
17
  images_mfd_res = []
17
- for index in range(0, len(images), batch_size):
18
+ # for index in range(0, len(images), batch_size):
19
+ for index in tqdm(range(0, len(images), batch_size), desc="MFD Predict"):
18
20
  mfd_res = [
19
21
  image_res.cpu()
20
22
  for image_res in self.mfd_model.predict(