magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py (new file)
@@ -0,0 +1,176 @@
+ import copy
+ import time
+
+ import cv2
+ import numpy as np
+ from paddleocr import PaddleOCR
+ from paddleocr.paddleocr import check_img, logger
+ from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
+ from paddleocr.tools.infer.predict_system import sorted_boxes
+ from paddleocr.tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
+
+ from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes
+
+
+ class ModifiedPaddleOCR(PaddleOCR):
+     def ocr(self,
+             img,
+             det=True,
+             rec=True,
+             cls=True,
+             bin=False,
+             inv=False,
+             alpha_color=(255, 255, 255),
+             mfd_res=None,
+             ):
+         """
+         OCR with PaddleOCR
+         args:
+             img: img for OCR, support ndarray, img_path and list of ndarray
+             det: use text detection or not. If False, only rec will be exec. Default is True
+             rec: use text recognition or not. If False, only det will be exec. Default is True
+             cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+             bin: binarize image to black and white. Default is False.
+             inv: invert image colors. Default is False.
+             alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
+         """
+         assert isinstance(img, (np.ndarray, list, str, bytes))
+         if isinstance(img, list) and det == True:
+             logger.error('When input a list of images, det must be false')
+             exit(0)
+         if cls == True and self.use_angle_cls == False:
+             pass
+             # logger.warning(
+             #     'Since the angle classifier is not initialized, it will not be used during the forward process'
+             # )
+
+         img = check_img(img)
+         # for infer pdf file
+         if isinstance(img, list):
+             if self.page_num > len(img) or self.page_num == 0:
+                 self.page_num = len(img)
+             imgs = img[:self.page_num]
+         else:
+             imgs = [img]
+
+         def preprocess_image(_image):
+             _image = alpha_to_color(_image, alpha_color)
+             if inv:
+                 _image = cv2.bitwise_not(_image)
+             if bin:
+                 _image = binarize_img(_image)
+             return _image
+
+         if det and rec:
+             ocr_res = []
+             for idx, img in enumerate(imgs):
+                 img = preprocess_image(img)
+                 dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
+                 if not dt_boxes and not rec_res:
+                     ocr_res.append(None)
+                     continue
+                 tmp_res = [[box.tolist(), res]
+                            for box, res in zip(dt_boxes, rec_res)]
+                 ocr_res.append(tmp_res)
+             return ocr_res
+         elif det and not rec:
+             ocr_res = []
+             for idx, img in enumerate(imgs):
+                 img = preprocess_image(img)
+                 dt_boxes, elapse = self.text_detector(img)
+                 if dt_boxes is None:
+                     ocr_res.append(None)
+                     continue
+                 dt_boxes = sorted_boxes(dt_boxes)
+                 # merge_det_boxes and update_det_boxes both convert polys to bboxes and back, so strongly skewed text boxes must be filtered out beforehand
+                 dt_boxes = merge_det_boxes(dt_boxes)
+                 if mfd_res:
+                     bef = time.time()
+                     dt_boxes = update_det_boxes(dt_boxes, mfd_res)
+                     aft = time.time()
+                     logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
+                         len(dt_boxes), aft - bef))
+                 tmp_res = [box.tolist() for box in dt_boxes]
+                 ocr_res.append(tmp_res)
+             return ocr_res
+         else:
+             ocr_res = []
+             cls_res = []
+             for idx, img in enumerate(imgs):
+                 if not isinstance(img, list):
+                     img = preprocess_image(img)
+                     img = [img]
+                 if self.use_angle_cls and cls:
+                     img, cls_res_tmp, elapse = self.text_classifier(img)
+                     if not rec:
+                         cls_res.append(cls_res_tmp)
+                 rec_res, elapse = self.text_recognizer(img)
+                 ocr_res.append(rec_res)
+             if not rec:
+                 return cls_res
+             return ocr_res
+
+     def __call__(self, img, cls=True, mfd_res=None):
+         time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
+
+         if img is None:
+             logger.debug("no valid image provided")
+             return None, None, time_dict
+
+         start = time.time()
+         ori_im = img.copy()
+         dt_boxes, elapse = self.text_detector(img)
+         time_dict['det'] = elapse
+
+         if dt_boxes is None:
+             logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
+             end = time.time()
+             time_dict['all'] = end - start
+             return None, None, time_dict
+         else:
+             logger.debug("dt_boxes num : {}, elapsed : {}".format(
+                 len(dt_boxes), elapse))
+         img_crop_list = []
+
+         dt_boxes = sorted_boxes(dt_boxes)
+
+         # merge_det_boxes and update_det_boxes both convert polys to bboxes and back, so strongly skewed text boxes must be filtered out beforehand
+         dt_boxes = merge_det_boxes(dt_boxes)
+
+         if mfd_res:
+             bef = time.time()
+             dt_boxes = update_det_boxes(dt_boxes, mfd_res)
+             aft = time.time()
+             logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
+                 len(dt_boxes), aft - bef))
+
+         for bno in range(len(dt_boxes)):
+             tmp_box = copy.deepcopy(dt_boxes[bno])
+             if self.args.det_box_type == "quad":
+                 img_crop = get_rotate_crop_image(ori_im, tmp_box)
+             else:
+                 img_crop = get_minarea_rect_crop(ori_im, tmp_box)
+             img_crop_list.append(img_crop)
+         if self.use_angle_cls and cls:
+             img_crop_list, angle_list, elapse = self.text_classifier(
+                 img_crop_list)
+             time_dict['cls'] = elapse
+             logger.debug("cls num : {}, elapsed : {}".format(
+                 len(img_crop_list), elapse))
+
+         rec_res, elapse = self.text_recognizer(img_crop_list)
+         time_dict['rec'] = elapse
+         logger.debug("rec_res num : {}, elapsed : {}".format(
+             len(rec_res), elapse))
+         if self.args.save_crop_res:
+             self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
+                                    rec_res)
+         filter_boxes, filter_rec_res = [], []
+         for box, rec_result in zip(dt_boxes, rec_res):
+             text, score = rec_result
+             if score >= self.drop_score:
+                 filter_boxes.append(box)
+                 filter_rec_res.append(rec_result)
+         end = time.time()
+         time_dict['all'] = end - start
+         return filter_boxes, filter_rec_res, time_dict
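For orientation, a minimal usage sketch of the wrapper above (not part of the diff): it assumes PaddleOCR's usual constructor arguments and a hypothetical mfd_res of the form [{'bbox': [x0, y0, x1, y1]}, ...], which is how update_det_boxes appears to consume formula-detection output.

    # Sketch only -- illustrative paths and formula boxes, not shipped code.
    import cv2
    from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR

    ocr_engine = ModifiedPaddleOCR(use_angle_cls=False, show_log=False)
    page_img = cv2.imread('page_0.png')               # hypothetical rendered PDF page
    formula_boxes = [{'bbox': [120, 340, 480, 400]}]  # hypothetical MFD detections

    # det + rec, with detected text boxes split around the formula regions
    result = ocr_engine.ocr(page_img, det=True, rec=True, mfd_res=formula_boxes)
    for box, (text, score) in (result[0] or []):
        print(round(score, 3), text)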
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py (new file)
@@ -0,0 +1,213 @@
+ import copy
+ import time
+
+
+ import cv2
+ import numpy as np
+ from paddleocr import PaddleOCR
+ from paddleocr.paddleocr import check_img, logger
+ from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
+ from paddleocr.tools.infer.predict_system import sorted_boxes
+ from paddleocr.tools.infer.utility import slice_generator, merge_fragmented, get_rotate_crop_image, \
+     get_minarea_rect_crop
+
+ from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes
+
+
+ class ModifiedPaddleOCR(PaddleOCR):
+
+     def ocr(
+         self,
+         img,
+         det=True,
+         rec=True,
+         cls=True,
+         bin=False,
+         inv=False,
+         alpha_color=(255, 255, 255),
+         slice={},
+         mfd_res=None,
+     ):
+         """
+         OCR with PaddleOCR
+
+         Args:
+             img: Image for OCR. It can be an ndarray, img_path, or a list of ndarrays.
+             det: Use text detection or not. If False, only text recognition will be executed. Default is True.
+             rec: Use text recognition or not. If False, only text detection will be executed. Default is True.
+             cls: Use angle classifier or not. Default is True. If True, the text with a rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance.
+             bin: Binarize image to black and white. Default is False.
+             inv: Invert image colors. Default is False.
+             alpha_color: Set RGB color Tuple for transparent parts replacement. Default is pure white.
+             slice: Use sliding window inference for large images. Both det and rec must be True. Requires int values for slice["horizontal_stride"], slice["vertical_stride"], slice["merge_x_thres"], slice["merge_y_thres"] (See doc/doc_en/slice_en.md). Default is {}.
+
+         Returns:
+             If both det and rec are True, returns a list of OCR results for each image. Each OCR result is a list of bounding boxes and recognized text for each detected text region.
+             If det is True and rec is False, returns a list of detected bounding boxes for each image.
+             If det is False and rec is True, returns a list of recognized text for each image.
+             If both det and rec are False, returns a list of angle classification results for each image.
+
+         Raises:
+             AssertionError: If the input image is not of type ndarray, list, str, or bytes.
+             SystemExit: If det is True and the input is a list of images.
+
+         Note:
+             - If the angle classifier is not initialized (use_angle_cls=False), it will not be used during the forward process.
+             - For PDF files, if the input is a list of images and the page_num is specified, only the first page_num images will be processed.
+             - The preprocess_image function is used to preprocess the input image by applying alpha color replacement, inversion, and binarization if specified.
+         """
+         assert isinstance(img, (np.ndarray, list, str, bytes))
+         if isinstance(img, list) and det == True:
+             logger.error("When input a list of images, det must be false")
+             exit(0)
+         if cls == True and self.use_angle_cls == False:
+             logger.warning(
+                 "Since the angle classifier is not initialized, it will not be used during the forward process"
+             )
+
+         img, flag_gif, flag_pdf = check_img(img, alpha_color)
+         # for infer pdf file
+         if isinstance(img, list) and flag_pdf:
+             if self.page_num > len(img) or self.page_num == 0:
+                 imgs = img
+             else:
+                 imgs = img[: self.page_num]
+         else:
+             imgs = [img]
+
+         def preprocess_image(_image):
+             _image = alpha_to_color(_image, alpha_color)
+             if inv:
+                 _image = cv2.bitwise_not(_image)
+             if bin:
+                 _image = binarize_img(_image)
+             return _image
+
+         if det and rec:
+             ocr_res = []
+             for img in imgs:
+                 img = preprocess_image(img)
+                 dt_boxes, rec_res, _ = self.__call__(img, cls, slice, mfd_res=mfd_res)
+                 if not dt_boxes and not rec_res:
+                     ocr_res.append(None)
+                     continue
+                 tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
+                 ocr_res.append(tmp_res)
+             return ocr_res
+         elif det and not rec:
+             ocr_res = []
+             for img in imgs:
+                 img = preprocess_image(img)
+                 dt_boxes, elapse = self.text_detector(img)
+                 if dt_boxes.size == 0:
+                     ocr_res.append(None)
+                     continue
+                 tmp_res = [box.tolist() for box in dt_boxes]
+                 ocr_res.append(tmp_res)
+             return ocr_res
+         else:
+             ocr_res = []
+             cls_res = []
+             for img in imgs:
+                 if not isinstance(img, list):
+                     img = preprocess_image(img)
+                     img = [img]
+                 if self.use_angle_cls and cls:
+                     img, cls_res_tmp, elapse = self.text_classifier(img)
+                     if not rec:
+                         cls_res.append(cls_res_tmp)
+                 rec_res, elapse = self.text_recognizer(img)
+                 ocr_res.append(rec_res)
+             if not rec:
+                 return cls_res
+             return ocr_res
+
+     def __call__(self, img, cls=True, slice={}, mfd_res=None):
+         time_dict = {"det": 0, "rec": 0, "cls": 0, "all": 0}
+
+         if img is None:
+             logger.debug("no valid image provided")
+             return None, None, time_dict
+
+         start = time.time()
+         ori_im = img.copy()
+         if slice:
+             slice_gen = slice_generator(
+                 img,
+                 horizontal_stride=slice["horizontal_stride"],
+                 vertical_stride=slice["vertical_stride"],
+             )
+             elapsed = []
+             dt_slice_boxes = []
+             for slice_crop, v_start, h_start in slice_gen:
+                 dt_boxes, elapse = self.text_detector(slice_crop, use_slice=True)
+                 if dt_boxes.size:
+                     dt_boxes[:, :, 0] += h_start
+                     dt_boxes[:, :, 1] += v_start
+                     dt_slice_boxes.append(dt_boxes)
+                     elapsed.append(elapse)
+             dt_boxes = np.concatenate(dt_slice_boxes)
+
+             dt_boxes = merge_fragmented(
+                 boxes=dt_boxes,
+                 x_threshold=slice["merge_x_thres"],
+                 y_threshold=slice["merge_y_thres"],
+             )
+             elapse = sum(elapsed)
+         else:
+             dt_boxes, elapse = self.text_detector(img)
+
+         time_dict["det"] = elapse
+
+         if dt_boxes is None:
+             logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
+             end = time.time()
+             time_dict["all"] = end - start
+             return None, None, time_dict
+         else:
+             logger.debug(
+                 "dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), elapse)
+             )
+         img_crop_list = []
+
+         dt_boxes = sorted_boxes(dt_boxes)
+
+         if mfd_res:
+             bef = time.time()
+             dt_boxes = update_det_boxes(dt_boxes, mfd_res)
+             aft = time.time()
+             logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
+                 len(dt_boxes), aft - bef))
+
+         for bno in range(len(dt_boxes)):
+             tmp_box = copy.deepcopy(dt_boxes[bno])
+             if self.args.det_box_type == "quad":
+                 img_crop = get_rotate_crop_image(ori_im, tmp_box)
+             else:
+                 img_crop = get_minarea_rect_crop(ori_im, tmp_box)
+             img_crop_list.append(img_crop)
+         if self.use_angle_cls and cls:
+             img_crop_list, angle_list, elapse = self.text_classifier(img_crop_list)
+             time_dict["cls"] = elapse
+             logger.debug(
+                 "cls num : {}, elapsed : {}".format(len(img_crop_list), elapse)
+             )
+         if len(img_crop_list) > 1000:
+             logger.debug(
+                 f"rec crops num: {len(img_crop_list)}, time and memory cost may be large."
+             )
+
+         rec_res, elapse = self.text_recognizer(img_crop_list)
+         time_dict["rec"] = elapse
+         logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse))
+         if self.args.save_crop_res:
+             self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, rec_res)
+         filter_boxes, filter_rec_res = [], []
+         for box, rec_result in zip(dt_boxes, rec_res):
+             text, score = rec_result[0], rec_result[1]
+             if score >= self.drop_score:
+                 filter_boxes.append(box)
+                 filter_rec_res.append(rec_result)
+         end = time.time()
+         time_dict["all"] = end - start
+         return filter_boxes, filter_rec_res, time_dict
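The 2.9.1-compatible wrapper differs mainly in the new slice argument for sliding-window detection on oversized images. A hedged sketch of how it might be driven, with stride and merge thresholds chosen arbitrarily (the key names come from the docstring above):

    # Sketch only -- values are illustrative, not tuned defaults.
    import cv2
    from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR

    ocr_engine = ModifiedPaddleOCR(use_angle_cls=False)
    long_page = cv2.imread('long_scanned_page.png')   # hypothetical very tall scan

    slice_cfg = {
        'horizontal_stride': 300,   # window step in x
        'vertical_stride': 500,     # window step in y
        'merge_x_thres': 50,        # merge fragments closer than this in x
        'merge_y_thres': 35,        # merge fragments closer than this in y
    }
    result = ocr_engine.ocr(long_page, det=True, rec=True, slice=slice_cfg)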
File without changes
magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py (new file)
@@ -0,0 +1,242 @@
+ from typing import List
+ import cv2
+ import numpy as np
+
+
+ def projection_by_bboxes(boxes: np.array, axis: int) -> np.ndarray:
+     """
+     Build a projection histogram from a set of bboxes; the output is per pixel.
+
+     Args:
+         boxes: [N, 4]
+         axis: 0 - project x coordinates onto the horizontal axis, 1 - project y coordinates onto the vertical axis
+
+     Returns:
+         A 1D projection histogram whose length is the maximum coordinate along the projection axis (the actual image size is not needed, since we only look for the gaps between text boxes)
+
+     """
+     assert axis in [0, 1]
+     length = np.max(boxes[:, axis::2])
+     res = np.zeros(length, dtype=int)
+     # TODO: how to remove for loop?
+     for start, end in boxes[:, axis::2]:
+         res[start:end] += 1
+     return res
+
+
+ # from: https://dothinking.github.io/2021-06-19-%E9%80%92%E5%BD%92%E6%8A%95%E5%BD%B1%E5%88%86%E5%89%B2%E7%AE%97%E6%B3%95/#:~:text=%E9%80%92%E5%BD%92%E6%8A%95%E5%BD%B1%E5%88%86%E5%89%B2%EF%BC%88Recursive%20XY,%EF%BC%8C%E5%8F%AF%E4%BB%A5%E5%88%92%E5%88%86%E6%AE%B5%E8%90%BD%E3%80%81%E8%A1%8C%E3%80%82
+ def split_projection_profile(arr_values: np.array, min_value: float, min_gap: float):
+     """Split projection profile:
+
+     ```
+                           ┌──┐
+       arr_values          │  │       ┌─┐───
+          ┌──┐             │  │       │ │  |
+          │  │             │  │ ┌───┐ │ │ min_value
+          │  │<- min_gap ->│  │ │   │ │ │  |
+      ────┴──┴─────────────┴──┴─┴───┴─┴─┴─┴───
+      0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16
+     ```
+
+     Args:
+         arr_values (np.array): 1-d array representing the projection profile.
+         min_value (float): Ignore the profile if `arr_value` is less than `min_value`.
+         min_gap (float): Ignore the gap if less than this value.
+
+     Returns:
+         tuple: Start indexes and end indexes of split groups.
+     """
+     # all indexes with projection height exceeding the threshold
+     arr_index = np.where(arr_values > min_value)[0]
+     if not len(arr_index):
+         return
+
+     # find zero intervals between adjacent projections
+     # |  |                    ||
+     # ||||<- zero-interval -> |||||
+     arr_diff = arr_index[1:] - arr_index[0:-1]
+     arr_diff_index = np.where(arr_diff > min_gap)[0]
+     arr_zero_intvl_start = arr_index[arr_diff_index]
+     arr_zero_intvl_end = arr_index[arr_diff_index + 1]
+
+     # convert to index of projection range:
+     # the start index of zero interval is the end index of projection
+     arr_start = np.insert(arr_zero_intvl_end, 0, arr_index[0])
+     arr_end = np.append(arr_zero_intvl_start, arr_index[-1])
+     arr_end += 1  # end index will be excluded as index slice
+
+     return arr_start, arr_end
+
+
+ def recursive_xy_cut(boxes: np.ndarray, indices: List[int], res: List[int]):
+     """
+
+     Args:
+         boxes: (N, 4)
+         indices: indices of the boxes in the original data, preserved throughout the recursion
+         res: list that collects the output order
+
+     """
+     # project onto the y axis
+     assert len(boxes) == len(indices)
+
+     _indices = boxes[:, 1].argsort()
+     y_sorted_boxes = boxes[_indices]
+     y_sorted_indices = indices[_indices]
+
+     # debug_vis(y_sorted_boxes, y_sorted_indices)
+
+     y_projection = projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
+     pos_y = split_projection_profile(y_projection, 0, 1)
+     if not pos_y:
+         return
+
+     arr_y0, arr_y1 = pos_y
+     for r0, r1 in zip(arr_y0, arr_y1):
+         # [r0, r1] is a horizontal band that contains bboxes; each such band is then split vertically
+         _indices = (r0 <= y_sorted_boxes[:, 1]) & (y_sorted_boxes[:, 1] < r1)
+
+         y_sorted_boxes_chunk = y_sorted_boxes[_indices]
+         y_sorted_indices_chunk = y_sorted_indices[_indices]
+
+         _indices = y_sorted_boxes_chunk[:, 0].argsort()
+         x_sorted_boxes_chunk = y_sorted_boxes_chunk[_indices]
+         x_sorted_indices_chunk = y_sorted_indices_chunk[_indices]
+
+         # project onto the x axis
+         x_projection = projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
+         pos_x = split_projection_profile(x_projection, 0, 1)
+         if not pos_x:
+             continue
+
+         arr_x0, arr_x1 = pos_x
+         if len(arr_x0) == 1:
+             # cannot be split further along x
+             res.extend(x_sorted_indices_chunk)
+             continue
+
+         # the band can be split along x; recurse into each part
+         for c0, c1 in zip(arr_x0, arr_x1):
+             _indices = (c0 <= x_sorted_boxes_chunk[:, 0]) & (
+                 x_sorted_boxes_chunk[:, 0] < c1
+             )
+             recursive_xy_cut(
+                 x_sorted_boxes_chunk[_indices], x_sorted_indices_chunk[_indices], res
+             )
+
+
+ def points_to_bbox(points):
+     assert len(points) == 8
+
+     # [x1,y1,x2,y2,x3,y3,x4,y4]
+     left = min(points[::2])
+     right = max(points[::2])
+     top = min(points[1::2])
+     bottom = max(points[1::2])
+
+     left = max(left, 0)
+     top = max(top, 0)
+     right = max(right, 0)
+     bottom = max(bottom, 0)
+     return [left, top, right, bottom]
+
+
+ def bbox2points(bbox):
+     left, top, right, bottom = bbox
+     return [left, top, right, top, right, bottom, left, bottom]
+
+
+ def vis_polygon(img, points, thickness=2, color=None):
+     br2bl_color = color
+     tl2tr_color = color
+     tr2br_color = color
+     bl2tl_color = color
+     cv2.line(
+         img,
+         (points[0][0], points[0][1]),
+         (points[1][0], points[1][1]),
+         color=tl2tr_color,
+         thickness=thickness,
+     )
+
+     cv2.line(
+         img,
+         (points[1][0], points[1][1]),
+         (points[2][0], points[2][1]),
+         color=tr2br_color,
+         thickness=thickness,
+     )
+
+     cv2.line(
+         img,
+         (points[2][0], points[2][1]),
+         (points[3][0], points[3][1]),
+         color=br2bl_color,
+         thickness=thickness,
+     )
+
+     cv2.line(
+         img,
+         (points[3][0], points[3][1]),
+         (points[0][0], points[0][1]),
+         color=bl2tl_color,
+         thickness=thickness,
+     )
+     return img
+
+
+ def vis_points(
+     img: np.ndarray, points, texts: List[str] = None, color=(0, 200, 0)
+ ) -> np.ndarray:
+     """
+
+     Args:
+         img:
+         points: [N, 8] 8: x1,y1,x2,y2,x3,y3,x4,y4
+         texts:
+         color:
+
+     Returns:
+
+     """
+     points = np.array(points)
+     if texts is not None:
+         assert len(texts) == points.shape[0]
+
+     for i, _points in enumerate(points):
+         vis_polygon(img, _points.reshape(-1, 2), thickness=2, color=color)
+         bbox = points_to_bbox(_points)
+         left, top, right, bottom = bbox
+         cx = (left + right) // 2
+         cy = (top + bottom) // 2
+
+         txt = texts[i]
+         font = cv2.FONT_HERSHEY_SIMPLEX
+         cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
+
+         img = cv2.rectangle(
+             img,
+             (cx - 5 * len(txt), cy - cat_size[1] - 5),
+             (cx - 5 * len(txt) + cat_size[0], cy - 5),
+             color,
+             -1,
+         )
+
+         img = cv2.putText(
+             img,
+             txt,
+             (cx - 5 * len(txt), cy - 5),
+             font,
+             0.5,
+             (255, 255, 255),
+             thickness=1,
+             lineType=cv2.LINE_AA,
+         )
+
+     return img
+
+
+ def vis_polygons_with_index(image, points):
+     texts = [str(i) for i in range(len(points))]
+     res_img = vis_points(image.copy(), points, texts)
+     return res_img
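The XY-cut helpers above are what the new layoutreader path uses to derive a reading order from layout bounding boxes. A small sketch with made-up coordinates (a two-column page with two blocks in the left column):

    # Sketch only -- coordinates are invented for illustration.
    import numpy as np
    from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut

    boxes = np.asarray([
        [60, 400, 300, 480],   # left column, lower block
        [60, 100, 300, 180],   # left column, upper block
        [340, 100, 580, 480],  # right column
    ], dtype=int)

    order = []
    recursive_xy_cut(boxes, np.arange(len(boxes)), order)
    print([int(i) for i in order])  # [1, 0, 2]: left column top-to-bottom, then right column

vis_polygons_with_index can then draw each box with its rank to eyeball the resulting order.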
File without changes
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py (new file)
@@ -0,0 +1,16 @@
+ import numpy as np
+ from rapid_table import RapidTable
+ from rapidocr_paddle import RapidOCR
+
+
+ class RapidTableModel(object):
+     def __init__(self):
+         self.table_model = RapidTable()
+         self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
+
+     def predict(self, image):
+         ocr_result, _ = self.ocr_engine(np.asarray(image))
+         if ocr_result is None:
+             return None, None, None
+         html_code, table_cell_bboxes, elapse = self.table_model(np.asarray(image), ocr_result)
+         return html_code, table_cell_bboxes, elapse
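For context, the new RapidTable-backed recognizer is driven per table crop; a sketch (the input path is hypothetical, and the CUDA flags are fixed in __init__ above):

    # Sketch only.
    from PIL import Image
    from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel

    table_model = RapidTableModel()            # wires RapidOCR results into RapidTable
    table_img = Image.open('table_crop.png')   # hypothetical cropped table region
    html_code, cell_bboxes, elapse = table_model.predict(table_img)
    if html_code is not None:
        print(html_code)                       # table structure as HTML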
magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py}
@@ -1,8 +1,8 @@
- import re
-
  import torch
  from struct_eqtable import build_model
 
+ from magic_pdf.model.sub_modules.table.table_utils import minify_html
+
 
  class StructTableModel:
      def __init__(self, model_path, max_new_tokens=1024, max_time=60):
@@ -31,15 +31,7 @@ class StructTableModel:
          )
 
          if output_format == "html":
-             results = [self.minify_html(html) for html in results]
+             results = [minify_html(html) for html in results]
 
          return results
 
-     def minify_html(self, html):
-         # remove redundant whitespace
-         html = re.sub(r'\s+', ' ', html)
-         # remove whitespace after closing brackets
-         html = re.sub(r'\s*>\s*', '>', html)
-         # remove whitespace before opening brackets
-         html = re.sub(r'\s*<\s*', '<', html)
-         return html.strip()
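The minify_html helper removed here is now imported from magic_pdf.model.sub_modules.table.table_utils (file 40 in the list above). Judging from the deleted body, the relocated module-level function is presumably equivalent to:

    # Presumed shape of the relocated helper, reconstructed from the removed method.
    import re

    def minify_html(html):
        # collapse runs of whitespace
        html = re.sub(r'\s+', ' ', html)
        # strip whitespace around closing brackets
        html = re.sub(r'\s*>\s*', '>', html)
        # strip whitespace around opening brackets
        html = re.sub(r'\s*<\s*', '<', html)
        return html.strip()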