magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,10 @@
1
- from loguru import logger
2
1
 
3
- from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
4
- __is_overlaps_y_exceeds_threshold, calculate_iou
5
- from magic_pdf.libs.drop_tag import DropTag
6
- from magic_pdf.libs.ocr_content_type import ContentType, BlockType
2
+ from magic_pdf.config.drop_tag import DropTag
3
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
4
+ from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
5
+ calculate_iou,
6
+ calculate_overlap_area_in_bbox1_area_ratio,
7
+ get_minbox_if_overlap_by_ratio)
7
8
 
8
9
 
9
10
  def remove_overlaps_low_confidence_spans(spans):
@@ -21,7 +22,10 @@ def remove_overlaps_low_confidence_spans(spans):
21
22
  span_need_remove = span1
22
23
  else:
23
24
  span_need_remove = span2
24
- if span_need_remove is not None and span_need_remove not in dropped_spans:
25
+ if (
26
+ span_need_remove is not None
27
+ and span_need_remove not in dropped_spans
28
+ ):
25
29
  dropped_spans.append(span_need_remove)
26
30
 
27
31
  if len(dropped_spans) > 0:
@@ -38,12 +42,15 @@ def remove_overlaps_min_spans(spans):
38
42
  for span1 in spans:
39
43
  for span2 in spans:
40
44
  if span1 != span2:
41
- overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
42
- if overlap_box is not None:
43
- span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
44
- if span_need_remove is not None and span_need_remove not in dropped_spans:
45
- dropped_spans.append(span_need_remove)
46
-
45
+ # span1 span2 任何一个都不应该在 dropped_spans 中
46
+ if span1 in dropped_spans or span2 in dropped_spans:
47
+ continue
48
+ else:
49
+ overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
50
+ if overlap_box is not None:
51
+ span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
52
+ if span_need_remove is not None and span_need_remove not in dropped_spans:
53
+ dropped_spans.append(span_need_remove)
47
54
  if len(dropped_spans) > 0:
48
55
  for span_need_remove in dropped_spans:
49
56
  spans.remove(span_need_remove)
@@ -58,7 +65,10 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
58
65
  need_remove_spans = []
59
66
  for span in spans:
60
67
  for removed_bbox in need_remove_spans_bboxes:
61
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
68
+ if (
69
+ calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox)
70
+ > 0.5
71
+ ):
62
72
  if span not in need_remove_spans:
63
73
  need_remove_spans.append(span)
64
74
  break
@@ -78,12 +88,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
78
88
  for span in spans:
79
89
  # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
80
90
  for removed_bbox in removed_bboxes:
81
- if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
91
+ if (
92
+ calculate_overlap_area_in_bbox1_area_ratio(
93
+ span['bbox'], removed_bbox
94
+ )
95
+ > 0.5
96
+ ):
82
97
  need_remove_spans.append(span)
83
98
  break
84
99
  # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
85
- elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3] and \
86
- removed_bbox[0] < (span['bbox'][0] + span['bbox'][2]) / 2 < removed_bbox[2]:
100
+ elif (
101
+ drop_tag == DropTag.FOOTNOTE
102
+ and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3]
103
+ and removed_bbox[0]
104
+ < (span['bbox'][0] + span['bbox'][2]) / 2
105
+ < removed_bbox[2]
106
+ ):
87
107
  need_remove_spans.append(span)
88
108
  break
89
109
 
@@ -98,11 +118,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
98
118
  def adjust_bbox_for_standalone_block(spans):
99
119
  # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
100
120
  for sb_span in spans:
101
- if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
121
+ if sb_span['type'] in [
122
+ ContentType.InterlineEquation,
123
+ ContentType.Image,
124
+ ContentType.Table,
125
+ ]:
102
126
  for text_span in spans:
103
127
  if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
104
128
  # 判断span2的纵向高度是否被span所覆盖
105
- if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
129
+ if (
130
+ sb_span['bbox'][1] < text_span['bbox'][1]
131
+ and sb_span['bbox'][3] > text_span['bbox'][3]
132
+ ):
106
133
  # 判断span2是否在span左边
107
134
  if text_span['bbox'][0] < sb_span['bbox'][0]:
108
135
  # 调整span的y0和span2的y0一致
@@ -120,11 +147,15 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
120
147
 
121
148
  lines = []
122
149
  current_line = [spans[0]]
123
- if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
150
+ if spans[0]['type'] in [
151
+ ContentType.InterlineEquation,
152
+ ContentType.Image,
153
+ ContentType.Table,
154
+ ]:
124
155
  displayed_list.append(spans[0])
125
156
 
126
- line_first_y0 = spans[0]["bbox"][1]
127
- line_first_y = spans[0]["bbox"][3]
157
+ line_first_y0 = spans[0]['bbox'][1]
158
+ line_first_y = spans[0]['bbox'][3]
128
159
  # 用于给行间公式搜索
129
160
  # text_inline_lines = []
130
161
  for span in spans[1:]:
@@ -132,26 +163,43 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
132
163
  # print("debug")
133
164
  # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
134
165
  # image和table类型,同上
135
- if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
136
- s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
137
- current_line):
166
+ if span['type'] in [
167
+ ContentType.InterlineEquation,
168
+ ContentType.Image,
169
+ ContentType.Table,
170
+ ] or any(
171
+ s['type']
172
+ in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
173
+ for s in current_line
174
+ ):
138
175
  # 传入
139
- if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
176
+ if span['type'] in [
177
+ ContentType.InterlineEquation,
178
+ ContentType.Image,
179
+ ContentType.Table,
180
+ ]:
140
181
  displayed_list.append(span)
141
182
  # 则开始新行
142
183
  lines.append(current_line)
143
- if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
144
- text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
184
+ if len(current_line) > 1 or current_line[0]['type'] in [
185
+ ContentType.Text,
186
+ ContentType.InlineEquation,
187
+ ]:
188
+ text_inline_lines.append(
189
+ (current_line, (line_first_y0, line_first_y))
190
+ )
145
191
  current_line = [span]
146
- line_first_y0 = span["bbox"][1]
147
- line_first_y = span["bbox"][3]
192
+ line_first_y0 = span['bbox'][1]
193
+ line_first_y = span['bbox'][3]
148
194
  continue
149
195
 
150
196
  # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
151
- if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
152
- if span["type"] == "text":
153
- line_first_y0 = span["bbox"][1]
154
- line_first_y = span["bbox"][3]
197
+ if __is_overlaps_y_exceeds_threshold(
198
+ span['bbox'], current_line[-1]['bbox']
199
+ ):
200
+ if span['type'] == 'text':
201
+ line_first_y0 = span['bbox'][1]
202
+ line_first_y = span['bbox'][3]
155
203
  current_line.append(span)
156
204
 
157
205
  else:
@@ -159,13 +207,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
159
207
  lines.append(current_line)
160
208
  text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
161
209
  current_line = [span]
162
- line_first_y0 = span["bbox"][1]
163
- line_first_y = span["bbox"][3]
210
+ line_first_y0 = span['bbox'][1]
211
+ line_first_y = span['bbox'][3]
164
212
 
165
213
  # 添加最后一行
166
214
  if current_line:
167
215
  lines.append(current_line)
168
- if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
216
+ if len(current_line) > 1 or current_line[0]['type'] in [
217
+ ContentType.Text,
218
+ ContentType.InlineEquation,
219
+ ]:
169
220
  text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
170
221
  for line in text_inline_lines:
171
222
  # 按照x0坐标排序
@@ -176,8 +227,8 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
176
227
  for line in text_inline_lines:
177
228
  current_line, (line_first_y0, line_first_y) = line
178
229
  for span in current_line:
179
- span["bbox"][1] = line_first_y0
180
- span["bbox"][3] = line_first_y
230
+ span['bbox'][1] = line_first_y0
231
+ span['bbox'][3] = line_first_y
181
232
 
182
233
  # return spans, displayed_list, text_inline_lines
183
234
 
@@ -189,34 +240,42 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
189
240
  # if i == 8:
190
241
  # print("debug")
191
242
  span = displayed_list[i]
192
- span_y0, span_y = span["bbox"][1], span["bbox"][3]
243
+ span_y0, span_y = span['bbox'][1], span['bbox'][3]
193
244
 
194
245
  while j < len(text_inline_lines):
195
246
  text_line = text_inline_lines[j]
196
247
  y0, y1 = text_line[1]
197
248
  if (
198
- span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1
199
- ) and __is_overlaps_y_exceeds_threshold(
200
- span['bbox'], (0, y0, 0, y1)
201
- ):
249
+ span_y0 < y0 < span_y
250
+ or span_y0 < y1 < span_y
251
+ or span_y0 < y0
252
+ and span_y > y1
253
+ ) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
202
254
  # 调整公式类型
203
- if span["type"] == ContentType.InterlineEquation:
255
+ if span['type'] == ContentType.InterlineEquation:
204
256
  # 最后一行是行间公式
205
257
  if j + 1 >= len(text_inline_lines):
206
- span["type"] = ContentType.InlineEquation
207
- span["bbox"][1] = y0
208
- span["bbox"][3] = y1
258
+ span['type'] = ContentType.InlineEquation
259
+ span['bbox'][1] = y0
260
+ span['bbox'][3] = y1
209
261
  else:
210
262
  # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
211
263
  y0_next, y1_next = text_inline_lines[j + 1][1]
212
- if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
213
- y1 - y0) > span_y - span_y0:
214
- span["type"] = ContentType.InlineEquation
215
- span["bbox"][1] = y0
216
- span["bbox"][3] = y1
264
+ if (
265
+ not __is_overlaps_y_exceeds_threshold(
266
+ span['bbox'], (0, y0_next, 0, y1_next)
267
+ )
268
+ and 3 * (y1 - y0) > span_y - span_y0
269
+ ):
270
+ span['type'] = ContentType.InlineEquation
271
+ span['bbox'][1] = y0
272
+ span['bbox'][3] = y1
217
273
  break
218
- elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'],
219
- (0, y0, 0, y1)):
274
+ elif (
275
+ span_y < y0
276
+ or span_y0 < y0 < span_y
277
+ and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
278
+ ):
220
279
  break
221
280
  else:
222
281
  j += 1
@@ -232,15 +291,15 @@ def get_qa_need_list(blocks):
232
291
  inline_equations = []
233
292
 
234
293
  for block in blocks:
235
- for line in block["lines"]:
236
- for span in line["spans"]:
237
- if span["type"] == ContentType.Image:
294
+ for line in block['lines']:
295
+ for span in line['spans']:
296
+ if span['type'] == ContentType.Image:
238
297
  images.append(span)
239
- elif span["type"] == ContentType.Table:
298
+ elif span['type'] == ContentType.Table:
240
299
  tables.append(span)
241
- elif span["type"] == ContentType.InlineEquation:
300
+ elif span['type'] == ContentType.InlineEquation:
242
301
  inline_equations.append(span)
243
- elif span["type"] == ContentType.InterlineEquation:
302
+ elif span['type'] == ContentType.InterlineEquation:
244
303
  interline_equations.append(span)
245
304
  else:
246
305
  continue
@@ -254,10 +313,10 @@ def get_qa_need_list_v2(blocks):
254
313
  interline_equations = []
255
314
 
256
315
  for block in blocks:
257
- if block["type"] == BlockType.Image:
316
+ if block['type'] == BlockType.Image:
258
317
  images.append(block)
259
- elif block["type"] == BlockType.Table:
318
+ elif block['type'] == BlockType.Table:
260
319
  tables.append(block)
261
- elif block["type"] == BlockType.InterlineEquation:
320
+ elif block['type'] == BlockType.InterlineEquation:
262
321
  interline_equations.append(block)
263
322
  return images, tables, interline_equations
@@ -1,58 +1,65 @@
1
- from magic_pdf.libs.commons import fitz
1
+ from magic_pdf.config.drop_reason import DropReason
2
2
  from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
3
- from magic_pdf.libs.drop_reason import DropReason
3
+ from magic_pdf.libs.commons import fitz
4
4
 
5
5
 
6
6
  def __area(box):
7
7
  return (box[2] - box[0]) * (box[3] - box[1])
8
8
 
9
- def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool:
10
- """
11
- 检查page是包含有颜色背景的矩形
12
- """
9
+
10
+ def __is_contain_color_background_rect(
11
+ page: fitz.Page, text_blocks, image_bboxes
12
+ ) -> bool:
13
+ """检查page是包含有颜色背景的矩形."""
13
14
  color_bg_rect = []
14
15
  p_width, p_height = page.rect.width, page.rect.height
15
-
16
+
16
17
  # 先找到最大的带背景矩形
17
18
  blocks = page.get_cdrawings()
18
19
  for block in blocks:
19
-
20
- if 'fill' in block and block['fill']: # 过滤掉透明的
20
+ if 'fill' in block and block['fill']: # 过滤掉透明的
21
21
  fill = list(block['fill'])
22
22
  fill[0], fill[1], fill[2] = int(fill[0]), int(fill[1]), int(fill[2])
23
- if fill==(1.0,1.0,1.0):
23
+ if fill == (1.0, 1.0, 1.0):
24
24
  continue
25
25
  rect = block['rect']
26
26
  # 过滤掉特别小的矩形
27
- if __area(rect) < 10*10:
27
+ if __area(rect) < 10 * 10:
28
28
  continue
29
29
  # 为了防止是svg图片上的色块,这里过滤掉这类
30
-
31
- if any([_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]):
30
+
31
+ if any(
32
+ [_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]
33
+ ):
32
34
  continue
33
35
  color_bg_rect.append(rect)
34
-
36
+
35
37
  # 找到最大的背景矩形
36
38
  if len(color_bg_rect) > 0:
37
- max_rect = max(color_bg_rect, key=lambda x:__area(x))
38
- max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3]))
39
+ max_rect = max(color_bg_rect, key=lambda x: __area(x))
40
+ max_rect_int = (
41
+ int(max_rect[0]),
42
+ int(max_rect[1]),
43
+ int(max_rect[2]),
44
+ int(max_rect[3]),
45
+ )
39
46
  # 判断最大的背景矩形是否包含超过3行文字,或者50个字 TODO
40
- if max_rect[2]-max_rect[0] > 0.2*p_width and max_rect[3]-max_rect[1] > 0.1*p_height:#宽度符合
41
- #看是否有文本块落入到这个矩形中
47
+ if (
48
+ max_rect[2] - max_rect[0] > 0.2 * p_width
49
+ and max_rect[3] - max_rect[1] > 0.1 * p_height
50
+ ): # 宽度符合
51
+ # 看是否有文本块落入到这个矩形中
42
52
  for text_block in text_blocks:
43
53
  box = text_block['bbox']
44
54
  box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
45
55
  if _is_in(box_int, max_rect_int):
46
56
  return True
47
-
57
+
48
58
  return False
49
59
 
50
60
 
51
61
  def __is_table_overlap_text_block(text_blocks, table_bbox):
52
- """
53
- 检查table_bbox是否覆盖了text_blocks里的文本块
54
- TODO
55
- """
62
+ """检查table_bbox是否覆盖了text_blocks里的文本块 TODO."""
56
63
  for text_block in text_blocks:
57
64
  box = text_block['bbox']
58
65
  if _is_in_or_part_overlap(table_bbox, box):
@@ -60,15 +67,12 @@ def __is_table_overlap_text_block(text_blocks, table_bbox):
60
67
  return False
61
68
 
62
69
 
63
- def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
64
- """
65
- return:(True|False, err_msg)
66
- True, 如果pdf符合要求
67
- False, 如果pdf不符合要求
68
-
69
- """
70
+ def pdf_filter(page: fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
71
+ """return:(True|False, err_msg) True, 如果pdf符合要求 False, 如果pdf不符合要求."""
70
72
  if __is_contain_color_background_rect(page, text_blocks, image_bboxes):
71
- return False, {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
73
+ return False, {
74
+ '_need_drop': True,
75
+ '_drop_reason': DropReason.COLOR_BACKGROUND_TEXT_BOX,
76
+ }
72
77
 
73
-
74
- return True, None
78
+ return True, None
@@ -1,8 +1,9 @@
1
- from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
2
- from magic_pdf.libs.drop_reason import DropReason
1
+ from magic_pdf.config.drop_reason import DropReason
2
+ from magic_pdf.libs.boxbase import _is_in, _is_part_overlap
3
+
3
4
 
4
5
  def _remove_overlap_between_bbox(bbox1, bbox2):
5
- if _is_part_overlap(bbox1, bbox2):
6
+ if _is_part_overlap(bbox1, bbox2):
6
7
  ix0, iy0, ix1, iy1 = bbox1
7
8
  x0, y0, x1, y1 = bbox2
8
9
 
@@ -22,10 +23,10 @@ def _remove_overlap_between_bbox(bbox1, bbox2):
22
23
  if y1 >= iy1:
23
24
  mid = (y0 + iy1) // 2
24
25
  y0 = max(mid + 0.25, y0)
25
- iy1 = min(iy1, mid-0.25)
26
+ iy1 = min(iy1, mid - 0.25)
26
27
  else:
27
28
  mid = (iy0 + y1) // 2
28
- y1 = min(y1, mid-0.25)
29
+ y1 = min(y1, mid - 0.25)
29
30
  iy0 = max(mid + 0.25, iy0)
30
31
 
31
32
  if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0:
@@ -34,8 +35,8 @@ def _remove_overlap_between_bbox(bbox1, bbox2):
34
35
  return bbox1, bbox2, None
35
36
  else:
36
37
  return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
37
- else:
38
- return bbox1, bbox2, None
38
+ else:
39
+ return bbox1, bbox2, None
39
40
 
40
41
 
41
42
  def _remove_overlap_between_bboxes(arr):
@@ -47,7 +48,7 @@ def _remove_overlap_between_bboxes(arr):
47
48
  for j in range(N):
48
49
  if i == j:
49
50
  continue
50
- if _is_in(arr[i]["bbox"], arr[j]["bbox"]):
51
+ if _is_in(arr[i]['bbox'], arr[j]['bbox']):
51
52
  keeps[i] = False
52
53
 
53
54
  for idx, v in enumerate(arr):
@@ -56,13 +57,15 @@ def _remove_overlap_between_bboxes(arr):
56
57
  for i in range(N):
57
58
  if res[i] is None:
58
59
  continue
59
-
60
- bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"])
60
+
61
+ bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
62
+ v['bbox'], res[i]['bbox']
63
+ )
61
64
  if drop_reason is None:
62
- v["bbox"] = bbox1
63
- res[i]["bbox"] = bbox2
65
+ v['bbox'] = bbox1
66
+ res[i]['bbox'] = bbox2
64
67
  else:
65
- if v["score"] > res[i]["score"]:
68
+ if v['score'] > res[i]['score']:
66
69
  keeps[i] = False
67
70
  res[i] = None
68
71
  else:
@@ -74,25 +77,24 @@ def _remove_overlap_between_bboxes(arr):
74
77
 
75
78
 
76
79
  def remove_overlap_between_bbox_for_span(spans):
77
- arr = [{"bbox": span["bbox"], "score": span.get("score", 0.1)} for span in spans ]
80
+ arr = [{'bbox': span['bbox'], 'score': span.get('score', 0.1)} for span in spans]
78
81
  res, drop_reasons = _remove_overlap_between_bboxes(arr)
79
82
  ret = []
80
83
  for i in range(len(res)):
81
84
  if res[i] is None:
82
85
  continue
83
- spans[i]["bbox"] = res[i]["bbox"]
86
+ spans[i]['bbox'] = res[i]['bbox']
84
87
  ret.append(spans[i])
85
88
  return ret, drop_reasons
86
89
 
87
90
 
88
91
  def remove_overlap_between_bbox_for_block(all_bboxes):
89
- arr = [{"bbox": bbox[:4], "score": bbox[-1]} for bbox in all_bboxes ]
92
+ arr = [{'bbox': bbox[:4], 'score': bbox[-1]} for bbox in all_bboxes]
90
93
  res, drop_reasons = _remove_overlap_between_bboxes(arr)
91
94
  ret = []
92
95
  for i in range(len(res)):
93
96
  if res[i] is None:
94
97
  continue
95
- all_bboxes[i][:4] = res[i]["bbox"]
98
+ all_bboxes[i][:4] = res[i]['bbox']
96
99
  ret.append(all_bboxes[i])
97
100
  return ret, drop_reasons
98
-
@@ -1,7 +1,8 @@
1
- from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
2
1
  from loguru import logger
3
2
 
4
- from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
3
+ from magic_pdf.config.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
4
+ from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
5
+ calculate_overlap_area_2_minbox_area_ratio)
5
6
 
6
7
 
7
8
  def __area(box):
@@ -9,8 +10,7 @@ def __area(box):
9
10
 
10
11
 
11
12
  def rectangle_position_determination(rect, p_width):
12
- """
13
- 判断矩形是否在页面中轴线附近。
13
+ """判断矩形是否在页面中轴线附近。
14
14
 
15
15
  Args:
16
16
  rect (list): 矩形坐标,格式为[x1, y1, x2, y2]。
@@ -34,9 +34,10 @@ def rectangle_position_determination(rect, p_width):
34
34
  else:
35
35
  return False
36
36
 
37
+
37
38
  def remove_colored_strip_textblock(remain_text_blocks, page):
38
- """
39
- 根据页面中特定颜色和大小过滤文本块,将符合条件的文本块从remain_text_blocks中移除,并返回移除的文本块列表colored_strip_textblock
39
+ """根据页面中特定颜色和大小过滤文本块,将符合条件的文本块从remain_text_blocks中移除,并返回移除的文本块列表colored_str
40
+ ip_textblock
40
41
 
41
42
  Args:
42
43
  remain_text_blocks (list): 剩余文本块列表。
@@ -51,22 +52,44 @@ def remove_colored_strip_textblock(remain_text_blocks, page):
51
52
  blocks = page.get_cdrawings()
52
53
  colored_strip_bg_rect = []
53
54
  for block in blocks:
54
- is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0) # 过滤掉透明的
55
+ is_filled = (
56
+ 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0)
57
+ ) # 过滤掉透明的
55
58
  rect = block['rect']
56
59
  area_is_large_enough = __area(rect) > 100 # 过滤掉特别小的矩形
57
- rectangle_position_determination_result = rectangle_position_determination(rect, p_width)
58
- in_upper_half_page = rect[3] < p_height * 0.3 # 找到位于页面上半部分的矩形,下边界小于页面高度的30%
59
- aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4 # 找到长宽比超过4的矩形
60
+ rectangle_position_determination_result = rectangle_position_determination(
61
+ rect, p_width
62
+ )
63
+ in_upper_half_page = (
64
+ rect[3] < p_height * 0.3
65
+ ) # 找到位于页面上半部分的矩形,下边界小于页面高度的30%
66
+ aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (
67
+ rect[3] - rect[1]
68
+ ) * 4 # 找到长宽比超过4的矩形
60
69
 
61
- if is_filled and area_is_large_enough and rectangle_position_determination_result and in_upper_half_page and aspect_ratio_exceeds_4:
70
+ if (
71
+ is_filled
72
+ and area_is_large_enough
73
+ and rectangle_position_determination_result
74
+ and in_upper_half_page
75
+ and aspect_ratio_exceeds_4
76
+ ):
62
77
  colored_strip_bg_rect.append(rect)
63
78
 
64
79
  if len(colored_strip_bg_rect) > 0:
65
80
  for colored_strip_block_bbox in colored_strip_bg_rect:
66
81
  for text_block in remain_text_blocks:
67
82
  text_bbox = text_block['bbox']
68
- if _is_in(text_bbox, colored_strip_block_bbox) or (_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox) and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6):
69
- logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
83
+ if _is_in(text_bbox, colored_strip_block_bbox) or (
84
+ _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
85
+ and calculate_overlap_area_2_minbox_area_ratio(
86
+ text_bbox, colored_strip_block_bbox
87
+ )
88
+ > 0.6
89
+ ):
90
+ logger.info(
91
+ f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}'
92
+ )
70
93
  text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
71
94
  colored_strip_textblocks.append(text_block)
72
95
 
@@ -76,4 +99,3 @@ def remove_colored_strip_textblock(remain_text_blocks, page):
76
99
  remain_text_blocks.remove(colored_strip_textblock)
77
100
 
78
101
  return remain_text_blocks, colored_strip_textblocks
79
-
@@ -1,15 +1,12 @@
1
1
  import re
2
2
 
3
+ from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
3
4
  from magic_pdf.libs.boxbase import _is_in_or_part_overlap
4
- from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
5
5
 
6
6
 
7
7
  def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
8
8
  page_no_bboxs, page_w, page_h):
9
- """
10
- 删除页眉页脚,页码
11
- 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中
12
- """
9
+ """删除页眉页脚,页码 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中."""
13
10
  header = []
14
11
  footer = []
15
12
  if len(header) == 0: