magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,408 @@
1
+
2
+
3
+ from loguru import logger
4
+ import math
5
+
6
def _is_in_or_part_overlap(box1, box2) -> bool:
    """Return True when two bboxes touch, partially overlap, or one contains the other.

    Both boxes are ``(x0, y0, x1, y1)`` sequences. The separation tests are
    strict, so boxes that only share an edge or a corner still count as
    overlapping. Returns False when either box is None.
    """
    if box1 is None or box2 is None:
        return False

    a_x0, a_y0, a_x1, a_y1 = box1
    b_x0, b_y0, b_x1, b_y1 = box2

    # box1 is entirely to the left of, or entirely to the right of, box2.
    apart_in_x = a_x1 < b_x0 or a_x0 > b_x1
    # box1 is entirely above, or entirely below, box2.
    apart_in_y = a_y1 < b_y0 or a_y0 > b_y1

    return not (apart_in_x or apart_in_y)
20
+
21
def _is_in_or_part_overlap_with_area_ratio(box1, box2, area_ratio_threshold=0.6):
    """Check whether box1 overlaps box2 by more than a fraction of box1's area.

    Args:
        box1: ``(x0, y0, x1, y1)`` box whose area is the denominator, or None.
        box2: ``(x0, y0, x1, y1)`` box to intersect with box1, or None.
        area_ratio_threshold: the overlap area divided by box1's area must be
            strictly greater than this value.

    Returns:
        bool: False when either box is None, when the boxes are disjoint, or
        when box1 has zero area (the ratio is undefined and previously raised
        ZeroDivisionError); otherwise whether the ratio exceeds the threshold.
    """
    if box1 is None or box2 is None:
        return False

    x0_1, y0_1, x1_1, y1_1 = box1
    x0_2, y0_2, x1_2, y1_2 = box2

    # Disjoint boxes can never satisfy the ratio; this is the same separation
    # test that _is_in_or_part_overlap applies.
    if x1_1 < x0_2 or x0_1 > x1_2 or y1_1 < y0_2 or y0_1 > y1_2:
        return False

    box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
    if box1_area == 0:
        # A degenerate (zero-width or zero-height) box has no area to cover.
        return False

    overlap_width = min(x1_1, x1_2) - max(x0_1, x0_2)
    overlap_height = min(y1_1, y1_2) - max(y0_1, y0_2)
    overlap_area = overlap_width * overlap_height

    return overlap_area / box1_area > area_ratio_threshold
46
+
47
+
48
def _is_in(box1, box2) -> bool:
    """Return True when box1 lies completely inside box2 (shared edges allowed).

    Both boxes are ``(x0, y0, x1, y1)`` sequences; None is not accepted.
    """
    inner_x0, inner_y0, inner_x1, inner_y1 = box1
    outer_x0, outer_y0, outer_x1, outer_y1 = box2

    fits_horizontally = outer_x0 <= inner_x0 and inner_x1 <= outer_x1
    fits_vertically = outer_y0 <= inner_y0 and inner_y1 <= outer_y1
    return fits_horizontally and fits_vertically
59
+
60
def _is_part_overlap(box1, box2) -> bool:
    """Return True when the boxes overlap and box1 is not fully inside box2.

    Only containment of box1 within box2 is excluded; box2 lying inside box1
    still counts as a partial overlap. Returns False when either box is None.
    """
    if box1 is None or box2 is None:
        return False

    if _is_in(box1, box2):
        return False
    return _is_in_or_part_overlap(box1, box2)
68
+
69
def _left_intersect(left_box, right_box):
    """Check whether right_box's left edge cuts through left_box.

    True when right_box's left edge x lies strictly between left_box's left
    and right edges, and right_box's top or bottom y falls within left_box's
    vertical range (inclusive). Returns False when either box is None.
    """
    if left_box is None or right_box is None:
        return False

    l_x0, l_y0, l_x1, l_y1 = left_box
    r_x0, r_y0, _, r_y1 = right_box

    if not (l_x0 < r_x0 < l_x1):
        return False
    return l_y0 <= r_y0 <= l_y1 or l_y0 <= r_y1 <= l_y1
78
+
79
def _right_intersect(left_box, right_box):
    """Check whether right_box's right edge cuts through left_box.

    True when right_box's right edge x lies strictly between left_box's left
    and right edges, and right_box's top or bottom y falls within left_box's
    vertical range (inclusive). Returns False when either box is None.
    """
    if left_box is None or right_box is None:
        return False

    l_x0, l_y0, l_x1, l_y1 = left_box
    _, r_y0, r_x1, r_y1 = right_box

    if not (l_x0 < r_x1 < l_x1):
        return False
    return l_y0 <= r_y0 <= l_y1 or l_y0 <= r_y1 <= l_y1
90
+
91
+
92
def _is_vertical_full_overlap(box1, box2, x_torlence=2):
    """Check that one box spans the other horizontally while they overlap in y.

    In the x direction either box1 must cover box2 or box2 must cover box1,
    each edge being allowed to miss by up to ``x_torlence``; a merely partial
    horizontal overlap is rejected. In the y direction the two ranges must
    touch or overlap.
    """
    a_x0, a_y0, a_x1, a_y1 = box1
    b_x0, b_y0, b_x1, b_y1 = box2

    a_spans_b = a_x0 - x_torlence <= b_x0 and a_x1 + x_torlence >= b_x1
    b_spans_a = b_x0 - x_torlence <= a_x0 and b_x1 + x_torlence >= a_x1
    if not (a_spans_b or b_spans_a):
        return False

    # The y ranges overlap unless one box ends before the other begins.
    return not (a_y1 < b_y0 or a_y0 > b_y1)
108
+
109
+
110
def _is_bottom_full_overlap(box1, box2, y_tolerance=2):
    """Check that box1's bottom slightly overlaps box2's top.

    The vertical overlap (box1's bottom minus box2's top) must be positive and
    strictly smaller than ``y_tolerance``. Horizontally one box must span the
    other, with a fixed slack of 2 units on each side. Returns False when
    either box is None.
    """
    if box1 is None or box2 is None:
        return False

    a_x0, _, a_x1, a_y1 = box1
    b_x0, b_y0, b_x1, _ = box2
    margin = 2

    def spans(outer_lo, outer_hi, inner_lo, inner_hi):
        # Both ends of the inner range fall inside the widened outer range.
        lo, hi = outer_lo - margin, outer_hi + margin
        return lo <= inner_lo <= hi and lo <= inner_hi <= hi

    vertical_overlap = a_y1 - b_y0
    if not (b_y0 < a_y1 and 0 < vertical_overlap < y_tolerance):
        return False

    return spans(a_x0, a_x1, b_x0, b_x1) or spans(b_x0, b_x1, a_x0, a_x1)
124
+
125
def _is_left_overlap(box1, box2):
    """Check whether box2's left edge falls inside box1 with enough vertical overlap.

    box2's left x must lie within box1's horizontal range (inclusive), and the
    shared height of the two boxes must be at least half the height of either
    box. Which box is higher on the page does not matter. Returns False when
    either box is None.
    """
    if box1 is None or box2 is None:
        return False

    a_x0, a_y0, a_x1, a_y1 = box1
    b_x0, b_y0, b_x1, b_y1 = box2

    if not (a_x0 <= b_x0 <= a_x1):
        return False

    shared_height = max(0, min(a_y1, b_y1) - max(a_y0, b_y0))
    for height in (a_y1 - a_y0, b_y1 - b_y0):
        # A zero-height box contributes a ratio of 0 and can never qualify.
        if height != 0 and 1.0 * shared_height / height >= 0.5:
            return True
    return False
147
+
148
+
149
def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
    """Check whether the y-overlap covers most of the shorter box's height.

    Args:
        bbox1: ``(x0, y0, x1, y1)`` box; only the y values are used.
        bbox2: ``(x0, y0, x1, y1)`` box; only the y values are used.
        overlap_ratio_threshold: the overlap height divided by the smaller of
            the two heights must be strictly greater than this value.

    Returns:
        bool: False when the shorter box has no height (the ratio is undefined
        and previously raised ZeroDivisionError); otherwise whether the ratio
        exceeds the threshold.
    """
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2

    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    min_height = min(y1_1 - y0_1, y1_2 - y0_2)
    if min_height <= 0:
        # Degenerate box: there is no height to measure the overlap against.
        return False

    return (overlap / min_height) > overlap_ratio_threshold
160
+
161
+
162
+
163
def calculate_iou(bbox1, bbox2):
    """Compute the intersection-over-union (IoU) of two bounding boxes.

    Args:
        bbox1 (list[float]): first box as ``[x1, y1, x2, y2]`` where
            ``(x1, y1)`` is the top-left and ``(x2, y2)`` the bottom-right corner.
        bbox2 (list[float]): second box, same format as ``bbox1``.

    Returns:
        float: the IoU in ``[0, 1]``. Disjoint boxes give 0.0, and so do boxes
        whose union has no area (for example two coinciding zero-area boxes,
        which previously raised ZeroDivisionError).
    """
    # Corners of the intersection rectangle.
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])

    # Union = sum of both areas minus the part counted twice.
    union_area = float(bbox1_area + bbox2_area - intersection_area)
    if union_area <= 0:
        return 0.0

    return intersection_area / union_area
195
+
196
+
197
def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
    """Return the overlap area as a fraction of the smaller box's area.

    Both boxes are ``[x0, y0, x1, y1]``. Disjoint boxes give 0.0, and a
    smaller box with zero area gives 0.
    """
    # Corners of the intersection rectangle.
    left = max(bbox1[0], bbox2[0])
    top = max(bbox1[1], bbox2[1])
    right = min(bbox1[2], bbox2[2])
    bottom = min(bbox1[3], bbox2[3])

    if right < left or bottom < top:
        return 0.0

    shared_area = (right - left) * (bottom - top)

    area_1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    area_2 = (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])
    smaller_area = min(area_1, area_2)

    return 0 if smaller_area == 0 else shared_area / smaller_area
217
+
218
def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
    """Return the overlap area of the two boxes as a fraction of bbox1's area.

    Both boxes are ``[x0, y0, x1, y1]``. Disjoint boxes give 0.0, and a bbox1
    with zero area gives 0.
    """
    # Corners of the intersection rectangle.
    left = max(bbox1[0], bbox2[0])
    top = max(bbox1[1], bbox2[1])
    right = min(bbox1[2], bbox2[2])
    bottom = min(bbox1[3], bbox2[3])

    if right < left or bottom < top:
        return 0.0

    shared_area = (right - left) * (bottom - top)
    first_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])

    return 0 if first_area == 0 else shared_area / first_area
238
+
239
+
240
def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
    """Return the smaller of two boxes when they overlap heavily enough.

    The overlap is measured with calculate_overlap_area_2_minbox_area_ratio
    (overlap area divided by the smaller box's area). When that value is
    strictly greater than ``ratio`` the box with the smaller area is returned
    (bbox1 on a tie); otherwise None.
    """
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2

    overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
    if not (overlap_ratio > ratio):
        return None

    first_area = (a_x1 - a_x0) * (a_y1 - a_y0)
    second_area = (b_x1 - b_x0) * (b_y1 - b_y0)
    return bbox1 if first_area <= second_area else bbox2
258
+
259
def get_bbox_in_boundry(bboxes: list, boundry: tuple) -> list:
    """Return the boxes that lie entirely within ``boundry`` (edges inclusive).

    ``boundry`` is ``(x0, y0, x1, y1)``; each box is indexed as
    ``[x0, y0, x1, y1]``. The input order is preserved.
    """
    min_x, min_y, max_x, max_y = boundry

    inside = []
    for candidate in bboxes:
        fits = (
            candidate[0] >= min_x
            and candidate[1] >= min_y
            and candidate[2] <= max_x
            and candidate[3] <= max_y
        )
        if fits:
            inside.append(candidate)
    return inside
263
+
264
+
265
def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
    """Tell whether a bbox sits in the left or right margin band of a page.

    The box qualifies when its right edge is within the leftmost
    ``side_threshold`` fraction of ``width`` or its left edge is within the
    rightmost ``side_threshold`` fraction. ``height`` is accepted but unused.
    """
    box_left, box_right = bbox[0], bbox[2]
    left_band_end = width * side_threshold
    right_band_start = width * (1 - side_threshold)
    return box_right <= left_band_end or box_left >= right_band_start
273
+
274
def find_top_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the closest text block above ``obj_bbox``.

    A block qualifies when its bottom edge is above the object's top edge
    (it may intrude by up to 4 units), it is not contained in ``obj_bbox``,
    and its x range is related to the object's x range: an edge of one lies
    within the other's range widened by 4 units. Among the qualifying blocks
    the one with the largest bottom y is returned, or None if there is none.
    Each block is a dict with a ``'bbox'`` entry.
    """
    margin = 4

    candidates = []
    for block in pymu_blocks:
        bbox = block['bbox']
        is_above = obj_bbox[1] - bbox[3] >= -margin
        if not is_above or _is_in(bbox, obj_bbox):
            continue

        # The two boxes must be related horizontally.
        x_related = (
            obj_bbox[0] - margin <= bbox[0] <= obj_bbox[2] + margin
            or obj_bbox[0] - margin <= bbox[2] <= obj_bbox[2] + margin
            or bbox[0] - margin <= obj_bbox[0] <= bbox[2] + margin
            or bbox[0] - margin <= obj_bbox[2] <= bbox[2] + margin
        )
        if x_related:
            candidates.append(block)

    # Nearest above = the block whose bottom edge is lowest on the page.
    return max(candidates, key=lambda blk: blk['bbox'][3], default=None)
290
+
291
+
292
def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the closest text block below ``obj_bbox``.

    A block qualifies when its top edge is below the object's bottom edge
    (it may intrude by up to 2 units), it is not contained in ``obj_bbox``,
    and its x range is related to the object's x range: an edge of one lies
    within the other's range widened by 2 units. Among the qualifying blocks
    the one with the smallest top y is returned, or None if there is none.
    Each block is a dict with a ``'bbox'`` entry.
    """
    margin = 2

    candidates = []
    for block in pymu_blocks:
        bbox = block['bbox']
        is_below = bbox[1] - obj_bbox[3] >= -margin
        if not is_below or _is_in(bbox, obj_bbox):
            continue

        # The two boxes must be related horizontally.
        x_related = (
            obj_bbox[0] - margin <= bbox[0] <= obj_bbox[2] + margin
            or obj_bbox[0] - margin <= bbox[2] <= obj_bbox[2] + margin
            or bbox[0] - margin <= obj_bbox[0] <= bbox[2] + margin
            or bbox[0] - margin <= obj_bbox[2] <= bbox[2] + margin
        )
        if x_related:
            candidates.append(block)

    # Nearest below = the block whose top edge is highest on the page.
    return min(candidates, key=lambda blk: blk['bbox'][1], default=None)
307
+
308
def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the closest text block to the left of ``obj_bbox``.

    A block qualifies when its right edge is left of the object's left edge
    (it may intrude by up to 2 units), it is not contained in ``obj_bbox``,
    and its y range is related to the object's y range: an edge of one lies
    within the other's range widened by 2 units. Among the qualifying blocks
    the one with the largest right x is returned, or None if there is none.
    Each block is a dict with a ``'bbox'`` entry.
    """
    margin = 2

    candidates = []
    for block in pymu_blocks:
        bbox = block['bbox']
        is_on_left = obj_bbox[0] - bbox[2] >= -margin
        if not is_on_left or _is_in(bbox, obj_bbox):
            continue

        # The two boxes must be related vertically (y direction).
        y_related = (
            obj_bbox[1] - margin <= bbox[1] <= obj_bbox[3] + margin
            or obj_bbox[1] - margin <= bbox[3] <= obj_bbox[3] + margin
            or bbox[1] - margin <= obj_bbox[1] <= bbox[3] + margin
            or bbox[1] - margin <= obj_bbox[3] <= bbox[3] + margin
        )
        if y_related:
            candidates.append(block)

    # Nearest on the left = the block whose right edge is furthest right.
    return max(candidates, key=lambda blk: blk['bbox'][2], default=None)
326
+
327
+
328
def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the closest text block to the right of ``obj_bbox``.

    A block qualifies when its left edge is right of the object's right edge
    (it may intrude by up to 2 units), it is not contained in ``obj_bbox``,
    and its y range is related to the object's y range: an edge of one lies
    within the other's range widened by 2 units. Among the qualifying blocks
    the one with the smallest left x is returned, or None if there is none.
    Each block is a dict with a ``'bbox'`` entry.
    """
    margin = 2

    candidates = []
    for block in pymu_blocks:
        bbox = block['bbox']
        is_on_right = bbox[0] - obj_bbox[2] >= -margin
        if not is_on_right or _is_in(bbox, obj_bbox):
            continue

        # The two boxes must be related vertically (y direction).
        y_related = (
            obj_bbox[1] - margin <= bbox[1] <= obj_bbox[3] + margin
            or obj_bbox[1] - margin <= bbox[3] <= obj_bbox[3] + margin
            or bbox[1] - margin <= obj_bbox[1] <= bbox[3] + margin
            or bbox[1] - margin <= obj_bbox[3] <= bbox[3] + margin
        )
        if y_related:
            candidates.append(block)

    # Nearest on the right = the block whose left edge is furthest left.
    return min(candidates, key=lambda blk: blk['bbox'][0], default=None)
346
+
347
+
348
def bbox_relative_pos(bbox1, bbox2):
    """Report on which sides two rectangles are strictly separated.

    Args:
        bbox1: 4-tuple ``(x1, y1, x1b, y1b)`` holding the top-left and
            bottom-right corners of the first rectangle.
        bbox2: 4-tuple ``(x2, y2, x2b, y2b)`` for the second rectangle.

    Returns:
        A 4-tuple of booleans ``(left, right, bottom, top)``:
        ``left`` is True when bbox2's right edge is left of bbox1's left edge,
        ``right`` when bbox1's right edge is left of bbox2's left edge,
        ``bottom`` when bbox2's bottom edge is above bbox1's top edge, and
        ``top`` when bbox1's bottom edge is above bbox2's top edge.
        All comparisons are strict, so touching rectangles set no flag.
    """
    a_left, a_top, a_right, a_bottom = bbox1
    b_left, b_top, b_right, b_bottom = bbox2

    return (
        b_right < a_left,
        a_right < b_left,
        b_bottom < a_top,
        a_bottom < b_top,
    )
370
+
371
def bbox_distance(bbox1, bbox2):
    """Compute the gap between two rectangles.

    Args:
        bbox1 (tuple): first rectangle as ``(x1, y1, x2, y2)`` where
            ``(x1, y1)`` is the top-left and ``(x2, y2)`` the bottom-right corner.
        bbox2 (tuple): second rectangle, same format.

    Returns:
        float: the corner-to-corner Euclidean distance when the rectangles are
        separated on both axes, the edge-to-edge gap when they are separated
        on one axis only, and 0 when they touch or intersect.
    """
    def corner_gap(dx, dy):
        return math.sqrt(dx ** 2 + dy ** 2)

    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2

    # Same strict separation flags that bbox_relative_pos computes.
    left = x2b < x1
    right = x1b < x2
    bottom = y2b < y1
    top = y1b < y2

    if left:
        if top:
            return corner_gap(x1 - x2b, y1b - y2)
        if bottom:
            return corner_gap(x1 - x2b, y1 - y2b)
        return x1 - x2b

    if right:
        if bottom:
            return corner_gap(x1b - x2, y1 - y2b)
        if top:
            return corner_gap(x1b - x2, y1b - y2)
        return x2 - x1b

    if bottom:
        return y1 - y2b
    if top:
        return y2 - y1b

    # No separation on either axis: the rectangles intersect or touch.
    return 0
@@ -0,0 +1,239 @@
1
+ import os
2
+ import csv
3
+ import json
4
+ import pandas as pd
5
+ from pandas import DataFrame as df
6
+ from matplotlib import pyplot as plt
7
+ from termcolor import cprint
8
+
9
+ """
10
+ Execute this script in the following way:
11
+
12
+ 1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
13
+
14
+ code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
15
+
16
+ 2. Under the directory code-clean, execute the following command:
17
+
18
+ $ python -m libs.calc_span_stats
19
+
20
+ """
21
+
22
+
23
def print_green_on_red(text):
    """Print ``text`` in bold green on a red background, followed by an empty line."""
    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
25
+
26
+
27
def print_green(text):
    """Print a blank line, then ``text`` in bold green followed by an empty line."""
    print()
    cprint(text, color="green", attrs=["bold"], end="\n\n")
30
+
31
+
32
def print_red(text):
    """Print a blank line, then ``text`` in bold red followed by an empty line."""
    print()
    cprint(text, color="red", attrs=["bold"], end="\n\n")
35
+
36
+
37
def safe_get(dict_obj, key, default):
    """Look up ``key`` in ``dict_obj``, falling back to ``default``.

    Unlike ``dict.get(key, default)``, the fallback is also used when the key
    is present but maps to None. Other falsy values (0, "", False) are kept.
    """
    found = dict_obj.get(key)
    return default if found is None else found
43
+
44
+
45
class SpanStatsCalc:
    """Calculate statistics of span."""

    # Keys read from a span's "decomposed_flags" dict, paired with the output
    # column each boolean is stored under (as 0/1).
    _FLAG_COLUMNS = (
        ("is_superscript", "span_is_superscript"),
        ("is_italic", "span_is_italic"),
        ("is_serifed", "span_is_serifed"),
        ("is_sans_serifed", "span_is_sans_serifed"),
        ("is_monospaced", "span_is_monospaced"),
        ("is_proportional", "span_is_proportional"),
        ("is_bold", "span_is_bold"),
    )

    # Column order of the DataFrame returned by calc_stats_per_dict.
    _COLUMNS = (
        "span_id",
        "page_id",
        "span_text",
        "span_font_name",
        "span_font_size",
        "span_font_color",
        "span_font_flags",
        "span_is_superscript",
        "span_is_italic",
        "span_is_serifed",
        "span_is_sans_serifed",
        "span_is_monospaced",
        "span_is_proportional",
        "span_is_bold",
    )

    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
        """Draw multiple figures in one figure.

        NOTE: currently a stub. It only creates (or activates) the 20x20
        canvas numbered ``fig_num``; ``span_stats`` and ``save_path`` are not
        used and nothing is drawn or saved.
        """
        plt.figure(fig_num, figsize=(20, 20))

    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
        """Calculate statistics per pdf_dict.

        Walks every entry whose key starts with ``"page_"`` and that has a
        ``"para_blocks"`` list, and emits one row per span found under
        ``para_blocks -> lines -> spans``.

        Args:
            pdf_dict: parsed ``pdf_dic.json`` content, keyed by page id.

        Returns:
            pd.DataFrame: one row per span with the columns in ``_COLUMNS``.
            ``span_id`` is a running 0-based counter across the document, and
            the style flags are stored as 0/1 integers. The columns are
            present even when no span was found, so callers can filter on
            them without first checking for an empty result.
        """
        rows = []
        for page_id, blocks in pdf_dict.items():
            # Skip non-page entries and pages without paragraph data.
            if not page_id.startswith("page_") or "para_blocks" not in blocks:
                continue

            for para_block in blocks["para_blocks"]:
                for line in para_block["lines"]:
                    for span in line["spans"]:
                        row = {
                            # Rows are appended in visiting order, so the
                            # current length is the next running span id.
                            "span_id": len(rows),
                            "page_id": page_id,
                            "span_text": safe_get(span, "text", ""),
                            "span_font_name": safe_get(span, "font", ""),
                            "span_font_size": safe_get(span, "size", 0),
                            "span_font_color": safe_get(span, "color", ""),
                            "span_font_flags": safe_get(span, "flags", 0),
                        }

                        decoded_flags = safe_get(span, "decomposed_flags", {})
                        for flag_key, column in self._FLAG_COLUMNS:
                            row[column] = int(safe_get(decoded_flags, flag_key, False))

                        rows.append(row)

        return pd.DataFrame(rows, columns=list(self._COLUMNS))
115
+
116
+
117
def __find_pdf_dic_files(
    jf_name="pdf_dic.json",
    base_code_name="code-clean",
    tgt_base_dir_name="tmp",
    unittest_dir_name="unittest",
    md_dir_name="md",
    book_names=("scihub",),  # other possible values: "zlib", "arxiv" and so on
):
    """Collect the paths of all ``jf_name`` files under the unittest output tree.

    The project root is taken to be the prefix of this file's directory that
    ends at the first occurrence of ``base_code_name``. For every book name
    the tree ``<root>/<tgt_base_dir_name>/<unittest_dir_name>/<md_dir_name>/<book>``
    is walked recursively.

    Args:
        jf_name: file name to look for.
        base_code_name: directory-name marker identifying the project root.
        tgt_base_dir_name: first path component below the root.
        unittest_dir_name: second path component below the root.
        md_dir_name: third path component below the root.
        book_names: iterable of book sub-directory names to search.

    Returns:
        list[str]: full paths of the matching files; empty when the marker is
        not part of this file's path or the root does not exist.
    """
    curr_dir = os.path.dirname(__file__)

    # NOTE(review): the original scanned the path character by character and
    # is assumed to stop at the first marker match — confirm if the path can
    # contain the marker more than once.
    marker_pos = curr_dir.find(base_code_name)
    if marker_pos < 0:
        return []

    base_code_dir_name = curr_dir[: marker_pos + len(base_code_name)]
    if not os.path.exists(base_code_dir_name):
        return []

    pdf_dict_files = []
    for book_name in book_names:
        search_dir_name = os.path.join(
            base_code_dir_name, tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name
        )
        # os.walk yields nothing for a missing directory, so no extra check.
        for root, _dirs, files in os.walk(search_dir_name):
            if jf_name in files:
                pdf_dict_files.append(os.path.join(root, jf_name))

    return pdf_dict_files
145
+
146
+
147
def combine_span_texts(group_df, span_stats):
    """Render each span of a group together with its neighbouring spans.

    For every row of ``group_df`` the texts of the previous span, the span
    itself and the next span are looked up in ``span_stats`` by index label
    (label - 1 and label + 1), each prefixed with an arrow marker and joined
    with newlines. A missing neighbour contributes an empty text. The
    three-line snippets are separated by a blank line.

    Args:
        group_df: DataFrame with a ``span_text`` column, indexed by span id.
        span_stats: DataFrame of all spans, with the same index and column.

    Returns:
        str: the combined snippets.
    """
    arrow = "→ → → "
    known_ids = span_stats.index

    def text_at(span_id):
        # Neighbours beyond either end of the document have no text.
        return span_stats.at[span_id, "span_text"] if span_id in known_ids else ""

    snippets = []
    for span_id, row in group_df.iterrows():
        context = (text_at(span_id - 1), row["span_text"], text_at(span_id + 1))
        snippets.append("\n".join(arrow + text for text in context))

    return "\n\n".join(snippets)
165
+
166
+
167
+ # pd.set_option("display.max_colwidth", None) # 设置为 None 来显示完整的文本
168
+ pd.set_option("display.max_rows", None) # 设置为 None 来显示更多的行
169
+
170
+
171
def main():
    """Compute superscript-span statistics for every discovered pdf_dic.json.

    For each file found by ``__find_pdf_dic_files`` this writes, next to it:

    * ``span_stats_raw.csv``      - one row per span,
    * ``span_stats_final.csv``    - superscript spans grouped by font name,
      size and color, with a count and the surrounding span texts,
    * ``span_stats_combined.png`` - a 2x2 grid of bar charts of those counts.

    Files without any superscript span only get the raw CSV.
    """
    pdf_dict_files = __find_pdf_dic_files()
    span_stats_calc = SpanStatsCalc()
    group_keys = ["span_font_name", "span_font_size", "span_font_color"]

    for pdf_dict_file in pdf_dict_files:
        print("-" * 100)
        print_green_on_red(f"Processing {pdf_dict_file}")

        with open(pdf_dict_file, "r", encoding="utf-8") as f:
            pdf_dict = json.load(f)

        raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
        raw_df.to_csv(save_path, index=False)

        # A document without spans may yield a frame that lacks the column
        # altogether; treat that the same as "no superscript span".
        if "span_is_superscript" not in raw_df.columns:
            print("No superscript span found!")
            continue

        filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
        if filtered_df.empty:
            print("No superscript span found!")
            continue

        filtered_grouped_df = filtered_df.groupby(group_keys)

        combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore

        final_df = filtered_grouped_df.size().reset_index(name="count")
        final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)

        print(final_df)

        final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))

        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
        # UTF-8 with BOM, and every field wrapped in double quotes.
        final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

        # 2x2 chart layout: by font name, by size, by color, and combined.
        fig, axs = plt.subplots(2, 2, figsize=(15, 10))
        try:
            final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")
            final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")
            final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")

            grouped = final_df.groupby(group_keys)
            grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")

            fig.tight_layout()

            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
            fig.savefig(save_path)
        finally:
            # Close the figure instead of only clearing it; otherwise every
            # processed file leaves an open figure behind.
            plt.close(fig)
237
+
238
+ if __name__ == "__main__":
239
+ main()