magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +16 -22
  2. magic_pdf/filter/pdf_meta_scan.py +5 -19
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_check.py +52 -25
  7. magic_pdf/libs/pdf_image_tools.py +2 -1
  8. magic_pdf/libs/version.py +1 -1
  9. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  10. magic_pdf/model/magic_model.py +0 -30
  11. magic_pdf/model/pp_structure_v2.py +23 -3
  12. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
  13. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
  14. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
  15. magic_pdf/para/para_split_v3.py +21 -7
  16. magic_pdf/pdf_parse_union_core_v2.py +134 -146
  17. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  18. magic_pdf/pre_proc/cut_image.py +0 -37
  19. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  20. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  21. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  22. magic_pdf/rw/S3ReaderWriter.py +1 -1
  23. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
  24. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
  25. magic_pdf/dict2md/mkcontent.py +0 -438
  26. magic_pdf/layout/__init__.py +0 -0
  27. magic_pdf/layout/bbox_sort.py +0 -681
  28. magic_pdf/layout/layout_det_utils.py +0 -182
  29. magic_pdf/layout/layout_sort.py +0 -921
  30. magic_pdf/layout/layout_spiler_recog.py +0 -101
  31. magic_pdf/layout/mcol_sort.py +0 -336
  32. magic_pdf/libs/calc_span_stats.py +0 -239
  33. magic_pdf/libs/detect_language_from_model.py +0 -21
  34. magic_pdf/libs/nlp_utils.py +0 -203
  35. magic_pdf/libs/textbase.py +0 -33
  36. magic_pdf/libs/vis_utils.py +0 -308
  37. magic_pdf/para/block_continuation_processor.py +0 -562
  38. magic_pdf/para/block_termination_processor.py +0 -480
  39. magic_pdf/para/commons.py +0 -222
  40. magic_pdf/para/denoise.py +0 -246
  41. magic_pdf/para/draw.py +0 -121
  42. magic_pdf/para/exceptions.py +0 -198
  43. magic_pdf/para/layout_match_processor.py +0 -40
  44. magic_pdf/para/para_split.py +0 -807
  45. magic_pdf/para/para_split_v2.py +0 -959
  46. magic_pdf/para/raw_processor.py +0 -207
  47. magic_pdf/para/stats.py +0 -268
  48. magic_pdf/para/title_processor.py +0 -1014
  49. magic_pdf/pdf_parse_union_core.py +0 -345
  50. magic_pdf/post_proc/__init__.py +0 -0
  51. magic_pdf/post_proc/detect_para.py +0 -3472
  52. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  53. magic_pdf/post_proc/remove_footnote.py +0 -153
  54. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  55. magic_pdf/pre_proc/detect_equation.py +0 -134
  56. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  57. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  58. magic_pdf/pre_proc/detect_footnote.py +0 -170
  59. magic_pdf/pre_proc/detect_header.py +0 -64
  60. magic_pdf/pre_proc/detect_images.py +0 -647
  61. magic_pdf/pre_proc/detect_page_number.py +0 -64
  62. magic_pdf/pre_proc/detect_tables.py +0 -62
  63. magic_pdf/pre_proc/equations_replace.py +0 -550
  64. magic_pdf/pre_proc/fix_image.py +0 -244
  65. magic_pdf/pre_proc/fix_table.py +0 -270
  66. magic_pdf/pre_proc/main_text_font.py +0 -23
  67. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  68. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  69. magic_pdf/pre_proc/post_layout_split.py +0 -0
  70. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  71. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  72. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  73. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  74. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  75. magic_pdf/pre_proc/statistics.py +0 -12
  76. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
  77. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
  78. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
  79. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0
@@ -1,184 +0,0 @@
1
- """
2
- 从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
3
- 1. 首先去掉出现在图片上的bbox,图片包括表格和图片
4
- 2. 然后去掉出现在文字block上的图片bbox
5
- """
6
-
7
- from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
8
- from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
9
- _is_left_overlap)
10
-
11
-
12
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """Resolve overlap conflicts between the bboxes extracted from one PDF page.

    ``text_raw_blocks`` has the structure taken directly from pymupdf (for a
    sample see test/assets/papre/pymu_textblocks.json).  The strategy is
    deliberately crude:

    1. Text blocks for which ``_is_in`` reports that they lie inside an image
       or a table are tagged (ON_IMAGE_TEXT / ON_TABLE_TEXT) and removed.
    2. Equations that ``_is_in_or_part_overlap`` reports as touching an image
       are removed.
    3. Equations touching a table are removed the same way.
    4. Images touching any remaining text block are removed.
    5. Images touching another image are all removed (their bboxes are not
       adjusted for now; they are simply set aside).

    Args:
        images: image bboxes; pruned in place during step 4.
        tables: table bboxes; never modified here.
        interline_equations: equation records whose first four items are the
            bbox; pruned in place.
        inline_equations: same layout as ``interline_equations``; pruned in place.
        text_raw_blocks: dicts with at least a ``'bbox'`` key; pruned in place,
            removed blocks get a ``'tag'`` key.

    Returns:
        An 8-tuple ``(images, tables, interline_equations, inline_equations,
        text_raw_blocks, text_block_removed, images_backup,
        text_block_removed_2)``.  ``images`` is a new list; ``images_backup``
        holds every image that was set aside; ``text_block_removed_2`` is
        always empty because the equation/text merge step is disabled.
    """
    text_block_removed = []
    images_backup = []
    # Identity of every block already collected, so that a block lying inside
    # several images/tables is reported only once.
    removed_ids = set()

    # Step 1a: text blocks sitting on an image.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in(text_block['bbox'], image_box):
                text_block['tag'] = ON_IMAGE_TEXT
                if id(text_block) not in removed_ids:
                    removed_ids.add(id(text_block))
                    text_block_removed.append(text_block)
    # Step 1b: text blocks sitting on a table (the tag of a block that is also
    # on an image is overwritten, as before).
    for table_box in tables:
        for text_block in text_raw_blocks:
            if _is_in(text_block['bbox'], table_box):
                text_block['tag'] = ON_TABLE_TEXT
                if id(text_block) not in removed_ids:
                    removed_ids.add(id(text_block))
                    text_block_removed.append(text_block)

    if removed_ids:
        # Slice assignment keeps the caller's list object while dropping the
        # collected blocks in a single pass.
        text_raw_blocks[:] = [block for block in text_raw_blocks if id(block) not in removed_ids]

    def _drop_equations_touching(region_boxes):
        # Remove, in place, every equation whose bbox touches one of region_boxes.
        hit = []
        for region in region_boxes:
            for eq in interline_equations:
                if _is_in_or_part_overlap(region, eq[:4]):
                    hit.append(eq)
            for eq in inline_equations:
                if _is_in_or_part_overlap(region, eq[:4]):
                    hit.append(eq)
        for eq in hit:
            if eq in interline_equations:
                interline_equations.remove(eq)
            if eq in inline_equations:
                inline_equations.remove(eq)

    # Steps 2 and 3: equations on images, then equations on tables.
    _drop_equations_touching(images)
    _drop_equations_touching(tables)

    # Step 4: an image that touches a text block is set aside.
    images_backup.extend(
        image_box for image_box in images
        if any(_is_in_or_part_overlap(image_box, text_block['bbox']) for text_block in text_raw_blocks)
    )
    for image_box in images_backup:
        images.remove(image_box)

    # Step 5: images touching each other are all excluded from layout computation.
    dup_idx = set()
    for i in range(len(images)):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(images[i], images[j]):
                dup_idx.add(i)
                dup_idx.add(j)

    # Sorted for a deterministic backup order; the caller's list is no longer
    # polluted with None placeholders.
    images_backup.extend(images[idx] for idx in sorted(dup_idx))
    images = [img for idx, img in enumerate(images) if idx not in dup_idx]

    # Merging text blocks that overlap interline equations is currently
    # disabled, so this list is always returned empty.
    text_block_removed_2 = []

    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
123
-
124
-
125
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """Report whether any two text blocks overlap horizontally.

    When this happens the PDF is not processed any further, because it most
    likely means an equation was not detected.

    Only blocks lying between the header limit and the footer limit are
    compared.  Returns True as soon as ``_is_left_overlap`` holds for a pair
    in either direction, otherwise False (also for an empty input).
    """
    if len(text_blocks) == 0:
        return False

    # Fallback bottom limit: the lowest bottom edge among all text blocks.
    lowest_bottom = max(block['bbox'][3] for block in text_blocks)

    # NOTE(review): the header limit uses index 1 and the footer limit index 3
    # of each box -- confirm this is the intended pair of edges.
    top_limit = max([box[1] for box in header]) if len(header) > 0 else 0
    bottom_limit = min([box[3] for box in footer]) if len(footer) > 0 else lowest_bottom

    candidates = [
        block['bbox']
        for block in text_blocks
        if block['bbox'][1] >= top_limit and block['bbox'][3] <= bottom_limit
    ]

    for pos, first in enumerate(candidates):
        for second in candidates[pos + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False
158
-
159
-
160
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """Look for a pair of useful blocks that overlap horizontally.

    When this happens the PDF is not processed any further, because it most
    likely means an equation was not detected.

    Args:
        useful_blocks: dicts with a ``'bbox'`` key holding ``[x0, y0, x1, y1]``.

    Returns:
        A 3-tuple.  ``(True, smaller_bbox, larger_bbox)`` for the first pair
        for which ``_is_left_overlap`` holds in either direction, ordered by
        bbox area (on equal areas the earlier block comes first);
        ``(False, None, None)`` when there is no such pair.  An empty input
        also yields ``(False, None, None)`` so callers can always unpack
        three values.
    """
    if len(useful_blocks) == 0:
        return False, None, None

    page_min_y = 0
    page_max_y = max(block['bbox'][3] for block in useful_blocks)

    useful_bboxes = [
        block['bbox']
        for block in useful_blocks
        if block['bbox'][1] >= page_min_y and block['bbox'][3] <= page_max_y
    ]

    def _area(bbox):
        # Width times height of an [x0, y0, x1, y1] box.
        return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])

    for pos, first in enumerate(useful_bboxes):
        for second in useful_bboxes[pos + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                # Areas are only needed once an overlapping pair is found.
                if _area(first) > _area(second):
                    return True, second, first
                return True, first, second

    return False, None, None
@@ -1,29 +0,0 @@
1
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the JSON
    """Fix overly large gaps between pieces of text on the same row.

    For every block in each page's ``'preproc_blocks'``, a line whose top and
    bottom y coordinates (truncated with ``int``) equal those of the previous
    line in the same block gets a single space prepended to the text of its
    first span.

    Args:
        pdf_info_dict: mapping with one ``'page_<i>'`` entry for every ``i`` in
            ``range(len(pdf_info_dict))``; it is modified in place.

    Returns:
        The same ``pdf_info_dict`` object.
    """
    for page_no in range(len(pdf_info_dict)):
        for block in pdf_info_dict[f'page_{page_no}']['preproc_blocks']:
            # Every block starts by comparing against a (0, 0) row.
            prev_y0, prev_y1 = 0, 0

            for line in block['lines']:
                _, cur_y0, _, cur_y1 = line['bbox']
                same_row = int(cur_y0) == int(prev_y0) and int(cur_y1) == int(prev_y1)
                # A line without spans has no text to prefix; skip it instead
                # of failing with IndexError.
                if same_row and line['spans']:
                    first_span = line['spans'][0]
                    first_span['text'] = ' ' + first_span['text']

                prev_y0, prev_y1 = cur_y0, cur_y1

    return pdf_info_dict
22
-
23
-
24
-
25
-
26
-
27
-
28
-
29
-
@@ -1,12 +0,0 @@
1
-
2
- """
3
- 统计出需要跨页、全局性的数据
4
- - 统计出字号从大到小
5
- - 正文区域占比最高的前5
6
- - 正文平均行间距
7
- - 正文平均字间距
8
- - 正文平均字符宽度
9
- - 正文平均字符高度
10
-
11
- """
12
-