magic-pdf 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/__init__.py +0 -0
- magic_pdf/cli/__init__.py +0 -0
- magic_pdf/cli/magicpdf.py +294 -0
- magic_pdf/dict2md/__init__.py +0 -0
- magic_pdf/dict2md/mkcontent.py +397 -0
- magic_pdf/dict2md/ocr_mkcontent.py +356 -0
- magic_pdf/filter/__init__.py +0 -0
- magic_pdf/filter/pdf_classify_by_type.py +381 -0
- magic_pdf/filter/pdf_meta_scan.py +368 -0
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +681 -0
- magic_pdf/layout/layout_det_utils.py +182 -0
- magic_pdf/layout/layout_sort.py +732 -0
- magic_pdf/layout/layout_spiler_recog.py +101 -0
- magic_pdf/layout/mcol_sort.py +336 -0
- magic_pdf/libs/Constants.py +11 -0
- magic_pdf/libs/MakeContentConfig.py +10 -0
- magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
- magic_pdf/libs/__init__.py +0 -0
- magic_pdf/libs/boxbase.py +408 -0
- magic_pdf/libs/calc_span_stats.py +239 -0
- magic_pdf/libs/commons.py +204 -0
- magic_pdf/libs/config_reader.py +63 -0
- magic_pdf/libs/convert_utils.py +5 -0
- magic_pdf/libs/coordinate_transform.py +9 -0
- magic_pdf/libs/detect_language_from_model.py +21 -0
- magic_pdf/libs/draw_bbox.py +227 -0
- magic_pdf/libs/drop_reason.py +27 -0
- magic_pdf/libs/drop_tag.py +19 -0
- magic_pdf/libs/hash_utils.py +15 -0
- magic_pdf/libs/json_compressor.py +27 -0
- magic_pdf/libs/language.py +31 -0
- magic_pdf/libs/markdown_utils.py +31 -0
- magic_pdf/libs/math.py +9 -0
- magic_pdf/libs/nlp_utils.py +203 -0
- magic_pdf/libs/ocr_content_type.py +21 -0
- magic_pdf/libs/path_utils.py +23 -0
- magic_pdf/libs/pdf_image_tools.py +33 -0
- magic_pdf/libs/safe_filename.py +11 -0
- magic_pdf/libs/textbase.py +33 -0
- magic_pdf/libs/version.py +1 -0
- magic_pdf/libs/vis_utils.py +308 -0
- magic_pdf/model/__init__.py +0 -0
- magic_pdf/model/doc_analyze_by_360layout.py +8 -0
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
- magic_pdf/model/magic_model.py +632 -0
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/para/block_continuation_processor.py +562 -0
- magic_pdf/para/block_termination_processor.py +480 -0
- magic_pdf/para/commons.py +222 -0
- magic_pdf/para/denoise.py +246 -0
- magic_pdf/para/draw.py +121 -0
- magic_pdf/para/exceptions.py +198 -0
- magic_pdf/para/layout_match_processor.py +40 -0
- magic_pdf/para/para_pipeline.py +297 -0
- magic_pdf/para/para_split.py +644 -0
- magic_pdf/para/para_split_v2.py +772 -0
- magic_pdf/para/raw_processor.py +207 -0
- magic_pdf/para/stats.py +268 -0
- magic_pdf/para/title_processor.py +1014 -0
- magic_pdf/pdf_parse_by_ocr.py +219 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
- magic_pdf/pdf_parse_by_txt.py +410 -0
- magic_pdf/pdf_parse_by_txt_v2.py +56 -0
- magic_pdf/pdf_parse_for_train.py +685 -0
- magic_pdf/pdf_parse_union_core.py +241 -0
- magic_pdf/pipe/AbsPipe.py +112 -0
- magic_pdf/pipe/OCRPipe.py +28 -0
- magic_pdf/pipe/TXTPipe.py +29 -0
- magic_pdf/pipe/UNIPipe.py +83 -0
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +3472 -0
- magic_pdf/post_proc/pdf_post_filter.py +67 -0
- magic_pdf/post_proc/remove_footnote.py +153 -0
- magic_pdf/pre_proc/__init__.py +0 -0
- magic_pdf/pre_proc/citationmarker_remove.py +157 -0
- magic_pdf/pre_proc/construct_page_dict.py +72 -0
- magic_pdf/pre_proc/cut_image.py +71 -0
- magic_pdf/pre_proc/detect_equation.py +134 -0
- magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
- magic_pdf/pre_proc/detect_footnote.py +170 -0
- magic_pdf/pre_proc/detect_header.py +64 -0
- magic_pdf/pre_proc/detect_images.py +647 -0
- magic_pdf/pre_proc/detect_page_number.py +64 -0
- magic_pdf/pre_proc/detect_tables.py +62 -0
- magic_pdf/pre_proc/equations_replace.py +559 -0
- magic_pdf/pre_proc/fix_image.py +244 -0
- magic_pdf/pre_proc/fix_table.py +270 -0
- magic_pdf/pre_proc/main_text_font.py +23 -0
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
- magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
- magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
- magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
- magic_pdf/pre_proc/remove_footer_header.py +117 -0
- magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
- magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
- magic_pdf/pre_proc/solve_line_alien.py +29 -0
- magic_pdf/pre_proc/statistics.py +12 -0
- magic_pdf/rw/AbsReaderWriter.py +34 -0
- magic_pdf/rw/DiskReaderWriter.py +66 -0
- magic_pdf/rw/S3ReaderWriter.py +107 -0
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/spark/__init__.py +0 -0
- magic_pdf/spark/spark_api.py +51 -0
- magic_pdf/train_utils/__init__.py +0 -0
- magic_pdf/train_utils/convert_to_train_format.py +65 -0
- magic_pdf/train_utils/extract_caption.py +59 -0
- magic_pdf/train_utils/remove_footer_header.py +159 -0
- magic_pdf/train_utils/vis_utils.py +327 -0
- magic_pdf/user_api.py +136 -0
- magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
- magic_pdf-0.5.4.dist-info/METADATA +24 -0
- magic_pdf-0.5.4.dist-info/RECORD +121 -0
- magic_pdf-0.5.4.dist-info/WHEEL +5 -0
- magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,188 @@
|
|
1
|
+
import math
|
2
|
+
|
3
|
+
from magic_pdf.libs.boxbase import is_vbox_on_side
|
4
|
+
from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
|
5
|
+
|
6
|
+
|
7
|
+
def detect_non_horizontal_texts(result_dict):
    """
    Detect watermarks and vertical margin notes in the document.

    Watermarks: blocks with identical (bbox, text) recurring across many
    pages whose text direction is tilted (angle strictly between 5 and 85
    degrees). Headers/footers change from page to page, so exact repetition
    plus a non-horizontal direction strongly indicates a watermark.

    Vertical margin notes: same repetition criterion, but with a
    near-vertical direction (angle strictly between 85 and 105 degrees);
    these typically appear on the left/right page margins.

    Parameters
    ----------
    result_dict : dict
        The result dictionary. Page entries are keyed ``page_*``; block
        entries inside a page are keyed ``block_*``.

    Returns
    -------
    result_dict : dict
        The updated result dictionary: each block gains the integer flags
        ``is_watermark`` and ``is_vertical_margin_note`` (1 or 0).
    """
    potential_watermarks = {}    # (bbox, text) -> occurrence count
    potential_margin_notes = {}  # (bbox, text) -> occurrence count

    for page_id, page_content in result_dict.items():
        if page_id.startswith("page_"):
            for block_id, block_data in page_content.items():
                if block_id.startswith("block_"):
                    if "dir" in block_data:
                        coordinates_text = (block_data["bbox"], block_data["text"])  # tuple of coordinates and text

                        angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
                        angle = abs(math.degrees(angle))

                        if angle > 5 and angle < 85:  # tilted direction -> watermark candidate
                            potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1

                        if angle > 85 and angle < 105:  # near-vertical direction -> margin-note candidate
                            potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1

    # A candidate is confirmed when it recurs on more than half of the pages.
    watermark_threshold = len(result_dict) // 2
    watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}

    margin_note_threshold = len(result_dict) // 2
    margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}

    # Annotate every block with the detection flags.
    for page_id, blocks in result_dict.items():
        if page_id.startswith("page_"):
            for block_id, block_data in blocks.items():
                # Bug fix: mirror the guards of the counting pass above; the
                # original iterated every page entry unguarded, so a
                # non-"block_" key (or a block without bbox/text) raised.
                if not block_id.startswith("block_"):
                    continue
                if "bbox" not in block_data or "text" not in block_data:
                    continue
                coordinates_text = (block_data["bbox"], block_data["text"])
                block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
                block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0

    return result_dict
|
80
|
+
|
81
|
+
|
82
|
+
"""
|
83
|
+
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
|
84
|
+
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
|
85
|
+
"""
|
86
|
+
import re
|
87
|
+
|
88
|
+
def __is_a_word(sentence):
    """Return True when *sentence* is a single CJK character or a short
    (at most two characters) alphanumeric ASCII token."""
    # A single Chinese character counts as a word.
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
        return True
    # A short ASCII alphanumeric token (<= 2 chars) also counts.
    return bool(re.fullmatch(r'[a-zA-Z0-9]+', sentence)) and len(sentence) <= 2
|
97
|
+
|
98
|
+
|
99
|
+
def __get_text_color(num):
    """Decode a packed 24-bit integer colour into an (R, G, B) tuple."""
    return (num >> 16) & 255, (num >> 8) & 255, num & 255
|
105
|
+
|
106
|
+
|
107
|
+
def __is_empty_side_box(text_block):
    """Return True when the side block carries no visible content, i.e.
    every span is blank or rendered in pure white."""
    for line in text_block['lines']:
        for span in line['spans']:
            rgb = __get_text_color(span['color'])
            # Any non-blank, non-white span makes the block non-empty.
            if len(span['text'].strip()) > 0 and rgb != (255, 255, 255):
                return False
    return True
|
119
|
+
|
120
|
+
|
121
|
+
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """
    Return the text blocks with vertical, watermark and rotated side blocks
    removed; removed blocks are tagged and returned in a second list.

    Parameters
    ----------
    pymu_text_block : list
        Raw pymupdf text blocks; format as in
        test/assets/papre/pymu_textblocks.json.
    page_width, page_height :
        Page dimensions, used to decide whether a box sits on a page side.

    Returns
    -------
    (list, list)
        (surviving blocks, removed blocks). Each removed block carries a
        'tag' of VERTICAL_TEXT or ROTATE_TEXT explaining the removal.
    """
    removed_text_block = []

    for i, block in enumerate(pymu_text_block):  # format: see test/assets/papre/pymu_textblocks.json
        lines = block['lines']
        block_bbox = block['bbox']
        if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2):  # only boxes on the page sides are candidates
            continue

        # Candidate vertical text: more than one line, each line has exactly
        # one span, and every span text is a single "word".
        if all([__is_a_word(line['spans'][0]["text"]) for line in lines if len(line['spans'])>0]) and len(lines)>1 and all([len(line['spans'])==1 for line in lines]):
            # Vertically stacked: all line x0 values (int-truncated) are
            # equal, and there are at least two such lines.
            is_box_valign = (len(set([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0]))==1) and (len([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0])>1)

            if is_box_valign:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue

        for line in lines:
            # NOTE(review): assumes pymupdf reports 'dir' as a tuple; a list
            # value would never equal (1, 0) -- confirm upstream format.
            if line['dir']!=(1,0):
                block['tag'] = ROTATE_TEXT
                removed_text_block.append(block)  # a single rotated line drops the whole block
                break

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
|
152
|
+
|
153
|
+
def get_side_boundry(rotate_bbox, page_width, page_height):
    """
    Derive the left/right body-text boundaries of the page from the removed
    rotated boxes: boxes on the left half push the left boundary right,
    boxes on the right half pull the right boundary left.
    """
    left_x, right_x = 0, page_width
    for item in rotate_bbox:
        box = item['bbox']
        if box[2] < page_width / 2:
            # Box lives on the left half: boundary moves to its right edge.
            left_x = max(left_x, box[2])
        else:
            # Box lives on the right half: boundary moves to its left edge.
            right_x = min(right_x, box[0])
    # One-pixel margin on each side.
    return left_x + 1, right_x - 1
|
167
|
+
|
168
|
+
|
169
|
+
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """
    Drop blank blocks sitting on the page sides. Removed blocks are tagged
    EMPTY_SIDE_BLOCK and returned in a second list.
    """
    removed_text_block = []

    for block in pymu_text_block:  # format: see test/assets/papre/pymu_textblocks.json
        # Only boxes hugging the page sides are eligible for removal.
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue
        if __is_empty_side_box(block):
            block['tag'] = EMPTY_SIDE_BLOCK
            removed_text_block.append(block)

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
|
@@ -0,0 +1,191 @@
|
|
1
|
+
"""
|
2
|
+
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
|
3
|
+
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
|
4
|
+
2. 然后去掉出现在文字blcok上的图片bbox
|
5
|
+
"""
|
6
|
+
|
7
|
+
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
|
8
|
+
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
|
9
|
+
|
10
|
+
|
11
|
+
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """
    Arbitrate overlaps between the bboxes extracted from the pdf.

    text_raw_blocks uses the raw pymupdf structure; see
    test/assets/papre/pymu_textblocks.json for a sample.
    Current (coarse) strategy:
    1. drop equations lying on images
    2. drop equations lying on tables
    3. image partially overlapping a text block -> drop the image
    4. images overlapping each other -> back up both and exclude them
    5. drop text blocks fully contained in an image or a table

    Returns
    -------
    tuple
        (images, tables, interline_equations, inline_equations,
        text_raw_blocks, text_block_removed, images_backup,
        text_block_removed_2) -- the last list is currently always empty.
    """
    text_block_removed = []
    images_backup = []

    # Remove text blocks fully contained in an image or a table.
    for region_boxes, tag in ((images, ON_IMAGE_TEXT), (tables, ON_TABLE_TEXT)):
        for region_box in region_boxes:
            for text_block in text_raw_blocks:
                if _is_in(text_block["bbox"], region_box):
                    text_block['tag'] = tag
                    # Bug fix: a block contained in both an image and a
                    # table was previously appended twice, producing
                    # duplicates in the returned removed list.
                    if text_block not in text_block_removed:
                        text_block_removed.append(text_block)

    for text_block in text_block_removed:
        if text_block in text_raw_blocks:
            text_raw_blocks.remove(text_block)

    def __drop_equations_on(region_boxes):
        # Remove any equation whose bbox touches one of region_boxes.
        temp = []
        for region_box in region_boxes:
            for eq1 in interline_equations:
                if _is_in_or_part_overlap(region_box, eq1[:4]):
                    temp.append(eq1)
            for eq2 in inline_equations:
                if _is_in_or_part_overlap(region_box, eq2[:4]):
                    temp.append(eq2)
        for eq in temp:
            if eq in interline_equations:
                interline_equations.remove(eq)
            if eq in inline_equations:
                inline_equations.remove(eq)

    # Step 1: drop equation boxes appearing on images.
    __drop_equations_on(images)
    # Step 2: drop equation boxes appearing on tables.
    __drop_equations_on(tables)

    # Image overlapping text -> drop the image.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in_or_part_overlap(image_box, text_block["bbox"]):
                images_backup.append(image_box)
                break
    for image_box in images_backup:
        images.remove(image_box)

    # Images overlapping each other: temporarily exclude both from layout.
    images_dup_index = []
    for i in range(len(images)):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(images[i], images[j]):
                images_dup_index.append(i)
                images_dup_index.append(j)

    dup_idx = set(images_dup_index)
    for img_id in dup_idx:
        images_backup.append(images[img_id])
        images[img_id] = None

    images = [img for img in images if img is not None]

    # Text blocks overlapping interline equations would disturb the layout
    # computation; a disabled IOU-based merge step used to set them aside
    # here and merge them back after layout. Placeholder kept for the
    # callers that consume the eighth return value.
    text_block_removed_2 = []

    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
|
122
|
+
|
123
|
+
|
124
|
+
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Check whether any two text blocks overlap horizontally inside the body
    region (between header and footer). If they do, this pdf is abandoned,
    because such overlap usually means a formula was not detected.

    Parameters
    ----------
    text_blocks : list
        Blocks carrying a 'bbox' entry (x0, y0, x1, y1).
    header, footer : list
        Presumably lists of header/footer bboxes -- TODO confirm the exact
        element format against the caller.

    Returns
    -------
    bool
        True when a horizontal overlap between clipped blocks is found.
    """
    if len(text_blocks) == 0:
        return False

    page_min_y = 0
    page_max_y = max(yy['bbox'][3] for yy in text_blocks)

    def __max_y(lst: list):
        # NOTE(review): takes the max of item[1] (y0) of the header boxes;
        # verify that item[3] (y1, the bottom edge) was not intended here.
        if len(lst) > 0:
            return max([item[1] for item in lst])
        return page_min_y

    def __min_y(lst: list):
        # Top edge (y1 -> item[3]) of the footer region, or the page bottom.
        if len(lst) > 0:
            return min([item[3] for item in lst])
        return page_max_y

    # Clip window: below the header, above the footer.
    clip_y0 = __max_y(header)
    clip_y1 = __min_y(footer)

    # Keep only blocks entirely inside the clip window.
    txt_bboxes = []
    for text_block in text_blocks:
        bbox = text_block["bbox"]
        if bbox[1] >= clip_y0 and bbox[3] <= clip_y1:
            txt_bboxes.append(bbox)

    for i in range(len(txt_bboxes)):
        for j in range(i + 1, len(txt_bboxes)):
            if _is_left_overlap(txt_bboxes[i], txt_bboxes[j]) or _is_left_overlap(txt_bboxes[j], txt_bboxes[i]):
                return True

    return False
|
161
|
+
|
162
|
+
|
163
|
+
def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
    """
    Check whether any two useful blocks overlap horizontally; such overlap
    usually means a formula was not detected, so the pdf is abandoned.

    NOTE(review): despite the ``-> bool`` annotation, every path except the
    empty-input one returns a 3-tuple; callers should expect
    (found, smaller_bbox, larger_bbox) or (False, None, None).

    Parameters
    ----------
    useful_blocks : list
        Blocks carrying a 'bbox' entry (x0, y0, x1, y1).
    """
    if len(useful_blocks) == 0:
        # NOTE(review): inconsistent with the tuple returns below -- confirm
        # whether callers unpack this value.
        return False

    page_min_y = 0
    page_max_y = max(yy['bbox'][3] for yy in useful_blocks)

    useful_bboxes = []
    for text_block in useful_blocks:
        bbox = text_block["bbox"]
        # NOTE(review): with page_min_y = 0 and page_max_y = max(y1), this
        # filter keeps every block with non-negative y0 -- confirm intent.
        if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
            useful_bboxes.append(bbox)

    for i in range(len(useful_bboxes)):
        for j in range(i + 1, len(useful_bboxes)):
            area_i = (useful_bboxes[i][2] - useful_bboxes[i][0]) * (useful_bboxes[i][3] - useful_bboxes[i][1])
            area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
            if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
                # Report the smaller box first, then the larger one.
                if area_i > area_j:
                    return True, useful_bboxes[j], useful_bboxes[i]
                else:
                    return True, useful_bboxes[i], useful_bboxes[j]

    return False, None, None
|
@@ -0,0 +1,29 @@
|
|
1
|
+
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> "preproc_block" in the json
    """Fix overly large inline gaps: when consecutive line boxes of a block
    sit on the same vertical band, prefix the continuation line's first
    span with a space so the texts do not run together."""
    for page_no in range(len(pdf_info_dict)):
        for block in pdf_info_dict[f'page_{page_no}']['preproc_blocks']:
            prev_y1 = prev_y2 = 0
            for line in block['lines']:
                _, cur_y1, _, cur_y2 = line['bbox']
                # Same vertical band as the previous line (int-truncated).
                if int(cur_y1) == int(prev_y1) and int(cur_y2) == int(prev_y2):
                    first_span = line['spans'][0]
                    first_span['text'] = ' ' + first_span['text']
                _, prev_y1, _, prev_y2 = line['bbox']
    return pdf_info_dict
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
|
29
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
|
3
|
+
|
4
|
+
class AbsReaderWriter(ABC):
    """Abstract reader/writer supporting both binary and text access."""

    MODE_TXT = "text"
    MODE_BIN = "binary"

    def __init__(self, parent_path):
        # Local backend: the parent directory; S3 backend: the prefix under
        # which everything is written.
        self.parent_path = parent_path

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Read *path* (local or s3). Absolute paths are used as-is;
        relative paths are joined onto parent_path."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Write *content* to *path*. Absolute paths are used as-is;
        relative paths are joined onto parent_path."""
        raise NotImplementedError

    @abstractmethod
    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
        """Read a jsonl byte range from *path*. Absolute paths are used
        as-is; relative paths are joined onto parent_path."""
        raise NotImplementedError
|
@@ -0,0 +1,66 @@
|
|
1
|
+
import os
|
2
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
3
|
+
from loguru import logger
|
4
|
+
|
5
|
+
|
6
|
+
MODE_TXT = "text"
|
7
|
+
MODE_BIN = "binary"
|
8
|
+
|
9
|
+
|
10
|
+
class DiskReaderWriter(AbsReaderWriter):
    """Reader/writer backed by the local file system."""

    def __init__(self, parent_path, encoding="utf-8"):
        self.path = parent_path   # parent directory joined onto relative paths
        self.encoding = encoding  # encoding used for text-mode access

    def __resolve(self, path):
        # Absolute paths are used as-is; relative ones join the parent dir.
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=MODE_TXT):
        """Read *path* as text (MODE_TXT) or raw bytes (MODE_BIN)."""
        abspath = self.__resolve(path)
        if not os.path.exists(abspath):
            logger.error(f"文件 {abspath} 不存在")
            raise Exception(f"文件 {abspath} 不存在")
        if mode == MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        elif mode == MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=MODE_TXT):
        """Write *content* to *path*, creating parent directories as needed."""
        abspath = self.__resolve(path)
        directory_path = os.path.dirname(abspath)
        # Bug fix: exist_ok avoids the race between an exists() check and
        # makedirs(); the truthiness guard avoids makedirs("") when the
        # target is a bare file name in the current directory.
        if directory_path:
            os.makedirs(directory_path, exist_ok=True)
        if mode == MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
        """Read the byte range [byte_start, byte_end) of *path* and decode it.

        Bug fix: the byte_start/byte_end/encoding parameters were previously
        ignored and the whole file was returned regardless of the requested
        range. Defaults (0, None, 'utf-8') still return the whole file.
        """
        abspath = self.__resolve(path)
        if not os.path.exists(abspath):
            logger.error(f"文件 {abspath} 不存在")
            raise Exception(f"文件 {abspath} 不存在")
        with open(abspath, "rb") as f:
            f.seek(byte_start)
            data = f.read() if byte_end is None else f.read(byte_end - byte_start)
        return data.decode(encoding)
|
53
|
+
|
54
|
+
|
55
|
+
# Usage example
if __name__ == "__main__":
    file_path = "io/test/example.txt"
    # Bug fix: raw string so the backslashes in the Windows path are not
    # treated as escape sequences (SyntaxWarning on Python 3.12+).
    drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")

    # Write content to the file.
    drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")

    # Read the content back.
    content = drw.read(path=file_path)
    if content:
        logger.info(f"从 {file_path} 读取的内容: {content}")
|
@@ -0,0 +1,107 @@
|
|
1
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
2
|
+
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
|
3
|
+
import boto3
|
4
|
+
from loguru import logger
|
5
|
+
from boto3.s3.transfer import TransferConfig
|
6
|
+
from botocore.config import Config
|
7
|
+
import os
|
8
|
+
|
9
|
+
MODE_TXT = "text"
|
10
|
+
MODE_BIN = "binary"
|
11
|
+
|
12
|
+
|
13
|
+
class S3ReaderWriter(AbsReaderWriter):
    """Reader/writer backed by an S3-compatible object store."""

    def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str = 'auto', parent_path: str = ''):
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path  # s3:// prefix joined onto relative paths

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with standard retry configuration."""
        s3_client = boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(s3={"addressing_style": addressing_style},
                          retries={'max_attempts': 5, 'mode': 'standard'}),
        )
        return s3_client

    def _resolve(self, relative_path):
        # Full s3:// URIs are used as-is; otherwise join onto the prefix.
        if relative_path.startswith("s3://"):
            return relative_path
        return join_path(self.path, relative_path)

    def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
        """Fetch an object; decode to text for MODE_TXT, raw bytes for MODE_BIN."""
        s3_path = self._resolve(s3_relative_path)
        bucket_name, key = parse_bucket_key(s3_path)
        res = self.client.get_object(Bucket=bucket_name, Key=key)
        body = res["Body"].read()
        if mode == MODE_TXT:
            return body.decode(encoding)  # decode bytes to text
        elif mode == MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
        """Upload *content*; text is encoded first, binary is sent as-is."""
        s3_path = self._resolve(s3_relative_path)
        if mode == MODE_TXT:
            body = content.encode(encoding)  # encode text data as bytes
        elif mode == MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_jsonl(self, path: str, byte_start=0, byte_end=None, mode=MODE_TXT, encoding='utf-8'):
        """Range-read an object via an HTTP Range header.

        Bug fix: the end offset was previously tested with truthiness, so
        byte_end=0 silently became an open-ended range; only None now
        means "read to the end of the object".
        """
        s3_path = self._resolve(path)
        bucket_name, key = parse_bucket_key(s3_path)

        if byte_end is not None:
            range_header = f'bytes={byte_start}-{byte_end}'
        else:
            range_header = f'bytes={byte_start}-'
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        body = res["Body"].read()
        if mode == MODE_TXT:
            return body.decode(encoding)  # decode bytes to text
        elif mode == MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")
|
77
|
+
|
78
|
+
|
79
|
+
if __name__ == "__main__":
    # Config the connection info
    ak = ""
    sk = ""
    endpoint_url = ""
    addressing_style = "auto"
    bucket_name = ""
    # Create an S3ReaderWriter object
    s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style, "s3://bucket_name/")

    # Write text data to S3
    text_data = "This is some text data"
    # Bug fix: the parameter is named `content`, not `data`; the original
    # keyword raised TypeError at runtime.
    s3_reader_writer.write(content=text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)

    # Read text data from S3
    text_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
    logger.info(f"Read text data from S3: {text_data_read}")
    # Write binary data to S3
    binary_data = b"This is some binary data"
    # Bug fix: actually upload the binary payload (was text_data).
    s3_reader_writer.write(content=binary_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)

    # Read binary data from S3
    binary_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
    logger.info(f"Read binary data from S3: {binary_data_read}")

    # Range Read text data from S3
    binary_data_read = s3_reader_writer.read_jsonl(path=f"s3://{bucket_name}/ebook/test/test.json",
                                                   byte_start=0, byte_end=10, mode=MODE_BIN)
    logger.info(f"Read binary data from S3: {binary_data_read}")
|
magic_pdf/rw/__init__.py
ADDED
File without changes
|
File without changes
|
@@ -0,0 +1,51 @@
|
|
1
|
+
from loguru import logger
|
2
|
+
|
3
|
+
from magic_pdf.libs.drop_reason import DropReason
|
4
|
+
|
5
|
+
|
6
|
+
def get_data_source(jso: dict):
    """Return 'data_source', falling back to the legacy 'file_source' key."""
    source = jso.get("data_source")
    return source if source is not None else jso.get("file_source")
|
11
|
+
|
12
|
+
|
13
|
+
def get_data_type(jso: dict):
    """Return 'data_type', falling back to the legacy 'file_type' key."""
    dtype = jso.get("data_type")
    return dtype if dtype is not None else jso.get("file_type")
|
18
|
+
|
19
|
+
|
20
|
+
def get_bookid(jso: dict):
    """Return 'bookid', falling back to the legacy 'original_file_id' key."""
    book_id = jso.get("bookid")
    return book_id if book_id is not None else jso.get("original_file_id")
|
25
|
+
|
26
|
+
|
27
|
+
def exception_handler(jso: dict, e):
    """Log exception *e* and mark *jso* as dropped, recording the reason
    and the formatted exception; returns the mutated *jso*."""
    logger.exception(e)
    jso.update({
        "_need_drop": True,
        "_drop_reason": DropReason.Exception,
        "_exception": f"ERROR: {e}",
    })
    return jso
|
33
|
+
|
34
|
+
|
35
|
+
def get_bookname(jso: dict):
    """Compose the book name as '<data_source>/<file_id>'."""
    return f"{get_data_source(jso)}/{jso.get('file_id')}"
|
40
|
+
|
41
|
+
|
42
|
+
def spark_json_extractor(jso: dict) -> dict:
    """Extract the pdf type and the layout model list from a spark json row,
    returning them as a small dict."""
    extracted = {}
    extracted["_pdf_type"] = jso["_pdf_type"]
    extracted["model_list"] = jso["doc_layout_result"]
    return extracted
|
File without changes
|