magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +12 -12
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +6 -6
- magic_pdf/libs/draw_bbox.py +13 -6
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +142 -351
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
- magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
- magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
- magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
- magic_pdf/model/sub_modules/model_init.py +149 -0
- magic_pdf/model/sub_modules/model_utils.py +51 -0
- magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
- magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
- magic_pdf/model/sub_modules/table/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
- magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
- magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
- magic_pdf/model/sub_modules/table/table_utils.py +11 -0
- magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
- magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +121 -66
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +253 -50
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/resources/model_config/model_configs.yaml +2 -1
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +70 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
- magic_pdf-0.10.0.dist-info/RECORD +198 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -53
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/model/pek_sub_modules/post_process.py +0 -36
- magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
- magic_pdf/para/para_pipeline.py +0 -297
- magic_pdf-0.9.2.dist-info/RECORD +0 -178
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,21 @@
|
|
1
1
|
import math
|
2
|
+
import re
|
2
3
|
|
4
|
+
from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
|
5
|
+
VERTICAL_TEXT)
|
3
6
|
from magic_pdf.libs.boxbase import is_vbox_on_side
|
4
|
-
from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
|
5
7
|
|
6
8
|
|
7
9
|
def detect_non_horizontal_texts(result_dict):
|
8
|
-
"""
|
9
|
-
|
10
|
+
"""This function detects watermarks and vertical margin notes in the
|
11
|
+
document.
|
10
12
|
|
11
13
|
Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
|
12
14
|
If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
|
13
15
|
If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
|
14
16
|
|
15
17
|
Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
|
16
|
-
If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
|
18
|
+
If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page. # noqa: E501
|
17
19
|
If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
|
18
20
|
|
19
21
|
|
@@ -32,13 +34,16 @@ def detect_non_horizontal_texts(result_dict):
|
|
32
34
|
potential_margin_notes = {}
|
33
35
|
|
34
36
|
for page_id, page_content in result_dict.items():
|
35
|
-
if page_id.startswith(
|
37
|
+
if page_id.startswith('page_'):
|
36
38
|
for block_id, block_data in page_content.items():
|
37
|
-
if block_id.startswith(
|
38
|
-
if
|
39
|
-
coordinates_text = (
|
40
|
-
|
41
|
-
|
39
|
+
if block_id.startswith('block_'):
|
40
|
+
if 'dir' in block_data:
|
41
|
+
coordinates_text = (
|
42
|
+
block_data['bbox'],
|
43
|
+
block_data['text'],
|
44
|
+
) # Tuple of coordinates and text
|
45
|
+
|
46
|
+
angle = math.atan2(block_data['dir'][1], block_data['dir'][0])
|
42
47
|
angle = abs(math.degrees(angle))
|
43
48
|
|
44
49
|
if angle > 5 and angle < 85: # Check if direction is watermarks
|
@@ -49,32 +54,40 @@ def detect_non_horizontal_texts(result_dict):
|
|
49
54
|
|
50
55
|
if angle > 85 and angle < 105: # Check if direction is vertical
|
51
56
|
if coordinates_text in potential_margin_notes:
|
52
|
-
potential_margin_notes[coordinates_text] +=
|
57
|
+
potential_margin_notes[coordinates_text] += (
|
58
|
+
1 # Increment count
|
59
|
+
)
|
53
60
|
else:
|
54
|
-
potential_margin_notes[coordinates_text] =
|
61
|
+
potential_margin_notes[coordinates_text] = (
|
62
|
+
1 # Initialize count
|
63
|
+
)
|
55
64
|
|
56
65
|
# Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
|
57
66
|
watermark_threshold = len(result_dict) // 2
|
58
|
-
watermarks = {
|
67
|
+
watermarks = {
|
68
|
+
k: v for k, v in potential_watermarks.items() if v > watermark_threshold
|
69
|
+
}
|
59
70
|
|
60
71
|
# Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
|
61
72
|
margin_note_threshold = len(result_dict) // 2
|
62
|
-
margin_notes = {
|
73
|
+
margin_notes = {
|
74
|
+
k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold
|
75
|
+
}
|
63
76
|
|
64
77
|
# Add watermark information to the result dictionary
|
65
78
|
for page_id, blocks in result_dict.items():
|
66
|
-
if page_id.startswith(
|
79
|
+
if page_id.startswith('page_'):
|
67
80
|
for block_id, block_data in blocks.items():
|
68
|
-
coordinates_text = (block_data[
|
81
|
+
coordinates_text = (block_data['bbox'], block_data['text'])
|
69
82
|
if coordinates_text in watermarks:
|
70
|
-
block_data[
|
83
|
+
block_data['is_watermark'] = 1
|
71
84
|
else:
|
72
|
-
block_data[
|
85
|
+
block_data['is_watermark'] = 0
|
73
86
|
|
74
87
|
if coordinates_text in margin_notes:
|
75
|
-
block_data[
|
88
|
+
block_data['is_vertical_margin_note'] = 1
|
76
89
|
else:
|
77
|
-
block_data[
|
90
|
+
block_data['is_vertical_margin_note'] = 0
|
78
91
|
|
79
92
|
return result_dict
|
80
93
|
|
@@ -83,21 +96,21 @@ def detect_non_horizontal_texts(result_dict):
|
|
83
96
|
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
|
84
97
|
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
|
85
98
|
"""
|
86
|
-
|
99
|
+
|
87
100
|
|
88
101
|
def __is_a_word(sentence):
|
89
102
|
# 如果输入是中文并且长度为1,则返回True
|
90
103
|
if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
|
91
104
|
return True
|
92
105
|
# 判断是否为单个英文单词或字符(包括ASCII标点)
|
93
|
-
elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <=2:
|
106
|
+
elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <= 2:
|
94
107
|
return True
|
95
108
|
else:
|
96
109
|
return False
|
97
110
|
|
98
111
|
|
99
112
|
def __get_text_color(num):
|
100
|
-
"""获取字体的颜色RGB
|
113
|
+
"""获取字体的颜色RGB值."""
|
101
114
|
blue = num & 255
|
102
115
|
green = (num >> 8) & 255
|
103
116
|
red = (num >> 16) & 255
|
@@ -105,84 +118,119 @@ def __get_text_color(num):
|
|
105
118
|
|
106
119
|
|
107
120
|
def __is_empty_side_box(text_block):
|
108
|
-
"""
|
109
|
-
是否是边缘上的空白没有任何内容的block
|
110
|
-
"""
|
121
|
+
"""是否是边缘上的空白没有任何内容的block."""
|
111
122
|
for line in text_block['lines']:
|
112
123
|
for span in line['spans']:
|
113
124
|
font_color = span['color']
|
114
|
-
r,g,b = __get_text_color(font_color)
|
115
|
-
if len(span['text'].strip())>0 and (r,g,b)!=(255,255,255):
|
125
|
+
r, g, b = __get_text_color(font_color)
|
126
|
+
if len(span['text'].strip()) > 0 and (r, g, b) != (255, 255, 255):
|
116
127
|
return False
|
117
|
-
|
128
|
+
|
118
129
|
return True
|
119
130
|
|
120
131
|
|
121
132
|
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
|
122
|
-
"""
|
123
|
-
返回删除了垂直,水印,旋转的textblock
|
124
|
-
删除的内容打上tag返回
|
125
|
-
"""
|
133
|
+
"""返回删除了垂直,水印,旋转的textblock 删除的内容打上tag返回."""
|
126
134
|
removed_text_block = []
|
127
|
-
|
128
|
-
for i, block in enumerate(
|
135
|
+
|
136
|
+
for i, block in enumerate(
|
137
|
+
pymu_text_block
|
138
|
+
): # 格式参考test/assets/papre/pymu_textblocks.json
|
129
139
|
lines = block['lines']
|
130
140
|
block_bbox = block['bbox']
|
131
|
-
if not is_vbox_on_side(
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
141
|
+
if not is_vbox_on_side(
|
142
|
+
block_bbox, page_width, page_height, 0.2
|
143
|
+
): # 保证这些box必须在页面的两边
|
144
|
+
continue
|
145
|
+
|
146
|
+
if (
|
147
|
+
all(
|
148
|
+
[
|
149
|
+
__is_a_word(line['spans'][0]['text'])
|
150
|
+
for line in lines
|
151
|
+
if len(line['spans']) > 0
|
152
|
+
]
|
153
|
+
)
|
154
|
+
and len(lines) > 1
|
155
|
+
and all([len(line['spans']) == 1 for line in lines])
|
156
|
+
):
|
157
|
+
is_box_valign = (
|
158
|
+
(
|
159
|
+
len(
|
160
|
+
set(
|
161
|
+
[
|
162
|
+
int(line['spans'][0]['bbox'][0])
|
163
|
+
for line in lines
|
164
|
+
if len(line['spans']) > 0
|
165
|
+
]
|
166
|
+
)
|
167
|
+
)
|
168
|
+
== 1
|
169
|
+
)
|
170
|
+
and (
|
171
|
+
len(
|
172
|
+
[
|
173
|
+
int(line['spans'][0]['bbox'][0])
|
174
|
+
for line in lines
|
175
|
+
if len(line['spans']) > 0
|
176
|
+
]
|
177
|
+
)
|
178
|
+
> 1
|
179
|
+
)
|
180
|
+
) # 测试bbox在垂直方向是不是x0都相等,也就是在垂直方向排列.同时必须大于等于2个字
|
181
|
+
|
137
182
|
if is_box_valign:
|
138
183
|
block['tag'] = VERTICAL_TEXT
|
139
184
|
removed_text_block.append(block)
|
140
185
|
continue
|
141
|
-
|
186
|
+
|
142
187
|
for line in lines:
|
143
|
-
if line['dir']!=(1,0):
|
188
|
+
if line['dir'] != (1, 0):
|
144
189
|
block['tag'] = ROTATE_TEXT
|
145
|
-
removed_text_block.append(
|
190
|
+
removed_text_block.append(
|
191
|
+
block
|
192
|
+
) # 只要有一个line不是dir=(1,0),就把整个block都删掉
|
146
193
|
break
|
147
|
-
|
194
|
+
|
148
195
|
for block in removed_text_block:
|
149
196
|
pymu_text_block.remove(block)
|
150
|
-
|
197
|
+
|
151
198
|
return pymu_text_block, removed_text_block
|
152
199
|
|
200
|
+
|
153
201
|
def get_side_boundry(rotate_bbox, page_width, page_height):
|
154
|
-
"""
|
155
|
-
根据rotate_bbox,返回页面的左右正文边界
|
156
|
-
"""
|
202
|
+
"""根据rotate_bbox,返回页面的左右正文边界."""
|
157
203
|
left_x = 0
|
158
204
|
right_x = page_width
|
159
205
|
for x in rotate_bbox:
|
160
206
|
box = x['bbox']
|
161
|
-
if box[2]<page_width/2:
|
207
|
+
if box[2] < page_width / 2:
|
162
208
|
left_x = max(left_x, box[2])
|
163
209
|
else:
|
164
210
|
right_x = min(right_x, box[0])
|
165
|
-
|
166
|
-
return left_x+1, right_x-1
|
211
|
+
|
212
|
+
return left_x + 1, right_x - 1
|
167
213
|
|
168
214
|
|
169
215
|
def remove_side_blank_block(pymu_text_block, page_width, page_height):
|
170
|
-
"""
|
171
|
-
删除页面两侧的空白block
|
172
|
-
"""
|
216
|
+
"""删除页面两侧的空白block."""
|
173
217
|
removed_text_block = []
|
174
|
-
|
175
|
-
for i, block in enumerate(
|
218
|
+
|
219
|
+
for i, block in enumerate(
|
220
|
+
pymu_text_block
|
221
|
+
): # 格式参考test/assets/papre/pymu_textblocks.json
|
176
222
|
block_bbox = block['bbox']
|
177
|
-
if not is_vbox_on_side(
|
178
|
-
|
179
|
-
|
223
|
+
if not is_vbox_on_side(
|
224
|
+
block_bbox, page_width, page_height, 0.2
|
225
|
+
): # 保证这些box必须在页面的两边
|
226
|
+
continue
|
227
|
+
|
180
228
|
if __is_empty_side_box(block):
|
181
229
|
block['tag'] = EMPTY_SIDE_BLOCK
|
182
230
|
removed_text_block.append(block)
|
183
231
|
continue
|
184
|
-
|
232
|
+
|
185
233
|
for block in removed_text_block:
|
186
234
|
pymu_text_block.remove(block)
|
187
|
-
|
188
|
-
return pymu_text_block, removed_text_block
|
235
|
+
|
236
|
+
return pymu_text_block, removed_text_block
|
@@ -4,8 +4,9 @@
|
|
4
4
|
2. 然后去掉出现在文字blcok上的图片bbox
|
5
5
|
"""
|
6
6
|
|
7
|
-
from magic_pdf.
|
8
|
-
from magic_pdf.libs.
|
7
|
+
from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
|
8
|
+
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
|
9
|
+
_is_left_overlap)
|
9
10
|
|
10
11
|
|
11
12
|
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
|
@@ -26,14 +27,14 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
|
|
26
27
|
# 去掉位于图片上的文字block
|
27
28
|
for image_box in images:
|
28
29
|
for text_block in text_raw_blocks:
|
29
|
-
text_bbox = text_block[
|
30
|
+
text_bbox = text_block['bbox']
|
30
31
|
if _is_in(text_bbox, image_box):
|
31
32
|
text_block['tag'] = ON_IMAGE_TEXT
|
32
33
|
text_block_removed.append(text_block)
|
33
34
|
# 去掉table上的文字block
|
34
35
|
for table_box in tables:
|
35
36
|
for text_block in text_raw_blocks:
|
36
|
-
text_bbox = text_block[
|
37
|
+
text_bbox = text_block['bbox']
|
37
38
|
if _is_in(text_bbox, table_box):
|
38
39
|
text_block['tag'] = ON_TABLE_TEXT
|
39
40
|
text_block_removed.append(text_block)
|
@@ -77,7 +78,7 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
|
|
77
78
|
# 图片和文字重叠,丢掉图片
|
78
79
|
for image_box in images:
|
79
80
|
for text_block in text_raw_blocks:
|
80
|
-
text_bbox = text_block[
|
81
|
+
text_bbox = text_block['bbox']
|
81
82
|
if _is_in_or_part_overlap(image_box, text_bbox):
|
82
83
|
images_backup.append(image_box)
|
83
84
|
break
|
@@ -122,11 +123,7 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
|
|
122
123
|
|
123
124
|
|
124
125
|
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
|
125
|
-
"""
|
126
|
-
检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
|
127
|
-
因为这种情况大概率发生了公式没有被检测出来。
|
128
|
-
|
129
|
-
"""
|
126
|
+
"""检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。 因为这种情况大概率发生了公式没有被检测出来。"""
|
130
127
|
if len(text_blocks) == 0:
|
131
128
|
return False
|
132
129
|
|
@@ -148,7 +145,7 @@ def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bo
|
|
148
145
|
|
149
146
|
txt_bboxes = []
|
150
147
|
for text_block in text_blocks:
|
151
|
-
bbox = text_block[
|
148
|
+
bbox = text_block['bbox']
|
152
149
|
if bbox[1] >= clip_y0 and bbox[3] <= clip_y1:
|
153
150
|
txt_bboxes.append(bbox)
|
154
151
|
|
@@ -161,11 +158,7 @@ def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bo
|
|
161
158
|
|
162
159
|
|
163
160
|
def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
|
164
|
-
"""
|
165
|
-
检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
|
166
|
-
因为这种情况大概率发生了公式没有被检测出来。
|
167
|
-
|
168
|
-
"""
|
161
|
+
"""检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。 因为这种情况大概率发生了公式没有被检测出来。"""
|
169
162
|
if len(useful_blocks) == 0:
|
170
163
|
return False
|
171
164
|
|
@@ -174,7 +167,7 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
|
|
174
167
|
|
175
168
|
useful_bboxes = []
|
176
169
|
for text_block in useful_blocks:
|
177
|
-
bbox = text_block[
|
170
|
+
bbox = text_block['bbox']
|
178
171
|
if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
|
179
172
|
useful_bboxes.append(bbox)
|
180
173
|
|
magic_pdf/spark/spark_api.py
CHANGED
@@ -1,51 +1,49 @@
|
|
1
1
|
from loguru import logger
|
2
2
|
|
3
|
-
from magic_pdf.
|
3
|
+
from magic_pdf.config.drop_reason import DropReason
|
4
4
|
|
5
5
|
|
6
6
|
def get_data_source(jso: dict):
|
7
|
-
data_source = jso.get(
|
7
|
+
data_source = jso.get('data_source')
|
8
8
|
if data_source is None:
|
9
|
-
data_source = jso.get(
|
9
|
+
data_source = jso.get('file_source')
|
10
10
|
return data_source
|
11
11
|
|
12
12
|
|
13
13
|
def get_data_type(jso: dict):
|
14
|
-
data_type = jso.get(
|
14
|
+
data_type = jso.get('data_type')
|
15
15
|
if data_type is None:
|
16
|
-
data_type = jso.get(
|
16
|
+
data_type = jso.get('file_type')
|
17
17
|
return data_type
|
18
18
|
|
19
19
|
|
20
20
|
def get_bookid(jso: dict):
|
21
|
-
book_id = jso.get(
|
21
|
+
book_id = jso.get('bookid')
|
22
22
|
if book_id is None:
|
23
|
-
book_id = jso.get(
|
23
|
+
book_id = jso.get('original_file_id')
|
24
24
|
return book_id
|
25
25
|
|
26
26
|
|
27
27
|
def exception_handler(jso: dict, e):
|
28
28
|
logger.exception(e)
|
29
|
-
jso[
|
30
|
-
jso[
|
31
|
-
jso[
|
29
|
+
jso['_need_drop'] = True
|
30
|
+
jso['_drop_reason'] = DropReason.Exception
|
31
|
+
jso['_exception'] = f'ERROR: {e}'
|
32
32
|
return jso
|
33
33
|
|
34
34
|
|
35
35
|
def get_bookname(jso: dict):
|
36
36
|
data_source = get_data_source(jso)
|
37
|
-
file_id = jso.get(
|
38
|
-
book_name = f
|
37
|
+
file_id = jso.get('file_id')
|
38
|
+
book_name = f'{data_source}/{file_id}'
|
39
39
|
return book_name
|
40
40
|
|
41
41
|
|
42
42
|
def spark_json_extractor(jso: dict) -> dict:
|
43
43
|
|
44
|
-
"""
|
45
|
-
从json中提取数据,返回一个dict
|
46
|
-
"""
|
44
|
+
"""从json中提取数据,返回一个dict."""
|
47
45
|
|
48
46
|
return {
|
49
|
-
|
50
|
-
|
47
|
+
'_pdf_type': jso['_pdf_type'],
|
48
|
+
'model_list': jso['doc_layout_result'],
|
51
49
|
}
|
magic_pdf/tools/cli.py
CHANGED
@@ -5,9 +5,8 @@ import click
|
|
5
5
|
from loguru import logger
|
6
6
|
|
7
7
|
import magic_pdf.model as model_config
|
8
|
+
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
8
9
|
from magic_pdf.libs.version import __version__
|
9
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
10
|
-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
11
10
|
from magic_pdf.tools.common import do_parse, parse_pdf_methods
|
12
11
|
|
13
12
|
|
@@ -86,8 +85,8 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
|
|
86
85
|
os.makedirs(output_dir, exist_ok=True)
|
87
86
|
|
88
87
|
def read_fn(path):
|
89
|
-
disk_rw =
|
90
|
-
return disk_rw.read(os.path.basename(path)
|
88
|
+
disk_rw = FileBasedDataReader(os.path.dirname(path))
|
89
|
+
return disk_rw.read(os.path.basename(path))
|
91
90
|
|
92
91
|
def parse_doc(doc_path: str):
|
93
92
|
try:
|
magic_pdf/tools/cli_dev.py
CHANGED
@@ -5,13 +5,11 @@ from pathlib import Path
|
|
5
5
|
import click
|
6
6
|
|
7
7
|
import magic_pdf.model as model_config
|
8
|
+
from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
|
8
9
|
from magic_pdf.libs.config_reader import get_s3_config
|
9
10
|
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
|
10
11
|
remove_non_official_s3_args)
|
11
12
|
from magic_pdf.libs.version import __version__
|
12
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
13
|
-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
14
|
-
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
|
15
13
|
from magic_pdf.tools.common import do_parse, parse_pdf_methods
|
16
14
|
|
17
15
|
|
@@ -19,15 +17,14 @@ def read_s3_path(s3path):
|
|
19
17
|
bucket, key = parse_s3path(s3path)
|
20
18
|
|
21
19
|
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
|
22
|
-
s3_rw =
|
23
|
-
remove_non_official_s3_args(s3path))
|
20
|
+
s3_rw = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto')
|
24
21
|
may_range_params = parse_s3_range_params(s3path)
|
25
22
|
if may_range_params is None or 2 != len(may_range_params):
|
26
|
-
byte_start, byte_end = 0,
|
23
|
+
byte_start, byte_end = 0, -1
|
27
24
|
else:
|
28
25
|
byte_start, byte_end = int(may_range_params[0]), int(
|
29
26
|
may_range_params[1])
|
30
|
-
return s3_rw.
|
27
|
+
return s3_rw.read_at(
|
31
28
|
remove_non_official_s3_args(s3path),
|
32
29
|
byte_start,
|
33
30
|
byte_end,
|
@@ -129,8 +126,8 @@ def pdf(pdf, json_data, output_dir, method):
|
|
129
126
|
os.makedirs(output_dir, exist_ok=True)
|
130
127
|
|
131
128
|
def read_fn(path):
|
132
|
-
disk_rw =
|
133
|
-
return disk_rw.read(os.path.basename(path)
|
129
|
+
disk_rw = FileBasedDataReader(os.path.dirname(path))
|
130
|
+
return disk_rw.read(os.path.basename(path))
|
134
131
|
|
135
132
|
model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))
|
136
133
|
|