magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +12 -12
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +6 -6
- magic_pdf/libs/draw_bbox.py +13 -6
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +142 -351
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
- magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
- magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
- magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
- magic_pdf/model/sub_modules/model_init.py +149 -0
- magic_pdf/model/sub_modules/model_utils.py +51 -0
- magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
- magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
- magic_pdf/model/sub_modules/table/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
- magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
- magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
- magic_pdf/model/sub_modules/table/table_utils.py +11 -0
- magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
- magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +121 -66
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +253 -50
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/resources/model_config/model_configs.yaml +2 -1
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +70 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
- magic_pdf-0.10.0.dist-info/RECORD +198 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -53
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/model/pek_sub_modules/post_process.py +0 -36
- magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
- magic_pdf/para/para_pipeline.py +0 -297
- magic_pdf-0.9.2.dist-info/RECORD +0 -178
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,49 +1,45 @@
|
|
1
|
-
"""
|
2
|
-
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
|
3
|
-
"""
|
1
|
+
"""对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果."""
|
4
2
|
|
5
|
-
from magic_pdf.libs.commons import fitz
|
6
3
|
import json
|
7
4
|
import os
|
8
5
|
from pathlib import Path
|
6
|
+
|
9
7
|
from loguru import logger
|
10
|
-
|
8
|
+
|
9
|
+
from magic_pdf.config.ocr_content_type import ContentType
|
10
|
+
from magic_pdf.libs.commons import fitz
|
11
11
|
|
12
12
|
TYPE_INLINE_EQUATION = ContentType.InlineEquation
|
13
13
|
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
|
14
14
|
|
15
15
|
|
16
16
|
def combine_chars_to_pymudict(block_dict, char_dict):
|
17
|
-
"""
|
18
|
-
把block级别的pymupdf 结构里加入char结构
|
19
|
-
"""
|
17
|
+
"""把block级别的pymupdf 结构里加入char结构."""
|
20
18
|
# 因为block_dict 被裁剪过,因此先把他和char_dict文字块对齐,才能进行补充
|
21
|
-
char_map = {tuple(item[
|
19
|
+
char_map = {tuple(item['bbox']): item for item in char_dict}
|
22
20
|
|
23
|
-
for i in range(len(block_dict)): #
|
21
|
+
for i in range(len(block_dict)): # block
|
24
22
|
block = block_dict[i]
|
25
|
-
key = block[
|
23
|
+
key = block['bbox']
|
26
24
|
char_dict_item = char_map[tuple(key)]
|
27
|
-
char_dict_map = {tuple(item[
|
28
|
-
for j in range(len(block[
|
29
|
-
lines = block[
|
30
|
-
with_char_lines = char_dict_map[lines[
|
31
|
-
for k in range(len(lines[
|
32
|
-
spans = lines[
|
25
|
+
char_dict_map = {tuple(item['bbox']): item for item in char_dict_item['lines']}
|
26
|
+
for j in range(len(block['lines'])):
|
27
|
+
lines = block['lines'][j]
|
28
|
+
with_char_lines = char_dict_map[lines['bbox']]
|
29
|
+
for k in range(len(lines['spans'])):
|
30
|
+
spans = lines['spans'][k]
|
33
31
|
try:
|
34
|
-
chars = with_char_lines[
|
35
|
-
except Exception
|
36
|
-
logger.error(char_dict[i][
|
32
|
+
chars = with_char_lines['spans'][k]['chars']
|
33
|
+
except Exception:
|
34
|
+
logger.error(char_dict[i]['lines'][j])
|
37
35
|
|
38
|
-
spans[
|
36
|
+
spans['chars'] = chars
|
39
37
|
|
40
38
|
return block_dict
|
41
39
|
|
42
40
|
|
43
41
|
def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
|
44
|
-
"""
|
45
|
-
计算box1和box2的重叠面积占最小面积的box的比例
|
46
|
-
"""
|
42
|
+
"""计算box1和box2的重叠面积占最小面积的box的比例."""
|
47
43
|
# Determine the coordinates of the intersection rectangle
|
48
44
|
x_left = max(bbox1[0], min_bbox[0])
|
49
45
|
y_top = max(bbox1[1], min_bbox[1])
|
@@ -74,13 +70,13 @@ def _is_xin(bbox1, bbox2):
|
|
74
70
|
|
75
71
|
|
76
72
|
def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
|
77
|
-
"""
|
73
|
+
"""消除掉整个块都在行间公式块内部的文本块."""
|
78
74
|
for eq_bbox in interline_bboxes:
|
79
75
|
removed_txt_blk = []
|
80
76
|
for text_blk in text_blocks:
|
81
|
-
text_bbox = text_blk[
|
77
|
+
text_bbox = text_blk['bbox']
|
82
78
|
if (
|
83
|
-
calculate_overlap_area_2_minbox_area_ratio(eq_bbox[
|
79
|
+
calculate_overlap_area_2_minbox_area_ratio(eq_bbox['bbox'], text_bbox)
|
84
80
|
>= 0.7
|
85
81
|
):
|
86
82
|
removed_txt_blk.append(text_blk)
|
@@ -91,9 +87,7 @@ def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
|
|
91
87
|
|
92
88
|
|
93
89
|
def _is_in_or_part_overlap(box1, box2) -> bool:
|
94
|
-
"""
|
95
|
-
两个bbox是否有部分重叠或者包含
|
96
|
-
"""
|
90
|
+
"""两个bbox是否有部分重叠或者包含."""
|
97
91
|
if box1 is None or box2 is None:
|
98
92
|
return False
|
99
93
|
|
@@ -111,62 +105,65 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
|
|
111
105
|
def remove_text_block_overlap_interline_equation_bbox(
|
112
106
|
interline_eq_bboxes, pymu_block_list
|
113
107
|
):
|
114
|
-
|
115
|
-
"""消除掉行行内公式有部分重叠的文本块的内容。
|
116
|
-
同时重新计算消除重叠之后文本块的大小"""
|
108
|
+
"""消除掉行行内公式有部分重叠的文本块的内容。 同时重新计算消除重叠之后文本块的大小."""
|
117
109
|
deleted_block = []
|
118
110
|
for text_block in pymu_block_list:
|
119
111
|
deleted_line = []
|
120
|
-
for line in text_block[
|
112
|
+
for line in text_block['lines']:
|
121
113
|
deleted_span = []
|
122
|
-
for span in line[
|
114
|
+
for span in line['spans']:
|
123
115
|
deleted_chars = []
|
124
|
-
for char in span[
|
116
|
+
for char in span['chars']:
|
125
117
|
if any(
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
118
|
+
[
|
119
|
+
(
|
120
|
+
calculate_overlap_area_2_minbox_area_ratio(
|
121
|
+
eq_bbox['bbox'], char['bbox']
|
122
|
+
)
|
123
|
+
> 0.5
|
124
|
+
)
|
125
|
+
for eq_bbox in interline_eq_bboxes
|
126
|
+
]
|
130
127
|
):
|
131
128
|
deleted_chars.append(char)
|
132
129
|
# 检查span里没有char则删除这个span
|
133
130
|
for char in deleted_chars:
|
134
|
-
span[
|
131
|
+
span['chars'].remove(char)
|
135
132
|
# 重新计算这个span的大小
|
136
|
-
if len(span[
|
133
|
+
if len(span['chars']) == 0: # 删除这个span
|
137
134
|
deleted_span.append(span)
|
138
135
|
else:
|
139
|
-
span[
|
140
|
-
min([b[
|
141
|
-
min([b[
|
142
|
-
max([b[
|
143
|
-
max([b[
|
136
|
+
span['bbox'] = (
|
137
|
+
min([b['bbox'][0] for b in span['chars']]),
|
138
|
+
min([b['bbox'][1] for b in span['chars']]),
|
139
|
+
max([b['bbox'][2] for b in span['chars']]),
|
140
|
+
max([b['bbox'][3] for b in span['chars']]),
|
144
141
|
)
|
145
142
|
|
146
143
|
# 检查这个span
|
147
144
|
for span in deleted_span:
|
148
|
-
line[
|
149
|
-
if len(line[
|
145
|
+
line['spans'].remove(span)
|
146
|
+
if len(line['spans']) == 0: # 删除这个line
|
150
147
|
deleted_line.append(line)
|
151
148
|
else:
|
152
|
-
line[
|
153
|
-
min([b[
|
154
|
-
min([b[
|
155
|
-
max([b[
|
156
|
-
max([b[
|
149
|
+
line['bbox'] = (
|
150
|
+
min([b['bbox'][0] for b in line['spans']]),
|
151
|
+
min([b['bbox'][1] for b in line['spans']]),
|
152
|
+
max([b['bbox'][2] for b in line['spans']]),
|
153
|
+
max([b['bbox'][3] for b in line['spans']]),
|
157
154
|
)
|
158
155
|
|
159
156
|
# 检查这个block是否可以删除
|
160
157
|
for line in deleted_line:
|
161
|
-
text_block[
|
162
|
-
if len(text_block[
|
158
|
+
text_block['lines'].remove(line)
|
159
|
+
if len(text_block['lines']) == 0: # 删除block
|
163
160
|
deleted_block.append(text_block)
|
164
161
|
else:
|
165
|
-
text_block[
|
166
|
-
min([b[
|
167
|
-
min([b[
|
168
|
-
max([b[
|
169
|
-
max([b[
|
162
|
+
text_block['bbox'] = (
|
163
|
+
min([b['bbox'][0] for b in text_block['lines']]),
|
164
|
+
min([b['bbox'][1] for b in text_block['lines']]),
|
165
|
+
max([b['bbox'][2] for b in text_block['lines']]),
|
166
|
+
max([b['bbox'][3] for b in text_block['lines']]),
|
170
167
|
)
|
171
168
|
|
172
169
|
# 检查text block删除
|
@@ -179,33 +176,33 @@ def remove_text_block_overlap_interline_equation_bbox(
|
|
179
176
|
|
180
177
|
|
181
178
|
def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
|
182
|
-
"""在行间公式对应的地方插上一个伪造的block"""
|
179
|
+
"""在行间公式对应的地方插上一个伪造的block."""
|
183
180
|
for eq in interline_eq_bboxes:
|
184
|
-
bbox = eq[
|
185
|
-
latex_content = eq[
|
181
|
+
bbox = eq['bbox']
|
182
|
+
latex_content = eq['latex']
|
186
183
|
text_block = {
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
184
|
+
'number': len(pymu_block_list),
|
185
|
+
'type': 0,
|
186
|
+
'bbox': bbox,
|
187
|
+
'lines': [
|
191
188
|
{
|
192
|
-
|
189
|
+
'spans': [
|
193
190
|
{
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
191
|
+
'size': 9.962599754333496,
|
192
|
+
'type': TYPE_INTERLINE_EQUATION,
|
193
|
+
'flags': 4,
|
194
|
+
'font': TYPE_INTERLINE_EQUATION,
|
195
|
+
'color': 0,
|
196
|
+
'ascender': 0.9409999847412109,
|
197
|
+
'descender': -0.3050000071525574,
|
198
|
+
'latex': latex_content,
|
199
|
+
'origin': [bbox[0], bbox[1]],
|
200
|
+
'bbox': bbox,
|
204
201
|
}
|
205
202
|
],
|
206
|
-
|
207
|
-
|
208
|
-
|
203
|
+
'wmode': 0,
|
204
|
+
'dir': [1.0, 0.0],
|
205
|
+
'bbox': bbox,
|
209
206
|
}
|
210
207
|
],
|
211
208
|
}
|
@@ -250,53 +247,52 @@ def __y_overlap_ratio(box1, box2):
|
|
250
247
|
|
251
248
|
|
252
249
|
def replace_line_v2(eqinfo, line):
|
253
|
-
"""
|
254
|
-
|
255
|
-
最后与这个x0,x1有相交的span0, span1内部进行分割。
|
256
|
-
"""
|
250
|
+
"""扫描这一行所有的和公式框X方向重叠的char,然后计算char的左、右x0, x1,位于这个区间内的span删除掉。
|
251
|
+
最后与这个x0,x1有相交的span0, span1内部进行分割。"""
|
257
252
|
first_overlap_span = -1
|
258
253
|
first_overlap_span_idx = -1
|
259
254
|
last_overlap_span = -1
|
260
255
|
delete_chars = []
|
261
|
-
for i in range(0, len(line[
|
262
|
-
if
|
256
|
+
for i in range(0, len(line['spans'])):
|
257
|
+
if 'chars' not in line['spans'][i]:
|
263
258
|
continue
|
264
259
|
|
265
|
-
if line[
|
260
|
+
if line['spans'][i].get('_type', None) is not None:
|
266
261
|
continue # 忽略,因为已经是插入的伪造span公式了
|
267
262
|
|
268
|
-
for char in line[
|
269
|
-
if __is_x_dir_overlap(eqinfo[
|
270
|
-
line_txt =
|
271
|
-
for span in line[
|
272
|
-
span_txt =
|
273
|
-
for ch in span[
|
274
|
-
span_txt = span_txt + ch[
|
263
|
+
for char in line['spans'][i]['chars']:
|
264
|
+
if __is_x_dir_overlap(eqinfo['bbox'], char['bbox']):
|
265
|
+
line_txt = ''
|
266
|
+
for span in line['spans']:
|
267
|
+
span_txt = '<span>'
|
268
|
+
for ch in span['chars']:
|
269
|
+
span_txt = span_txt + ch['c']
|
275
270
|
|
276
|
-
span_txt = span_txt +
|
271
|
+
span_txt = span_txt + '</span>'
|
277
272
|
|
278
273
|
line_txt = line_txt + span_txt
|
279
274
|
|
280
275
|
if first_overlap_span_idx == -1:
|
281
|
-
first_overlap_span = line[
|
276
|
+
first_overlap_span = line['spans'][i]
|
282
277
|
first_overlap_span_idx = i
|
283
|
-
last_overlap_span = line[
|
278
|
+
last_overlap_span = line['spans'][i]
|
284
279
|
delete_chars.append(char)
|
285
280
|
|
286
281
|
# 第一个和最后一个char要进行检查,到底属于公式多还是属于正常span多
|
287
282
|
if len(delete_chars) > 0:
|
288
|
-
ch0_bbox = delete_chars[0][
|
289
|
-
if x_overlap_ratio(eqinfo[
|
283
|
+
ch0_bbox = delete_chars[0]['bbox']
|
284
|
+
if x_overlap_ratio(eqinfo['bbox'], ch0_bbox) < 0.51:
|
290
285
|
delete_chars.remove(delete_chars[0])
|
291
286
|
if len(delete_chars) > 0:
|
292
|
-
ch0_bbox = delete_chars[-1][
|
293
|
-
if x_overlap_ratio(eqinfo[
|
287
|
+
ch0_bbox = delete_chars[-1]['bbox']
|
288
|
+
if x_overlap_ratio(eqinfo['bbox'], ch0_bbox) < 0.51:
|
294
289
|
delete_chars.remove(delete_chars[-1])
|
295
290
|
|
296
291
|
# 计算x方向上被删除区间内的char的真实x0, x1
|
297
292
|
if len(delete_chars):
|
298
|
-
x0, x1 =
|
299
|
-
[b[
|
293
|
+
x0, x1 = (
|
294
|
+
min([b['bbox'][0] for b in delete_chars]),
|
295
|
+
max([b['bbox'][2] for b in delete_chars]),
|
300
296
|
)
|
301
297
|
else:
|
302
298
|
# logger.debug(f"行内公式替换没有发生,尝试下一行匹配, eqinfo={eqinfo}")
|
@@ -304,101 +300,101 @@ def replace_line_v2(eqinfo, line):
|
|
304
300
|
|
305
301
|
# 删除位于x0, x1这两个中间的span
|
306
302
|
delete_span = []
|
307
|
-
for span in line[
|
308
|
-
span_box = span[
|
303
|
+
for span in line['spans']:
|
304
|
+
span_box = span['bbox']
|
309
305
|
if x0 <= span_box[0] and span_box[2] <= x1:
|
310
306
|
delete_span.append(span)
|
311
307
|
for span in delete_span:
|
312
|
-
line[
|
308
|
+
line['spans'].remove(span)
|
313
309
|
|
314
310
|
equation_span = {
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
311
|
+
'size': 9.962599754333496,
|
312
|
+
'type': TYPE_INLINE_EQUATION,
|
313
|
+
'flags': 4,
|
314
|
+
'font': TYPE_INLINE_EQUATION,
|
315
|
+
'color': 0,
|
316
|
+
'ascender': 0.9409999847412109,
|
317
|
+
'descender': -0.3050000071525574,
|
318
|
+
'latex': '',
|
319
|
+
'origin': [337.1410153102337, 216.0205245153934],
|
320
|
+
'bbox': eqinfo['bbox'],
|
325
321
|
}
|
326
322
|
# equation_span = line['spans'][0].copy()
|
327
|
-
equation_span[
|
328
|
-
equation_span[
|
329
|
-
equation_span[
|
330
|
-
equation_span[
|
331
|
-
equation_span[
|
332
|
-
equation_span[
|
333
|
-
line[
|
323
|
+
equation_span['latex'] = eqinfo['latex']
|
324
|
+
equation_span['bbox'] = [x0, equation_span['bbox'][1], x1, equation_span['bbox'][3]]
|
325
|
+
equation_span['origin'] = [equation_span['bbox'][0], equation_span['bbox'][1]]
|
326
|
+
equation_span['chars'] = delete_chars
|
327
|
+
equation_span['type'] = TYPE_INLINE_EQUATION
|
328
|
+
equation_span['_eq_bbox'] = eqinfo['bbox']
|
329
|
+
line['spans'].insert(first_overlap_span_idx + 1, equation_span) # 放入公式
|
334
330
|
|
335
331
|
# logger.info(f"==>text is 【{line_txt}】, equation is 【{eqinfo['latex_text']}】")
|
336
332
|
|
337
333
|
# 第一个、和最后一个有overlap的span进行分割,然后插入对应的位置
|
338
334
|
first_span_chars = [
|
339
335
|
char
|
340
|
-
for char in first_overlap_span[
|
341
|
-
if (char[
|
336
|
+
for char in first_overlap_span['chars']
|
337
|
+
if (char['bbox'][2] + char['bbox'][0]) / 2 < x0
|
342
338
|
]
|
343
339
|
tail_span_chars = [
|
344
340
|
char
|
345
|
-
for char in last_overlap_span[
|
346
|
-
if (char[
|
341
|
+
for char in last_overlap_span['chars']
|
342
|
+
if (char['bbox'][0] + char['bbox'][2]) / 2 > x1
|
347
343
|
]
|
348
344
|
|
349
345
|
if len(first_span_chars) > 0:
|
350
|
-
first_overlap_span[
|
351
|
-
first_overlap_span[
|
352
|
-
first_overlap_span[
|
353
|
-
first_overlap_span[
|
354
|
-
first_overlap_span[
|
355
|
-
max([chr[
|
356
|
-
first_overlap_span[
|
346
|
+
first_overlap_span['chars'] = first_span_chars
|
347
|
+
first_overlap_span['text'] = ''.join([char['c'] for char in first_span_chars])
|
348
|
+
first_overlap_span['bbox'] = (
|
349
|
+
first_overlap_span['bbox'][0],
|
350
|
+
first_overlap_span['bbox'][1],
|
351
|
+
max([chr['bbox'][2] for chr in first_span_chars]),
|
352
|
+
first_overlap_span['bbox'][3],
|
357
353
|
)
|
358
354
|
# first_overlap_span['_type'] = "first"
|
359
355
|
else:
|
360
356
|
# 删掉
|
361
357
|
if first_overlap_span not in delete_span:
|
362
|
-
line[
|
358
|
+
line['spans'].remove(first_overlap_span)
|
363
359
|
|
364
360
|
if len(tail_span_chars) > 0:
|
365
|
-
min_of_tail_span_x0 = min([chr[
|
366
|
-
min_of_tail_span_y0 = min([chr[
|
367
|
-
max_of_tail_span_x1 = max([chr[
|
368
|
-
max_of_tail_span_y1 = max([chr[
|
361
|
+
min_of_tail_span_x0 = min([chr['bbox'][0] for chr in tail_span_chars])
|
362
|
+
min_of_tail_span_y0 = min([chr['bbox'][1] for chr in tail_span_chars])
|
363
|
+
max_of_tail_span_x1 = max([chr['bbox'][2] for chr in tail_span_chars])
|
364
|
+
max_of_tail_span_y1 = max([chr['bbox'][3] for chr in tail_span_chars])
|
369
365
|
|
370
366
|
if last_overlap_span == first_overlap_span: # 这个时候应该插入一个新的
|
371
|
-
tail_span_txt =
|
367
|
+
tail_span_txt = ''.join([char['c'] for char in tail_span_chars]) # noqa: F841
|
372
368
|
last_span_to_insert = last_overlap_span.copy()
|
373
|
-
last_span_to_insert[
|
374
|
-
last_span_to_insert[
|
375
|
-
[char[
|
369
|
+
last_span_to_insert['chars'] = tail_span_chars
|
370
|
+
last_span_to_insert['text'] = ''.join(
|
371
|
+
[char['c'] for char in tail_span_chars]
|
376
372
|
)
|
377
|
-
if equation_span[
|
378
|
-
last_span_to_insert[
|
373
|
+
if equation_span['bbox'][2] >= last_overlap_span['bbox'][2]:
|
374
|
+
last_span_to_insert['bbox'] = (
|
379
375
|
min_of_tail_span_x0,
|
380
376
|
min_of_tail_span_y0,
|
381
377
|
max_of_tail_span_x1,
|
382
|
-
max_of_tail_span_y1
|
378
|
+
max_of_tail_span_y1,
|
383
379
|
)
|
384
380
|
else:
|
385
|
-
last_span_to_insert[
|
386
|
-
min([chr[
|
387
|
-
last_overlap_span[
|
388
|
-
last_overlap_span[
|
389
|
-
last_overlap_span[
|
381
|
+
last_span_to_insert['bbox'] = (
|
382
|
+
min([chr['bbox'][0] for chr in tail_span_chars]),
|
383
|
+
last_overlap_span['bbox'][1],
|
384
|
+
last_overlap_span['bbox'][2],
|
385
|
+
last_overlap_span['bbox'][3],
|
390
386
|
)
|
391
387
|
# 插入到公式对象之后
|
392
|
-
equation_idx = line[
|
393
|
-
line[
|
388
|
+
equation_idx = line['spans'].index(equation_span)
|
389
|
+
line['spans'].insert(equation_idx + 1, last_span_to_insert) # 放入公式
|
394
390
|
else: # 直接修改原来的span
|
395
|
-
last_overlap_span[
|
396
|
-
last_overlap_span[
|
397
|
-
last_overlap_span[
|
398
|
-
min([chr[
|
399
|
-
last_overlap_span[
|
400
|
-
last_overlap_span[
|
401
|
-
last_overlap_span[
|
391
|
+
last_overlap_span['chars'] = tail_span_chars
|
392
|
+
last_overlap_span['text'] = ''.join([char['c'] for char in tail_span_chars])
|
393
|
+
last_overlap_span['bbox'] = (
|
394
|
+
min([chr['bbox'][0] for chr in tail_span_chars]),
|
395
|
+
last_overlap_span['bbox'][1],
|
396
|
+
last_overlap_span['bbox'][2],
|
397
|
+
last_overlap_span['bbox'][3],
|
402
398
|
)
|
403
399
|
else:
|
404
400
|
# 删掉
|
@@ -406,15 +402,15 @@ def replace_line_v2(eqinfo, line):
|
|
406
402
|
last_overlap_span not in delete_span
|
407
403
|
and last_overlap_span != first_overlap_span
|
408
404
|
):
|
409
|
-
line[
|
405
|
+
line['spans'].remove(last_overlap_span)
|
410
406
|
|
411
|
-
remain_txt =
|
412
|
-
for span in line[
|
413
|
-
span_txt =
|
414
|
-
for char in span[
|
415
|
-
span_txt = span_txt + char[
|
407
|
+
remain_txt = ''
|
408
|
+
for span in line['spans']:
|
409
|
+
span_txt = '<span>'
|
410
|
+
for char in span['chars']:
|
411
|
+
span_txt = span_txt + char['c']
|
416
412
|
|
417
|
-
span_txt = span_txt +
|
413
|
+
span_txt = span_txt + '</span>'
|
418
414
|
|
419
415
|
remain_txt = remain_txt + span_txt
|
420
416
|
|
@@ -424,17 +420,15 @@ def replace_line_v2(eqinfo, line):
|
|
424
420
|
|
425
421
|
|
426
422
|
def replace_eq_blk(eqinfo, text_block):
|
427
|
-
"""
|
428
|
-
for line in text_block[
|
429
|
-
line_bbox = line[
|
423
|
+
"""替换行内公式."""
|
424
|
+
for line in text_block['lines']:
|
425
|
+
line_bbox = line['bbox']
|
430
426
|
if (
|
431
|
-
_is_xin(eqinfo[
|
432
|
-
or __y_overlap_ratio(eqinfo[
|
427
|
+
_is_xin(eqinfo['bbox'], line_bbox)
|
428
|
+
or __y_overlap_ratio(eqinfo['bbox'], line_bbox) > 0.6
|
433
429
|
): # 定位到行, 使用y方向重合率是因为有的时候,一个行的宽度会小于公式位置宽度:行很高,公式很窄,
|
434
430
|
replace_succ = replace_line_v2(eqinfo, line)
|
435
|
-
if
|
436
|
-
not replace_succ
|
437
|
-
): # 有的时候,一个pdf的line高度从API里会计算的有问题,因此在行内span级别会替换不成功,这就需要继续重试下一行
|
431
|
+
if not replace_succ: # 有的时候,一个pdf的line高度从API里会计算的有问题,因此在行内span级别会替换不成功,这就需要继续重试下一行
|
438
432
|
continue
|
439
433
|
else:
|
440
434
|
break
|
@@ -444,13 +438,13 @@ def replace_eq_blk(eqinfo, text_block):
|
|
444
438
|
|
445
439
|
|
446
440
|
def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
|
447
|
-
"""
|
441
|
+
"""替换行内公式."""
|
448
442
|
for eqinfo in inline_equation_bboxes:
|
449
|
-
eqbox = eqinfo[
|
443
|
+
eqbox = eqinfo['bbox']
|
450
444
|
for blk in raw_text_blocks:
|
451
|
-
if _is_xin(eqbox, blk[
|
445
|
+
if _is_xin(eqbox, blk['bbox']):
|
452
446
|
if not replace_eq_blk(eqinfo, blk):
|
453
|
-
logger.warning(f
|
447
|
+
logger.warning(f'行内公式没有替换成功:{eqinfo} ')
|
454
448
|
else:
|
455
449
|
break
|
456
450
|
|
@@ -458,20 +452,18 @@ def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
|
|
458
452
|
|
459
453
|
|
460
454
|
def remove_chars_in_text_blocks(text_blocks):
|
461
|
-
"""删除text_blocks里的char"""
|
455
|
+
"""删除text_blocks里的char."""
|
462
456
|
for blk in text_blocks:
|
463
|
-
for line in blk[
|
464
|
-
for span in line[
|
465
|
-
_ = span.pop(
|
457
|
+
for line in blk['lines']:
|
458
|
+
for span in line['spans']:
|
459
|
+
_ = span.pop('chars', 'no such key')
|
466
460
|
return text_blocks
|
467
461
|
|
468
462
|
|
469
463
|
def replace_equations_in_textblock(
|
470
464
|
raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes
|
471
465
|
):
|
472
|
-
"""
|
473
|
-
替换行间和和行内公式为latex
|
474
|
-
"""
|
466
|
+
"""替换行间和和行内公式为latex."""
|
475
467
|
raw_text_blocks = remove_text_block_in_interline_equation_bbox(
|
476
468
|
interline_equation_bboxes, raw_text_blocks
|
477
469
|
) # 消除重叠:第一步,在公式内部的
|
@@ -486,22 +478,22 @@ def replace_equations_in_textblock(
|
|
486
478
|
|
487
479
|
|
488
480
|
def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
|
489
|
-
"""
|
490
|
-
new_pdf = f
|
491
|
-
with open(json_path,
|
481
|
+
""""""
|
482
|
+
new_pdf = f'{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf'
|
483
|
+
with open(json_path, 'r', encoding='utf-8') as f:
|
492
484
|
obj = json.loads(f.read())
|
493
485
|
|
494
486
|
if os.path.exists(new_pdf):
|
495
487
|
os.remove(new_pdf)
|
496
|
-
new_doc = fitz.open(
|
488
|
+
new_doc = fitz.open('')
|
497
489
|
|
498
|
-
doc = fitz.open(pdf_path)
|
490
|
+
doc = fitz.open(pdf_path) # noqa: F841
|
499
491
|
new_doc = fitz.open(pdf_path)
|
500
492
|
for i in range(len(new_doc)):
|
501
493
|
page = new_doc[i]
|
502
|
-
inline_equation_bboxes = obj[f
|
503
|
-
interline_equation_bboxes = obj[f
|
504
|
-
raw_text_blocks = obj[f
|
494
|
+
inline_equation_bboxes = obj[f'page_{i}']['inline_equations']
|
495
|
+
interline_equation_bboxes = obj[f'page_{i}']['interline_equations']
|
496
|
+
raw_text_blocks = obj[f'page_{i}']['preproc_blocks']
|
505
497
|
raw_text_blocks = remove_text_block_in_interline_equation_bbox(
|
506
498
|
interline_equation_bboxes, raw_text_blocks
|
507
499
|
) # 消除重叠:第一步,在公式内部的
|
@@ -514,11 +506,10 @@ def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
|
|
514
506
|
)
|
515
507
|
|
516
508
|
# 为了检验公式是否重复,把每一行里,含有公式的span背景改成黄色的
|
517
|
-
color_map = [fitz.pdfcolor[
|
518
|
-
j = 0
|
509
|
+
color_map = [fitz.pdfcolor['blue'], fitz.pdfcolor['green']] # noqa: F841
|
510
|
+
j = 0 # noqa: F841
|
519
511
|
for blk in raw_text_blocks:
|
520
|
-
for i, line in enumerate(blk[
|
521
|
-
|
512
|
+
for i, line in enumerate(blk['lines']):
|
522
513
|
# line_box = line['bbox']
|
523
514
|
# shape = page.new_shape()
|
524
515
|
# shape.draw_rect(line_box)
|
@@ -526,34 +517,34 @@ def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
|
|
526
517
|
# shape.commit()
|
527
518
|
# j = j+1
|
528
519
|
|
529
|
-
for i, span in enumerate(line[
|
520
|
+
for i, span in enumerate(line['spans']):
|
530
521
|
shape_page = page.new_shape()
|
531
|
-
span_type = span.get(
|
532
|
-
color = fitz.pdfcolor[
|
533
|
-
if span_type ==
|
534
|
-
color = fitz.pdfcolor[
|
535
|
-
elif span_type ==
|
536
|
-
color = fitz.pdfcolor[
|
522
|
+
span_type = span.get('_type')
|
523
|
+
color = fitz.pdfcolor['blue']
|
524
|
+
if span_type == 'first':
|
525
|
+
color = fitz.pdfcolor['blue']
|
526
|
+
elif span_type == 'tail':
|
527
|
+
color = fitz.pdfcolor['green']
|
537
528
|
elif span_type == TYPE_INLINE_EQUATION:
|
538
|
-
color = fitz.pdfcolor[
|
529
|
+
color = fitz.pdfcolor['black']
|
539
530
|
else:
|
540
531
|
color = None
|
541
532
|
|
542
|
-
b = span[
|
533
|
+
b = span['bbox']
|
543
534
|
shape_page.draw_rect(b)
|
544
535
|
|
545
536
|
shape_page.finish(color=None, fill=color, fill_opacity=0.3)
|
546
537
|
shape_page.commit()
|
547
538
|
|
548
539
|
new_doc.save(new_pdf)
|
549
|
-
logger.info(f
|
540
|
+
logger.info(f'save ok {new_pdf}')
|
550
541
|
final_json = json.dumps(obj, ensure_ascii=False, indent=2)
|
551
|
-
with open(
|
542
|
+
with open('equations_test/final_json.json', 'w') as f:
|
552
543
|
f.write(final_json)
|
553
544
|
|
554
545
|
return new_pdf
|
555
546
|
|
556
547
|
|
557
|
-
if __name__ ==
|
548
|
+
if __name__ == '__main__':
|
558
549
|
# draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf)
|
559
550
|
pass
|