magic-pdf 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/__init__.py +0 -0
- magic_pdf/config/enums.py +7 -0
- magic_pdf/config/exceptions.py +32 -0
- magic_pdf/data/__init__.py +0 -0
- magic_pdf/data/data_reader_writer/__init__.py +12 -0
- magic_pdf/data/data_reader_writer/base.py +51 -0
- magic_pdf/data/data_reader_writer/filebase.py +59 -0
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
- magic_pdf/data/data_reader_writer/s3.py +69 -0
- magic_pdf/data/dataset.py +194 -0
- magic_pdf/data/io/__init__.py +0 -0
- magic_pdf/data/io/base.py +42 -0
- magic_pdf/data/io/http.py +37 -0
- magic_pdf/data/io/s3.py +114 -0
- magic_pdf/data/read_api.py +95 -0
- magic_pdf/data/schemas.py +15 -0
- magic_pdf/data/utils.py +32 -0
- magic_pdf/dict2md/ocr_mkcontent.py +74 -234
- magic_pdf/libs/Constants.py +21 -8
- magic_pdf/libs/MakeContentConfig.py +1 -0
- magic_pdf/libs/boxbase.py +35 -0
- magic_pdf/libs/clean_memory.py +10 -0
- magic_pdf/libs/config_reader.py +53 -23
- magic_pdf/libs/draw_bbox.py +150 -65
- magic_pdf/libs/ocr_content_type.py +2 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
- magic_pdf/model/magic_model.py +331 -15
- magic_pdf/model/pdf_extract_kit.py +164 -80
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
- magic_pdf/model/ppTableModel.py +2 -2
- magic_pdf/model/pp_structure_v2.py +5 -2
- magic_pdf/model/v3/__init__.py +0 -0
- magic_pdf/model/v3/helpers.py +125 -0
- magic_pdf/para/para_split_v3.py +296 -0
- magic_pdf/pdf_parse_by_ocr.py +6 -3
- magic_pdf/pdf_parse_by_txt.py +6 -3
- magic_pdf/pdf_parse_union_core_v2.py +644 -0
- magic_pdf/pipe/AbsPipe.py +5 -1
- magic_pdf/pipe/OCRPipe.py +10 -4
- magic_pdf/pipe/TXTPipe.py +10 -4
- magic_pdf/pipe/UNIPipe.py +16 -7
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
- magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
- magic_pdf/resources/model_config/model_configs.yaml +5 -13
- magic_pdf/tools/cli.py +14 -1
- magic_pdf/tools/common.py +18 -8
- magic_pdf/user_api.py +25 -6
- magic_pdf/utils/__init__.py +0 -0
- magic_pdf/utils/annotations.py +11 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/METADATA +120 -75
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,644 @@
|
|
1
|
+
import copy
|
2
|
+
import os
|
3
|
+
import statistics
|
4
|
+
import time
|
5
|
+
from typing import List
|
6
|
+
|
7
|
+
import torch
|
8
|
+
from loguru import logger
|
9
|
+
|
10
|
+
from magic_pdf.config.enums import SupportedPdfParseMethod
|
11
|
+
from magic_pdf.data.dataset import Dataset, PageableData
|
12
|
+
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
13
|
+
from magic_pdf.libs.clean_memory import clean_memory
|
14
|
+
from magic_pdf.libs.commons import fitz, get_delta_time
|
15
|
+
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
|
16
|
+
from magic_pdf.libs.convert_utils import dict_to_list
|
17
|
+
from magic_pdf.libs.drop_reason import DropReason
|
18
|
+
from magic_pdf.libs.hash_utils import compute_md5
|
19
|
+
from magic_pdf.libs.local_math import float_equal
|
20
|
+
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
|
21
|
+
from magic_pdf.model.magic_model import MagicModel
|
22
|
+
from magic_pdf.para.para_split_v3 import para_split
|
23
|
+
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
|
24
|
+
from magic_pdf.pre_proc.construct_page_dict import \
|
25
|
+
ocr_construct_page_component_v2
|
26
|
+
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
|
27
|
+
from magic_pdf.pre_proc.equations_replace import (
|
28
|
+
combine_chars_to_pymudict, remove_chars_in_text_blocks,
|
29
|
+
replace_equations_in_textblock)
|
30
|
+
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
|
31
|
+
ocr_prepare_bboxes_for_layout_split_v2
|
32
|
+
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
|
33
|
+
fix_block_spans,
|
34
|
+
fix_discarded_block, fix_block_spans_v2)
|
35
|
+
from magic_pdf.pre_proc.ocr_span_list_modify import (
|
36
|
+
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
|
37
|
+
remove_overlaps_min_spans)
|
38
|
+
from magic_pdf.pre_proc.resolve_bbox_conflict import \
|
39
|
+
check_useful_block_horizontal_overlap
|
40
|
+
|
41
|
+
|
42
|
+
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
    """Drop the smaller of two horizontally overlapping blocks from ``all_bboxes``.

    Args:
        all_bboxes: list of block records whose first four items are the bbox
            coordinates. The list is modified in place.

    Returns:
        tuple: ``(is_useful_block_horz_overlap, all_bboxes)`` - the overlap flag
        reported by ``check_useful_block_horizontal_overlap`` and the same
        (possibly shortened) list object that was passed in.
    """
    candidate_blocks = [{'bbox': record[:4]} for record in all_bboxes]
    overlap_result = check_useful_block_horizontal_overlap(candidate_blocks)
    is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = overlap_result

    # Nothing overlaps: hand the list back untouched.
    if not is_useful_block_horz_overlap:
        return is_useful_block_horz_overlap, all_bboxes

    logger.warning(
        f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
    )  # noqa: E501

    # Walk a snapshot so that removing entries from the live list is safe.
    for record in list(all_bboxes):
        if smaller_bbox == record[:4]:
            all_bboxes.remove(record)

    return is_useful_block_horz_overlap, all_bboxes
|
58
|
+
|
59
|
+
|
60
|
+
def __replace_STX_ETX(text_str: str):
    """Swap the STX (U+0002) and ETX (U+0003) control characters for apostrophes.

    PyMuPDF yields these control characters in place of what were originally
    quotation marks. So far this has only been observed in English text, not
    in Chinese text.

    Args:
        text_str (str): raw text

    Returns:
        str: the text with both control characters replaced by ``'``. A falsy
        input (empty string or ``None``) is returned unchanged.
    """  # noqa: E501
    if not text_str:
        return text_str

    cleaned = text_str
    for control_char in ('\u0002', '\u0003'):
        cleaned = cleaned.replace(control_char, "'")
    return cleaned
|
75
|
+
|
76
|
+
|
77
|
+
def txt_spans_extract(pdf_page, inline_equations, interline_equations):
    """Build text spans for a page from the PDF's embedded text layer.

    The page text is read twice through ``pdf_page.get_text`` ('dict' and
    'rawdict'), merged by ``combine_chars_to_pymudict`` and then passed through
    equation replacement, citation-marker removal and char cleanup.

    Args:
        pdf_page: page object exposing ``get_text(kind, flags=...)``.
        inline_equations: inline equations handed to
            ``replace_equations_in_textblock``.
        interline_equations: interline equations handed to
            ``replace_equations_in_textblock``.

    Returns:
        list[dict]: one dict per kept span with the keys ``bbox``, ``content``,
        ``type`` (always ``ContentType.Text``) and ``score`` (always ``1.0``).
        Spans with a zero-width or zero-height bbox and spans already typed as
        inline/interline equations are left out.
    """
    text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
    char_level_text_blocks = pdf_page.get_text(
        'rawdict', flags=fitz.TEXTFLAGS_TEXT
    )['blocks']

    text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
    text_blocks = replace_equations_in_textblock(
        text_blocks, inline_equations, interline_equations
    )
    text_blocks = remove_citation_marker(text_blocks)
    text_blocks = remove_chars_in_text_blocks(text_blocks)

    equation_types = (ContentType.InlineEquation, ContentType.InterlineEquation)
    spans = []
    for text_block in text_blocks:
        for line in text_block['lines']:
            for span in line['spans']:
                bbox = span['bbox']
                # Skip degenerate boxes that have no width or no height.
                if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
                    continue
                # Equation spans are supplied by the model, not by the text layer.
                if span.get('type') in equation_types:
                    continue
                spans.append(
                    {
                        'bbox': list(span['bbox']),
                        'content': __replace_STX_ETX(span['text']),
                        'type': ContentType.Text,
                        'score': 1.0,
                    }
                )
    return spans
|
108
|
+
|
109
|
+
|
110
|
+
def replace_text_span(pymu_spans, ocr_spans):
    """Replace the text spans of ``ocr_spans`` with ``pymu_spans``.

    Args:
        pymu_spans: list of spans to use as the text spans.
        ocr_spans: spans whose ``ContentType.Text`` entries are discarded.

    Returns:
        list: the non-text spans of ``ocr_spans`` followed by ``pymu_spans``.
    """
    non_text_spans = [
        span for span in ocr_spans if span['type'] != ContentType.Text
    ]
    return non_text_spans + pymu_spans
|
112
|
+
|
113
|
+
|
114
|
+
def model_init(model_name: str):
    """Load and prepare the model registered under ``model_name``.

    Only ``'layoutreader'`` is supported. The weights are read from the
    directory returned by ``get_local_layoutreader_model_dir()`` when it
    exists, otherwise from the ``hantian/layoutreader`` repository. The model
    is moved to CUDA when available (and cast to bfloat16 if the device
    supports it), otherwise to CPU, and switched to eval mode.

    Args:
        model_name (str): name of the model to load.

    Returns:
        The loaded ``LayoutLMv3ForTokenClassification`` instance.

    Raises:
        SystemExit: with exit code 1 when ``model_name`` is not supported.
    """
    # Reject unknown names before paying for the transformers import and the
    # CUDA probing below.
    if model_name != 'layoutreader':
        logger.error('model name not allow')
        # The builtin ``exit()`` is injected by the ``site`` module for
        # interactive use and can be missing (``python -S``, frozen apps);
        # raising SystemExit directly has the same effect and always works.
        raise SystemExit(1)

    from transformers import LayoutLMv3ForTokenClassification

    if torch.cuda.is_available():
        device = torch.device('cuda')
        supports_bfloat16 = torch.cuda.is_bf16_supported()
    else:
        device = torch.device('cpu')
        supports_bfloat16 = False

    # Check whether the local model directory (modelscope cache) exists.
    layoutreader_model_dir = get_local_layoutreader_model_dir()
    if os.path.exists(layoutreader_model_dir):
        model_source = layoutreader_model_dir
    else:
        logger.warning(
            'local layoutreader model not exists, use online model from huggingface'
        )
        model_source = 'hantian/layoutreader'
    model = LayoutLMv3ForTokenClassification.from_pretrained(model_source)

    # Only cast to bfloat16 when the device supports it.
    if supports_bfloat16:
        model.bfloat16()
    model.to(device).eval()
    return model
|
149
|
+
|
150
|
+
|
151
|
+
class ModelSingleton:
    """Singleton that builds each named model once and hands it out afterwards."""

    # The one shared instance; created on first construction.
    _instance = None
    # Maps a model name to the model built for it; lives on the class.
    _models = {}

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on the first call."""
        existing = cls._instance
        if existing is not None:
            return existing
        cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(self, model_name: str):
        """Return the model for ``model_name``, building it via ``model_init`` if needed.

        Args:
            model_name (str): name passed on to ``model_init``.

        Returns:
            The cached model object for ``model_name``.
        """
        registry = self._models
        if model_name in registry:
            return registry[model_name]
        loaded_model = model_init(model_name=model_name)
        registry[model_name] = loaded_model
        return loaded_model
|
164
|
+
|
165
|
+
|
166
|
+
def do_predict(boxes: List[List[int]], model) -> List[int]:
    """Run ``model`` on ``boxes`` and decode its logits.

    Args:
        boxes: list of ``[left, top, right, bottom]`` integer boxes.
        model: callable model whose output exposes a ``logits`` tensor.

    Returns:
        The result of ``parse_logits`` for the model output and the number of
        boxes.
    """
    from magic_pdf.model.v3.helpers import (boxes2inputs, parse_logits,
                                            prepare_inputs)

    model_inputs = prepare_inputs(boxes2inputs(boxes), model)
    model_output = model(**model_inputs)
    # Bring the logits to the CPU and drop the leading dimension.
    logits = model_output.logits.cpu().squeeze(0)
    return parse_logits(logits, len(boxes))
|
174
|
+
|
175
|
+
|
176
|
+
def cal_block_index(fix_blocks, sorted_bboxes):
    """Assign an ordering ``index`` to every block (and its lines) in place.

    A block without lines gets the position of its own bbox in
    ``sorted_bboxes``. Otherwise each line gets the position of its bbox and
    the block gets the median of its line positions. For image/table body
    blocks the lines used for sorting are moved to ``virtual_lines`` and the
    lines saved under ``real_lines`` are restored.

    Args:
        fix_blocks: list of block dicts; modified in place.
        sorted_bboxes: list of bboxes in sorted order.

    Returns:
        The same ``fix_blocks`` list.
    """
    body_types = [BlockType.ImageBody, BlockType.TableBody]

    for block in fix_blocks:
        block_lines = block['lines']
        if len(block_lines) == 0:
            block['index'] = sorted_bboxes.index(block['bbox'])
        else:
            line_positions = []
            for line in block_lines:
                position = sorted_bboxes.index(line['bbox'])
                line['index'] = position
                line_positions.append(position)
            block['index'] = statistics.median(line_positions)

        # Remove the virtual line info from image/table body blocks and put
        # the real lines back.
        if block['type'] in body_types:
            block['virtual_lines'] = copy.deepcopy(block['lines'])
            block['lines'] = copy.deepcopy(block['real_lines'])
            del block['real_lines']

    return fix_blocks
|
196
|
+
|
197
|
+
|
198
|
+
def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
    """Slice a block's bbox into stacked line boxes used for ordering.

    Args:
        block_bbox: ``(x0, y0, x1, y1)`` of the block.
        line_height: height of a body-text line on the page.
        page_w: page width.
        page_h: page height.

    Returns:
        list[list]: ``[x0, y, x1, y + height]`` boxes stacked from ``y0``. A
        single box equal to ``block_bbox`` is returned when the block is not
        taller than three text lines, when it is narrow and slender, or when
        splitting is impossible (zero block width, non-positive
        ``line_height``).
    """
    x0, y0, x1, y1 = block_bbox

    block_height = y1 - y0
    block_width = x1 - x0
    whole_block = [[x0, y0, x1, y1]]

    # A block no taller than three body-text lines is kept as one line.
    if not line_height * 3 < block_height:
        return whole_block

    # Tall block of roughly column width: possibly a two-column layout.
    two_column_like = (
        block_height > page_h * 0.25 and page_w * 0.5 > block_width > page_w * 0.25
    )

    if two_column_like:
        split_per_text_line = True
    elif block_width > page_w * 0.4:
        # Wider than 0.4 of the page: a complex layout, so split into three
        # lines only (figures must not be sliced too finely).
        split_per_text_line = False
        line_height = block_height / 3
        lines = 3
    elif block_width > page_w * 0.25:
        # Possibly a three-column layout: slice finely as well.
        split_per_text_line = True
    elif block_width == 0 or block_height / block_width > 1.2:
        # Slender blocks are not split. A zero-width block is treated the same
        # way instead of raising ZeroDivisionError.
        return whole_block
    else:
        # Not slender: still split into two lines.
        split_per_text_line = False
        line_height = block_height / 2
        lines = 2

    if split_per_text_line:
        # A non-positive line height cannot be used as a step (a zero value
        # would raise ZeroDivisionError), so keep the block whole.
        if line_height <= 0:
            return whole_block
        # NOTE: the extra line means the last box can extend past y1; kept
        # as-is for compatibility with existing behaviour.
        lines = int(block_height / line_height) + 1

    # Stack the line boxes starting at y0.
    lines_positions = []
    current_y = y0
    for _ in range(lines):
        lines_positions.append([x0, current_y, x1, current_y + line_height])
        current_y += line_height
    return lines_positions
|
238
|
+
|
239
|
+
|
240
|
+
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
    """Collect the line bboxes of a page and order them with the layoutreader model.

    Text-like blocks contribute their existing lines; if they have none,
    virtual lines from ``insert_lines_into_block`` are added to the block.
    Image/table body blocks always get virtual lines, with their original
    lines saved under ``real_lines``. The blocks are modified in place.

    Args:
        fix_blocks: list of block dicts for the page.
        page_w: page width.
        page_h: page height.
        line_height: body-text line height used when creating virtual lines.

    Returns:
        list: the collected line bboxes, reordered by the model prediction.
    """
    text_like_types = [
        BlockType.Text, BlockType.Title, BlockType.InterlineEquation,
        BlockType.ImageCaption, BlockType.ImageFootnote,
        BlockType.TableCaption, BlockType.TableFootnote
    ]
    body_types = [BlockType.ImageBody, BlockType.TableBody]

    page_line_list = []
    for block in fix_blocks:
        if block['type'] in text_like_types:
            if len(block['lines']) == 0:
                virtual_lines = insert_lines_into_block(
                    block['bbox'], line_height, page_w, page_h
                )
                for virtual_bbox in virtual_lines:
                    block['lines'].append({'bbox': virtual_bbox, 'spans': []})
                page_line_list.extend(virtual_lines)
            else:
                for line in block['lines']:
                    page_line_list.append(line['bbox'])
        elif block['type'] in body_types:
            body_bbox = block['bbox']
            # Keep the real lines aside; they are restored after sorting.
            block["real_lines"] = copy.deepcopy(block['lines'])
            virtual_lines = insert_lines_into_block(
                body_bbox, line_height, page_w, page_h
            )
            block['lines'] = [
                {'bbox': virtual_bbox, 'spans': []} for virtual_bbox in virtual_lines
            ]
            page_line_list.extend(virtual_lines)

    # Sort with layoutreader: boxes are clamped to the page and rescaled to
    # the 0..1000 range first.
    x_scale = 1000.0 / page_w
    y_scale = 1000.0 / page_h
    boxes = []
    for left, top, right, bottom in page_line_list:
        if left < 0:
            logger.warning(
                f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
            )  # noqa: E501
            left = 0
        if right > page_w:
            logger.warning(
                f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
            )  # noqa: E501
            right = page_w
        if top < 0:
            logger.warning(
                f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
            )  # noqa: E501
            top = 0
        if bottom > page_h:
            logger.warning(
                f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}'
            )  # noqa: E501
            bottom = page_h

        scaled_box = [
            round(left * x_scale),
            round(top * y_scale),
            round(right * x_scale),
            round(bottom * y_scale),
        ]
        left, top, right, bottom = scaled_box
        assert (
            1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
        ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}'  # noqa: E126, E121
        boxes.append(scaled_box)

    model = ModelSingleton().get_model('layoutreader')
    with torch.no_grad():
        orders = do_predict(boxes, model)

    return [page_line_list[position] for position in orders]
|
309
|
+
|
310
|
+
|
311
|
+
def get_line_height(blocks):
    """Return the median line height of the text-like blocks on a page.

    Heights are truncated to ``int`` before the median is taken. Only text,
    title, image caption/footnote and table caption/footnote blocks count.

    Args:
        blocks: list of block dicts with ``type`` and ``lines`` entries.

    Returns:
        The median of the collected line heights, or ``10`` when no line was
        found.
    """
    text_like_types = [
        BlockType.Text, BlockType.Title,
        BlockType.ImageCaption, BlockType.ImageFootnote,
        BlockType.TableCaption, BlockType.TableFootnote
    ]
    line_heights = [
        int(line['bbox'][3] - line['bbox'][1])
        for block in blocks
        if block['type'] in text_like_types
        for line in block['lines']
    ]
    if not line_heights:
        # Fallback used when the page has no text-like lines at all.
        return 10
    return statistics.median(line_heights)
|
326
|
+
|
327
|
+
|
328
|
+
def process_groups(groups, body_key, caption_key, footnote_key):
    """Flatten grouped blocks into body, caption and footnote lists.

    Every block is tagged in place with a ``group_id`` equal to the position
    of its group in ``groups``.

    Args:
        groups: list of dicts holding one body block and lists of caption and
            footnote blocks.
        body_key: key of the body block inside each group.
        caption_key: key of the caption block list inside each group.
        footnote_key: key of the footnote block list inside each group.

    Returns:
        tuple: ``(body_blocks, caption_blocks, footnote_blocks)``.
    """
    bodies, captions, footnotes = [], [], []
    for group_id, group in enumerate(groups):
        body = group[body_key]
        body['group_id'] = group_id
        bodies.append(body)

        # Captions first, then footnotes, each into its own bucket.
        for member_key, bucket in ((caption_key, captions), (footnote_key, footnotes)):
            for member in group[member_key]:
                member['group_id'] = group_id
                bucket.append(member)
    return bodies, captions, footnotes
|
342
|
+
|
343
|
+
|
344
|
+
def process_block_list(blocks, body_type, block_type):
    """Wrap the member blocks of one group into a single group block.

    Args:
        blocks: member blocks of the group; each needs an ``index`` entry.
        body_type: ``type`` value that identifies the body block.
        block_type: ``type`` value given to the resulting group block.

    Returns:
        dict: group block with ``type``, ``bbox`` (bbox of the first block of
        ``body_type``, or ``[]`` when there is none), ``blocks`` (the list
        passed in) and ``index`` (median of the member indices).
    """
    member_indices = [member['index'] for member in blocks]
    median_index = statistics.median(member_indices)

    # The group takes the bbox of its first body block.
    for member in blocks:
        if member.get('type') == body_type:
            body_bbox = member['bbox']
            break
    else:
        body_bbox = []

    group_block = {
        'type': block_type,
        'bbox': body_bbox,
        'blocks': blocks,
        'index': median_index,
    }
    return group_block
|
356
|
+
|
357
|
+
|
358
|
+
def revert_group_blocks(blocks):
    """Fold image and table member blocks back into one block per group.

    Args:
        blocks: list of block dicts; image/table members carry a ``group_id``.

    Returns:
        list: all other blocks in their original order, followed by one group
        block per image group and then one per table group, each built by
        ``process_block_list``.
    """
    image_member_types = [
        BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote
    ]
    table_member_types = [
        BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
    ]

    image_groups = {}
    table_groups = {}
    new_blocks = []
    for block in blocks:
        if block['type'] in image_member_types:
            image_groups.setdefault(block['group_id'], []).append(block)
        elif block['type'] in table_member_types:
            table_groups.setdefault(block['group_id'], []).append(block)
        else:
            new_blocks.append(block)

    for members in image_groups.values():
        new_blocks.append(
            process_block_list(members, BlockType.ImageBody, BlockType.Image)
        )
    for members in table_groups.values():
        new_blocks.append(
            process_block_list(members, BlockType.TableBody, BlockType.Table)
        )
    return new_blocks
|
383
|
+
|
384
|
+
|
385
|
+
def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
    """Keep only the spans that sit inside a block of a matching kind.

    A span is kept when more than 40% of it overlaps a discarded block.
    Otherwise it is kept only when more than 50% of it overlaps an image body
    block (image spans), a table body block (table spans) or any other block
    (all remaining spans).

    Args:
        spans: list of span dicts with ``bbox`` and ``type``.
        all_bboxes: block records; items 0-3 are the bbox and item 7 the type.
        all_discarded_blocks: discarded block records with the same layout.

    Returns:
        list: the spans that passed the checks, in their original order.
    """
    def bboxes_of_types(records, wanted_types):
        return [record[0:4] for record in records if record[7] in wanted_types]

    def overlaps_any(span_bbox, candidate_bboxes, threshold):
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(span_bbox, candidate) > threshold
            for candidate in candidate_bboxes
        )

    image_bboxes = bboxes_of_types(all_bboxes, [BlockType.ImageBody])
    table_bboxes = bboxes_of_types(all_bboxes, [BlockType.TableBody])

    # Every string-valued attribute of BlockType except the two body types.
    other_block_type = [
        value
        for value in BlockType.__dict__.values()
        if isinstance(value, str)
        and value not in [BlockType.ImageBody, BlockType.TableBody]
    ]
    other_block_bboxes = bboxes_of_types(all_bboxes, other_block_type)
    discarded_block_bboxes = bboxes_of_types(
        all_discarded_blocks, [BlockType.Discarded]
    )

    new_spans = []
    for span in spans:
        span_bbox = span['bbox']
        span_type = span['type']

        # Spans inside discarded blocks are always kept.
        if overlaps_any(span_bbox, discarded_block_bboxes, 0.4):
            new_spans.append(span)
            continue

        if span_type == ContentType.Image:
            candidate_bboxes = image_bboxes
        elif span_type == ContentType.Table:
            candidate_bboxes = table_bboxes
        else:
            candidate_bboxes = other_block_bboxes

        if overlaps_any(span_bbox, candidate_bboxes, 0.5):
            new_spans.append(span)

    return new_spans
|
425
|
+
|
426
|
+
|
427
|
+
def parse_page_core(
    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
):
    """Parse one page into the page component used by the later pipeline stages.

    The page's blocks and spans are pulled from ``magic_model``, spans are
    filtered and placed into blocks, the lines are ordered by the layoutreader
    model and the blocks are sorted by the resulting index.

    Args:
        page_doc (PageableData): the page to parse.
        magic_model: model wrapper queried for blocks, spans and page size.
        page_id: index of the page.
        pdf_bytes_md5: md5 of the PDF bytes, forwarded to
            ``ocr_cut_image_and_table``.
        imageWriter: writer forwarded to ``ocr_cut_image_and_table``.
        parse_mode: ``SupportedPdfParseMethod.TXT`` or
            ``SupportedPdfParseMethod.OCR``.

    Returns:
        The page component built by ``ocr_construct_page_component_v2``.

    Raises:
        Exception: if ``parse_mode`` is neither TXT nor OCR.
    """
    need_drop = False
    drop_reason = []

    # Fetch the block information needed later from the magic_model object.
    img_groups = magic_model.get_imgs_v2(page_id)
    table_groups = magic_model.get_tables_v2(page_id)

    img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
        img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
    )

    table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
        table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
    )

    discarded_blocks = magic_model.get_discarded(page_id)
    text_blocks = magic_model.get_text_blocks(page_id)
    title_blocks = magic_model.get_title_blocks(page_id)
    # The third value (interline_equation_blocks) is not accurate enough, so
    # interline_equations is used below instead; the original code cleared it
    # and kept an unreachable branch for it.
    inline_equations, interline_equations, _ = magic_model.get_equations(page_id)

    page_w, page_h = magic_model.get_page_size(page_id)

    # Gather the bboxes of all blocks.
    all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
        img_body_blocks, img_caption_blocks, img_footnote_blocks,
        table_body_blocks, table_caption_blocks, table_footnote_blocks,
        discarded_blocks,
        text_blocks,
        title_blocks,
        interline_equations,
        page_w,
        page_h,
    )

    spans = magic_model.get_all_spans(page_id)

    # Build the spans according to parse_mode.
    if parse_mode == SupportedPdfParseMethod.TXT:
        # Replace the OCR text spans with the spans extracted by PyMuPDF.
        pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
        spans = replace_text_span(pymu_spans, spans)
    elif parse_mode == SupportedPdfParseMethod.OCR:
        pass
    else:
        raise Exception('parse_mode must be txt or ocr')

    # Before removing duplicate spans, filter image and table spans by the
    # image_body and table_body blocks. This also removes large watermarks
    # while keeping the spans of discarded blocks.
    spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)

    # Among overlapping spans, drop the ones with lower confidence.
    spans, _ = remove_overlaps_low_confidence_spans(spans)
    # Among overlapping spans, drop the smaller ones.
    spans, _ = remove_overlaps_min_spans(spans)
    # Cut out the images and tables.
    spans = ocr_cut_image_and_table(
        spans, page_doc, page_id, pdf_bytes_md5, imageWriter
    )

    # Handle the discarded blocks first; they need no layout ordering.
    discarded_block_with_spans, spans = fill_spans_in_blocks(
        all_discarded_blocks, spans, 0.4
    )
    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)

    # Skip the page when it has no bbox at all.
    if len(all_bboxes) == 0:
        logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
        return ocr_construct_page_component_v2(
            [],
            [],
            page_id,
            page_w,
            page_h,
            [],
            [],
            [],
            interline_equations,
            fix_discarded_blocks,
            need_drop,
            drop_reason,
        )

    # Fill the spans into the blocks.
    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)

    # Apply the fix step to the blocks.
    fix_blocks = fix_block_spans_v2(block_with_spans)

    # Compute the height of a body-text line.
    line_height = get_line_height(fix_blocks)

    # Collect all lines and sort them.
    sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)

    # Derive the block order from the median of its line positions.
    fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)

    # Restore image and table blocks to their group form for the next steps.
    fix_blocks = revert_group_blocks(fix_blocks)

    # Reorder the blocks.
    sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])

    # Get the lists that QA needs separately.
    images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)

    # Build the page info.
    page_info = ocr_construct_page_component_v2(
        sorted_blocks,
        [],
        page_id,
        page_w,
        page_h,
        [],
        images,
        tables,
        interline_equations,
        fix_discarded_blocks,
        need_drop,
        drop_reason,
    )
    return page_info
|
572
|
+
|
573
|
+
|
574
|
+
def pdf_parse_union(
    dataset: Dataset,
    model_list,
    imageWriter,
    parse_mode,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Parse the pages of ``dataset`` and return the document info dict.

    Pages with an id inside ``[start_page_id, end_page_id]`` are parsed with
    ``parse_page_core``; all other pages get an empty page component flagged
    with ``'skip page'``. Paragraph splitting then runs over all pages.

    Args:
        dataset (Dataset): the document to parse.
        model_list: model output used to build the ``MagicModel``.
        imageWriter: writer forwarded to ``parse_page_core``.
        parse_mode: parse method forwarded to ``parse_page_core``.
        start_page_id: first page id to parse.
        end_page_id: last page id to parse; ``None``, a negative value or a
            value past the last page means the last page.
        debug_mode: when true, log the time spent per page.

    Returns:
        dict: ``{'pdf_info': [...]}`` holding the per-page info list.
    """
    pdf_bytes_md5 = compute_md5(dataset.data_bits())

    # Start with an empty page-info dict.
    pdf_info_dict = {}

    # Build the magic_model from the model list and the dataset.
    magic_model = MagicModel(model_list, dataset)

    # Resolve the page range to parse.
    last_page_id = len(dataset) - 1
    if end_page_id is None or end_page_id < 0:
        end_page_id = last_page_id
    if end_page_id > last_page_id:
        logger.warning('end_page_id is out of range, use pdf_docs length')
        end_page_id = last_page_id

    # Reference time for the per-page timing log.
    start_time = time.time()

    for page_id, page in enumerate(dataset):
        # In debug mode, log how long the previous page took.
        if debug_mode:
            time_now = time.time()
            logger.info(
                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
            )
            start_time = time_now

        if start_page_id <= page_id <= end_page_id:
            # Parse the page.
            page_info = parse_page_core(
                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
            )
        else:
            # Out of range: emit an empty page marked as dropped.
            raw_page_info = page.get_page_info()
            page_w = raw_page_info.w
            page_h = raw_page_info.h
            page_info = ocr_construct_page_component_v2(
                [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
            )
        pdf_info_dict[f'page_{page_id}'] = page_info

    # Split into paragraphs.
    para_split(pdf_info_dict, debug_mode=debug_mode)

    # Convert the dict into a list.
    new_pdf_info_dict = {
        'pdf_info': dict_to_list(pdf_info_dict),
    }

    clean_memory()

    return new_pdf_info_dict
|
641
|
+
|
642
|
+
|
643
|
+
if __name__ == '__main__':
    # Nothing happens when this module is run directly.
    pass
|
magic_pdf/pipe/AbsPipe.py
CHANGED
@@ -17,7 +17,7 @@ class AbsPipe(ABC):
|
|
17
17
|
PIP_TXT = "txt"
|
18
18
|
|
19
19
|
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
|
20
|
-
start_page_id=0, end_page_id=None):
|
20
|
+
start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
|
21
21
|
self.pdf_bytes = pdf_bytes
|
22
22
|
self.model_list = model_list
|
23
23
|
self.image_writer = image_writer
|
@@ -25,6 +25,10 @@ class AbsPipe(ABC):
|
|
25
25
|
self.is_debug = is_debug
|
26
26
|
self.start_page_id = start_page_id
|
27
27
|
self.end_page_id = end_page_id
|
28
|
+
self.lang = lang
|
29
|
+
self.layout_model = layout_model
|
30
|
+
self.formula_enable = formula_enable
|
31
|
+
self.table_enable = table_enable
|
28
32
|
|
29
33
|
def get_compress_pdf_mid_data(self):
|
30
34
|
return JsonCompressor.compress_json(self.pdf_mid_data)
|
magic_pdf/pipe/OCRPipe.py
CHANGED
@@ -10,19 +10,25 @@ from magic_pdf.user_api import parse_ocr_pdf
|
|
10
10
|
class OCRPipe(AbsPipe):
|
11
11
|
|
12
12
|
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
|
13
|
-
start_page_id=0, end_page_id=None
|
14
|
-
|
13
|
+
start_page_id=0, end_page_id=None, lang=None,
|
14
|
+
layout_model=None, formula_enable=None, table_enable=None):
|
15
|
+
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
|
16
|
+
layout_model, formula_enable, table_enable)
|
15
17
|
|
16
18
|
def pipe_classify(self):
|
17
19
|
pass
|
18
20
|
|
19
21
|
def pipe_analyze(self):
|
20
22
|
self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
|
21
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id
|
23
|
+
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
24
|
+
lang=self.lang, layout_model=self.layout_model,
|
25
|
+
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
22
26
|
|
23
27
|
def pipe_parse(self):
|
24
28
|
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
|
25
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id
|
29
|
+
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
30
|
+
lang=self.lang, layout_model=self.layout_model,
|
31
|
+
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
26
32
|
|
27
33
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
28
34
|
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|