magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +134 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/Constants.py +27 -1
- magic_pdf/libs/boxbase.py +169 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +230 -161
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +135 -22
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
- magic_pdf/model/ppTableModel.py +67 -0
- magic_pdf/para/para_split_v2.py +76 -74
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/resources/model_config/model_configs.yaml +3 -1
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
magic_pdf/para/para_split_v2.py
CHANGED
@@ -1,3 +1,5 @@
+import copy
+
 from sklearn.cluster import DBSCAN
 import numpy as np
 from loguru import logger
@@ -100,59 +102,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
 
     if lang != 'en':
         return lines, None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    total_lines = len(lines)
+    line_fea_encode = []
+    """
+    Encode features for each line, using the following rules:
+    1. If the line is flush left and starts with an uppercase letter or a digit, encode it as 1
+    2. If the line is flush left and starts with anything other than an uppercase letter, encode it as 4
+    3. If the line is not flush left and its first character is uppercase, encode it as 2
+    4. If the line is not flush left and its first character is not uppercase, encode it as 3
+    """
+    if len(lines) > 0:
+        x_map_tag_dict, min_x_tag = cluster_line_x(lines)
+    for l in lines:
+        span_text = __get_span_text(l['spans'][0])
+        if not span_text:
+            line_fea_encode.append(0)
+            continue
+        first_char = span_text[0]
+        layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
+        if not layout:
+            line_fea_encode.append(0)
+        else:
+            #
+            if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
+                # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
+                if not first_char.isalnum() or if_match_reference_list(span_text):
+                    line_fea_encode.append(1)
+                else:
+                    line_fea_encode.append(4)
             else:
-
-
-                # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
-                if not first_char.isalnum() or if_match_reference_list(span_text):
-                    line_fea_encode.append(1)
-                else:
-                    line_fea_encode.append(4)
+                if first_char.isupper():
+                    line_fea_encode.append(2)
                 else:
-
-                    line_fea_encode.append(2)
-                else:
-                    line_fea_encode.append(3)
+                    line_fea_encode.append(3)
 
-
+    # Then segment by the encoding: runs where 1, 2, 3 appear consecutively at least twice are treated as a list.
 
-
-
+    list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
+    if len(list_indice) > 0:
+        if debug_able:
+            logger.info(f"Found a list, list line ranges: {list_indice}, {list_start_idx}")
+
+        # TODO: check whether the indented lines in this list are left-aligned.
+        segments = []
+        for start, end in list_indice:
+            for i in range(start, end + 1):
+                if i > 0:
+                    if line_fea_encode[i] == 4:
+                        if debug_able:
+                            logger.info(f"Line {i} of the list is not flush left")
+                        break
+            else:
                 if debug_able:
-                    logger.info(f"
-
-        # TODO: check whether the indented lines in this list are left-aligned.
-        segments = []
-        for start, end in list_indice:
-            for i in range(start, end + 1):
-                if i > 0:
-                    if line_fea_encode[i] == 4:
-                        if debug_able:
-                            logger.info(f"Line {i} of the list is not flush left")
-                        break
-            else:
-                if debug_able:
-                    logger.info(f"Lines {start} to {end} form a list")
+                    logger.info(f"Lines {start} to {end} form a list")
 
-
+    return split_indices(total_lines, list_indice), list_start_idx
 
 
 def cluster_line_x(lines: list) -> dict:
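Note on the rewrite above: the scan step relies on Python's for/else clause, which runs the else branch only when the inner loop finishes without a break. A minimal, self-contained sketch of that pattern, with plain prints standing in for the logger and a hard-coded encoding and run list standing in for the real feature pass and find_repeating_patterns2:

    # Sketch of the encode-then-scan idea behind __detect_list_lines.
    # `runs` is a hypothetical stand-in for find_repeating_patterns2's output.
    def scan_runs(line_fea_encode, runs):
        for start, end in runs:
            for i in range(start, end + 1):
                if i > 0 and line_fea_encode[i] == 4:
                    print(f"Line {i} of the list is not flush left")
                    break
            else:
                # for/else fires only when the loop did not break,
                # i.e. every line in the run was acceptably aligned.
                print(f"Lines {start} to {end} form a list")

    scan_runs([1, 3, 3, 4, 1, 3, 3], [(0, 3), (4, 6)])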
@@ -164,7 +169,7 @@ def cluster_line_x(lines: list) -> dict:
     x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
     x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
     x0_uniq_label = np.unique(x0_clusters.labels_)
-    #x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
+    # x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
     x0_2_new_val = {}  # maps each old value to its new value
     min_x0 = round(lines[0]["bbox"][0])
     for label in x0_uniq_label:
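The clustering above exists so that lines whose left edges differ by a pixel or two still count as the same indentation level. A rough standalone sketch of the same DBSCAN grouping (the eps/min_samples values here are illustrative; the real ones come from the unchanged lines above the hunk):

    import numpy as np
    from sklearn.cluster import DBSCAN

    x0s = [70.1, 70.4, 90.0, 70.2, 90.3]  # hypothetical line left edges
    x0_lst = np.array([[round(x), 0] for x in x0s])  # padded to 2-D for DBSCAN
    labels = DBSCAN(eps=5, min_samples=1).fit(x0_lst).labels_
    # Map each rounded x0 to its cluster label; the cluster holding the
    # smallest x0 is the "flush left" tag that __detect_list_lines checks.
    x_map_tag = {round(x): label for x, label in zip(x0s, labels)}
    min_x_tag = x_map_tag[min(x_map_tag)]
    print(x_map_tag, min_x_tag)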
@@ -197,7 +202,9 @@ def __valign_lines(blocks, layout_bboxes):
     min_distance = 3
     min_sample = 2
     new_layout_bboxes = []
-
+    # add bbox_fs for para split calculation
+    for block in blocks:
+        block["bbox_fs"] = copy.deepcopy(block["bbox"])
     for layout_box in layout_bboxes:
         blocks_in_layoutbox = [b for b in blocks if
                                b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
@@ -242,16 +249,15 @@ def __valign_lines(blocks, layout_bboxes):
         # Since the line extents inside the blocks were modified, the block bboxes must now be recomputed
         for block in blocks_in_layoutbox:
             if len(block["lines"]) > 0:
-                block['bbox'] = [min([line['bbox'][0] for line in block['lines']]),
-                                 min([line['bbox'][1] for line in block['lines']]),
-                                 max([line['bbox'][2] for line in block['lines']]),
-                                 max([line['bbox'][3] for line in block['lines']])]
-
+                block['bbox_fs'] = [min([line['bbox'][0] for line in block['lines']]),
+                                    min([line['bbox'][1] for line in block['lines']]),
+                                    max([line['bbox'][2] for line in block['lines']]),
+                                    max([line['bbox'][3] for line in block['lines']])]
         """Recompute the layout bbox, because the block bboxes have changed."""
-        layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox])
-        layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox])
-        layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox])
-        layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox])
+        layout_x0 = min([block['bbox_fs'][0] for block in blocks_in_layoutbox])
+        layout_y0 = min([block['bbox_fs'][1] for block in blocks_in_layoutbox])
+        layout_x1 = max([block['bbox_fs'][2] for block in blocks_in_layoutbox])
+        layout_y1 = max([block['bbox_fs'][3] for block in blocks_in_layoutbox])
         new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
 
     return new_layout_bboxes
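The apparent reason for the new key: __valign_lines used to overwrite block['bbox'] with the recomputed extent, mutating a value other consumers still read. Writing the recomputed extent to a separate bbox_fs key (seeded from a deep copy) keeps both available. A small sketch of the computation, assuming the same block/line dict shape as the diff:

    import copy

    def recompute_bbox_fs(block: dict) -> None:
        # Keep the original bbox intact; store the line-union extent separately.
        block["bbox_fs"] = copy.deepcopy(block["bbox"])
        if block.get("lines"):
            block["bbox_fs"] = [
                min(line["bbox"][0] for line in block["lines"]),
                min(line["bbox"][1] for line in block["lines"]),
                max(line["bbox"][2] for line in block["lines"]),
                max(line["bbox"][3] for line in block["lines"]),
            ]

    block = {"bbox": [0, 0, 100, 40],
             "lines": [{"bbox": [5, 2, 95, 18]}, {"bbox": [5, 20, 60, 38]}]}
    recompute_bbox_fs(block)
    print(block["bbox"], block["bbox_fs"])  # the original bbox is untouched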
@@ -309,7 +315,7 @@ def __group_line_by_layout(blocks, layout_bboxes):
     # Since each block is currently a single line, one block is one paragraph
     blocks_group = []
     for lyout in layout_bboxes:
-        blocks_in_layout = [block for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox'])]
+        blocks_in_layout = [block for block in blocks if is_in_layout(block.get('bbox_fs', None), lyout['layout_bbox'])]
         blocks_group.append(blocks_in_layout)
     return blocks_group
 
@@ -362,7 +368,8 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
|
|
362
368
|
for i in range(0, len(list_start)):
|
363
369
|
index = list_start[i] - 1
|
364
370
|
if index >= 0:
|
365
|
-
if "content" in lines[index]["spans"][-1]
|
371
|
+
if "content" in lines[index]["spans"][-1] and lines[index]["spans"][-1].get('type', '') not in [
|
372
|
+
ContentType.InlineEquation, ContentType.InterlineEquation]:
|
366
373
|
lines[index]["spans"][-1]["content"] += '\n\n'
|
367
374
|
layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
|
368
375
|
for content_type, start, end in text_segments:
|
@@ -474,7 +481,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
                 break
         # If these lines all share the same indentation, attach them to the last paragraph of the previous layout.
         if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
-            #pre_page_paras[-1].append(may_list_lines)
+            # pre_page_paras[-1].append(may_list_lines)
             # Merge the next page into the last paragraph of the previous page and tag it as cross_page
             for line in may_list_lines:
                 for span in line["spans"]:
@@ -534,7 +541,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
             next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
             next_first_line_type = next_first_line['spans'][0]['type']
             if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
-                #connected_layout_paras.append(layout_paras[i])
                 connected_layout_blocks.append(blocks_group[i])
                 continue
             pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)
@@ -549,10 +555,8 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
                     -1] not in LINE_STOP_FLAG and \
                     next_first_line['bbox'][0] == next_x0_min:  # The previous line fills the whole row with no stop mark, and the next line does not start with whitespace.
                 """The joining condition holds: connect the previous layout's paragraph with the next layout's paragraph."""
-                #connected_layout_paras[-1][-1].extend(layout_paras[i][0])
                 connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"])
-
-                blocks_group[i][0]["lines"] = []  #Clear the lines of the next layout's first paragraph, since they were merged into the previous layout's last paragraph
+                blocks_group[i][0]["lines"] = []  # Clear the lines of the next layout's first paragraph, since they were merged into the previous layout's last paragraph
                 blocks_group[i][0][LINES_DELETED] = True
                 # if len(layout_paras[i]) == 0:
                 #     layout_paras.pop(i)
@@ -561,7 +565,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
             connected_layout_blocks.append(blocks_group[i])
         else:
             """The joining condition does not hold; append the previous layout's paragraphs to the result."""
-            #connected_layout_paras.append(layout_paras[i])
             connected_layout_blocks.append(blocks_group[i])
     return connected_layout_blocks
 
@@ -619,7 +622,7 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
                     span[CROSS_PAGE] = True
         pre_last_para.extend(next_first_para)
 
-        #next_page_paras[0].pop(0) # Remove the next page's first paragraph, since it has been merged into the previous page's last paragraph.
+        # next_page_paras[0].pop(0)  # Remove the next page's first paragraph, since it has been merged into the previous page's last paragraph.
         next_page_paras[0][0]["lines"] = []
         next_page_paras[0][0][LINES_DELETED] = True
         return True
@@ -663,16 +666,15 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
         layout_box = new_layout_bbox[layout_i]
         single_line_paras_tag = []
         for i in range(len(layout_para)):
-            #single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
+            # single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
             single_line_paras_tag.append(layout_para[i]['type'] == BlockType.Text and len(layout_para[i]["lines"]) == 1)
         """Find runs of consecutive single-line text; if the consecutive lines have the same height, merge them into one paragraph."""
         consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag)
         if len(consecutive_single_line_indices) > 0:
-            #index_offset = 0
             """Check whether these lines have the same height and are centered"""
             for start, end in consecutive_single_line_indices:
-                #start += index_offset
-                #end += index_offset
+                # start += index_offset
+                # end += index_offset
                 line_hi = np.array([block["lines"][0]['bbox'][3] - block["lines"][0]['bbox'][1] for block in
                                     layout_para[start:end + 1]])
                 first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']])
@@ -697,9 +699,9 @@
                     for i_para in range(start + 1, end + 1):
                         layout_para[i_para]["lines"] = []
                         layout_para[i_para][LINES_DELETED] = True
-                    #layout_para[start:end + 1] = [merge_para]
+                    # layout_para[start:end + 1] = [merge_para]
 
-                    #index_offset -= end - start
+                    # index_offset -= end - start
 
     return
 
@@ -739,7 +741,7 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
|
|
739
741
|
new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS
|
740
742
|
all_page_list_info = [] # 保存每个页面开头和结尾是否是列表
|
741
743
|
for page_num, page in pdf_info_dict.items():
|
742
|
-
blocks = page['preproc_blocks']
|
744
|
+
blocks = copy.deepcopy(page['preproc_blocks'])
|
743
745
|
layout_bboxes = page['layout_bboxes']
|
744
746
|
new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
|
745
747
|
new_layout_of_pages.append(new_layout_bbox)
|
magic_pdf/pdf_parse_union_core.py
CHANGED
@@ -41,6 +41,23 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
     return is_useful_block_horz_overlap, all_bboxes
 
 
+def __replace_STX_ETX(text_str:str):
+    """ Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
+        Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
+
+        Args:
+            text_str (str): raw text
+
+        Returns:
+            _type_: replaced text
+    """
+    if text_str:
+        s = text_str.replace('\u0002', "'")
+        s = s.replace("\u0003", "'")
+        return s
+    return text_str
+
+
 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
     text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
     char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
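For reference, the helper is a plain pair of str.replace calls; a quick standalone check of its behavior (copied from the hunk above, under a public name):

    def replace_stx_etx(text_str: str):
        # \u0002 (STX) / \u0003 (ETX) appear where pymupdf garbled what were
        # originally quotation marks in the source PDF.
        if text_str:
            s = text_str.replace('\u0002', "'")
            s = s.replace("\u0003", "'")
            return s
        return text_str

    print(replace_stx_etx("it\u0002s a \u0003quoted\u0003 word"))
    # -> it's a 'quoted' word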
@@ -63,7 +80,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
                 spans.append(
                     {
                         "bbox": list(span["bbox"]),
-                        "content": span["text"],
+                        "content": __replace_STX_ETX(span["text"]),
                         "type": ContentType.Text,
                         "score": 1.0,
                     }
@@ -175,7 +192,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
 
     '''Fill the spans into the sorted blocks'''
-    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.
+    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.3)
 
     '''Apply fix operations to the blocks'''
     fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
@@ -208,13 +225,17 @@ def pdf_parse_union(pdf_bytes,
     magic_model = MagicModel(model_list, pdf_docs)
 
     '''Parse the pdf according to the requested page range'''
-    end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
+    # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
+    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf_docs) - 1
+
+    if end_page_id > len(pdf_docs) - 1:
+        logger.warning("end_page_id is out of range, use pdf_docs length")
+        end_page_id = len(pdf_docs) - 1
 
     '''Initialize the start time'''
     start_time = time.time()
 
-    for page_id in
-
+    for page_id, page in enumerate(pdf_docs):
         '''In debug mode, log the parse time of each page'''
         if debug_mode:
             time_now = time.time()
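The old expression `end_page_id if end_page_id else len(pdf_docs) - 1` treated 0 as falsy, so asking for only the first page (end_page_id=0) silently expanded to the whole document. The replacement accepts 0 and clamps out-of-range values. The same logic as a standalone helper (hypothetical name, same behavior):

    def normalize_end_page_id(end_page_id, page_count: int) -> int:
        # end_page_id=0 is a valid request for "first page only";
        # only None and negative values fall back to the last page.
        if end_page_id is None or end_page_id < 0:
            return page_count - 1
        # Clamp past-the-end values, as the diff's warning branch does.
        return min(end_page_id, page_count - 1)

    assert normalize_end_page_id(None, 10) == 9
    assert normalize_end_page_id(0, 10) == 0   # the old falsy check returned 9 here
    assert normalize_end_page_id(99, 10) == 9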
@@ -224,7 +245,14 @@ def pdf_parse_union(pdf_bytes,
             start_time = time_now
 
         '''Parse each page of the pdf'''
-
+        if start_page_id <= page_id <= end_page_id:
+            page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
+        else:
+            page_w = page.rect.width
+            page_h = page.rect.height
+            page_info = ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
+                                                        [], [], [], [],
+                                                        True, "skip page")
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     """Paragraph splitting"""
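Pages outside the requested range are no longer parsed at all; they get an empty placeholder component flagged "skip page", so the output dict still carries one entry per physical page. A sketch of that dispatch, where parse_page and make_skip_page are hypothetical stand-ins for parse_page_core and ocr_construct_page_component_v2:

    def parse_range(pages, start_page_id, end_page_id, parse_page, make_skip_page):
        pdf_info_dict = {}
        for page_id, page in enumerate(pages):
            if start_page_id <= page_id <= end_page_id:
                page_info = parse_page(page_id, page)
            else:
                page_info = make_skip_page(page_id, page)
            pdf_info_dict[f"page_{page_id}"] = page_info
        return pdf_info_dict

    out = parse_range(["a", "b", "c"], 1, 1,
                      lambda i, p: {"parsed": p},
                      lambda i, p: {"need_drop": True, "drop_reason": "skip page"})
    print(out)  # page_0 and page_2 are skip placeholders; page_1 is parsed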
magic_pdf/pipe/AbsPipe.py
CHANGED
@@ -16,12 +16,15 @@ class AbsPipe(ABC):
     PIP_OCR = "ocr"
     PIP_TXT = "txt"
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
         self.pdf_mid_data = None  # uncompressed
         self.is_debug = is_debug
+        self.start_page_id = start_page_id
+        self.end_page_id = end_page_id
 
     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)
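The base class now carries the page range, and every concrete pipe forwards it to both analysis and parsing. A minimal sketch of that inheritance pattern (placeholder bodies, not the real doc_analyze / parse_*_pdf calls):

    class BasePipe:
        def __init__(self, pdf_bytes: bytes, is_debug: bool = False,
                     start_page_id: int = 0, end_page_id=None):
            self.pdf_bytes = pdf_bytes
            self.is_debug = is_debug
            self.start_page_id = start_page_id
            self.end_page_id = end_page_id

    class DemoPipe(BasePipe):
        def pipe_analyze(self):
            # The real pipes pass these straight through to doc_analyze.
            print(f"analyze pages {self.start_page_id}..{self.end_page_id}")

    DemoPipe(b"%PDF-1.4", start_page_id=0, end_page_id=4).pipe_analyze()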
magic_pdf/pipe/OCRPipe.py
CHANGED
@@ -9,17 +9,20 @@ from magic_pdf.user_api import parse_ocr_pdf
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
+        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
magic_pdf/pipe/TXTPipe.py
CHANGED
@@ -10,17 +10,20 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
+        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
magic_pdf/pipe/UNIPipe.py
CHANGED
@@ -13,9 +13,10 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
+    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
         self.pdf_type = jso_useful_key["_pdf_type"]
-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
@@ -26,17 +27,21 @@ class UNIPipe(AbsPipe):
 
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
         elif self.pdf_type == self.PIP_OCR:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty)
+                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
+                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                              is_debug=self.is_debug)
+                                              is_debug=self.is_debug,
+                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
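Taken together, callers can now restrict a whole pipe run to a page range at construction time. A usage sketch against the signatures above (the DiskReaderWriter import path, the demo file name, and the jso_useful_key contents are assumptions of this sketch, not shown in the diff):

    from magic_pdf.pipe.UNIPipe import UNIPipe
    from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

    pdf_bytes = open("demo.pdf", "rb").read()
    image_writer = DiskReaderWriter("/tmp/magic_pdf_images")
    jso_useful_key = {"_pdf_type": "", "model_list": []}

    # Only pages 0..4 are analyzed and parsed; the rest become
    # "skip page" placeholders in the middle data.
    pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer,
                   start_page_id=0, end_page_id=4)
    pipe.pipe_classify()
    pipe.pipe_analyze()
    pipe.pipe_parse()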
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
CHANGED
@@ -133,6 +133,7 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
 
 
 def remove_overlaps_min_blocks(all_bboxes):
+    # For overlapping blocks, the smaller one cannot simply be deleted; it has to be merged with the larger one into a bigger block.
     # Remove the smaller of the overlapping blocks
     need_remove = []
     for block1 in all_bboxes:
@@ -142,9 +143,17 @@ def remove_overlaps_min_blocks(all_bboxes):
             block2_bbox = block2[:4]
             overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
             if overlap_box is not None:
-
-                if
-
+                block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
+                if block_to_remove is not None and block_to_remove not in need_remove:
+                    large_block = block1 if block1 != block_to_remove else block2
+                    x1, y1, x2, y2 = large_block[:4]
+                    sx1, sy1, sx2, sy2 = block_to_remove[:4]
+                    x1 = min(x1, sx1)
+                    y1 = min(y1, sy1)
+                    x2 = max(x2, sx2)
+                    y2 = max(y2, sy2)
+                    large_block[:4] = [x1, y1, x2, y2]
+                    need_remove.append(block_to_remove)
 
     if len(need_remove) > 0:
         for block in need_remove: