magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. magic_pdf/dict2md/ocr_mkcontent.py +134 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/Constants.py +27 -1
  9. magic_pdf/libs/boxbase.py +169 -149
  10. magic_pdf/libs/draw_bbox.py +113 -87
  11. magic_pdf/libs/ocr_content_type.py +21 -18
  12. magic_pdf/libs/version.py +1 -1
  13. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  14. magic_pdf/model/magic_model.py +230 -161
  15. magic_pdf/model/model_list.py +8 -0
  16. magic_pdf/model/pdf_extract_kit.py +135 -22
  17. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  18. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
  19. magic_pdf/model/ppTableModel.py +67 -0
  20. magic_pdf/para/para_split_v2.py +76 -74
  21. magic_pdf/pdf_parse_union_core.py +34 -6
  22. magic_pdf/pipe/AbsPipe.py +4 -1
  23. magic_pdf/pipe/OCRPipe.py +7 -4
  24. magic_pdf/pipe/TXTPipe.py +7 -4
  25. magic_pdf/pipe/UNIPipe.py +11 -6
  26. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  27. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  28. magic_pdf/resources/model_config/model_configs.yaml +3 -1
  29. magic_pdf/tools/cli.py +56 -29
  30. magic_pdf/tools/cli_dev.py +61 -64
  31. magic_pdf/tools/common.py +57 -37
  32. magic_pdf/user_api.py +17 -9
  33. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
  34. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
  35. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
  36. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
  37. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
  38. {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
magic_pdf/para/para_split_v2.py CHANGED
@@ -1,3 +1,5 @@
+import copy
+
 from sklearn.cluster import DBSCAN
 import numpy as np
 from loguru import logger
@@ -100,59 +102,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
 
     if lang != 'en':
         return lines, None
-    else:
-        total_lines = len(lines)
-        line_fea_encode = []
-        """
-        Encode a feature for every line, with the following rules:
-        1. Flush left and starting with an uppercase letter or a digit: code 1.
-        2. Flush left and starting with anything else: code 4.
-        3. Indented with an uppercase first character: code 2.
-        4. Indented with a non-uppercase first character: code 3.
-        """
-        if len(lines) > 0:
-            x_map_tag_dict, min_x_tag = cluster_line_x(lines)
-        for l in lines:
-            span_text = __get_span_text(l['spans'][0])
-            first_char = span_text[0]
-            layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
-            if not layout:
-                line_fea_encode.append(0)
+
+    total_lines = len(lines)
+    line_fea_encode = []
+    """
+    Encode a feature for every line, with the following rules:
+    1. Flush left and starting with an uppercase letter or a digit: code 1.
+    2. Flush left and starting with anything else: code 4.
+    3. Indented with an uppercase first character: code 2.
+    4. Indented with a non-uppercase first character: code 3.
+    """
+    if len(lines) > 0:
+        x_map_tag_dict, min_x_tag = cluster_line_x(lines)
+    for l in lines:
+        span_text = __get_span_text(l['spans'][0])
+        if not span_text:
+            line_fea_encode.append(0)
+            continue
+        first_char = span_text[0]
+        layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
+        if not layout:
+            line_fea_encode.append(0)
+        else:
+            #
+            if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
+                # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
+                if not first_char.isalnum() or if_match_reference_list(span_text):
+                    line_fea_encode.append(1)
+                else:
+                    line_fea_encode.append(4)
             else:
-                #
-                if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
-                    # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
-                    if not first_char.isalnum() or if_match_reference_list(span_text):
-                        line_fea_encode.append(1)
-                    else:
-                        line_fea_encode.append(4)
+                if first_char.isupper():
+                    line_fea_encode.append(2)
                 else:
-                    if first_char.isupper():
-                        line_fea_encode.append(2)
-                    else:
-                        line_fea_encode.append(3)
+                    line_fea_encode.append(3)
 
-        # Then segment by code: lines where 1, 2 or 3 repeats at least twice in a row are treated as a list.
+    # Then segment by code: lines where 1, 2 or 3 repeats at least twice in a row are treated as a list.
 
-        list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
-        if len(list_indice) > 0:
+    list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
+    if len(list_indice) > 0:
+        if debug_able:
+            logger.info(f"Found a list, rows: {list_indice}, {list_start_idx}")
+
+        # TODO: check whether the indented lines in this list are left-aligned.
+        segments = []
+        for start, end in list_indice:
+            for i in range(start, end + 1):
+                if i > 0:
+                    if line_fea_encode[i] == 4:
+                        if debug_able:
+                            logger.info(f"List row {i} is not flush left")
+                        break
+            else:
                 if debug_able:
-                    logger.info(f"Found a list, rows: {list_indice}{list_start_idx}")
-
-                # TODO: check whether the indented lines in this list are left-aligned.
-                segments = []
-                for start, end in list_indice:
-                    for i in range(start, end + 1):
-                        if i > 0:
-                            if line_fea_encode[i] == 4:
-                                if debug_able:
-                                    logger.info(f"List row {i} is not flush left")
-                                break
-                        else:
-                            if debug_able:
-                                logger.info(f"Rows {start} to {end} form a list")
+                    logger.info(f"Rows {start} to {end} form a list")
 
-        return split_indices(total_lines, list_indice), list_start_idx
+    return split_indices(total_lines, list_indice), list_start_idx
 
 
 def cluster_line_x(lines: list) -> dict:
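
The reworked __detect_list_lines above also guards against empty spans before reading span_text[0]. A minimal sketch of the encode-then-scan idea it relies on, under simplified assumptions — encode_line and find_list_runs are hypothetical stand-ins for the line feature encoding and find_repeating_patterns2, not the library's API:

# Sketch of the encode-then-scan idea: give every line a code, then look for
# runs of "list-like" codes. Simplified; not the library's implementation.
def encode_line(text: str, flush_left: bool) -> int:
    if not text:
        return 0  # mirrors the 0.8.0 fix: empty spans get a neutral code instead of an IndexError
    first = text[0]
    if flush_left:
        return 1 if not first.isalnum() else 4
    return 2 if first.isupper() else 3

def find_list_runs(codes, min_len=2):
    """Return (start, end) index pairs where codes 1/2/3 repeat consecutively."""
    runs, start = [], None
    for i, c in enumerate(codes + [0]):  # the trailing 0 flushes the final run
        if c in (1, 2, 3):
            if start is None:
                start = i
        elif start is not None:
            if i - start >= min_len:
                runs.append((start, i - 1))
            start = None
    return runs

print(find_list_runs([1, 3, 3, 4, 1, 2]))  # [(0, 2), (4, 5)]
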
@@ -164,7 +169,7 @@ def cluster_line_x(lines: list) -> dict:
     x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
     x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
     x0_uniq_label = np.unique(x0_clusters.labels_)
-    #x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
+    # x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
     x0_2_new_val = {}  # map old x0 values to their new representatives
     min_x0 = round(lines[0]["bbox"][0])
     for label in x0_uniq_label:
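
cluster_line_x exists because raw x0 coordinates jitter by a pixel or two; DBSCAN collapses near-identical left edges into one representative value so indentation comparisons stay stable. A self-contained sketch of that pattern, with made-up coordinates:

# DBSCAN-based x0 grouping, as in cluster_line_x: pad the 1-D left edges to
# 2-D points, cluster, then map each raw x0 to the minimum x0 of its cluster.
import numpy as np
from sklearn.cluster import DBSCAN

x0_values = [70.2, 70.9, 71.1, 95.0, 95.4]  # noisy left edges (made up)
points = np.array([[round(x), 0] for x in x0_values])
labels = DBSCAN(eps=3, min_samples=2).fit(points).labels_

x0_to_rep = {}
for label in np.unique(labels):
    members = [round(x0_values[i]) for i in range(len(labels)) if labels[i] == label]
    rep = min(members)
    for m in members:
        x0_to_rep[m] = rep

print(x0_to_rep)  # {70: 70, 71: 70, 95: 95}
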
@@ -197,7 +202,9 @@ def __valign_lines(blocks, layout_bboxes):
     min_distance = 3
     min_sample = 2
     new_layout_bboxes = []
-
+    # add bbox_fs for para split calculation
+    for block in blocks:
+        block["bbox_fs"] = copy.deepcopy(block["bbox"])
     for layout_box in layout_bboxes:
         blocks_in_layoutbox = [b for b in blocks if
                                b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
@@ -242,16 +249,15 @@ def __valign_lines(blocks, layout_bboxes):
         # Since the line widths inside the block were modified, the block bbox must be recomputed.
         for block in blocks_in_layoutbox:
             if len(block["lines"]) > 0:
-                block['bbox'] = [min([line['bbox'][0] for line in block['lines']]),
-                                 min([line['bbox'][1] for line in block['lines']]),
-                                 max([line['bbox'][2] for line in block['lines']]),
-                                 max([line['bbox'][3] for line in block['lines']])]
-
+                block['bbox_fs'] = [min([line['bbox'][0] for line in block['lines']]),
+                                    min([line['bbox'][1] for line in block['lines']]),
+                                    max([line['bbox'][2] for line in block['lines']]),
+                                    max([line['bbox'][3] for line in block['lines']])]
         """Recompute the layout bbox, because the block bboxes changed."""
-        layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox])
-        layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox])
-        layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox])
-        layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox])
+        layout_x0 = min([block['bbox_fs'][0] for block in blocks_in_layoutbox])
+        layout_y0 = min([block['bbox_fs'][1] for block in blocks_in_layoutbox])
+        layout_x1 = max([block['bbox_fs'][2] for block in blocks_in_layoutbox])
+        layout_y1 = max([block['bbox_fs'][3] for block in blocks_in_layoutbox])
         new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
 
     return new_layout_bboxes
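
The bbox_fs changes above are the core of this hunk: 0.8.0 stops overwriting block['bbox'] during vertical alignment and writes the recomputed box to a separate bbox_fs key, so the original geometry survives for later consumers. A small sketch of the recomputation, assuming the same block/line dict shape:

import copy

# bbox_fs is the union of the (possibly shrunken) line boxes; block['bbox']
# keeps its original value. Sketch only; dict shapes follow the hunk above.
def recompute_bbox_fs(block: dict) -> None:
    block["bbox_fs"] = copy.deepcopy(block["bbox"])
    if block.get("lines"):
        lines = block["lines"]
        block["bbox_fs"] = [min(l["bbox"][0] for l in lines),
                            min(l["bbox"][1] for l in lines),
                            max(l["bbox"][2] for l in lines),
                            max(l["bbox"][3] for l in lines)]

block = {"bbox": [10, 10, 200, 50],
         "lines": [{"bbox": [12, 10, 180, 30]}, {"bbox": [12, 30, 150, 48]}]}
recompute_bbox_fs(block)
print(block["bbox"])     # [10, 10, 200, 50] -- unchanged
print(block["bbox_fs"])  # [12, 10, 180, 48] -- tight union of the lines
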
@@ -309,7 +315,7 @@ def __group_line_by_layout(blocks, layout_bboxes):
     # Currently each block is a single line, so one block is one paragraph.
     blocks_group = []
     for lyout in layout_bboxes:
-        blocks_in_layout = [block for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox'])]
+        blocks_in_layout = [block for block in blocks if is_in_layout(block.get('bbox_fs', None), lyout['layout_bbox'])]
         blocks_group.append(blocks_in_layout)
     return blocks_group
 
@@ -362,7 +368,8 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
         for i in range(0, len(list_start)):
             index = list_start[i] - 1
             if index >= 0:
-                if "content" in lines[index]["spans"][-1]:
+                if "content" in lines[index]["spans"][-1] and lines[index]["spans"][-1].get('type', '') not in [
+                        ContentType.InlineEquation, ContentType.InterlineEquation]:
                     lines[index]["spans"][-1]["content"] += '\n\n'
         layout_list_info = [False, False]  # whether this layout starts and ends with a list, recorded per layout
         for content_type, start, end in text_segments:
@@ -474,7 +481,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
             break
     # If these lines share the same indentation, attach them to the last paragraph of the previous layout.
     if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
-        #pre_page_paras[-1].append(may_list_lines)
+        # pre_page_paras[-1].append(may_list_lines)
         # Merge the next page into the last paragraph of the previous page and tag it cross_page.
         for line in may_list_lines:
             for span in line["spans"]:
@@ -534,7 +541,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
     next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
     next_first_line_type = next_first_line['spans'][0]['type']
     if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
-        #connected_layout_paras.append(layout_paras[i])
         connected_layout_blocks.append(blocks_group[i])
         continue
     pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)
@@ -549,10 +555,8 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
             -1] not in LINE_STOP_FLAG and \
             next_first_line['bbox'][0] == next_x0_min:  # The previous line fills the whole row with no closing punctuation, and the next line has no leading whitespace.
         """The join condition holds: connect the previous layout's last paragraph with the next layout's first."""
-        #connected_layout_paras[-1][-1].extend(layout_paras[i][0])
         connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"])
-        #layout_paras[i].pop(0) # Delete the next layout's first paragraph; it was merged into the previous layout's last paragraph.
-        blocks_group[i][0]["lines"] = [] # Clear the lines of the next layout's first paragraph; they were merged into the previous layout's last paragraph
+        blocks_group[i][0]["lines"] = []  # Clear the lines of the next layout's first paragraph; they were merged into the previous layout's last paragraph
         blocks_group[i][0][LINES_DELETED] = True
         # if len(layout_paras[i]) == 0:
         #     layout_paras.pop(i)
@@ -561,7 +565,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
         connected_layout_blocks.append(blocks_group[i])
     else:
         """The join condition does not hold: append the previous layout's paragraphs to the result."""
-        #connected_layout_paras.append(layout_paras[i])
         connected_layout_blocks.append(blocks_group[i])
     return connected_layout_blocks
 
@@ -619,7 +622,7 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
             span[CROSS_PAGE] = True
     pre_last_para.extend(next_first_para)
 
-    #next_page_paras[0].pop(0) # Delete the next page's first paragraph; it was merged into the previous page's last paragraph.
+    # next_page_paras[0].pop(0)  # Delete the next page's first paragraph; it was merged into the previous page's last paragraph.
    next_page_paras[0][0]["lines"] = []
     next_page_paras[0][0][LINES_DELETED] = True
     return True
@@ -663,16 +666,15 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
     layout_box = new_layout_bbox[layout_i]
     single_line_paras_tag = []
     for i in range(len(layout_para)):
-        #single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
+        # single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
         single_line_paras_tag.append(layout_para[i]['type'] == BlockType.Text and len(layout_para[i]["lines"]) == 1)
     """Find consecutive single-line text blocks; if those consecutive lines share the same height, merge them into one paragraph."""
     consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag)
     if len(consecutive_single_line_indices) > 0:
-        #index_offset = 0
         """Check whether these lines are equal in height and centered."""
         for start, end in consecutive_single_line_indices:
-            #start += index_offset
-            #end += index_offset
+            # start += index_offset
+            # end += index_offset
             line_hi = np.array([block["lines"][0]['bbox'][3] - block["lines"][0]['bbox'][1] for block in
                                 layout_para[start:end + 1]])
             first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']])
@@ -697,9 +699,9 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
             for i_para in range(start + 1, end + 1):
                 layout_para[i_para]["lines"] = []
                 layout_para[i_para][LINES_DELETED] = True
-            #layout_para[start:end + 1] = [merge_para]
+            # layout_para[start:end + 1] = [merge_para]
 
-            #index_offset -= end - start
+            # index_offset -= end - start
 
     return
 
@@ -739,7 +741,7 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
     new_layout_of_pages = []  # array of arrays; each element holds one page's layouts
     all_page_list_info = []  # whether each page starts and ends with a list
     for page_num, page in pdf_info_dict.items():
-        blocks = page['preproc_blocks']
+        blocks = copy.deepcopy(page['preproc_blocks'])
         layout_bboxes = page['layout_bboxes']
         new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
         new_layout_of_pages.append(new_layout_bbox)
magic_pdf/pdf_parse_union_core.py CHANGED
@@ -41,6 +41,23 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
     return is_useful_block_horz_overlap, all_bboxes
 
 
+def __replace_STX_ETX(text_str: str):
+    """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
+    Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
+
+    Args:
+        text_str (str): raw text
+
+    Returns:
+        _type_: replaced text
+    """
+    if text_str:
+        s = text_str.replace('\u0002', "'")
+        s = s.replace("\u0003", "'")
+        return s
+    return text_str
+
+
 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
     text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
     char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
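
__replace_STX_ETX maps the control characters U+0002 (STX) and U+0003 (ETX), which pymupdf sometimes emits where the PDF had quotation marks, back to apostrophes. A standalone check of the same substitution:

# Standalone version of the substitution added above.
def replace_stx_etx(text: str) -> str:
    if text:
        return text.replace("\u0002", "'").replace("\u0003", "'")
    return text

print(replace_stx_etx("It\u0002s a \u0003quoted\u0002 word"))  # It's a 'quoted' word
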
@@ -63,7 +80,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
             spans.append(
                 {
                     "bbox": list(span["bbox"]),
-                    "content": span["text"],
+                    "content": __replace_STX_ETX(span["text"]),
                     "type": ContentType.Text,
                     "score": 1.0,
                 }
@@ -175,7 +192,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
 
     '''Fill the spans into the sorted blocks.'''
-    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.6)
+    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.3)
 
     '''Apply fix operations to the blocks.'''
     fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
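
Lowering the fill threshold from 0.6 to 0.3 means a span now needs only 30% of its area inside a block to be assigned to it, which tolerates looser layout boxes. A sketch of that kind of area-ratio test (hypothetical helper; the real logic lives in fill_spans_in_blocks):

# Hypothetical area-ratio test: a span joins a block when the overlap covers
# at least `ratio` of the span's own area (0.3 in 0.8.0, down from 0.6).
def span_in_block(span_bbox, block_bbox, ratio=0.3) -> bool:
    sx0, sy0, sx1, sy1 = span_bbox
    bx0, by0, bx1, by1 = block_bbox
    ox0, oy0 = max(sx0, bx0), max(sy0, by0)
    ox1, oy1 = min(sx1, bx1), min(sy1, by1)
    if ox0 >= ox1 or oy0 >= oy1:
        return False  # no overlap at all
    overlap = (ox1 - ox0) * (oy1 - oy0)
    span_area = (sx1 - sx0) * (sy1 - sy0)
    return overlap / span_area >= ratio

print(span_in_block([0, 0, 10, 10], [5, 0, 20, 10]))  # True: 50% overlap >= 0.3
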
@@ -208,13 +225,17 @@ def pdf_parse_union(pdf_bytes,
     magic_model = MagicModel(model_list, pdf_docs)
 
     '''Parse the pdf according to the requested page range.'''
-    end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
+    # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
+    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf_docs) - 1
+
+    if end_page_id > len(pdf_docs) - 1:
+        logger.warning("end_page_id is out of range, use pdf_docs length")
+        end_page_id = len(pdf_docs) - 1
 
     '''Initialize the start time.'''
     start_time = time.time()
 
-    for page_id in range(start_page_id, end_page_id + 1):
-
+    for page_id, page in enumerate(pdf_docs):
         '''In debug mode, log each page's parse time.'''
         if debug_mode:
             time_now = time.time()
@@ -224,7 +245,14 @@ def pdf_parse_union(pdf_bytes,
             start_time = time_now
 
         '''Parse each page of the pdf.'''
-        page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
+        if start_page_id <= page_id <= end_page_id:
+            page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
+        else:
+            page_w = page.rect.width
+            page_h = page.rect.height
+            page_info = ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
+                                                        [], [], [], [],
+                                                        True, "skip page")
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     """Paragraph splitting."""
magic_pdf/pipe/AbsPipe.py CHANGED
@@ -16,12 +16,15 @@ class AbsPipe(ABC):
     PIP_OCR = "ocr"
     PIP_TXT = "txt"
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
         self.pdf_mid_data = None  # uncompressed
         self.is_debug = is_debug
+        self.start_page_id = start_page_id
+        self.end_page_id = end_page_id
 
     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)
magic_pdf/pipe/OCRPipe.py CHANGED
@@ -9,17 +9,20 @@ from magic_pdf.user_api import parse_ocr_pdf
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
+        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
magic_pdf/pipe/TXTPipe.py CHANGED
@@ -10,17 +10,20 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
+        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
magic_pdf/pipe/UNIPipe.py CHANGED
@@ -13,9 +13,10 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
+    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
         self.pdf_type = jso_useful_key["_pdf_type"]
-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
@@ -26,17 +27,21 @@ class UNIPipe(AbsPipe):
 
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
         elif self.pdf_type == self.PIP_OCR:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty)
+                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
+                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                              is_debug=self.is_debug)
+                                              is_debug=self.is_debug,
+                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
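
All three pipes now accept start_page_id/end_page_id and thread them through doc_analyze and the parse functions. A hedged usage sketch — the UNIPipe flow and the jso_useful_key shape follow this diff, while DiskReaderWriter and the file paths are assumptions for illustration:

from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter  # assumed writer; any AbsReaderWriter should work

pdf_bytes = open("demo.pdf", "rb").read()              # placeholder path
image_writer = DiskReaderWriter("/tmp/mineru_images")  # placeholder path
jso_useful_key = {"_pdf_type": "", "model_list": []}   # empty model_list: pipe_analyze runs the models

pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer,
               start_page_id=0, end_page_id=4)  # pages 5+ come back as "skip page" placeholders
pipe.pipe_classify()
pipe.pipe_analyze()  # model inference restricted to the selected range
pipe.pipe_parse()
print(type(pipe.pdf_mid_data))  # the uncompressed middle data filled by pipe_parse
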
magic_pdf/pre_proc/ocr_detect_all_bboxes.py CHANGED
@@ -133,6 +133,7 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
 
 
 def remove_overlaps_min_blocks(all_bboxes):
+    # An overlapping small block cannot simply be deleted; it must be merged with the larger one into a bigger block.
     # Remove the smaller of the overlapping blocks
     need_remove = []
     for block1 in all_bboxes:
@@ -142,9 +143,17 @@ def remove_overlaps_min_blocks(all_bboxes):
             block2_bbox = block2[:4]
             overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
             if overlap_box is not None:
-                bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
-                if bbox_to_remove is not None and bbox_to_remove not in need_remove:
-                    need_remove.append(bbox_to_remove)
+                block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
+                if block_to_remove is not None and block_to_remove not in need_remove:
+                    large_block = block1 if block1 != block_to_remove else block2
+                    x1, y1, x2, y2 = large_block[:4]
+                    sx1, sy1, sx2, sy2 = block_to_remove[:4]
+                    x1 = min(x1, sx1)
+                    y1 = min(y1, sy1)
+                    x2 = max(x2, sx2)
+                    y2 = max(y2, sy2)
+                    large_block[:4] = [x1, y1, x2, y2]
+                    need_remove.append(block_to_remove)
 
     if len(need_remove) > 0:
         for block in need_remove:
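
Instead of silently dropping the smaller of two blocks that overlap by more than 80%, remove_overlaps_min_blocks now grows the larger block to the union of the two boxes before queuing the smaller one for removal. The union step on its own:

# The bbox-union step from the hunk above: the larger block absorbs the
# smaller one, so no covered area is lost when the small block is removed.
def absorb(large_block: list, small_block: list) -> None:
    x1, y1, x2, y2 = large_block[:4]
    sx1, sy1, sx2, sy2 = small_block[:4]
    large_block[:4] = [min(x1, sx1), min(y1, sy1), max(x2, sx2), max(y2, sy2)]

large = [10, 10, 100, 50, "text"]
small = [90, 40, 120, 60, "text"]
absorb(large, small)
print(large)  # [10, 10, 120, 60, 'text']
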