magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +5 -5
  12. magic_pdf/libs/draw_bbox.py +3 -2
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +122 -76
  18. magic_pdf/model/sub_modules/model_init.py +40 -35
  19. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  21. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  22. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  23. magic_pdf/para/para_split.py +411 -248
  24. magic_pdf/para/para_split_v2.py +352 -182
  25. magic_pdf/para/para_split_v3.py +110 -53
  26. magic_pdf/pdf_parse_by_ocr.py +2 -0
  27. magic_pdf/pdf_parse_by_txt.py +2 -0
  28. magic_pdf/pdf_parse_union_core.py +174 -100
  29. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  30. magic_pdf/pipe/AbsPipe.py +28 -44
  31. magic_pdf/pipe/OCRPipe.py +5 -5
  32. magic_pdf/pipe/TXTPipe.py +5 -6
  33. magic_pdf/pipe/UNIPipe.py +24 -25
  34. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  35. magic_pdf/pre_proc/cut_image.py +9 -11
  36. magic_pdf/pre_proc/equations_replace.py +203 -212
  37. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  38. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  39. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  40. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  41. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  42. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  43. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  44. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  45. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  46. magic_pdf/spark/spark_api.py +15 -17
  47. magic_pdf/tools/cli.py +3 -4
  48. magic_pdf/tools/cli_dev.py +6 -9
  49. magic_pdf/tools/common.py +26 -36
  50. magic_pdf/user_api.py +29 -38
  51. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
  53. magic_pdf/libs/Constants.py +0 -55
  54. magic_pdf/libs/MakeContentConfig.py +0 -11
  55. magic_pdf/libs/drop_reason.py +0 -27
  56. magic_pdf/libs/drop_tag.py +0 -19
  57. magic_pdf/para/para_pipeline.py +0 -297
  58. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  59. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  60. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -7,18 +7,32 @@ from typing import List
7
7
  import torch
8
8
  from loguru import logger
9
9
 
10
+ from magic_pdf.config.drop_reason import DropReason
10
11
  from magic_pdf.config.enums import SupportedPdfParseMethod
12
+ from magic_pdf.config.ocr_content_type import BlockType, ContentType
11
13
  from magic_pdf.data.dataset import Dataset, PageableData
12
14
  from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
13
15
  from magic_pdf.libs.clean_memory import clean_memory
14
16
  from magic_pdf.libs.commons import fitz, get_delta_time
15
17
  from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
16
18
  from magic_pdf.libs.convert_utils import dict_to_list
17
- from magic_pdf.libs.drop_reason import DropReason
18
19
  from magic_pdf.libs.hash_utils import compute_md5
19
20
  from magic_pdf.libs.local_math import float_equal
20
- from magic_pdf.libs.ocr_content_type import ContentType, BlockType
21
+ from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
21
22
  from magic_pdf.model.magic_model import MagicModel
23
+
24
# Keep albumentations from checking for updates.
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'
# Disable the YOLO logger.
os.environ['YOLO_VERBOSE'] = 'False'

try:
    import torchtext
except ImportError:
    # torchtext is optional; nothing to silence when it is not installed.
    pass
else:
    # Compare the release numerically. A plain string comparison is
    # lexicographic, so '0.9.0' >= '0.18.0' would evaluate to True.
    try:
        _torchtext_release = tuple(
            int(part)
            for part in torchtext.__version__.split('+')[0].split('.')[:2]
        )
    except ValueError:
        # Unparseable version string: treat it as older than 0.18 and skip the call.
        _torchtext_release = (0, 0)
    if _torchtext_release >= (0, 18):
        torchtext.disable_torchtext_deprecation_warning()
34
+ from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
35
+
22
36
  from magic_pdf.para.para_split_v3 import para_split
23
37
  from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
24
38
  from magic_pdf.pre_proc.construct_page_dict import \
@@ -30,8 +44,8 @@ from magic_pdf.pre_proc.equations_replace import (
30
44
  from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
31
45
  ocr_prepare_bboxes_for_layout_split_v2
32
46
  from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
33
- fix_discarded_block,
34
- fix_block_spans_v2)
47
+ fix_block_spans_v2,
48
+ fix_discarded_block)
35
49
  from magic_pdf.pre_proc.ocr_span_list_modify import (
36
50
  get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
37
51
  remove_overlaps_min_spans)
@@ -74,7 +88,151 @@ def __replace_STX_ETX(text_str: str):
74
88
  return text_str
75
89
 
76
90
 
77
- def txt_spans_extract(pdf_page, inline_equations, interline_equations):
91
def chars_to_content(span):
    """Collapse ``span['chars']`` into ``span['content']``, in place.

    The characters are ordered left to right by the horizontal centre of
    their bbox and concatenated. A single space is inserted between two
    neighbouring characters when the gap between them is wider than the
    average character width of the span. The ``chars`` key is removed from
    the span afterwards.

    Args:
        span: dict with a ``chars`` list; each char is a dict with a ``bbox``
            (x0, y0, x1, y1) and the character itself under ``c``.

    Returns:
        None. ``span`` is mutated: ``content`` is set and ``chars`` deleted.
    """
    # Sort by the x coordinate of each character's bbox centre.
    chars = sorted(span['chars'], key=lambda ch: (ch['bbox'][0] + ch['bbox'][2]) / 2)

    if not chars:
        span['content'] = ''
        del span['chars']
        return

    # Average character width, used as the threshold for inserting a space.
    char_avg_width = sum(ch['bbox'][2] - ch['bbox'][0] for ch in chars) / len(chars)

    pieces = []
    prev_x1 = None  # right edge of the previous character; None for the first one
    for ch in chars:
        # Insert a space when this character's x0 is more than one average
        # character width away from the previous character's x1. Tracking the
        # previous character directly avoids a list.index() lookup, which
        # wrapped around to the last character for the first element and
        # picked the wrong neighbour when two char dicts compared equal.
        if prev_x1 is not None and ch['bbox'][0] - prev_x1 > char_avg_width:
            pieces.append(' ')
        pieces.append(ch['c'])
        prev_x1 = ch['bbox'][2]

    span['content'] = __replace_STX_ETX(''.join(pieces))
    del span['chars']
115
+
116
+
117
# Punctuation that can close a line. Each ASCII mark is paired with its
# fullwidth (CJK) counterpart, written as an escape so the entry survives
# Unicode normalisation of this file:
#   \uff01 fullwidth '!', \uff1f fullwidth '?', \uff09 fullwidth ')',
#   \uff1a fullwidth ':', \uff1b fullwidth ';', \uff5d fullwidth '}',
#   \uff0c fullwidth ','.
LINE_STOP_FLAG = (
    '.', '!', '?', '。', '\uff01', '\uff1f', ')', '\uff09', '"', '”',
    ':', '\uff1a', ';', '\uff1b', ']', '】', '}', '\uff5d', '>', '》',
    '、', ',', '\uff0c', '-', '—', '–',
)


def fill_char_in_spans(spans, all_chars):
    """Assign each character to the first span that contains it, then build span text.

    Args:
        spans: list of span dicts, each with a ``bbox`` and a ``chars`` list
            that receives the matching characters.
        all_chars: list of char dicts, each with a ``bbox`` and the character
            under ``c``.

    Returns:
        None. Every span is mutated: its ``chars`` are converted to
        ``content`` by ``chars_to_content``.
    """
    for char in all_chars:
        # Whether the char is a line-stop mark depends only on the char, so
        # compute it once instead of once per candidate span.
        char_is_line_stop_flag = char['c'] in LINE_STOP_FLAG
        for span in spans:
            if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
                span['chars'].append(char)
                # A character belongs to at most one span: the first match wins.
                break

    for span in spans:
        chars_to_content(span)
133
+
134
+
135
+ # 使用鲁棒性更强的中心点坐标判断
136
def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
    """Decide whether a character belongs to a span using centre-point tests.

    A character is accepted when its vertical centre lies inside the span and
    within a quarter of the span height of the span's vertical centre, and
    additionally either
      * its horizontal centre lies strictly inside the span, or
      * it is a line-stop mark whose left edge falls within one span height
        of the span's right edge and whose horizontal centre is to the right
        of the span's left edge. This gives trailing punctuation that sticks
        out of the span a chance to be attached to it.

    Args:
        char_bbox: (x0, y0, x1, y1) of the character.
        span_bbox: (x0, y0, x1, y1) of the span.
        char_is_line_stop_flag: True when the character is a line-stop mark.

    Returns:
        bool: True if the character belongs to the span, otherwise False.
    """
    char_center_x = (char_bbox[0] + char_bbox[2]) / 2
    char_center_y = (char_bbox[1] + char_bbox[3]) / 2
    span_center_y = (span_bbox[1] + span_bbox[3]) / 2
    span_height = span_bbox[3] - span_bbox[1]

    # Both acceptance rules share the same vertical requirement: the char's
    # mid-line must be inside the span and no more than 1/4 of the span
    # height away from the span's mid-line.
    vertically_aligned = (
        span_bbox[1] < char_center_y < span_bbox[3]
        and abs(char_center_y - span_center_y) < span_height / 4
    )
    if not vertically_aligned:
        return False

    if span_bbox[0] < char_center_x < span_bbox[2]:
        return True

    if char_is_line_stop_flag:
        # Line-stop marks are judged by their left edge instead of their
        # centre, and must sit close to the right edge of the span.
        return (
            (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
            and char_center_x > span_bbox[0]
        )

    return False
161
+
162
+
163
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
    """Fill text spans with characters read from the PDF page, with OCR as fallback.

    Text-type spans that overlap (area ratio > 0.5) a layout block other than
    an image body, table body or interline equation, or that overlap a
    discarded block, get their ``content`` rebuilt from the characters of the
    page's ``rawdict`` text. Spans whose content is still empty afterwards are
    removed from ``spans`` and recognised with the OCR model; they are
    appended back only when OCR returns non-empty text with a score above 0.5.

    Args:
        pdf_page: page object providing ``get_text('rawdict', ...)``; it is
            also handed to ``cut_image_to_pil_image`` for cropping.
        spans: list of span dicts with ``bbox`` and ``type``; mutated in place.
        all_bboxes: layout blocks; ``block[0:4]`` is used as the bbox and
            ``block[7]`` as the block type.
        all_discarded_blocks: discarded blocks; ``block[0:4]`` is the bbox.
        lang: forwarded to the OCR model as ``lang``.

    Returns:
        The same ``spans`` list, after the modifications described above.
    """
    skipped_block_types = [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]

    def _in_text_block(candidate):
        # True when the span mostly lies inside a layout block that carries text.
        return any(
            block[7] not in skipped_block_types
            and calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block[0:4]) > 0.5
            for block in all_bboxes
        )

    def _in_discarded_block(candidate):
        # True when the span mostly lies inside a discarded block.
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block[0:4]) > 0.5
            for block in all_discarded_blocks
        )

    useful_spans = []
    unuseful_spans = []
    for span in spans:
        if _in_text_block(span):
            useful_spans.append(span)
        elif _in_discarded_block(span):
            # `elif` keeps a span that matches both kinds of block from being
            # queued twice; a duplicate would be converted twice and fail on
            # the second pass because its 'chars' key is already deleted.
            unuseful_spans.append(span)

    text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']

    # TODO: once the chars are collected, drop the strongly slanted ones first.
    all_pymu_chars = []
    for block in text_blocks:
        for line in block['lines']:
            for pymu_span in line['spans']:
                all_pymu_chars.extend(pymu_span['chars'])

    # Only text spans receive characters; useful spans come first so they
    # take priority when a character could fit several spans.
    new_spans = []
    for span in useful_spans + unuseful_spans:
        if span['type'] in [ContentType.Text]:
            span['chars'] = []
            new_spans.append(span)

    fill_char_in_spans(new_spans, all_pymu_chars)

    empty_spans = [span for span in new_spans if len(span['content']) == 0]
    if not empty_spans:
        return spans

    # Initialise the OCR model only when there is something to recognise.
    atom_model_manager = AtomModelSingleton()
    ocr_model = atom_model_manager.get_atom_model(
        atom_model_name="ocr",
        ocr_show_log=False,
        det_db_box_thresh=0.3,
        lang=lang
    )

    for span in empty_spans:
        spans.remove(span)
        # Crop the span's bbox from the page and run recognition only (no detection).
        span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
        ocr_res = ocr_model.ocr(span_img, det=False)
        if ocr_res and len(ocr_res[0]) > 0:
            ocr_text, ocr_score = ocr_res[0][0]
            if ocr_score > 0.5 and len(ocr_text) > 0:
                span['content'] = ocr_text
                spans.append(span)

    return spans
233
+
234
+
235
+ def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
78
236
  text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
79
237
  char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
80
238
  'blocks'
@@ -164,8 +322,8 @@ class ModelSingleton:
164
322
 
165
323
 
166
324
  def do_predict(boxes: List[List[int]], model) -> List[int]:
167
- from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (boxes2inputs, parse_logits,
168
- prepare_inputs)
325
+ from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
326
+ boxes2inputs, parse_logits, prepare_inputs)
169
327
 
170
328
  inputs = boxes2inputs(boxes)
171
329
  inputs = prepare_inputs(inputs, model)
@@ -206,7 +364,9 @@ def cal_block_index(fix_blocks, sorted_bboxes):
206
364
  del block['real_lines']
207
365
 
208
366
  import numpy as np
209
- from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut
367
+
368
+ from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
369
+ recursive_xy_cut
210
370
 
211
371
  random_boxes = np.array(block_bboxes)
212
372
  np.random.shuffle(random_boxes)
@@ -291,7 +451,7 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
291
451
  page_line_list.append(bbox)
292
452
  elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
293
453
  bbox = block['bbox']
294
- block["real_lines"] = copy.deepcopy(block['lines'])
454
+ block['real_lines'] = copy.deepcopy(block['lines'])
295
455
  lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
296
456
  block['lines'] = []
297
457
  for line in lines:
@@ -462,18 +622,16 @@ def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
462
622
 
463
623
 
464
624
  def parse_page_core(
465
- page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
625
+ page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
466
626
  ):
467
627
  need_drop = False
468
628
  drop_reason = []
469
629
 
470
630
  """从magic_model对象中获取后面会用到的区块信息"""
471
- # img_blocks = magic_model.get_imgs(page_id)
472
- # table_blocks = magic_model.get_tables(page_id)
473
-
474
631
  img_groups = magic_model.get_imgs_v2(page_id)
475
632
  table_groups = magic_model.get_tables_v2(page_id)
476
633
 
634
+ """对image和table的区块分组"""
477
635
  img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
478
636
  img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
479
637
  )
@@ -517,38 +675,20 @@ def parse_page_core(
517
675
  page_h,
518
676
  )
519
677
 
678
+ """获取所有的spans信息"""
520
679
  spans = magic_model.get_all_spans(page_id)
521
680
 
522
- """根据parse_mode,构造spans"""
523
- if parse_mode == SupportedPdfParseMethod.TXT:
524
- """ocr 中文本类的 span 用 pymu spans 替换!"""
525
- pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
526
- spans = replace_text_span(pymu_spans, spans)
527
- elif parse_mode == SupportedPdfParseMethod.OCR:
528
- pass
529
- else:
530
- raise Exception('parse_mode must be txt or ocr')
531
-
532
681
  """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span"""
533
682
  """顺便删除大水印并保留abandon的span"""
534
683
  spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
535
684
 
536
- """删除重叠spans中置信度较低的那些"""
537
- spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
538
- """删除重叠spans中较小的那些"""
539
- spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
540
- """对image和table截图"""
541
- spans = ocr_cut_image_and_table(
542
- spans, page_doc, page_id, pdf_bytes_md5, imageWriter
543
- )
544
-
545
685
  """先处理不需要排版的discarded_blocks"""
546
686
  discarded_block_with_spans, spans = fill_spans_in_blocks(
547
687
  all_discarded_blocks, spans, 0.4
548
688
  )
549
689
  fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
550
690
 
551
- """如果当前页面没有bbox则跳过"""
691
+ """如果当前页面没有有效的bbox则跳过"""
552
692
  if len(all_bboxes) == 0:
553
693
  logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
554
694
  return ocr_construct_page_component_v2(
@@ -566,7 +706,32 @@ def parse_page_core(
566
706
  drop_reason,
567
707
  )
568
708
 
569
- """将span填入blocks中"""
709
+ """删除重叠spans中置信度较低的那些"""
710
+ spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
711
+ """删除重叠spans中较小的那些"""
712
+ spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
713
+
714
+ """根据parse_mode,构造spans,主要是文本类的字符填充"""
715
+ if parse_mode == SupportedPdfParseMethod.TXT:
716
+
717
+ """之前的公式替换方案"""
718
+ # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
719
+ # spans = replace_text_span(pymu_spans, spans)
720
+
721
+ """ocr 中文本类的 span 用 pymu spans 替换!"""
722
+ spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
723
+
724
+ elif parse_mode == SupportedPdfParseMethod.OCR:
725
+ pass
726
+ else:
727
+ raise Exception('parse_mode must be txt or ocr')
728
+
729
+ """对image和table截图"""
730
+ spans = ocr_cut_image_and_table(
731
+ spans, page_doc, page_id, pdf_bytes_md5, imageWriter
732
+ )
733
+
734
+ """span填充进block"""
570
735
  block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
571
736
 
572
737
  """对block进行fix操作"""
@@ -616,6 +781,7 @@ def pdf_parse_union(
616
781
  start_page_id=0,
617
782
  end_page_id=None,
618
783
  debug_mode=False,
784
+ lang=None,
619
785
  ):
620
786
  pdf_bytes_md5 = compute_md5(dataset.data_bits())
621
787
 
@@ -652,7 +818,7 @@ def pdf_parse_union(
652
818
  """解析pdf中的每一页"""
653
819
  if start_page_id <= page_id <= end_page_id:
654
820
  page_info = parse_page_core(
655
- page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
821
+ page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
656
822
  )
657
823
  else:
658
824
  page_info = page.get_page_info()
@@ -664,7 +830,7 @@ def pdf_parse_union(
664
830
  pdf_info_dict[f'page_{page_id}'] = page_info
665
831
 
666
832
  """分段"""
667
- para_split(pdf_info_dict, debug_mode=debug_mode)
833
+ para_split(pdf_info_dict)
668
834
 
669
835
  """dict转list"""
670
836
  pdf_info_list = dict_to_list(pdf_info_dict)
magic_pdf/pipe/AbsPipe.py CHANGED
@@ -1,22 +1,20 @@
1
1
  from abc import ABC, abstractmethod
2
2
 
3
+ from magic_pdf.config.drop_reason import DropReason
4
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
5
+ from magic_pdf.data.data_reader_writer import DataWriter
3
6
  from magic_pdf.dict2md.ocr_mkcontent import union_make
4
7
  from magic_pdf.filter.pdf_classify_by_type import classify
5
8
  from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
6
- from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
7
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
8
- from magic_pdf.libs.drop_reason import DropReason
9
9
  from magic_pdf.libs.json_compressor import JsonCompressor
10
10
 
11
11
 
12
12
  class AbsPipe(ABC):
13
- """
14
- txt和ocr处理的抽象类
15
- """
16
- PIP_OCR = "ocr"
17
- PIP_TXT = "txt"
13
+ """txt和ocr处理的抽象类."""
14
+ PIP_OCR = 'ocr'
15
+ PIP_TXT = 'txt'
18
16
 
19
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
17
+ def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
20
18
  start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
21
19
  self.pdf_bytes = pdf_bytes
22
20
  self.model_list = model_list
@@ -29,29 +27,23 @@ class AbsPipe(ABC):
29
27
  self.layout_model = layout_model
30
28
  self.formula_enable = formula_enable
31
29
  self.table_enable = table_enable
32
-
30
+
33
31
  def get_compress_pdf_mid_data(self):
34
32
  return JsonCompressor.compress_json(self.pdf_mid_data)
35
33
 
36
34
  @abstractmethod
37
35
  def pipe_classify(self):
38
- """
39
- 有状态的分类
40
- """
36
+ """有状态的分类."""
41
37
  raise NotImplementedError
42
38
 
43
39
  @abstractmethod
44
40
  def pipe_analyze(self):
45
- """
46
- 有状态的跑模型分析
47
- """
41
+ """有状态的跑模型分析."""
48
42
  raise NotImplementedError
49
43
 
50
44
  @abstractmethod
51
45
  def pipe_parse(self):
52
- """
53
- 有状态的解析
54
- """
46
+ """有状态的解析."""
55
47
  raise NotImplementedError
56
48
 
57
49
  def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
@@ -64,27 +56,25 @@ class AbsPipe(ABC):
64
56
 
65
57
  @staticmethod
66
58
  def classify(pdf_bytes: bytes) -> str:
67
- """
68
- 根据pdf的元数据,判断是文本pdf,还是ocr pdf
69
- """
59
+ """根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
70
60
  pdf_meta = pdf_meta_scan(pdf_bytes)
71
- if pdf_meta.get("_need_drop", False): # 如果返回了需要丢弃的标志,则抛出异常
61
+ if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
72
62
  raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
73
63
  else:
74
- is_encrypted = pdf_meta["is_encrypted"]
75
- is_needs_password = pdf_meta["is_needs_password"]
64
+ is_encrypted = pdf_meta['is_encrypted']
65
+ is_needs_password = pdf_meta['is_needs_password']
76
66
  if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
77
- raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}")
67
+ raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
78
68
  else:
79
69
  is_text_pdf, results = classify(
80
- pdf_meta["total_page"],
81
- pdf_meta["page_width_pts"],
82
- pdf_meta["page_height_pts"],
83
- pdf_meta["image_info_per_page"],
84
- pdf_meta["text_len_per_page"],
85
- pdf_meta["imgs_per_page"],
86
- pdf_meta["text_layout_per_page"],
87
- pdf_meta["invalid_chars"],
70
+ pdf_meta['total_page'],
71
+ pdf_meta['page_width_pts'],
72
+ pdf_meta['page_height_pts'],
73
+ pdf_meta['image_info_per_page'],
74
+ pdf_meta['text_len_per_page'],
75
+ pdf_meta['imgs_per_page'],
76
+ pdf_meta['text_layout_per_page'],
77
+ pdf_meta['invalid_chars'],
88
78
  )
89
79
  if is_text_pdf:
90
80
  return AbsPipe.PIP_TXT
@@ -93,22 +83,16 @@ class AbsPipe(ABC):
93
83
 
94
84
  @staticmethod
95
85
  def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
96
- """
97
- 根据pdf类型,生成统一格式content_list
98
- """
86
+ """根据pdf类型,生成统一格式content_list."""
99
87
  pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
100
- pdf_info_list = pdf_mid_data["pdf_info"]
88
+ pdf_info_list = pdf_mid_data['pdf_info']
101
89
  content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
102
90
  return content_list
103
91
 
104
92
  @staticmethod
105
93
  def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
106
- """
107
- 根据pdf类型,markdown
108
- """
94
+ """根据pdf类型,markdown."""
109
95
  pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
110
- pdf_info_list = pdf_mid_data["pdf_info"]
96
+ pdf_info_list = pdf_mid_data['pdf_info']
111
97
  md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
112
98
  return md_content
113
-
114
-
magic_pdf/pipe/OCRPipe.py CHANGED
@@ -1,15 +1,15 @@
1
1
  from loguru import logger
2
2
 
3
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
3
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
4
+ from magic_pdf.data.data_reader_writer import DataWriter
4
5
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
5
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
6
6
  from magic_pdf.pipe.AbsPipe import AbsPipe
7
7
  from magic_pdf.user_api import parse_ocr_pdf
8
8
 
9
9
 
10
10
  class OCRPipe(AbsPipe):
11
11
 
12
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
12
+ def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
13
13
  start_page_id=0, end_page_id=None, lang=None,
14
14
  layout_model=None, formula_enable=None, table_enable=None):
15
15
  super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
@@ -32,10 +32,10 @@ class OCRPipe(AbsPipe):
32
32
 
33
33
  def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
34
34
  result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
35
- logger.info("ocr_pipe mk content list finished")
35
+ logger.info('ocr_pipe mk content list finished')
36
36
  return result
37
37
 
38
38
  def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
39
39
  result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
40
- logger.info(f"ocr_pipe mk {md_make_mode} finished")
40
+ logger.info(f'ocr_pipe mk {md_make_mode} finished')
41
41
  return result
magic_pdf/pipe/TXTPipe.py CHANGED
@@ -1,16 +1,15 @@
1
1
  from loguru import logger
2
2
 
3
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
3
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
4
+ from magic_pdf.data.data_reader_writer import DataWriter
4
5
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
5
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
6
- from magic_pdf.libs.json_compressor import JsonCompressor
7
6
  from magic_pdf.pipe.AbsPipe import AbsPipe
8
7
  from magic_pdf.user_api import parse_txt_pdf
9
8
 
10
9
 
11
10
  class TXTPipe(AbsPipe):
12
11
 
13
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
12
+ def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
14
13
  start_page_id=0, end_page_id=None, lang=None,
15
14
  layout_model=None, formula_enable=None, table_enable=None):
16
15
  super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
@@ -33,10 +32,10 @@ class TXTPipe(AbsPipe):
33
32
 
34
33
  def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
35
34
  result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
36
- logger.info("txt_pipe mk content list finished")
35
+ logger.info('txt_pipe mk content list finished')
37
36
  return result
38
37
 
39
38
  def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
40
39
  result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
41
- logger.info(f"txt_pipe mk {md_make_mode} finished")
40
+ logger.info(f'txt_pipe mk {md_make_mode} finished')
42
41
  return result
magic_pdf/pipe/UNIPipe.py CHANGED
@@ -2,22 +2,21 @@ import json
2
2
 
3
3
  from loguru import logger
4
4
 
5
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
6
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
7
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
8
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
5
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
6
+ from magic_pdf.data.data_reader_writer import DataWriter
9
7
  from magic_pdf.libs.commons import join_path
8
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
10
9
  from magic_pdf.pipe.AbsPipe import AbsPipe
11
- from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
10
+ from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
12
11
 
13
12
 
14
13
  class UNIPipe(AbsPipe):
15
14
 
16
- def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
15
+ def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: DataWriter, is_debug: bool = False,
17
16
  start_page_id=0, end_page_id=None, lang=None,
18
17
  layout_model=None, formula_enable=None, table_enable=None):
19
- self.pdf_type = jso_useful_key["_pdf_type"]
20
- super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id,
18
+ self.pdf_type = jso_useful_key['_pdf_type']
19
+ super().__init__(pdf_bytes, jso_useful_key['model_list'], image_writer, is_debug, start_page_id, end_page_id,
21
20
  lang, layout_model, formula_enable, table_enable)
22
21
  if len(self.model_list) == 0:
23
22
  self.input_model_is_empty = True
@@ -54,27 +53,28 @@ class UNIPipe(AbsPipe):
54
53
 
55
54
  def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
56
55
  result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
57
- logger.info("uni_pipe mk content list finished")
56
+ logger.info('uni_pipe mk content list finished')
58
57
  return result
59
58
 
60
59
  def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
61
60
  result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
62
- logger.info(f"uni_pipe mk {md_make_mode} finished")
61
+ logger.info(f'uni_pipe mk {md_make_mode} finished')
63
62
  return result
64
63
 
65
64
 
66
65
  if __name__ == '__main__':
67
66
  # 测试
68
- drw = DiskReaderWriter(r"D:/project/20231108code-clean")
67
+ from magic_pdf.data.data_reader_writer import DataReader
68
+ drw = DataReader(r'D:/project/20231108code-clean')
69
69
 
70
- pdf_file_path = r"linshixuqiu\19983-00.pdf"
71
- model_file_path = r"linshixuqiu\19983-00.json"
72
- pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
73
- model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
70
+ pdf_file_path = r'linshixuqiu\19983-00.pdf'
71
+ model_file_path = r'linshixuqiu\19983-00.json'
72
+ pdf_bytes = drw.read(pdf_file_path)
73
+ model_json_txt = drw.read(model_file_path).decode()
74
74
  model_list = json.loads(model_json_txt)
75
- write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
76
- img_bucket_path = "imgs"
77
- img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
75
+ write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
76
+ img_bucket_path = 'imgs'
77
+ img_writer = DataWriter(join_path(write_path, img_bucket_path))
78
78
 
79
79
  # pdf_type = UNIPipe.classify(pdf_bytes)
80
80
  # jso_useful_key = {
@@ -83,8 +83,8 @@ if __name__ == '__main__':
83
83
  # }
84
84
 
85
85
  jso_useful_key = {
86
- "_pdf_type": "",
87
- "model_list": model_list
86
+ '_pdf_type': '',
87
+ 'model_list': model_list
88
88
  }
89
89
  pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
90
90
  pipe.pipe_classify()
@@ -92,8 +92,7 @@ if __name__ == '__main__':
92
92
  md_content = pipe.pipe_mk_markdown(img_bucket_path)
93
93
  content_list = pipe.pipe_mk_uni_format(img_bucket_path)
94
94
 
95
- md_writer = DiskReaderWriter(write_path)
96
- md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT)
97
- md_writer.write(json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), "19983-00.json",
98
- AbsReaderWriter.MODE_TXT)
99
- md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)
95
+ md_writer = DataWriter(write_path)
96
+ md_writer.write_string('19983-00.md', md_content)
97
+ md_writer.write_string('19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4))
98
+ md_writer.write_string('19983-00.txt', str(content_list))