magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
magic_pdf/user_api.py ADDED
@@ -0,0 +1,136 @@
1
+ """
2
+ 用户输入:
3
+ model数组,每个元素代表一个页面
4
+ pdf在s3的路径
5
+ 截图保存的s3位置
6
+
7
+ 然后:
8
+ 1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
9
+ 2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
10
+
11
+ 其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
12
+
13
+ """
14
+ import re
15
+
16
+ from loguru import logger
17
+
18
+ from magic_pdf.libs.version import __version__
19
+ from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
20
+ from magic_pdf.rw import AbsReaderWriter
21
+ from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
22
+ from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
23
+
24
+ PARSE_TYPE_TXT = "txt"
25
+ PARSE_TYPE_OCR = "ocr"
26
+
27
+
28
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
    Parse a text-based PDF with the txt parser.

    Args:
        pdf_bytes: raw bytes of the PDF document.
        pdf_models: model output list, one element per page (per the module docstring).
        imageWriter: writer handed through to the parser.
        is_debug: forwarded to the parser as ``debug_mode``.
        start_page: forwarded to the parser as ``start_page_id``.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The object returned by ``parse_pdf_by_txt``, with ``_parse_type`` and
        ``_version_name`` set on it.
    """
    parser_options = {"start_page_id": start_page, "debug_mode": is_debug}
    result = parse_pdf_by_txt(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Record which pipeline produced the result and the package version.
    for key, value in (("_parse_type", PARSE_TYPE_TXT), ("_version_name", __version__)):
        result[key] = value

    return result
46
+
47
+
48
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
    Parse a scanned (OCR) PDF with the OCR parser.

    Args:
        pdf_bytes: raw bytes of the PDF document.
        pdf_models: model output list, one element per page (per the module docstring).
        imageWriter: writer handed through to the parser.
        is_debug: forwarded to the parser as ``debug_mode``.
        start_page: forwarded to the parser as ``start_page_id``.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The object returned by ``parse_pdf_by_ocr``, with ``_parse_type`` and
        ``_version_name`` set on it.
    """
    parser_options = {"start_page_id": start_page, "debug_mode": is_debug}
    result = parse_pdf_by_ocr(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Record which pipeline produced the result and the package version.
    for key, value in (("_parse_type", PARSE_TYPE_OCR), ("_version_name", __version__)):
        result[key] = value

    return result
66
+
67
+
68
# Matches every character that is NOT a CJK ideograph (U+4E00-U+9FA5), an ASCII
# digit or letter, CJK punctuation (U+3000-U+303F) or a full-width form
# (U+FF00-U+FFEF). Compiled once instead of on every call.
_NOT_COMMON_CHARACTER_REGEX = re.compile(
    r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]'
)


def _collect_para_text(pdf_info_dict) -> str:
    """Concatenate the span contents of every 'title' and 'text' paragraph block."""
    pieces = []
    # A result may lack page data (presumably when it was flagged "_need_drop"),
    # so missing 'pdf_info' / 'para_blocks' entries are treated as empty.
    for page_dict in pdf_info_dict.get('pdf_info') or []:
        for para_block in page_dict.get('para_blocks') or []:
            if para_block['type'] in ['title', 'text']:
                for line in para_block['lines']:
                    for span in line['spans']:
                        pieces.append(span['content'])
    return "".join(pieces)


def _calculate_not_common_character_rate(text):
    """Return the fraction of characters in *text* outside the common character set."""
    total = len(text)
    if total == 0:
        return 0  # avoid division by zero
    garbage_count = len(_NOT_COMMON_CHARACTER_REGEX.findall(text))
    return garbage_count / total


def _calculate_not_printable_rate(text):
    """Return the fraction of characters in *text* for which ``str.isprintable`` is False."""
    total = len(text)
    if total == 0:
        return 0  # avoid division by zero
    printable = sum(1 for c in text if c.isprintable())
    return (total - printable) / total


def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
                    input_model_is_empty: bool = False,
                    *args, **kwargs):
    """
    Parse a PDF with the txt parser first and fall back to the OCR parser.

    The OCR fallback is used when the txt parse raised (it is logged and treated
    as a failure), when its result has a truthy ``_need_drop`` flag, or when more
    than 2% of the extracted title/text characters are not printable.

    Args:
        pdf_bytes: raw bytes of the PDF document.
        pdf_models: model output list, one element per page (per the module docstring).
        imageWriter: writer handed through to the parsers.
        is_debug: forwarded to the parsers as ``debug_mode``.
        start_page: forwarded to the parsers as ``start_page_id``.
        input_model_is_empty: when True, ``pdf_models`` is regenerated with
            ``doc_analyze(pdf_bytes, ocr=True)`` before the OCR fallback runs.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The parse result with ``_parse_type`` and ``_version_name`` set. A result
        from the txt parser also carries ``_not_common_character_rate`` and
        ``_not_printable_rate``.

    Raises:
        Exception: if both the txt parse and the OCR parse failed.
    """

    def parse_pdf(method):
        # Best-effort call: any failure is logged and reported as None so the
        # caller can decide whether to fall back.
        try:
            return method(
                pdf_bytes,
                pdf_models,
                imageWriter,
                start_page_id=start_page,
                debug_mode=is_debug,
            )
        except Exception as e:
            logger.exception(e)
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)

    if pdf_info_dict is None:
        # The txt parse failed. Nothing can be measured, so go straight to OCR
        # instead of indexing into None.
        need_ocr = True
    else:
        text_all = _collect_para_text(pdf_info_dict)
        not_common_character_rate = _calculate_not_common_character_rate(text_all)
        not_printable_rate = _calculate_not_printable_rate(text_all)
        pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
        pdf_info_dict["_not_printable_rate"] = not_printable_rate
        logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
        # not_common_character_rate may misfire on less common languages, while
        # not_printable_rate is friendlier to them, so only the latter gates the fallback.
        # Normal PDFs used as reference never exceeded 0.01, hence the 0.02 threshold.
        need_ocr = pdf_info_dict.get("_need_drop", False) or not_printable_rate > 0.02

    if need_ocr:
        logger.warning("parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
        if input_model_is_empty:
            # The inner parse_pdf closure reads pdf_models, so rebinding it here
            # makes the OCR run use the freshly generated model output.
            pdf_models = doc_analyze(pdf_bytes, ocr=True)
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
        pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
    else:
        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT

    pdf_info_dict["_version_name"] = __version__

    return pdf_info_dict