magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121):
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,204 @@
1
+ import datetime
2
+ import json
3
+ import os, re, configparser
4
+ import subprocess
5
+ import time
6
+
7
+ import boto3
8
+ from loguru import logger
9
+ from boto3.s3.transfer import TransferConfig
10
+ from botocore.config import Config
11
+
12
+ import fitz # 1.23.9中已经切换到rebase
13
+ # import fitz_old as fitz # 使用1.23.9之前的pymupdf库
14
+
15
+
16
def get_delta_time(input_time):
    """Return the seconds elapsed since ``input_time``, rounded to 2 decimals.

    :param input_time: an earlier timestamp on the same scale as ``time.time()``.
    :return: ``time.time() - input_time`` rounded to two decimal places.
    """
    elapsed = time.time() - input_time
    return round(elapsed, 2)
18
+
19
+
20
def join_path(*args):
    """Join path pieces with ``/``.

    Each argument is converted with ``str`` and has its trailing slashes
    stripped before joining; leading slashes are left untouched.
    """
    pieces = [str(piece).rstrip('/') for piece in args]
    return '/'.join(pieces)
22
+
23
+
24
# Global error-log path, configured here so the demo can reference the same value
error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # this path is only for temporary local testing and must not be committed to main
json_dump_path = "s3://llm-pdf-text/json_dump/"

# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # the base library should not contain concrete paths like this; define them in business code
30
+
31
+
32
def get_top_percent_list(num_list, percent):
    """
    Return the largest ``percent`` fraction of the elements in ``num_list``.

    :param num_list: list of comparable values.
    :param percent: fraction of the list to keep (e.g. ``0.8``).
    :return: the values sorted in descending order, truncated to
        ``int(len(num_list) * percent)`` entries; ``[]`` for an empty input.
    """
    if not len(num_list):
        return []
    # Largest values first, then cut at the index the fraction maps to.
    ranked = sorted(num_list, reverse=True)
    cutoff = int(len(ranked) * percent)
    return ranked[:cutoff]
49
+
50
+
51
def formatted_time(time_stamp):
    """Format a POSIX timestamp as ``YYYY-mm-dd-HH:MM:SS``.

    The timestamp is converted with ``datetime.datetime.fromtimestamp``.
    """
    return datetime.datetime.fromtimestamp(time_stamp).strftime("%Y-%m-%d-%H:%M:%S")
55
+
56
+
57
def mymax(alist: list):
    """Return the largest element of ``alist``, or ``0`` when it is empty."""
    if len(alist) > 0:
        return max(alist)
    # An empty list counts as size 0.
    return 0
62
+
63
def parse_aws_param(profile):
    """Resolve S3 connection parameters from a profile name or an explicit dict.

    :param profile: either a profile name (``str``) looked up in
        ``~/.aws/credentials`` and ``~/.aws/config``, or a ``dict`` with the
        keys ``ak``, ``sk`` and ``endpoint``.
    :return: tuple ``(ak, sk, endpoint, addressing_style)``.
    :raises ValueError: if the profile's ``s3`` setting has no ``endpoint_url``.
    :raises TypeError: if ``profile`` is neither a ``str`` nor a ``dict``.
    """
    if isinstance(profile, str):
        # Parse both AWS files into a single ConfigParser.
        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
        config = configparser.ConfigParser()
        config.read(credentials_file)
        config.read(config_file)
        # Read the account credentials for the profile.
        ak = config.get(profile, "aws_access_key_id")
        sk = config.get(profile, "aws_secret_access_key")
        # The "default" profile uses a bare section name; every other profile
        # is read from a "profile <name>" section.
        if profile == "default":
            s3_str = config.get(f"{profile}", "s3")
        else:
            s3_str = config.get(f"profile {profile}", "s3")
        # Raw strings keep "\s" a regex escape instead of an invalid string escape.
        end_match = re.search(r"endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
        if end_match:
            endpoint = end_match.group(1)
        else:
            raise ValueError("aws 配置文件中没有找到 endpoint_url")
        style_match = re.search(r"addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
        if style_match:
            addressing_style = style_match.group(1)
        else:
            addressing_style = "path"
    elif isinstance(profile, dict):
        ak = profile["ak"]
        sk = profile["sk"]
        endpoint = profile["endpoint"]
        addressing_style = "auto"
    else:
        # Previously this fell through to an UnboundLocalError at the return.
        raise TypeError(
            f"profile must be a str or a dict, got {type(profile).__name__}"
        )

    return ak, sk, endpoint, addressing_style
95
+
96
+
97
def parse_bucket_key(s3_full_path: str):
    """
    Split an S3 path into its bucket and key.

    Input:  s3://bucket/path/to/my/file.txt
    Output: bucket, path/to/my/file.txt

    Surrounding whitespace, the ``s3://`` scheme and one leading ``/`` are
    removed before splitting. A path that names only a bucket (no ``/``
    after it) yields an empty key instead of failing on tuple unpacking.

    :param s3_full_path: the S3 path to split.
    :return: tuple ``(bucket, key)``.
    """
    s3_full_path = s3_full_path.strip()
    if s3_full_path.startswith("s3://"):
        s3_full_path = s3_full_path[5:]
    if s3_full_path.startswith("/"):
        s3_full_path = s3_full_path[1:]
    # partition always returns three parts, so a missing "/" gives key == "".
    bucket, _, key = s3_full_path.partition("/")
    return bucket, key
109
+
110
+
111
def read_file(pdf_path: str, s3_profile):
    """Read a file's bytes from S3 or from the local filesystem.

    :param pdf_path: an ``s3://`` path, or a local path.
    :param s3_profile: passed to ``parse_aws_param``; only used for S3 paths.
    :return: the file content as ``bytes``.
    """
    if not pdf_path.startswith("s3://"):
        with open(pdf_path, "rb") as local_file:
            return local_file.read()

    ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
    client_config = Config(
        s3={'addressing_style': addressing_style},
        retries={'max_attempts': 10, 'mode': 'standard'},
    )
    s3_client = boto3.client(
        service_name="s3",
        aws_access_key_id=ak,
        aws_secret_access_key=sk,
        endpoint_url=end_point,
        config=client_config,
    )
    bucket_name, bucket_key = parse_bucket_key(pdf_path)
    response = s3_client.get_object(Bucket=bucket_name, Key=bucket_key)
    return response["Body"].read()
123
+
124
+
125
def get_docx_model_output(pdf_model_output, page_id):
    """Return the entry of ``pdf_model_output`` stored under ``page_id``."""
    return pdf_model_output[page_id]
130
+
131
+
132
def list_dir(dir_path:str, s3_profile:str):
    """
    List every ``.json`` file under ``dir_path``.

    :param dir_path: an ``s3://bucket/prefix`` path, or a local directory.
    :param s3_profile: passed to ``parse_aws_param``; only used for S3 paths.
    :return: for S3, a list of ``s3://bucket/key`` strings in listing order;
        for a local directory, a sorted list of file paths found by walking
        the directory tree.

    Any exception raised while listing S3 is logged and the process exits
    with status -1.
    """
    ret = []

    # Require the full scheme so that local directories whose name merely
    # starts with "s3" are not mistaken for S3 paths.
    if dir_path.startswith("s3://"):
        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
        bucket, path = s3info[0][0], s3info[0][1]
        try:
            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
                               config=Config(s3={'addressing_style': addressing_style}))

            def list_obj_scluster():
                # Page through list_objects, resuming after the last key of
                # each truncated response.
                marker = None
                while True:
                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
                    if marker:
                        list_kwargs['Marker'] = marker
                    response = cli.list_objects(**list_kwargs)
                    contents = response.get("Contents", [])
                    yield from contents
                    if not response.get("IsTruncated") or len(contents)==0:
                        break
                    marker = contents[-1]['Key']

            for info in list_obj_scluster():
                file_path = info['Key']
                # Strip the prefix unconditionally: with an empty prefix this
                # is the whole key, which avoids the NameError the old
                # `if path != ""` guard caused when listing a bucket root.
                afile = file_path[len(path):]
                if afile.endswith(".json"):
                    ret.append(f"s3://{bucket}/{file_path}")

            return ret

        except Exception as e:
            logger.exception(e)
            exit(-1)
    else:  # local directory: walk it and collect all .json files
        for root, dirs, files in os.walk(dir_path):
            for file in files:
                if file.endswith(".json"):
                    ret.append(join_path(root, file))
        ret.sort()
        return ret
181
+
182
def get_img_s3_client(save_path:str, image_s3_config:str):
    """
    Build an S3 client for saving images, or return ``None`` for non-S3 paths.

    :param save_path: destination path; a client is only created when it
        starts with ``s3://``.
    :param image_s3_config: passed to ``parse_aws_param`` to get credentials.
    :return: a boto3 S3 client, or ``None``.
    """
    # Checking the path here keeps the number of S3 clients created minimal.
    if not save_path.startswith("s3://"):
        return None

    ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
    client_config = Config(
        s3={"addressing_style": addressing_style},
        retries={'max_attempts': 5, 'mode': 'standard'},
    )
    return boto3.client(
        service_name="s3",
        aws_access_key_id=ak,
        aws_secret_access_key=sk,
        endpoint_url=end_point,
        config=client_config,
    )
198
+
199
if __name__=="__main__":
    # Manual smoke run: list the .json files under a hard-coded S3 prefix
    # using the hard-coded AWS profile name, then print the result.
    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
    s3_profile = "langchao"
    ret = list_dir(s3_path, s3_profile)
    print(ret)
204
+
@@ -0,0 +1,63 @@
1
+ """
2
+ 根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
3
+
4
+ """
5
+
6
+ import json
7
+ import os
8
+
9
+ from loguru import logger
10
+
11
+ from magic_pdf.libs.commons import parse_bucket_key
12
+
13
+
14
def read_config():
    """Load ``magic-pdf.json`` from the user's home directory.

    :return: the parsed JSON content.
    :raises FileNotFoundError: if the config file does not exist. This is a
        subclass of ``Exception``, so existing ``except Exception`` handlers
        keep working.
    """
    config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")

    if not os.path.exists(config_file):
        raise FileNotFoundError(f"{config_file} not found")

    # Decode explicitly as UTF-8 so parsing does not depend on the locale.
    with open(config_file, "r", encoding="utf-8") as f:
        return json.load(f)
25
+
26
+
27
def get_s3_config(bucket_name: str):
    """
    Look up the S3 credentials for ``bucket_name`` in ``~/magic-pdf.json``.

    The ``bucket_info`` mapping of the config is consulted; buckets that are
    not listed fall back to the ``[default]`` entry.

    :return: tuple ``(access_key, secret_key, storage_endpoint)``.
    :raises Exception: if any of the three values is ``None``.
    """
    bucket_info = read_config().get("bucket_info")
    lookup_key = bucket_name if bucket_name in bucket_info else "[default]"
    access_key, secret_key, storage_endpoint = bucket_info[lookup_key]

    if any(value is None for value in (access_key, secret_key, storage_endpoint)):
        raise Exception("ak, sk or endpoint not found in magic-pdf.json")

    return access_key, secret_key, storage_endpoint
45
+
46
+
47
def get_s3_config_dict(path: str):
    """Return the S3 config for the bucket of ``path`` as a dict.

    :return: dict with the keys ``ak``, ``sk`` and ``endpoint``.
    """
    bucket = get_bucket_name(path)
    ak, sk, endpoint = get_s3_config(bucket)
    return dict(ak=ak, sk=sk, endpoint=endpoint)
50
+
51
+
52
def get_bucket_name(path):
    """Return only the bucket part of ``path`` as split by ``parse_bucket_key``."""
    bucket_name, _ = parse_bucket_key(path)
    return bucket_name
55
+
56
+
57
def get_local_dir():
    """Return the ``temp-output-dir`` config value, defaulting to ``/tmp``."""
    return read_config().get("temp-output-dir", "/tmp")
60
+
61
+
62
if __name__ == "__main__":
    # Manual check: resolve the credentials configured for the "llm-raw" bucket.
    ak, sk, endpoint = get_s3_config("llm-raw")
@@ -0,0 +1,5 @@
1
def dict_to_list(input_dict):
    """Return the values of ``input_dict`` as a list, in iteration order."""
    return [value for _key, value in input_dict.items()]
@@ -0,0 +1,9 @@
1
def get_scale_ratio(model_page_info, page):
    """Compute how much the model's page size differs from the rendered page.

    :param model_page_info: mapping with ``page_info.width`` and
        ``page_info.height``.
    :param page: object whose ``get_pixmap(dpi=72)`` result exposes ``w`` and ``h``.
    :return: tuple ``(horizontal_scale_ratio, vertical_scale_ratio)``, i.e.
        model width / pixmap width and model height / pixmap height.
    """
    pixmap = page.get_pixmap(dpi=72)
    rendered_width, rendered_height = int(pixmap.w), int(pixmap.h)
    page_info = model_page_info['page_info']
    model_width, model_height = page_info['width'], page_info['height']
    return model_width / rendered_width, model_height / rendered_height
@@ -0,0 +1,21 @@
1
+ from collections import Counter
2
+
3
+ from magic_pdf.libs.language import detect_lang
4
+
5
def get_language_from_model(model_list: list):
    """Return the language detected on the most pages of a model output.

    For each page, the ``text`` of every ``layout_dets`` entry whose
    ``category_id`` is 15 is concatenated and passed to ``detect_lang``.
    The language seen on the most pages wins; ties go to the one seen first.

    :param model_list: list of per-page dicts, each with a ``layout_dets`` list.
    :return: the winning language string, or ``""`` when ``model_list`` is
        empty (previously this raised ``ValueError`` from ``max()``).
    """
    # Only detections with these category ids contribute text.
    allow_category_ids = (15,)

    language_lst = []
    for ocr_page_info in model_list:
        page_text = "".join(
            layout_det["text"]
            for layout_det in ocr_page_info["layout_dets"]
            if layout_det["category_id"] in allow_category_ids
        )
        language_lst.append(detect_lang(page_text))

    if not language_lst:
        return ""

    # Count pages per language and pick the most frequent one.
    count_dict = Counter(language_lst)
    return max(count_dict, key=count_dict.get)
@@ -0,0 +1,227 @@
1
+ from magic_pdf.libs.Constants import CROSS_PAGE
2
+ from magic_pdf.libs.commons import fitz # PyMuPDF
3
+ from magic_pdf.libs.ocr_content_type import ContentType, BlockType
4
+
5
+
6
def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every bbox of page ``i`` onto ``page`` as a rectangle.

    :param i: index into ``bbox_list`` selecting the page's boxes.
    :param bbox_list: per-page lists of ``(x0, y0, x1, y1)`` boxes.
    :param page: page object providing ``draw_rect``.
    :param rgb_config: colour as three 0-255 channel values.
    :param fill_config: truthy to draw filled boxes (opacity 0.3, no outline),
        falsy to draw outlines only.
    """
    new_rgb = [float(channel) / 255 for channel in rgb_config]
    if fill_config:
        rect_style = {"color": None, "fill": new_rgb, "fill_opacity": 0.3}
    else:
        rect_style = {"color": new_rgb, "fill": None, "fill_opacity": 1}

    for x0, y0, x1, y1 in bbox_list[i]:
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            width=0.5,
            overlay=True,
            **rect_style,
        )
33
+
34
+
35
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every bbox of page ``i`` onto ``page`` and label it with its 1-based index.

    :param i: index into ``bbox_list`` selecting the page's boxes.
    :param bbox_list: per-page lists of ``(x0, y0, x1, y1)`` boxes.
    :param page: page object providing ``draw_rect`` and ``insert_text``.
    :param rgb_config: colour as three 0-255 channel values.
    :param fill_config: truthy to draw filled boxes (opacity 0.3, no outline),
        falsy to draw outlines only.
    """
    new_rgb = [float(channel) / 255 for channel in rgb_config]
    if fill_config:
        rect_style = {"color": None, "fill": new_rgb, "fill_opacity": 0.3}
    else:
        rect_style = {"color": new_rgb, "fill": None, "fill_opacity": 1}

    for order, (x0, y0, x1, y1) in enumerate(bbox_list[i], start=1):
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            width=0.5,
            overlay=True,
            **rect_style,
        )
        # Put the index in the top-left corner of the rectangle.
        page.insert_text((x0, y0 + 10), str(order), fontsize=10, color=new_rgb)
65
+
66
+
67
def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
    """Render layout, discarded and paragraph-block boxes into ``layout.pdf``.

    For each page of ``pdf_info`` the bboxes are collected from
    ``layout_bboxes``, ``discarded_blocks`` and ``para_blocks`` (tables and
    images including their nested blocks). The PDF in ``pdf_bytes`` is then
    opened, each page is annotated, and the result is saved to
    ``{out_path}/layout.pdf``.

    :param pdf_info: list of per-page dicts.
    :param pdf_bytes: the PDF content to draw on.
    :param out_path: directory that receives ``layout.pdf``.
    """
    layout_bbox_list = []
    dropped_bbox_list = []
    tables_list, tables_body_list, tables_caption_list, tables_footnote_list = [], [], [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []

    for page in pdf_info:
        layout_bbox_list.append(
            [layout["layout_bbox"] for layout in page["layout_bboxes"]]
        )
        dropped_bbox_list.append(
            [dropped_bbox["bbox"] for dropped_bbox in page["discarded_blocks"]]
        )

        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption = [], [], []
        titles, texts, interequations = [], [], []

        for block in page["para_blocks"]:
            block_bbox = block["bbox"]
            block_type = block["type"]
            if block_type == BlockType.Table:
                tables.append(block_bbox)
                for nested_block in block["blocks"]:
                    nested_bbox = nested_block["bbox"]
                    nested_type = nested_block["type"]
                    if nested_type == BlockType.TableBody:
                        tables_body.append(nested_bbox)
                    elif nested_type == BlockType.TableCaption:
                        tables_caption.append(nested_bbox)
                    elif nested_type == BlockType.TableFootnote:
                        tables_footnote.append(nested_bbox)
            elif block_type == BlockType.Image:
                imgs.append(block_bbox)
                for nested_block in block["blocks"]:
                    nested_bbox = nested_block["bbox"]
                    nested_type = nested_block["type"]
                    if nested_type == BlockType.ImageBody:
                        imgs_body.append(nested_bbox)
                    elif nested_type == BlockType.ImageCaption:
                        imgs_caption.append(nested_bbox)
            elif block_type == BlockType.Title:
                titles.append(block_bbox)
            elif block_type == BlockType.Text:
                texts.append(block_bbox)
            elif block_type == BlockType.InterlineEquation:
                interequations.append(block_bbox)

        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)

    # Filled overlays in drawing order, each with its RGB colour.
    filled_overlays = [
        (dropped_bbox_list, [158, 158, 158]),
        (tables_list, [153, 153, 0]),
        (tables_body_list, [204, 204, 0]),
        (tables_caption_list, [255, 255, 102]),
        (tables_footnote_list, [229, 255, 204]),
        (imgs_list, [51, 102, 0]),
        (imgs_body_list, [153, 255, 51]),
        (imgs_caption_list, [102, 178, 255]),
        (titles_list, [102, 102, 255]),
        (texts_list, [153, 0, 76]),
        (interequations_list, [0, 255, 0]),
    ]

    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        # Layout boxes are outlined and numbered; everything else is filled.
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
        for per_page_boxes, rgb in filled_overlays:
            draw_bbox_without_number(i, per_page_boxes, page, rgb, True)

    # Save the PDF
    pdf_docs.save(f"{out_path}/layout.pdf")
143
+
144
+
145
def draw_span_bbox(pdf_info, pdf_bytes, out_path):
    """Render span-level bounding boxes into ``spans.pdf``.

    Span bboxes are collected per page from ``pdf_info`` and grouped by
    content type (text, inline equation, interline equation, image, table)
    plus the spans of discarded blocks. The PDF in ``pdf_bytes`` is then
    opened, each group is outlined in its own colour, and the result is
    saved to ``{out_path}/spans.pdf``.

    :param pdf_info: list of per-page dicts with ``discarded_blocks`` and
        ``para_blocks``.
    :param pdf_bytes: the PDF content to draw on.
    :param out_path: directory that receives ``spans.pdf``.
    """
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    # Spans flagged CROSS_PAGE are parked here and drawn on the following page.
    next_page_text_list = []
    next_page_inline_equation_list = []

    def get_span_info(span):
        # NOTE: the page_* lists used here are rebound by the page loop below
        # on every iteration; this closure always sees the current page's lists.
        if span["type"] == ContentType.Text:
            if span.get(CROSS_PAGE, False):
                next_page_text_list.append(span["bbox"])
            else:
                page_text_list.append(span["bbox"])
        elif span["type"] == ContentType.InlineEquation:
            if span.get(CROSS_PAGE, False):
                next_page_inline_equation_list.append(span["bbox"])
            else:
                page_inline_equation_list.append(span["bbox"])
        elif span["type"] == ContentType.InterlineEquation:
            page_interline_equation_list.append(span["bbox"])
        elif span["type"] == ContentType.Image:
            page_image_list.append(span["bbox"])
        elif span["type"] == ContentType.Table:
            page_table_list.append(span["bbox"])

    for page in pdf_info:
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []
        page_dropped_list = []

        # Move the cross-page spans carried over from the previous page into this page's lists
        if len(next_page_text_list) > 0:
            page_text_list.extend(next_page_text_list)
            next_page_text_list.clear()
        if len(next_page_inline_equation_list) > 0:
            page_inline_equation_list.extend(next_page_inline_equation_list)
            next_page_inline_equation_list.clear()

        # Build dropped_list
        for block in page["discarded_blocks"]:
            if block["type"] == BlockType.Discarded:
                for line in block["lines"]:
                    for span in line["spans"]:
                        page_dropped_list.append(span["bbox"])
        dropped_list.append(page_dropped_list)
        # Build the remaining (useful) lists
        for block in page["para_blocks"]:
            if block["type"] in [
                BlockType.Text,
                BlockType.Title,
                BlockType.InterlineEquation,
            ]:
                for line in block["lines"]:
                    for span in line["spans"]:
                        get_span_info(span)
            elif block["type"] in [BlockType.Image, BlockType.Table]:
                # Image and table blocks hold their lines one level deeper, in sub-blocks.
                for sub_block in block["blocks"]:
                    for line in sub_block["lines"]:
                        for span in line["spans"]:
                            get_span_info(span)
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)
    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        # Draw the data collected for the current page (outlines only, no fill)
        draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
        draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
        draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
        draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)

    # Save the PDF
    pdf_docs.save(f"{out_path}/spans.pdf")
@@ -0,0 +1,27 @@
1
+
2
class DropReason:
    """String constants naming the reasons a PDF is dropped."""
    TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap"  # text blocks overlap horizontally, so the text order cannot be determined accurately
    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap"  # blocks that must be kept overlap horizontally
    COMPLICATED_LAYOUT = "complicated_layout"  # complicated layout, not supported for now
    TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns"  # layouts with more than 2 columns are not supported yet
    COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box"  # PDF contains coloured blocks, which change the reading order; text on a coloured background is not supported yet
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs"  # contains special images that are too expensive to process, so it is dropped
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs"  # special SVG graphics that are too expensive to process, so it is dropped
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages"  # computation exceeds the budget; the current method costs too much
    MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result"  # layout analysis failed
    Exception = "_exception"  # an exception occurred during parsing
    ENCRYPTED = "encrypted"  # the PDF is encrypted
    EMPTY_PDF = "total_page=0"  # the PDF has a total of 0 pages
    NOT_IS_TEXT_PDF = "not_is_text_pdf"  # not a text-based PDF, cannot be parsed directly
    DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block"  # paragraphs cannot be separated clearly
    TITLE_DETECTION_FAILED = "title_detection_failed"  # title detection failed
    TITLE_LEVEL_FAILED = "title_level_failed"  # title level analysis failed (e.g. level-1, level-2, level-3 headings)
    PARA_SPLIT_FAILED = "para_split_failed"  # paragraph recognition failed
    PARA_MERGE_FAILED = "para_merge_failed"  # paragraph merging failed
    NOT_ALLOW_LANGUAGE = "not_allow_language"  # unsupported language
    SPECIAL_PDF = "special_pdf"
    PSEUDO_SINGLE_COLUMN = "pseudo_single_column"  # text columns cannot be determined precisely
    CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout"  # the page layout cannot be analysed
    NEGATIVE_BBOX_AREA = "negative_bbox_area"  # scaling produced a bbox with negative area
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation"  # overlapping blocks cannot be separated
27
+
@@ -0,0 +1,19 @@
1
+
2
COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
PAGE_NO = "page-no"  # page number
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header/footer area
VERTICAL_TEXT = 'vertical-text'  # vertical text
ROTATE_TEXT = 'rotate-text'  # rotated text
EMPTY_SIDE_BLOCK = 'empty-side-block'  # blank block at the page edge with no content
ON_IMAGE_TEXT = 'on-image-text'  # text located on an image
ON_TABLE_TEXT = 'on-table-text'  # text located on a table
10
+
11
+
12
class DropTag:
    """String constants used as drop tags."""
    # NOTE(review): the module-level PAGE_NO constant is "page-no" (hyphen)
    # while this value uses an underscore — confirm the difference is intended.
    PAGE_NUMBER = "page_no"
    HEADER = "header"
    FOOTER = "footer"
    FOOTNOTE = "footnote"
    NOT_IN_LAYOUT = "not_in_layout"
    SPAN_OVERLAP = "span_overlap"
    BLOCK_OVERLAP = "block_overlap"
@@ -0,0 +1,15 @@
1
+ import hashlib
2
+
3
+
4
def compute_md5(file_bytes):
    """Return the MD5 digest of ``file_bytes`` as an upper-case hex string."""
    return hashlib.md5(file_bytes).hexdigest().upper()
8
+
9
+
10
def compute_sha256(input_string):
    """Return the SHA-256 hex digest of ``input_string`` encoded as UTF-8."""
    # hashlib works on bytes, so the string is encoded first.
    return hashlib.sha256(input_string.encode('utf-8')).hexdigest()
@@ -0,0 +1,27 @@
1
+ import json
2
+ import brotli
3
+ import base64
4
+
5
class JsonCompressor:
    """Round-trip JSON-serialisable data through brotli and base64."""

    @staticmethod
    def compress_json(data):
        """
        Serialise ``data`` to JSON, brotli-compress it (quality 6) and
        return the result as a base64 string.
        """
        raw = json.dumps(data).encode('utf-8')
        packed = brotli.compress(raw, quality=6)
        # base64 yields bytes; decode so callers get a str.
        return base64.b64encode(packed).decode('utf-8')

    @staticmethod
    def decompress_json(compressed_str):
        """
        Reverse ``compress_json``: base64-decode, brotli-decompress and
        parse the JSON text.
        """
        packed = base64.b64decode(compressed_str.encode('utf-8'))
        json_text = brotli.decompress(packed).decode('utf-8')
        return json.loads(json_text)
@@ -0,0 +1,31 @@
1
+ import regex
2
+ import unicodedata
3
+ from fast_langdetect import detect_langs
4
+
5
+ RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
6
+
7
+
8
def remove_bad_chars(text):
    """Return ``text`` with every match of ``RE_BAD_CHARS`` removed."""
    cleaned = RE_BAD_CHARS.sub("", text)
    return cleaned
10
+
11
+
12
def detect_lang(text: str) -> str:
    """Detect the language of ``text`` and return it in lower case.

    ``detect_langs`` is tried on the raw text first. If that raises, it is
    retried on the text with all characters of Unicode category ``C*``
    removed; an error from the retry propagates to the caller.

    :param text: the text to classify.
    :return: the lower-cased detection result, or ``""`` when ``text`` is
        empty or the result cannot be lower-cased.
    """
    if len(text) == 0:
        return ""
    try:
        lang_upper = detect_langs(text)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        text_no_ctrl_chars = ''.join(
            ch for ch in text if unicodedata.category(ch)[0] != 'C'
        )
        lang_upper = detect_langs(text_no_ctrl_chars)
    try:
        # presumably detect_langs returns a string code; anything without
        # ``.lower()`` falls back to "".
        lang = lang_upper.lower()
    except Exception:
        lang = ""
    return lang
25
+
26
+
27
if __name__ == '__main__':
    # Manual check with English and Chinese samples, with and without HTML tags.
    for sample in (
        "This is a test.",
        "<html>This is a test</html>",
        "这个是中文测试。",
        "<html>这个是中文测试。</html>",
    ):
        print(detect_lang(sample))