magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. magic_pdf/dict2md/ocr_mkcontent.py +16 -22
  2. magic_pdf/filter/pdf_meta_scan.py +5 -19
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_check.py +52 -25
  7. magic_pdf/libs/pdf_image_tools.py +2 -1
  8. magic_pdf/libs/version.py +1 -1
  9. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  10. magic_pdf/model/magic_model.py +0 -30
  11. magic_pdf/model/pp_structure_v2.py +23 -3
  12. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
  13. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
  14. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
  15. magic_pdf/para/para_split_v3.py +21 -7
  16. magic_pdf/pdf_parse_union_core_v2.py +134 -146
  17. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  18. magic_pdf/pre_proc/cut_image.py +0 -37
  19. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  20. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  21. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  22. magic_pdf/rw/S3ReaderWriter.py +1 -1
  23. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
  24. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
  25. magic_pdf/dict2md/mkcontent.py +0 -438
  26. magic_pdf/layout/__init__.py +0 -0
  27. magic_pdf/layout/bbox_sort.py +0 -681
  28. magic_pdf/layout/layout_det_utils.py +0 -182
  29. magic_pdf/layout/layout_sort.py +0 -921
  30. magic_pdf/layout/layout_spiler_recog.py +0 -101
  31. magic_pdf/layout/mcol_sort.py +0 -336
  32. magic_pdf/libs/calc_span_stats.py +0 -239
  33. magic_pdf/libs/detect_language_from_model.py +0 -21
  34. magic_pdf/libs/nlp_utils.py +0 -203
  35. magic_pdf/libs/textbase.py +0 -33
  36. magic_pdf/libs/vis_utils.py +0 -308
  37. magic_pdf/para/block_continuation_processor.py +0 -562
  38. magic_pdf/para/block_termination_processor.py +0 -480
  39. magic_pdf/para/commons.py +0 -222
  40. magic_pdf/para/denoise.py +0 -246
  41. magic_pdf/para/draw.py +0 -121
  42. magic_pdf/para/exceptions.py +0 -198
  43. magic_pdf/para/layout_match_processor.py +0 -40
  44. magic_pdf/para/para_split.py +0 -807
  45. magic_pdf/para/para_split_v2.py +0 -959
  46. magic_pdf/para/raw_processor.py +0 -207
  47. magic_pdf/para/stats.py +0 -268
  48. magic_pdf/para/title_processor.py +0 -1014
  49. magic_pdf/pdf_parse_union_core.py +0 -345
  50. magic_pdf/post_proc/__init__.py +0 -0
  51. magic_pdf/post_proc/detect_para.py +0 -3472
  52. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  53. magic_pdf/post_proc/remove_footnote.py +0 -153
  54. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  55. magic_pdf/pre_proc/detect_equation.py +0 -134
  56. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  57. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  58. magic_pdf/pre_proc/detect_footnote.py +0 -170
  59. magic_pdf/pre_proc/detect_header.py +0 -64
  60. magic_pdf/pre_proc/detect_images.py +0 -647
  61. magic_pdf/pre_proc/detect_page_number.py +0 -64
  62. magic_pdf/pre_proc/detect_tables.py +0 -62
  63. magic_pdf/pre_proc/equations_replace.py +0 -550
  64. magic_pdf/pre_proc/fix_image.py +0 -244
  65. magic_pdf/pre_proc/fix_table.py +0 -270
  66. magic_pdf/pre_proc/main_text_font.py +0 -23
  67. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  68. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  69. magic_pdf/pre_proc/post_layout_split.py +0 -0
  70. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  71. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  72. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  73. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  74. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  75. magic_pdf/pre_proc/statistics.py +0 -12
  76. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
  77. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
  78. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
  79. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,6 @@ from loguru import logger
5
5
  from magic_pdf.config.make_content_config import DropMode, MakeMode
6
6
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
7
  from magic_pdf.libs.commons import join_path
8
- from magic_pdf.libs.language import detect_lang
9
8
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
10
9
  from magic_pdf.para.para_split_v3 import ListLineTag
11
10
 
@@ -30,6 +29,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
30
29
  for page_info in pdf_info_dict:
31
30
  paras_of_layout = page_info.get('para_blocks')
32
31
  if not paras_of_layout:
32
+ markdown_with_para_and_pagination.append({
33
+ 'page_no':
34
+ page_no,
35
+ 'md_content':
36
+ '',
37
+ })
38
+ page_no += 1
33
39
  continue
34
40
  page_markdown = ocr_mk_markdown_with_para_core_v2(
35
41
  paras_of_layout, 'mm', img_buket_path)
@@ -136,14 +142,11 @@ def merge_para_with_text(para_block):
136
142
  para_text += ' \n'
137
143
 
138
144
  line_text = ''
139
- line_lang = ''
140
145
  for span in line['spans']:
141
146
  span_type = span['type']
142
147
  if span_type == ContentType.Text:
143
148
  line_text += span['content'].strip()
144
149
 
145
- if line_text != '':
146
- line_lang = detect_lang(line_text)
147
150
  for j, span in enumerate(line['spans']):
148
151
 
149
152
  span_type = span['type']
@@ -157,27 +160,18 @@ def merge_para_with_text(para_block):
157
160
 
158
161
  content = content.strip()
159
162
  if content != '':
160
- langs = ['zh', 'ja', 'ko']
161
- if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
162
- if span_type in [ContentType.Text, ContentType.InterlineEquation]:
163
- para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
164
- elif span_type == ContentType.InlineEquation:
165
- para_text += f' {content} '
166
- else:
167
- if span_type in [ContentType.Text, ContentType.InlineEquation]:
168
- # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
169
- if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
170
- para_text += content[:-1]
171
- elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
172
- para_text += content
173
- else: # 西方文本语境下 content间需要空格分隔
174
- para_text += f'{content} '
175
- elif span_type == ContentType.InterlineEquation:
176
- para_text += content
163
+ if span_type in [ContentType.Text, ContentType.InlineEquation]:
164
+ # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
165
+ if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
166
+ para_text += content[:-1]
167
+ else: # content间需要空格分隔
168
+ para_text += f'{content} '
169
+ elif span_type == ContentType.InterlineEquation:
170
+ para_text += content
177
171
  else:
178
172
  continue
179
173
  # 连写字符拆分
180
- para_text = __replace_ligatures(para_text)
174
+ # para_text = __replace_ligatures(para_text)
181
175
 
182
176
  return para_text
183
177
 
@@ -1,15 +1,14 @@
1
1
  """输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
2
2
 
3
- import sys
4
3
  from collections import Counter
5
4
 
6
- import click
5
+ import fitz
7
6
  from loguru import logger
8
7
 
9
8
  from magic_pdf.config.drop_reason import DropReason
10
- from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
9
+ from magic_pdf.libs.commons import get_top_percent_list, mymax
11
10
  from magic_pdf.libs.language import detect_lang
12
- from magic_pdf.libs.pdf_check import detect_invalid_chars
11
+ from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
13
12
 
14
13
  scan_max_page = 50
15
14
  junk_limit_min = 10
@@ -324,7 +323,7 @@ def get_language(doc: fitz.Document):
324
323
 
325
324
  def check_invalid_chars(pdf_bytes):
326
325
  """乱码检测."""
327
- return detect_invalid_chars(pdf_bytes)
326
+ return detect_invalid_chars_by_pymupdf(pdf_bytes)
328
327
 
329
328
 
330
329
  def pdf_meta_scan(pdf_bytes: bytes):
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
384
383
  return res
385
384
 
386
385
 
387
- @click.command()
388
- @click.option('--s3-pdf-path', help='s3上pdf文件的路径')
389
- @click.option('--s3-profile', help='s3上的profile')
390
- def main(s3_pdf_path: str, s3_profile: str):
391
- """"""
392
- try:
393
- file_content = read_file(s3_pdf_path, s3_profile)
394
- pdf_meta_scan(file_content)
395
- except Exception as e:
396
- print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
397
- logger.exception(e)
398
-
399
-
400
386
  if __name__ == '__main__':
401
- main()
387
+ pass
402
388
  # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
403
389
  # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
404
390
  # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
magic_pdf/libs/commons.py CHANGED
@@ -1,34 +1,8 @@
1
- import datetime
2
- import json
3
- import os, re, configparser
4
- import subprocess
5
- import time
6
-
7
- import boto3
8
- from loguru import logger
9
- from boto3.s3.transfer import TransferConfig
10
- from botocore.config import Config
11
-
12
- import fitz # 1.23.9中已经切换到rebase
13
- # import fitz_old as fitz # 使用1.23.9之前的pymupdf库
14
-
15
-
16
- def get_delta_time(input_time):
17
- return round(time.time() - input_time, 2)
18
-
19
1
 
20
2
  def join_path(*args):
21
3
  return '/'.join(str(s).rstrip('/') for s in args)
22
4
 
23
5
 
24
- #配置全局的errlog_path,方便demo同步引用
25
- error_log_path = "s3://llm-pdf-text/err_logs/"
26
- # json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
27
- json_dump_path = "s3://llm-pdf-text/json_dump/"
28
-
29
- # s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
30
-
31
-
32
6
  def get_top_percent_list(num_list, percent):
33
7
  """
34
8
  获取列表中前百分之多少的元素
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
48
22
  return top_percent_list
49
23
 
50
24
 
51
- def formatted_time(time_stamp):
52
- dt_object = datetime.datetime.fromtimestamp(time_stamp)
53
- output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
54
- return output_time
55
-
56
-
57
25
  def mymax(alist: list):
58
26
  if len(alist) == 0:
59
27
  return 0 # 空是0, 0*0也是0大小q
60
28
  else:
61
29
  return max(alist)
62
30
 
63
- def parse_aws_param(profile):
64
- if isinstance(profile, str):
65
- # 解析配置文件
66
- config_file = join_path(os.path.expanduser("~"), ".aws", "config")
67
- credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
68
- config = configparser.ConfigParser()
69
- config.read(credentials_file)
70
- config.read(config_file)
71
- # 获取 AWS 账户相关信息
72
- ak = config.get(profile, "aws_access_key_id")
73
- sk = config.get(profile, "aws_secret_access_key")
74
- if profile == "default":
75
- s3_str = config.get(f"{profile}", "s3")
76
- else:
77
- s3_str = config.get(f"profile {profile}", "s3")
78
- end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
79
- if end_match:
80
- endpoint = end_match.group(1)
81
- else:
82
- raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
83
- style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
84
- if style_match:
85
- addressing_style = style_match.group(1)
86
- else:
87
- addressing_style = "path"
88
- elif isinstance(profile, dict):
89
- ak = profile["ak"]
90
- sk = profile["sk"]
91
- endpoint = profile["endpoint"]
92
- addressing_style = "auto"
93
-
94
- return ak, sk, endpoint, addressing_style
95
-
96
31
 
97
32
  def parse_bucket_key(s3_full_path: str):
98
33
  """
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
106
41
  s3_full_path = s3_full_path[1:]
107
42
  bucket, key = s3_full_path.split("/", 1)
108
43
  return bucket, key
109
-
110
-
111
- def read_file(pdf_path: str, s3_profile):
112
- if pdf_path.startswith("s3://"):
113
- ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
114
- cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
115
- config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
116
- bucket_name, bucket_key = parse_bucket_key(pdf_path)
117
- res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
118
- file_content = res["Body"].read()
119
- return file_content
120
- else:
121
- with open(pdf_path, "rb") as f:
122
- return f.read()
123
-
124
-
125
- def get_docx_model_output(pdf_model_output, page_id):
126
-
127
- model_output_json = pdf_model_output[page_id]
128
-
129
- return model_output_json
130
-
131
-
132
- def list_dir(dir_path:str, s3_profile:str):
133
- """
134
- 列出dir_path下的所有文件
135
- """
136
- ret = []
137
-
138
- if dir_path.startswith("s3"):
139
- ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
140
- s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
141
- bucket, path = s3info[0][0], s3info[0][1]
142
- try:
143
- cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
144
- config=Config(s3={'addressing_style': addressing_style}))
145
- def list_obj_scluster():
146
- marker = None
147
- while True:
148
- list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
149
- if marker:
150
- list_kwargs['Marker'] = marker
151
- response = cli.list_objects(**list_kwargs)
152
- contents = response.get("Contents", [])
153
- yield from contents
154
- if not response.get("IsTruncated") or len(contents)==0:
155
- break
156
- marker = contents[-1]['Key']
157
-
158
-
159
- for info in list_obj_scluster():
160
- file_path = info['Key']
161
- #size = info['Size']
162
-
163
- if path!="":
164
- afile = file_path[len(path):]
165
- if afile.endswith(".json"):
166
- ret.append(f"s3://{bucket}/{file_path}")
167
-
168
- return ret
169
-
170
- except Exception as e:
171
- logger.exception(e)
172
- exit(-1)
173
- else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件
174
-
175
- for root, dirs, files in os.walk(dir_path):
176
- for file in files:
177
- if file.endswith(".json"):
178
- ret.append(join_path(root, file))
179
- ret.sort()
180
- return ret
181
-
182
- def get_img_s3_client(save_path:str, image_s3_config:str):
183
- """
184
- """
185
- if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client
186
- ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
187
- img_s3_client = boto3.client(
188
- service_name="s3",
189
- aws_access_key_id=ak,
190
- aws_secret_access_key=sk,
191
- endpoint_url=end_point,
192
- config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
193
- )
194
- else:
195
- img_s3_client = None
196
-
197
- return img_s3_client
198
-
199
- if __name__=="__main__":
200
- s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
201
- s3_profile = "langchao"
202
- ret = list_dir(s3_path, s3_profile)
203
- print(ret)
204
-
@@ -1,8 +1,7 @@
1
+ import fitz
1
2
  from magic_pdf.config.constants import CROSS_PAGE
2
- from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
3
- ContentType)
3
+ from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
4
4
  from magic_pdf.data.dataset import PymuDocDataset
5
- from magic_pdf.libs.commons import fitz # PyMuPDF
6
5
  from magic_pdf.model.magic_model import MagicModel
7
6
 
8
7
 
@@ -1,24 +1,3 @@
1
- import re
2
-
3
-
4
- def escape_special_markdown_char(pymu_blocks):
5
- """
6
- 转义正文里对markdown语法有特殊意义的字符
7
- """
8
- special_chars = ["*", "`", "~", "$"]
9
- for blk in pymu_blocks:
10
- for line in blk['lines']:
11
- for span in line['spans']:
12
- for char in special_chars:
13
- span_text = span['text']
14
- span_type = span.get("_type", None)
15
- if span_type in ['inline-equation', 'interline-equation']:
16
- continue
17
- elif span_text:
18
- span['text'] = span['text'].replace(char, "\\" + char)
19
-
20
- return pymu_blocks
21
-
22
1
 
23
2
  def ocr_escape_special_markdown_char(content):
24
3
  """
@@ -1,9 +1,9 @@
1
- from io import BytesIO
2
- import re
3
1
  import fitz
4
2
  import numpy as np
5
3
  from loguru import logger
6
- from pdfminer.high_level import extract_text
4
+ # import re
5
+ # from io import BytesIO
6
+ # from pdfminer.high_level import extract_text
7
7
 
8
8
 
9
9
  def calculate_sample_count(total_page: int):
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
14
14
  return select_page_cnt
15
15
 
16
16
 
17
- def extract_pages(src_pdf_bytes: bytes):
17
+ def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
18
18
  pdf_docs = fitz.open("pdf", src_pdf_bytes)
19
19
  total_page = len(pdf_docs)
20
20
  if total_page == 0:
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
33
33
  return sample_docs
34
34
 
35
35
 
36
- def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
37
- """"
38
- 检测PDF中是否包含非法字符
36
+ # def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
37
+ # """"
38
+ # 检测PDF中是否包含非法字符
39
+ # """
40
+ # '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
41
+ # sample_docs = extract_pages(src_pdf_bytes)
42
+ # sample_pdf_bytes = sample_docs.tobytes()
43
+ # sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
44
+ # text = extract_text(sample_pdf_file_like_object)
45
+ # text = text.replace("\n", "")
46
+ # # logger.info(text)
47
+ # '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
48
+ # cid_pattern = re.compile(r'\(cid:\d+\)')
49
+ # matches = cid_pattern.findall(text)
50
+ # cid_count = len(matches)
51
+ # cid_len = sum(len(match) for match in matches)
52
+ # text_len = len(text)
53
+ # if text_len == 0:
54
+ # cid_chars_radio = 0
55
+ # else:
56
+ # cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
57
+ # logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
58
+ # '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
59
+ # if cid_chars_radio > 0.05:
60
+ # return False # 乱码文档
61
+ # else:
62
+ # return True # 正常文档
63
+
64
+
65
+ def count_replacement_characters(text: str) -> int:
66
+ """
67
+ 统计字符串中 0xfffd 字符的数量。
39
68
  """
40
- '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
69
+ return text.count('\ufffd')
70
+
71
+
72
+ def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
41
73
  sample_docs = extract_pages(src_pdf_bytes)
42
- sample_pdf_bytes = sample_docs.tobytes()
43
- sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
44
- text = extract_text(sample_pdf_file_like_object)
45
- text = text.replace("\n", "")
46
- # logger.info(text)
47
- '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
48
- cid_pattern = re.compile(r'\(cid:\d+\)')
49
- matches = cid_pattern.findall(text)
50
- cid_count = len(matches)
51
- cid_len = sum(len(match) for match in matches)
52
- text_len = len(text)
74
+ doc_text = ""
75
+ for page in sample_docs:
76
+ page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
77
+ doc_text += page_text
78
+ text_len = len(doc_text)
79
+ uffd_count = count_replacement_characters(doc_text)
53
80
  if text_len == 0:
54
- cid_chars_radio = 0
81
+ uffd_chars_radio = 0
55
82
  else:
56
- cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
57
- logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
58
- '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
59
- if cid_chars_radio > 0.05:
83
+ uffd_chars_radio = uffd_count / text_len
84
+ logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
85
+ '''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
86
+ if uffd_chars_radio > 0.01:
60
87
  return False # 乱码文档
61
88
  else:
62
- return True # 正常文档
89
+ return True # 正常文档
@@ -1,9 +1,10 @@
1
1
  from io import BytesIO
2
2
  import cv2
3
+ import fitz
3
4
  import numpy as np
4
5
  from PIL import Image
5
6
  from magic_pdf.data.data_reader_writer import DataWriter
6
- from magic_pdf.libs.commons import fitz, join_path
7
+ from magic_pdf.libs.commons import join_path
7
8
  from magic_pdf.libs.hash_utils import compute_sha256
8
9
 
9
10
 
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.10.1"
1
+ __version__ = "0.10.3"
@@ -46,8 +46,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
46
46
  mat = fitz.Matrix(dpi / 72, dpi / 72)
47
47
  pm = page.get_pixmap(matrix=mat, alpha=False)
48
48
 
49
- # If the width or height exceeds 9000 after scaling, do not scale further.
50
- if pm.width > 9000 or pm.height > 9000:
49
+ # If the width or height exceeds 4500 after scaling, do not scale further.
50
+ if pm.width > 4500 or pm.height > 4500:
51
51
  pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
52
52
 
53
53
  img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
@@ -1,16 +1,12 @@
1
1
  import enum
2
- import json
3
2
 
4
3
  from magic_pdf.config.model_block_type import ModelBlockTypeEnum
5
4
  from magic_pdf.config.ocr_content_type import CategoryId, ContentType
6
- from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
7
- FileBasedDataWriter)
8
5
  from magic_pdf.data.dataset import Dataset
9
6
  from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
10
7
  bbox_relative_pos, box_area, calculate_iou,
11
8
  calculate_overlap_area_in_bbox1_area_ratio,
12
9
  get_overlap_area)
13
- from magic_pdf.libs.commons import fitz, join_path
14
10
  from magic_pdf.libs.coordinate_transform import get_scale_ratio
15
11
  from magic_pdf.libs.local_math import float_gt
16
12
  from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
@@ -1048,29 +1044,3 @@ class MagicModel:
1048
1044
  def get_model_list(self, page_no):
1049
1045
  return self.__model_list[page_no]
1050
1046
 
1051
-
1052
- if __name__ == '__main__':
1053
- drw = FileBasedDataReader(r'D:/project/20231108code-clean')
1054
- if 0:
1055
- pdf_file_path = r'linshixuqiu\19983-00.pdf'
1056
- model_file_path = r'linshixuqiu\19983-00_new.json'
1057
- pdf_bytes = drw.read(pdf_file_path)
1058
- model_json_txt = drw.read(model_file_path).decode()
1059
- model_list = json.loads(model_json_txt)
1060
- write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
1061
- img_bucket_path = 'imgs'
1062
- img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
1063
- pdf_docs = fitz.open('pdf', pdf_bytes)
1064
- magic_model = MagicModel(model_list, pdf_docs)
1065
-
1066
- if 1:
1067
- from magic_pdf.data.dataset import PymuDocDataset
1068
-
1069
- model_list = json.loads(
1070
- drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
1071
- )
1072
- pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
1073
-
1074
- magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
1075
- for i in range(7):
1076
- print(magic_model.get_imgs(i))
@@ -18,11 +18,31 @@ def region_to_bbox(region):
18
18
 
19
19
 
20
20
  class CustomPaddleModel:
21
- def __init__(self, ocr: bool = False, show_log: bool = False, lang=None):
21
+ def __init__(self,
22
+ ocr: bool = False,
23
+ show_log: bool = False,
24
+ lang=None,
25
+ det_db_box_thresh=0.3,
26
+ use_dilation=True,
27
+ det_db_unclip_ratio=1.8
28
+ ):
22
29
  if lang is not None:
23
- self.model = PPStructure(table=False, ocr=ocr, show_log=show_log, lang=lang)
30
+ self.model = PPStructure(table=False,
31
+ ocr=True,
32
+ show_log=show_log,
33
+ lang=lang,
34
+ det_db_box_thresh=det_db_box_thresh,
35
+ use_dilation=use_dilation,
36
+ det_db_unclip_ratio=det_db_unclip_ratio,
37
+ )
24
38
  else:
25
- self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
39
+ self.model = PPStructure(table=False,
40
+ ocr=True,
41
+ show_log=show_log,
42
+ det_db_box_thresh=det_db_box_thresh,
43
+ use_dilation=use_dilation,
44
+ det_db_unclip_ratio=det_db_unclip_ratio,
45
+ )
26
46
 
27
47
  def __call__(self, img):
28
48
  try:
@@ -1,11 +1,55 @@
1
- import math
2
-
1
+ import cv2
3
2
  import numpy as np
4
3
  from loguru import logger
5
-
4
+ from io import BytesIO
5
+ from PIL import Image
6
+ import base64
6
7
  from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
7
8
  from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
8
9
 
10
+ from ppocr.utils.utility import check_and_read
11
+
12
+
13
+ def img_decode(content: bytes):
14
+ np_arr = np.frombuffer(content, dtype=np.uint8)
15
+ return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
16
+
17
+
18
+ def check_img(img):
19
+ if isinstance(img, bytes):
20
+ img = img_decode(img)
21
+ if isinstance(img, str):
22
+ image_file = img
23
+ img, flag_gif, flag_pdf = check_and_read(image_file)
24
+ if not flag_gif and not flag_pdf:
25
+ with open(image_file, 'rb') as f:
26
+ img_str = f.read()
27
+ img = img_decode(img_str)
28
+ if img is None:
29
+ try:
30
+ buf = BytesIO()
31
+ image = BytesIO(img_str)
32
+ im = Image.open(image)
33
+ rgb = im.convert('RGB')
34
+ rgb.save(buf, 'jpeg')
35
+ buf.seek(0)
36
+ image_bytes = buf.read()
37
+ data_base64 = str(base64.b64encode(image_bytes),
38
+ encoding="utf-8")
39
+ image_decode = base64.b64decode(data_base64)
40
+ img_array = np.frombuffer(image_decode, np.uint8)
41
+ img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
42
+ except:
43
+ logger.error("error in loading image:{}".format(image_file))
44
+ return None
45
+ if img is None:
46
+ logger.error("error in loading image:{}".format(image_file))
47
+ return None
48
+ if isinstance(img, np.ndarray) and len(img.shape) == 2:
49
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
50
+
51
+ return img
52
+
9
53
 
10
54
  def bbox_to_points(bbox):
11
55
  """ 将bbox格式转换为四个顶点的数组 """
@@ -214,6 +258,9 @@ def get_ocr_result_list(ocr_res, useful_list):
214
258
  if len(box_ocr_res) == 2:
215
259
  p1, p2, p3, p4 = box_ocr_res[0]
216
260
  text, score = box_ocr_res[1]
261
+ # logger.info(f"text: {text}, score: {score}")
262
+ if score < 0.6: # 过滤低置信度的结果
263
+ continue
217
264
  else:
218
265
  p1, p2, p3, p4 = box_ocr_res
219
266
  text, score = "", 1
@@ -249,32 +296,6 @@ def get_ocr_result_list(ocr_res, useful_list):
249
296
  return ocr_result_list
250
297
 
251
298
 
252
- def calculate_angle_degrees(poly):
253
- # 定义对角线的顶点
254
- diagonal1 = (poly[0], poly[2])
255
- diagonal2 = (poly[1], poly[3])
256
-
257
- # 计算对角线的斜率
258
- def slope(p1, p2):
259
- return (p2[1] - p1[1]) / (p2[0] - p1[0]) if p2[0] != p1[0] else float('inf')
260
-
261
- slope1 = slope(diagonal1[0], diagonal1[1])
262
- slope2 = slope(diagonal2[0], diagonal2[1])
263
-
264
- # 计算对角线与x轴的夹角(以弧度为单位)
265
- angle1_radians = math.atan(slope1)
266
- angle2_radians = math.atan(slope2)
267
-
268
- # 将弧度转换为角度
269
- angle1_degrees = math.degrees(angle1_radians)
270
- angle2_degrees = math.degrees(angle2_radians)
271
-
272
- # 取两条对角线与x轴夹角的平均值
273
- average_angle_degrees = abs((angle1_degrees + angle2_degrees) / 2)
274
- # logger.info(f"average_angle_degrees: {average_angle_degrees}")
275
- return average_angle_degrees
276
-
277
-
278
299
  def calculate_is_angle(poly):
279
300
  p1, p2, p3, p4 = poly
280
301
  height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2