magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -55,5 +55,8 @@ class FileBasedDataWriter(DataWriter):
55
55
  if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
56
56
  fn_path = os.path.join(self._parent_dir, path)
57
57
 
58
+ if not os.path.exists(os.path.dirname(fn_path)):
59
+ os.makedirs(os.path.dirname(fn_path), exist_ok=True)
60
+
58
61
  with open(fn_path, 'wb') as f:
59
62
  f.write(data)
@@ -1,13 +1,12 @@
1
1
  """输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
2
2
 
3
- import sys
4
3
  from collections import Counter
5
4
 
6
- import click
5
+ import fitz
7
6
  from loguru import logger
8
7
 
9
8
  from magic_pdf.config.drop_reason import DropReason
10
- from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
9
+ from magic_pdf.libs.commons import get_top_percent_list, mymax
11
10
  from magic_pdf.libs.language import detect_lang
12
11
  from magic_pdf.libs.pdf_check import detect_invalid_chars
13
12
 
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
384
383
  return res
385
384
 
386
385
 
387
- @click.command()
388
- @click.option('--s3-pdf-path', help='s3上pdf文件的路径')
389
- @click.option('--s3-profile', help='s3上的profile')
390
- def main(s3_pdf_path: str, s3_profile: str):
391
- """"""
392
- try:
393
- file_content = read_file(s3_pdf_path, s3_profile)
394
- pdf_meta_scan(file_content)
395
- except Exception as e:
396
- print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
397
- logger.exception(e)
398
-
399
-
400
386
  if __name__ == '__main__':
401
- main()
387
+ pass
402
388
  # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
403
389
  # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
404
390
  # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
magic_pdf/libs/commons.py CHANGED
@@ -1,34 +1,8 @@
1
- import datetime
2
- import json
3
- import os, re, configparser
4
- import subprocess
5
- import time
6
-
7
- import boto3
8
- from loguru import logger
9
- from boto3.s3.transfer import TransferConfig
10
- from botocore.config import Config
11
-
12
- import fitz # 1.23.9中已经切换到rebase
13
- # import fitz_old as fitz # 使用1.23.9之前的pymupdf库
14
-
15
-
16
- def get_delta_time(input_time):
17
- return round(time.time() - input_time, 2)
18
-
19
1
 
20
2
  def join_path(*args):
21
3
  return '/'.join(str(s).rstrip('/') for s in args)
22
4
 
23
5
 
24
- #配置全局的errlog_path,方便demo同步引用
25
- error_log_path = "s3://llm-pdf-text/err_logs/"
26
- # json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
27
- json_dump_path = "s3://llm-pdf-text/json_dump/"
28
-
29
- # s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
30
-
31
-
32
6
  def get_top_percent_list(num_list, percent):
33
7
  """
34
8
  获取列表中前百分之多少的元素
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
48
22
  return top_percent_list
49
23
 
50
24
 
51
- def formatted_time(time_stamp):
52
- dt_object = datetime.datetime.fromtimestamp(time_stamp)
53
- output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
54
- return output_time
55
-
56
-
57
25
  def mymax(alist: list):
58
26
  if len(alist) == 0:
59
27
  return 0 # 空是0, 0*0也是0大小q
60
28
  else:
61
29
  return max(alist)
62
30
 
63
- def parse_aws_param(profile):
64
- if isinstance(profile, str):
65
- # 解析配置文件
66
- config_file = join_path(os.path.expanduser("~"), ".aws", "config")
67
- credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
68
- config = configparser.ConfigParser()
69
- config.read(credentials_file)
70
- config.read(config_file)
71
- # 获取 AWS 账户相关信息
72
- ak = config.get(profile, "aws_access_key_id")
73
- sk = config.get(profile, "aws_secret_access_key")
74
- if profile == "default":
75
- s3_str = config.get(f"{profile}", "s3")
76
- else:
77
- s3_str = config.get(f"profile {profile}", "s3")
78
- end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
79
- if end_match:
80
- endpoint = end_match.group(1)
81
- else:
82
- raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
83
- style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
84
- if style_match:
85
- addressing_style = style_match.group(1)
86
- else:
87
- addressing_style = "path"
88
- elif isinstance(profile, dict):
89
- ak = profile["ak"]
90
- sk = profile["sk"]
91
- endpoint = profile["endpoint"]
92
- addressing_style = "auto"
93
-
94
- return ak, sk, endpoint, addressing_style
95
-
96
31
 
97
32
  def parse_bucket_key(s3_full_path: str):
98
33
  """
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
106
41
  s3_full_path = s3_full_path[1:]
107
42
  bucket, key = s3_full_path.split("/", 1)
108
43
  return bucket, key
109
-
110
-
111
- def read_file(pdf_path: str, s3_profile):
112
- if pdf_path.startswith("s3://"):
113
- ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
114
- cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
115
- config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
116
- bucket_name, bucket_key = parse_bucket_key(pdf_path)
117
- res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
118
- file_content = res["Body"].read()
119
- return file_content
120
- else:
121
- with open(pdf_path, "rb") as f:
122
- return f.read()
123
-
124
-
125
- def get_docx_model_output(pdf_model_output, page_id):
126
-
127
- model_output_json = pdf_model_output[page_id]
128
-
129
- return model_output_json
130
-
131
-
132
- def list_dir(dir_path:str, s3_profile:str):
133
- """
134
- 列出dir_path下的所有文件
135
- """
136
- ret = []
137
-
138
- if dir_path.startswith("s3"):
139
- ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
140
- s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
141
- bucket, path = s3info[0][0], s3info[0][1]
142
- try:
143
- cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
144
- config=Config(s3={'addressing_style': addressing_style}))
145
- def list_obj_scluster():
146
- marker = None
147
- while True:
148
- list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
149
- if marker:
150
- list_kwargs['Marker'] = marker
151
- response = cli.list_objects(**list_kwargs)
152
- contents = response.get("Contents", [])
153
- yield from contents
154
- if not response.get("IsTruncated") or len(contents)==0:
155
- break
156
- marker = contents[-1]['Key']
157
-
158
-
159
- for info in list_obj_scluster():
160
- file_path = info['Key']
161
- #size = info['Size']
162
-
163
- if path!="":
164
- afile = file_path[len(path):]
165
- if afile.endswith(".json"):
166
- ret.append(f"s3://{bucket}/{file_path}")
167
-
168
- return ret
169
-
170
- except Exception as e:
171
- logger.exception(e)
172
- exit(-1)
173
- else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件
174
-
175
- for root, dirs, files in os.walk(dir_path):
176
- for file in files:
177
- if file.endswith(".json"):
178
- ret.append(join_path(root, file))
179
- ret.sort()
180
- return ret
181
-
182
- def get_img_s3_client(save_path:str, image_s3_config:str):
183
- """
184
- """
185
- if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client
186
- ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
187
- img_s3_client = boto3.client(
188
- service_name="s3",
189
- aws_access_key_id=ak,
190
- aws_secret_access_key=sk,
191
- endpoint_url=end_point,
192
- config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
193
- )
194
- else:
195
- img_s3_client = None
196
-
197
- return img_s3_client
198
-
199
- if __name__=="__main__":
200
- s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
201
- s3_profile = "langchao"
202
- ret = list_dir(s3_path, s3_profile)
203
- print(ret)
204
-
@@ -1,8 +1,7 @@
1
+ import fitz
1
2
  from magic_pdf.config.constants import CROSS_PAGE
2
- from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
3
- ContentType)
3
+ from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
4
4
  from magic_pdf.data.dataset import PymuDocDataset
5
- from magic_pdf.libs.commons import fitz # PyMuPDF
6
5
  from magic_pdf.model.magic_model import MagicModel
7
6
 
8
7
 
@@ -1,24 +1,3 @@
1
- import re
2
-
3
-
4
- def escape_special_markdown_char(pymu_blocks):
5
- """
6
- 转义正文里对markdown语法有特殊意义的字符
7
- """
8
- special_chars = ["*", "`", "~", "$"]
9
- for blk in pymu_blocks:
10
- for line in blk['lines']:
11
- for span in line['spans']:
12
- for char in special_chars:
13
- span_text = span['text']
14
- span_type = span.get("_type", None)
15
- if span_type in ['inline-equation', 'interline-equation']:
16
- continue
17
- elif span_text:
18
- span['text'] = span['text'].replace(char, "\\" + char)
19
-
20
- return pymu_blocks
21
-
22
1
 
23
2
  def ocr_escape_special_markdown_char(content):
24
3
  """
@@ -1,9 +1,10 @@
1
1
  from io import BytesIO
2
2
  import cv2
3
+ import fitz
3
4
  import numpy as np
4
5
  from PIL import Image
5
6
  from magic_pdf.data.data_reader_writer import DataWriter
6
- from magic_pdf.libs.commons import fitz, join_path
7
+ from magic_pdf.libs.commons import join_path
7
8
  from magic_pdf.libs.hash_utils import compute_sha256
8
9
 
9
10
 
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.10.0"
1
+ __version__ = "0.10.2"
@@ -46,8 +46,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
46
46
  mat = fitz.Matrix(dpi / 72, dpi / 72)
47
47
  pm = page.get_pixmap(matrix=mat, alpha=False)
48
48
 
49
- # If the width or height exceeds 9000 after scaling, do not scale further.
50
- if pm.width > 9000 or pm.height > 9000:
49
+ # If the width or height exceeds 4500 after scaling, do not scale further.
50
+ if pm.width > 4500 or pm.height > 4500:
51
51
  pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
52
52
 
53
53
  img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
@@ -1,16 +1,12 @@
1
1
  import enum
2
- import json
3
2
 
4
3
  from magic_pdf.config.model_block_type import ModelBlockTypeEnum
5
4
  from magic_pdf.config.ocr_content_type import CategoryId, ContentType
6
- from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
7
- FileBasedDataWriter)
8
5
  from magic_pdf.data.dataset import Dataset
9
6
  from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
10
7
  bbox_relative_pos, box_area, calculate_iou,
11
8
  calculate_overlap_area_in_bbox1_area_ratio,
12
9
  get_overlap_area)
13
- from magic_pdf.libs.commons import fitz, join_path
14
10
  from magic_pdf.libs.coordinate_transform import get_scale_ratio
15
11
  from magic_pdf.libs.local_math import float_gt
16
12
  from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
@@ -1048,29 +1044,3 @@ class MagicModel:
1048
1044
  def get_model_list(self, page_no):
1049
1045
  return self.__model_list[page_no]
1050
1046
 
1051
-
1052
- if __name__ == '__main__':
1053
- drw = FileBasedDataReader(r'D:/project/20231108code-clean')
1054
- if 0:
1055
- pdf_file_path = r'linshixuqiu\19983-00.pdf'
1056
- model_file_path = r'linshixuqiu\19983-00_new.json'
1057
- pdf_bytes = drw.read(pdf_file_path)
1058
- model_json_txt = drw.read(model_file_path).decode()
1059
- model_list = json.loads(model_json_txt)
1060
- write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
1061
- img_bucket_path = 'imgs'
1062
- img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
1063
- pdf_docs = fitz.open('pdf', pdf_bytes)
1064
- magic_model = MagicModel(model_list, pdf_docs)
1065
-
1066
- if 1:
1067
- from magic_pdf.data.dataset import PymuDocDataset
1068
-
1069
- model_list = json.loads(
1070
- drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
1071
- )
1072
- pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
1073
-
1074
- magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
1075
- for i in range(7):
1076
- print(magic_model.get_imgs(i))
@@ -1,5 +1,3 @@
1
- import math
2
-
3
1
  import numpy as np
4
2
  from loguru import logger
5
3
 
@@ -214,6 +212,9 @@ def get_ocr_result_list(ocr_res, useful_list):
214
212
  if len(box_ocr_res) == 2:
215
213
  p1, p2, p3, p4 = box_ocr_res[0]
216
214
  text, score = box_ocr_res[1]
215
+ # logger.info(f"text: {text}, score: {score}")
216
+ if score < 0.6: # 过滤低置信度的结果
217
+ continue
217
218
  else:
218
219
  p1, p2, p3, p4 = box_ocr_res
219
220
  text, score = "", 1
@@ -249,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list):
249
250
  return ocr_result_list
250
251
 
251
252
 
252
- def calculate_angle_degrees(poly):
253
- # 定义对角线的顶点
254
- diagonal1 = (poly[0], poly[2])
255
- diagonal2 = (poly[1], poly[3])
256
-
257
- # 计算对角线的斜率
258
- def slope(p1, p2):
259
- return (p2[1] - p1[1]) / (p2[0] - p1[0]) if p2[0] != p1[0] else float('inf')
260
-
261
- slope1 = slope(diagonal1[0], diagonal1[1])
262
- slope2 = slope(diagonal2[0], diagonal2[1])
263
-
264
- # 计算对角线与x轴的夹角(以弧度为单位)
265
- angle1_radians = math.atan(slope1)
266
- angle2_radians = math.atan(slope2)
267
-
268
- # 将弧度转换为角度
269
- angle1_degrees = math.degrees(angle1_radians)
270
- angle2_degrees = math.degrees(angle2_radians)
271
-
272
- # 取两条对角线与x轴夹角的平均值
273
- average_angle_degrees = abs((angle1_degrees + angle2_degrees) / 2)
274
- # logger.info(f"average_angle_degrees: {average_angle_degrees}")
275
- return average_angle_degrees
276
-
277
-
278
253
  def calculate_is_angle(poly):
279
254
  p1, p2, p3, p4 = poly
280
255
  height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2
@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR):
63
63
 
64
64
  if det and rec:
65
65
  ocr_res = []
66
- for idx, img in enumerate(imgs):
66
+ for img in imgs:
67
67
  img = preprocess_image(img)
68
68
  dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
69
69
  if not dt_boxes and not rec_res:
@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR):
75
75
  return ocr_res
76
76
  elif det and not rec:
77
77
  ocr_res = []
78
- for idx, img in enumerate(imgs):
78
+ for img in imgs:
79
79
  img = preprocess_image(img)
80
80
  dt_boxes, elapse = self.text_detector(img)
81
81
  if dt_boxes is None:
@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR):
96
96
  else:
97
97
  ocr_res = []
98
98
  cls_res = []
99
- for idx, img in enumerate(imgs):
99
+ for img in imgs:
100
100
  if not isinstance(img, list):
101
101
  img = preprocess_image(img)
102
102
  img = [img]
@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
271
271
  first_span = first_line['spans'][0]
272
272
  if len(first_span['content']) > 0:
273
273
  span_start_with_num = first_span['content'][0].isdigit()
274
+ span_start_with_big_char = first_span['content'][0].isupper()
274
275
  if (
275
- abs(block2['bbox_fs'][2] - last_line['bbox'][2])
276
- < line_height
276
+ # 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
277
+ abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
278
+ # 上一个block的最后一个span不是以特定符号结尾
277
279
  and not last_span['content'].endswith(LINE_STOP_FLAG)
278
280
  # 两个block宽度差距超过2倍也不合并
279
281
  and abs(block1_weight - block2_weight) < min_block_weight
282
+ # 下一个block的第一个字符是数字
280
283
  and not span_start_with_num
284
+ # 下一个block的第一个字符是大写字母
285
+ and not span_start_with_big_char
281
286
  ):
282
287
  if block1['page_num'] != block2['page_num']:
283
288
  for line in block1['lines']: