magic-pdf 0.10.3__py3-none-any.whl → 0.10.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ from loguru import logger
5
5
  from magic_pdf.config.make_content_config import DropMode, MakeMode
6
6
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
7
  from magic_pdf.libs.commons import join_path
8
+ from magic_pdf.libs.language import detect_lang
8
9
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
9
10
  from magic_pdf.para.para_split_v3 import ListLineTag
10
11
 
@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
135
136
 
136
137
 
137
138
  def merge_para_with_text(para_block):
139
+ block_text = ''
140
+ for line in para_block['lines']:
141
+ for span in line['spans']:
142
+ if span['type'] in [ContentType.Text]:
143
+ block_text += span['content']
144
+ block_lang = detect_lang(block_text)
145
+
138
146
  para_text = ''
139
147
  for i, line in enumerate(para_block['lines']):
140
148
 
141
149
  if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
142
150
  para_text += ' \n'
143
151
 
144
- line_text = ''
145
- for span in line['spans']:
146
- span_type = span['type']
147
- if span_type == ContentType.Text:
148
- line_text += span['content'].strip()
149
-
150
152
  for j, span in enumerate(line['spans']):
151
153
 
152
154
  span_type = span['type']
@@ -159,15 +161,24 @@ def merge_para_with_text(para_block):
159
161
  content = f"\n$$\n{span['content']}\n$$\n"
160
162
 
161
163
  content = content.strip()
162
- if content != '':
163
- if span_type in [ContentType.Text, ContentType.InlineEquation]:
164
- # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
165
- if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
166
- para_text += content[:-1]
167
- else: # content间需要空格分隔
164
+
165
+ if content:
166
+ langs = ['zh', 'ja', 'ko']
167
+ # logger.info(f'block_lang: {block_lang}, content: {content}')
168
+ if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
169
+ if j == len(line['spans']) - 1:
170
+ para_text += content
171
+ else:
168
172
  para_text += f'{content} '
169
- elif span_type == ContentType.InterlineEquation:
170
- para_text += content
173
+ else:
174
+ if span_type in [ContentType.Text, ContentType.InlineEquation]:
175
+ # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
176
+ if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
177
+ para_text += content[:-1]
178
+ else: # 西方文本语境下 content间需要空格分隔
179
+ para_text += f'{content} '
180
+ elif span_type == ContentType.InterlineEquation:
181
+ para_text += content
171
182
  else:
172
183
  continue
173
184
  # 连写字符拆分
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.10.3"
1
+ __version__ = "0.10.4"
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
117
117
  all_bboxes = remove_overlaps_min_blocks(all_bboxes)
118
118
  all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
119
119
  """将剩余的bbox做分离处理,防止后面分layout时出错"""
120
- all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
121
-
120
+ # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
121
+ all_bboxes.sort(key=lambda x: x[0]+x[1])
122
122
  return all_bboxes, all_discarded_blocks
123
123
 
124
124
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.10.3
3
+ Version: 0.10.4
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -27,7 +27,7 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
27
27
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
28
28
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
29
29
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=hwcHTEx1tbIlM9ukmPBOAyH0G6rmbOTu87nVtZ1gE6k,12354
30
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=pE-lEUsYAhZC3nSmbgYO42Kvk_bW8Ds-AL-QMPHFu8c,12941
31
31
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
32
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
33
33
  magic_pdf/filter/pdf_meta_scan.py,sha256=3ba7SxXu1z2r5N97Dxmp_L10Lo7llsrBlvtEAJeIJBQ,17403
@@ -53,7 +53,7 @@ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,
53
53
  magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
54
54
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
55
55
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
56
- magic_pdf/libs/version.py,sha256=0C8KcY1dzs3hdkAre06v0NCQ0Uxcqv6g9a93bRcVLW0,23
56
+ magic_pdf/libs/version.py,sha256=fGZMaoPHZfTX9I4TDkr07gp-kj_1U_SD-gjQC_2flQs,23
57
57
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
58
58
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
59
59
  magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
@@ -119,7 +119,7 @@ magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
119
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
120
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
121
121
  magic_pdf/pre_proc/cut_image.py,sha256=U-ttnl3lAhhmgtkR1GGyPAVm0i0-6VscXf3E2EDy3lE,1187
122
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=xQ1L6pwQjN4xBSKEXslheip1aMFaiB0grqlX3BF-kh0,9282
122
+ magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
123
123
  magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
124
124
  magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
125
125
  magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
@@ -139,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
139
139
  magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
140
140
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
141
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
142
- magic_pdf-0.10.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
- magic_pdf-0.10.3.dist-info/METADATA,sha256=R86XDaSfj1tcu3etkvhQfg3FSoARv8mKW2KpwjsdqWs,36992
144
- magic_pdf-0.10.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
- magic_pdf-0.10.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
- magic_pdf-0.10.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
- magic_pdf-0.10.3.dist-info/RECORD,,
142
+ magic_pdf-0.10.4.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
+ magic_pdf-0.10.4.dist-info/METADATA,sha256=pujqC_qUWiPT-L6R065MoL0QO9q4IEra0iW4BCRkxr4,36992
144
+ magic_pdf-0.10.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
+ magic_pdf-0.10.4.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
+ magic_pdf-0.10.4.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
+ magic_pdf-0.10.4.dist-info/RECORD,,