magic-pdf 0.10.3__py3-none-any.whl → 0.10.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/data/utils.py CHANGED
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
20
20
  mat = fitz.Matrix(dpi / 72, dpi / 72)
21
21
  pm = doc.get_pixmap(matrix=mat, alpha=False)
22
22
 
23
- # If the width or height exceeds 9000 after scaling, do not scale further.
24
- if pm.width > 9000 or pm.height > 9000:
23
+ # If the width or height exceeds 4500 after scaling, do not scale further.
24
+ if pm.width > 4500 or pm.height > 4500:
25
25
  pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
26
26
 
27
27
  img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
@@ -5,6 +5,7 @@ from loguru import logger
5
5
  from magic_pdf.config.make_content_config import DropMode, MakeMode
6
6
  from magic_pdf.config.ocr_content_type import BlockType, ContentType
7
7
  from magic_pdf.libs.commons import join_path
8
+ from magic_pdf.libs.language import detect_lang
8
9
  from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
9
10
  from magic_pdf.para.para_split_v3 import ListLineTag
10
11
 
@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
135
136
 
136
137
 
137
138
  def merge_para_with_text(para_block):
139
+ block_text = ''
140
+ for line in para_block['lines']:
141
+ for span in line['spans']:
142
+ if span['type'] in [ContentType.Text]:
143
+ block_text += span['content']
144
+ block_lang = detect_lang(block_text)
145
+
138
146
  para_text = ''
139
147
  for i, line in enumerate(para_block['lines']):
140
148
 
141
149
  if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
142
150
  para_text += ' \n'
143
151
 
144
- line_text = ''
145
- for span in line['spans']:
146
- span_type = span['type']
147
- if span_type == ContentType.Text:
148
- line_text += span['content'].strip()
149
-
150
152
  for j, span in enumerate(line['spans']):
151
153
 
152
154
  span_type = span['type']
@@ -159,15 +161,24 @@ def merge_para_with_text(para_block):
159
161
  content = f"\n$$\n{span['content']}\n$$\n"
160
162
 
161
163
  content = content.strip()
162
- if content != '':
163
- if span_type in [ContentType.Text, ContentType.InlineEquation]:
164
- # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
165
- if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
166
- para_text += content[:-1]
167
- else: # content间需要空格分隔
164
+
165
+ if content:
166
+ langs = ['zh', 'ja', 'ko']
167
+ # logger.info(f'block_lang: {block_lang}, content: {content}')
168
+ if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
169
+ if j == len(line['spans']) - 1:
170
+ para_text += content
171
+ else:
168
172
  para_text += f'{content} '
169
- elif span_type == ContentType.InterlineEquation:
170
- para_text += content
173
+ else:
174
+ if span_type in [ContentType.Text, ContentType.InlineEquation]:
175
+ # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
176
+ if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
177
+ para_text += content[:-1]
178
+ else: # 西方文本语境下 content间需要空格分隔
179
+ para_text += f'{content} '
180
+ elif span_type == ContentType.InterlineEquation:
181
+ para_text += content
171
182
  else:
172
183
  continue
173
184
  # 连写字符拆分
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.10.3"
1
+ __version__ = "0.10.5"
@@ -112,8 +112,8 @@ def __is_list_or_index_block(block):
112
112
  line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
113
113
  block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
114
114
  if (
115
- line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
116
- and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
115
+ line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
116
+ and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
117
117
  ):
118
118
  external_sides_not_close_num += 1
119
119
  if abs(line_mid_x - block_mid_x) < line_height / 2:
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
12
12
  for span in spans:
13
13
  span_type = span['type']
14
14
  if span_type == ContentType.Image:
15
- if not check_img_bbox(span['bbox']):
15
+ if not check_img_bbox(span['bbox']) or not imageWriter:
16
16
  continue
17
17
  span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
18
18
  imageWriter=imageWriter)
19
19
  elif span_type == ContentType.Table:
20
- if not check_img_bbox(span['bbox']):
20
+ if not check_img_bbox(span['bbox']) or not imageWriter:
21
21
  continue
22
22
  span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
23
23
  imageWriter=imageWriter)
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
117
117
  all_bboxes = remove_overlaps_min_blocks(all_bboxes)
118
118
  all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
119
119
  """将剩余的bbox做分离处理,防止后面分layout时出错"""
120
- all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
121
-
120
+ # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
121
+ all_bboxes.sort(key=lambda x: x[0]+x[1])
122
122
  return all_bboxes, all_discarded_blocks
123
123
 
124
124
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.10.3
3
+ Version: 0.10.5
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -16,7 +16,7 @@ magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
17
17
  magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
18
18
  magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
19
- magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
19
+ magic_pdf/data/utils.py,sha256=uaSHprh80D_puPUmd1slQDoE4uecNn4zZMzYWY0-a-8,917
20
20
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
21
21
  magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
22
22
  magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
@@ -27,7 +27,7 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
27
27
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
28
28
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
29
29
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=hwcHTEx1tbIlM9ukmPBOAyH0G6rmbOTu87nVtZ1gE6k,12354
30
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=pE-lEUsYAhZC3nSmbgYO42Kvk_bW8Ds-AL-QMPHFu8c,12941
31
31
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
32
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
33
33
  magic_pdf/filter/pdf_meta_scan.py,sha256=3ba7SxXu1z2r5N97Dxmp_L10Lo7llsrBlvtEAJeIJBQ,17403
@@ -53,7 +53,7 @@ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,
53
53
  magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
54
54
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
55
55
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
56
- magic_pdf/libs/version.py,sha256=0C8KcY1dzs3hdkAre06v0NCQ0Uxcqv6g9a93bRcVLW0,23
56
+ magic_pdf/libs/version.py,sha256=c61d5YjslqtpItkzB2NGlURm177H2racruHXV9G6u6s,23
57
57
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
58
58
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
59
59
  magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
@@ -110,7 +110,7 @@ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-u
110
110
  magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
111
  magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
112
112
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
- magic_pdf/para/para_split_v3.py,sha256=UOQe0HUVX7FAlMbJp1OkGfdM7JECWeqscv3s8Hge7ps,16922
113
+ magic_pdf/para/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
114
114
  magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
115
115
  magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
116
116
  magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
@@ -118,8 +118,8 @@ magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,480
118
118
  magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
119
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
120
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
121
- magic_pdf/pre_proc/cut_image.py,sha256=U-ttnl3lAhhmgtkR1GGyPAVm0i0-6VscXf3E2EDy3lE,1187
122
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=xQ1L6pwQjN4xBSKEXslheip1aMFaiB0grqlX3BF-kh0,9282
121
+ magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
122
+ magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
123
123
  magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
124
124
  magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
125
125
  magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
@@ -139,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
139
139
  magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
140
140
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
141
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
142
- magic_pdf-0.10.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
- magic_pdf-0.10.3.dist-info/METADATA,sha256=R86XDaSfj1tcu3etkvhQfg3FSoARv8mKW2KpwjsdqWs,36992
144
- magic_pdf-0.10.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
- magic_pdf-0.10.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
- magic_pdf-0.10.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
- magic_pdf-0.10.3.dist-info/RECORD,,
142
+ magic_pdf-0.10.5.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
+ magic_pdf-0.10.5.dist-info/METADATA,sha256=TIb8C_MrpU0_XwZc2dLfKpH5wQtE8G8Q0w56OPWYG30,36992
144
+ magic_pdf-0.10.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
+ magic_pdf-0.10.5.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
+ magic_pdf-0.10.5.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
+ magic_pdf-0.10.5.dist-info/RECORD,,