magic-pdf 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.1"
1
+ __version__ = "1.2.2"
magic_pdf/post_proc/para_split_v3.py CHANGED
@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
108
108
  ):
109
109
  multiple_para_flag = True
110
110
 
111
- for line in block['lines']:
112
- line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
113
- block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
114
- if (
115
- line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
116
- and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
117
- ):
118
- external_sides_not_close_num += 1
119
- if abs(line_mid_x - block_mid_x) < line_height / 2:
120
- center_close_num += 1
111
+ block_text = ''
121
112
 
113
+ for line in block['lines']:
122
114
  line_text = ''
123
115
 
124
116
  for span in line['spans']:
125
117
  span_type = span['type']
126
118
  if span_type == ContentType.Text:
127
119
  line_text += span['content'].strip()
128
-
129
120
  # 添加所有文本,包括空行,保持与block['lines']长度一致
130
121
  lines_text_list.append(line_text)
131
122
  block_text = ''.join(lines_text_list)
132
- block_lang = detect_lang(block_text)
133
- # logger.info(f"block_lang: {block_lang}")
123
+
124
+ block_lang = detect_lang(block_text)
125
+ # logger.info(f"block_lang: {block_lang}")
126
+
127
+ for line in block['lines']:
128
+ line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
129
+ block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
130
+ if (
131
+ line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
132
+ and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
133
+ ):
134
+ external_sides_not_close_num += 1
135
+ if abs(line_mid_x - block_mid_x) < line_height / 2:
136
+ center_close_num += 1
134
137
 
135
138
  # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
136
139
  if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
magic_pdf-1.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 1.2.1
3
+ Version: 1.2.2
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -9,7 +9,7 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3>=1.28.43
10
10
  Requires-Dist: Brotli>=1.1.0
11
11
  Requires-Dist: click>=8.1.7
12
- Requires-Dist: fast-langdetect>=0.2.3
12
+ Requires-Dist: fast-langdetect<0.3.0,>=0.2.3
13
13
  Requires-Dist: loguru>=0.6.0
14
14
  Requires-Dist: numpy<2.0.0,>=1.21.6
15
15
  Requires-Dist: pydantic>=2.7.2
magic_pdf-1.2.2.dist-info/RECORD CHANGED
@@ -51,7 +51,7 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
51
51
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
52
52
  magic_pdf/libs/performance_stats.py,sha256=BFi4NIsUYlanznYoTVq4hBpj4NOuShAlWBHzebBGVYM,1702
53
53
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
54
- magic_pdf/libs/version.py,sha256=Mlm4Gvmb_6yQxwUbv2Ksc-BJFXLPg9H1Vt2iV7wXrA4,22
54
+ magic_pdf/libs/version.py,sha256=uuf4VNtTNA93fMhoAur9YafzaKJFnczY-H1SSCSuRVQ,22
55
55
  magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
56
56
  magic_pdf/model/batch_analyze.py,sha256=sbrgOJWycb1Ep6e62CPi6jEyG6VSeklIxc4PmrqaLhM,11933
57
57
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=T0-h4QmSIDXRzgF5uWO4jQrwIot221l26PXU52xeKiA,7933
@@ -117,7 +117,7 @@ magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio
117
117
  magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
118
118
  magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
119
119
  magic_pdf/post_proc/llm_aided.py,sha256=yzhu2cCpUZjdwf3v0swYDgSs9VWIfMAoXepYIP1EMZs,6367
120
- magic_pdf/post_proc/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
120
+ magic_pdf/post_proc/para_split_v3.py,sha256=SPN_VVGvFX5KpFMGw9OzgoE-kTZq-FF036i0cIImGH8,16975
121
121
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
122
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
123
123
  magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
@@ -139,9 +139,9 @@ magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,838
139
139
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
140
140
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
141
141
  magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
142
- magic_pdf-1.2.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
- magic_pdf-1.2.1.dist-info/METADATA,sha256=oMxODNFW4g154Rrh9g2sE9_irmB6x7j_5KmF7dRYEPQ,40994
144
- magic_pdf-1.2.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
- magic_pdf-1.2.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
- magic_pdf-1.2.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
- magic_pdf-1.2.1.dist-info/RECORD,,
142
+ magic_pdf-1.2.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
+ magic_pdf-1.2.2.dist-info/METADATA,sha256=FYzj0yWzmFAG4mQ22DH9F4KZfqexNg7YuhgiXMHc9Ug,41001
144
+ magic_pdf-1.2.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
+ magic_pdf-1.2.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
+ magic_pdf-1.2.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
+ magic_pdf-1.2.2.dist-info/RECORD,,