magic-pdf 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/libs/version.py +1 -1
- magic_pdf/post_proc/para_split_v3.py +16 -13
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.2.2.dist-info}/METADATA +2 -2
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.2.2.dist-info}/RECORD +8 -8
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.2.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.2.2.dist-info}/WHEEL +0 -0
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.2.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.2.2.dist-info}/top_level.txt +0 -0
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.1"
|
1
|
+
__version__ = "1.2.2"
|
magic_pdf/post_proc/para_split_v3.py
CHANGED
@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
|
|
108
108
|
):
|
109
109
|
multiple_para_flag = True
|
110
110
|
|
111
|
-
|
112
|
-
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
113
|
-
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
114
|
-
if (
|
115
|
-
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
|
116
|
-
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
|
117
|
-
):
|
118
|
-
external_sides_not_close_num += 1
|
119
|
-
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
120
|
-
center_close_num += 1
|
111
|
+
block_text = ''
|
121
112
|
|
113
|
+
for line in block['lines']:
|
122
114
|
line_text = ''
|
123
115
|
|
124
116
|
for span in line['spans']:
|
125
117
|
span_type = span['type']
|
126
118
|
if span_type == ContentType.Text:
|
127
119
|
line_text += span['content'].strip()
|
128
|
-
|
129
120
|
# 添加所有文本,包括空行,保持与block['lines']长度一致
|
130
121
|
lines_text_list.append(line_text)
|
131
122
|
block_text = ''.join(lines_text_list)
|
132
|
-
|
133
|
-
|
123
|
+
|
124
|
+
block_lang = detect_lang(block_text)
|
125
|
+
# logger.info(f"block_lang: {block_lang}")
|
126
|
+
|
127
|
+
for line in block['lines']:
|
128
|
+
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
129
|
+
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
130
|
+
if (
|
131
|
+
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
|
132
|
+
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
|
133
|
+
):
|
134
|
+
external_sides_not_close_num += 1
|
135
|
+
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
136
|
+
center_close_num += 1
|
134
137
|
|
135
138
|
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
|
136
139
|
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
|
{magic_pdf-1.2.1.dist-info → magic_pdf-1.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.2.1
|
3
|
+
Version: 1.2.2
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,7 +9,7 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3>=1.28.43
|
10
10
|
Requires-Dist: Brotli>=1.1.0
|
11
11
|
Requires-Dist: click>=8.1.7
|
12
|
-
Requires-Dist: fast-langdetect
|
12
|
+
Requires-Dist: fast-langdetect<0.3.0,>=0.2.3
|
13
13
|
Requires-Dist: loguru>=0.6.0
|
14
14
|
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
15
|
Requires-Dist: pydantic>=2.7.2
|
{magic_pdf-1.2.1.dist-info → magic_pdf-1.2.2.dist-info}/RECORD
CHANGED
@@ -51,7 +51,7 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
|
|
51
51
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
52
52
|
magic_pdf/libs/performance_stats.py,sha256=BFi4NIsUYlanznYoTVq4hBpj4NOuShAlWBHzebBGVYM,1702
|
53
53
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
54
|
-
magic_pdf/libs/version.py,sha256=
|
54
|
+
magic_pdf/libs/version.py,sha256=uuf4VNtTNA93fMhoAur9YafzaKJFnczY-H1SSCSuRVQ,22
|
55
55
|
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
56
56
|
magic_pdf/model/batch_analyze.py,sha256=sbrgOJWycb1Ep6e62CPi6jEyG6VSeklIxc4PmrqaLhM,11933
|
57
57
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=T0-h4QmSIDXRzgF5uWO4jQrwIot221l26PXU52xeKiA,7933
|
@@ -117,7 +117,7 @@ magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio
|
|
117
117
|
magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
|
118
118
|
magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
119
119
|
magic_pdf/post_proc/llm_aided.py,sha256=yzhu2cCpUZjdwf3v0swYDgSs9VWIfMAoXepYIP1EMZs,6367
|
120
|
-
magic_pdf/post_proc/para_split_v3.py,sha256=
|
120
|
+
magic_pdf/post_proc/para_split_v3.py,sha256=SPN_VVGvFX5KpFMGw9OzgoE-kTZq-FF036i0cIImGH8,16975
|
121
121
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
122
122
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
123
123
|
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
@@ -139,9 +139,9 @@ magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,838
|
|
139
139
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
140
140
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
141
141
|
magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
|
142
|
-
magic_pdf-1.2.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
-
magic_pdf-1.2.
|
144
|
-
magic_pdf-1.2.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
-
magic_pdf-1.2.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
-
magic_pdf-1.2.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
-
magic_pdf-1.2.1.dist-info/RECORD,,
|
142
|
+
magic_pdf-1.2.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
+
magic_pdf-1.2.2.dist-info/METADATA,sha256=FYzj0yWzmFAG4mQ22DH9F4KZfqexNg7YuhgiXMHc9Ug,41001
|
144
|
+
magic_pdf-1.2.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
+
magic_pdf-1.2.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
+
magic_pdf-1.2.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
+
magic_pdf-1.2.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|