magic-pdf 0.10.3__py3-none-any.whl → 0.10.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/utils.py +2 -2
- magic_pdf/dict2md/ocr_mkcontent.py +25 -14
- magic_pdf/libs/version.py +1 -1
- magic_pdf/para/para_split_v3.py +2 -2
- magic_pdf/pre_proc/cut_image.py +2 -2
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +2 -2
- {magic_pdf-0.10.3.dist-info → magic_pdf-0.10.5.dist-info}/METADATA +1 -1
- {magic_pdf-0.10.3.dist-info → magic_pdf-0.10.5.dist-info}/RECORD +12 -12
- {magic_pdf-0.10.3.dist-info → magic_pdf-0.10.5.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.3.dist-info → magic_pdf-0.10.5.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.3.dist-info → magic_pdf-0.10.5.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.3.dist-info → magic_pdf-0.10.5.dist-info}/top_level.txt +0 -0
magic_pdf/data/utils.py
CHANGED
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
|
20
20
|
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
21
21
|
pm = doc.get_pixmap(matrix=mat, alpha=False)
|
22
22
|
|
23
|
-
# If the width or height exceeds
|
24
|
-
if pm.width >
|
23
|
+
# If the width or height exceeds 4500 after scaling, do not scale further.
|
24
|
+
if pm.width > 4500 or pm.height > 4500:
|
25
25
|
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
26
26
|
|
27
27
|
img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
|
magic_pdf/dict2md/ocr_mkcontent.py
CHANGED
@@ -5,6 +5,7 @@ from loguru import logger
|
|
5
5
|
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
6
6
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
7
7
|
from magic_pdf.libs.commons import join_path
|
8
|
+
from magic_pdf.libs.language import detect_lang
|
8
9
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
9
10
|
from magic_pdf.para.para_split_v3 import ListLineTag
|
10
11
|
|
@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
|
|
135
136
|
|
136
137
|
|
137
138
|
def merge_para_with_text(para_block):
|
139
|
+
block_text = ''
|
140
|
+
for line in para_block['lines']:
|
141
|
+
for span in line['spans']:
|
142
|
+
if span['type'] in [ContentType.Text]:
|
143
|
+
block_text += span['content']
|
144
|
+
block_lang = detect_lang(block_text)
|
145
|
+
|
138
146
|
para_text = ''
|
139
147
|
for i, line in enumerate(para_block['lines']):
|
140
148
|
|
141
149
|
if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
|
142
150
|
para_text += ' \n'
|
143
151
|
|
144
|
-
line_text = ''
|
145
|
-
for span in line['spans']:
|
146
|
-
span_type = span['type']
|
147
|
-
if span_type == ContentType.Text:
|
148
|
-
line_text += span['content'].strip()
|
149
|
-
|
150
152
|
for j, span in enumerate(line['spans']):
|
151
153
|
|
152
154
|
span_type = span['type']
|
@@ -159,15 +161,24 @@ def merge_para_with_text(para_block):
|
|
159
161
|
content = f"\n$$\n{span['content']}\n$$\n"
|
160
162
|
|
161
163
|
content = content.strip()
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
164
|
+
|
165
|
+
if content:
|
166
|
+
langs = ['zh', 'ja', 'ko']
|
167
|
+
# logger.info(f'block_lang: {block_lang}, content: {content}')
|
168
|
+
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
|
169
|
+
if j == len(line['spans']) - 1:
|
170
|
+
para_text += content
|
171
|
+
else:
|
168
172
|
para_text += f'{content} '
|
169
|
-
|
170
|
-
|
173
|
+
else:
|
174
|
+
if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
175
|
+
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
176
|
+
if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
|
177
|
+
para_text += content[:-1]
|
178
|
+
else: # 西方文本语境下 content间需要空格分隔
|
179
|
+
para_text += f'{content} '
|
180
|
+
elif span_type == ContentType.InterlineEquation:
|
181
|
+
para_text += content
|
171
182
|
else:
|
172
183
|
continue
|
173
184
|
# 连写字符拆分
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.10.3"
|
1
|
+
__version__ = "0.10.5"
|
magic_pdf/para/para_split_v3.py
CHANGED
@@ -112,8 +112,8 @@ def __is_list_or_index_block(block):
|
|
112
112
|
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
113
113
|
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
114
114
|
if (
|
115
|
-
line['bbox'][0] - block['bbox_fs'][0] > 0.
|
116
|
-
and block['bbox_fs'][2] - line['bbox'][2] > 0.
|
115
|
+
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
|
116
|
+
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
|
117
117
|
):
|
118
118
|
external_sides_not_close_num += 1
|
119
119
|
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
magic_pdf/pre_proc/cut_image.py
CHANGED
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
|
|
12
12
|
for span in spans:
|
13
13
|
span_type = span['type']
|
14
14
|
if span_type == ContentType.Image:
|
15
|
-
if not check_img_bbox(span['bbox']):
|
15
|
+
if not check_img_bbox(span['bbox']) or not imageWriter:
|
16
16
|
continue
|
17
17
|
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
|
18
18
|
imageWriter=imageWriter)
|
19
19
|
elif span_type == ContentType.Table:
|
20
|
-
if not check_img_bbox(span['bbox']):
|
20
|
+
if not check_img_bbox(span['bbox']) or not imageWriter:
|
21
21
|
continue
|
22
22
|
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
|
23
23
|
imageWriter=imageWriter)
|
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
CHANGED
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
|
|
117
117
|
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
|
118
118
|
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
|
119
119
|
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
|
120
|
-
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
121
|
-
|
120
|
+
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
121
|
+
all_bboxes.sort(key=lambda x: x[0]+x[1])
|
122
122
|
return all_bboxes, all_discarded_blocks
|
123
123
|
|
124
124
|
|
{magic_pdf-0.10.3.dist-info → magic_pdf-0.10.5.dist-info}/RECORD
CHANGED
@@ -16,7 +16,7 @@ magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
|
17
17
|
magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
|
18
18
|
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
19
|
-
magic_pdf/data/utils.py,sha256=
|
19
|
+
magic_pdf/data/utils.py,sha256=uaSHprh80D_puPUmd1slQDoE4uecNn4zZMzYWY0-a-8,917
|
20
20
|
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
21
21
|
magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
|
22
22
|
magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
|
@@ -27,7 +27,7 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
27
27
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
28
28
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
29
29
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
30
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=pE-lEUsYAhZC3nSmbgYO42Kvk_bW8Ds-AL-QMPHFu8c,12941
|
31
31
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
32
32
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
33
33
|
magic_pdf/filter/pdf_meta_scan.py,sha256=3ba7SxXu1z2r5N97Dxmp_L10Lo7llsrBlvtEAJeIJBQ,17403
|
@@ -53,7 +53,7 @@ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,
|
|
53
53
|
magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
|
54
54
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
55
55
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
56
|
-
magic_pdf/libs/version.py,sha256=
|
56
|
+
magic_pdf/libs/version.py,sha256=c61d5YjslqtpItkzB2NGlURm177H2racruHXV9G6u6s,23
|
57
57
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
58
58
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
|
59
59
|
magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
|
@@ -110,7 +110,7 @@ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-u
|
|
110
110
|
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
111
111
|
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
|
112
112
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
113
|
-
magic_pdf/para/para_split_v3.py,sha256=
|
113
|
+
magic_pdf/para/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
|
114
114
|
magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
|
115
115
|
magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
|
116
116
|
magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
|
@@ -118,8 +118,8 @@ magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,480
|
|
118
118
|
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
119
119
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
120
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
121
|
-
magic_pdf/pre_proc/cut_image.py,sha256=
|
122
|
-
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=
|
121
|
+
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
122
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
|
123
123
|
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
|
124
124
|
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
|
125
125
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
|
@@ -139,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
|
|
139
139
|
magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
|
140
140
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
141
141
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
142
|
-
magic_pdf-0.10.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
-
magic_pdf-0.10.
|
144
|
-
magic_pdf-0.10.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
-
magic_pdf-0.10.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
-
magic_pdf-0.10.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
-
magic_pdf-0.10.3.dist-info/RECORD,,
|
142
|
+
magic_pdf-0.10.5.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
+
magic_pdf-0.10.5.dist-info/METADATA,sha256=TIb8C_MrpU0_XwZc2dLfKpH5wQtE8G8Q0w56OPWYG30,36992
|
144
|
+
magic_pdf-0.10.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
+
magic_pdf-0.10.5.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
+
magic_pdf-0.10.5.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
+
magic_pdf-0.10.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|