magic-pdf 0.10.4__py3-none-any.whl → 0.10.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/utils.py +2 -2
- magic_pdf/libs/version.py +1 -1
- magic_pdf/para/para_split_v3.py +2 -2
- magic_pdf/pre_proc/cut_image.py +2 -2
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.5.dist-info}/METADATA +1 -1
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.5.dist-info}/RECORD +10 -10
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.5.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.5.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.5.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.4.dist-info → magic_pdf-0.10.5.dist-info}/top_level.txt +0 -0
magic_pdf/data/utils.py
CHANGED
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
|
|
20
20
|
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
21
21
|
pm = doc.get_pixmap(matrix=mat, alpha=False)
|
22
22
|
|
23
|
-
# If the width or height exceeds
|
24
|
-
if pm.width >
|
23
|
+
# If the width or height exceeds 4500 after scaling, do not scale further.
|
24
|
+
if pm.width > 4500 or pm.height > 4500:
|
25
25
|
pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
26
26
|
|
27
27
|
img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.10.4"
|
1
|
+
__version__ = "0.10.5"
|
magic_pdf/para/para_split_v3.py
CHANGED
@@ -112,8 +112,8 @@ def __is_list_or_index_block(block):
|
|
112
112
|
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
113
113
|
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
114
114
|
if (
|
115
|
-
line['bbox'][0] - block['bbox_fs'][0] > 0.
|
116
|
-
and block['bbox_fs'][2] - line['bbox'][2] > 0.
|
115
|
+
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
|
116
|
+
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
|
117
117
|
):
|
118
118
|
external_sides_not_close_num += 1
|
119
119
|
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
magic_pdf/pre_proc/cut_image.py
CHANGED
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
|
|
12
12
|
for span in spans:
|
13
13
|
span_type = span['type']
|
14
14
|
if span_type == ContentType.Image:
|
15
|
-
if not check_img_bbox(span['bbox']):
|
15
|
+
if not check_img_bbox(span['bbox']) or not imageWriter:
|
16
16
|
continue
|
17
17
|
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
|
18
18
|
imageWriter=imageWriter)
|
19
19
|
elif span_type == ContentType.Table:
|
20
|
-
if not check_img_bbox(span['bbox']):
|
20
|
+
if not check_img_bbox(span['bbox']) or not imageWriter:
|
21
21
|
continue
|
22
22
|
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
|
23
23
|
imageWriter=imageWriter)
|
{magic_pdf-0.10.4.dist-info → magic_pdf-0.10.5.dist-info}/RECORD
CHANGED
@@ -16,7 +16,7 @@ magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
|
17
17
|
magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
|
18
18
|
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
19
|
-
magic_pdf/data/utils.py,sha256=
|
19
|
+
magic_pdf/data/utils.py,sha256=uaSHprh80D_puPUmd1slQDoE4uecNn4zZMzYWY0-a-8,917
|
20
20
|
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
21
21
|
magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
|
22
22
|
magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
|
@@ -53,7 +53,7 @@ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,
|
|
53
53
|
magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
|
54
54
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
55
55
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
56
|
-
magic_pdf/libs/version.py,sha256=
|
56
|
+
magic_pdf/libs/version.py,sha256=c61d5YjslqtpItkzB2NGlURm177H2racruHXV9G6u6s,23
|
57
57
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
58
58
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
|
59
59
|
magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
|
@@ -110,7 +110,7 @@ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-u
|
|
110
110
|
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
111
111
|
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
|
112
112
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
113
|
-
magic_pdf/para/para_split_v3.py,sha256=
|
113
|
+
magic_pdf/para/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
|
114
114
|
magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
|
115
115
|
magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
|
116
116
|
magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
|
@@ -118,7 +118,7 @@ magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,480
|
|
118
118
|
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
119
119
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
120
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
121
|
-
magic_pdf/pre_proc/cut_image.py,sha256=
|
121
|
+
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
122
122
|
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
|
123
123
|
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
|
124
124
|
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
|
@@ -139,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
|
|
139
139
|
magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
|
140
140
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
141
141
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
142
|
-
magic_pdf-0.10.4.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
-
magic_pdf-0.10.
|
144
|
-
magic_pdf-0.10.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
-
magic_pdf-0.10.4.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
-
magic_pdf-0.10.4.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
-
magic_pdf-0.10.4.dist-info/RECORD,,
|
142
|
+
magic_pdf-0.10.5.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
+
magic_pdf-0.10.5.dist-info/METADATA,sha256=TIb8C_MrpU0_XwZc2dLfKpH5wQtE8G8Q0w56OPWYG30,36992
|
144
|
+
magic_pdf-0.10.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
+
magic_pdf-0.10.5.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
+
magic_pdf-0.10.5.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
+
magic_pdf-0.10.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|