magic-pdf 0.10.4__py3-none-any.whl → 0.10.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/data/utils.py CHANGED
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
20
20
  mat = fitz.Matrix(dpi / 72, dpi / 72)
21
21
  pm = doc.get_pixmap(matrix=mat, alpha=False)
22
22
 
23
- # If the width or height exceeds 9000 after scaling, do not scale further.
24
- if pm.width > 9000 or pm.height > 9000:
23
+ # If the width or height exceeds 4500 after scaling, do not scale further.
24
+ if pm.width > 4500 or pm.height > 4500:
25
25
  pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
26
26
 
27
27
  img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.10.4"
1
+ __version__ = "0.10.5"
magic_pdf/para/para_split_v3.py CHANGED
@@ -112,8 +112,8 @@ def __is_list_or_index_block(block):
112
112
  line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
113
113
  block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
114
114
  if (
115
- line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
116
- and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
115
+ line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
116
+ and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
117
117
  ):
118
118
  external_sides_not_close_num += 1
119
119
  if abs(line_mid_x - block_mid_x) < line_height / 2:
magic_pdf/pre_proc/cut_image.py CHANGED
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
12
12
  for span in spans:
13
13
  span_type = span['type']
14
14
  if span_type == ContentType.Image:
15
- if not check_img_bbox(span['bbox']):
15
+ if not check_img_bbox(span['bbox']) or not imageWriter:
16
16
  continue
17
17
  span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
18
18
  imageWriter=imageWriter)
19
19
  elif span_type == ContentType.Table:
20
- if not check_img_bbox(span['bbox']):
20
+ if not check_img_bbox(span['bbox']) or not imageWriter:
21
21
  continue
22
22
  span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
23
23
  imageWriter=imageWriter)
magic_pdf-0.10.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.10.4
3
+ Version: 0.10.5
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
magic_pdf-0.10.5.dist-info/RECORD CHANGED
@@ -16,7 +16,7 @@ magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
17
17
  magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
18
18
  magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
19
- magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
19
+ magic_pdf/data/utils.py,sha256=uaSHprh80D_puPUmd1slQDoE4uecNn4zZMzYWY0-a-8,917
20
20
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
21
21
  magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
22
22
  magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
@@ -53,7 +53,7 @@ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,
53
53
  magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
54
54
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
55
55
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
56
- magic_pdf/libs/version.py,sha256=fGZMaoPHZfTX9I4TDkr07gp-kj_1U_SD-gjQC_2flQs,23
56
+ magic_pdf/libs/version.py,sha256=c61d5YjslqtpItkzB2NGlURm177H2racruHXV9G6u6s,23
57
57
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
58
58
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
59
59
  magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
@@ -110,7 +110,7 @@ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-u
110
110
  magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
111
  magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
112
112
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
- magic_pdf/para/para_split_v3.py,sha256=UOQe0HUVX7FAlMbJp1OkGfdM7JECWeqscv3s8Hge7ps,16922
113
+ magic_pdf/para/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
114
114
  magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
115
115
  magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
116
116
  magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
@@ -118,7 +118,7 @@ magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,480
118
118
  magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
119
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
120
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
121
- magic_pdf/pre_proc/cut_image.py,sha256=U-ttnl3lAhhmgtkR1GGyPAVm0i0-6VscXf3E2EDy3lE,1187
121
+ magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
122
122
  magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
123
123
  magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
124
124
  magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
@@ -139,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
139
139
  magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
140
140
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
141
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
142
- magic_pdf-0.10.4.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
- magic_pdf-0.10.4.dist-info/METADATA,sha256=pujqC_qUWiPT-L6R065MoL0QO9q4IEra0iW4BCRkxr4,36992
144
- magic_pdf-0.10.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
- magic_pdf-0.10.4.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
- magic_pdf-0.10.4.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
- magic_pdf-0.10.4.dist-info/RECORD,,
142
+ magic_pdf-0.10.5.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
+ magic_pdf-0.10.5.dist-info/METADATA,sha256=TIb8C_MrpU0_XwZc2dLfKpH5wQtE8G8Q0w56OPWYG30,36992
144
+ magic_pdf-0.10.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
+ magic_pdf-0.10.5.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
+ magic_pdf-0.10.5.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
+ magic_pdf-0.10.5.dist-info/RECORD,,