magic-pdf 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/cli/magicpdf.py +9 -10
- magic_pdf/libs/version.py +1 -1
- magic_pdf/para/para_split_v2.py +43 -28
- magic_pdf/pdf_parse_by_ocr.py +18 -219
- magic_pdf/pdf_parse_by_txt.py +10 -401
- magic_pdf/pre_proc/equations_replace.py +1 -1
- magic_pdf/user_api.py +3 -41
- magic_pdf-0.5.10.dist-info/METADATA +133 -0
- {magic_pdf-0.5.8.dist-info → magic_pdf-0.5.10.dist-info}/RECORD +13 -14
- magic_pdf-0.5.10.dist-info/entry_points.txt +2 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +0 -17
- magic_pdf/pdf_parse_by_txt_v2.py +0 -56
- magic_pdf-0.5.8.dist-info/METADATA +0 -28
- {magic_pdf-0.5.8.dist-info → magic_pdf-0.5.10.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.8.dist-info → magic_pdf-0.5.10.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.8.dist-info → magic_pdf-0.5.10.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,11 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
magic_pdf/pdf_parse_by_ocr.py,sha256=
|
3
|
-
magic_pdf/
|
4
|
-
magic_pdf/pdf_parse_by_txt.py,sha256=5_kdfvDkv_XwDove2AW7SopGysYLJ1-tsOQy2yuII1Y,21932
|
5
|
-
magic_pdf/pdf_parse_by_txt_v2.py,sha256=mGadyYamoCNGNsKOQM1uXQR65zMUKyL24yURGHADmVs,1908
|
2
|
+
magic_pdf/pdf_parse_by_ocr.py,sha256=IWnSWt1Z-d35xRqspzdLR2iUtma_SAu4W7K4kEk8SHc,638
|
3
|
+
magic_pdf/pdf_parse_by_txt.py,sha256=KUSH7Gh83CZmdyWw59pqDskwyJ2Kg-jU-9fnQGJQEs4,537
|
6
4
|
magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
|
7
5
|
magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
|
8
|
-
magic_pdf/user_api.py,sha256=
|
6
|
+
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
9
7
|
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
-
magic_pdf/cli/magicpdf.py,sha256=
|
8
|
+
magic_pdf/cli/magicpdf.py,sha256=FF6flO6wUcKG9Qx_FG6-xhHfmQzQWLjwrkMa5kowDgs,10937
|
11
9
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
10
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
13
11
|
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
|
@@ -46,7 +44,7 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
46
44
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
47
45
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
48
46
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
49
|
-
magic_pdf/libs/version.py,sha256=
|
47
|
+
magic_pdf/libs/version.py,sha256=1nlPInsRzDbcDPveZ3ghSJ6v6KveN9n6gnj-twW4DkI,23
|
50
48
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
51
49
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
52
50
|
magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -64,7 +62,7 @@ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,
|
|
64
62
|
magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
|
65
63
|
magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
|
66
64
|
magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
|
67
|
-
magic_pdf/para/para_split_v2.py,sha256=
|
65
|
+
magic_pdf/para/para_split_v2.py,sha256=jGOhsubdh_CEgSv9WMNmp1loq1YNlpcAj3yh3g0gPhw,37027
|
68
66
|
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
69
67
|
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
70
68
|
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
@@ -89,7 +87,7 @@ magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1
|
|
89
87
|
magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
|
90
88
|
magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
|
91
89
|
magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
|
92
|
-
magic_pdf/pre_proc/equations_replace.py,sha256=
|
90
|
+
magic_pdf/pre_proc/equations_replace.py,sha256=fXj7ZV7F3YtkDYrAhE9g5tHk4_3pVUyLbhDtMjbxjWU,20386
|
93
91
|
magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
|
94
92
|
magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
|
95
93
|
magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
|
@@ -117,8 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
117
115
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
118
116
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
119
117
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
124
|
-
magic_pdf-0.5.
|
118
|
+
magic_pdf-0.5.10.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
119
|
+
magic_pdf-0.5.10.dist-info/METADATA,sha256=B3e0sVOyFhk47EfHPuLFRUNxzdasWYx3XuYR53LSJX8,4175
|
120
|
+
magic_pdf-0.5.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
121
|
+
magic_pdf-0.5.10.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
122
|
+
magic_pdf-0.5.10.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
123
|
+
magic_pdf-0.5.10.dist-info/RECORD,,
|
magic_pdf/pdf_parse_by_ocr_v2.py
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
from magic_pdf.pdf_parse_union_core import pdf_parse_union
|
2
|
-
|
3
|
-
def parse_pdf_by_ocr(pdf_bytes,
|
4
|
-
model_list,
|
5
|
-
imageWriter,
|
6
|
-
start_page_id=0,
|
7
|
-
end_page_id=None,
|
8
|
-
debug_mode=False,
|
9
|
-
):
|
10
|
-
return pdf_parse_union(pdf_bytes,
|
11
|
-
model_list,
|
12
|
-
imageWriter,
|
13
|
-
"ocr",
|
14
|
-
start_page_id=start_page_id,
|
15
|
-
end_page_id=end_page_id,
|
16
|
-
debug_mode=debug_mode,
|
17
|
-
)
|
magic_pdf/pdf_parse_by_txt_v2.py
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
from magic_pdf.pdf_parse_union_core import pdf_parse_union
|
2
|
-
|
3
|
-
|
4
|
-
def parse_pdf_by_txt(
|
5
|
-
pdf_bytes,
|
6
|
-
model_list,
|
7
|
-
imageWriter,
|
8
|
-
start_page_id=0,
|
9
|
-
end_page_id=None,
|
10
|
-
debug_mode=False,
|
11
|
-
):
|
12
|
-
return pdf_parse_union(pdf_bytes,
|
13
|
-
model_list,
|
14
|
-
imageWriter,
|
15
|
-
"txt",
|
16
|
-
start_page_id=start_page_id,
|
17
|
-
end_page_id=end_page_id,
|
18
|
-
debug_mode=debug_mode,
|
19
|
-
)
|
20
|
-
|
21
|
-
|
22
|
-
if __name__ == "__main__":
|
23
|
-
pass
|
24
|
-
# if 1:
|
25
|
-
# import fitz
|
26
|
-
# import json
|
27
|
-
#
|
28
|
-
# with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
|
29
|
-
# pdf_bytes = f.read()
|
30
|
-
# pdf_docs = fitz.open("pdf", pdf_bytes)
|
31
|
-
#
|
32
|
-
# with open("/opt/data/pdf/20240418/25536-00.json") as f:
|
33
|
-
# model_list = json.loads(f.readline())
|
34
|
-
#
|
35
|
-
# magic_model = MagicModel(model_list, pdf_docs)
|
36
|
-
# for i in range(7):
|
37
|
-
# print(magic_model.get_imgs(i))
|
38
|
-
#
|
39
|
-
# for page_no, page in enumerate(pdf_docs):
|
40
|
-
# inline_equations, interline_equations, interline_equation_blocks = (
|
41
|
-
# magic_model.get_equations(page_no)
|
42
|
-
# )
|
43
|
-
#
|
44
|
-
# text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
|
45
|
-
# char_level_text_blocks = page.get_text(
|
46
|
-
# "rawdict", flags=fitz.TEXTFLAGS_TEXT
|
47
|
-
# )["blocks"]
|
48
|
-
# text_blocks = combine_chars_to_pymudict(
|
49
|
-
# text_raw_blocks, char_level_text_blocks
|
50
|
-
# )
|
51
|
-
# text_blocks = replace_equations_in_textblock(
|
52
|
-
# text_blocks, inline_equations, interline_equations
|
53
|
-
# )
|
54
|
-
# text_blocks = remove_citation_marker(text_blocks)
|
55
|
-
#
|
56
|
-
# text_blocks = remove_chars_in_text_blocks(text_blocks)
|
@@ -1,28 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: magic-pdf
|
3
|
-
Version: 0.5.8
|
4
|
-
Requires-Python: >=3.9
|
5
|
-
License-File: LICENSE.md
|
6
|
-
Requires-Dist: boto3 >=1.28.43
|
7
|
-
Requires-Dist: Brotli >=1.1.0
|
8
|
-
Requires-Dist: click >=8.1.7
|
9
|
-
Requires-Dist: Distance >=0.1.3
|
10
|
-
Requires-Dist: PyMuPDF >=1.24.5
|
11
|
-
Requires-Dist: loguru >=0.6.0
|
12
|
-
Requires-Dist: matplotlib >=3.8.3
|
13
|
-
Requires-Dist: numpy >=1.21.6
|
14
|
-
Requires-Dist: pandas >=1.3.5
|
15
|
-
Requires-Dist: fast-langdetect >=0.1.1
|
16
|
-
Requires-Dist: regex >=2023.12.25
|
17
|
-
Requires-Dist: termcolor >=2.4.0
|
18
|
-
Requires-Dist: wordninja >=2.0.0
|
19
|
-
Requires-Dist: scikit-learn >=1.0.2
|
20
|
-
Requires-Dist: nltk ==3.8.1
|
21
|
-
Requires-Dist: s3pathlib >=2.1.1
|
22
|
-
Requires-Dist: paddleocr
|
23
|
-
Requires-Dist: pdfminer.six >=20231228
|
24
|
-
Provides-Extra: cpu
|
25
|
-
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
26
|
-
Provides-Extra: gpu
|
27
|
-
Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
28
|
-
|
File without changes
|
File without changes
|
File without changes
|