magic-pdf 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,11 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- magic_pdf/pdf_parse_by_ocr.py,sha256=jYEfc4P6o4f7mZqfPFITs3NWlBxe4v1gwLh1yu2VDIQ,8950
3
- magic_pdf/pdf_parse_by_ocr_v2.py,sha256=1XVRwmcGRtxMMRYgqkJcctD4tLjCprgpGQsHpeVQktM,637
4
- magic_pdf/pdf_parse_by_txt.py,sha256=5_kdfvDkv_XwDove2AW7SopGysYLJ1-tsOQy2yuII1Y,21932
5
- magic_pdf/pdf_parse_by_txt_v2.py,sha256=mGadyYamoCNGNsKOQM1uXQR65zMUKyL24yURGHADmVs,1908
2
+ magic_pdf/pdf_parse_by_ocr.py,sha256=IWnSWt1Z-d35xRqspzdLR2iUtma_SAu4W7K4kEk8SHc,638
3
+ magic_pdf/pdf_parse_by_txt.py,sha256=KUSH7Gh83CZmdyWw59pqDskwyJ2Kg-jU-9fnQGJQEs4,537
6
4
  magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
7
5
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
8
- magic_pdf/user_api.py,sha256=R4onPBXlA8GARwlqBh5wmxUtTxwQb-PUeFK7eTAWzoU,4971
6
+ magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
9
7
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- magic_pdf/cli/magicpdf.py,sha256=aMmY_J83_8IdkkxyZUHg8WzIbCA_oW4cyjlUUZE0Wvc,11117
8
+ magic_pdf/cli/magicpdf.py,sha256=FF6flO6wUcKG9Qx_FG6-xhHfmQzQWLjwrkMa5kowDgs,10937
11
9
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
10
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
13
11
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -46,7 +44,7 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
46
44
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
47
45
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
48
46
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
49
- magic_pdf/libs/version.py,sha256=bDuZ37zImJZsQ3a4pW87q4kg-zsIBrUFAv1aumIf_7k,22
47
+ magic_pdf/libs/version.py,sha256=1nlPInsRzDbcDPveZ3ghSJ6v6KveN9n6gnj-twW4DkI,23
50
48
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
51
49
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
52
50
  magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,7 +62,7 @@ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,
64
62
  magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
65
63
  magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
66
64
  magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
67
- magic_pdf/para/para_split_v2.py,sha256=a04dsUFE3JD4DA9e2DULJgbKrcqWuCfK58de1p-T3Io,36610
65
+ magic_pdf/para/para_split_v2.py,sha256=jGOhsubdh_CEgSv9WMNmp1loq1YNlpcAj3yh3g0gPhw,37027
68
66
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
69
67
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
70
68
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -89,7 +87,7 @@ magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1
89
87
  magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
90
88
  magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
91
89
  magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
92
- magic_pdf/pre_proc/equations_replace.py,sha256=rpEHD5IvavRgUSus0tLjKcIv2faStp_Ii9vf09HVUtI,20384
90
+ magic_pdf/pre_proc/equations_replace.py,sha256=fXj7ZV7F3YtkDYrAhE9g5tHk4_3pVUyLbhDtMjbxjWU,20386
93
91
  magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
94
92
  magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
95
93
  magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
@@ -117,8 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
117
115
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
118
116
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
119
117
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
120
- magic_pdf-0.5.8.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
- magic_pdf-0.5.8.dist-info/METADATA,sha256=Z7HrhP7T0_dQOjCX-CztMe77Mbt90IoY8JH0IhmRHH0,814
122
- magic_pdf-0.5.8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
- magic_pdf-0.5.8.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
124
- magic_pdf-0.5.8.dist-info/RECORD,,
118
+ magic_pdf-0.5.10.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
+ magic_pdf-0.5.10.dist-info/METADATA,sha256=B3e0sVOyFhk47EfHPuLFRUNxzdasWYx3XuYR53LSJX8,4175
120
+ magic_pdf-0.5.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
+ magic_pdf-0.5.10.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
+ magic_pdf-0.5.10.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
+ magic_pdf-0.5.10.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ magic-pdf = magic_pdf.cli.magicpdf:cli
@@ -1,17 +0,0 @@
1
- from magic_pdf.pdf_parse_union_core import pdf_parse_union
2
-
3
- def parse_pdf_by_ocr(pdf_bytes,
4
- model_list,
5
- imageWriter,
6
- start_page_id=0,
7
- end_page_id=None,
8
- debug_mode=False,
9
- ):
10
- return pdf_parse_union(pdf_bytes,
11
- model_list,
12
- imageWriter,
13
- "ocr",
14
- start_page_id=start_page_id,
15
- end_page_id=end_page_id,
16
- debug_mode=debug_mode,
17
- )
@@ -1,56 +0,0 @@
1
- from magic_pdf.pdf_parse_union_core import pdf_parse_union
2
-
3
-
4
- def parse_pdf_by_txt(
5
- pdf_bytes,
6
- model_list,
7
- imageWriter,
8
- start_page_id=0,
9
- end_page_id=None,
10
- debug_mode=False,
11
- ):
12
- return pdf_parse_union(pdf_bytes,
13
- model_list,
14
- imageWriter,
15
- "txt",
16
- start_page_id=start_page_id,
17
- end_page_id=end_page_id,
18
- debug_mode=debug_mode,
19
- )
20
-
21
-
22
- if __name__ == "__main__":
23
- pass
24
- # if 1:
25
- # import fitz
26
- # import json
27
- #
28
- # with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
29
- # pdf_bytes = f.read()
30
- # pdf_docs = fitz.open("pdf", pdf_bytes)
31
- #
32
- # with open("/opt/data/pdf/20240418/25536-00.json") as f:
33
- # model_list = json.loads(f.readline())
34
- #
35
- # magic_model = MagicModel(model_list, pdf_docs)
36
- # for i in range(7):
37
- # print(magic_model.get_imgs(i))
38
- #
39
- # for page_no, page in enumerate(pdf_docs):
40
- # inline_equations, interline_equations, interline_equation_blocks = (
41
- # magic_model.get_equations(page_no)
42
- # )
43
- #
44
- # text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
45
- # char_level_text_blocks = page.get_text(
46
- # "rawdict", flags=fitz.TEXTFLAGS_TEXT
47
- # )["blocks"]
48
- # text_blocks = combine_chars_to_pymudict(
49
- # text_raw_blocks, char_level_text_blocks
50
- # )
51
- # text_blocks = replace_equations_in_textblock(
52
- # text_blocks, inline_equations, interline_equations
53
- # )
54
- # text_blocks = remove_citation_marker(text_blocks)
55
- #
56
- # text_blocks = remove_chars_in_text_blocks(text_blocks)
@@ -1,28 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: magic-pdf
3
- Version: 0.5.8
4
- Requires-Python: >=3.9
5
- License-File: LICENSE.md
6
- Requires-Dist: boto3 >=1.28.43
7
- Requires-Dist: Brotli >=1.1.0
8
- Requires-Dist: click >=8.1.7
9
- Requires-Dist: Distance >=0.1.3
10
- Requires-Dist: PyMuPDF >=1.24.5
11
- Requires-Dist: loguru >=0.6.0
12
- Requires-Dist: matplotlib >=3.8.3
13
- Requires-Dist: numpy >=1.21.6
14
- Requires-Dist: pandas >=1.3.5
15
- Requires-Dist: fast-langdetect >=0.1.1
16
- Requires-Dist: regex >=2023.12.25
17
- Requires-Dist: termcolor >=2.4.0
18
- Requires-Dist: wordninja >=2.0.0
19
- Requires-Dist: scikit-learn >=1.0.2
20
- Requires-Dist: nltk ==3.8.1
21
- Requires-Dist: s3pathlib >=2.1.1
22
- Requires-Dist: paddleocr
23
- Requires-Dist: pdfminer.six >=20231228
24
- Provides-Extra: cpu
25
- Requires-Dist: paddlepaddle ; extra == 'cpu'
26
- Provides-Extra: gpu
27
- Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
28
-