magic-pdf 0.5.9__py3-none-any.whl → 0.5.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/user_api.py CHANGED
@@ -18,8 +18,8 @@ from loguru import logger
18
18
  from magic_pdf.libs.version import __version__
19
19
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
20
20
  from magic_pdf.rw import AbsReaderWriter
21
- from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
22
- from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
21
+ from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
22
+ from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
23
23
 
24
24
  PARSE_TYPE_TXT = "txt"
25
25
  PARSE_TYPE_OCR = "ocr"
@@ -86,45 +86,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
86
86
  return None
87
87
 
88
88
  pdf_info_dict = parse_pdf(parse_pdf_by_txt)
89
- # text_all = ""
90
- # for page_dict in pdf_info_dict['pdf_info']:
91
- # for para_block in page_dict['para_blocks']:
92
- # if para_block['type'] in ['title', 'text']:
93
- # for line in para_block['lines']:
94
- # for span in line['spans']:
95
- # text_all += span['content']
96
-
97
- # def calculate_not_common_character_rate(text):
98
- # garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
99
- # # 计算乱码字符的数量
100
- # garbage_count = len(garbage_regex.findall(text))
101
- # total = len(text)
102
- # if total == 0:
103
- # return 0 # 避免除以零的错误
104
- # return garbage_count / total
105
- #
106
- # def calculate_not_printable_rate(text):
107
- # printable_text = ""
108
- # for c in text:
109
- # if c.isprintable():
110
- # printable_text += c
111
- # printable_total = len(printable_text)
112
- # total = len(text)
113
- # if total == 0:
114
- # return 0 # 避免除以零的错误
115
- # return (total - printable_total) / total
116
- #
117
- # not_common_character_rate = calculate_not_common_character_rate(text_all)
118
- # not_printable_rate = calculate_not_printable_rate(text_all)
119
- # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
120
- # pdf_info_dict["_not_printable_rate"] = not_printable_rate
121
- # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
122
- '''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
123
- # not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
124
- if (pdf_info_dict is None
125
- or pdf_info_dict.get("_need_drop", False)
126
- # or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
127
- ):
89
+ if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
128
90
  logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
129
91
  if input_model_is_empty:
130
92
  pdf_models = doc_analyze(pdf_bytes, ocr=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.9
3
+ Version: 0.5.11
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/magicpdf/Magic-PDF
6
6
  Requires-Python: >=3.9
@@ -22,11 +22,26 @@ Requires-Dist: wordninja >=2.0.0
22
22
  Requires-Dist: scikit-learn >=1.0.2
23
23
  Requires-Dist: nltk ==3.8.1
24
24
  Requires-Dist: s3pathlib >=2.1.1
25
- Requires-Dist: paddleocr
26
25
  Requires-Dist: pdfminer.six >=20231228
26
+ Requires-Dist: Levenshtein
27
+ Requires-Dist: nltk
28
+ Requires-Dist: rapidfuzz
29
+ Requires-Dist: statistics
30
+ Requires-Dist: openxlab
31
+ Requires-Dist: pandas
32
+ Requires-Dist: numpy
33
+ Requires-Dist: matplotlib
34
+ Requires-Dist: seaborn
35
+ Requires-Dist: scipy
36
+ Requires-Dist: scikit-learn
37
+ Requires-Dist: tqdm
38
+ Requires-Dist: htmltabletomd
39
+ Requires-Dist: pypandoc
27
40
  Provides-Extra: cpu
41
+ Requires-Dist: paddleocr ; extra == 'cpu'
28
42
  Requires-Dist: paddlepaddle ; extra == 'cpu'
29
43
  Provides-Extra: gpu
44
+ Requires-Dist: paddleocr ; extra == 'gpu'
30
45
  Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
31
46
 
32
47
  <div id="top"></div>
@@ -64,6 +79,10 @@ Key features include:
64
79
  - Compatibility with CPU and GPU environments
65
80
  - Available for Windows, Linux, and macOS platforms
66
81
 
82
+ ## Project Panorama
83
+
84
+ ![Project Panorama](docs/images/project_panorama_en.png)
85
+
67
86
  ## Getting Started
68
87
 
69
88
  ### Requirements
@@ -72,20 +91,57 @@ Key features include:
72
91
 
73
92
  ### Usage Instructions
74
93
 
75
- 1. **Install Magic-PDF**
76
-
94
+ #### 1. Install Magic-PDF
77
95
  ```bash
78
- pip install magic-pdf[cpu] # Install the CPU version
79
- or
80
- pip install magic-pdf[gpu] # Install the GPU version
96
+ pip install magic-pdf
81
97
  ```
82
98
 
83
- 2. **Usage via Command Line**
99
+ #### 2. Usage via Command Line
84
100
 
101
+ ###### Simple
102
+ ```bash
103
+ cp magic-pdf.template.json ~/magic-pdf.json
104
+ magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
105
+ ```
106
+ ###### More
85
107
  ```bash
86
108
  magic-pdf --help
87
109
  ```
88
110
 
111
+ #### 3. Usage via API
112
+
113
+ ###### Local
114
+ ```python
115
+ image_writer = DiskReaderWriter(local_image_dir)
116
+ image_dir = str(os.path.basename(local_image_dir))
117
+ jso_useful_key = {"_pdf_type": "", "model_list": model_json}
118
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
119
+ pipe.pipe_classify()
120
+ pipe.pipe_parse()
121
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
122
+ ```
123
+
124
+ ###### Object Storage
125
+ ```python
126
+ s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
127
+ image_dir = "s3://img_bucket/"
128
+ s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
129
+ pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
130
+ jso_useful_key = {"_pdf_type": "", "model_list": model_json}
131
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
132
+ pipe.pipe_classify()
133
+ pipe.pipe_parse()
134
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
135
+ ```
136
+
137
+ For a complete example, see [demo.py](https://github.com/magicpdf/Magic-PDF/blob/master/demo/demo.py).
138
+
139
+ ## All Thanks To Our Contributors
140
+
141
+ <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
142
+ <img src="https://contrib.rocks/image?repo=magicpdf/Magic-PDF" />
143
+ </a>
144
+
89
145
  ## License Information
90
146
 
91
147
  See [LICENSE.md](https://github.com/magicpdf/Magic-PDF/blob/master/LICENSE.md) for details.
@@ -1,13 +1,11 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- magic_pdf/pdf_parse_by_ocr.py,sha256=jYEfc4P6o4f7mZqfPFITs3NWlBxe4v1gwLh1yu2VDIQ,8950
3
- magic_pdf/pdf_parse_by_ocr_v2.py,sha256=1XVRwmcGRtxMMRYgqkJcctD4tLjCprgpGQsHpeVQktM,637
4
- magic_pdf/pdf_parse_by_txt.py,sha256=5_kdfvDkv_XwDove2AW7SopGysYLJ1-tsOQy2yuII1Y,21932
5
- magic_pdf/pdf_parse_by_txt_v2.py,sha256=mGadyYamoCNGNsKOQM1uXQR65zMUKyL24yURGHADmVs,1908
2
+ magic_pdf/pdf_parse_by_ocr.py,sha256=IWnSWt1Z-d35xRqspzdLR2iUtma_SAu4W7K4kEk8SHc,638
3
+ magic_pdf/pdf_parse_by_txt.py,sha256=KUSH7Gh83CZmdyWw59pqDskwyJ2Kg-jU-9fnQGJQEs4,537
6
4
  magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
7
5
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
8
- magic_pdf/user_api.py,sha256=R4onPBXlA8GARwlqBh5wmxUtTxwQb-PUeFK7eTAWzoU,4971
6
+ magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
9
7
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- magic_pdf/cli/magicpdf.py,sha256=aMmY_J83_8IdkkxyZUHg8WzIbCA_oW4cyjlUUZE0Wvc,11117
8
+ magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
11
9
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
10
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
13
11
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -46,14 +44,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
46
44
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
47
45
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
48
46
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
49
- magic_pdf/libs/version.py,sha256=JXLyhF5WmLgRZBfWGz9zWe2g5ISKSLpn2jp8yLaC-s4,22
47
+ magic_pdf/libs/version.py,sha256=xFez9dUQrcuZqZRWuEIsCbMskoR-Ke1_uUZ51Kyt1tw,23
50
48
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
51
49
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
52
- magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=QD4NWEIz8UXdIG4V_3P8EaYesxk6PvC1SOtTWEy2GEY,2007
50
+ magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
51
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=N3DqbVT1hc4s9KhppWDmZWkCj2ExKltoLrQl2IWGk7c,2231
54
52
  magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
55
53
  magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
56
- magic_pdf/model/pp_structure_v2.py,sha256=qsyt9vFDGaVizBMiSaeFVHTDsJTrIHx46Ec2J8SOj1A,2469
54
+ magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
57
55
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
56
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
59
57
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -64,7 +62,7 @@ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,
64
62
  magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
65
63
  magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
66
64
  magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
67
- magic_pdf/para/para_split_v2.py,sha256=a04dsUFE3JD4DA9e2DULJgbKrcqWuCfK58de1p-T3Io,36610
65
+ magic_pdf/para/para_split_v2.py,sha256=jGOhsubdh_CEgSv9WMNmp1loq1YNlpcAj3yh3g0gPhw,37027
68
66
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
69
67
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
70
68
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -89,7 +87,7 @@ magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1
89
87
  magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
90
88
  magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
91
89
  magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
92
- magic_pdf/pre_proc/equations_replace.py,sha256=rpEHD5IvavRgUSus0tLjKcIv2faStp_Ii9vf09HVUtI,20384
90
+ magic_pdf/pre_proc/equations_replace.py,sha256=fXj7ZV7F3YtkDYrAhE9g5tHk4_3pVUyLbhDtMjbxjWU,20386
93
91
  magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
94
92
  magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
95
93
  magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
@@ -117,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
117
115
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
118
116
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
119
117
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
120
- magic_pdf-0.5.9.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
- magic_pdf-0.5.9.dist-info/METADATA,sha256=6Y0tWpKEWrjYaNVrBWddqU9mn4EKR8cSbka47hUSmog,2971
122
- magic_pdf-0.5.9.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
- magic_pdf-0.5.9.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
124
- magic_pdf-0.5.9.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
125
- magic_pdf-0.5.9.dist-info/RECORD,,
118
+ magic_pdf-0.5.11.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
+ magic_pdf-0.5.11.dist-info/METADATA,sha256=wMDXFCmnlXQKkUdp891cG46MrbDn92TlPPD8T7AT3tE,4649
120
+ magic_pdf-0.5.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
+ magic_pdf-0.5.11.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
+ magic_pdf-0.5.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
+ magic_pdf-0.5.11.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- from magic_pdf.pdf_parse_union_core import pdf_parse_union
2
-
3
- def parse_pdf_by_ocr(pdf_bytes,
4
- model_list,
5
- imageWriter,
6
- start_page_id=0,
7
- end_page_id=None,
8
- debug_mode=False,
9
- ):
10
- return pdf_parse_union(pdf_bytes,
11
- model_list,
12
- imageWriter,
13
- "ocr",
14
- start_page_id=start_page_id,
15
- end_page_id=end_page_id,
16
- debug_mode=debug_mode,
17
- )
@@ -1,56 +0,0 @@
1
- from magic_pdf.pdf_parse_union_core import pdf_parse_union
2
-
3
-
4
- def parse_pdf_by_txt(
5
- pdf_bytes,
6
- model_list,
7
- imageWriter,
8
- start_page_id=0,
9
- end_page_id=None,
10
- debug_mode=False,
11
- ):
12
- return pdf_parse_union(pdf_bytes,
13
- model_list,
14
- imageWriter,
15
- "txt",
16
- start_page_id=start_page_id,
17
- end_page_id=end_page_id,
18
- debug_mode=debug_mode,
19
- )
20
-
21
-
22
- if __name__ == "__main__":
23
- pass
24
- # if 1:
25
- # import fitz
26
- # import json
27
- #
28
- # with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
29
- # pdf_bytes = f.read()
30
- # pdf_docs = fitz.open("pdf", pdf_bytes)
31
- #
32
- # with open("/opt/data/pdf/20240418/25536-00.json") as f:
33
- # model_list = json.loads(f.readline())
34
- #
35
- # magic_model = MagicModel(model_list, pdf_docs)
36
- # for i in range(7):
37
- # print(magic_model.get_imgs(i))
38
- #
39
- # for page_no, page in enumerate(pdf_docs):
40
- # inline_equations, interline_equations, interline_equation_blocks = (
41
- # magic_model.get_equations(page_no)
42
- # )
43
- #
44
- # text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
45
- # char_level_text_blocks = page.get_text(
46
- # "rawdict", flags=fitz.TEXTFLAGS_TEXT
47
- # )["blocks"]
48
- # text_blocks = combine_chars_to_pymudict(
49
- # text_raw_blocks, char_level_text_blocks
50
- # )
51
- # text_blocks = replace_equations_in_textblock(
52
- # text_blocks, inline_equations, interline_equations
53
- # )
54
- # text_blocks = remove_citation_marker(text_blocks)
55
- #
56
- # text_blocks = remove_chars_in_text_blocks(text_blocks)