magic-pdf 0.5.9__py3-none-any.whl → 0.5.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/cli/magicpdf.py +44 -32
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +9 -1
- magic_pdf/model/pp_structure_v2.py +6 -1
- magic_pdf/para/para_split_v2.py +43 -28
- magic_pdf/pdf_parse_by_ocr.py +18 -219
- magic_pdf/pdf_parse_by_txt.py +10 -401
- magic_pdf/pre_proc/equations_replace.py +1 -1
- magic_pdf/user_api.py +3 -41
- {magic_pdf-0.5.9.dist-info → magic_pdf-0.5.11.dist-info}/METADATA +64 -8
- {magic_pdf-0.5.9.dist-info → magic_pdf-0.5.11.dist-info}/RECORD +16 -18
- magic_pdf/pdf_parse_by_ocr_v2.py +0 -17
- magic_pdf/pdf_parse_by_txt_v2.py +0 -56
- {magic_pdf-0.5.9.dist-info → magic_pdf-0.5.11.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.9.dist-info → magic_pdf-0.5.11.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.9.dist-info → magic_pdf-0.5.11.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.5.9.dist-info → magic_pdf-0.5.11.dist-info}/top_level.txt +0 -0
magic_pdf/user_api.py
CHANGED
@@ -18,8 +18,8 @@ from loguru import logger
|
|
18
18
|
from magic_pdf.libs.version import __version__
|
19
19
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
20
20
|
from magic_pdf.rw import AbsReaderWriter
|
21
|
-
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
|
22
|
-
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
|
21
|
+
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
22
|
+
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
|
23
23
|
|
24
24
|
PARSE_TYPE_TXT = "txt"
|
25
25
|
PARSE_TYPE_OCR = "ocr"
|
@@ -86,45 +86,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
86
86
|
return None
|
87
87
|
|
88
88
|
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
|
89
|
-
|
90
|
-
# for page_dict in pdf_info_dict['pdf_info']:
|
91
|
-
# for para_block in page_dict['para_blocks']:
|
92
|
-
# if para_block['type'] in ['title', 'text']:
|
93
|
-
# for line in para_block['lines']:
|
94
|
-
# for span in line['spans']:
|
95
|
-
# text_all += span['content']
|
96
|
-
|
97
|
-
# def calculate_not_common_character_rate(text):
|
98
|
-
# garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
|
99
|
-
# # 计算乱码字符的数量
|
100
|
-
# garbage_count = len(garbage_regex.findall(text))
|
101
|
-
# total = len(text)
|
102
|
-
# if total == 0:
|
103
|
-
# return 0 # 避免除以零的错误
|
104
|
-
# return garbage_count / total
|
105
|
-
#
|
106
|
-
# def calculate_not_printable_rate(text):
|
107
|
-
# printable_text = ""
|
108
|
-
# for c in text:
|
109
|
-
# if c.isprintable():
|
110
|
-
# printable_text += c
|
111
|
-
# printable_total = len(printable_text)
|
112
|
-
# total = len(text)
|
113
|
-
# if total == 0:
|
114
|
-
# return 0 # 避免除以零的错误
|
115
|
-
# return (total - printable_total) / total
|
116
|
-
#
|
117
|
-
# not_common_character_rate = calculate_not_common_character_rate(text_all)
|
118
|
-
# not_printable_rate = calculate_not_printable_rate(text_all)
|
119
|
-
# pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
|
120
|
-
# pdf_info_dict["_not_printable_rate"] = not_printable_rate
|
121
|
-
# logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
|
122
|
-
'''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
|
123
|
-
# not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
|
124
|
-
if (pdf_info_dict is None
|
125
|
-
or pdf_info_dict.get("_need_drop", False)
|
126
|
-
# or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
|
127
|
-
):
|
89
|
+
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
|
128
90
|
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
|
129
91
|
if input_model_is_empty:
|
130
92
|
pdf_models = doc_analyze(pdf_bytes, ocr=True)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.5.9
|
3
|
+
Version: 0.5.11
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/magicpdf/Magic-PDF
|
6
6
|
Requires-Python: >=3.9
|
@@ -22,11 +22,26 @@ Requires-Dist: wordninja >=2.0.0
|
|
22
22
|
Requires-Dist: scikit-learn >=1.0.2
|
23
23
|
Requires-Dist: nltk ==3.8.1
|
24
24
|
Requires-Dist: s3pathlib >=2.1.1
|
25
|
-
Requires-Dist: paddleocr
|
26
25
|
Requires-Dist: pdfminer.six >=20231228
|
26
|
+
Requires-Dist: Levenshtein
|
27
|
+
Requires-Dist: nltk
|
28
|
+
Requires-Dist: rapidfuzz
|
29
|
+
Requires-Dist: statistics
|
30
|
+
Requires-Dist: openxlab
|
31
|
+
Requires-Dist: pandas
|
32
|
+
Requires-Dist: numpy
|
33
|
+
Requires-Dist: matplotlib
|
34
|
+
Requires-Dist: seaborn
|
35
|
+
Requires-Dist: scipy
|
36
|
+
Requires-Dist: scikit-learn
|
37
|
+
Requires-Dist: tqdm
|
38
|
+
Requires-Dist: htmltabletomd
|
39
|
+
Requires-Dist: pypandoc
|
27
40
|
Provides-Extra: cpu
|
41
|
+
Requires-Dist: paddleocr ; extra == 'cpu'
|
28
42
|
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
29
43
|
Provides-Extra: gpu
|
44
|
+
Requires-Dist: paddleocr ; extra == 'gpu'
|
30
45
|
Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
31
46
|
|
32
47
|
<div id="top"></div>
|
@@ -64,6 +79,10 @@ Key features include:
|
|
64
79
|
- Compatibility with CPU and GPU environments
|
65
80
|
- Available for Windows, Linux, and macOS platforms
|
66
81
|
|
82
|
+
## Project Panorama
|
83
|
+
|
84
|
+

|
85
|
+
|
67
86
|
## Getting Started
|
68
87
|
|
69
88
|
### Requirements
|
@@ -72,20 +91,57 @@ Key features include:
|
|
72
91
|
|
73
92
|
### Usage Instructions
|
74
93
|
|
75
|
-
1.
|
76
|
-
|
94
|
+
#### 1. Install Magic-PDF
|
77
95
|
```bash
|
78
|
-
pip install magic-pdf
|
79
|
-
or
|
80
|
-
pip install magic-pdf[gpu] # Install the GPU version
|
96
|
+
pip install magic-pdf
|
81
97
|
```
|
82
98
|
|
83
|
-
2.
|
99
|
+
#### 2. Usage via Command Line
|
84
100
|
|
101
|
+
###### simple
|
102
|
+
```bash
|
103
|
+
cp magic-pdf.template.json ~/magic-pdf.json
|
104
|
+
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
105
|
+
```
|
106
|
+
###### more
|
85
107
|
```bash
|
86
108
|
magic-pdf --help
|
87
109
|
```
|
88
110
|
|
111
|
+
#### 3. Usage via API
|
112
|
+
|
113
|
+
###### Local
|
114
|
+
```python
|
115
|
+
image_writer = DiskReaderWriter(local_image_dir)
|
116
|
+
image_dir = str(os.path.basename(local_image_dir))
|
117
|
+
jso_useful_key = {"_pdf_type": "", "model_list": model_json}
|
118
|
+
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
|
119
|
+
pipe.pipe_classify()
|
120
|
+
pipe.pipe_parse()
|
121
|
+
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
122
|
+
```
|
123
|
+
|
124
|
+
###### Object Storage
|
125
|
+
```python
|
126
|
+
s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
|
127
|
+
image_dir = "s3://img_bucket/"
|
128
|
+
s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
|
129
|
+
pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
|
130
|
+
jso_useful_key = {"_pdf_type": "", "model_list": model_json}
|
131
|
+
pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
|
132
|
+
pipe.pipe_classify()
|
133
|
+
pipe.pipe_parse()
|
134
|
+
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
135
|
+
```
|
136
|
+
|
137
|
+
For a complete demo, see [demo.py](https://github.com/magicpdf/Magic-PDF/blob/master/demo/demo.py)
|
138
|
+
|
139
|
+
## All Thanks To Our Contributors
|
140
|
+
|
141
|
+
<a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
|
142
|
+
<img src="https://contrib.rocks/image?repo=magicpdf/Magic-PDF" />
|
143
|
+
</a>
|
144
|
+
|
89
145
|
## License Information
|
90
146
|
|
91
147
|
See [LICENSE.md](https://github.com/magicpdf/Magic-PDF/blob/master/LICENSE.md) for details.
|
@@ -1,13 +1,11 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
magic_pdf/pdf_parse_by_ocr.py,sha256=
|
3
|
-
magic_pdf/
|
4
|
-
magic_pdf/pdf_parse_by_txt.py,sha256=5_kdfvDkv_XwDove2AW7SopGysYLJ1-tsOQy2yuII1Y,21932
|
5
|
-
magic_pdf/pdf_parse_by_txt_v2.py,sha256=mGadyYamoCNGNsKOQM1uXQR65zMUKyL24yURGHADmVs,1908
|
2
|
+
magic_pdf/pdf_parse_by_ocr.py,sha256=IWnSWt1Z-d35xRqspzdLR2iUtma_SAu4W7K4kEk8SHc,638
|
3
|
+
magic_pdf/pdf_parse_by_txt.py,sha256=KUSH7Gh83CZmdyWw59pqDskwyJ2Kg-jU-9fnQGJQEs4,537
|
6
4
|
magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
|
7
5
|
magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
|
8
|
-
magic_pdf/user_api.py,sha256=
|
6
|
+
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
9
7
|
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
-
magic_pdf/cli/magicpdf.py,sha256=
|
8
|
+
magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
|
11
9
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
10
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
13
11
|
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
|
@@ -46,14 +44,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
46
44
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
47
45
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
48
46
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
49
|
-
magic_pdf/libs/version.py,sha256=
|
47
|
+
magic_pdf/libs/version.py,sha256=xFez9dUQrcuZqZRWuEIsCbMskoR-Ke1_uUZ51Kyt1tw,23
|
50
48
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
51
49
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
52
|
-
magic_pdf/model/__init__.py,sha256=
|
53
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
50
|
+
magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
|
51
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=N3DqbVT1hc4s9KhppWDmZWkCj2ExKltoLrQl2IWGk7c,2231
|
54
52
|
magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
|
55
53
|
magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
|
56
|
-
magic_pdf/model/pp_structure_v2.py,sha256=
|
54
|
+
magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
|
57
55
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
56
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
59
57
|
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
@@ -64,7 +62,7 @@ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,
|
|
64
62
|
magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
|
65
63
|
magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
|
66
64
|
magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
|
67
|
-
magic_pdf/para/para_split_v2.py,sha256=
|
65
|
+
magic_pdf/para/para_split_v2.py,sha256=jGOhsubdh_CEgSv9WMNmp1loq1YNlpcAj3yh3g0gPhw,37027
|
68
66
|
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
69
67
|
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
70
68
|
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
@@ -89,7 +87,7 @@ magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1
|
|
89
87
|
magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
|
90
88
|
magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
|
91
89
|
magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
|
92
|
-
magic_pdf/pre_proc/equations_replace.py,sha256=
|
90
|
+
magic_pdf/pre_proc/equations_replace.py,sha256=fXj7ZV7F3YtkDYrAhE9g5tHk4_3pVUyLbhDtMjbxjWU,20386
|
93
91
|
magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
|
94
92
|
magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
|
95
93
|
magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
|
@@ -117,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
117
115
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
118
116
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
119
117
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
124
|
-
magic_pdf-0.5.
|
125
|
-
magic_pdf-0.5.
|
118
|
+
magic_pdf-0.5.11.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
119
|
+
magic_pdf-0.5.11.dist-info/METADATA,sha256=wMDXFCmnlXQKkUdp891cG46MrbDn92TlPPD8T7AT3tE,4649
|
120
|
+
magic_pdf-0.5.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
121
|
+
magic_pdf-0.5.11.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
122
|
+
magic_pdf-0.5.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
123
|
+
magic_pdf-0.5.11.dist-info/RECORD,,
|
magic_pdf/pdf_parse_by_ocr_v2.py
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
from magic_pdf.pdf_parse_union_core import pdf_parse_union
|
2
|
-
|
3
|
-
def parse_pdf_by_ocr(pdf_bytes,
|
4
|
-
model_list,
|
5
|
-
imageWriter,
|
6
|
-
start_page_id=0,
|
7
|
-
end_page_id=None,
|
8
|
-
debug_mode=False,
|
9
|
-
):
|
10
|
-
return pdf_parse_union(pdf_bytes,
|
11
|
-
model_list,
|
12
|
-
imageWriter,
|
13
|
-
"ocr",
|
14
|
-
start_page_id=start_page_id,
|
15
|
-
end_page_id=end_page_id,
|
16
|
-
debug_mode=debug_mode,
|
17
|
-
)
|
magic_pdf/pdf_parse_by_txt_v2.py
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
from magic_pdf.pdf_parse_union_core import pdf_parse_union
|
2
|
-
|
3
|
-
|
4
|
-
def parse_pdf_by_txt(
|
5
|
-
pdf_bytes,
|
6
|
-
model_list,
|
7
|
-
imageWriter,
|
8
|
-
start_page_id=0,
|
9
|
-
end_page_id=None,
|
10
|
-
debug_mode=False,
|
11
|
-
):
|
12
|
-
return pdf_parse_union(pdf_bytes,
|
13
|
-
model_list,
|
14
|
-
imageWriter,
|
15
|
-
"txt",
|
16
|
-
start_page_id=start_page_id,
|
17
|
-
end_page_id=end_page_id,
|
18
|
-
debug_mode=debug_mode,
|
19
|
-
)
|
20
|
-
|
21
|
-
|
22
|
-
if __name__ == "__main__":
|
23
|
-
pass
|
24
|
-
# if 1:
|
25
|
-
# import fitz
|
26
|
-
# import json
|
27
|
-
#
|
28
|
-
# with open("/opt/data/pdf/20240418/25536-00.pdf", "rb") as f:
|
29
|
-
# pdf_bytes = f.read()
|
30
|
-
# pdf_docs = fitz.open("pdf", pdf_bytes)
|
31
|
-
#
|
32
|
-
# with open("/opt/data/pdf/20240418/25536-00.json") as f:
|
33
|
-
# model_list = json.loads(f.readline())
|
34
|
-
#
|
35
|
-
# magic_model = MagicModel(model_list, pdf_docs)
|
36
|
-
# for i in range(7):
|
37
|
-
# print(magic_model.get_imgs(i))
|
38
|
-
#
|
39
|
-
# for page_no, page in enumerate(pdf_docs):
|
40
|
-
# inline_equations, interline_equations, interline_equation_blocks = (
|
41
|
-
# magic_model.get_equations(page_no)
|
42
|
-
# )
|
43
|
-
#
|
44
|
-
# text_raw_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
|
45
|
-
# char_level_text_blocks = page.get_text(
|
46
|
-
# "rawdict", flags=fitz.TEXTFLAGS_TEXT
|
47
|
-
# )["blocks"]
|
48
|
-
# text_blocks = combine_chars_to_pymudict(
|
49
|
-
# text_raw_blocks, char_level_text_blocks
|
50
|
-
# )
|
51
|
-
# text_blocks = replace_equations_in_textblock(
|
52
|
-
# text_blocks, inline_equations, interline_equations
|
53
|
-
# )
|
54
|
-
# text_blocks = remove_citation_marker(text_blocks)
|
55
|
-
#
|
56
|
-
# text_blocks = remove_chars_in_text_blocks(text_blocks)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|