oafuncs 0.0.87__tar.gz → 0.0.89__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {oafuncs-0.0.87/oafuncs.egg-info → oafuncs-0.0.89}/PKG-INFO +9 -8
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_data.py +1 -0
- oafuncs-0.0.89/oafuncs/oa_down/literature.py +263 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_draw.py +1 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_file.py +1 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_help.py +2 -2
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_nc.py +1 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_python.py +1 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89/oafuncs.egg-info}/PKG-INFO +9 -8
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs.egg-info/requires.txt +7 -6
- {oafuncs-0.0.87 → oafuncs-0.0.89}/setup.py +30 -24
- oafuncs-0.0.87/oafuncs/oa_down/literature.py +0 -332
- {oafuncs-0.0.87 → oafuncs-0.0.89}/LICENSE.txt +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/MANIFEST.in +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/README.md +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/__init__.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/data_store/OAFuncs.png +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_cmap.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/User_Agent-list.txt +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/__init__.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/hycom_3hourly.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/test_ua.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/user_agent.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_sign/__init__.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_sign/meteorological.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_sign/ocean.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_sign/scientific.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_tool/__init__.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_tool/email.py +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs.egg-info/SOURCES.txt +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs.egg-info/dependency_links.txt +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs.egg-info/top_level.txt +0 -0
- {oafuncs-0.0.87 → oafuncs-0.0.89}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: oafuncs
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.89
|
4
4
|
Summary: Oceanic and Atmospheric Functions
|
5
5
|
Home-page: https://github.com/Industry-Pays/OAFuncs
|
6
6
|
Author: Kun Liu
|
@@ -15,20 +15,21 @@ Classifier: Programming Language :: Python :: 3.11
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.12
|
16
16
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
17
17
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
18
|
-
Requires-Python: >=3.
|
18
|
+
Requires-Python: >=3.9.0
|
19
19
|
Description-Content-Type: text/markdown
|
20
20
|
License-File: LICENSE.txt
|
21
|
-
Requires-Dist: matplotlib
|
22
21
|
Requires-Dist: numpy
|
23
22
|
Requires-Dist: scipy
|
23
|
+
Requires-Dist: pandas
|
24
24
|
Requires-Dist: xarray
|
25
|
-
Requires-Dist: Cartopy
|
26
|
-
Requires-Dist: netCDF4
|
27
|
-
Requires-Dist: requests
|
28
|
-
Requires-Dist: bs4
|
29
25
|
Requires-Dist: rich
|
30
26
|
Requires-Dist: pathlib
|
31
|
-
Requires-Dist:
|
27
|
+
Requires-Dist: requests
|
28
|
+
Requires-Dist: bs4
|
29
|
+
Requires-Dist: matplotlib
|
30
|
+
Requires-Dist: Cartopy
|
31
|
+
Requires-Dist: netCDF4
|
32
|
+
Requires-Dist: xlrd
|
32
33
|
|
33
34
|
|
34
35
|
# oafuncs
|
@@ -0,0 +1,263 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding=utf-8
|
3
|
+
"""
|
4
|
+
Author: Liu Kun && 16031215@qq.com
|
5
|
+
Date: 2024-11-28 10:42:56
|
6
|
+
LastEditors: Liu Kun && 16031215@qq.com
|
7
|
+
LastEditTime: 2025-01-05 10:51:42
|
8
|
+
FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\literature.py
|
9
|
+
Description:
|
10
|
+
EditPlatform: vscode
|
11
|
+
ComputerInfo: XPS 15 9510
|
12
|
+
SystemInfo: Windows 11
|
13
|
+
Python Version: 3.12
|
14
|
+
"""
|
15
|
+
|
16
|
+
import os
|
17
|
+
import re
|
18
|
+
import time
|
19
|
+
from pathlib import Path
|
20
|
+
|
21
|
+
import pandas as pd
|
22
|
+
import requests
|
23
|
+
from rich import print
|
24
|
+
from rich.progress import track
|
25
|
+
from oafuncs.oa_down.user_agent import get_ua
|
26
|
+
|
27
|
+
__all__ = ["download5doi"]
|
28
|
+
|
29
|
+
|
30
|
+
def _get_file_size(file_path, unit="KB"):
|
31
|
+
# 检查文件是否存在
|
32
|
+
if not os.path.exists(file_path):
|
33
|
+
return "文件不存在"
|
34
|
+
|
35
|
+
# 获取文件大小(字节)
|
36
|
+
file_size = os.path.getsize(file_path)
|
37
|
+
|
38
|
+
# 单位转换字典
|
39
|
+
unit_dict = {
|
40
|
+
"PB": 1024**5,
|
41
|
+
"TB": 1024**4,
|
42
|
+
"GB": 1024**3,
|
43
|
+
"MB": 1024**2,
|
44
|
+
"KB": 1024,
|
45
|
+
}
|
46
|
+
|
47
|
+
# 检查传入的单位是否合法
|
48
|
+
if unit not in unit_dict:
|
49
|
+
return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
|
50
|
+
|
51
|
+
# 转换文件大小到指定单位
|
52
|
+
converted_size = file_size / unit_dict[unit]
|
53
|
+
|
54
|
+
return converted_size
|
55
|
+
|
56
|
+
|
57
|
+
class _Downloader:
|
58
|
+
"""
|
59
|
+
根据doi下载文献pdf
|
60
|
+
"""
|
61
|
+
|
62
|
+
def __init__(self, doi, store_path):
|
63
|
+
self.url_list = [
|
64
|
+
r"https://sci-hub.se",
|
65
|
+
r"https://sci-hub.ren",
|
66
|
+
r"https://sci-hub.st",
|
67
|
+
r"https://sci-hub.ru",
|
68
|
+
]
|
69
|
+
self.base_url = None
|
70
|
+
self.url = None
|
71
|
+
self.doi = doi
|
72
|
+
self.pdf_url = None
|
73
|
+
self.pdf_path = None
|
74
|
+
self.headers = {"User-Agent": get_ua().encode("utf-8")}
|
75
|
+
# 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
|
76
|
+
# self.fname = doi.replace(r'/', '_') + '.pdf'
|
77
|
+
self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
|
78
|
+
self.store_path = Path(store_path)
|
79
|
+
self.fpath = self.store_path / self.fname
|
80
|
+
self.wrong_record_file = self.store_path / "wrong_record.txt"
|
81
|
+
self.sleep = 5
|
82
|
+
self.cookies = None
|
83
|
+
self.check_size = 50
|
84
|
+
self.url_index = 0
|
85
|
+
self.try_times_each_url_max = 3
|
86
|
+
self.try_times = 0
|
87
|
+
|
88
|
+
def get_pdf_url(self):
|
89
|
+
print("[bold #E6E6FA]-" * 100)
|
90
|
+
print(f"DOI: {self.doi}")
|
91
|
+
print(f"Requesting: {self.url}...")
|
92
|
+
response = requests.get(self.url, headers=self.headers)
|
93
|
+
if response.status_code == 200:
|
94
|
+
self.cookies = response.cookies
|
95
|
+
text = response.text.replace("\\", "")
|
96
|
+
# text = text.replace(' ', '') # It is important to remove the space
|
97
|
+
# print(text)
|
98
|
+
pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
|
99
|
+
match = pattern.search(text)
|
100
|
+
if match:
|
101
|
+
got_url = match.group(1)
|
102
|
+
if r"http" not in got_url:
|
103
|
+
if got_url[:2] == "//":
|
104
|
+
self.pdf_url = "https:" + got_url
|
105
|
+
else:
|
106
|
+
self.pdf_url = self.base_url + got_url
|
107
|
+
else:
|
108
|
+
self.pdf_url = got_url
|
109
|
+
print(f"URL: {self.pdf_url}")
|
110
|
+
else:
|
111
|
+
print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
|
112
|
+
self.try_times = self.try_times_each_url_max + 1
|
113
|
+
else:
|
114
|
+
print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
|
115
|
+
print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
|
116
|
+
self.try_times = self.try_times_each_url_max + 1
|
117
|
+
|
118
|
+
def url_iterate(self):
|
119
|
+
if self.url_index >= len(self.url_list):
|
120
|
+
return
|
121
|
+
url = self.url_list[self.url_index]
|
122
|
+
self.base_url = url
|
123
|
+
self.url = url + "/" + self.doi
|
124
|
+
self.get_pdf_url()
|
125
|
+
# for url in self.url_list:
|
126
|
+
# self.url = url + self.doi
|
127
|
+
# self.get_pdf_url()
|
128
|
+
# if self.pdf_url:
|
129
|
+
# break
|
130
|
+
|
131
|
+
def write_wrong_record(self):
|
132
|
+
with open(self.wrong_record_file, "a") as f:
|
133
|
+
f.write(self.doi + "\n")
|
134
|
+
|
135
|
+
def download_pdf(self):
|
136
|
+
if self.fpath.exists():
|
137
|
+
fsize = _get_file_size(self.fpath, unit="KB")
|
138
|
+
if fsize < self.check_size:
|
139
|
+
# delete the wrong file
|
140
|
+
os.remove(self.fpath)
|
141
|
+
print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
|
142
|
+
else:
|
143
|
+
print("[bold #E6E6FA]-" * 100)
|
144
|
+
print(f"[bold purple]The PDF file {self.fpath} already exists.")
|
145
|
+
return
|
146
|
+
self.url_index = 0
|
147
|
+
already_downloaded = False
|
148
|
+
self.try_times = 0
|
149
|
+
while not already_downloaded:
|
150
|
+
self.url_iterate()
|
151
|
+
if not self.pdf_url:
|
152
|
+
self.url_index += 1
|
153
|
+
if self.url_index >= len(self.url_list):
|
154
|
+
print("Failed to download the PDF file.")
|
155
|
+
self.write_wrong_record()
|
156
|
+
return
|
157
|
+
else:
|
158
|
+
self.try_times = 0
|
159
|
+
continue
|
160
|
+
else:
|
161
|
+
self.try_times += 1
|
162
|
+
if self.try_times > self.try_times_each_url_max:
|
163
|
+
self.url_index += 1
|
164
|
+
if self.url_index >= len(self.url_list):
|
165
|
+
# print("Failed to download the PDF file.")
|
166
|
+
self.write_wrong_record()
|
167
|
+
return
|
168
|
+
print(f"Downloading: {self.fname}...")
|
169
|
+
try:
|
170
|
+
response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
|
171
|
+
if response.status_code == 200:
|
172
|
+
with open(self.fpath, "wb") as f:
|
173
|
+
f.write(response.content)
|
174
|
+
fsize = _get_file_size(self.fpath, unit="KB")
|
175
|
+
if fsize < self.check_size:
|
176
|
+
# delete the wrong file
|
177
|
+
os.remove(self.fpath)
|
178
|
+
print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
|
179
|
+
else:
|
180
|
+
print(f"[bold green]Sucessful to download {self.fpath}")
|
181
|
+
already_downloaded = True
|
182
|
+
else:
|
183
|
+
self.try_times = self.try_times_each_url_max + 1
|
184
|
+
print(f"Failed to download the PDF file. Status code: {response.status_code}")
|
185
|
+
print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
|
186
|
+
except Exception as e:
|
187
|
+
print(f"Failed to download the PDF file. Error: {e}")
|
188
|
+
time.sleep(self.sleep)
|
189
|
+
if self.try_times >= self.try_times_each_url_max:
|
190
|
+
self.url_index += 1
|
191
|
+
if self.url_index >= len(self.url_list):
|
192
|
+
print("\n[bold #CD5C5C]Failed to download the PDF file.")
|
193
|
+
self.write_wrong_record()
|
194
|
+
return
|
195
|
+
if self.try_times == self.try_times_each_url_max:
|
196
|
+
print(f"Tried {self.try_times} times for {self.url_list[self.url_index-1]}.")
|
197
|
+
print("Try another URL...")
|
198
|
+
|
199
|
+
|
200
|
+
def read_excel(file, col_name=r"DOI"):
|
201
|
+
df = pd.read_excel(file)
|
202
|
+
df_list = df[col_name].tolist()
|
203
|
+
# 去掉nan
|
204
|
+
df_list = [doi for doi in df_list if str(doi) != "nan"]
|
205
|
+
return df_list
|
206
|
+
|
207
|
+
|
208
|
+
def read_txt(file):
|
209
|
+
with open(file, "r") as f:
|
210
|
+
lines = f.readlines()
|
211
|
+
# 去掉换行符以及空行
|
212
|
+
lines = [line.strip() for line in lines if line.strip()]
|
213
|
+
return lines
|
214
|
+
|
215
|
+
|
216
|
+
def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r"DOI"):
|
217
|
+
"""
|
218
|
+
Description:
|
219
|
+
Download PDF files by DOI.
|
220
|
+
|
221
|
+
Parameters:
|
222
|
+
store_path: str, The path to store the PDF files.
|
223
|
+
doi_list: list or str, The list of DOIs.
|
224
|
+
txt_file: str, The path of the txt file that contains the DOIs.
|
225
|
+
excel_file: str, The path of the excel file that contains the DOIs.
|
226
|
+
col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.
|
227
|
+
|
228
|
+
Returns:
|
229
|
+
None
|
230
|
+
|
231
|
+
Example:
|
232
|
+
download5doi(doi_list='10.3389/feart.2021.698876')
|
233
|
+
download5doi(store_path=r'I:\Delete\ref_pdf', doi_list='10.3389/feart.2021.698876')
|
234
|
+
download5doi(store_path=r'I:\Delete\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
|
235
|
+
download5doi(store_path=r'I:\Delete\ref_pdf', txt_file=r'I:\Delete\ref_pdf\wrong_record.txt')
|
236
|
+
download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx')
|
237
|
+
download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx', col_name='DOI')
|
238
|
+
"""
|
239
|
+
if not store_path:
|
240
|
+
store_path = Path.cwd()
|
241
|
+
else:
|
242
|
+
store_path = Path(str(store_path))
|
243
|
+
store_path.mkdir(parents=True, exist_ok=True)
|
244
|
+
store_path = str(store_path)
|
245
|
+
|
246
|
+
# 如果doi_list是str,转换为list
|
247
|
+
if isinstance(doi_list, str) and doi_list:
|
248
|
+
doi_list = [doi_list]
|
249
|
+
if txt_file:
|
250
|
+
doi_list = read_txt(txt_file)
|
251
|
+
if excel_file:
|
252
|
+
doi_list = read_excel(excel_file, col_name)
|
253
|
+
print(f"Downloading {len(doi_list)} PDF files...")
|
254
|
+
for doi in track(doi_list, description="Downloading..."):
|
255
|
+
download = _Downloader(doi, store_path)
|
256
|
+
download.download_pdf()
|
257
|
+
|
258
|
+
|
259
|
+
if __name__ == "__main__":
|
260
|
+
store_path = r"I:\Delete\ref_pdf"
|
261
|
+
excel_file = r"I:\Delete\Ref_DA_ROMS\savedrecs.xls"
|
262
|
+
# download5doi(store_path, doi_list='10.1007/s00382-022-06260-x')
|
263
|
+
download5doi(store_path, excel_file=excel_file)
|
@@ -23,6 +23,7 @@ import matplotlib.pyplot as plt
|
|
23
23
|
import numpy as np
|
24
24
|
import xarray as xr
|
25
25
|
from cartopy.mpl.ticker import LatitudeFormatter, LongitudeFormatter
|
26
|
+
from rich import print
|
26
27
|
|
27
28
|
__all__ = ["fig_minus", "gif", "add_cartopy", "add_gridlines", "MidpointNormalize", "add_lonlat_unit", "contour", "contourf", "quiver"]
|
28
29
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: oafuncs
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.89
|
4
4
|
Summary: Oceanic and Atmospheric Functions
|
5
5
|
Home-page: https://github.com/Industry-Pays/OAFuncs
|
6
6
|
Author: Kun Liu
|
@@ -15,20 +15,21 @@ Classifier: Programming Language :: Python :: 3.11
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.12
|
16
16
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
17
17
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
18
|
-
Requires-Python: >=3.
|
18
|
+
Requires-Python: >=3.9.0
|
19
19
|
Description-Content-Type: text/markdown
|
20
20
|
License-File: LICENSE.txt
|
21
|
-
Requires-Dist: matplotlib
|
22
21
|
Requires-Dist: numpy
|
23
22
|
Requires-Dist: scipy
|
23
|
+
Requires-Dist: pandas
|
24
24
|
Requires-Dist: xarray
|
25
|
-
Requires-Dist: Cartopy
|
26
|
-
Requires-Dist: netCDF4
|
27
|
-
Requires-Dist: requests
|
28
|
-
Requires-Dist: bs4
|
29
25
|
Requires-Dist: rich
|
30
26
|
Requires-Dist: pathlib
|
31
|
-
Requires-Dist:
|
27
|
+
Requires-Dist: requests
|
28
|
+
Requires-Dist: bs4
|
29
|
+
Requires-Dist: matplotlib
|
30
|
+
Requires-Dist: Cartopy
|
31
|
+
Requires-Dist: netCDF4
|
32
|
+
Requires-Dist: xlrd
|
32
33
|
|
33
34
|
|
34
35
|
# oafuncs
|
@@ -17,22 +17,30 @@ DESCRIPTION = 'Oceanic and Atmospheric Functions'
|
|
17
17
|
URL = 'https://github.com/Industry-Pays/OAFuncs'
|
18
18
|
EMAIL = 'liukun0312@stu.ouc.edu.cn'
|
19
19
|
AUTHOR = 'Kun Liu'
|
20
|
-
REQUIRES_PYTHON = '>=3.
|
21
|
-
VERSION = '0.0.
|
20
|
+
REQUIRES_PYTHON = '>=3.9.0' # 2025/01/05
|
21
|
+
VERSION = '0.0.89'
|
22
22
|
|
23
23
|
# What packages are required for this module to be executed?
|
24
24
|
REQUIRED = [
|
25
|
-
|
25
|
+
# ------ General ------
|
26
26
|
"numpy",
|
27
27
|
"scipy",
|
28
|
+
"pandas",
|
28
29
|
"xarray",
|
29
|
-
|
30
|
-
"
|
30
|
+
# ------ Progress and Print ------
|
31
|
+
"rich",
|
32
|
+
# ------ Path ------
|
33
|
+
"pathlib",
|
34
|
+
# ------ Internet ------
|
31
35
|
"requests",
|
32
36
|
"bs4",
|
33
|
-
|
34
|
-
|
35
|
-
|
37
|
+
# ------ Picture ------
|
38
|
+
"matplotlib",
|
39
|
+
"Cartopy",
|
40
|
+
# ------ File ------
|
41
|
+
"netCDF4",
|
42
|
+
"xlrd",
|
43
|
+
# ------ Other ------
|
36
44
|
]
|
37
45
|
|
38
46
|
# What packages are optional?
|
@@ -106,42 +114,40 @@ class UploadCommand(Command):
|
|
106
114
|
# Where the magic happens:
|
107
115
|
setup(
|
108
116
|
name=NAME,
|
109
|
-
version=about[
|
117
|
+
version=about["__version__"],
|
110
118
|
description=DESCRIPTION,
|
111
119
|
long_description=long_description,
|
112
|
-
long_description_content_type=
|
120
|
+
long_description_content_type="text/markdown",
|
113
121
|
author=AUTHOR,
|
114
122
|
author_email=EMAIL,
|
115
123
|
python_requires=REQUIRES_PYTHON,
|
116
124
|
url=URL,
|
117
|
-
packages=find_packages(
|
118
|
-
exclude=["oa_*", "oa_down", "oa_sign", "oa_tool"]),
|
125
|
+
packages=find_packages(exclude=["oa_*", "oa_down", "oa_sign", "oa_tool"]),
|
119
126
|
# packages=find_packages(exclude=["nc", "file", "*.tests.*", "tests.*"]),
|
120
127
|
# If your package is a single module, use this instead of 'packages':
|
121
128
|
# py_modules=['mypackage'],
|
122
|
-
|
123
129
|
# entry_points={
|
124
130
|
# 'console_scripts': ['mycli=mymodule:cli'],
|
125
131
|
# },
|
126
132
|
install_requires=REQUIRED,
|
127
133
|
extras_require=EXTRAS,
|
128
134
|
include_package_data=True,
|
129
|
-
license=
|
135
|
+
license="MIT",
|
130
136
|
classifiers=[
|
131
137
|
# Trove classifiers
|
132
138
|
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
139
|
+
"License :: OSI Approved :: MIT License",
|
140
|
+
"Programming Language :: Python",
|
141
|
+
"Programming Language :: Python :: 3",
|
142
|
+
"Programming Language :: Python :: 3.9",
|
143
|
+
"Programming Language :: Python :: 3.10",
|
144
|
+
"Programming Language :: Python :: 3.11",
|
145
|
+
"Programming Language :: Python :: 3.12",
|
146
|
+
"Programming Language :: Python :: Implementation :: CPython",
|
147
|
+
"Programming Language :: Python :: Implementation :: PyPy",
|
142
148
|
],
|
143
149
|
# $ setup.py publish support.
|
144
150
|
cmdclass={
|
145
|
-
|
151
|
+
"upload": UploadCommand,
|
146
152
|
},
|
147
153
|
)
|
@@ -1,332 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
# coding=utf-8
|
3
|
-
'''
|
4
|
-
Author: Liu Kun && 16031215@qq.com
|
5
|
-
Date: 2024-11-28 10:42:56
|
6
|
-
LastEditors: Liu Kun && 16031215@qq.com
|
7
|
-
LastEditTime: 2024-11-28 10:43:18
|
8
|
-
FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\literature.py
|
9
|
-
Description:
|
10
|
-
EditPlatform: vscode
|
11
|
-
ComputerInfo: XPS 15 9510
|
12
|
-
SystemInfo: Windows 11
|
13
|
-
Python Version: 3.12
|
14
|
-
'''
|
15
|
-
|
16
|
-
|
17
|
-
import os
|
18
|
-
import random
|
19
|
-
import re
|
20
|
-
import time
|
21
|
-
from pathlib import Path
|
22
|
-
|
23
|
-
import pandas as pd
|
24
|
-
import requests
|
25
|
-
from rich import print
|
26
|
-
from rich.progress import track
|
27
|
-
|
28
|
-
__all__ = ['download5doi']
|
29
|
-
|
30
|
-
|
31
|
-
def _get_file_size(file_path, unit='KB'):
|
32
|
-
# 检查文件是否存在
|
33
|
-
if not os.path.exists(file_path):
|
34
|
-
return "文件不存在"
|
35
|
-
|
36
|
-
# 获取文件大小(字节)
|
37
|
-
file_size = os.path.getsize(file_path)
|
38
|
-
|
39
|
-
# 单位转换字典
|
40
|
-
unit_dict = {
|
41
|
-
'PB': 1024**5,
|
42
|
-
'TB': 1024**4,
|
43
|
-
'GB': 1024**3,
|
44
|
-
'MB': 1024**2,
|
45
|
-
'KB': 1024,
|
46
|
-
}
|
47
|
-
|
48
|
-
# 检查传入的单位是否合法
|
49
|
-
if unit not in unit_dict:
|
50
|
-
return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
|
51
|
-
|
52
|
-
# 转换文件大小到指定单位
|
53
|
-
converted_size = file_size / unit_dict[unit]
|
54
|
-
|
55
|
-
return converted_size
|
56
|
-
|
57
|
-
|
58
|
-
class _Downloader:
|
59
|
-
'''
|
60
|
-
根据doi下载文献pdf
|
61
|
-
'''
|
62
|
-
|
63
|
-
def __init__(self, doi, store_path):
|
64
|
-
self.url_list = [r'https://sci-hub.se',
|
65
|
-
r'https://sci-hub.ren',
|
66
|
-
r'https://sci-hub.st',
|
67
|
-
r'https://sci-hub.ru',
|
68
|
-
]
|
69
|
-
self.base_url = None
|
70
|
-
self.url = None
|
71
|
-
self.doi = doi
|
72
|
-
self.pdf_url = None
|
73
|
-
self.pdf_path = None
|
74
|
-
self.headers = {'User-Agent': self.get_ua().encode('utf-8')}
|
75
|
-
# 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
|
76
|
-
# self.fname = doi.replace(r'/', '_') + '.pdf'
|
77
|
-
self.fname = re.sub(r'[/<>:"?*|]', '_', doi) + '.pdf'
|
78
|
-
self.store_path = Path(store_path)
|
79
|
-
self.fpath = self.store_path / self.fname
|
80
|
-
self.wrong_record_file = self.store_path / 'wrong_record.txt'
|
81
|
-
self.sleep = 5
|
82
|
-
self.cookies = None
|
83
|
-
self.check_size = 50
|
84
|
-
self.url_index = 0
|
85
|
-
self.try_times_each_url_max = 3
|
86
|
-
self.try_times = 0
|
87
|
-
|
88
|
-
def get_ua(self):
|
89
|
-
ua_list = [
|
90
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
|
91
|
-
"Opera/8.0 (Windows NT 5.1; U; en)",
|
92
|
-
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
|
93
|
-
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
|
94
|
-
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
|
95
|
-
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
|
96
|
-
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
|
97
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
|
98
|
-
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
|
99
|
-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
|
100
|
-
"Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
|
101
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
|
102
|
-
"MAC:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
|
103
|
-
"Windows:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
|
104
|
-
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
|
105
|
-
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
|
106
|
-
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
|
107
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
|
108
|
-
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
|
109
|
-
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
|
110
|
-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
|
111
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
|
112
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
|
113
|
-
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
|
114
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
|
115
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
|
116
|
-
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
|
117
|
-
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"
|
118
|
-
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
|
119
|
-
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
|
120
|
-
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
|
121
|
-
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
|
122
|
-
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
|
123
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
|
124
|
-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
|
125
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
|
126
|
-
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36",
|
127
|
-
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
|
128
|
-
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
|
129
|
-
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
|
130
|
-
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
|
131
|
-
"Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
|
132
|
-
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
|
133
|
-
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
|
134
|
-
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
|
135
|
-
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
|
136
|
-
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
|
137
|
-
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
|
138
|
-
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
|
139
|
-
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
|
140
|
-
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
|
141
|
-
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
|
142
|
-
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
|
143
|
-
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
|
144
|
-
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
|
145
|
-
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
|
146
|
-
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
|
147
|
-
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
|
148
|
-
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
|
149
|
-
"UCWEB7.0.2.37/28/999",
|
150
|
-
"NOKIA5700/UCWEB7.0.2.37/28/999",
|
151
|
-
"Openwave/UCWEB7.0.2.37/28/999",
|
152
|
-
"Openwave/UCWEB7.0.2.37/28/999",
|
153
|
-
]
|
154
|
-
ua_index = random.randint(0, len(ua_list)-1)
|
155
|
-
ua = ua_list[ua_index]
|
156
|
-
return ua
|
157
|
-
|
158
|
-
def get_pdf_url(self):
|
159
|
-
print('[bold #E6E6FA]-'*100)
|
160
|
-
print(f"DOI: {self.doi}")
|
161
|
-
print(f"Requesting: {self.url}...")
|
162
|
-
response = requests.get(self.url, headers=self.headers)
|
163
|
-
if response.status_code == 200:
|
164
|
-
self.cookies = response.cookies
|
165
|
-
text = response.text.replace('\\', '')
|
166
|
-
# text = text.replace(' ', '') # It is important to remove the space
|
167
|
-
# print(text)
|
168
|
-
pattern = re.compile(
|
169
|
-
r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
|
170
|
-
match = pattern.search(text)
|
171
|
-
if match:
|
172
|
-
got_url = match.group(1)
|
173
|
-
if r'http' not in got_url:
|
174
|
-
if got_url[:2] == '//':
|
175
|
-
self.pdf_url = 'https:' + got_url
|
176
|
-
else:
|
177
|
-
self.pdf_url = self.base_url + got_url
|
178
|
-
else:
|
179
|
-
self.pdf_url = got_url
|
180
|
-
print(f"URL: {self.pdf_url}")
|
181
|
-
else:
|
182
|
-
print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
|
183
|
-
self.try_times = self.try_times_each_url_max+1
|
184
|
-
else:
|
185
|
-
print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
|
186
|
-
print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
|
187
|
-
self.try_times = self.try_times_each_url_max+1
|
188
|
-
|
189
|
-
def url_iterate(self):
|
190
|
-
if self.url_index >= len(self.url_list):
|
191
|
-
return
|
192
|
-
url = self.url_list[self.url_index]
|
193
|
-
self.base_url = url
|
194
|
-
self.url = url + '/' + self.doi
|
195
|
-
self.get_pdf_url()
|
196
|
-
# for url in self.url_list:
|
197
|
-
# self.url = url + self.doi
|
198
|
-
# self.get_pdf_url()
|
199
|
-
# if self.pdf_url:
|
200
|
-
# break
|
201
|
-
|
202
|
-
def write_wrong_record(self):
|
203
|
-
with open(self.wrong_record_file, 'a') as f:
|
204
|
-
f.write(self.doi + '\n')
|
205
|
-
|
206
|
-
def download_pdf(self):
|
207
|
-
if self.fpath.exists():
|
208
|
-
fsize = _get_file_size(self.fpath, unit='KB')
|
209
|
-
if fsize < self.check_size:
|
210
|
-
# delete the wrong file
|
211
|
-
os.remove(self.fpath)
|
212
|
-
print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
|
213
|
-
else:
|
214
|
-
print('[bold #E6E6FA]-'*100)
|
215
|
-
print(f"[bold purple]The PDF file {self.fpath} already exists.")
|
216
|
-
return
|
217
|
-
self.url_index = 0
|
218
|
-
already_downloaded = False
|
219
|
-
self.try_times = 0
|
220
|
-
while not already_downloaded:
|
221
|
-
self.url_iterate()
|
222
|
-
if not self.pdf_url:
|
223
|
-
self.url_index += 1
|
224
|
-
if self.url_index >= len(self.url_list):
|
225
|
-
print("Failed to download the PDF file.")
|
226
|
-
self.write_wrong_record()
|
227
|
-
return
|
228
|
-
else:
|
229
|
-
self.try_times = 0
|
230
|
-
continue
|
231
|
-
else:
|
232
|
-
self.try_times += 1
|
233
|
-
if self.try_times > self.try_times_each_url_max:
|
234
|
-
self.url_index += 1
|
235
|
-
if self.url_index >= len(self.url_list):
|
236
|
-
# print("Failed to download the PDF file.")
|
237
|
-
self.write_wrong_record()
|
238
|
-
return
|
239
|
-
print(f"Downloading: {self.fname}...")
|
240
|
-
try:
|
241
|
-
response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
|
242
|
-
if response.status_code == 200:
|
243
|
-
with open(self.fpath, 'wb') as f:
|
244
|
-
f.write(response.content)
|
245
|
-
fsize = _get_file_size(self.fpath, unit='KB')
|
246
|
-
if fsize < self.check_size:
|
247
|
-
# delete the wrong file
|
248
|
-
os.remove(self.fpath)
|
249
|
-
print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
|
250
|
-
else:
|
251
|
-
print(f"[bold green]Sucessful to download {self.fpath}")
|
252
|
-
already_downloaded = True
|
253
|
-
else:
|
254
|
-
self.try_times = self.try_times_each_url_max+1
|
255
|
-
print(f"Failed to download the PDF file. Status code: {response.status_code}")
|
256
|
-
print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
|
257
|
-
except Exception as e:
|
258
|
-
print(f"Failed to download the PDF file. Error: {e}")
|
259
|
-
time.sleep(self.sleep)
|
260
|
-
if self.try_times >= self.try_times_each_url_max:
|
261
|
-
self.url_index += 1
|
262
|
-
if self.url_index >= len(self.url_list):
|
263
|
-
print("\n[bold #CD5C5C]Failed to download the PDF file.")
|
264
|
-
self.write_wrong_record()
|
265
|
-
return
|
266
|
-
if self.try_times == self.try_times_each_url_max:
|
267
|
-
print(f'Tried {self.try_times} times for {self.url_list[self.url_index-1]}.')
|
268
|
-
print("Try another URL...")
|
269
|
-
|
270
|
-
|
271
|
-
def read_excel(file, col_name=r'DOI'):
    """
    Read a list of DOIs from one column of an Excel file.

    Args:
        file: str, path of the Excel file to read.
        col_name: str, name of the column holding the DOIs. Default is 'DOI'.

    Returns:
        list: the column's values with missing entries (NaN/None/NaT) removed.
    """
    df = pd.read_excel(file)
    # pd.notna() is the robust missing-value test; the previous
    # str(doi) != 'nan' comparison missed None/NaT and depended on the
    # string representation of the cell value.
    return [doi for doi in df[col_name].tolist() if pd.notna(doi)]
|
277
|
-
|
278
|
-
|
279
|
-
def read_txt(file):
    """
    Read non-empty lines (e.g. DOIs, one per line) from a plain-text file.

    Args:
        file: str, path of the text file to read.

    Returns:
        list: each stripped, non-blank line of the file, in order.
    """
    # Pin the encoding: the platform default is locale-dependent and can
    # garble non-ASCII content (notably on Windows).
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Strip trailing newlines/whitespace and skip blank lines.
    return [line.strip() for line in lines if line.strip()]
|
285
|
-
|
286
|
-
|
287
|
-
def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r'DOI'):
    '''
    Description: Download PDF files by DOI.

    Args:
        store_path: str, The path to store the PDF files. Default is the current working directory.
        doi_list: list or str, The list of DOIs (a single DOI may be passed as a plain str).
        txt_file: str, The path of the txt file that contains the DOIs (one per line).
        excel_file: str, The path of the excel file that contains the DOIs.
        col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.

    Raises:
        ValueError: if none of doi_list / txt_file / excel_file supplies any DOI.

    Returns:
        None

    Example:
        download5doi(doi_list='10.3389/feart.2021.698876')
        download5doi(store_path=r'I:\Delete\ref_pdf', doi_list='10.3389/feart.2021.698876')
        download5doi(store_path=r'I:\Delete\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
        download5doi(store_path=r'I:\Delete\ref_pdf', txt_file=r'I:\Delete\ref_pdf\wrong_record.txt')
        download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx')
        download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx', col_name='DOI')
    '''
    # Resolve the output directory, creating it if necessary.
    if not store_path:
        store_path = Path.cwd()
    else:
        store_path = Path(str(store_path))
    store_path.mkdir(parents=True, exist_ok=True)
    store_path = str(store_path)

    # Accept a single DOI passed as a plain string.
    if isinstance(doi_list, str) and doi_list:
        doi_list = [doi_list]
    # A txt/excel file overrides doi_list (same precedence as before:
    # excel_file wins over txt_file wins over doi_list).
    if txt_file:
        doi_list = read_txt(txt_file)
    if excel_file:
        doi_list = read_excel(excel_file, col_name)
    # Fail fast with a clear message instead of the former
    # "TypeError: object of type 'NoneType' has no len()".
    if not doi_list:
        raise ValueError("No DOI provided: pass doi_list, txt_file or excel_file.")

    print(f"Downloading {len(doi_list)} PDF files...")
    for doi in track(doi_list, description='Downloading...'):
        download = _Downloader(doi, store_path)
        download.download_pdf()
|
327
|
-
|
328
|
-
|
329
|
-
if __name__ == '__main__':
    # Ad-hoc manual run: download every DOI listed in an exported records
    # spreadsheet into a local folder.  Paths are machine-specific
    # (developer's Windows drive) — adjust before running elsewhere.
    store_path = r'I:\Delete\ref_pdf'
    # Alternative: download a single DOI directly.
    # download5doi(store_path, doi_list='10.1007/s00382-022-06260-x')
    download5doi(store_path, excel_file=r'I:\Delete\ref_pdf\savedrecs.xls')
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|