oafuncs 0.0.60__py2.py3-none-any.whl → 0.0.62__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oafuncs/oa_cmap.py +8 -14
- oafuncs/oa_data.py +23 -41
- oafuncs/oa_down/__init__.py +2 -1
- oafuncs/oa_down/hycom_3hourly.py +47 -89
- oafuncs/oa_down/literature.py +332 -0
- oafuncs/oa_down/refs_pdf.py +16 -25
- oafuncs/oa_draw.py +23 -47
- oafuncs/oa_file.py +34 -35
- oafuncs/oa_nc.py +18 -34
- oafuncs/oa_sign/meteorological.py +7 -14
- oafuncs/oa_sign/ocean.py +7 -12
- {oafuncs-0.0.60.dist-info → oafuncs-0.0.62.dist-info}/METADATA +2 -2
- oafuncs-0.0.62.dist-info/RECORD +23 -0
- oafuncs-0.0.60.dist-info/RECORD +0 -22
- {oafuncs-0.0.60.dist-info → oafuncs-0.0.62.dist-info}/LICENSE.txt +0 -0
- {oafuncs-0.0.60.dist-info → oafuncs-0.0.62.dist-info}/WHEEL +0 -0
- {oafuncs-0.0.60.dist-info → oafuncs-0.0.62.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,332 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding=utf-8
|
3
|
+
'''
|
4
|
+
Author: Liu Kun && 16031215@qq.com
|
5
|
+
Date: 2024-11-28 10:42:56
|
6
|
+
LastEditors: Liu Kun && 16031215@qq.com
|
7
|
+
LastEditTime: 2024-11-28 10:43:18
|
8
|
+
FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\literature.py
|
9
|
+
Description:
|
10
|
+
EditPlatform: vscode
|
11
|
+
ComputerInfo: XPS 15 9510
|
12
|
+
SystemInfo: Windows 11
|
13
|
+
Python Version: 3.12
|
14
|
+
'''
|
15
|
+
|
16
|
+
|
17
|
+
import os
|
18
|
+
import random
|
19
|
+
import re
|
20
|
+
import time
|
21
|
+
from pathlib import Path
|
22
|
+
|
23
|
+
import pandas as pd
|
24
|
+
import requests
|
25
|
+
from rich import print
|
26
|
+
from rich.progress import track
|
27
|
+
|
28
|
+
__all__ = ['download5doi']
|
29
|
+
|
30
|
+
|
31
|
+
def _get_file_size(file_path, unit='KB'):
    '''
    Description: Return the size of a file expressed in the requested unit.

    Args:
        file_path: str or Path, the file to measure.
        unit: str, one of 'PB', 'TB', 'GB', 'MB', 'KB' (matched
              case-insensitively, so 'kb' works too). Default is 'KB'.

    Returns:
        float, the file size in `unit` (bytes divided by a power of 1024).
        For backward compatibility the two error cases return a message
        string instead of raising: one when the file does not exist and one
        when the unit is not recognised. Callers that compare the result
        numerically should make sure the file exists first.
    '''
    # Missing file: keep the historical "return a message" contract.
    if not os.path.exists(file_path):
        return "文件不存在"

    # Bytes per unit (binary multiples).
    unit_dict = {
        'PB': 1024**5,
        'TB': 1024**4,
        'GB': 1024**3,
        'MB': 1024**2,
        'KB': 1024,
    }

    # Normalise the case so 'kb' / 'Mb' are accepted; non-string units fall
    # through unchanged and are rejected by the membership test below.
    unit_key = unit.upper() if isinstance(unit, str) else unit
    if unit_key not in unit_dict:
        return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"

    return os.path.getsize(file_path) / unit_dict[unit_key]
|
56
|
+
|
57
|
+
|
58
|
+
class _Downloader:
    '''
    Download the PDF of one paper, identified by its DOI, from a list of
    Sci-Hub mirrors.

    Each mirror is asked for the landing page of the DOI; the direct PDF link
    is extracted from that page and downloaded. A download whose file is
    smaller than `check_size` KB is treated as a failure (deleted and
    retried). DOIs that cannot be fetched from any mirror are appended to
    `wrong_record.txt` inside the store directory.
    '''

    # Pool of User-Agent strings; one is drawn at random per instance.
    _UA_LIST = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "MAC:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
        "Windows:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        # The comma after the next entry was missing, which silently fused it
        # with the following string into one invalid User-Agent.
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/UCWEB7.0.2.37/28/999",
        "Openwave/UCWEB7.0.2.37/28/999",
        "Openwave/UCWEB7.0.2.37/28/999",
    )

    # Matches the "save" button of the landing page and captures the PDF link.
    _PDF_LINK_RE = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')

    def __init__(self, doi, store_path):
        '''
        Args:
            doi: str, the DOI of the paper to download.
            store_path: str or Path, directory in which the PDF (and the
                        wrong_record.txt failure log) is written.
        '''
        self.url_list = [r'https://sci-hub.se',
                         r'https://sci-hub.ren',
                         r'https://sci-hub.st',
                         r'https://sci-hub.ru',
                         ]
        self.base_url = None   # mirror currently in use
        self.url = None        # landing page of the DOI on that mirror
        self.doi = doi
        self.pdf_url = None    # direct PDF link found on the landing page
        self.pdf_path = None
        self.headers = {'User-Agent': self.get_ua().encode('utf-8')}
        # A DOI such as 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2
        # contains characters that are illegal in file names; replace them
        # (path separators included) with underscores.
        self.fname = re.sub(r'[\\/<>:"?*|]', '_', doi) + '.pdf'
        self.store_path = Path(store_path)
        self.fpath = self.store_path / self.fname
        self.wrong_record_file = self.store_path / 'wrong_record.txt'
        self.sleep = 5         # seconds to pause after every download attempt
        self.timeout = 30      # seconds before an HTTP request is abandoned
        self.cookies = None
        self.check_size = 50   # minimum plausible PDF size, in KB
        self.url_index = 0
        self.try_times_each_url_max = 3
        self.try_times = 0

    def get_ua(self):
        '''Return a User-Agent string picked at random from the built-in pool.'''
        return random.choice(self._UA_LIST)

    @staticmethod
    def _find_pdf_link(text):
        '''Return the PDF link embedded in a landing page, or None if absent.'''
        # The page may escape slashes with backslashes; drop them first.
        match = _Downloader._PDF_LINK_RE.search(text.replace('\\', ''))
        return match.group(1) if match else None

    @staticmethod
    def _resolve_pdf_url(base_url, got_url):
        '''Turn the link found on the page into an absolute URL.'''
        if got_url.startswith(('http://', 'https://')):
            return got_url
        if got_url.startswith('//'):
            # Protocol-relative link.
            return 'https:' + got_url
        return base_url + got_url

    def _report_mirror_without_pdf(self):
        '''Tell the user that the current mirror cannot provide the PDF.'''
        print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')

    def get_pdf_url(self):
        '''
        Request `self.url` and store the direct PDF link in `self.pdf_url`.

        `self.pdf_url` is reset to None first, so after the call it is None
        whenever the current mirror could not provide a link (network error,
        non-200 status, or no link on the page). Cookies of a successful
        response are kept in `self.cookies` for the download request.
        '''
        # Never carry a link over from a previously tried mirror.
        self.pdf_url = None
        print('[bold #E6E6FA]-'*100)
        print(f"DOI: {self.doi}")
        print(f"Requesting: {self.url}...")
        try:
            response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        except requests.RequestException as e:
            # An unreachable mirror must not abort the whole batch.
            print(f"Failed to retrieve the webpage. Error: {e}")
            self._report_mirror_without_pdf()
            return
        if response.status_code != 200:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            self._report_mirror_without_pdf()
            return
        self.cookies = response.cookies
        got_url = self._find_pdf_link(response.text)
        if got_url is None:
            self._report_mirror_without_pdf()
            return
        self.pdf_url = self._resolve_pdf_url(self.base_url, got_url)
        print(f"URL: {self.pdf_url}")

    def url_iterate(self):
        '''Point the downloader at mirror `self.url_index` and look up the PDF link.'''
        if self.url_index >= len(self.url_list):
            return
        url = self.url_list[self.url_index]
        self.base_url = url
        self.url = url + '/' + self.doi
        self.get_pdf_url()

    def write_wrong_record(self):
        '''Append the DOI to the failure log in the store directory.'''
        with open(self.wrong_record_file, 'a', encoding='utf-8') as f:
            f.write(self.doi + '\n')

    def _download_from_mirror(self):
        '''
        Try to download `self.pdf_url`, at most `try_times_each_url_max` times.

        Returns:
            bool, True once a file of at least `check_size` KB has been saved,
            False when the mirror answered with a non-200 status or every
            attempt failed.
        '''
        while self.try_times < self.try_times_each_url_max:
            self.try_times += 1
            succeeded = False
            give_up = False
            print(f"Downloading: {self.fname}...")
            try:
                response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=self.timeout)
                if response.status_code == 200:
                    with open(self.fpath, 'wb') as f:
                        f.write(response.content)
                    fsize = _get_file_size(self.fpath, unit='KB')
                    if fsize < self.check_size:
                        # Too small to be a paper (usually an error page).
                        os.remove(self.fpath)
                        print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
                    else:
                        print(f"[bold green]Sucessful to download {self.fpath}")
                        succeeded = True
                else:
                    print(f"Failed to download the PDF file. Status code: {response.status_code}")
                    self._report_mirror_without_pdf()
                    # Retrying the same link will not change the status.
                    give_up = True
            except Exception as e:
                # Deliberately broad: this is a best-effort batch job, so any
                # network or file error is reported and the attempt retried.
                print(f"Failed to download the PDF file. Error: {e}")
            # Pause after every attempt so consecutive requests are spaced out.
            time.sleep(self.sleep)
            if succeeded:
                return True
            if give_up:
                return False
        return False

    def download_pdf(self):
        '''
        Download the PDF for `self.doi` into the store directory.

        An existing file of plausible size is kept and nothing is downloaded;
        an existing file that is too small is deleted and fetched again. The
        mirrors are tried in order, each one getting a fresh retry budget.
        If no mirror delivers the file, the DOI is written to the failure log.

        Returns:
            None
        '''
        if self.fpath.exists():
            fsize = _get_file_size(self.fpath, unit='KB')
            if fsize < self.check_size:
                # delete the wrong file
                os.remove(self.fpath)
                print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
            else:
                print('[bold #E6E6FA]-'*100)
                print(f"[bold purple]The PDF file {self.fpath} already exists.")
                return

        last_index = len(self.url_list) - 1
        for index, mirror in enumerate(self.url_list):
            self.url_index = index
            self.try_times = 0
            # Fetch the landing page once per mirror, not once per retry.
            self.url_iterate()
            if not self.pdf_url:
                continue
            if self._download_from_mirror():
                return
            print(f'Tried {self.try_times} times for {mirror}.')
            if index < last_index:
                print("Try another URL...")

        print("\n[bold #CD5C5C]Failed to download the PDF file.")
        self.write_wrong_record()
|
269
|
+
|
270
|
+
|
271
|
+
def read_excel(file, col_name=r'DOI'):
    '''
    Description: Read the DOIs stored in one column of an Excel file.

    Args:
        file: str or Path, the Excel file to read.
        col_name: str, name of the column holding the DOIs. Default is 'DOI'.

    Returns:
        list of str, the DOIs in file order. Empty cells are dropped, and
        each value is stripped of surrounding whitespace so that padded
        cells do not produce broken URLs or file names; cells that are blank
        after stripping are dropped as well.

    Raises:
        KeyError: if the file has no column named `col_name`.
    '''
    column = pd.read_excel(file)[col_name].dropna()
    cleaned = (str(value).strip() for value in column)
    # 'nan' is still filtered to keep the historical rule for such cells.
    return [doi for doi in cleaned if doi and doi != 'nan']
|
277
|
+
|
278
|
+
|
279
|
+
def read_txt(file):
    '''
    Description: Read DOIs from a text file, one DOI per line.

    Args:
        file: str or Path, the text file to read.

    Returns:
        list of str, the non-blank lines with surrounding whitespace and
        line endings removed.
    '''
    # 'utf-8-sig' drops a leading byte-order mark (as written by some
    # editors) that would otherwise be glued to the first DOI; undecodable
    # bytes are replaced rather than aborting the whole read.
    with open(file, 'r', encoding='utf-8-sig', errors='replace') as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]
|
285
|
+
|
286
|
+
|
287
|
+
def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r'DOI'):
    r'''
    Description: Download PDF files by DOI.

    Args:
        store_path: str, The path to store the PDF files. It is created if
                    missing; the current working directory is used when omitted.
        doi_list: list or str, The list of DOIs (a single DOI may be given as a str).
        txt_file: str, The path of the txt file that contains the DOIs.
                  Takes precedence over doi_list.
        excel_file: str, The path of the excel file that contains the DOIs.
                    Takes precedence over doi_list and txt_file.
        col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.

    Returns:
        None

    Example:
        download5doi(doi_list='10.3389/feart.2021.698876')
        download5doi(store_path=r'I:\Delete\ref_pdf', doi_list='10.3389/feart.2021.698876')
        download5doi(store_path=r'I:\Delete\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
        download5doi(store_path=r'I:\Delete\ref_pdf', txt_file=r'I:\Delete\ref_pdf\wrong_record.txt')
        download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx')
        download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx', col_name='DOI')
    '''
    if not store_path:
        store_path = Path.cwd()
    else:
        store_path = Path(str(store_path))
        store_path.mkdir(parents=True, exist_ok=True)
    store_path = str(store_path)

    # A single DOI given as a string becomes a one-element list.
    if isinstance(doi_list, str):
        doi_list = [doi_list] if doi_list else []
    # File sources override doi_list; the excel file wins over the txt file.
    if txt_file:
        doi_list = read_txt(txt_file)
    if excel_file:
        doi_list = read_excel(excel_file, col_name)

    # With no source at all doi_list is still None; treat that like an empty
    # list instead of failing on len(None).
    doi_list = [] if doi_list is None else list(doi_list)
    if not doi_list:
        print('[bold yellow]No DOI was provided; nothing to download.')
        return

    print(f"Downloading {len(doi_list)} PDF files...")
    for doi in track(doi_list, description='Downloading...'):
        _Downloader(doi, store_path).download_pdf()
|
327
|
+
|
328
|
+
|
329
|
+
if __name__ == '__main__':
    # Manual run against the author's local folder: fetch every DOI listed in
    # the exported spreadsheet. For a single paper, call for example
    #     download5doi(store_path, doi_list='10.1007/s00382-022-06260-x')
    store_path = r'I:\Delete\ref_pdf'
    download5doi(store_path=store_path, excel_file=r'I:\Delete\ref_pdf\savedrecs.xls')
|
oafuncs/oa_down/refs_pdf.py
CHANGED
@@ -4,8 +4,8 @@
|
|
4
4
|
Author: Liu Kun && 16031215@qq.com
|
5
5
|
Date: 2024-11-09 13:58:28
|
6
6
|
LastEditors: Liu Kun && 16031215@qq.com
|
7
|
-
LastEditTime: 2024-11-
|
8
|
-
FilePath: \\Python\\My_Funcs\\OAFuncs\\
|
7
|
+
LastEditTime: 2024-11-21 13:18:18
|
8
|
+
FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\refs_pdf.py
|
9
9
|
Description:
|
10
10
|
EditPlatform: vscode
|
11
11
|
ComputerInfo: XPS 15 9510
|
@@ -41,7 +41,7 @@ def _get_file_size(file_path, unit='KB'):
|
|
41
41
|
'TB': 1024**4,
|
42
42
|
'GB': 1024**3,
|
43
43
|
'MB': 1024**2,
|
44
|
-
'KB': 1024
|
44
|
+
'KB': 1024,
|
45
45
|
}
|
46
46
|
|
47
47
|
# 检查传入的单位是否合法
|
@@ -178,14 +178,11 @@ class _Downloader:
|
|
178
178
|
self.pdf_url = got_url
|
179
179
|
print(f"URL: {self.pdf_url}")
|
180
180
|
else:
|
181
|
-
print(f'[bold #AFEEEE]The website {
|
182
|
-
self.url_list[self.url_index]} do not inlcude the PDF file.')
|
181
|
+
print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
|
183
182
|
self.try_times = self.try_times_each_url_max+1
|
184
183
|
else:
|
185
|
-
print(f"Failed to retrieve the webpage. Status code: {
|
186
|
-
|
187
|
-
print(f'[bold #AFEEEE]The website {
|
188
|
-
self.url_list[self.url_index]} do not inlcude the PDF file.')
|
184
|
+
print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
|
185
|
+
print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
|
189
186
|
self.try_times = self.try_times_each_url_max+1
|
190
187
|
|
191
188
|
def url_iterate(self):
|
@@ -211,12 +208,10 @@ class _Downloader:
|
|
211
208
|
if fsize < self.check_size:
|
212
209
|
# delete the wrong file
|
213
210
|
os.remove(self.fpath)
|
214
|
-
print(f"[bold yellow]The PDF file {
|
215
|
-
self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
|
211
|
+
print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
|
216
212
|
else:
|
217
213
|
print('[bold #E6E6FA]-'*100)
|
218
|
-
print(f"[bold purple]The PDF file {
|
219
|
-
self.fpath} already exists.")
|
214
|
+
print(f"[bold purple]The PDF file {self.fpath} already exists.")
|
220
215
|
return
|
221
216
|
self.url_index = 0
|
222
217
|
already_downloaded = False
|
@@ -242,8 +237,7 @@ class _Downloader:
|
|
242
237
|
return
|
243
238
|
print(f"Downloading: {self.fname}...")
|
244
239
|
try:
|
245
|
-
response = requests.get(
|
246
|
-
self.pdf_url, headers=self.headers, cookies=self.cookies)
|
240
|
+
response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
|
247
241
|
if response.status_code == 200:
|
248
242
|
with open(self.fpath, 'wb') as f:
|
249
243
|
f.write(response.content)
|
@@ -251,18 +245,14 @@ class _Downloader:
|
|
251
245
|
if fsize < self.check_size:
|
252
246
|
# delete the wrong file
|
253
247
|
os.remove(self.fpath)
|
254
|
-
print(f"[bold yellow]The PDF file {
|
255
|
-
self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
|
248
|
+
print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
|
256
249
|
else:
|
257
|
-
print(f"[bold green]Sucessful to download {
|
258
|
-
self.fpath}")
|
250
|
+
print(f"[bold green]Sucessful to download {self.fpath}")
|
259
251
|
already_downloaded = True
|
260
252
|
else:
|
261
253
|
self.try_times = self.try_times_each_url_max+1
|
262
|
-
print(f"Failed to download the PDF file. Status code: {
|
263
|
-
|
264
|
-
print(f'[bold #AFEEEE]The website {
|
265
|
-
self.url_list[self.url_index]} do not inlcude the PDF file.')
|
254
|
+
print(f"Failed to download the PDF file. Status code: {response.status_code}")
|
255
|
+
print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
|
266
256
|
except Exception as e:
|
267
257
|
print(f"Failed to download the PDF file. Error: {e}")
|
268
258
|
time.sleep(self.sleep)
|
@@ -273,8 +263,7 @@ class _Downloader:
|
|
273
263
|
self.write_wrong_record()
|
274
264
|
return
|
275
265
|
if self.try_times == self.try_times_each_url_max:
|
276
|
-
print(f'Tried {self.try_times} times for {
|
277
|
-
self.url_list[self.url_index-1]}.')
|
266
|
+
print(f'Tried {self.try_times} times for {self.url_list[self.url_index-1]}.')
|
278
267
|
print("Try another URL...")
|
279
268
|
|
280
269
|
|
@@ -316,6 +305,8 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None,
|
|
316
305
|
download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx')
|
317
306
|
download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx', col_name='DOI')
|
318
307
|
'''
|
308
|
+
print('[bold #EE33fA]Note:\n 函数路径将改为oafuncs.oa_down.literature.download5doi,此路径将被弃用。')
|
309
|
+
|
319
310
|
if not store_path:
|
320
311
|
store_path = Path.cwd()
|
321
312
|
else:
|
oafuncs/oa_draw.py
CHANGED
@@ -4,8 +4,8 @@
|
|
4
4
|
Author: Liu Kun && 16031215@qq.com
|
5
5
|
Date: 2024-09-17 17:26:11
|
6
6
|
LastEditors: Liu Kun && 16031215@qq.com
|
7
|
-
LastEditTime: 2024-
|
8
|
-
FilePath: \\Python\\My_Funcs\\OAFuncs\\
|
7
|
+
LastEditTime: 2024-11-21 13:10:47
|
8
|
+
FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_draw.py
|
9
9
|
Description:
|
10
10
|
EditPlatform: vscode
|
11
11
|
ComputerInfo: XPS 15 9510
|
@@ -26,13 +26,13 @@ import xarray as xr
|
|
26
26
|
from cartopy.mpl.ticker import LatitudeFormatter, LongitudeFormatter
|
27
27
|
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
28
28
|
|
29
|
-
__all__ = ['create_gif', 'xy2lonlat', 'plot_contourf',
|
30
|
-
'plot_contourf_lonlat', 'plot_quiver', 'plot_contourf_cartopy']
|
29
|
+
__all__ = ['create_gif', 'xy2lonlat', 'plot_contourf', 'plot_contourf_lonlat', 'plot_quiver', 'plot_contourf_cartopy']
|
31
30
|
|
32
31
|
warnings.filterwarnings('ignore')
|
33
32
|
|
34
|
-
|
35
33
|
# ** 将生成图片/已有图片制作成动图
|
34
|
+
|
35
|
+
|
36
36
|
def create_gif(image_list: list, gif_name: str, duration=0.2): # 制作动图,默认间隔0.2
|
37
37
|
'''
|
38
38
|
func : 制作动图,将已有图片拼接
|
@@ -69,8 +69,7 @@ def xy2lonlat(xy, lonlat='lon', decimal=2):
|
|
69
69
|
# degrees = int(abs(x))
|
70
70
|
degrees = round(abs(x), decimal)
|
71
71
|
direction = "E" if x >= 0 else "W"
|
72
|
-
out_list.append(
|
73
|
-
f"{degrees:.{decimal}f}°{direction}" if x != 0 and x != 180 else f"{degrees}°")
|
72
|
+
out_list.append(f"{degrees:.{decimal}f}°{direction}" if x != 0 and x != 180 else f"{degrees}°")
|
74
73
|
return out_list if len(out_list) > 1 else out_list[0]
|
75
74
|
|
76
75
|
def format_latitude(y_list):
|
@@ -81,8 +80,7 @@ def xy2lonlat(xy, lonlat='lon', decimal=2):
|
|
81
80
|
# degrees = int(abs(y))
|
82
81
|
degrees = round(abs(y), decimal)
|
83
82
|
direction = "N" if y >= 0 else "S"
|
84
|
-
out_list.append(
|
85
|
-
f"{degrees:.{decimal}f}°{direction}" if y != 0 else f"{degrees}°")
|
83
|
+
out_list.append(f"{degrees:.{decimal}f}°{direction}" if y != 0 else f"{degrees}°")
|
86
84
|
return out_list if len(out_list) > 1 else out_list[0]
|
87
85
|
|
88
86
|
if lonlat == 'lon':
|
@@ -107,12 +105,6 @@ class _MyFormatter(mpl.ticker.ScalarFormatter):
|
|
107
105
|
|
108
106
|
def __call__(self, x, pos):
|
109
107
|
if ((abs(x) < 1e-2) or (abs(x) > 1e4)) and x != 0:
|
110
|
-
# if self.magnitude_max - self.magnitude_min == 1 and (int(math.modf(math.log10(abs(x)))[1]) == self.magnitude_min):
|
111
|
-
# a, b = '{:.1e}'.format(x).split('e')
|
112
|
-
# b = int(b)
|
113
|
-
# return '${}{:.2f} \\times 10^{{{}}}$'.format(' ' if self.p_n and x > 0 else '', float(a)/10, b+1)
|
114
|
-
# else:
|
115
|
-
# return '${}{} \\times 10^{{{}}}$'.format(' ' if self.p_n and x > 0 else '', *'{:.2e}'.format(x).split('e'))
|
116
108
|
if self.magnitude_max - self.magnitude_min == 1 and (int(math.modf(math.log10(abs(x)))[1]) == self.magnitude_min):
|
117
109
|
a, b = '{:.1e}'.format(x).split('e')
|
118
110
|
a = float(a) / 10
|
@@ -152,20 +144,16 @@ def plot_contourf(pic_data, picname=None, c_map='rainbow', minmax=None, labels=N
|
|
152
144
|
flag = (value_min < 0) and (value_max > 0)
|
153
145
|
norm = mpl.colors.TwoSlopeNorm(
|
154
146
|
vmin=-1 * v_bry, vcenter=0, vmax=v_bry) if flag else mpl.colors.Normalize(vmin=value_min, vmax=value_max)
|
155
|
-
cticks = [num for num in np.linspace(-1 * v_bry if flag else value_min,
|
156
|
-
|
157
|
-
levels = np.linspace(-1 * v_bry, v_bry,
|
158
|
-
20) if flag else None if value_min == value_max else np.linspace(value_min, value_max, 20)
|
147
|
+
cticks = [num for num in np.linspace(-1 * v_bry if flag else value_min, v_bry if flag else value_max, 9)] if value_min != value_max else None
|
148
|
+
levels = np.linspace(-1 * v_bry, v_bry, 20) if flag else None if value_min == value_max else np.linspace(value_min, value_max, 20)
|
159
149
|
|
160
150
|
shape = np.array(pic_data).shape
|
161
151
|
x, y = np.meshgrid(np.arange(shape[1]), np.arange(shape[0]))
|
162
152
|
|
163
153
|
fig, ax = plt.subplots(figsize=figsize)
|
164
154
|
flag_lc = levels is not None and cticks is not None
|
165
|
-
CS = ax.contourf(x, y, pic_data, cmap=cmap, norm=norm, levels=levels, extend='both') if flag_lc else ax.contourf(
|
166
|
-
|
167
|
-
cb = fig.colorbar(CS, ax=ax, orientation='vertical', shrink=1, format='%.3g', spacing='uniform', ticks=cticks) if cticks is not None else fig.colorbar(
|
168
|
-
CS, ax=ax, orientation='vertical', shrink=1, format='%.3g', spacing='uniform')
|
155
|
+
CS = ax.contourf(x, y, pic_data, cmap=cmap, norm=norm, levels=levels, extend='both') if flag_lc else ax.contourf(x, y, pic_data, cmap=cmap, norm=norm, extend='both')
|
156
|
+
cb = fig.colorbar(CS, ax=ax, orientation='vertical', shrink=1, format='%.3g', spacing='uniform', ticks=cticks) if cticks is not None else fig.colorbar(CS, ax=ax, orientation='vertical', shrink=1, format='%.3g', spacing='uniform')
|
169
157
|
"""%.3g采用的是自动调整格式,也可设置为%.3f,则改为3位小数"""
|
170
158
|
|
171
159
|
# 将格式化器设置为自定义的函数
|
@@ -232,10 +220,8 @@ def plot_contourf_lonlat(data, lon, lat, interval=5, picname=None, c_map='rainbo
|
|
232
220
|
plt.contourf(data, cmap=c_map)
|
233
221
|
x_space = int(len(lon) * interval / (lon[-1] - lon[0]))
|
234
222
|
y_space = int(len(lat) * interval / (lat[-1] - lat[0]))
|
235
|
-
plt.xticks(np.arange(0, len(lon), x_space), [
|
236
|
-
|
237
|
-
plt.yticks(np.arange(0, len(lat), y_space), [
|
238
|
-
format_latitude(lat[i]) for i in range(0, len(lat), y_space)])
|
223
|
+
plt.xticks(np.arange(0, len(lon), x_space), [format_longitude(lon[i]) for i in range(0, len(lon), x_space)])
|
224
|
+
plt.yticks(np.arange(0, len(lat), y_space), [format_latitude(lat[i]) for i in range(0, len(lat), y_space)])
|
239
225
|
plt.colorbar()
|
240
226
|
plt.savefig(
|
241
227
|
picname, bbox_inches='tight') if picname is not None else plt.show()
|
@@ -288,14 +274,12 @@ def plot_quiver(u, v, lon, lat, picname=None, cmap='coolwarm', scale=0.25, width
|
|
288
274
|
cmap=cmap, # 矢量的颜色,多色
|
289
275
|
width=width)
|
290
276
|
# plt.quiverkey(quiver_plot, X=0.90, Y=0.975, U=1, label='1 m/s', labelpos='E', fontproperties={'size': 10})
|
291
|
-
plt.quiverkey(quiver_plot, X=0.87, Y=0.975, U=mean_S,
|
292
|
-
label=f'{mean_S:.2f} m/s', labelpos='E', fontproperties={'size': 10})
|
277
|
+
plt.quiverkey(quiver_plot, X=0.87, Y=0.975, U=mean_S, label=f'{mean_S:.2f} m/s', labelpos='E', fontproperties={'size': 10})
|
293
278
|
plt.colorbar(quiver_plot)
|
294
279
|
plt.xlabel('X')
|
295
280
|
plt.ylabel('Y')
|
296
281
|
|
297
|
-
plt.savefig(
|
298
|
-
picname, bbox_inches='tight') if picname is not None else plt.show()
|
282
|
+
plt.savefig(picname, bbox_inches='tight') if picname is not None else plt.show()
|
299
283
|
plt.clf()
|
300
284
|
plt.close()
|
301
285
|
|
@@ -353,8 +337,7 @@ def plot_contourf_cartopy(data, lon, lat, picname=None, cmap='rainbow', cn_fill_
|
|
353
337
|
cticks = cbar_ticks
|
354
338
|
norm = mpl.colors.BoundaryNorm(cticks, cmap.N)
|
355
339
|
|
356
|
-
cnplot = ax.contourf(X, Y, data, levels=levels, cmap=cmap,
|
357
|
-
norm=norm, transform=proj, extend='both', alpha=1, zorder=0)
|
340
|
+
cnplot = ax.contourf(X, Y, data, levels=levels, cmap=cmap, norm=norm, transform=proj, extend='both', alpha=1, zorder=0)
|
358
341
|
# cllevels = np.linspace(data_min, data_max, 9)
|
359
342
|
# clplot = ax.contour(X, Y, data, levels=levels[9::10], colors='k', linewidths=0.5, transform=proj, zorder=1, alpha=0.8, linestyle='--')
|
360
343
|
# 添加色标,并选择位置
|
@@ -376,15 +359,13 @@ def plot_contourf_cartopy(data, lon, lat, picname=None, cmap='rainbow', cn_fill_
|
|
376
359
|
cax = divider.new_horizontal(size="5%", pad=0.1, axes_class=plt.Axes)
|
377
360
|
fig.add_axes(cax)
|
378
361
|
# cbar = plt.colorbar(cnplot, cax=cax, orientation='vertical', extend='both', format='%.0f')
|
379
|
-
cbar = fig.colorbar(mpl.cm.ScalarMappable(cmap=cmap, norm=norm),
|
380
|
-
cax=cax, orientation='vertical', extend='both', format='%.3f')
|
362
|
+
cbar = fig.colorbar(mpl.cm.ScalarMappable(cmap=cmap, norm=norm), cax=cax, orientation='vertical', extend='both', format='%.3f')
|
381
363
|
cax.yaxis.set_ticks_position('right')
|
382
364
|
cax.yaxis.set_label_position('right')
|
383
365
|
else: # 上方
|
384
366
|
cax = divider.new_vertical(size="5%", pad=0.2, axes_class=plt.Axes)
|
385
367
|
fig.add_axes(cax)
|
386
|
-
cbar = plt.colorbar(
|
387
|
-
cnplot, cax=cax, orientation='horizontal', extend='both')
|
368
|
+
cbar = plt.colorbar(cnplot, cax=cax, orientation='horizontal', extend='both')
|
388
369
|
cbar.ax.tick_params(labelsize=10)
|
389
370
|
cbar.ax.xaxis.set_tick_params(direction='in', width=1, length=2)
|
390
371
|
# 添加cbar_ticks
|
@@ -396,11 +377,9 @@ def plot_contourf_cartopy(data, lon, lat, picname=None, cmap='rainbow', cn_fill_
|
|
396
377
|
# cbar.set_ticks(np.arange(round(levels[0]), round(levels[-1]), round((levels[-1]-levels[0])/9))) # 设置色标刻度
|
397
378
|
|
398
379
|
# 单独设置label
|
399
|
-
cbar.set_label(title, fontsize=10,
|
400
|
-
weight='bold')
|
380
|
+
cbar.set_label(title, fontsize=10, weight='bold')
|
401
381
|
# cax.set_position([0.1, 0.2, 0.02, 0.6]) # 调整色标位置
|
402
|
-
fig.savefig(
|
403
|
-
picname, bbox_inches='tight', dpi=600) if picname is not None else plt.show()
|
382
|
+
fig.savefig(picname, bbox_inches='tight', dpi=600) if picname is not None else plt.show()
|
404
383
|
plt.close()
|
405
384
|
|
406
385
|
|
@@ -408,23 +387,20 @@ if __name__ == '__main__':
|
|
408
387
|
# ** 绘制填色图
|
409
388
|
data = np.random.randn(100, 100)
|
410
389
|
picname = 'test.png'
|
411
|
-
plot_contourf(data, picname, c_map='rainbow', minmax=None,
|
412
|
-
labels=None, ticks_space=None, ticks=None, figsize=(12, 9))
|
390
|
+
plot_contourf(data, picname, c_map='rainbow', minmax=None, labels=None, ticks_space=None, ticks=None, figsize=(12, 9))
|
413
391
|
# ** 绘制矢量场
|
414
392
|
u = np.random.randn(100, 100)
|
415
393
|
v = np.random.randn(100, 100)
|
416
394
|
lon = np.linspace(0, 360, 100)
|
417
395
|
lat = np.linspace(-90, 90, 100)
|
418
396
|
picname = 'test.png'
|
419
|
-
plot_quiver(u, v, lon, lat, picname, cmap='coolwarm',
|
420
|
-
scale=0.25, width=0.002, x_space=5, y_space=5)
|
397
|
+
plot_quiver(u, v, lon, lat, picname, cmap='coolwarm', scale=0.25, width=0.002, x_space=5, y_space=5)
|
421
398
|
# ** 绘制经纬度填色图
|
422
399
|
data = np.random.randn(100, 100)
|
423
400
|
lon = np.linspace(0, 360, 100)
|
424
401
|
lat = np.linspace(-90, 90, 100)
|
425
402
|
picname = 'test.png'
|
426
|
-
plot_contourf_lonlat(data, lon, lat, interval=5,
|
427
|
-
picname=picname, c_map='rainbow')
|
403
|
+
plot_contourf_lonlat(data, lon, lat, interval=5, picname=picname, c_map='rainbow')
|
428
404
|
# ** 制作动图
|
429
405
|
image_list = ['test1.png', 'test2.png', 'test3.png']
|
430
406
|
gif_name = 'test.gif'
|