oafuncs 0.0.61__py2.py3-none-any.whl → 0.0.63__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oafuncs/oa_down/__init__.py +2 -1
- oafuncs/oa_down/hycom_3hourly.py +14 -14
- oafuncs/oa_down/literature.py +332 -0
- oafuncs/oa_down/refs_pdf.py +2 -0
- {oafuncs-0.0.61.dist-info → oafuncs-0.0.63.dist-info}/METADATA +2 -2
- {oafuncs-0.0.61.dist-info → oafuncs-0.0.63.dist-info}/RECORD +9 -8
- {oafuncs-0.0.61.dist-info → oafuncs-0.0.63.dist-info}/LICENSE.txt +0 -0
- {oafuncs-0.0.61.dist-info → oafuncs-0.0.63.dist-info}/WHEEL +0 -0
- {oafuncs-0.0.61.dist-info → oafuncs-0.0.63.dist-info}/top_level.txt +0 -0
oafuncs/oa_down/__init__.py
CHANGED
oafuncs/oa_down/hycom_3hourly.py
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
Author: Liu Kun && 16031215@qq.com
|
5
5
|
Date: 2024-11-01 10:31:09
|
6
6
|
LastEditors: Liu Kun && 16031215@qq.com
|
7
|
-
LastEditTime: 2024-11-
|
7
|
+
LastEditTime: 2024-11-28 16:04:50
|
8
8
|
FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\hycom_3hourly.py
|
9
9
|
Description:
|
10
10
|
EditPlatform: vscode
|
@@ -69,12 +69,12 @@ variable_info = {
|
|
69
69
|
'u': {'var_name': 'water_u', 'standard_name': 'eastward_sea_water_velocity'},
|
70
70
|
'v': {'var_name': 'water_v', 'standard_name': 'northward_sea_water_velocity'},
|
71
71
|
'temp': {'var_name': 'water_temp', 'standard_name': 'sea_water_potential_temperature'},
|
72
|
-
'
|
72
|
+
'salt': {'var_name': 'salinity', 'standard_name': 'sea_water_salinity'},
|
73
73
|
'ssh': {'var_name': 'surf_el', 'standard_name': 'sea_surface_elevation'},
|
74
74
|
'u_b': {'var_name': 'water_u_bottom', 'standard_name': 'eastward_sea_water_velocity_at_sea_floor'},
|
75
75
|
'v_b': {'var_name': 'water_v_bottom', 'standard_name': 'northward_sea_water_velocity_at_sea_floor'},
|
76
76
|
'temp_b': {'var_name': 'water_temp_bottom', 'standard_name': 'sea_water_potential_temperature_at_sea_floor'},
|
77
|
-
'
|
77
|
+
'salt_b': {'var_name': 'salinity_bottom', 'standard_name': 'sea_water_salinity_at_sea_floor'},
|
78
78
|
}
|
79
79
|
|
80
80
|
# classification method
|
@@ -305,7 +305,7 @@ def get_query_dict_single_depth(var, lon_min, lon_max, lat_min, lat_max, depth,
|
|
305
305
|
query_dict = set_query_dict_no_vertical(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh)
|
306
306
|
else:
|
307
307
|
query_dict = set_query_dict_depth_or_level(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh)
|
308
|
-
if var in ['u', 'v', 'temp', '
|
308
|
+
if var in ['u', 'v', 'temp', 'salt']:
|
309
309
|
print('Please ensure the depth is in the range of 0-5000 m')
|
310
310
|
query_dict['vertCoord'] = get_nearest_level_index(depth)+1
|
311
311
|
return query_dict
|
@@ -317,7 +317,7 @@ def get_query_dict_single_level(var, lon_min, lon_max, lat_min, lat_max, level_n
|
|
317
317
|
else:
|
318
318
|
# level_num: 1-40
|
319
319
|
query_dict = set_query_dict_depth_or_level(var, lon_min, lon_max, lat_min, lat_max, time_str_ymdh)
|
320
|
-
if var in ['u', 'v', 'temp', '
|
320
|
+
if var in ['u', 'v', 'temp', 'salt']:
|
321
321
|
print('Please ensure the level_num is in the range of 1-40')
|
322
322
|
if level_num == 0:
|
323
323
|
level_num = 1
|
@@ -397,21 +397,21 @@ def get_base_url(dataset_name, version_name, var, year_str):
|
|
397
397
|
elif classification_method == 'var_different':
|
398
398
|
if var in ['u', 'v', 'u_b', 'v_b']:
|
399
399
|
base_url = url_dict['uv3z']
|
400
|
-
elif var in ['temp', '
|
400
|
+
elif var in ['temp', 'salt', 'temp_b', 'salt_b']:
|
401
401
|
base_url = url_dict['ts3z']
|
402
402
|
elif var in ['ssh']:
|
403
403
|
base_url = url_dict['ssh']
|
404
404
|
else:
|
405
|
-
print('Please ensure the var is in [u,v,temp,
|
405
|
+
print('Please ensure the var is in [u,v,temp,salt,ssh,u_b,v_b,temp_b,salt_b]')
|
406
406
|
elif classification_method == 'var_year_different':
|
407
407
|
if var in ['u', 'v', 'u_b', 'v_b']:
|
408
408
|
base_url = url_dict['uv3z'][str(year_str)]
|
409
|
-
elif var in ['temp', '
|
409
|
+
elif var in ['temp', 'salt', 'temp_b', 'salt_b']:
|
410
410
|
base_url = url_dict['ts3z'][str(year_str)]
|
411
411
|
elif var in ['ssh']:
|
412
412
|
base_url = url_dict['ssh'][str(year_str)]
|
413
413
|
else:
|
414
|
-
print('Please ensure the var is in [u,v,temp,
|
414
|
+
print('Please ensure the var is in [u,v,temp,salt,ssh,u_b,v_b,temp_b,salt_b]')
|
415
415
|
return base_url
|
416
416
|
|
417
417
|
|
@@ -630,7 +630,7 @@ def convert_full_name_to_short_name(full_name):
|
|
630
630
|
for var, info in variable_info.items():
|
631
631
|
if full_name == info['var_name'] or full_name == info['standard_name'] or full_name == var:
|
632
632
|
return var
|
633
|
-
print('[bold #FFE4E1]Please ensure the var is in:\n[bold blue]u,v,temp,
|
633
|
+
print('[bold #FFE4E1]Please ensure the var is in:\n[bold blue]u,v,temp,salt,ssh,u_b,v_b,temp_b,salt_b')
|
634
634
|
print('or')
|
635
635
|
print('[bold blue]water_u, water_v, water_temp, salinity, surf_el, water_u_bottom, water_v_bottom, water_temp_bottom, salinity_bottom')
|
636
636
|
return False
|
@@ -655,7 +655,7 @@ def download(var, time_s, time_e, lon_min=0, lon_max=359.92, lat_min=-80, lat_ma
|
|
655
655
|
Download the data of single time or a series of time
|
656
656
|
|
657
657
|
Parameters:
|
658
|
-
var: str, the variable name, such as 'u', 'v', 'temp', '
|
658
|
+
var: str, the variable name, such as 'u', 'v', 'temp', 'salt', 'ssh', 'u_b', 'v_b', 'temp_b', 'salt_b' or 'water_u', 'water_v', 'water_temp', 'salinity', 'surf_el', 'water_u_bottom', 'water_v_bottom', 'water_temp_bottom', 'salinity_bottom'
|
659
659
|
time_s: str, the start time, such as '2024110100' or '20241101', if add hour, the hour should be 00, 03, 06, 09, 12, 15, 18, 21
|
660
660
|
time_e: str, the end time, such as '2024110221' or '20241102', if add hour, the hour should be 00, 03, 06, 09, 12, 15, 18, 21
|
661
661
|
lon_min: float, the minimum longitude, default is 0
|
@@ -725,11 +725,11 @@ def how_to_use():
|
|
725
725
|
|
726
726
|
# 2. Get the base url according to the dataset, version, var and year:
|
727
727
|
# 2.1 Dataset and version were found in step 1
|
728
|
-
# 2.2 Var: u, v, temp,
|
728
|
+
# 2.2 Var: u, v, temp, salt, ssh, u_b, v_b, temp_b, salt_b
|
729
729
|
# 2.3 Year: 1994-2024(current year)
|
730
730
|
|
731
731
|
# 3. Get the query_dict according to the var, lon_min, lon_max, lat_min, lat_max, depth, level_num, time_str_ymdh:
|
732
|
-
# 3.1 Var: u, v, temp,
|
732
|
+
# 3.1 Var: u, v, temp, salt, ssh, u_b, v_b, temp_b, salt_b
|
733
733
|
# 3.2 Lon_min, lon_max, lat_min, lat_max: float
|
734
734
|
# 3.3 Depth: 0-5000m, if you wanna get single depth data, you can set the depth
|
735
735
|
# 3.4 Level_num: 1-40, if you wanna get single level data, you can set the level_num
|
@@ -772,7 +772,7 @@ if __name__ == '__main__':
|
|
772
772
|
'water_v': {'simple_name': 'v', 'download': 0},
|
773
773
|
'surf_el': {'simple_name': 'ssh', 'download': 0},
|
774
774
|
'water_temp': {'simple_name': 'temp', 'download': 0},
|
775
|
-
'salinity': {'simple_name': '
|
775
|
+
'salinity': {'simple_name': 'salt', 'download': 1},
|
776
776
|
}
|
777
777
|
|
778
778
|
# set depth or level, only one can be True
|
@@ -0,0 +1,332 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding=utf-8
|
3
|
+
'''
|
4
|
+
Author: Liu Kun && 16031215@qq.com
|
5
|
+
Date: 2024-11-28 10:42:56
|
6
|
+
LastEditors: Liu Kun && 16031215@qq.com
|
7
|
+
LastEditTime: 2024-11-28 10:43:18
|
8
|
+
FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\literature.py
|
9
|
+
Description:
|
10
|
+
EditPlatform: vscode
|
11
|
+
ComputerInfo: XPS 15 9510
|
12
|
+
SystemInfo: Windows 11
|
13
|
+
Python Version: 3.12
|
14
|
+
'''
|
15
|
+
|
16
|
+
|
17
|
+
import os
|
18
|
+
import random
|
19
|
+
import re
|
20
|
+
import time
|
21
|
+
from pathlib import Path
|
22
|
+
|
23
|
+
import pandas as pd
|
24
|
+
import requests
|
25
|
+
from rich import print
|
26
|
+
from rich.progress import track
|
27
|
+
|
28
|
+
__all__ = ['download5doi']
|
29
|
+
|
30
|
+
|
31
|
+
def _get_file_size(file_path, unit='KB'):
    '''
    Description: Report the size of a file on disk in the requested unit.

    Args:
        file_path: str or Path, the file to measure.
        unit: str, one of 'PB', 'TB', 'GB', 'MB', 'KB' (binary multiples of 1024).

    Returns:
        float, the size in `unit`; or a str message when the file does not
        exist or `unit` is not one of the supported names.
    '''
    # A missing file is reported through the return value, not an exception.
    if not os.path.exists(file_path):
        return "文件不存在"

    size_in_bytes = os.path.getsize(file_path)

    # KB = 1024**1, MB = 1024**2, ... PB = 1024**5
    bytes_per_unit = {
        name: 1024 ** power
        for power, name in enumerate(('KB', 'MB', 'GB', 'TB', 'PB'), start=1)
    }

    divisor = bytes_per_unit.get(unit)
    if divisor is None:
        # An unsupported unit is also reported through the return value.
        return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"

    return size_in_bytes / divisor
|
56
|
+
|
57
|
+
|
58
|
+
class _Downloader:
    '''
    Download the PDF of one article, identified by its DOI.

    The mirrors in `url_list` are tried in order. For each mirror the landing
    page `<mirror>/<doi>` is fetched and searched for the PDF link; if a link
    is found, the PDF is requested up to `try_times_each_url_max` times. A
    download only counts as successful when the saved file is at least
    `check_size` KB. When every mirror has failed, the DOI is appended to
    `wrong_record.txt` inside the store directory.
    '''

    # Pool of User-Agent strings; one is picked at random per instance.
    # NOTE(review): two entries start with "MAC:" / "Windows:" and one entry is
    # cut off after "Trident/5.0;" - they look like copy/paste artefacts but
    # are kept unchanged; confirm before cleaning them up.
    USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "MAC:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
        "Windows:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        # The comma after the next entry was missing, which silently glued it
        # to the following string through implicit literal concatenation.
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/UCWEB7.0.2.37/28/999",
        "Openwave/UCWEB7.0.2.37/28/999",
        "Openwave/UCWEB7.0.2.37/28/999",
    )

    def __init__(self, doi, store_path):
        '''
        Args:
            doi: str, the DOI of the article.
            store_path: str or Path, directory the PDF is written to.
        '''
        self.url_list = [r'https://sci-hub.se',
                         r'https://sci-hub.ren',
                         r'https://sci-hub.st',
                         r'https://sci-hub.ru',
                         ]
        self.base_url = None   # mirror currently being tried
        self.url = None        # landing page: <mirror>/<doi>
        self.doi = doi
        self.pdf_url = None    # direct PDF link parsed from the landing page
        self.pdf_path = None
        self.headers = {'User-Agent': self.get_ua().encode('utf-8')}
        # A DOI such as 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2 holds
        # characters that are not allowed in file names, so they become '_'.
        self.fname = re.sub(r'[/<>:"?*|]', '_', doi) + '.pdf'
        self.store_path = Path(store_path)
        self.fpath = self.store_path / self.fname
        self.wrong_record_file = self.store_path / 'wrong_record.txt'
        self.sleep = 5          # seconds to pause after every download attempt
        self.cookies = None     # cookies of the landing page, reused for the PDF request
        self.check_size = 50    # KB; smaller files are treated as failed downloads
        self.url_index = 0      # index into url_list of the mirror in use
        self.try_times_each_url_max = 3
        self.try_times = 0      # download attempts made on the current mirror
        self.timeout = 30       # seconds; keeps a stalled mirror from hanging the batch

    def get_ua(self):
        '''Return one randomly chosen User-Agent string from USER_AGENTS.'''
        return random.choice(self.USER_AGENTS)

    def _reject_mirror(self):
        '''Report that the current mirror is unusable and exhaust its attempts.'''
        print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
        self.try_times = self.try_times_each_url_max+1

    def get_pdf_url(self):
        '''
        Fetch the landing page `self.url` and store the PDF link in `self.pdf_url`.

        `self.pdf_url` is left as None when the page cannot be fetched, does
        not answer with status 200, or contains no download link.
        '''
        print('[bold #E6E6FA]-'*100)
        print(f"DOI: {self.doi}")
        print(f"Requesting: {self.url}...")
        # Never reuse a link that was resolved on a previously tried mirror.
        self.pdf_url = None
        try:
            response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        except requests.RequestException as e:
            # A dead mirror must not abort the whole batch.
            print(f"Failed to retrieve the webpage. Error: {e}")
            self._reject_mirror()
            return
        if response.status_code != 200:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            self._reject_mirror()
            return

        self.cookies = response.cookies
        text = response.text.replace('\\', '')
        pattern = re.compile(
            r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
        match = pattern.search(text)
        if not match:
            self._reject_mirror()
            return

        got_url = match.group(1)
        if r'http' in got_url:
            self.pdf_url = got_url
        elif got_url[:2] == '//':
            # protocol-relative link
            self.pdf_url = 'https:' + got_url
        else:
            # link relative to the mirror
            self.pdf_url = self.base_url + got_url
        print(f"URL: {self.pdf_url}")

    def url_iterate(self):
        '''Point base_url/url at the mirror selected by url_index and resolve its PDF link.'''
        if self.url_index >= len(self.url_list):
            return
        url = self.url_list[self.url_index]
        self.base_url = url
        self.url = url + '/' + self.doi
        self.get_pdf_url()

    def write_wrong_record(self):
        '''Append the DOI to wrong_record.txt so it can be retried later.'''
        with open(self.wrong_record_file, 'a') as f:
            f.write(self.doi + '\n')

    def _attempt_download(self):
        '''
        Make one request for `self.pdf_url` and save the body to `self.fpath`.

        Returns:
            True  - the file was saved and is at least `check_size` KB;
            False - the attempt failed but the same mirror may be retried;
            None  - the mirror answered with a non-200 status, give up on it.
        '''
        try:
            response = requests.get(self.pdf_url, headers=self.headers,
                                    cookies=self.cookies, timeout=self.timeout)
            if response.status_code != 200:
                print(f"Failed to download the PDF file. Status code: {response.status_code}")
                print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
                return None
            with open(self.fpath, 'wb') as f:
                f.write(response.content)
            fsize = _get_file_size(self.fpath, unit='KB')
            if fsize < self.check_size:
                # Too small to be a real PDF: drop it and retry.
                os.remove(self.fpath)
                print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
                return False
        except Exception as e:
            # Deliberately best-effort: any network or file error counts as
            # one failed attempt instead of stopping the batch.
            print(f"Failed to download the PDF file. Error: {e}")
            return False
        print(f"[bold green]Sucessful to download {self.fpath}")
        return True

    def _download_from_current_mirror(self):
        '''Try the resolved PDF link up to try_times_each_url_max times; True on success.'''
        mirror = self.url_list[self.url_index]
        while self.try_times < self.try_times_each_url_max:
            self.try_times += 1
            print(f"Downloading: {self.fname}...")
            outcome = self._attempt_download()
            # Pause after every attempt, successful or not, to throttle requests.
            time.sleep(self.sleep)
            if outcome:
                return True
            if outcome is None:
                return False
        print(f'Tried {self.try_times} times for {mirror}.')
        return False

    def download_pdf(self):
        '''
        Download the PDF unless a plausible copy is already on disk.

        An existing file smaller than `check_size` KB is deleted and fetched
        again. If no mirror delivers the file, the DOI is written to the
        wrong-record file.
        '''
        if self.fpath.exists():
            fsize = _get_file_size(self.fpath, unit='KB')
            if fsize < self.check_size:
                os.remove(self.fpath)
                print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
            else:
                print('[bold #E6E6FA]-'*100)
                print(f"[bold purple]The PDF file {self.fpath} already exists.")
                return

        self.url_index = 0
        while self.url_index < len(self.url_list):
            # Every mirror gets a fresh attempt budget.
            self.try_times = 0
            self.url_iterate()
            if self.pdf_url and self._download_from_current_mirror():
                return
            self.url_index += 1
            if self.url_index < len(self.url_list):
                print("Try another URL...")

        print("\n[bold #CD5C5C]Failed to download the PDF file.")
        self.write_wrong_record()
|
269
|
+
|
270
|
+
|
271
|
+
def read_excel(file, col_name=r'DOI'):
    '''
    Description: Read the DOI column of an Excel sheet into a list.

    Args:
        file: str, path of the Excel file.
        col_name: str, header of the column holding the DOIs. Default is 'DOI'.

    Returns:
        list, the column values whose string form is not 'nan'.
    '''
    column = pd.read_excel(file)[col_name]
    kept = []
    for entry in column.tolist():
        # Skip cells whose text form is 'nan' (how missing values print).
        if str(entry) == 'nan':
            continue
        kept.append(entry)
    return kept
|
277
|
+
|
278
|
+
|
279
|
+
def read_txt(file):
    '''
    Description: Read a text file with one DOI per line.

    Args:
        file: str, path of the text file.

    Returns:
        list of str, each line with surrounding whitespace (including the
        newline) removed; lines that are empty after stripping are dropped.
    '''
    with open(file, 'r') as handle:
        trimmed = [raw.strip() for raw in handle]
    return [entry for entry in trimmed if entry]
|
285
|
+
|
286
|
+
|
287
|
+
def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r'DOI'):
    r'''
    Description: Download PDF files by DOI.

    The DOIs come from `doi_list`, `txt_file` or `excel_file`. If several are
    given, `excel_file` takes precedence over `txt_file`, which takes
    precedence over `doi_list`. When no DOI is available the function prints
    a notice and returns without downloading anything.

    Args:
        store_path: str, The path to store the PDF files. Defaults to the
            current working directory; a given path is created if missing.
        doi_list: list or str, The list of DOIs.
        txt_file: str, The path of the txt file that contains the DOIs.
        excel_file: str, The path of the excel file that contains the DOIs.
        col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.

    Returns:
        None

    Example:
        download5doi(doi_list='10.3389/feart.2021.698876')
        download5doi(store_path=r'I:\Delete\ref_pdf', doi_list='10.3389/feart.2021.698876')
        download5doi(store_path=r'I:\Delete\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
        download5doi(store_path=r'I:\Delete\ref_pdf', txt_file=r'I:\Delete\ref_pdf\wrong_record.txt')
        download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx')
        download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx', col_name='DOI')
    '''
    # The docstring is raw because the Windows example paths contain
    # backslashes (\D, \r, \w) that would otherwise be read as escapes.
    if not store_path:
        store_path = Path.cwd()
    else:
        store_path = Path(str(store_path))
        store_path.mkdir(parents=True, exist_ok=True)
    store_path = str(store_path)

    # A single DOI passed as a string becomes a one-element list.
    if isinstance(doi_list, str) and doi_list:
        doi_list = [doi_list]
    if txt_file:
        doi_list = read_txt(txt_file)
    if excel_file:
        doi_list = read_excel(excel_file, col_name)

    # Without any DOI source, len(None) used to raise a TypeError here.
    if doi_list is None or len(doi_list) == 0:
        print('[bold yellow]No DOI was provided; nothing to download.')
        return

    print(f"Downloading {len(doi_list)} PDF files...")
    for doi in track(doi_list, description='Downloading...'):
        download = _Downloader(doi, store_path)
        download.download_pdf()
|
327
|
+
|
328
|
+
|
329
|
+
if __name__ == '__main__':
    # Manual run: fetch every DOI listed in a local spreadsheet export.
    # The paths are hard-coded to the author's machine.
    download5doi(r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\savedrecs.xls')
|
oafuncs/oa_down/refs_pdf.py
CHANGED
@@ -305,6 +305,8 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None,
|
|
305
305
|
download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx')
|
306
306
|
download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx', col_name='DOI')
|
307
307
|
'''
|
308
|
+
print('[bold #EE33fA]Note:\n 升级0.0.62版本后,函数路径将改为oafuncs.oa_down.literature.download5doi,此路径将被弃用。')
|
309
|
+
|
308
310
|
if not store_path:
|
309
311
|
store_path = Path.cwd()
|
310
312
|
else:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: oafuncs
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.63
|
4
4
|
Summary: My short description for my project.
|
5
5
|
Home-page: https://github.com/Industry-Pays/OAFuncs
|
6
6
|
Author: Kun Liu
|
@@ -144,7 +144,7 @@ oafuncs.oa_nc.write2nc(r'I:\test.nc', data,
|
|
144
144
|
|
145
145
|
方便获取时间序列,间隔为hour
|
146
146
|
|
147
|
-
-
|
147
|
+
- literature
|
148
148
|
|
149
149
|
- download5doi
|
150
150
|
|
@@ -6,17 +6,18 @@ oafuncs/oa_file.py,sha256=rUfxlks9uR5B9_TBPJFnNswfDF0lRm_ZBDUwIjYtf9U,10250
|
|
6
6
|
oafuncs/oa_help.py,sha256=ppNktmtNzs15R20MD1bM7yImlTQ_ngMwvoIglePOKXA,1000
|
7
7
|
oafuncs/oa_nc.py,sha256=ALAYfqDy5lbUNJsTU29j6ZWkM4wgqQU3p2Fxn5pkvsQ,12102
|
8
8
|
oafuncs/oa_python.py,sha256=XPTP3o7zTFzfJR_YhsKfQksa3bSYwXsne9YxlJplCEA,3994
|
9
|
-
oafuncs/oa_down/__init__.py,sha256=
|
10
|
-
oafuncs/oa_down/hycom_3hourly.py,sha256=
|
11
|
-
oafuncs/oa_down/
|
9
|
+
oafuncs/oa_down/__init__.py,sha256=a6rgxHQi8spvlI-TaVEqnrDNhYsKm5_IQf7ckAZ8U4w,603
|
10
|
+
oafuncs/oa_down/hycom_3hourly.py,sha256=5ssqWxIpmCF77ojguxUem2zGULTPmPKZjqPTgne29fg,43912
|
11
|
+
oafuncs/oa_down/literature.py,sha256=dT3-7-beEzQ9mTP8LNV9Gf3q5Z1Pqqjc6FOS010HZeQ,17833
|
12
|
+
oafuncs/oa_down/refs_pdf.py,sha256=wr1sIYO2FUBJTn-K79NylbRUvCAFaee6_fQljxSI_Q8,17979
|
12
13
|
oafuncs/oa_sign/__init__.py,sha256=QKqTFrJDFK40C5uvk48GlRRbGFzO40rgkYwu6dYxatM,563
|
13
14
|
oafuncs/oa_sign/meteorological.py,sha256=mLbupsZSq427HTfVbZMvIlFzDHwSzQAbK3X19o8anFY,6525
|
14
15
|
oafuncs/oa_sign/ocean.py,sha256=xrW-rWD7xBWsB5PuCyEwQ1Q_RDKq2KCLz-LOONHgldU,5932
|
15
16
|
oafuncs/oa_sign/scientific.py,sha256=a4JxOBgm9vzNZKpJ_GQIQf7cokkraV5nh23HGbmTYKw,5064
|
16
17
|
oafuncs/oa_tool/__init__.py,sha256=IKOlqpWlb4cMDCtq2VKR_RTxQHDNqR_vfqqsOsp_lKQ,466
|
17
18
|
oafuncs/oa_tool/email.py,sha256=7EX3VkD8TxYAKPLOuC_yS104p9zbKilZlGfkVa2C6BQ,2947
|
18
|
-
oafuncs-0.0.
|
19
|
-
oafuncs-0.0.
|
20
|
-
oafuncs-0.0.
|
21
|
-
oafuncs-0.0.
|
22
|
-
oafuncs-0.0.
|
19
|
+
oafuncs-0.0.63.dist-info/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
|
20
|
+
oafuncs-0.0.63.dist-info/METADATA,sha256=c0kTonzu73DG0FabuV9DU_7sRnKBws4NPDq4IQhUPAY,22378
|
21
|
+
oafuncs-0.0.63.dist-info/WHEEL,sha256=pxeNX5JdtCe58PUSYP9upmc7jdRPgvT0Gm9kb1SHlVw,109
|
22
|
+
oafuncs-0.0.63.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
|
23
|
+
oafuncs-0.0.63.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|