oafuncs 0.0.87__tar.gz → 0.0.89__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {oafuncs-0.0.87/oafuncs.egg-info → oafuncs-0.0.89}/PKG-INFO +9 -8
  2. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_data.py +1 -0
  3. oafuncs-0.0.89/oafuncs/oa_down/literature.py +263 -0
  4. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_draw.py +1 -0
  5. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_file.py +1 -0
  6. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_help.py +2 -2
  7. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_nc.py +1 -0
  8. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_python.py +1 -0
  9. {oafuncs-0.0.87 → oafuncs-0.0.89/oafuncs.egg-info}/PKG-INFO +9 -8
  10. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs.egg-info/requires.txt +7 -6
  11. {oafuncs-0.0.87 → oafuncs-0.0.89}/setup.py +30 -24
  12. oafuncs-0.0.87/oafuncs/oa_down/literature.py +0 -332
  13. {oafuncs-0.0.87 → oafuncs-0.0.89}/LICENSE.txt +0 -0
  14. {oafuncs-0.0.87 → oafuncs-0.0.89}/MANIFEST.in +0 -0
  15. {oafuncs-0.0.87 → oafuncs-0.0.89}/README.md +0 -0
  16. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/__init__.py +0 -0
  17. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/data_store/OAFuncs.png +0 -0
  18. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_cmap.py +0 -0
  19. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/User_Agent-list.txt +0 -0
  20. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/__init__.py +0 -0
  21. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/hycom_3hourly.py +0 -0
  22. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/test_ua.py +0 -0
  23. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_down/user_agent.py +0 -0
  24. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_sign/__init__.py +0 -0
  25. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_sign/meteorological.py +0 -0
  26. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_sign/ocean.py +0 -0
  27. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_sign/scientific.py +0 -0
  28. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_tool/__init__.py +0 -0
  29. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs/oa_tool/email.py +0 -0
  30. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs.egg-info/SOURCES.txt +0 -0
  31. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs.egg-info/dependency_links.txt +0 -0
  32. {oafuncs-0.0.87 → oafuncs-0.0.89}/oafuncs.egg-info/top_level.txt +0 -0
  33. {oafuncs-0.0.87 → oafuncs-0.0.89}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: oafuncs
3
- Version: 0.0.87
3
+ Version: 0.0.89
4
4
  Summary: Oceanic and Atmospheric Functions
5
5
  Home-page: https://github.com/Industry-Pays/OAFuncs
6
6
  Author: Kun Liu
@@ -15,20 +15,21 @@ Classifier: Programming Language :: Python :: 3.11
15
15
  Classifier: Programming Language :: Python :: 3.12
16
16
  Classifier: Programming Language :: Python :: Implementation :: CPython
17
17
  Classifier: Programming Language :: Python :: Implementation :: PyPy
18
- Requires-Python: >=3.7.0
18
+ Requires-Python: >=3.9.0
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE.txt
21
- Requires-Dist: matplotlib
22
21
  Requires-Dist: numpy
23
22
  Requires-Dist: scipy
23
+ Requires-Dist: pandas
24
24
  Requires-Dist: xarray
25
- Requires-Dist: Cartopy
26
- Requires-Dist: netCDF4
27
- Requires-Dist: requests
28
- Requires-Dist: bs4
29
25
  Requires-Dist: rich
30
26
  Requires-Dist: pathlib
31
- Requires-Dist: pandas
27
+ Requires-Dist: requests
28
+ Requires-Dist: bs4
29
+ Requires-Dist: matplotlib
30
+ Requires-Dist: Cartopy
31
+ Requires-Dist: netCDF4
32
+ Requires-Dist: xlrd
32
33
 
33
34
 
34
35
  # oafuncs
@@ -19,6 +19,7 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
19
19
 
20
20
  import numpy as np
21
21
  from scipy.interpolate import griddata
22
+ from rich import print
22
23
 
23
24
  __all__ = ["interp_2d","ParallelExecutor"]
24
25
 
@@ -0,0 +1,263 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ """
4
+ Author: Liu Kun && 16031215@qq.com
5
+ Date: 2024-11-28 10:42:56
6
+ LastEditors: Liu Kun && 16031215@qq.com
7
+ LastEditTime: 2025-01-05 10:51:42
8
+ FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\literature.py
9
+ Description:
10
+ EditPlatform: vscode
11
+ ComputerInfo: XPS 15 9510
12
+ SystemInfo: Windows 11
13
+ Python Version: 3.12
14
+ """
15
+
16
+ import os
17
+ import re
18
+ import time
19
+ from pathlib import Path
20
+
21
+ import pandas as pd
22
+ import requests
23
+ from rich import print
24
+ from rich.progress import track
25
+ from oafuncs.oa_down.user_agent import get_ua
26
+
27
+ __all__ = ["download5doi"]
28
+
29
+
30
+ def _get_file_size(file_path, unit="KB"):
31
+ # 检查文件是否存在
32
+ if not os.path.exists(file_path):
33
+ return "文件不存在"
34
+
35
+ # 获取文件大小(字节)
36
+ file_size = os.path.getsize(file_path)
37
+
38
+ # 单位转换字典
39
+ unit_dict = {
40
+ "PB": 1024**5,
41
+ "TB": 1024**4,
42
+ "GB": 1024**3,
43
+ "MB": 1024**2,
44
+ "KB": 1024,
45
+ }
46
+
47
+ # 检查传入的单位是否合法
48
+ if unit not in unit_dict:
49
+ return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
50
+
51
+ # 转换文件大小到指定单位
52
+ converted_size = file_size / unit_dict[unit]
53
+
54
+ return converted_size
55
+
56
+
57
class _Downloader:
    """
    Download one paper's PDF from Sci-Hub mirrors, given its DOI.
    """

    def __init__(self, doi, store_path):
        # Mirror list, tried in order; each mirror gets a bounded number
        # of attempts before moving on to the next one.
        self.url_list = [
            r"https://sci-hub.se",
            r"https://sci-hub.ren",
            r"https://sci-hub.st",
            r"https://sci-hub.ru",
        ]
        self.base_url = None  # mirror currently in use
        self.url = None  # full landing-page URL: mirror + "/" + doi
        self.doi = doi
        self.pdf_url = None  # direct PDF link scraped from the landing page
        self.pdf_path = None
        # Random User-Agent per download to reduce blocking.
        self.headers = {"User-Agent": get_ua().encode("utf-8")}
        # DOIs may contain characters that are illegal in filenames, e.g.
        # 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
        # self.fname = doi.replace(r'/', '_') + '.pdf'
        self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
        self.store_path = Path(store_path)
        self.fpath = self.store_path / self.fname
        self.wrong_record_file = self.store_path / "wrong_record.txt"
        self.sleep = 5  # seconds to wait between attempts
        self.cookies = None
        self.check_size = 50  # files smaller than this (KB) are treated as bad
        self.url_index = 0  # index of the mirror currently being tried
        self.try_times_each_url_max = 3  # attempts allowed per mirror
        self.try_times = 0  # attempts made against the current mirror

    def get_pdf_url(self):
        # Fetch the Sci-Hub landing page for self.url and scrape the
        # direct PDF link out of its onclick handler into self.pdf_url.
        print("[bold #E6E6FA]-" * 100)
        print(f"DOI: {self.doi}")
        print(f"Requesting: {self.url}...")
        response = requests.get(self.url, headers=self.headers)
        if response.status_code == 200:
            self.cookies = response.cookies
            # Strip the escape backslashes embedded in the page's JS.
            text = response.text.replace("\\", "")
            # text = text.replace(' ', '') # It is important to remove the space
            # print(text)
            pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
            match = pattern.search(text)
            if match:
                got_url = match.group(1)
                if r"http" not in got_url:
                    # Relative or protocol-relative link: make it absolute.
                    if got_url[:2] == "//":
                        self.pdf_url = "https:" + got_url
                    else:
                        self.pdf_url = self.base_url + got_url
                else:
                    self.pdf_url = got_url
                print(f"URL: {self.pdf_url}")
            else:
                # No PDF link on this mirror's page: push try_times past the
                # limit so download_pdf() advances to the next mirror.
                # NOTE(review): self.pdf_url is never reset to None here, so a
                # stale URL from a previous mirror may be retried — confirm intended.
                print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
                self.try_times = self.try_times_each_url_max + 1
        else:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
            self.try_times = self.try_times_each_url_max + 1

    def url_iterate(self):
        # Point self.url at the current mirror's page for this DOI and
        # attempt to scrape the PDF link from it.
        if self.url_index >= len(self.url_list):
            return
        url = self.url_list[self.url_index]
        self.base_url = url
        self.url = url + "/" + self.doi
        self.get_pdf_url()
        # for url in self.url_list:
        #     self.url = url + self.doi
        #     self.get_pdf_url()
        #     if self.pdf_url:
        #         break

    def write_wrong_record(self):
        # Append the failed DOI to wrong_record.txt so it can be retried later.
        with open(self.wrong_record_file, "a") as f:
            f.write(self.doi + "\n")

    def download_pdf(self):
        # Main driver: skip already-downloaded files, otherwise cycle through
        # mirrors until the PDF is saved or every mirror is exhausted.
        if self.fpath.exists():
            fsize = _get_file_size(self.fpath, unit="KB")
            if fsize < self.check_size:
                # Undersized files are assumed to be error pages: delete and retry.
                os.remove(self.fpath)
                print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
            else:
                print("[bold #E6E6FA]-" * 100)
                print(f"[bold purple]The PDF file {self.fpath} already exists.")
                return
        self.url_index = 0
        already_downloaded = False
        self.try_times = 0
        while not already_downloaded:
            self.url_iterate()
            if not self.pdf_url:
                # No link found on this mirror: advance to the next one.
                self.url_index += 1
                if self.url_index >= len(self.url_list):
                    print("Failed to download the PDF file.")
                    self.write_wrong_record()
                    return
                else:
                    self.try_times = 0
                    continue
            else:
                self.try_times += 1
            if self.try_times > self.try_times_each_url_max:
                # Current mirror exhausted its attempt budget: move on.
                self.url_index += 1
                if self.url_index >= len(self.url_list):
                    # print("Failed to download the PDF file.")
                    self.write_wrong_record()
                    return
            print(f"Downloading: {self.fname}...")
            try:
                response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
                if response.status_code == 200:
                    with open(self.fpath, "wb") as f:
                        f.write(response.content)
                    fsize = _get_file_size(self.fpath, unit="KB")
                    if fsize < self.check_size:
                        # Undersized download — likely an error page, not the paper.
                        os.remove(self.fpath)
                        print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
                    else:
                        print(f"[bold green]Sucessful to download {self.fpath}")
                        already_downloaded = True
                else:
                    # HTTP failure: burn this mirror's remaining attempts.
                    self.try_times = self.try_times_each_url_max + 1
                    print(f"Failed to download the PDF file. Status code: {response.status_code}")
                    print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
            except Exception as e:
                print(f"Failed to download the PDF file. Error: {e}")
            # Throttle between attempts to avoid hammering the mirrors.
            time.sleep(self.sleep)
            if self.try_times >= self.try_times_each_url_max:
                self.url_index += 1
                if self.url_index >= len(self.url_list):
                    print("\n[bold #CD5C5C]Failed to download the PDF file.")
                    self.write_wrong_record()
                    return
                if self.try_times == self.try_times_each_url_max:
                    print(f"Tried {self.try_times} times for {self.url_list[self.url_index-1]}.")
                    print("Try another URL...")
198
+
199
+
200
def read_excel(file, col_name=r"DOI"):
    """Read the DOI column from an Excel sheet, dropping missing entries."""
    frame = pd.read_excel(file)
    # pandas reads empty cells back as NaN; filter them out.
    return [entry for entry in frame[col_name].tolist() if str(entry) != "nan"]
206
+
207
+
208
def read_txt(file):
    """Read DOIs from a text file, one per line, skipping blank lines."""
    with open(file, "r") as handle:
        # Strip newlines/surrounding whitespace and drop empty lines in one pass.
        return [entry.strip() for entry in handle if entry.strip()]
214
+
215
+
216
def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r"DOI"):
    """
    Description:
        Download PDF files by DOI.

    Parameters:
        store_path: str, The path to store the PDF files.
        doi_list: list or str, The list of DOIs.
        txt_file: str, The path of the txt file that contains the DOIs.
        excel_file: str, The path of the excel file that contains the DOIs.
        col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.

    Returns:
        None

    Raises:
        ValueError: If none of doi_list, txt_file or excel_file yields any DOI.

    Example:
        download5doi(doi_list='10.3389/feart.2021.698876')
        download5doi(store_path=r'I:\\Delete\\ref_pdf', doi_list='10.3389/feart.2021.698876')
        download5doi(store_path=r'I:\\Delete\\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
        download5doi(store_path=r'I:\\Delete\\ref_pdf', txt_file=r'I:\\Delete\\ref_pdf\\wrong_record.txt')
        download5doi(store_path=r'I:\\Delete\\ref_pdf', excel_file=r'I:\\Delete\\ref_pdf\\wrong_record.xlsx')
        download5doi(store_path=r'I:\\Delete\\ref_pdf', excel_file=r'I:\\Delete\\ref_pdf\\wrong_record.xlsx', col_name='DOI')
    """
    # Default to the working directory; always ensure the target folder exists.
    if not store_path:
        store_path = Path.cwd()
    else:
        store_path = Path(str(store_path))
    store_path.mkdir(parents=True, exist_ok=True)
    store_path = str(store_path)

    # A single DOI string is promoted to a one-element list.
    if isinstance(doi_list, str) and doi_list:
        doi_list = [doi_list]
    # File sources take precedence over an explicit list (unchanged behavior).
    if txt_file:
        doi_list = read_txt(txt_file)
    if excel_file:
        doi_list = read_excel(excel_file, col_name)
    # Fail fast with a clear message instead of crashing with
    # "TypeError: object of type 'NoneType' has no len()" below.
    if not doi_list:
        raise ValueError("No DOIs to download: provide doi_list, txt_file or excel_file.")
    print(f"Downloading {len(doi_list)} PDF files...")
    for doi in track(doi_list, description="Downloading..."):
        download = _Downloader(doi, store_path)
        download.download_pdf()
257
+
258
+
259
+ if __name__ == "__main__":
260
+ store_path = r"I:\Delete\ref_pdf"
261
+ excel_file = r"I:\Delete\Ref_DA_ROMS\savedrecs.xls"
262
+ # download5doi(store_path, doi_list='10.1007/s00382-022-06260-x')
263
+ download5doi(store_path, excel_file=excel_file)
@@ -23,6 +23,7 @@ import matplotlib.pyplot as plt
23
23
  import numpy as np
24
24
  import xarray as xr
25
25
  from cartopy.mpl.ticker import LatitudeFormatter, LongitudeFormatter
26
+ from rich import print
26
27
 
27
28
  __all__ = ["fig_minus", "gif", "add_cartopy", "add_gridlines", "MidpointNormalize", "add_lonlat_unit", "contour", "contourf", "quiver"]
28
29
 
@@ -17,6 +17,7 @@ import glob
17
17
  import os
18
18
  import re
19
19
  import shutil
20
+ from rich import print
20
21
 
21
22
  __all__ = ["find_file", "link_file", "copy_file", "rename_file", "make_folder", "clear_folder", "remove_empty_folder", "remove", "file_size"]
22
23
 
@@ -118,8 +118,8 @@ def log():
118
118
  print("更新日志:")
119
119
  print(
120
120
  """
121
- 2025-01-04
122
- 1. 下调Python支持版本到3.7,但未经过测试
121
+ 2025-01-05
122
+ 1. 测试Python版本最低为3.9
123
123
  2. 优化了部分函数说明
124
124
  """
125
125
  )
@@ -18,6 +18,7 @@ import os
18
18
  import netCDF4 as nc
19
19
  import numpy as np
20
20
  import xarray as xr
21
+ from rich import print
21
22
 
22
23
  __all__ = ["get_var", "extract", "save", "merge", "modify", "rename", "check_file", "convert_longitude", "isel"]
23
24
 
@@ -14,6 +14,7 @@ Python Version: 3.11
14
14
  '''
15
15
 
16
16
  import os
17
+ from rich import print
17
18
 
18
19
  __all__ = ['install_lib', 'upgrade_lib']
19
20
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: oafuncs
3
- Version: 0.0.87
3
+ Version: 0.0.89
4
4
  Summary: Oceanic and Atmospheric Functions
5
5
  Home-page: https://github.com/Industry-Pays/OAFuncs
6
6
  Author: Kun Liu
@@ -15,20 +15,21 @@ Classifier: Programming Language :: Python :: 3.11
15
15
  Classifier: Programming Language :: Python :: 3.12
16
16
  Classifier: Programming Language :: Python :: Implementation :: CPython
17
17
  Classifier: Programming Language :: Python :: Implementation :: PyPy
18
- Requires-Python: >=3.7.0
18
+ Requires-Python: >=3.9.0
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE.txt
21
- Requires-Dist: matplotlib
22
21
  Requires-Dist: numpy
23
22
  Requires-Dist: scipy
23
+ Requires-Dist: pandas
24
24
  Requires-Dist: xarray
25
- Requires-Dist: Cartopy
26
- Requires-Dist: netCDF4
27
- Requires-Dist: requests
28
- Requires-Dist: bs4
29
25
  Requires-Dist: rich
30
26
  Requires-Dist: pathlib
31
- Requires-Dist: pandas
27
+ Requires-Dist: requests
28
+ Requires-Dist: bs4
29
+ Requires-Dist: matplotlib
30
+ Requires-Dist: Cartopy
31
+ Requires-Dist: netCDF4
32
+ Requires-Dist: xlrd
32
33
 
33
34
 
34
35
  # oafuncs
@@ -1,11 +1,12 @@
1
- matplotlib
2
1
  numpy
3
2
  scipy
3
+ pandas
4
4
  xarray
5
- Cartopy
6
- netCDF4
7
- requests
8
- bs4
9
5
  rich
10
6
  pathlib
11
- pandas
7
+ requests
8
+ bs4
9
+ matplotlib
10
+ Cartopy
11
+ netCDF4
12
+ xlrd
@@ -17,22 +17,30 @@ DESCRIPTION = 'Oceanic and Atmospheric Functions'
17
17
  URL = 'https://github.com/Industry-Pays/OAFuncs'
18
18
  EMAIL = 'liukun0312@stu.ouc.edu.cn'
19
19
  AUTHOR = 'Kun Liu'
20
- REQUIRES_PYTHON = '>=3.7.0'
21
- VERSION = '0.0.87'
20
+ REQUIRES_PYTHON = '>=3.9.0' # 2025/01/05
21
+ VERSION = '0.0.89'
22
22
 
23
23
  # What packages are required for this module to be executed?
24
24
  REQUIRED = [
25
- "matplotlib",
25
+ # ------ General ------
26
26
  "numpy",
27
27
  "scipy",
28
+ "pandas",
28
29
  "xarray",
29
- "Cartopy",
30
- "netCDF4",
30
+ # ------ Progress and Print ------
31
+ "rich",
32
+ # ------ Path ------
33
+ "pathlib",
34
+ # ------ Internet ------
31
35
  "requests",
32
36
  "bs4",
33
- "rich",
34
- 'pathlib',
35
- 'pandas',
37
+ # ------ Picture ------
38
+ "matplotlib",
39
+ "Cartopy",
40
+ # ------ File ------
41
+ "netCDF4",
42
+ "xlrd",
43
+ # ------ Other ------
36
44
  ]
37
45
 
38
46
  # What packages are optional?
@@ -106,42 +114,40 @@ class UploadCommand(Command):
106
114
  # Where the magic happens:
107
115
  setup(
108
116
  name=NAME,
109
- version=about['__version__'],
117
+ version=about["__version__"],
110
118
  description=DESCRIPTION,
111
119
  long_description=long_description,
112
- long_description_content_type='text/markdown',
120
+ long_description_content_type="text/markdown",
113
121
  author=AUTHOR,
114
122
  author_email=EMAIL,
115
123
  python_requires=REQUIRES_PYTHON,
116
124
  url=URL,
117
- packages=find_packages(
118
- exclude=["oa_*", "oa_down", "oa_sign", "oa_tool"]),
125
+ packages=find_packages(exclude=["oa_*", "oa_down", "oa_sign", "oa_tool"]),
119
126
  # packages=find_packages(exclude=["nc", "file", "*.tests.*", "tests.*"]),
120
127
  # If your package is a single module, use this instead of 'packages':
121
128
  # py_modules=['mypackage'],
122
-
123
129
  # entry_points={
124
130
  # 'console_scripts': ['mycli=mymodule:cli'],
125
131
  # },
126
132
  install_requires=REQUIRED,
127
133
  extras_require=EXTRAS,
128
134
  include_package_data=True,
129
- license='MIT',
135
+ license="MIT",
130
136
  classifiers=[
131
137
  # Trove classifiers
132
138
  # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
133
- 'License :: OSI Approved :: MIT License',
134
- 'Programming Language :: Python',
135
- 'Programming Language :: Python :: 3',
136
- 'Programming Language :: Python :: 3.9',
137
- 'Programming Language :: Python :: 3.10',
138
- 'Programming Language :: Python :: 3.11',
139
- 'Programming Language :: Python :: 3.12',
140
- 'Programming Language :: Python :: Implementation :: CPython',
141
- 'Programming Language :: Python :: Implementation :: PyPy'
139
+ "License :: OSI Approved :: MIT License",
140
+ "Programming Language :: Python",
141
+ "Programming Language :: Python :: 3",
142
+ "Programming Language :: Python :: 3.9",
143
+ "Programming Language :: Python :: 3.10",
144
+ "Programming Language :: Python :: 3.11",
145
+ "Programming Language :: Python :: 3.12",
146
+ "Programming Language :: Python :: Implementation :: CPython",
147
+ "Programming Language :: Python :: Implementation :: PyPy",
142
148
  ],
143
149
  # $ setup.py publish support.
144
150
  cmdclass={
145
- 'upload': UploadCommand,
151
+ "upload": UploadCommand,
146
152
  },
147
153
  )
@@ -1,332 +0,0 @@
1
- #!/usr/bin/env python
2
- # coding=utf-8
3
- '''
4
- Author: Liu Kun && 16031215@qq.com
5
- Date: 2024-11-28 10:42:56
6
- LastEditors: Liu Kun && 16031215@qq.com
7
- LastEditTime: 2024-11-28 10:43:18
8
- FilePath: \\Python\\My_Funcs\\OAFuncs\\oafuncs\\oa_down\\literature.py
9
- Description:
10
- EditPlatform: vscode
11
- ComputerInfo: XPS 15 9510
12
- SystemInfo: Windows 11
13
- Python Version: 3.12
14
- '''
15
-
16
-
17
- import os
18
- import random
19
- import re
20
- import time
21
- from pathlib import Path
22
-
23
- import pandas as pd
24
- import requests
25
- from rich import print
26
- from rich.progress import track
27
-
28
- __all__ = ['download5doi']
29
-
30
-
31
- def _get_file_size(file_path, unit='KB'):
32
- # 检查文件是否存在
33
- if not os.path.exists(file_path):
34
- return "文件不存在"
35
-
36
- # 获取文件大小(字节)
37
- file_size = os.path.getsize(file_path)
38
-
39
- # 单位转换字典
40
- unit_dict = {
41
- 'PB': 1024**5,
42
- 'TB': 1024**4,
43
- 'GB': 1024**3,
44
- 'MB': 1024**2,
45
- 'KB': 1024,
46
- }
47
-
48
- # 检查传入的单位是否合法
49
- if unit not in unit_dict:
50
- return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
51
-
52
- # 转换文件大小到指定单位
53
- converted_size = file_size / unit_dict[unit]
54
-
55
- return converted_size
56
-
57
-
58
- class _Downloader:
59
- '''
60
- 根据doi下载文献pdf
61
- '''
62
-
63
- def __init__(self, doi, store_path):
64
- self.url_list = [r'https://sci-hub.se',
65
- r'https://sci-hub.ren',
66
- r'https://sci-hub.st',
67
- r'https://sci-hub.ru',
68
- ]
69
- self.base_url = None
70
- self.url = None
71
- self.doi = doi
72
- self.pdf_url = None
73
- self.pdf_path = None
74
- self.headers = {'User-Agent': self.get_ua().encode('utf-8')}
75
- # 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
76
- # self.fname = doi.replace(r'/', '_') + '.pdf'
77
- self.fname = re.sub(r'[/<>:"?*|]', '_', doi) + '.pdf'
78
- self.store_path = Path(store_path)
79
- self.fpath = self.store_path / self.fname
80
- self.wrong_record_file = self.store_path / 'wrong_record.txt'
81
- self.sleep = 5
82
- self.cookies = None
83
- self.check_size = 50
84
- self.url_index = 0
85
- self.try_times_each_url_max = 3
86
- self.try_times = 0
87
-
88
- def get_ua(self):
89
- ua_list = [
90
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
91
- "Opera/8.0 (Windows NT 5.1; U; en)",
92
- "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
93
- "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
94
- "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
95
- "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
96
- "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
97
- "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
98
- "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
99
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
100
- "Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
101
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
102
- "MAC:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
103
- "Windows:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
104
- "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
105
- "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
106
- "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
107
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
108
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
109
- "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
110
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
111
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
112
- "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
113
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
114
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
115
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
116
- "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
117
- "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"
118
- "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
119
- "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
120
- "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
121
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
122
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
123
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
124
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
125
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
126
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36",
127
- "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
128
- "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
129
- "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
130
- "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
131
- "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
132
- "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
133
- "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
134
- "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
135
- "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
136
- "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
137
- "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
138
- "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
139
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
140
- "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
141
- "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
142
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
143
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
144
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
145
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
146
- "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
147
- "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
148
- "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
149
- "UCWEB7.0.2.37/28/999",
150
- "NOKIA5700/UCWEB7.0.2.37/28/999",
151
- "Openwave/UCWEB7.0.2.37/28/999",
152
- "Openwave/UCWEB7.0.2.37/28/999",
153
- ]
154
- ua_index = random.randint(0, len(ua_list)-1)
155
- ua = ua_list[ua_index]
156
- return ua
157
-
158
- def get_pdf_url(self):
159
- print('[bold #E6E6FA]-'*100)
160
- print(f"DOI: {self.doi}")
161
- print(f"Requesting: {self.url}...")
162
- response = requests.get(self.url, headers=self.headers)
163
- if response.status_code == 200:
164
- self.cookies = response.cookies
165
- text = response.text.replace('\\', '')
166
- # text = text.replace(' ', '') # It is important to remove the space
167
- # print(text)
168
- pattern = re.compile(
169
- r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
170
- match = pattern.search(text)
171
- if match:
172
- got_url = match.group(1)
173
- if r'http' not in got_url:
174
- if got_url[:2] == '//':
175
- self.pdf_url = 'https:' + got_url
176
- else:
177
- self.pdf_url = self.base_url + got_url
178
- else:
179
- self.pdf_url = got_url
180
- print(f"URL: {self.pdf_url}")
181
- else:
182
- print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
183
- self.try_times = self.try_times_each_url_max+1
184
- else:
185
- print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
186
- print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
187
- self.try_times = self.try_times_each_url_max+1
188
-
189
- def url_iterate(self):
190
- if self.url_index >= len(self.url_list):
191
- return
192
- url = self.url_list[self.url_index]
193
- self.base_url = url
194
- self.url = url + '/' + self.doi
195
- self.get_pdf_url()
196
- # for url in self.url_list:
197
- # self.url = url + self.doi
198
- # self.get_pdf_url()
199
- # if self.pdf_url:
200
- # break
201
-
202
- def write_wrong_record(self):
203
- with open(self.wrong_record_file, 'a') as f:
204
- f.write(self.doi + '\n')
205
-
206
- def download_pdf(self):
207
- if self.fpath.exists():
208
- fsize = _get_file_size(self.fpath, unit='KB')
209
- if fsize < self.check_size:
210
- # delete the wrong file
211
- os.remove(self.fpath)
212
- print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
213
- else:
214
- print('[bold #E6E6FA]-'*100)
215
- print(f"[bold purple]The PDF file {self.fpath} already exists.")
216
- return
217
- self.url_index = 0
218
- already_downloaded = False
219
- self.try_times = 0
220
- while not already_downloaded:
221
- self.url_iterate()
222
- if not self.pdf_url:
223
- self.url_index += 1
224
- if self.url_index >= len(self.url_list):
225
- print("Failed to download the PDF file.")
226
- self.write_wrong_record()
227
- return
228
- else:
229
- self.try_times = 0
230
- continue
231
- else:
232
- self.try_times += 1
233
- if self.try_times > self.try_times_each_url_max:
234
- self.url_index += 1
235
- if self.url_index >= len(self.url_list):
236
- # print("Failed to download the PDF file.")
237
- self.write_wrong_record()
238
- return
239
- print(f"Downloading: {self.fname}...")
240
- try:
241
- response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
242
- if response.status_code == 200:
243
- with open(self.fpath, 'wb') as f:
244
- f.write(response.content)
245
- fsize = _get_file_size(self.fpath, unit='KB')
246
- if fsize < self.check_size:
247
- # delete the wrong file
248
- os.remove(self.fpath)
249
- print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
250
- else:
251
- print(f"[bold green]Sucessful to download {self.fpath}")
252
- already_downloaded = True
253
- else:
254
- self.try_times = self.try_times_each_url_max+1
255
- print(f"Failed to download the PDF file. Status code: {response.status_code}")
256
- print(f'[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.')
257
- except Exception as e:
258
- print(f"Failed to download the PDF file. Error: {e}")
259
- time.sleep(self.sleep)
260
- if self.try_times >= self.try_times_each_url_max:
261
- self.url_index += 1
262
- if self.url_index >= len(self.url_list):
263
- print("\n[bold #CD5C5C]Failed to download the PDF file.")
264
- self.write_wrong_record()
265
- return
266
- if self.try_times == self.try_times_each_url_max:
267
- print(f'Tried {self.try_times} times for {self.url_list[self.url_index-1]}.')
268
- print("Try another URL...")
269
-
270
-
271
- def read_excel(file, col_name=r'DOI'):
272
- df = pd.read_excel(file)
273
- df_list = df[col_name].tolist()
274
- # 去掉nan
275
- df_list = [doi for doi in df_list if str(doi) != 'nan']
276
- return df_list
277
-
278
-
279
- def read_txt(file):
280
- with open(file, 'r') as f:
281
- lines = f.readlines()
282
- # 去掉换行符以及空行
283
- lines = [line.strip() for line in lines if line.strip()]
284
- return lines
285
-
286
-
287
- def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r'DOI'):
288
- '''
289
- Description: Download PDF files by DOI.
290
-
291
- Args:
292
- store_path: str, The path to store the PDF files.
293
- doi_list: list or str, The list of DOIs.
294
- txt_file: str, The path of the txt file that contains the DOIs.
295
- excel_file: str, The path of the excel file that contains the DOIs.
296
- col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.
297
-
298
- Returns:
299
- None
300
-
301
- Example:
302
- download5doi(doi_list='10.3389/feart.2021.698876')
303
- download5doi(store_path=r'I:\Delete\ref_pdf', doi_list='10.3389/feart.2021.698876')
304
- download5doi(store_path=r'I:\Delete\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
305
- download5doi(store_path=r'I:\Delete\ref_pdf', txt_file=r'I:\Delete\ref_pdf\wrong_record.txt')
306
- download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx')
307
- download5doi(store_path=r'I:\Delete\ref_pdf', excel_file=r'I:\Delete\ref_pdf\wrong_record.xlsx', col_name='DOI')
308
- '''
309
- if not store_path:
310
- store_path = Path.cwd()
311
- else:
312
- store_path = Path(str(store_path))
313
- store_path.mkdir(parents=True, exist_ok=True)
314
- store_path = str(store_path)
315
-
316
- # 如果doi_list是str,转换为list
317
- if isinstance(doi_list, str) and doi_list:
318
- doi_list = [doi_list]
319
- if txt_file:
320
- doi_list = read_txt(txt_file)
321
- if excel_file:
322
- doi_list = read_excel(excel_file, col_name)
323
- print(f"Downloading {len(doi_list)} PDF files...")
324
- for doi in track(doi_list, description='Downloading...'):
325
- download = _Downloader(doi, store_path)
326
- download.download_pdf()
327
-
328
-
329
- if __name__ == '__main__':
330
- store_path = r'I:\Delete\ref_pdf'
331
- # download5doi(store_path, doi_list='10.1007/s00382-022-06260-x')
332
- download5doi(store_path, excel_file=r'I:\Delete\ref_pdf\savedrecs.xls')
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes