oafuncs 0.0.98.45__py3-none-any.whl → 0.0.98.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oafuncs/oa_cmap.py CHANGED
@@ -271,6 +271,9 @@ def get(colormap_name: Optional[str] = None, show_available: bool = False) -> Op
271
271
  "diverging_4": ["#5DADE2", "#A2D9F7", "#D6EAF8", "#F2F3F4", "#FADBD8", "#F1948A", "#E74C3C"],
272
272
  # ----------------------------------------------------------------------------
273
273
  "colorful_1": ["#6d00db", "#9800cb", "#F2003C", "#ff4500", "#ff7f00", "#FE28A2", "#FFC0CB", "#DDA0DD", "#40E0D0", "#1a66f2", "#00f7fb", "#8fff88", "#E3FF00"],
274
+ # ----------------------------------------------------------------------------
275
+ "increasing_1": ["#FFFFFF", "#E6F7FF", "#A5E6F8", "#049CD4", "#11B5A3", "#04BC4C", "#74CC54", "#D9DD5C", "#FB922E", "#FC2224", "#E51C18", "#8B0000"],
276
+ # ----------------------------------------------------------------------------
274
277
  }
275
278
 
276
279
  if show_available:
@@ -2,6 +2,7 @@ import os
2
2
  import re
3
3
  import time
4
4
  from pathlib import Path
5
+ from urllib.parse import urljoin
5
6
 
6
7
  import pandas as pd
7
8
  import requests
@@ -46,75 +47,142 @@ class _Downloader:
46
47
  根据doi下载文献pdf
47
48
  """
48
49
 
49
- def __init__(self, doi, store_path):
50
+ # 进程级缓存:首次探测后的可用镜像列表,后续复用
51
+ _alive_mirrors_cache: list[str] | None = None
52
+
53
+ def __init__(self, doi, store_path, *, min_size_kb=50, timeout_html=15, timeout_pdf=30, sleep_secs=5, tries_each_url=3, debug=False):
50
54
  self.url_list = [
51
55
  r"https://sci-hub.se",
52
56
  r"https://sci-hub.ren",
53
57
  r"https://sci-hub.st",
54
- r"https://sci-hub.ru", # 最好用的一个网站
58
+ r"https://sci-hub.ru", # 最好用的一个网站
55
59
  # ------------------------------------- 以下网站没验证
56
- r"https://sci-hub.wf",
57
- r"https://sci-hub.yt",
58
- r"https://sci-hub.ee",
59
- r"https://sci-hub.cat",
60
60
  r"https://sci-hub.in",
61
- r"https://www.pismin.com",
62
- r"https://sci-hub.vkif.top",
63
- r"https://www.bothonce.com",
64
- r"https://sci-hub.et-fine.com",
65
- r"https://sci-hub.hkvisa.net",
66
- # r"https://sci-hub.3800808.com", # 这个只能手动保存
67
- r"https://sci-hub.zidianzhan.net",
68
- r"https://sci-hub.usualwant.com",
61
+ r"https://sci-hub.hlgczx.com/",
69
62
  ]
70
63
  self.base_url = None
71
64
  self.url = None
72
65
  self.doi = doi
73
66
  self.pdf_url = None
74
67
  self.pdf_path = None
75
- self.headers = {"User-Agent": get_ua().encode("utf-8")}
68
+ # requests 期望 header 值为 str,这里确保 UA 是字符串而不是 bytes
69
+ self.headers = {"User-Agent": str(get_ua())}
76
70
  # 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
77
71
  # self.fname = doi.replace(r'/', '_') + '.pdf'
78
72
  self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
79
73
  self.store_path = Path(store_path)
80
74
  self.fpath = self.store_path / self.fname
81
75
  self.wrong_record_file = self.store_path / "wrong_record.txt"
82
- self.sleep = 5
76
+ self.sleep = sleep_secs
83
77
  self.cookies = None
84
- self.check_size = 50
78
+ self.check_size = max(1, int(min_size_kb))
85
79
  self.url_index = 0
86
- self.try_times_each_url_max = 3
80
+ self.try_times_each_url_max = max(1, int(tries_each_url))
87
81
  self.try_times = 0
82
+ self.timeout_html = max(5, int(timeout_html))
83
+ self.timeout_pdf = max(5, int(timeout_pdf))
84
+ self.debug = bool(debug)
85
+
86
+ # ---------------- 镜像可用性探测 ----------------
87
+ def _is_mirror_alive(self, base_url: str) -> bool:
88
+ """
89
+ 仅检测镜像根路径是否可连通(HTTP 200 即认为可用)。
90
+ 不访问具体 DOI,避免被动触发风控;只做连通性筛查。
91
+ """
92
+ try:
93
+ r = requests.get(base_url + "/", headers=self.headers, timeout=8, allow_redirects=True)
94
+ return 200 <= r.status_code < 400
95
+ except Exception:
96
+ return False
97
+
98
+ def _ensure_alive_mirrors(self):
99
+ # 若已经有进程级缓存,直接复用
100
+ if _Downloader._alive_mirrors_cache is not None:
101
+ self.url_list = list(_Downloader._alive_mirrors_cache)
102
+ return
103
+
104
+ print(f"[bold cyan]Probing mirrors connectivity (first run)...")
105
+ alive = []
106
+ for base in self.url_list:
107
+ ok = self._is_mirror_alive(base)
108
+ status = "OK" if ok else "DOWN"
109
+ print(f" [{status}] {base}")
110
+ if ok:
111
+ alive.append(base)
112
+ if alive:
113
+ _Downloader._alive_mirrors_cache = alive
114
+ self.url_list = alive
115
+ print(f"[bold cyan]Alive mirrors: {len(alive)}; pruned {len(set(self.url_list)) - len(alive) if self.url_list else 0}.")
116
+ else:
117
+ print("[bold yellow]No mirror passed probe; keep original list for fallback attempts.")
118
+
119
+ def _extract_pdf_url_from_html(self, html: str) -> str | None:
120
+ """
121
+ 从 Sci-Hub 页面 HTML 中尽可能稳健地提取 PDF 链接。
122
+
123
+ 兼容多种模式:
124
+ - onclick="location.href='...pdf?download=true'"
125
+ - <iframe id="pdf" src="...pdf?...">
126
+ - <a ... href="...pdf?...">
127
+ - 其他出现 .pdf 的 src/href 场景
128
+
129
+ 返回绝对 URL;若找不到返回 None。
130
+ """
131
+ text = html
132
+
133
+ # 先尝试常见 onclick 跳转
134
+ patterns = [
135
+ # onclick="location.href='...pdf?...'" 或 document.location
136
+ r"onclick\s*=\s*[\"']\s*(?:document\.)?location\.href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
137
+ # iframe id="pdf" src="...pdf?..."
138
+ r"<iframe[^>]+id\s*=\s*[\"']pdf[\"'][^>]+src\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
139
+ # 通用 a 标签 href
140
+ r"<a[^>]+href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
141
+ # 通用任意 src/href
142
+ r"(?:src|href)\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
143
+ ]
144
+
145
+ for pat in patterns:
146
+ m = re.search(pat, text, flags=re.IGNORECASE | re.DOTALL)
147
+ if m:
148
+ got_url = m.group(1)
149
+ # 规范化为绝对 URL
150
+ if got_url.startswith("//"):
151
+ return "https:" + got_url
152
+ if got_url.startswith("http://") or got_url.startswith("https://"):
153
+ return got_url
154
+ # 其余按相对路径处理
155
+ return urljoin(self.base_url + "/", got_url.lstrip("/"))
156
+
157
+ return None
88
158
 
89
159
  def get_pdf_url(self):
90
160
  print("[bold #E6E6FA]-" * 120)
91
161
  print(f"DOI: {self.doi}")
92
162
  print(f"Requesting: {self.url}...")
93
163
  try:
94
- response = requests.get(self.url, headers=self.headers)
164
+ # 使用较小的超时时间避免长时间阻塞
165
+ response = requests.get(self.url, headers=self.headers, timeout=self.timeout_html)
95
166
  if response.status_code == 200:
96
167
  self.cookies = response.cookies
97
- text = response.text.replace("\\", "")
98
- # text = text.replace(' ', '') # It is important to remove the space
99
- # print(text)
100
- pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
101
- match = pattern.search(text)
102
- if match:
103
- got_url = match.group(1)
104
- if r"http" not in got_url:
105
- if got_url[:2] == "//":
106
- self.pdf_url = "https:" + got_url
107
- else:
108
- self.pdf_url = self.base_url + got_url
168
+ text = response.text
169
+ # 去除转义反斜杠,提升正则匹配成功率
170
+ text = text.replace("\\", "")
171
+
172
+ self.pdf_url = self._extract_pdf_url_from_html(text)
173
+ if self.pdf_url:
174
+ if self.debug:
175
+ print(f"Found PDF link: {self.pdf_url}")
109
176
  else:
110
- self.pdf_url = got_url
111
- print(f"URL: {self.pdf_url}")
177
+ print(f"Found PDF link (masked): .../{Path(self.pdf_url).name}")
112
178
  else:
113
- print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
179
+ print(
180
+ f"[bold #AFEEEE]The website {self.url_list[self.url_index]} does not expose a detectable PDF link (pattern mismatch)."
181
+ )
114
182
  self.try_times = self.try_times_each_url_max + 1
115
183
  else:
116
184
  print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
117
- print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
185
+ print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not include the PDF file (HTTP error).")
118
186
  self.try_times = self.try_times_each_url_max + 1
119
187
  except Exception as e:
120
188
  print(f"Failed to retrieve the webpage. Error: {e}")
@@ -178,7 +246,7 @@ class _Downloader:
178
246
  return
179
247
  print(f"Downloading: {self.fname}...")
180
248
  try:
181
- response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
249
+ response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=self.timeout_pdf)
182
250
  if response.status_code == 200:
183
251
  with open(self.fpath, "wb") as f:
184
252
  f.write(response.content)
@@ -224,7 +292,22 @@ def _read_txt(file):
224
292
  return lines
225
293
 
226
294
 
227
- def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r"DOI"):
295
+ def download5doi(
296
+ store_path=None,
297
+ doi_list=None,
298
+ txt_file=None,
299
+ excel_file=None,
300
+ col_name=r"DOI",
301
+ *,
302
+ probe_mirrors: bool = True,
303
+ min_size_kb: int = 50,
304
+ timeout_html: int = 15,
305
+ timeout_pdf: int = 30,
306
+ tries_each_url: int = 3,
307
+ sleep_secs: int = 5,
308
+ force: bool = False,
309
+ debug: bool = False,
310
+ ):
228
311
  """
229
312
  Description:
230
313
  Download PDF files by DOI.
@@ -260,11 +343,29 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None,
260
343
  doi_list = _read_txt(txt_file)
261
344
  if excel_file:
262
345
  doi_list = _read_excel(excel_file, col_name)
263
- remove(Path(store_path) / "wrong_record.txt")
346
+ # 去重并清洗
347
+ doi_list = [str(x).strip() for x in doi_list if str(x).strip()]
348
+ doi_list = list(dict.fromkeys(doi_list)) # 保序去重
349
+
350
+ # 只有在不是追加下载的场景下再清除 wrong_record
351
+ if not force:
352
+ remove(Path(store_path) / "wrong_record.txt")
264
353
  print(f"Downloading {len(doi_list)} PDF files...")
265
354
  for doi in track(doi_list, description="Downloading..."):
266
- download = _Downloader(doi, store_path)
267
- download.download_pdf()
355
+ dl = _Downloader(
356
+ doi,
357
+ store_path,
358
+ min_size_kb=min_size_kb,
359
+ timeout_html=timeout_html,
360
+ timeout_pdf=timeout_pdf,
361
+ sleep_secs=sleep_secs,
362
+ tries_each_url=tries_each_url,
363
+ debug=debug,
364
+ )
365
+ # 是否进行镜像探测
366
+ if probe_mirrors:
367
+ dl._ensure_alive_mirrors()
368
+ dl.download_pdf()
268
369
 
269
370
 
270
371
  if __name__ == "__main__":
oafuncs/oa_linux.py CHANGED
@@ -1,7 +1,8 @@
1
1
  from rich import print
2
+ import time
3
+ import os
2
4
 
3
-
4
- __all__ = ["os_command", "get_queue_node"]
5
+ __all__ = ["os_command", "get_queue_node", "query_queue", "running_jobs", "submit_job"]
5
6
 
6
7
 
7
8
  # 负责执行命令并返回输出
@@ -49,5 +50,81 @@ def get_queue_node():
49
50
 
50
51
  return queue_node_dict
51
52
 
53
+ def query_queue(need_node=1, queue_list =['dcu','bigmem','cpu_parallel','cpu_single']):
54
+ queue_dict = get_queue_node()
55
+ hs = None
56
+ for my_queue in queue_list:
57
+ if my_queue in queue_dict and queue_dict[my_queue] >= need_node:
58
+ # slurm_file = f'../run.slurm.{my_queue}'
59
+ hs = my_queue
60
+ break
61
+ return hs
62
+
63
+ def running_jobs():
64
+ # 通过qstat判断任务状态,是否还在进行中
65
+ # status = os.popen('qstat').read()
66
+ status = os.popen('squeue').read()
67
+ Jobs = status.split('\n')[1:]
68
+ ids = [job.split()[0] for job in Jobs if job != '']
69
+ return ids
70
+
71
+ def submit_job(working_dir=None, script_tmp='run.slurm', script_run='run.slurm', need_node=1, queue_tmp='<queue_name>', queue_list=['dcu', 'bigmem', 'cpu_parallel', 'cpu_single'], max_job=38, wait=False):
72
+ '''提交任务到集群,并返回任务ID'''
73
+ from .oa_file import replace_content
74
+ import datetime
75
+ if working_dir is None:
76
+ working_dir = os.getcwd()
77
+ os.chdir(working_dir)
78
+ print(f'切换工作目录到: {working_dir}')
79
+
80
+ if need_node > 1 and 'cpu_single' in queue_list:
81
+ queue_list.remove('cpu_single')
82
+
83
+ while True:
84
+ running_job = running_jobs()
85
+ if not running_job or len(running_job) < max_job:
86
+ queue = query_queue(need_node=need_node, queue_list=queue_list)
87
+ if queue:
88
+ replace_content(script_tmp, {f'{queue_tmp}': f"{queue}"}, False, f'{working_dir}', script_run)
89
+ print(f'找到计算资源,提交任务,队列:{queue}')
90
+ print(f'Time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
91
+ content_sub = os_command(f"sbatch {script_run}")
92
+ # 避免在 None 上使用 'in' 导致 TypeError:os_command 在失败时会返回 None
93
+ if not content_sub:
94
+ print('提交任务命令没有返回输出或返回了错误,等待30秒后重试!')
95
+ time.sleep(30)
96
+ else:
97
+ content_sub_lower = content_sub.lower()
98
+ if 'error' in content_sub_lower or 'failed' in content_sub_lower:
99
+ print('提交任务时出现错误(从输出检测到 error/failed),等待30秒后重试!')
100
+ print(f'命令输出: {content_sub.strip()}')
101
+ time.sleep(30)
102
+ else:
103
+ print(f'提交任务成功,{content_sub.strip()}')
104
+ job_id = content_sub.strip().split()[-1]
105
+ break
106
+ else:
107
+ print('没有足够的计算资源,等待30秒后重试!')
108
+ time.sleep(30)
109
+ else:
110
+ print(f'当前系统任务数:{len(running_job)},等待60秒后重试!')
111
+ time.sleep(60)
112
+ print(f'等待10秒后,继续检查任务状态!')
113
+ time.sleep(10)
114
+
115
+ if wait:
116
+ while True:
117
+ if job_id in running_jobs():
118
+ print(f'Time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
119
+ print(f'任务{job_id}正在任务队列中...')
120
+ time.sleep(60)
121
+ else:
122
+ print(f'任务{job_id}已完成!')
123
+ break
124
+ else:
125
+ print(f'任务{job_id}已提交,不等待其完成,继续执行后续操作!')
126
+
127
+ return job_id
128
+
52
129
  if __name__ == "__main__":
53
130
  pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: oafuncs
3
- Version: 0.0.98.45
3
+ Version: 0.0.98.47
4
4
  Summary: Oceanic and Atmospheric Functions
5
5
  Home-page: https://github.com/Industry-Pays/OAFuncs
6
6
  Author: Kun Liu
@@ -1,12 +1,12 @@
1
1
  oafuncs/__init__.py,sha256=G523BFVPxmODwq8j_88NYEiKbCzdQ3jfy51cmLeh7kM,1630
2
- oafuncs/oa_cmap.py,sha256=JwZMJ36uNwiCnzXqEtH2_PpeLtEaRaXP9YeGSl0PJSU,13886
2
+ oafuncs/oa_cmap.py,sha256=Mru5XvvBTfYNq8xjsBAGWppI7RGKzSh94glxP2SXomc,14221
3
3
  oafuncs/oa_data.py,sha256=CG2YHY_R6MFrPw3UznT4T8BE8yXdgBMnmdUAEdh9GAo,6506
4
4
  oafuncs/oa_date.py,sha256=aU2wVIWXyWoRiSQ9dg8sHvShFTxw86RrgbV3Q6tDjD4,6841
5
5
  oafuncs/oa_draw.py,sha256=zal0Y3RPpN0TCGN4Gw9qLtjQdT6V0ZqpSUBFVOPL0x4,13952
6
6
  oafuncs/oa_file.py,sha256=j9NOjxPOeAJsD5Zk4ODmFdVSSgr1CHVPvM1IHXy9RQA,17546
7
7
  oafuncs/oa_geo.py,sha256=UbzvUqgT2QP_9B7XSJRL1HDmGu0HnLC5nSP6ZrA5WH4,7177
8
8
  oafuncs/oa_help.py,sha256=0J5VaZX-cB0c090KxgmktQJBc0o00FsY-4wB8l5y00k,4178
9
- oafuncs/oa_linux.py,sha256=1fCmpgM33x3fLdQm8DOmaE4Pz4okXVNO0e-3svYnLgo,1850
9
+ oafuncs/oa_linux.py,sha256=reQYcjMff6mHuM5RXzoxM-4lgIYUcOoHp0u2fLY0cnU,5431
10
10
  oafuncs/oa_nc.py,sha256=j501NlTuvrDIwNLXbMfE7nPPXdbbL7u9PGDj2l5AtnI,16277
11
11
  oafuncs/oa_python.py,sha256=xYMQnM0cGq9xUCtcoMpnN0LG5Rc_s94tai5nC6CNJ3E,4831
12
12
  oafuncs/oa_tool.py,sha256=VHx15VqpbzNlVXh0-3nJqcDgLVaECMD1FvxJ_CrV39E,8046
@@ -26,7 +26,7 @@ oafuncs/oa_down/User_Agent-list.txt,sha256=pHaMlElMvZ8TG4vf4BqkZYKqe0JIGkr4kCN0l
26
26
  oafuncs/oa_down/__init__.py,sha256=IT6oTqaxuV_mC6AwBut0HtkmnVtEu1MyX0x0oS7TKoA,218
27
27
  oafuncs/oa_down/hycom_3hourly.py,sha256=dFXSC_5o-Dic616KrLXir4tEHvCiZt8vGKPEYpXFMmA,57356
28
28
  oafuncs/oa_down/idm.py,sha256=vAhRjt_Sb-rKhzFShmSf29QcFTqsHpHXCvTSD1uSXyQ,1455
29
- oafuncs/oa_down/literature.py,sha256=7Qy5OphcjdRwY2uZ5hmmgK36U_QtVmEUSW0vQaxihC8,10960
29
+ oafuncs/oa_down/literature.py,sha256=bqRSwYjsPO28WIohGIwPcj0xo9vAz5yhc8Ojx8ybh08,14924
30
30
  oafuncs/oa_down/read_proxy.py,sha256=HQpr-Mwn0Z8ICAuf63NUUY9p05E_uNWyWmOK46-73Ec,2866
31
31
  oafuncs/oa_down/test_ua.py,sha256=l8MCD6yU2W75zRPTDKUZTJhCWNF9lfk-MiSFqAqKH1M,1398
32
32
  oafuncs/oa_down/user_agent.py,sha256=LCVQUA60ukUqeJXgLktDHB2sh-ngk7AiX4sKj8w-X4A,416
@@ -39,8 +39,8 @@ oafuncs/oa_sign/__init__.py,sha256=JSx1fcWpmNhQBvX_Bmq3xysfSkkFMrjbJASxV_V6aqE,1
39
39
  oafuncs/oa_sign/meteorological.py,sha256=3MSjy7HTcvz2zsITkjUMr_0Y027Gas1LFE9pk99990k,6110
40
40
  oafuncs/oa_sign/ocean.py,sha256=3uYEzaq-27yVy23IQoqy-clhWu1I_fhPFBAQyT-OF4M,5562
41
41
  oafuncs/oa_sign/scientific.py,sha256=moIl2MEY4uitbXoD596JmXookXGQtQsS-8_1NBBTx84,4689
42
- oafuncs-0.0.98.45.dist-info/licenses/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
43
- oafuncs-0.0.98.45.dist-info/METADATA,sha256=JgC1BWV311BvpDhuyBam21v_fnaplfmdxKJkmakfM4s,4446
44
- oafuncs-0.0.98.45.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
- oafuncs-0.0.98.45.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
46
- oafuncs-0.0.98.45.dist-info/RECORD,,
42
+ oafuncs-0.0.98.47.dist-info/licenses/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
43
+ oafuncs-0.0.98.47.dist-info/METADATA,sha256=iJT9zUhIKuQGVu3zNAoQvKq9DyB6oMm-mlptyweqRR0,4446
44
+ oafuncs-0.0.98.47.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
+ oafuncs-0.0.98.47.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
46
+ oafuncs-0.0.98.47.dist-info/RECORD,,