oafuncs 0.0.98.45__py3-none-any.whl → 0.0.98.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of oafuncs might be problematic. Click here for more details.
- oafuncs/oa_cmap.py +3 -0
- oafuncs/oa_down/literature.py +265 -41
- oafuncs/oa_linux.py +57 -2
- {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.46.dist-info}/METADATA +1 -1
- {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.46.dist-info}/RECORD +8 -8
- {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.46.dist-info}/WHEEL +0 -0
- {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.46.dist-info}/licenses/LICENSE.txt +0 -0
- {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.46.dist-info}/top_level.txt +0 -0
oafuncs/oa_cmap.py
CHANGED
|
@@ -271,6 +271,9 @@ def get(colormap_name: Optional[str] = None, show_available: bool = False) -> Op
|
|
|
271
271
|
"diverging_4": ["#5DADE2", "#A2D9F7", "#D6EAF8", "#F2F3F4", "#FADBD8", "#F1948A", "#E74C3C"],
|
|
272
272
|
# ----------------------------------------------------------------------------
|
|
273
273
|
"colorful_1": ["#6d00db", "#9800cb", "#F2003C", "#ff4500", "#ff7f00", "#FE28A2", "#FFC0CB", "#DDA0DD", "#40E0D0", "#1a66f2", "#00f7fb", "#8fff88", "#E3FF00"],
|
|
274
|
+
# ----------------------------------------------------------------------------
|
|
275
|
+
"increasing_1": ["#FFFFFF", "#E6F7FF", "#A5E6F8", "#049CD4", "#11B5A3", "#04BC4C", "#74CC54", "#D9DD5C", "#FB922E", "#FC2224", "#E51C18", "#8B0000"],
|
|
276
|
+
# ----------------------------------------------------------------------------
|
|
274
277
|
}
|
|
275
278
|
|
|
276
279
|
if show_available:
|
oafuncs/oa_down/literature.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
|
2
2
|
import re
|
|
3
3
|
import time
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
from urllib.parse import urljoin
|
|
5
6
|
|
|
6
7
|
import pandas as pd
|
|
7
8
|
import requests
|
|
@@ -11,7 +12,7 @@ from oafuncs.oa_down.user_agent import get_ua
|
|
|
11
12
|
from oafuncs.oa_file import remove
|
|
12
13
|
from oafuncs.oa_data import ensure_list
|
|
13
14
|
|
|
14
|
-
__all__ = ["download5doi"]
|
|
15
|
+
__all__ = ["download5doi", "download5doi_via_unpaywall"]
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
def _get_file_size(file_path, unit="KB"):
|
|
@@ -46,75 +47,142 @@ class _Downloader:
|
|
|
46
47
|
根据doi下载文献pdf
|
|
47
48
|
"""
|
|
48
49
|
|
|
49
|
-
|
|
50
|
+
# 进程级缓存:首次探测后的可用镜像列表,后续复用
|
|
51
|
+
_alive_mirrors_cache: list[str] | None = None
|
|
52
|
+
|
|
53
|
+
def __init__(self, doi, store_path, *, min_size_kb=50, timeout_html=15, timeout_pdf=30, sleep_secs=5, tries_each_url=3, debug=False):
|
|
50
54
|
self.url_list = [
|
|
51
55
|
r"https://sci-hub.se",
|
|
52
56
|
r"https://sci-hub.ren",
|
|
53
57
|
r"https://sci-hub.st",
|
|
54
|
-
r"https://sci-hub.ru",
|
|
58
|
+
r"https://sci-hub.ru", # 最好用的一个网站
|
|
55
59
|
# ------------------------------------- 以下网站没验证
|
|
56
|
-
r"https://sci-hub.wf",
|
|
57
|
-
r"https://sci-hub.yt",
|
|
58
|
-
r"https://sci-hub.ee",
|
|
59
|
-
r"https://sci-hub.cat",
|
|
60
60
|
r"https://sci-hub.in",
|
|
61
|
-
r"https://
|
|
62
|
-
r"https://sci-hub.vkif.top",
|
|
63
|
-
r"https://www.bothonce.com",
|
|
64
|
-
r"https://sci-hub.et-fine.com",
|
|
65
|
-
r"https://sci-hub.hkvisa.net",
|
|
66
|
-
# r"https://sci-hub.3800808.com", # 这个只能手动保存
|
|
67
|
-
r"https://sci-hub.zidianzhan.net",
|
|
68
|
-
r"https://sci-hub.usualwant.com",
|
|
61
|
+
r"https://sci-hub.hlgczx.com/",
|
|
69
62
|
]
|
|
70
63
|
self.base_url = None
|
|
71
64
|
self.url = None
|
|
72
65
|
self.doi = doi
|
|
73
66
|
self.pdf_url = None
|
|
74
67
|
self.pdf_path = None
|
|
75
|
-
|
|
68
|
+
# requests 期望 header 值为 str,这里确保 UA 是字符串而不是 bytes
|
|
69
|
+
self.headers = {"User-Agent": str(get_ua())}
|
|
76
70
|
# 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
|
|
77
71
|
# self.fname = doi.replace(r'/', '_') + '.pdf'
|
|
78
72
|
self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
|
|
79
73
|
self.store_path = Path(store_path)
|
|
80
74
|
self.fpath = self.store_path / self.fname
|
|
81
75
|
self.wrong_record_file = self.store_path / "wrong_record.txt"
|
|
82
|
-
self.sleep =
|
|
76
|
+
self.sleep = sleep_secs
|
|
83
77
|
self.cookies = None
|
|
84
|
-
self.check_size =
|
|
78
|
+
self.check_size = max(1, int(min_size_kb))
|
|
85
79
|
self.url_index = 0
|
|
86
|
-
self.try_times_each_url_max =
|
|
80
|
+
self.try_times_each_url_max = max(1, int(tries_each_url))
|
|
87
81
|
self.try_times = 0
|
|
82
|
+
self.timeout_html = max(5, int(timeout_html))
|
|
83
|
+
self.timeout_pdf = max(5, int(timeout_pdf))
|
|
84
|
+
self.debug = bool(debug)
|
|
85
|
+
|
|
86
|
+
# ---------------- 镜像可用性探测 ----------------
|
|
87
|
+
def _is_mirror_alive(self, base_url: str) -> bool:
|
|
88
|
+
"""
|
|
89
|
+
仅检测镜像根路径是否可连通(HTTP 200 即认为可用)。
|
|
90
|
+
不访问具体 DOI,避免被动触发风控;只做连通性筛查。
|
|
91
|
+
"""
|
|
92
|
+
try:
|
|
93
|
+
r = requests.get(base_url + "/", headers=self.headers, timeout=8, allow_redirects=True)
|
|
94
|
+
return 200 <= r.status_code < 400
|
|
95
|
+
except Exception:
|
|
96
|
+
return False
|
|
97
|
+
|
|
98
|
+
def _ensure_alive_mirrors(self):
|
|
99
|
+
# 若已经有进程级缓存,直接复用
|
|
100
|
+
if _Downloader._alive_mirrors_cache is not None:
|
|
101
|
+
self.url_list = list(_Downloader._alive_mirrors_cache)
|
|
102
|
+
return
|
|
103
|
+
|
|
104
|
+
print(f"[bold cyan]Probing mirrors connectivity (first run)...")
|
|
105
|
+
alive = []
|
|
106
|
+
for base in self.url_list:
|
|
107
|
+
ok = self._is_mirror_alive(base)
|
|
108
|
+
status = "OK" if ok else "DOWN"
|
|
109
|
+
print(f" [{status}] {base}")
|
|
110
|
+
if ok:
|
|
111
|
+
alive.append(base)
|
|
112
|
+
if alive:
|
|
113
|
+
_Downloader._alive_mirrors_cache = alive
|
|
114
|
+
self.url_list = alive
|
|
115
|
+
print(f"[bold cyan]Alive mirrors: {len(alive)}; pruned {len(set(self.url_list)) - len(alive) if self.url_list else 0}.")
|
|
116
|
+
else:
|
|
117
|
+
print("[bold yellow]No mirror passed probe; keep original list for fallback attempts.")
|
|
118
|
+
|
|
119
|
+
def _extract_pdf_url_from_html(self, html: str) -> str | None:
|
|
120
|
+
"""
|
|
121
|
+
从 Sci-Hub 页面 HTML 中尽可能稳健地提取 PDF 链接。
|
|
122
|
+
|
|
123
|
+
兼容多种模式:
|
|
124
|
+
- onclick="location.href='...pdf?download=true'"
|
|
125
|
+
- <iframe id="pdf" src="...pdf?...">
|
|
126
|
+
- <a ... href="...pdf?...">
|
|
127
|
+
- 其他出现 .pdf 的 src/href 场景
|
|
128
|
+
|
|
129
|
+
返回绝对 URL;若找不到返回 None。
|
|
130
|
+
"""
|
|
131
|
+
text = html
|
|
132
|
+
|
|
133
|
+
# 先尝试常见 onclick 跳转
|
|
134
|
+
patterns = [
|
|
135
|
+
# onclick="location.href='...pdf?...'" 或 document.location
|
|
136
|
+
r"onclick\s*=\s*[\"']\s*(?:document\.)?location\.href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
|
|
137
|
+
# iframe id="pdf" src="...pdf?..."
|
|
138
|
+
r"<iframe[^>]+id\s*=\s*[\"']pdf[\"'][^>]+src\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
|
|
139
|
+
# 通用 a 标签 href
|
|
140
|
+
r"<a[^>]+href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
|
|
141
|
+
# 通用任意 src/href
|
|
142
|
+
r"(?:src|href)\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
|
|
143
|
+
]
|
|
144
|
+
|
|
145
|
+
for pat in patterns:
|
|
146
|
+
m = re.search(pat, text, flags=re.IGNORECASE | re.DOTALL)
|
|
147
|
+
if m:
|
|
148
|
+
got_url = m.group(1)
|
|
149
|
+
# 规范化为绝对 URL
|
|
150
|
+
if got_url.startswith("//"):
|
|
151
|
+
return "https:" + got_url
|
|
152
|
+
if got_url.startswith("http://") or got_url.startswith("https://"):
|
|
153
|
+
return got_url
|
|
154
|
+
# 其余按相对路径处理
|
|
155
|
+
return urljoin(self.base_url + "/", got_url.lstrip("/"))
|
|
156
|
+
|
|
157
|
+
return None
|
|
88
158
|
|
|
89
159
|
def get_pdf_url(self):
|
|
90
160
|
print("[bold #E6E6FA]-" * 120)
|
|
91
161
|
print(f"DOI: {self.doi}")
|
|
92
162
|
print(f"Requesting: {self.url}...")
|
|
93
163
|
try:
|
|
94
|
-
|
|
164
|
+
# 使用较小的超时时间避免长时间阻塞
|
|
165
|
+
response = requests.get(self.url, headers=self.headers, timeout=self.timeout_html)
|
|
95
166
|
if response.status_code == 200:
|
|
96
167
|
self.cookies = response.cookies
|
|
97
|
-
text = response.text
|
|
98
|
-
#
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
if
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
if got_url[:2] == "//":
|
|
106
|
-
self.pdf_url = "https:" + got_url
|
|
107
|
-
else:
|
|
108
|
-
self.pdf_url = self.base_url + got_url
|
|
168
|
+
text = response.text
|
|
169
|
+
# 去除转义反斜杠,提升正则匹配成功率
|
|
170
|
+
text = text.replace("\\", "")
|
|
171
|
+
|
|
172
|
+
self.pdf_url = self._extract_pdf_url_from_html(text)
|
|
173
|
+
if self.pdf_url:
|
|
174
|
+
if self.debug:
|
|
175
|
+
print(f"Found PDF link: {self.pdf_url}")
|
|
109
176
|
else:
|
|
110
|
-
self.pdf_url
|
|
111
|
-
print(f"URL: {self.pdf_url}")
|
|
177
|
+
print(f"Found PDF link (masked): .../{Path(self.pdf_url).name}")
|
|
112
178
|
else:
|
|
113
|
-
print(
|
|
179
|
+
print(
|
|
180
|
+
f"[bold #AFEEEE]The website {self.url_list[self.url_index]} does not expose a detectable PDF link (pattern mismatch)."
|
|
181
|
+
)
|
|
114
182
|
self.try_times = self.try_times_each_url_max + 1
|
|
115
183
|
else:
|
|
116
184
|
print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
|
|
117
|
-
print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not
|
|
185
|
+
print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not include the PDF file (HTTP error).")
|
|
118
186
|
self.try_times = self.try_times_each_url_max + 1
|
|
119
187
|
except Exception as e:
|
|
120
188
|
print(f"Failed to retrieve the webpage. Error: {e}")
|
|
@@ -178,7 +246,7 @@ class _Downloader:
|
|
|
178
246
|
return
|
|
179
247
|
print(f"Downloading: {self.fname}...")
|
|
180
248
|
try:
|
|
181
|
-
response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
|
|
249
|
+
response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=self.timeout_pdf)
|
|
182
250
|
if response.status_code == 200:
|
|
183
251
|
with open(self.fpath, "wb") as f:
|
|
184
252
|
f.write(response.content)
|
|
@@ -224,7 +292,22 @@ def _read_txt(file):
|
|
|
224
292
|
return lines
|
|
225
293
|
|
|
226
294
|
|
|
227
|
-
def download5doi(
|
|
295
|
+
def download5doi(
|
|
296
|
+
store_path=None,
|
|
297
|
+
doi_list=None,
|
|
298
|
+
txt_file=None,
|
|
299
|
+
excel_file=None,
|
|
300
|
+
col_name=r"DOI",
|
|
301
|
+
*,
|
|
302
|
+
probe_mirrors: bool = True,
|
|
303
|
+
min_size_kb: int = 50,
|
|
304
|
+
timeout_html: int = 15,
|
|
305
|
+
timeout_pdf: int = 30,
|
|
306
|
+
tries_each_url: int = 3,
|
|
307
|
+
sleep_secs: int = 5,
|
|
308
|
+
force: bool = False,
|
|
309
|
+
debug: bool = False,
|
|
310
|
+
):
|
|
228
311
|
"""
|
|
229
312
|
Description:
|
|
230
313
|
Download PDF files by DOI.
|
|
@@ -260,11 +343,152 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None,
|
|
|
260
343
|
doi_list = _read_txt(txt_file)
|
|
261
344
|
if excel_file:
|
|
262
345
|
doi_list = _read_excel(excel_file, col_name)
|
|
263
|
-
|
|
346
|
+
# 去重并清洗
|
|
347
|
+
doi_list = [str(x).strip() for x in doi_list if str(x).strip()]
|
|
348
|
+
doi_list = list(dict.fromkeys(doi_list)) # 保序去重
|
|
349
|
+
|
|
350
|
+
# 只有在不是追加下载的场景下再清除 wrong_record
|
|
351
|
+
if not force:
|
|
352
|
+
remove(Path(store_path) / "wrong_record.txt")
|
|
264
353
|
print(f"Downloading {len(doi_list)} PDF files...")
|
|
265
354
|
for doi in track(doi_list, description="Downloading..."):
|
|
266
|
-
|
|
267
|
-
|
|
355
|
+
dl = _Downloader(
|
|
356
|
+
doi,
|
|
357
|
+
store_path,
|
|
358
|
+
min_size_kb=min_size_kb,
|
|
359
|
+
timeout_html=timeout_html,
|
|
360
|
+
timeout_pdf=timeout_pdf,
|
|
361
|
+
sleep_secs=sleep_secs,
|
|
362
|
+
tries_each_url=tries_each_url,
|
|
363
|
+
debug=debug,
|
|
364
|
+
)
|
|
365
|
+
# 是否进行镜像探测
|
|
366
|
+
if probe_mirrors:
|
|
367
|
+
dl._ensure_alive_mirrors()
|
|
368
|
+
dl.download_pdf()
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
# ------------------------------- 合规替代方案(Open Access 优先) -------------------------------
|
|
372
|
+
def _get_oa_pdf_url_from_unpaywall(doi: str, email: str | None) -> str | None:
|
|
373
|
+
"""
|
|
374
|
+
通过 Unpaywall 获取可开放访问的 PDF 链接(若存在)。
|
|
375
|
+
需要提供 email(Unpaywall 要求标识邮件)。
|
|
376
|
+
返回 PDF URL 或 None。
|
|
377
|
+
"""
|
|
378
|
+
if not email:
|
|
379
|
+
print("[bold yellow]Unpaywall 需要 email 参数;请提供 email 以查询 OA 链接。")
|
|
380
|
+
return None
|
|
381
|
+
api = f"https://api.unpaywall.org/v2/{doi}?email={email}"
|
|
382
|
+
try:
|
|
383
|
+
r = requests.get(api, timeout=15)
|
|
384
|
+
if r.status_code != 200:
|
|
385
|
+
print(f"[bold yellow]Unpaywall 查询失败: HTTP {r.status_code}")
|
|
386
|
+
return None
|
|
387
|
+
data = r.json()
|
|
388
|
+
loc = data.get("best_oa_location") or {}
|
|
389
|
+
url_for_pdf = loc.get("url_for_pdf") or loc.get("url")
|
|
390
|
+
if url_for_pdf and url_for_pdf.lower().endswith(".pdf"):
|
|
391
|
+
return url_for_pdf
|
|
392
|
+
# 有些 OA 链接是落在 landing page,再尝试从记录的所有位置挑选 pdf
|
|
393
|
+
for k in ("oa_locations", "oa_location"):
|
|
394
|
+
entries = data.get(k) or []
|
|
395
|
+
if isinstance(entries, dict):
|
|
396
|
+
entries = [entries]
|
|
397
|
+
for e in entries:
|
|
398
|
+
u = e.get("url_for_pdf") or e.get("url")
|
|
399
|
+
if u and ".pdf" in u.lower():
|
|
400
|
+
return u
|
|
401
|
+
except Exception as e:
|
|
402
|
+
print(f"[bold yellow]Unpaywall 查询异常: {e}")
|
|
403
|
+
return None
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _download_pdf_from_url(url: str, dest_path: Path, headers: dict | None = None) -> bool:
|
|
407
|
+
"""
|
|
408
|
+
给定合法的 PDF 下载 URL,下载保存到 dest_path。
|
|
409
|
+
返回 True/False 表示是否成功。
|
|
410
|
+
"""
|
|
411
|
+
headers = headers or {"User-Agent": str(get_ua()), "Accept": "application/pdf"}
|
|
412
|
+
try:
|
|
413
|
+
with requests.get(url, headers=headers, stream=True, timeout=30) as r:
|
|
414
|
+
if r.status_code != 200 or "application/pdf" not in r.headers.get("Content-Type", "").lower():
|
|
415
|
+
# 仍可能是 PDF(某些服务器未正确设置头),尝试保存但标注提示
|
|
416
|
+
if r.status_code != 200:
|
|
417
|
+
print(f"[bold yellow]下载失败: HTTP {r.status_code}")
|
|
418
|
+
return False
|
|
419
|
+
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
|
420
|
+
with open(dest_path, "wb") as f:
|
|
421
|
+
for chunk in r.iter_content(chunk_size=8192):
|
|
422
|
+
if chunk:
|
|
423
|
+
f.write(chunk)
|
|
424
|
+
return True
|
|
425
|
+
except Exception as e:
|
|
426
|
+
print(f"[bold yellow]下载异常: {e}")
|
|
427
|
+
return False
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def download5doi_via_unpaywall(
|
|
431
|
+
store_path=None,
|
|
432
|
+
doi_list=None,
|
|
433
|
+
txt_file=None,
|
|
434
|
+
excel_file=None,
|
|
435
|
+
col_name=r"DOI",
|
|
436
|
+
email: str | None = None,
|
|
437
|
+
):
|
|
438
|
+
"""
|
|
439
|
+
优先使用 Unpaywall 获取开放访问(OA)的 PDF 并下载,避免非合规站点。
|
|
440
|
+
|
|
441
|
+
参数:
|
|
442
|
+
store_path: 保存目录
|
|
443
|
+
doi_list/txt_file/excel_file/col_name: 同 download5doi
|
|
444
|
+
email: 用于访问 Unpaywall API 的邮箱(必填,否则无法查询)
|
|
445
|
+
"""
|
|
446
|
+
if not store_path:
|
|
447
|
+
store_path = Path.cwd()
|
|
448
|
+
else:
|
|
449
|
+
store_path = Path(str(store_path))
|
|
450
|
+
store_path.mkdir(parents=True, exist_ok=True)
|
|
451
|
+
|
|
452
|
+
if doi_list:
|
|
453
|
+
doi_list = ensure_list(doi_list)
|
|
454
|
+
if txt_file:
|
|
455
|
+
doi_list = _read_txt(txt_file)
|
|
456
|
+
if excel_file:
|
|
457
|
+
doi_list = _read_excel(excel_file, col_name)
|
|
458
|
+
|
|
459
|
+
if not doi_list:
|
|
460
|
+
print("[bold yellow]未提供 DOI 列表。")
|
|
461
|
+
return
|
|
462
|
+
|
|
463
|
+
print(f"[bold cyan]通过 Unpaywall 尝试下载 {len(doi_list)} 篇 OA PDF...")
|
|
464
|
+
ok, miss = 0, 0
|
|
465
|
+
for doi in track(doi_list, description="OA downloading..."):
|
|
466
|
+
# 规范化文件名
|
|
467
|
+
fname = re.sub(r'[/<>:"?*|]', "_", str(doi)) + ".pdf"
|
|
468
|
+
dest = store_path / fname
|
|
469
|
+
if dest.exists() and _get_file_size(dest, unit="KB") > 10:
|
|
470
|
+
ok += 1
|
|
471
|
+
continue
|
|
472
|
+
|
|
473
|
+
pdf_url = _get_oa_pdf_url_from_unpaywall(str(doi), email=email)
|
|
474
|
+
if not pdf_url:
|
|
475
|
+
miss += 1
|
|
476
|
+
print(f"[bold yellow]未找到 OA PDF: {doi}")
|
|
477
|
+
continue
|
|
478
|
+
|
|
479
|
+
if _download_pdf_from_url(pdf_url, dest):
|
|
480
|
+
size_kb = _get_file_size(dest, unit="KB")
|
|
481
|
+
if isinstance(size_kb, (int, float)) and size_kb < 10:
|
|
482
|
+
dest.unlink(missing_ok=True)
|
|
483
|
+
miss += 1
|
|
484
|
+
print(f"[bold yellow]文件过小,疑似异常,已删除: {dest}")
|
|
485
|
+
else:
|
|
486
|
+
ok += 1
|
|
487
|
+
print(f"[bold green]已下载: {dest}")
|
|
488
|
+
else:
|
|
489
|
+
miss += 1
|
|
490
|
+
|
|
491
|
+
print(f"[bold]完成。成功 {ok} 篇,未获取 {miss} 篇(可能无 OA 版本或需机构访问)。")
|
|
268
492
|
|
|
269
493
|
|
|
270
494
|
if __name__ == "__main__":
|
oafuncs/oa_linux.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from rich import print
|
|
2
|
+
import time
|
|
3
|
+
import os
|
|
2
4
|
|
|
3
|
-
|
|
4
|
-
__all__ = ["os_command", "get_queue_node"]
|
|
5
|
+
__all__ = ["os_command", "get_queue_node", "query_queue", "running_jobs", "submit_job"]
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
# 负责执行命令并返回输出
|
|
@@ -49,5 +50,59 @@ def get_queue_node():
|
|
|
49
50
|
|
|
50
51
|
return queue_node_dict
|
|
51
52
|
|
|
53
|
+
def query_queue(need_node=1, queue_list=('dcu', 'bigmem', 'cpu_parallel', 'cpu_single')):
    """
    Pick the first queue that satisfies the node requirement.

    Args:
        need_node: Threshold compared against each queue's value in the
            mapping returned by ``get_queue_node()``.
        queue_list: Queue names to try, in priority order.

    Returns:
        The first name in *queue_list* that is present in the mapping with a
        value of at least *need_node*, or None when no queue qualifies.
    """
    # NOTE(review): guards against get_queue_node() yielding None — confirm
    # whether that can actually happen when the underlying command fails.
    queue_dict = get_queue_node() or {}
    return next(
        (name for name in queue_list if name in queue_dict and queue_dict[name] >= need_node),
        None,
    )
|
|
62
|
+
|
|
63
|
+
def running_jobs():
    """
    List the job ids reported by ``squeue``.

    The first output line is skipped as the header; the first
    whitespace-separated field of every remaining non-blank line is taken
    as the job id.

    Returns:
        A list of job-id strings; empty when ``squeue`` prints nothing
        beyond the header.
    """
    # Context manager so the pipe is closed instead of leaked.
    with os.popen('squeue') as pipe:
        status = pipe.read()

    rows = (line.split() for line in status.split('\n')[1:])
    # Skip blank or whitespace-only lines, which have no first field.
    return [fields[0] for fields in rows if fields]
|
|
70
|
+
|
|
71
|
+
def submit_job(working_dir, script_tmp='run.slurm', script_run='run.slurm', need_node=1, queue_tmp='<queue_name>', queue_list=('dcu', 'bigmem', 'cpu_parallel', 'cpu_single'), max_job=38):
    """
    Keep retrying until a job has been submitted with ``sbatch``.

    Each round checks the number of jobs reported by ``running_jobs()``,
    asks ``query_queue()`` for a usable queue, writes that queue name into
    the script via ``replace_content`` and submits it. Every unsuccessful
    round sleeps and then tries again.

    Args:
        working_dir: Directory to ``chdir`` into before doing anything else.
        script_tmp: Template script containing the *queue_tmp* placeholder.
        script_run: Script name handed to ``replace_content`` and ``sbatch``.
        need_node: Node requirement forwarded to ``query_queue``.
        queue_tmp: Placeholder text replaced by the chosen queue name.
        queue_list: Queue names to try, in priority order.
        max_job: Submission waits while the job count is at or above this.

    Returns:
        None, once a submission produced output without "error"/"failed".
    """
    from .oa_file import replace_content
    import datetime

    os.chdir(working_dir)
    print(f'切换工作目录到: {working_dir}')
    while True:
        running_job = running_jobs()
        if running_job and len(running_job) >= max_job:
            print(f'当前系统任务数:{len(running_job)},等待60秒后重试!')
            time.sleep(60)
        else:
            queue = query_queue(need_node=need_node, queue_list=queue_list)
            if not queue:
                print('没有足够的计算资源,等待30秒后重试!')
                time.sleep(30)
            else:
                # NOTE(review): with the defaults script_tmp == script_run, so
                # the placeholder may already be gone on a retry — confirm
                # against replace_content's behavior.
                replace_content(script_tmp, {str(queue_tmp): str(queue)}, False, str(working_dir), script_run)
                print(f'找到计算资源,提交任务,队列:{queue}')
                print(f'Time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
                content_sub = os_command(f"sbatch {script_run}")
                if not content_sub:
                    # No output (os_command may return None): checked first so
                    # the substring tests below never run on None.
                    print('提交任务命令没有返回输出或返回了错误,等待30秒后重试!')
                    time.sleep(30)
                else:
                    content_sub_lower = content_sub.lower()
                    if 'error' in content_sub_lower or 'failed' in content_sub_lower:
                        print('提交任务时出现错误(从输出检测到 error/failed),等待30秒后重试!')
                        print(f'命令输出: {content_sub.strip()}')
                        time.sleep(30)
                    else:
                        print(f'提交任务成功,{content_sub.strip()}')
                        break
        # Reached on every unsuccessful round: short pause before re-checking.
        print('等待10秒后,继续检查任务状态!')
        time.sleep(10)
|
|
106
|
+
|
|
52
107
|
if __name__ == "__main__":
|
|
53
108
|
pass
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
oafuncs/__init__.py,sha256=G523BFVPxmODwq8j_88NYEiKbCzdQ3jfy51cmLeh7kM,1630
|
|
2
|
-
oafuncs/oa_cmap.py,sha256=
|
|
2
|
+
oafuncs/oa_cmap.py,sha256=Mru5XvvBTfYNq8xjsBAGWppI7RGKzSh94glxP2SXomc,14221
|
|
3
3
|
oafuncs/oa_data.py,sha256=CG2YHY_R6MFrPw3UznT4T8BE8yXdgBMnmdUAEdh9GAo,6506
|
|
4
4
|
oafuncs/oa_date.py,sha256=aU2wVIWXyWoRiSQ9dg8sHvShFTxw86RrgbV3Q6tDjD4,6841
|
|
5
5
|
oafuncs/oa_draw.py,sha256=zal0Y3RPpN0TCGN4Gw9qLtjQdT6V0ZqpSUBFVOPL0x4,13952
|
|
6
6
|
oafuncs/oa_file.py,sha256=j9NOjxPOeAJsD5Zk4ODmFdVSSgr1CHVPvM1IHXy9RQA,17546
|
|
7
7
|
oafuncs/oa_geo.py,sha256=UbzvUqgT2QP_9B7XSJRL1HDmGu0HnLC5nSP6ZrA5WH4,7177
|
|
8
8
|
oafuncs/oa_help.py,sha256=0J5VaZX-cB0c090KxgmktQJBc0o00FsY-4wB8l5y00k,4178
|
|
9
|
-
oafuncs/oa_linux.py,sha256=
|
|
9
|
+
oafuncs/oa_linux.py,sha256=eijpxTopzL3GpE5AIzis9vdrbm-A7QBeQesA-divBjE,4627
|
|
10
10
|
oafuncs/oa_nc.py,sha256=j501NlTuvrDIwNLXbMfE7nPPXdbbL7u9PGDj2l5AtnI,16277
|
|
11
11
|
oafuncs/oa_python.py,sha256=xYMQnM0cGq9xUCtcoMpnN0LG5Rc_s94tai5nC6CNJ3E,4831
|
|
12
12
|
oafuncs/oa_tool.py,sha256=VHx15VqpbzNlVXh0-3nJqcDgLVaECMD1FvxJ_CrV39E,8046
|
|
@@ -26,7 +26,7 @@ oafuncs/oa_down/User_Agent-list.txt,sha256=pHaMlElMvZ8TG4vf4BqkZYKqe0JIGkr4kCN0l
|
|
|
26
26
|
oafuncs/oa_down/__init__.py,sha256=IT6oTqaxuV_mC6AwBut0HtkmnVtEu1MyX0x0oS7TKoA,218
|
|
27
27
|
oafuncs/oa_down/hycom_3hourly.py,sha256=dFXSC_5o-Dic616KrLXir4tEHvCiZt8vGKPEYpXFMmA,57356
|
|
28
28
|
oafuncs/oa_down/idm.py,sha256=vAhRjt_Sb-rKhzFShmSf29QcFTqsHpHXCvTSD1uSXyQ,1455
|
|
29
|
-
oafuncs/oa_down/literature.py,sha256=
|
|
29
|
+
oafuncs/oa_down/literature.py,sha256=umz8bqYoVJiFkFviK970iOL7sfwbVWuqHPgRs3a199I,19806
|
|
30
30
|
oafuncs/oa_down/read_proxy.py,sha256=HQpr-Mwn0Z8ICAuf63NUUY9p05E_uNWyWmOK46-73Ec,2866
|
|
31
31
|
oafuncs/oa_down/test_ua.py,sha256=l8MCD6yU2W75zRPTDKUZTJhCWNF9lfk-MiSFqAqKH1M,1398
|
|
32
32
|
oafuncs/oa_down/user_agent.py,sha256=LCVQUA60ukUqeJXgLktDHB2sh-ngk7AiX4sKj8w-X4A,416
|
|
@@ -39,8 +39,8 @@ oafuncs/oa_sign/__init__.py,sha256=JSx1fcWpmNhQBvX_Bmq3xysfSkkFMrjbJASxV_V6aqE,1
|
|
|
39
39
|
oafuncs/oa_sign/meteorological.py,sha256=3MSjy7HTcvz2zsITkjUMr_0Y027Gas1LFE9pk99990k,6110
|
|
40
40
|
oafuncs/oa_sign/ocean.py,sha256=3uYEzaq-27yVy23IQoqy-clhWu1I_fhPFBAQyT-OF4M,5562
|
|
41
41
|
oafuncs/oa_sign/scientific.py,sha256=moIl2MEY4uitbXoD596JmXookXGQtQsS-8_1NBBTx84,4689
|
|
42
|
-
oafuncs-0.0.98.
|
|
43
|
-
oafuncs-0.0.98.
|
|
44
|
-
oafuncs-0.0.98.
|
|
45
|
-
oafuncs-0.0.98.
|
|
46
|
-
oafuncs-0.0.98.
|
|
42
|
+
oafuncs-0.0.98.46.dist-info/licenses/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
|
|
43
|
+
oafuncs-0.0.98.46.dist-info/METADATA,sha256=ZeQYycohu3zboTLafN-CHlEwkhmixvmazKaLAADhFpI,4446
|
|
44
|
+
oafuncs-0.0.98.46.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
45
|
+
oafuncs-0.0.98.46.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
|
|
46
|
+
oafuncs-0.0.98.46.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|