oafuncs 0.0.98.44__py3-none-any.whl → 0.0.98.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oafuncs/__init__.py CHANGED
@@ -42,4 +42,6 @@ from .oa_tool import *
 from .oa_date import *
 # ------------------- 2025-03-27 16:56:57 -------------------
 from .oa_geo import *
-# ------------------- 2025-09-04 14:08:26 -------------------
+# ------------------- 2025-09-04 14:08:26 -------------------
+from .oa_linux import *
+# ------------------- 2025-09-14 12:30:00 -------------------
oafuncs/_script/email.py CHANGED
@@ -1,14 +1,9 @@
-import random
-import smtplib
-from email.header import Header
-from email.mime.multipart import MIMEMultipart
-from email.mime.text import MIMEText
-
 from rich import print
 
 __all__ = ["send"]
 
 def _email_info():
+    import random
     email_dict = {
         "liukun0312@vip.qq.com": [4, 13, -10, 2, -10, 4, -7, -8, 8, -1, 3, -2, -11, -6, -9, -7],
         "756866877@qq.com": [4, -2, -3, 13, 12, 8, -6, 9, -12, 13, -10, -12, -11, -12, -4, -11],
@@ -26,6 +21,10 @@ def _decode_password(password):
 
 
 def _send_message(title, content, msg_to):
+    from email.header import Header
+    from email.mime.multipart import MIMEMultipart
+    from email.mime.text import MIMEText
+    import smtplib
     # 1. Connect to the mail server
     con = smtplib.SMTP_SSL("smtp.qq.com", 465)
 
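The email.py change above is a lazy-import refactor: stdlib modules move from module scope into the functions that use them, so `import oafuncs` no longer pays their load cost up front. A minimal sketch of the pattern (hypothetical function, not oafuncs code):

    def send_report(subject: str, body: str) -> None:
        # Deferred imports: resolved on first call, then cached in sys.modules,
        # so only the first invocation pays the import cost.
        import smtplib
        from email.mime.text import MIMEText

        msg = MIMEText(body, "plain", "utf-8")
        msg["Subject"] = subject
        with smtplib.SMTP_SSL("smtp.qq.com", 465) as con:
            pass  # con.login(...) and con.sendmail(...) would go here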
oafuncs/oa_cmap.py CHANGED
@@ -271,6 +271,9 @@ def get(colormap_name: Optional[str] = None, show_available: bool = False) -> Op
         "diverging_4": ["#5DADE2", "#A2D9F7", "#D6EAF8", "#F2F3F4", "#FADBD8", "#F1948A", "#E74C3C"],
         # ----------------------------------------------------------------------------
         "colorful_1": ["#6d00db", "#9800cb", "#F2003C", "#ff4500", "#ff7f00", "#FE28A2", "#FFC0CB", "#DDA0DD", "#40E0D0", "#1a66f2", "#00f7fb", "#8fff88", "#E3FF00"],
+        # ----------------------------------------------------------------------------
+        "increasing_1": ["#FFFFFF", "#E6F7FF", "#A5E6F8", "#049CD4", "#11B5A3", "#04BC4C", "#74CC54", "#D9DD5C", "#FB922E", "#FC2224", "#E51C18", "#8B0000"],
+        # ----------------------------------------------------------------------------
     }
 
     if show_available:
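Usage sketch for the new "increasing_1" ramp, assuming get() resolves the name into a matplotlib colormap as its signature suggests:

    import numpy as np
    import matplotlib.pyplot as plt
    from oafuncs import oa_cmap

    # "increasing_1" runs white -> blue -> green -> orange -> dark red,
    # a natural fit for monotonically increasing fields (speed, error, ...)
    cmap = oa_cmap.get("increasing_1")
    plt.pcolormesh(np.random.rand(40, 40), cmap=cmap)
    plt.colorbar()
    plt.show()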
oafuncs/oa_down/literature.py CHANGED
@@ -2,6 +2,7 @@ import os
 import re
 import time
 from pathlib import Path
+from urllib.parse import urljoin
 
 import pandas as pd
 import requests
@@ -11,7 +12,7 @@ from oafuncs.oa_down.user_agent import get_ua
 from oafuncs.oa_file import remove
 from oafuncs.oa_data import ensure_list
 
-__all__ = ["download5doi"]
+__all__ = ["download5doi", "download5doi_via_unpaywall"]
 
 
 def _get_file_size(file_path, unit="KB"):
@@ -46,75 +47,142 @@ class _Downloader:
     Download the PDF of a paper by its DOI.
     """
 
-    def __init__(self, doi, store_path):
+    # Process-level cache: the mirrors found alive on the first probe, reused afterwards
+    _alive_mirrors_cache: list[str] | None = None
+
+    def __init__(self, doi, store_path, *, min_size_kb=50, timeout_html=15, timeout_pdf=30, sleep_secs=5, tries_each_url=3, debug=False):
         self.url_list = [
             r"https://sci-hub.se",
             r"https://sci-hub.ren",
             r"https://sci-hub.st",
-            r"https://sci-hub.ru", # the most reliable mirror
+            r"https://sci-hub.ru",  # the most reliable mirror
             # ------------------------------------- the mirrors below are unverified
-            r"https://sci-hub.wf",
-            r"https://sci-hub.yt",
-            r"https://sci-hub.ee",
-            r"https://sci-hub.cat",
             r"https://sci-hub.in",
-            r"https://www.pismin.com",
-            r"https://sci-hub.vkif.top",
-            r"https://www.bothonce.com",
-            r"https://sci-hub.et-fine.com",
-            r"https://sci-hub.hkvisa.net",
-            # r"https://sci-hub.3800808.com",  # this one only works with manual saving
-            r"https://sci-hub.zidianzhan.net",
-            r"https://sci-hub.usualwant.com",
+            r"https://sci-hub.hlgczx.com/",
         ]
         self.base_url = None
         self.url = None
         self.doi = doi
         self.pdf_url = None
         self.pdf_path = None
-        self.headers = {"User-Agent": get_ua().encode("utf-8")}
+        # requests expects header values to be str; make sure the UA is a string, not bytes
+        self.headers = {"User-Agent": str(get_ua())}
         # 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
         # self.fname = doi.replace(r'/', '_') + '.pdf'
         self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
         self.store_path = Path(store_path)
         self.fpath = self.store_path / self.fname
         self.wrong_record_file = self.store_path / "wrong_record.txt"
-        self.sleep = 5
+        self.sleep = sleep_secs
         self.cookies = None
-        self.check_size = 50
+        self.check_size = max(1, int(min_size_kb))
         self.url_index = 0
-        self.try_times_each_url_max = 3
+        self.try_times_each_url_max = max(1, int(tries_each_url))
         self.try_times = 0
+        self.timeout_html = max(5, int(timeout_html))
+        self.timeout_pdf = max(5, int(timeout_pdf))
+        self.debug = bool(debug)
+
+    # ---------------- mirror availability probing ----------------
+    def _is_mirror_alive(self, base_url: str) -> bool:
+        """
+        Only check whether the mirror's root path is reachable (a 2xx/3xx response counts as alive).
+        Never requests a concrete DOI, to avoid passively triggering anti-bot measures; this is purely a connectivity screen.
+        """
+        try:
+            r = requests.get(base_url + "/", headers=self.headers, timeout=8, allow_redirects=True)
+            return 200 <= r.status_code < 400
+        except Exception:
+            return False
+
+    def _ensure_alive_mirrors(self):
+        # Reuse the process-level cache if it already exists
+        if _Downloader._alive_mirrors_cache is not None:
+            self.url_list = list(_Downloader._alive_mirrors_cache)
+            return
+
+        print("[bold cyan]Probing mirror connectivity (first run)...")
+        alive = []
+        for base in self.url_list:
+            ok = self._is_mirror_alive(base)
+            status = "OK" if ok else "DOWN"
+            print(f"  [{status}] {base}")
+            if ok:
+                alive.append(base)
+        if alive:
+            pruned = len(self.url_list) - len(alive)  # count before url_list is overwritten
+            _Downloader._alive_mirrors_cache = alive
+            self.url_list = alive
+            print(f"[bold cyan]Alive mirrors: {len(alive)}; pruned {pruned}.")
+        else:
+            print("[bold yellow]No mirror passed the probe; keeping the original list for fallback attempts.")
+
+    def _extract_pdf_url_from_html(self, html: str) -> str | None:
+        """
+        Extract the PDF link from a Sci-Hub page's HTML as robustly as possible.
+
+        Handles several patterns:
+        - onclick="location.href='...pdf?download=true'"
+        - <iframe id="pdf" src="...pdf?...">
+        - <a ... href="...pdf?...">
+        - any other src/href that contains .pdf
+
+        Returns an absolute URL, or None if nothing is found.
+        """
+        text = html
+
+        # Try the common onclick redirect first
+        patterns = [
+            # onclick="location.href='...pdf?...'" or document.location
+            r"onclick\s*=\s*[\"']\s*(?:document\.)?location\.href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
+            # iframe id="pdf" src="...pdf?..."
+            r"<iframe[^>]+id\s*=\s*[\"']pdf[\"'][^>]+src\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
+            # generic <a> href
+            r"<a[^>]+href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
+            # any other src/href
+            r"(?:src|href)\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
+        ]
+
+        for pat in patterns:
+            m = re.search(pat, text, flags=re.IGNORECASE | re.DOTALL)
+            if m:
+                got_url = m.group(1)
+                # Normalize to an absolute URL
+                if got_url.startswith("//"):
+                    return "https:" + got_url
+                if got_url.startswith("http://") or got_url.startswith("https://"):
+                    return got_url
+                # Treat anything else as a relative path
+                return urljoin(self.base_url + "/", got_url.lstrip("/"))
+
+        return None
 
     def get_pdf_url(self):
         print("[bold #E6E6FA]-" * 120)
         print(f"DOI: {self.doi}")
         print(f"Requesting: {self.url}...")
         try:
-            response = requests.get(self.url, headers=self.headers)
+            # Use a modest timeout to avoid blocking for long
+            response = requests.get(self.url, headers=self.headers, timeout=self.timeout_html)
             if response.status_code == 200:
                 self.cookies = response.cookies
-                text = response.text.replace("\\", "")
-                # text = text.replace(' ', '')  # It is important to remove the space
-                # print(text)
-                pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
-                match = pattern.search(text)
-                if match:
-                    got_url = match.group(1)
-                    if r"http" not in got_url:
-                        if got_url[:2] == "//":
-                            self.pdf_url = "https:" + got_url
-                        else:
-                            self.pdf_url = self.base_url + got_url
+                text = response.text
+                # Strip escape backslashes to improve the regex hit rate
+                text = text.replace("\\", "")
+
+                self.pdf_url = self._extract_pdf_url_from_html(text)
+                if self.pdf_url:
+                    if self.debug:
+                        print(f"Found PDF link: {self.pdf_url}")
                     else:
-                        self.pdf_url = got_url
-                    print(f"URL: {self.pdf_url}")
+                        print(f"Found PDF link (masked): .../{Path(self.pdf_url).name}")
                 else:
-                    print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
+                    print(
+                        f"[bold #AFEEEE]The website {self.url_list[self.url_index]} does not expose a detectable PDF link (pattern mismatch)."
+                    )
                     self.try_times = self.try_times_each_url_max + 1
             else:
                 print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
-                print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
+                print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} does not include the PDF file (HTTP error).")
                 self.try_times = self.try_times_each_url_max + 1
         except Exception as e:
             print(f"Failed to retrieve the webpage. Error: {e}")
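The new _extract_pdf_url_from_html falls back through progressively looser patterns. A standalone sketch of the iframe case, against a fabricated Sci-Hub-style snippet (not real site output):

    import re

    html = '<iframe id="pdf" src="//sci-hub.ru/downloads/paper.pdf?download=true"></iframe>'
    pat = r"<iframe[^>]+id\s*=\s*[\"']pdf[\"'][^>]+src\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']"
    m = re.search(pat, html, flags=re.IGNORECASE)
    # Prints the protocol-relative link; the helper then prefixes "https:" during normalization
    print(m.group(1))  # //sci-hub.ru/downloads/paper.pdf?download=true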
@@ -178,7 +246,7 @@ class _Downloader:
             return
         print(f"Downloading: {self.fname}...")
         try:
-            response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
+            response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=self.timeout_pdf)
             if response.status_code == 200:
                 with open(self.fpath, "wb") as f:
                     f.write(response.content)
@@ -224,7 +292,22 @@ def _read_txt(file):
     return lines
 
 
-def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r"DOI"):
+def download5doi(
+    store_path=None,
+    doi_list=None,
+    txt_file=None,
+    excel_file=None,
+    col_name=r"DOI",
+    *,
+    probe_mirrors: bool = True,
+    min_size_kb: int = 50,
+    timeout_html: int = 15,
+    timeout_pdf: int = 30,
+    tries_each_url: int = 3,
+    sleep_secs: int = 5,
+    force: bool = False,
+    debug: bool = False,
+):
     """
     Description:
         Download PDF files by DOI.
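Usage sketch for the extended signature above; all tuning knobs are keyword-only, and the DOI is the one quoted in the module's own comments (paths are placeholders):

    from oafuncs.oa_down.literature import download5doi

    download5doi(
        store_path="./pdfs",
        doi_list=["10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2"],
        probe_mirrors=True,   # prune unreachable mirrors once per process
        timeout_pdf=60,       # allow slower PDF transfers
        force=True,           # append mode: keep the existing wrong_record.txt
    )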
@@ -260,11 +343,152 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None,
         doi_list = _read_txt(txt_file)
     if excel_file:
         doi_list = _read_excel(excel_file, col_name)
-    remove(Path(store_path) / "wrong_record.txt")
+    # Deduplicate and clean up the list
+    doi_list = [str(x).strip() for x in doi_list if str(x).strip()]
+    doi_list = list(dict.fromkeys(doi_list))  # order-preserving deduplication
+
+    # Only clear wrong_record when this is not an append-style run
+    if not force:
+        remove(Path(store_path) / "wrong_record.txt")
     print(f"Downloading {len(doi_list)} PDF files...")
     for doi in track(doi_list, description="Downloading..."):
-        download = _Downloader(doi, store_path)
-        download.download_pdf()
+        dl = _Downloader(
+            doi,
+            store_path,
+            min_size_kb=min_size_kb,
+            timeout_html=timeout_html,
+            timeout_pdf=timeout_pdf,
+            sleep_secs=sleep_secs,
+            tries_each_url=tries_each_url,
+            debug=debug,
+        )
+        # Probe the mirrors first if requested
+        if probe_mirrors:
+            dl._ensure_alive_mirrors()
+        dl.download_pdf()
+
+
+# ------------------------------- Compliant alternative (Open Access first) -------------------------------
+def _get_oa_pdf_url_from_unpaywall(doi: str, email: str | None) -> str | None:
+    """
+    Look up an openly accessible PDF link via Unpaywall, if one exists.
+    An email is required (Unpaywall asks for an identifying address).
+    Returns the PDF URL or None.
+    """
+    if not email:
+        print("[bold yellow]Unpaywall requires an email parameter; please provide one to query OA links.")
+        return None
+    api = f"https://api.unpaywall.org/v2/{doi}?email={email}"
+    try:
+        r = requests.get(api, timeout=15)
+        if r.status_code != 200:
+            print(f"[bold yellow]Unpaywall query failed: HTTP {r.status_code}")
+            return None
+        data = r.json()
+        loc = data.get("best_oa_location") or {}
+        url_for_pdf = loc.get("url_for_pdf") or loc.get("url")
+        if url_for_pdf and url_for_pdf.lower().endswith(".pdf"):
+            return url_for_pdf
+        # Some OA links land on a landing page; scan every recorded location for a PDF instead
+        for k in ("oa_locations", "oa_location"):
+            entries = data.get(k) or []
+            if isinstance(entries, dict):
+                entries = [entries]
+            for e in entries:
+                u = e.get("url_for_pdf") or e.get("url")
+                if u and ".pdf" in u.lower():
+                    return u
+    except Exception as e:
+        print(f"[bold yellow]Unpaywall query error: {e}")
+    return None
+
+
+def _download_pdf_from_url(url: str, dest_path: Path, headers: dict | None = None) -> bool:
+    """
+    Download a PDF from a valid URL and save it to dest_path.
+    Returns True/False for success.
+    """
+    headers = headers or {"User-Agent": str(get_ua()), "Accept": "application/pdf"}
+    try:
+        with requests.get(url, headers=headers, stream=True, timeout=30) as r:
+            if r.status_code != 200 or "application/pdf" not in r.headers.get("Content-Type", "").lower():
+                # The body may still be a PDF (some servers set the wrong header); save anyway, but hard-fail on HTTP errors
+                if r.status_code != 200:
+                    print(f"[bold yellow]Download failed: HTTP {r.status_code}")
+                    return False
+            dest_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(dest_path, "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+            return True
+    except Exception as e:
+        print(f"[bold yellow]Download error: {e}")
+        return False
+
+
+def download5doi_via_unpaywall(
+    store_path=None,
+    doi_list=None,
+    txt_file=None,
+    excel_file=None,
+    col_name=r"DOI",
+    email: str | None = None,
+):
+    """
+    Prefer Unpaywall to locate Open Access (OA) PDFs and download those, avoiding non-compliant sites.
+
+    Args:
+        store_path: directory to save into
+        doi_list/txt_file/excel_file/col_name: same as download5doi
+        email: email address used for the Unpaywall API (required; queries fail without it)
+    """
+    if not store_path:
+        store_path = Path.cwd()
+    else:
+        store_path = Path(str(store_path))
+    store_path.mkdir(parents=True, exist_ok=True)
+
+    if doi_list:
+        doi_list = ensure_list(doi_list)
+    if txt_file:
+        doi_list = _read_txt(txt_file)
+    if excel_file:
+        doi_list = _read_excel(excel_file, col_name)
+
+    if not doi_list:
+        print("[bold yellow]No DOI list provided.")
+        return
+
+    print(f"[bold cyan]Trying to download {len(doi_list)} OA PDFs via Unpaywall...")
+    ok, miss = 0, 0
+    for doi in track(doi_list, description="OA downloading..."):
+        # Normalize the file name
+        fname = re.sub(r'[/<>:"?*|]', "_", str(doi)) + ".pdf"
+        dest = store_path / fname
+        if dest.exists() and _get_file_size(dest, unit="KB") > 10:
+            ok += 1
+            continue
+
+        pdf_url = _get_oa_pdf_url_from_unpaywall(str(doi), email=email)
+        if not pdf_url:
+            miss += 1
+            print(f"[bold yellow]No OA PDF found: {doi}")
+            continue
+
+        if _download_pdf_from_url(pdf_url, dest):
+            size_kb = _get_file_size(dest, unit="KB")
+            if isinstance(size_kb, (int, float)) and size_kb < 10:
+                dest.unlink(missing_ok=True)
+                miss += 1
+                print(f"[bold yellow]File too small, likely invalid; deleted: {dest}")
+            else:
+                ok += 1
+                print(f"[bold green]Downloaded: {dest}")
+        else:
+            miss += 1
+
+    print(f"[bold]Done: {ok} downloaded, {miss} not retrieved (no OA version, or institutional access required).")
 
 
 if __name__ == "__main__":
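Usage sketch for the Open Access path added above; the email address and file name are placeholders (Unpaywall requires a contact address, and DOIs without an OA copy are counted as misses):

    from oafuncs.oa_down.literature import download5doi_via_unpaywall

    download5doi_via_unpaywall(
        store_path="./pdfs_oa",
        txt_file="doi_list.txt",     # one DOI per line, same reader as download5doi
        email="you@example.com",     # required by the Unpaywall API
    )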
oafuncs/oa_file.py CHANGED
@@ -9,13 +9,14 @@ from rich import print
 __all__ = ["find_file", "link_file", "copy_file", "rename_file", "move_file", "clear_folder", "remove_empty_folder", "remove", "file_size", "mean_size", "make_dir", "replace_content"]
 
 
-def find_file(parent_dir: Union[str, os.PathLike], file_pattern: str, return_mode: str = "path") -> List[str]:
+def find_file(parent_dir: Union[str, os.PathLike], file_pattern: str, return_mode: str = "path", deep_find: bool = False) -> List[str]:
     """Finds files matching a specified pattern.
 
     Args:
         parent_dir: The parent directory where to search for files
         file_pattern: The file name pattern to search for
         return_mode: Return mode, 'path' to return full file paths, 'file' to return only file names. Defaults to 'path'
+        deep_find: Whether to search recursively in subdirectories. Defaults to False
 
     Returns:
         A list of file paths or file names if files are found, otherwise an empty list
@@ -24,9 +25,12 @@ def find_file(parent_dir: Union[str, os.PathLike], file_pattern: str, return_mod
     def natural_sort_key(s: str) -> List[Union[int, str]]:
         """Generate a key for natural sorting."""
         return [int(text) if text.isdigit() else text.lower() for text in re.split("([0-9]+)", s)]
-
-    search_pattern = os.path.join(str(parent_dir), file_pattern)
-    matched_files = glob.glob(search_pattern)
+
+    if deep_find:
+        search_pattern = os.path.join(str(parent_dir), "**", file_pattern)
+    else:
+        search_pattern = os.path.join(str(parent_dir), file_pattern)
+    matched_files = glob.glob(search_pattern, recursive=deep_find)
 
     if not matched_files:
         return []
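Usage sketch for the new deep_find flag; with deep_find=True the pattern gains a "**" segment and glob.glob runs with recursive=True:

    from oafuncs.oa_file import find_file

    shallow = find_file("./data", "*.nc")                    # only files directly under ./data
    deep = find_file("./data", "*.nc", deep_find=True)       # every *.nc anywhere below ./data
    names = find_file("./data", "*.nc", return_mode="file", deep_find=True)  # names only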
oafuncs/oa_geo.py CHANGED
@@ -135,14 +135,64 @@ def mask_land_ocean(
     """
     mask = _land_sea_mask(lon, lat, keep)
 
-    # apply_ufunc aligns and broadcasts the mask automatically
-    return xr.apply_ufunc(
-        lambda x, m: x.where(m),
-        data,
-        xr.DataArray(mask, dims=("lat", "lon")),
-        dask="parallelized",
-        keep_attrs=True,
-    )
+    # Wrap the boolean mask in an xarray.DataArray
+    mask_da = xr.DataArray(mask, dims=("lat", "lon"))
+
+    # If the input is already an xarray object, use where directly
+    if isinstance(data, (xr.DataArray, xr.Dataset)):
+        return data.where(mask_da)
+
+    # If the input is a numpy array, assume its last two dimensions are (lat, lon)
+    if isinstance(data, np.ndarray):
+        arr = data
+        if arr.ndim < 2:
+            raise ValueError("numpy array data must have at least 2 dimensions (lat, lon)")
+
+        if arr.ndim == 2:
+            lat_arr = np.asarray(lat)
+            lon_arr = np.asarray(lon)
+            # lat/lon may be 1D or 2D
+            if lat_arr.ndim == 1 and lon_arr.ndim == 1:
+                da = xr.DataArray(arr, dims=("lat", "lon"), coords={"lat": lat_arr, "lon": lon_arr})
+            elif lat_arr.ndim == 2 and lon_arr.ndim == 2:
+                if lat_arr.shape != arr.shape or lon_arr.shape != arr.shape:
+                    raise ValueError("2D lat/lon arrays must match the shape of the data's (lat, lon) dimensions")
+                da = xr.DataArray(arr, dims=("lat", "lon"), coords={"lat": (("lat", "lon"), lat_arr), "lon": (("lat", "lon"), lon_arr)})
+            else:
+                raise ValueError("lat/lon must both be 1D or both be 2D")
+        else:
+            # Generate placeholder names for the leading dimensions, e.g. dim_0, dim_1, ...
+            leading_dims = [f"dim_{i}" for i in range(arr.ndim - 2)]
+            dims = leading_dims + ["lat", "lon"]
+            coords = {f"dim_{i}": np.arange(arr.shape[i]) for i in range(arr.ndim - 2)}
+
+            lat_arr = np.asarray(lat)
+            lon_arr = np.asarray(lon)
+            # 1D lat/lon
+            if lat_arr.ndim == 1 and lon_arr.ndim == 1:
+                if lat_arr.shape[0] != arr.shape[-2] or lon_arr.shape[0] != arr.shape[-1]:
+                    raise ValueError("1D lat/lon lengths must match the data's last two dimensions")
+                coords.update({"lat": lat_arr, "lon": lon_arr})
+            # 2D lat/lon must match the data's last two dimensions exactly
+            elif lat_arr.ndim == 2 and lon_arr.ndim == 2:
+                if lat_arr.shape != (arr.shape[-2], arr.shape[-1]) or lon_arr.shape != (arr.shape[-2], arr.shape[-1]):
+                    raise ValueError("2D lat/lon shapes must match the data's last two dimensions")
+                coords.update({"lat": (("lat", "lon"), lat_arr), "lon": (("lat", "lon"), lon_arr)})
+            else:
+                raise ValueError("lat/lon must both be 1D or both be 2D")
+
+            da = xr.DataArray(arr, dims=dims, coords=coords)
+
+        masked = da.where(mask_da)
+        # Return the same type as the input: numpy in -> numpy out
+        return masked.values
+
+    # Try to convert any other type into a DataArray
+    try:
+        da = xr.DataArray(data)
+        return da.where(mask_da)
+    except Exception:
+        raise TypeError("data must be xr.DataArray, xr.Dataset, or numpy.ndarray")
 
 if __name__ == "__main__":
     pass
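Usage sketch for the numpy branch above. The parameter order is an assumption (only data, lon, lat, and keep are visible in this hunk), and keep="ocean" is a guessed value:

    import numpy as np
    from oafuncs.oa_geo import mask_land_ocean

    lon = np.arange(100, 130, 0.5)                 # 1D longitudes
    lat = np.arange(0, 40, 0.5)                    # 1D latitudes
    sst = np.random.rand(12, lat.size, lon.size)   # (time, lat, lon)

    # numpy in -> numpy out: masked points become NaN, the rest pass through
    masked = mask_land_ocean(sst, lon, lat, keep="ocean")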
oafuncs/oa_linux.py ADDED
@@ -0,0 +1,108 @@
+from rich import print
+import time
+import os
+
+__all__ = ["os_command", "get_queue_node", "query_queue", "running_jobs", "submit_job"]
+
+
+# Run a command and return its output
+def os_command(cmd):
+    import subprocess
+    print(f"🔍 Running command: {cmd}")
+    result = subprocess.run(
+        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+    )
+    # Print stderr, if any, to ease troubleshooting
+    if result.stderr:
+        print(f"❌ Error output: {result.stderr.strip()}")
+    # Check whether the command succeeded (non-zero exit code means failure)
+    if result.returncode != 0:
+        print(f"❌ Command failed, exit code: {result.returncode}")
+        return None
+    return result.stdout
+
+# Return a dict of "queue name: idle node count"
+def get_queue_node():
+    import re
+    # Run sinfo | grep "idle" to fetch the idle-queue listing
+    cmd = 'sinfo | grep "idle"'
+    output = os_command(cmd)
+    if not output:  # the command failed or produced no output; return an empty dict
+        return {}
+
+    # Result dict: key = queue name, value = node count
+    queue_node_dict = {}
+    # Parse the command output line by line
+    for line in output.strip().split('\n'):
+        line = line.strip()
+        if not line:  # skip empty lines
+            continue
+
+        # Regex: capture only the queue name (group 1) and node count (group 2);
+        # the trailing .* ignores the node list without affecting the match
+        pattern = r"^(\S+)\s+\S+\s+\S+\s+(\d+)\s+idle\s+.*$"
+        match = re.match(pattern, line)
+
+        if match:
+            queue_name = match.group(1)  # queue name becomes the dict key
+            node_count = int(match.group(2))  # node count becomes the dict value (as int)
+            queue_node_dict[queue_name] = node_count
+
+    return queue_node_dict
+
+def query_queue(need_node=1, queue_list=['dcu', 'bigmem', 'cpu_parallel', 'cpu_single']):
+    queue_dict = get_queue_node()
+    hs = None
+    for my_queue in queue_list:
+        if my_queue in queue_dict and queue_dict[my_queue] >= need_node:
+            # slurm_file = f'../run.slurm.{my_queue}'
+            hs = my_queue
+            break
+    return hs
+
+def running_jobs():
+    # Check via the scheduler whether jobs are still running
+    # status = os.popen('qstat').read()
+    status = os.popen('squeue').read()
+    Jobs = status.split('\n')[1:]
+    ids = [job.split()[0] for job in Jobs if job != '']
+    return ids
+
+def submit_job(working_dir, script_tmp='run.slurm', script_run='run.slurm', need_node=1, queue_tmp='<queue_name>', queue_list=['dcu', 'bigmem', 'cpu_parallel', 'cpu_single'], max_job=38):
+    from .oa_file import replace_content
+    import datetime
+    os.chdir(working_dir)
+    print(f"Changed working directory to: {working_dir}")
+    while True:
+        running_job = running_jobs()
+        if not running_job or len(running_job) < max_job:
+            queue = query_queue(need_node=need_node, queue_list=queue_list)
+            if queue:
+                replace_content(script_tmp, {f'{queue_tmp}': f"{queue}"}, False, f'{working_dir}', script_run)
+                print(f"Found compute resources; submitting job to queue: {queue}")
+                print(f'Time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
+                content_sub = os_command(f"sbatch {script_run}")
+                # Avoid using 'in' on None (TypeError): os_command returns None on failure
+                if not content_sub:
+                    print("The submit command returned no output or an error; retrying in 30 seconds!")
+                    time.sleep(30)
+                else:
+                    content_sub_lower = content_sub.lower()
+                    if 'error' in content_sub_lower or 'failed' in content_sub_lower:
+                        print("Submission reported an error (error/failed found in output); retrying in 30 seconds!")
+                        print(f"Command output: {content_sub.strip()}")
+                        time.sleep(30)
+                    else:
+                        print(f"Job submitted successfully: {content_sub.strip()}")
+                        break
+            else:
+                print("Not enough compute resources; retrying in 30 seconds!")
+                time.sleep(30)
+        else:
+            print(f"Current number of jobs in the system: {len(running_job)}; retrying in 60 seconds!")
+            time.sleep(60)
+    print("Waiting 10 seconds before checking the job status again!")
+    time.sleep(10)
+
+if __name__ == "__main__":
+    pass
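Usage sketch for the new Slurm helpers; the queue names, template file, and <queue_name> placeholder are site-specific assumptions:

    from oafuncs import oa_linux

    # Pick the first listed queue that has at least 2 idle nodes
    queue = oa_linux.query_queue(need_node=2, queue_list=["dcu", "cpu_parallel"])
    print(queue, oa_linux.running_jobs())

    # Fill <queue_name> in run.slurm.tmp, write run.slurm, then sbatch it,
    # waiting whenever the job count hits max_job or no queue has idle nodes
    oa_linux.submit_job(
        working_dir="/work/myrun",
        script_tmp="run.slurm.tmp",
        script_run="run.slurm",
        need_node=2,
        max_job=30,
    )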
oafuncs-0.0.98.44.dist-info/METADATA → oafuncs-0.0.98.46.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: oafuncs
-Version: 0.0.98.44
+Version: 0.0.98.46
 Summary: Oceanic and Atmospheric Functions
 Home-page: https://github.com/Industry-Pays/OAFuncs
 Author: Kun Liu
@@ -187,4 +187,4 @@ query()
 <img title="OAFuncs" src="https://raw.githubusercontent.com/Industry-Pays/OAFuncs/main/oafuncs/_data/oafuncs.png" alt="OAFuncs">
 
 ## Wiki
-For more, see the [wiki](https://opendeep.wiki/Industry-Pays/OAFuncs/introduction)
+For more, see the [wiki_old](https://opendeep.wiki/Industry-Pays/OAFuncs/introduction) or [wiki_new](https://deepwiki.com/Industry-Pays/OAFuncs)
oafuncs-0.0.98.44.dist-info/RECORD → oafuncs-0.0.98.46.dist-info/RECORD CHANGED
@@ -1,11 +1,12 @@
-oafuncs/__init__.py,sha256=7630YgWbWGClu4Us1H2SAq-_eh9WzFHGxXkIXMcxRu0,1542
-oafuncs/oa_cmap.py,sha256=JwZMJ36uNwiCnzXqEtH2_PpeLtEaRaXP9YeGSl0PJSU,13886
+oafuncs/__init__.py,sha256=G523BFVPxmODwq8j_88NYEiKbCzdQ3jfy51cmLeh7kM,1630
+oafuncs/oa_cmap.py,sha256=Mru5XvvBTfYNq8xjsBAGWppI7RGKzSh94glxP2SXomc,14221
 oafuncs/oa_data.py,sha256=CG2YHY_R6MFrPw3UznT4T8BE8yXdgBMnmdUAEdh9GAo,6506
 oafuncs/oa_date.py,sha256=aU2wVIWXyWoRiSQ9dg8sHvShFTxw86RrgbV3Q6tDjD4,6841
 oafuncs/oa_draw.py,sha256=zal0Y3RPpN0TCGN4Gw9qLtjQdT6V0ZqpSUBFVOPL0x4,13952
-oafuncs/oa_file.py,sha256=l9HTAK8iC1Bp_K7Mk3AX1UKuTFZZ-2yq5Hq71hnigbo,17299
-oafuncs/oa_geo.py,sha256=BWkvV6nW_c-UKqbgaoy4U1YQYUMzAQOJlK--XppNIms,4371
+oafuncs/oa_file.py,sha256=j9NOjxPOeAJsD5Zk4ODmFdVSSgr1CHVPvM1IHXy9RQA,17546
+oafuncs/oa_geo.py,sha256=UbzvUqgT2QP_9B7XSJRL1HDmGu0HnLC5nSP6ZrA5WH4,7177
 oafuncs/oa_help.py,sha256=0J5VaZX-cB0c090KxgmktQJBc0o00FsY-4wB8l5y00k,4178
+oafuncs/oa_linux.py,sha256=eijpxTopzL3GpE5AIzis9vdrbm-A7QBeQesA-divBjE,4627
 oafuncs/oa_nc.py,sha256=j501NlTuvrDIwNLXbMfE7nPPXdbbL7u9PGDj2l5AtnI,16277
 oafuncs/oa_python.py,sha256=xYMQnM0cGq9xUCtcoMpnN0LG5Rc_s94tai5nC6CNJ3E,4831
 oafuncs/oa_tool.py,sha256=VHx15VqpbzNlVXh0-3nJqcDgLVaECMD1FvxJ_CrV39E,8046
@@ -13,7 +14,7 @@ oafuncs/_data/hycom.png,sha256=MadKs6Gyj5n9-TOu7L4atQfTXtF9dvN9w-tdU9IfygI,10945
 oafuncs/_data/oafuncs.png,sha256=o3VD7wm-kwDea5E98JqxXl04_78cBX7VcdUt7uQXGiU,3679898
 oafuncs/_script/cprogressbar.py,sha256=BZi3MzF4q2Yl6fdZcLnW8MdpgpLeldI5NvnWMr-ZS94,16023
 oafuncs/_script/data_interp.py,sha256=gr1coA2N1mxzS4iv6S0C4lZpEQbuuHHNW-08RrhgPAA,4774
-oafuncs/_script/email.py,sha256=l5xDgdVj8O5V0J2SwjsHKdUuxOH2jZvwdMO_P0dImHU,2684
+oafuncs/_script/email.py,sha256=57jhRflm5QsyIshGMqtlfC6qn3b86GyiL4RQxdCOgxU,2702
 oafuncs/_script/netcdf_merge.py,sha256=tM9ePqLiEsE7eIsNM5XjEYeXwxjYOdNz5ejnEuI7xKw,6066
 oafuncs/_script/netcdf_modify.py,sha256=XDlAEToe_lwfAetkBSENqU5df-wnH7MGuxNTjG1gwHY,4178
 oafuncs/_script/netcdf_write.py,sha256=EDNycnhlrW1c6zcpmpObQeszDRX_lRxjTL-j0G4HqjI,17420
@@ -25,7 +26,7 @@ oafuncs/oa_down/User_Agent-list.txt,sha256=pHaMlElMvZ8TG4vf4BqkZYKqe0JIGkr4kCN0l
 oafuncs/oa_down/__init__.py,sha256=IT6oTqaxuV_mC6AwBut0HtkmnVtEu1MyX0x0oS7TKoA,218
 oafuncs/oa_down/hycom_3hourly.py,sha256=dFXSC_5o-Dic616KrLXir4tEHvCiZt8vGKPEYpXFMmA,57356
 oafuncs/oa_down/idm.py,sha256=vAhRjt_Sb-rKhzFShmSf29QcFTqsHpHXCvTSD1uSXyQ,1455
-oafuncs/oa_down/literature.py,sha256=7Qy5OphcjdRwY2uZ5hmmgK36U_QtVmEUSW0vQaxihC8,10960
+oafuncs/oa_down/literature.py,sha256=umz8bqYoVJiFkFviK970iOL7sfwbVWuqHPgRs3a199I,19806
 oafuncs/oa_down/read_proxy.py,sha256=HQpr-Mwn0Z8ICAuf63NUUY9p05E_uNWyWmOK46-73Ec,2866
 oafuncs/oa_down/test_ua.py,sha256=l8MCD6yU2W75zRPTDKUZTJhCWNF9lfk-MiSFqAqKH1M,1398
 oafuncs/oa_down/user_agent.py,sha256=LCVQUA60ukUqeJXgLktDHB2sh-ngk7AiX4sKj8w-X4A,416
@@ -38,8 +39,8 @@ oafuncs/oa_sign/__init__.py,sha256=JSx1fcWpmNhQBvX_Bmq3xysfSkkFMrjbJASxV_V6aqE,1
 oafuncs/oa_sign/meteorological.py,sha256=3MSjy7HTcvz2zsITkjUMr_0Y027Gas1LFE9pk99990k,6110
 oafuncs/oa_sign/ocean.py,sha256=3uYEzaq-27yVy23IQoqy-clhWu1I_fhPFBAQyT-OF4M,5562
 oafuncs/oa_sign/scientific.py,sha256=moIl2MEY4uitbXoD596JmXookXGQtQsS-8_1NBBTx84,4689
-oafuncs-0.0.98.44.dist-info/licenses/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
-oafuncs-0.0.98.44.dist-info/METADATA,sha256=yWxBsUGF1rlJBn42pXZyCUrgqhXWpyqc-l_CTyBEnSk,4384
-oafuncs-0.0.98.44.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-oafuncs-0.0.98.44.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
-oafuncs-0.0.98.44.dist-info/RECORD,,
+oafuncs-0.0.98.46.dist-info/licenses/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
+oafuncs-0.0.98.46.dist-info/METADATA,sha256=ZeQYycohu3zboTLafN-CHlEwkhmixvmazKaLAADhFpI,4446
+oafuncs-0.0.98.46.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+oafuncs-0.0.98.46.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
+oafuncs-0.0.98.46.dist-info/RECORD,,