PyPI - oafuncs - Versions diffs - 0.0.98.45__py3-none-any.whl → 0.0.98.47__py3-none-any.whl - Mend

oafuncs 0.0.98.45__py3-none-any.whl → 0.0.98.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

oafuncs/oa_cmap.py CHANGED Viewed

@@ -271,6 +271,9 @@ def get(colormap_name: Optional[str] = None, show_available: bool = False) -> Op
         "diverging_4": ["#5DADE2", "#A2D9F7", "#D6EAF8", "#F2F3F4", "#FADBD8", "#F1948A", "#E74C3C"],
         # ----------------------------------------------------------------------------
         "colorful_1": ["#6d00db", "#9800cb", "#F2003C", "#ff4500", "#ff7f00", "#FE28A2", "#FFC0CB", "#DDA0DD", "#40E0D0", "#1a66f2", "#00f7fb", "#8fff88", "#E3FF00"],
+        # ----------------------------------------------------------------------------
+        "increasing_1": ["#FFFFFF", "#E6F7FF", "#A5E6F8", "#049CD4", "#11B5A3", "#04BC4C", "#74CC54", "#D9DD5C", "#FB922E", "#FC2224", "#E51C18", "#8B0000"],
+        # ----------------------------------------------------------------------------
     }
     if show_available:

oafuncs/oa_down/literature.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import re
 import time
 from pathlib import Path
+from urllib.parse import urljoin
 import pandas as pd
 import requests
@@ -46,75 +47,142 @@ class _Downloader:
     根据doi下载文献pdf
     """
-    def __init__(self, doi, store_path):
+    # 进程级缓存：首次探测后的可用镜像列表，后续复用
+    _alive_mirrors_cache: list[str] | None = None
+    def __init__(self, doi, store_path, *, min_size_kb=50, timeout_html=15, timeout_pdf=30, sleep_secs=5, tries_each_url=3, debug=False):
         self.url_list = [
             r"https://sci-hub.se",
             r"https://sci-hub.ren",
             r"https://sci-hub.st",
-            r"https://sci-hub.ru", # 最好用的一个网站
+            r"https://sci-hub.ru",  # 最好用的一个网站
             # ------------------------------------- 以下网站没验证
-            r"https://sci-hub.wf",
-            r"https://sci-hub.yt",
-            r"https://sci-hub.ee",
-            r"https://sci-hub.cat",
             r"https://sci-hub.in",
-            r"https://www.pismin.com",
-            r"https://sci-hub.vkif.top",
-            r"https://www.bothonce.com",
-            r"https://sci-hub.et-fine.com",
-            r"https://sci-hub.hkvisa.net",
-            # r"https://sci-hub.3800808.com", # 这个只能手动保存
-            r"https://sci-hub.zidianzhan.net",
-            r"https://sci-hub.usualwant.com",
+            r"https://sci-hub.hlgczx.com/",
         ]
         self.base_url = None
         self.url = None
         self.doi = doi
         self.pdf_url = None
         self.pdf_path = None
-        self.headers = {"User-Agent": get_ua().encode("utf-8")}
+        # requests 期望 header 值为 str，这里确保 UA 是字符串而不是 bytes
+        self.headers = {"User-Agent": str(get_ua())}
         # 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
         # self.fname = doi.replace(r'/', '_') + '.pdf'
         self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
         self.store_path = Path(store_path)
         self.fpath = self.store_path / self.fname
         self.wrong_record_file = self.store_path / "wrong_record.txt"
-        self.sleep = 5
+        self.sleep = sleep_secs
         self.cookies = None
-        self.check_size = 50
+        self.check_size = max(1, int(min_size_kb))
         self.url_index = 0
-        self.try_times_each_url_max = 3
+        self.try_times_each_url_max = max(1, int(tries_each_url))
         self.try_times = 0
+        self.timeout_html = max(5, int(timeout_html))
+        self.timeout_pdf = max(5, int(timeout_pdf))
+        self.debug = bool(debug)
+    # ---------------- 镜像可用性探测 ----------------
+    def _is_mirror_alive(self, base_url: str) -> bool:
+        """
+        仅检测镜像根路径是否可连通（HTTP 200 即认为可用）。
+        不访问具体 DOI，避免被动触发风控；只做连通性筛查。
+        """
+        try:
+            r = requests.get(base_url + "/", headers=self.headers, timeout=8, allow_redirects=True)
+            return 200 <= r.status_code < 400
+        except Exception:
+            return False
+    def _ensure_alive_mirrors(self):
+        # 若已经有进程级缓存，直接复用
+        if _Downloader._alive_mirrors_cache is not None:
+            self.url_list = list(_Downloader._alive_mirrors_cache)
+            return
+        print(f"[bold cyan]Probing mirrors connectivity (first run)...")
+        alive = []
+        for base in self.url_list:
+            ok = self._is_mirror_alive(base)
+            status = "OK" if ok else "DOWN"
+            print(f"  [{status}] {base}")
+            if ok:
+                alive.append(base)
+        if alive:
+            _Downloader._alive_mirrors_cache = alive
+            self.url_list = alive
+            print(f"[bold cyan]Alive mirrors: {len(alive)}; pruned {len(set(self.url_list)) - len(alive) if self.url_list else 0}.")
+        else:
+            print("[bold yellow]No mirror passed probe; keep original list for fallback attempts.")
+    def _extract_pdf_url_from_html(self, html: str) -> str | None:
+        """
+        从 Sci-Hub 页面 HTML 中尽可能稳健地提取 PDF 链接。
+        兼容多种模式：
+        - onclick="location.href='...pdf?download=true'"
+        - <iframe id="pdf" src="...pdf?...">
+        - <a ... href="...pdf?...">
+        - 其他出现 .pdf 的 src/href 场景
+        返回绝对 URL；若找不到返回 None。
+        """
+        text = html
+        # 先尝试常见 onclick 跳转
+        patterns = [
+            # onclick="location.href='...pdf?...'" 或 document.location
+            r"onclick\s*=\s*[\"']\s*(?:document\.)?location\.href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
+            # iframe id="pdf" src="...pdf?..."
+            r"<iframe[^>]+id\s*=\s*[\"']pdf[\"'][^>]+src\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
+            # 通用 a 标签 href
+            r"<a[^>]+href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
+            # 通用任意 src/href
+            r"(?:src|href)\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
+        ]
+        for pat in patterns:
+            m = re.search(pat, text, flags=re.IGNORECASE | re.DOTALL)
+            if m:
+                got_url = m.group(1)
+                # 规范化为绝对 URL
+                if got_url.startswith("//"):
+                    return "https:" + got_url
+                if got_url.startswith("http://") or got_url.startswith("https://"):
+                    return got_url
+                # 其余按相对路径处理
+                return urljoin(self.base_url + "/", got_url.lstrip("/"))
+        return None
     def get_pdf_url(self):
         print("[bold #E6E6FA]-" * 120)
         print(f"DOI: {self.doi}")
         print(f"Requesting: {self.url}...")
         try:
-            response = requests.get(self.url, headers=self.headers)
+            # 使用较小的超时时间避免长时间阻塞
+            response = requests.get(self.url, headers=self.headers, timeout=self.timeout_html)
             if response.status_code == 200:
                 self.cookies = response.cookies
-                text = response.text.replace("\\", "")
-                # text = text.replace(' ', '')  # It is important to remove the space
-                # print(text)
-                pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
-                match = pattern.search(text)
-                if match:
-                    got_url = match.group(1)
-                    if r"http" not in got_url:
-                        if got_url[:2] == "//":
-                            self.pdf_url = "https:" + got_url
-                        else:
-                            self.pdf_url = self.base_url + got_url
+                text = response.text
+                # 去除转义反斜杠，提升正则匹配成功率
+                text = text.replace("\\", "")
+                self.pdf_url = self._extract_pdf_url_from_html(text)
+                if self.pdf_url:
+                    if self.debug:
+                        print(f"Found PDF link: {self.pdf_url}")
                     else:
-                        self.pdf_url = got_url
-                    print(f"URL: {self.pdf_url}")
+                        print(f"Found PDF link (masked): .../{Path(self.pdf_url).name}")
                 else:
-                    print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
+                    print(
+                        f"[bold #AFEEEE]The website {self.url_list[self.url_index]} does not expose a detectable PDF link (pattern mismatch)."
+                    )
                     self.try_times = self.try_times_each_url_max + 1
             else:
                 print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
-                print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
+                print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not include the PDF file (HTTP error).")
                 self.try_times = self.try_times_each_url_max + 1
         except Exception as e:
             print(f"Failed to retrieve the webpage. Error: {e}")
@@ -178,7 +246,7 @@ class _Downloader:
                     return
             print(f"Downloading: {self.fname}...")
             try:
-                response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
+                response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=self.timeout_pdf)
                 if response.status_code == 200:
                     with open(self.fpath, "wb") as f:
                         f.write(response.content)
@@ -224,7 +292,22 @@ def _read_txt(file):
     return lines
-def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r"DOI"):
+def download5doi(
+    store_path=None,
+    doi_list=None,
+    txt_file=None,
+    excel_file=None,
+    col_name=r"DOI",
+    *,
+    probe_mirrors: bool = True,
+    min_size_kb: int = 50,
+    timeout_html: int = 15,
+    timeout_pdf: int = 30,
+    tries_each_url: int = 3,
+    sleep_secs: int = 5,
+    force: bool = False,
+    debug: bool = False,
+):
     """
     Description:
         Download PDF files by DOI.
@@ -260,11 +343,29 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None,
         doi_list = _read_txt(txt_file)
     if excel_file:
         doi_list = _read_excel(excel_file, col_name)
-    remove(Path(store_path) / "wrong_record.txt")
+    # 去重并清洗
+    doi_list = [str(x).strip() for x in doi_list if str(x).strip()]
+    doi_list = list(dict.fromkeys(doi_list))  # 保序去重
+    # 只有在不是追加下载的场景下再清除 wrong_record
+    if not force:
+        remove(Path(store_path) / "wrong_record.txt")
     print(f"Downloading {len(doi_list)} PDF files...")
     for doi in track(doi_list, description="Downloading..."):
-        download = _Downloader(doi, store_path)
-        download.download_pdf()
+        dl = _Downloader(
+            doi,
+            store_path,
+            min_size_kb=min_size_kb,
+            timeout_html=timeout_html,
+            timeout_pdf=timeout_pdf,
+            sleep_secs=sleep_secs,
+            tries_each_url=tries_each_url,
+            debug=debug,
+        )
+        # 是否进行镜像探测
+        if probe_mirrors:
+            dl._ensure_alive_mirrors()
+        dl.download_pdf()
 if __name__ == "__main__":

oafuncs/oa_linux.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from rich import print
+import time
+import os
-__all__ = ["os_command", "get_queue_node"]
+__all__ = ["os_command", "get_queue_node", "query_queue", "running_jobs", "submit_job"]
 # 负责执行命令并返回输出
@@ -49,5 +50,81 @@ def get_queue_node():
     return queue_node_dict
+def query_queue(need_node=1, queue_list =['dcu','bigmem','cpu_parallel','cpu_single']):
+    queue_dict = get_queue_node()
+    hs = None
+    for my_queue in queue_list:
+        if my_queue in queue_dict and queue_dict[my_queue] >= need_node:
+            # slurm_file = f'../run.slurm.{my_queue}'
+            hs = my_queue
+            break
+    return hs
+def running_jobs():
+    # 通过qstat判断任务状态，是否还在进行中
+    # status = os.popen('qstat').read()
+    status = os.popen('squeue').read()
+    Jobs = status.split('\n')[1:]
+    ids = [job.split()[0] for job in Jobs if job != '']
+    return ids
+def submit_job(working_dir=None, script_tmp='run.slurm', script_run='run.slurm', need_node=1, queue_tmp='<queue_name>', queue_list=['dcu', 'bigmem', 'cpu_parallel', 'cpu_single'], max_job=38, wait=False):
+    '''提交任务到集群，并返回任务ID'''
+    from .oa_file import replace_content
+    import datetime
+    if working_dir is None:
+        working_dir = os.getcwd()
+    os.chdir(working_dir)
+    print(f'切换工作目录到: {working_dir}')
+    if need_node > 1 and 'cpu_single' in queue_list:
+        queue_list.remove('cpu_single')
+    while True:
+        running_job = running_jobs()
+        if not running_job or len(running_job) < max_job:
+            queue = query_queue(need_node=need_node, queue_list=queue_list)
+            if queue:
+                replace_content(script_tmp, {f'{queue_tmp}': f"{queue}"}, False, f'{working_dir}', script_run)
+                print(f'找到计算资源，提交任务，队列：{queue}')
+                print(f'Time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
+                content_sub = os_command(f"sbatch {script_run}")
+                # 避免在 None 上使用 'in' 导致 TypeError：os_command 在失败时会返回 None
+                if not content_sub:
+                    print('提交任务命令没有返回输出或返回了错误，等待30秒后重试！')
+                    time.sleep(30)
+                else:
+                    content_sub_lower = content_sub.lower()
+                    if 'error' in content_sub_lower or 'failed' in content_sub_lower:
+                        print('提交任务时出现错误（从输出检测到 error/failed），等待30秒后重试！')
+                        print(f'命令输出: {content_sub.strip()}')
+                        time.sleep(30)
+                    else:
+                        print(f'提交任务成功，{content_sub.strip()}')
+                        job_id = content_sub.strip().split()[-1]
+                        break
+            else:
+                print('没有足够的计算资源，等待30秒后重试！')
+                time.sleep(30)
+        else:
+            print(f'当前系统任务数：{len(running_job)}，等待60秒后重试！')
+            time.sleep(60)
+    print(f'等待10秒后，继续检查任务状态！')
+    time.sleep(10)
+    if wait:
+        while True:
+            if job_id in running_jobs():
+                print(f'Time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
+                print(f'任务{job_id}正在任务队列中...')
+                time.sleep(60)
+            else:
+                print(f'任务{job_id}已完成！')
+                break
+    else:
+        print(f'任务{job_id}已提交，不等待其完成，继续执行后续操作！')
+    return job_id
 if __name__ == "__main__":
     pass

{oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: oafuncs
-Version: 0.0.98.45
+Version: 0.0.98.47
 Summary: Oceanic and Atmospheric Functions
 Home-page: https://github.com/Industry-Pays/OAFuncs
 Author: Kun Liu

{oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
 oafuncs/__init__.py,sha256=G523BFVPxmODwq8j_88NYEiKbCzdQ3jfy51cmLeh7kM,1630
-oafuncs/oa_cmap.py,sha256=JwZMJ36uNwiCnzXqEtH2_PpeLtEaRaXP9YeGSl0PJSU,13886
+oafuncs/oa_cmap.py,sha256=Mru5XvvBTfYNq8xjsBAGWppI7RGKzSh94glxP2SXomc,14221
 oafuncs/oa_data.py,sha256=CG2YHY_R6MFrPw3UznT4T8BE8yXdgBMnmdUAEdh9GAo,6506
 oafuncs/oa_date.py,sha256=aU2wVIWXyWoRiSQ9dg8sHvShFTxw86RrgbV3Q6tDjD4,6841
 oafuncs/oa_draw.py,sha256=zal0Y3RPpN0TCGN4Gw9qLtjQdT6V0ZqpSUBFVOPL0x4,13952
 oafuncs/oa_file.py,sha256=j9NOjxPOeAJsD5Zk4ODmFdVSSgr1CHVPvM1IHXy9RQA,17546
 oafuncs/oa_geo.py,sha256=UbzvUqgT2QP_9B7XSJRL1HDmGu0HnLC5nSP6ZrA5WH4,7177
 oafuncs/oa_help.py,sha256=0J5VaZX-cB0c090KxgmktQJBc0o00FsY-4wB8l5y00k,4178
-oafuncs/oa_linux.py,sha256=1fCmpgM33x3fLdQm8DOmaE4Pz4okXVNO0e-3svYnLgo,1850
+oafuncs/oa_linux.py,sha256=reQYcjMff6mHuM5RXzoxM-4lgIYUcOoHp0u2fLY0cnU,5431
 oafuncs/oa_nc.py,sha256=j501NlTuvrDIwNLXbMfE7nPPXdbbL7u9PGDj2l5AtnI,16277
 oafuncs/oa_python.py,sha256=xYMQnM0cGq9xUCtcoMpnN0LG5Rc_s94tai5nC6CNJ3E,4831
 oafuncs/oa_tool.py,sha256=VHx15VqpbzNlVXh0-3nJqcDgLVaECMD1FvxJ_CrV39E,8046
@@ -26,7 +26,7 @@ oafuncs/oa_down/User_Agent-list.txt,sha256=pHaMlElMvZ8TG4vf4BqkZYKqe0JIGkr4kCN0l
 oafuncs/oa_down/__init__.py,sha256=IT6oTqaxuV_mC6AwBut0HtkmnVtEu1MyX0x0oS7TKoA,218
 oafuncs/oa_down/hycom_3hourly.py,sha256=dFXSC_5o-Dic616KrLXir4tEHvCiZt8vGKPEYpXFMmA,57356
 oafuncs/oa_down/idm.py,sha256=vAhRjt_Sb-rKhzFShmSf29QcFTqsHpHXCvTSD1uSXyQ,1455
-oafuncs/oa_down/literature.py,sha256=7Qy5OphcjdRwY2uZ5hmmgK36U_QtVmEUSW0vQaxihC8,10960
+oafuncs/oa_down/literature.py,sha256=bqRSwYjsPO28WIohGIwPcj0xo9vAz5yhc8Ojx8ybh08,14924
 oafuncs/oa_down/read_proxy.py,sha256=HQpr-Mwn0Z8ICAuf63NUUY9p05E_uNWyWmOK46-73Ec,2866
 oafuncs/oa_down/test_ua.py,sha256=l8MCD6yU2W75zRPTDKUZTJhCWNF9lfk-MiSFqAqKH1M,1398
 oafuncs/oa_down/user_agent.py,sha256=LCVQUA60ukUqeJXgLktDHB2sh-ngk7AiX4sKj8w-X4A,416
@@ -39,8 +39,8 @@ oafuncs/oa_sign/__init__.py,sha256=JSx1fcWpmNhQBvX_Bmq3xysfSkkFMrjbJASxV_V6aqE,1
 oafuncs/oa_sign/meteorological.py,sha256=3MSjy7HTcvz2zsITkjUMr_0Y027Gas1LFE9pk99990k,6110
 oafuncs/oa_sign/ocean.py,sha256=3uYEzaq-27yVy23IQoqy-clhWu1I_fhPFBAQyT-OF4M,5562
 oafuncs/oa_sign/scientific.py,sha256=moIl2MEY4uitbXoD596JmXookXGQtQsS-8_1NBBTx84,4689
-oafuncs-0.0.98.45.dist-info/licenses/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
-oafuncs-0.0.98.45.dist-info/METADATA,sha256=JgC1BWV311BvpDhuyBam21v_fnaplfmdxKJkmakfM4s,4446
-oafuncs-0.0.98.45.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-oafuncs-0.0.98.45.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
-oafuncs-0.0.98.45.dist-info/RECORD,,
+oafuncs-0.0.98.47.dist-info/licenses/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
+oafuncs-0.0.98.47.dist-info/METADATA,sha256=iJT9zUhIKuQGVu3zNAoQvKq9DyB6oMm-mlptyweqRR0,4446
+oafuncs-0.0.98.47.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+oafuncs-0.0.98.47.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
+oafuncs-0.0.98.47.dist-info/RECORD,,

{oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/WHEEL RENAMED Viewed

File without changes

{oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/licenses/LICENSE.txt RENAMED Viewed

File without changes

{oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/top_level.txt RENAMED Viewed

File without changes

oafuncs 0.0.98.45__py3-none-any.whl → 0.0.98.47__py3-none-any.whl

oafuncs 0.0.98.45__py3-none-any.whl → 0.0.98.47__py3-none-any.whl