oafuncs 0.0.98.45__tar.gz → 0.0.98.46__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {oafuncs-0.0.98.45/oafuncs.egg-info → oafuncs-0.0.98.46}/PKG-INFO +1 -1
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_cmap.py +3 -0
 - oafuncs-0.0.98.46/oafuncs/oa_down/literature.py +497 -0
 - oafuncs-0.0.98.46/oafuncs/oa_linux.py +108 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46/oafuncs.egg-info}/PKG-INFO +1 -1
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/setup.py +1 -1
 - oafuncs-0.0.98.45/oafuncs/oa_down/literature.py +0 -273
 - oafuncs-0.0.98.45/oafuncs/oa_linux.py +0 -53
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/LICENSE.txt +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/MANIFEST.in +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/README.md +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/__init__.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_data/hycom.png +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_data/oafuncs.png +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/cprogressbar.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/data_interp.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/email.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/netcdf_merge.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/netcdf_modify.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/netcdf_write.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/parallel.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/parallel_bak.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/plot_dataset.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/_script/replace_file_content.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_data.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_date.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_down/User_Agent-list.txt +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_down/__init__.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_down/hycom_3hourly.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_down/idm.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_down/read_proxy.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_down/test_ua.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_down/user_agent.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_draw.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_file.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_geo.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_help.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_model/__init__.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_model/roms/__init__.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_model/roms/test.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_model/wrf/__init__.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_model/wrf/little_r.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_nc.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_python.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_sign/__init__.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_sign/meteorological.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_sign/ocean.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_sign/scientific.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs/oa_tool.py +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs.egg-info/SOURCES.txt +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs.egg-info/dependency_links.txt +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs.egg-info/requires.txt +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/oafuncs.egg-info/top_level.txt +0 -0
 - {oafuncs-0.0.98.45 → oafuncs-0.0.98.46}/setup.cfg +0 -0
 
| 
         @@ -271,6 +271,9 @@ def get(colormap_name: Optional[str] = None, show_available: bool = False) -> Op 
     | 
|
| 
       271 
271 
     | 
    
         
             
                    "diverging_4": ["#5DADE2", "#A2D9F7", "#D6EAF8", "#F2F3F4", "#FADBD8", "#F1948A", "#E74C3C"],
         
     | 
| 
       272 
272 
     | 
    
         
             
                    # ----------------------------------------------------------------------------
         
     | 
| 
       273 
273 
     | 
    
         
             
                    "colorful_1": ["#6d00db", "#9800cb", "#F2003C", "#ff4500", "#ff7f00", "#FE28A2", "#FFC0CB", "#DDA0DD", "#40E0D0", "#1a66f2", "#00f7fb", "#8fff88", "#E3FF00"],
         
     | 
| 
      
 274 
     | 
    
         
            +
                    # ----------------------------------------------------------------------------
         
     | 
| 
      
 275 
     | 
    
         
            +
                    "increasing_1": ["#FFFFFF", "#E6F7FF", "#A5E6F8", "#049CD4", "#11B5A3", "#04BC4C", "#74CC54", "#D9DD5C", "#FB922E", "#FC2224", "#E51C18", "#8B0000"],
         
     | 
| 
      
 276 
     | 
    
         
            +
                    # ----------------------------------------------------------------------------
         
     | 
| 
       274 
277 
     | 
    
         
             
                }
         
     | 
| 
       275 
278 
     | 
    
         | 
| 
       276 
279 
     | 
    
         
             
                if show_available:
         
     | 
| 
         @@ -0,0 +1,497 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            import os
         
     | 
| 
      
 2 
     | 
    
         
            +
            import re
         
     | 
| 
      
 3 
     | 
    
         
            +
            import time
         
     | 
| 
      
 4 
     | 
    
         
            +
            from pathlib import Path
         
     | 
| 
      
 5 
     | 
    
         
            +
            from urllib.parse import urljoin
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            import pandas as pd
         
     | 
| 
      
 8 
     | 
    
         
            +
            import requests
         
     | 
| 
      
 9 
     | 
    
         
            +
            from rich import print
         
     | 
| 
      
 10 
     | 
    
         
            +
            from rich.progress import track
         
     | 
| 
      
 11 
     | 
    
         
            +
            from oafuncs.oa_down.user_agent import get_ua
         
     | 
| 
      
 12 
     | 
    
         
            +
            from oafuncs.oa_file import remove
         
     | 
| 
      
 13 
     | 
    
         
            +
            from oafuncs.oa_data import ensure_list
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            __all__ = ["download5doi", "download5doi_via_unpaywall"]
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
            def _get_file_size(file_path, unit="KB"):
         
     | 
| 
      
 19 
     | 
    
         
            +
                # 检查文件是否存在
         
     | 
| 
      
 20 
     | 
    
         
            +
                if not os.path.exists(file_path):
         
     | 
| 
      
 21 
     | 
    
         
            +
                    return "文件不存在"
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
                # 获取文件大小(字节)
         
     | 
| 
      
 24 
     | 
    
         
            +
                file_size = os.path.getsize(file_path)
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
                # 单位转换字典
         
     | 
| 
      
 27 
     | 
    
         
            +
                unit_dict = {
         
     | 
| 
      
 28 
     | 
    
         
            +
                    "PB": 1024**5,
         
     | 
| 
      
 29 
     | 
    
         
            +
                    "TB": 1024**4,
         
     | 
| 
      
 30 
     | 
    
         
            +
                    "GB": 1024**3,
         
     | 
| 
      
 31 
     | 
    
         
            +
                    "MB": 1024**2,
         
     | 
| 
      
 32 
     | 
    
         
            +
                    "KB": 1024,
         
     | 
| 
      
 33 
     | 
    
         
            +
                }
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
                # 检查传入的单位是否合法
         
     | 
| 
      
 36 
     | 
    
         
            +
                if unit not in unit_dict:
         
     | 
| 
      
 37 
     | 
    
         
            +
                    return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                # 转换文件大小到指定单位
         
     | 
| 
      
 40 
     | 
    
         
            +
                converted_size = file_size / unit_dict[unit]
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
                return converted_size
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
            class _Downloader:
         
     | 
| 
      
 46 
     | 
    
         
            +
                """
         
     | 
| 
      
 47 
     | 
    
         
            +
                根据doi下载文献pdf
         
     | 
| 
      
 48 
     | 
    
         
            +
                """
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
                # 进程级缓存:首次探测后的可用镜像列表,后续复用
         
     | 
| 
      
 51 
     | 
    
         
            +
                _alive_mirrors_cache: list[str] | None = None
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
                def __init__(self, doi, store_path, *, min_size_kb=50, timeout_html=15, timeout_pdf=30, sleep_secs=5, tries_each_url=3, debug=False):
                    """Set up the download state for a single DOI.

                    Args:
                        doi: DOI of the paper; also used (sanitised) as the PDF file name.
                        store_path: Directory that receives the PDF and ``wrong_record.txt``.
                        min_size_kb: Size threshold in KB stored as ``check_size``
                            (at least 1); ``download_pdf`` deletes an existing file
                            that is smaller than this.
                        timeout_html: Timeout in seconds for fetching the landing page
                            (at least 5).
                        timeout_pdf: Stored as ``timeout_pdf`` (at least 5); presumably
                            the timeout for the PDF request itself.
                        sleep_secs: Stored as ``sleep``; presumably the pause between
                            attempts.
                        tries_each_url: Maximum attempts per mirror (at least 1).
                        debug: When true, the full PDF URL is printed instead of a
                            masked one.
                    """
                    # Mirror roots are stored without a trailing slash so that later
                    # concatenations such as ``base + "/" + doi`` never yield "//".
                    self.url_list = [
                        mirror.rstrip("/")
                        for mirror in (
                            r"https://sci-hub.se",
                            r"https://sci-hub.ren",
                            r"https://sci-hub.st",
                            r"https://sci-hub.ru",  # the most reliable mirror
                            # ------------------------------------- mirrors below are unverified
                            r"https://sci-hub.in",
                            r"https://sci-hub.hlgczx.com/",
                        )
                    ]
                    self.base_url = None
                    self.url = None
                    self.doi = doi
                    self.pdf_url = None
                    self.pdf_path = None
                    # requests expects header values to be str. ``str(b"...")`` would
                    # produce the repr "b'...'", so bytes are decoded explicitly.
                    user_agent = get_ua()
                    if isinstance(user_agent, (bytes, bytearray)):
                        user_agent = user_agent.decode("utf-8", errors="replace")
                    self.headers = {"User-Agent": str(user_agent)}
                    # Example of a DOI containing characters that are illegal in file
                    # names: 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2
                    self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
                    self.store_path = Path(store_path)
                    self.fpath = self.store_path / self.fname
                    self.wrong_record_file = self.store_path / "wrong_record.txt"
                    self.sleep = sleep_secs
                    self.cookies = None
                    self.check_size = max(1, int(min_size_kb))
                    self.url_index = 0
                    self.try_times_each_url_max = max(1, int(tries_each_url))
                    self.try_times = 0
                    self.timeout_html = max(5, int(timeout_html))
                    self.timeout_pdf = max(5, int(timeout_pdf))
                    self.debug = bool(debug)
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
      
 86 
     | 
    
         
            +
                # ---------------- 镜像可用性探测 ----------------
         
     | 
| 
      
 87 
     | 
    
         
            +
                def _is_mirror_alive(self, base_url: str) -> bool:
                    """Report whether the root page of a mirror is reachable.

                    Only the mirror root is requested (no DOI), so this is a pure
                    connectivity check. Redirects are followed and any final status
                    in the 2xx/3xx range counts as alive.

                    Args:
                        base_url: Mirror root, with or without a trailing slash.

                    Returns:
                        True when the request succeeds with a status in [200, 400);
                        False on any other status or on any exception (best-effort
                        probe, errors are deliberately not propagated).
                    """
                    try:
                        # Strip before appending so "https://host/" does not become "https://host//".
                        probe_url = base_url.rstrip("/") + "/"
                        r = requests.get(probe_url, headers=self.headers, timeout=8, allow_redirects=True)
                        return 200 <= r.status_code < 400
                    except Exception:
                        return False
         
     | 
| 
      
 97 
     | 
    
         
            +
             
     | 
| 
      
 98 
     | 
    
         
            +
                def _ensure_alive_mirrors(self):
         
     | 
| 
      
 99 
     | 
    
         
            +
                    # 若已经有进程级缓存,直接复用
         
     | 
| 
      
 100 
     | 
    
         
            +
                    if _Downloader._alive_mirrors_cache is not None:
         
     | 
| 
      
 101 
     | 
    
         
            +
                        self.url_list = list(_Downloader._alive_mirrors_cache)
         
     | 
| 
      
 102 
     | 
    
         
            +
                        return
         
     | 
| 
      
 103 
     | 
    
         
            +
             
     | 
| 
      
 104 
     | 
    
         
            +
                    print(f"[bold cyan]Probing mirrors connectivity (first run)...")
         
     | 
| 
      
 105 
     | 
    
         
            +
                    alive = []
         
     | 
| 
      
 106 
     | 
    
         
            +
                    for base in self.url_list:
         
     | 
| 
      
 107 
     | 
    
         
            +
                        ok = self._is_mirror_alive(base)
         
     | 
| 
      
 108 
     | 
    
         
            +
                        status = "OK" if ok else "DOWN"
         
     | 
| 
      
 109 
     | 
    
         
            +
                        print(f"  [{status}] {base}")
         
     | 
| 
      
 110 
     | 
    
         
            +
                        if ok:
         
     | 
| 
      
 111 
     | 
    
         
            +
                            alive.append(base)
         
     | 
| 
      
 112 
     | 
    
         
            +
                    if alive:
         
     | 
| 
      
 113 
     | 
    
         
            +
                        _Downloader._alive_mirrors_cache = alive
         
     | 
| 
      
 114 
     | 
    
         
            +
                        self.url_list = alive
         
     | 
| 
      
 115 
     | 
    
         
            +
                        print(f"[bold cyan]Alive mirrors: {len(alive)}; pruned {len(set(self.url_list)) - len(alive) if self.url_list else 0}.")
         
     | 
| 
      
 116 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 117 
     | 
    
         
            +
                        print("[bold yellow]No mirror passed probe; keep original list for fallback attempts.")
         
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
      
 119 
     | 
    
         
            +
                def _extract_pdf_url_from_html(self, html: str) -> str | None:
         
     | 
| 
      
 120 
     | 
    
         
            +
                    """
         
     | 
| 
      
 121 
     | 
    
         
            +
                    从 Sci-Hub 页面 HTML 中尽可能稳健地提取 PDF 链接。
         
     | 
| 
      
 122 
     | 
    
         
            +
             
     | 
| 
      
 123 
     | 
    
         
            +
                    兼容多种模式:
         
     | 
| 
      
 124 
     | 
    
         
            +
                    - onclick="location.href='...pdf?download=true'"
         
     | 
| 
      
 125 
     | 
    
         
            +
                    - <iframe id="pdf" src="...pdf?...">
         
     | 
| 
      
 126 
     | 
    
         
            +
                    - <a ... href="...pdf?...">
         
     | 
| 
      
 127 
     | 
    
         
            +
                    - 其他出现 .pdf 的 src/href 场景
         
     | 
| 
      
 128 
     | 
    
         
            +
             
     | 
| 
      
 129 
     | 
    
         
            +
                    返回绝对 URL;若找不到返回 None。
         
     | 
| 
      
 130 
     | 
    
         
            +
                    """
         
     | 
| 
      
 131 
     | 
    
         
            +
                    text = html
         
     | 
| 
      
 132 
     | 
    
         
            +
             
     | 
| 
      
 133 
     | 
    
         
            +
                    # 先尝试常见 onclick 跳转
         
     | 
| 
      
 134 
     | 
    
         
            +
                    patterns = [
         
     | 
| 
      
 135 
     | 
    
         
            +
                        # onclick="location.href='...pdf?...'" 或 document.location
         
     | 
| 
      
 136 
     | 
    
         
            +
                        r"onclick\s*=\s*[\"']\s*(?:document\.)?location\.href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
         
     | 
| 
      
 137 
     | 
    
         
            +
                        # iframe id="pdf" src="...pdf?..."
         
     | 
| 
      
 138 
     | 
    
         
            +
                        r"<iframe[^>]+id\s*=\s*[\"']pdf[\"'][^>]+src\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
         
     | 
| 
      
 139 
     | 
    
         
            +
                        # 通用 a 标签 href
         
     | 
| 
      
 140 
     | 
    
         
            +
                        r"<a[^>]+href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
         
     | 
| 
      
 141 
     | 
    
         
            +
                        # 通用任意 src/href
         
     | 
| 
      
 142 
     | 
    
         
            +
                        r"(?:src|href)\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
         
     | 
| 
      
 143 
     | 
    
         
            +
                    ]
         
     | 
| 
      
 144 
     | 
    
         
            +
             
     | 
| 
      
 145 
     | 
    
         
            +
                    for pat in patterns:
         
     | 
| 
      
 146 
     | 
    
         
            +
                        m = re.search(pat, text, flags=re.IGNORECASE | re.DOTALL)
         
     | 
| 
      
 147 
     | 
    
         
            +
                        if m:
         
     | 
| 
      
 148 
     | 
    
         
            +
                            got_url = m.group(1)
         
     | 
| 
      
 149 
     | 
    
         
            +
                            # 规范化为绝对 URL
         
     | 
| 
      
 150 
     | 
    
         
            +
                            if got_url.startswith("//"):
         
     | 
| 
      
 151 
     | 
    
         
            +
                                return "https:" + got_url
         
     | 
| 
      
 152 
     | 
    
         
            +
                            if got_url.startswith("http://") or got_url.startswith("https://"):
         
     | 
| 
      
 153 
     | 
    
         
            +
                                return got_url
         
     | 
| 
      
 154 
     | 
    
         
            +
                            # 其余按相对路径处理
         
     | 
| 
      
 155 
     | 
    
         
            +
                            return urljoin(self.base_url + "/", got_url.lstrip("/"))
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
      
 157 
     | 
    
         
            +
                    return None
         
     | 
| 
      
 158 
     | 
    
         
            +
             
     | 
| 
      
 159 
     | 
    
         
            +
                def get_pdf_url(self):
         
     | 
| 
      
 160 
     | 
    
         
            +
                    print("[bold #E6E6FA]-" * 120)
         
     | 
| 
      
 161 
     | 
    
         
            +
                    print(f"DOI: {self.doi}")
         
     | 
| 
      
 162 
     | 
    
         
            +
                    print(f"Requesting: {self.url}...")
         
     | 
| 
      
 163 
     | 
    
         
            +
                    try:
         
     | 
| 
      
 164 
     | 
    
         
            +
                        # 使用较小的超时时间避免长时间阻塞
         
     | 
| 
      
 165 
     | 
    
         
            +
                        response = requests.get(self.url, headers=self.headers, timeout=self.timeout_html)
         
     | 
| 
      
 166 
     | 
    
         
            +
                        if response.status_code == 200:
         
     | 
| 
      
 167 
     | 
    
         
            +
                            self.cookies = response.cookies
         
     | 
| 
      
 168 
     | 
    
         
            +
                            text = response.text
         
     | 
| 
      
 169 
     | 
    
         
            +
                            # 去除转义反斜杠,提升正则匹配成功率
         
     | 
| 
      
 170 
     | 
    
         
            +
                            text = text.replace("\\", "")
         
     | 
| 
      
 171 
     | 
    
         
            +
             
     | 
| 
      
 172 
     | 
    
         
            +
                            self.pdf_url = self._extract_pdf_url_from_html(text)
         
     | 
| 
      
 173 
     | 
    
         
            +
                            if self.pdf_url:
         
     | 
| 
      
 174 
     | 
    
         
            +
                                if self.debug:
         
     | 
| 
      
 175 
     | 
    
         
            +
                                    print(f"Found PDF link: {self.pdf_url}")
         
     | 
| 
      
 176 
     | 
    
         
            +
                                else:
         
     | 
| 
      
 177 
     | 
    
         
            +
                                    print(f"Found PDF link (masked): .../{Path(self.pdf_url).name}")
         
     | 
| 
      
 178 
     | 
    
         
            +
                            else:
         
     | 
| 
      
 179 
     | 
    
         
            +
                                print(
         
     | 
| 
      
 180 
     | 
    
         
            +
                                    f"[bold #AFEEEE]The website {self.url_list[self.url_index]} does not expose a detectable PDF link (pattern mismatch)."
         
     | 
| 
      
 181 
     | 
    
         
            +
                                )
         
     | 
| 
      
 182 
     | 
    
         
            +
                                self.try_times = self.try_times_each_url_max + 1
         
     | 
| 
      
 183 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 184 
     | 
    
         
            +
                            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
         
     | 
| 
      
 185 
     | 
    
         
            +
                            print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not include the PDF file (HTTP error).")
         
     | 
| 
      
 186 
     | 
    
         
            +
                            self.try_times = self.try_times_each_url_max + 1
         
     | 
| 
      
 187 
     | 
    
         
            +
                    except Exception as e:
         
     | 
| 
      
 188 
     | 
    
         
            +
                        print(f"Failed to retrieve the webpage. Error: {e}")
         
     | 
| 
      
 189 
     | 
    
         
            +
                        self.try_times = self.try_times_each_url_max + 1
         
     | 
| 
      
 190 
     | 
    
         
            +
             
     | 
| 
      
 191 
     | 
    
         
            +
                def url_iterate(self):
         
     | 
| 
      
 192 
     | 
    
         
            +
                    if self.url_index >= len(self.url_list):
         
     | 
| 
      
 193 
     | 
    
         
            +
                        return
         
     | 
| 
      
 194 
     | 
    
         
            +
                    url = self.url_list[self.url_index]
         
     | 
| 
      
 195 
     | 
    
         
            +
                    self.base_url = url
         
     | 
| 
      
 196 
     | 
    
         
            +
                    self.url = url + "/" + self.doi
         
     | 
| 
      
 197 
     | 
    
         
            +
                    self.get_pdf_url()
         
     | 
| 
      
 198 
     | 
    
         
            +
                    # for url in self.url_list:
         
     | 
| 
      
 199 
     | 
    
         
            +
                    #     self.url = url + self.doi
         
     | 
| 
      
 200 
     | 
    
         
            +
                    #     self.get_pdf_url()
         
     | 
| 
      
 201 
     | 
    
         
            +
                    #     if self.pdf_url:
         
     | 
| 
      
 202 
     | 
    
         
            +
                    #         break
         
     | 
| 
      
 203 
     | 
    
         
            +
             
     | 
| 
      
 204 
     | 
    
         
            +
                def write_wrong_record(self):
         
     | 
| 
      
 205 
     | 
    
         
            +
                    # 先读取txt中的内容,如果已经存在则不再写入
         
     | 
| 
      
 206 
     | 
    
         
            +
                    if self.wrong_record_file.exists():
         
     | 
| 
      
 207 
     | 
    
         
            +
                        with open(self.wrong_record_file, "r") as f:
         
     | 
| 
      
 208 
     | 
    
         
            +
                            lines = f.readlines()
         
     | 
| 
      
 209 
     | 
    
         
            +
                        if self.doi in lines:
         
     | 
| 
      
 210 
     | 
    
         
            +
                            return
         
     | 
| 
      
 211 
     | 
    
         
            +
                    with open(self.wrong_record_file, "a") as f:
         
     | 
| 
      
 212 
     | 
    
         
            +
                        f.write(self.doi + "\n")
         
     | 
| 
      
 213 
     | 
    
         
            +
             
     | 
| 
      
 214 
     | 
    
         
            +
                def download_pdf(self):
                    """
                    Download the PDF for this DOI to ``self.fpath``, walking through ``self.url_list``.

                    An existing file at ``self.fpath`` is kept (nothing is downloaded) when its size
                    is at least ``self.check_size`` KB; a smaller file is deleted and fetched again.
                    Each candidate URL is attempted until ``self.try_times_each_url_max`` is reached,
                    then the next entry of ``self.url_list`` is used. When the list is exhausted the
                    DOI is recorded through ``self.write_wrong_record()``.

                    Returns:
                        None
                    """
                    if self.fpath.exists():
                        fsize = _get_file_size(self.fpath, unit="KB")
                        if fsize < self.check_size:
                            # A file below the size threshold is treated as a broken download.
                            os.remove(self.fpath)
                            print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
                        else:
                            print("[bold #E6E6FA]-" * 120)
                            print(f"[bold purple]The PDF file {self.fpath} already exists.")
                            return

                    self.url_index = 0
                    already_downloaded = False
                    self.try_times = 0
                    while not already_downloaded:
                        # NOTE(review): url_iterate() is defined outside this block; it is presumed
                        # to refresh self.pdf_url for the current self.url_index -- confirm.
                        self.url_iterate()
                        if not self.pdf_url:
                            # No PDF link for this URL: move on to the next one with a fresh counter.
                            self.url_index += 1
                            if self.url_index >= len(self.url_list):
                                print("Failed to download the PDF file.")
                                self.write_wrong_record()
                                return
                            self.try_times = 0
                            continue

                        self.try_times += 1
                        if self.try_times > self.try_times_each_url_max:
                            self.url_index += 1
                            if self.url_index >= len(self.url_list):
                                self.write_wrong_record()
                                return

                        print(f"Downloading: {self.fname}...")
                        try:
                            response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=self.timeout_pdf)
                            if response.status_code == 200:
                                with open(self.fpath, "wb") as f:
                                    f.write(response.content)
                                fsize = _get_file_size(self.fpath, unit="KB")
                                if fsize < self.check_size:
                                    # Too small to be a real PDF: discard it and let the loop retry.
                                    os.remove(self.fpath)
                                    print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
                                else:
                                    print(f"[bold green]Sucessful to download {self.fpath}")
                                    already_downloaded = True
                            else:
                                # Push the counter past the limit so the check after the sleep
                                # advances to the next URL.
                                self.try_times = self.try_times_each_url_max + 1
                                print(f"Failed to download the PDF file. Status code: {response.status_code}")
                                print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
                        except Exception as e:
                            # Best effort: any request/IO error counts as one failed try.
                            print(f"Failed to download the PDF file. Error: {e}")

                        # Pause between requests (kept after a success as well, as before).
                        time.sleep(self.sleep)

                        if already_downloaded:
                            # Fix: a success on the last allowed try used to fall through to the
                            # bookkeeping below, which printed "Failed"/"Try another URL" and could
                            # record the DOI as failed even though the file was saved.
                            return

                        if self.try_times >= self.try_times_each_url_max:
                            self.url_index += 1
                            if self.url_index >= len(self.url_list):
                                print("\n[bold #CD5C5C]Failed to download the PDF file.")
                                self.write_wrong_record()
                                return
                            if self.try_times == self.try_times_each_url_max:
                                print(f"Tried {self.try_times} times for {self.url_list[self.url_index-1]}.")
                                print("Try another URL...")
         
     | 
| 
      
 277 
     | 
    
         
            +
             
     | 
| 
      
 278 
     | 
    
         
            +
             
     | 
| 
      
 279 
     | 
    
         
            +
            def _read_excel(file, col_name=r"DOI"):
                """
                Read one column of an Excel file and return its usable cells as a list.

                Parameters:
                    file: path (or anything ``pd.read_excel`` accepts) of the Excel file.
                    col_name: str, name of the column to extract. Default is 'DOI'.

                Returns:
                    list, the column values in sheet order, without the entries whose
                    string form is "nan".
                """
                column_values = pd.read_excel(file)[col_name].tolist()
                kept = []
                for cell in column_values:
                    # Skip entries that stringify to "nan" (the filter the reader relies on
                    # to discard missing cells).
                    if str(cell) == "nan":
                        continue
                    kept.append(cell)
                return kept
         
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
             
     | 
| 
      
 287 
     | 
    
         
            +
            def _read_txt(file):
                """
                Read a text file and return its non-blank lines, stripped of surrounding whitespace.

                Parameters:
                    file: path of the text file (one entry per line).

                Returns:
                    list of str, in file order; lines that are empty after stripping are skipped.
                """
                entries = []
                with open(file, "r") as handle:
                    for raw in handle:
                        text = raw.strip()
                        # Blank and whitespace-only lines carry no entry.
                        if text:
                            entries.append(text)
                return entries
         
     | 
| 
      
 293 
     | 
    
         
            +
             
     | 
| 
      
 294 
     | 
    
         
            +
             
     | 
| 
      
 295 
     | 
    
         
            +
            def download5doi(
                store_path=None,
                doi_list=None,
                txt_file=None,
                excel_file=None,
                col_name=r"DOI",
                *,
                probe_mirrors: bool = True,
                min_size_kb: int = 50,
                timeout_html: int = 15,
                timeout_pdf: int = 30,
                tries_each_url: int = 3,
                sleep_secs: int = 5,
                force: bool = False,
                debug: bool = False,
            ):
                """
                Description:
                    Download PDF files by DOI.

                Parameters:
                    store_path: str, The path to store the PDF files. Defaults to the current
                        working directory; the directory is created when missing.
                    doi_list: list or str, The list of DOIs.
                    txt_file: str, The path of the txt file that contains the DOIs.
                        When given it replaces doi_list.
                    excel_file: str, The path of the excel file that contains the DOIs.
                        When given it replaces both doi_list and txt_file.
                    col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.
                    probe_mirrors: bool, when True, _ensure_alive_mirrors() is called on each
                        downloader before the download starts.
                    min_size_kb, timeout_html, timeout_pdf, tries_each_url, sleep_secs, debug:
                        forwarded unchanged to _Downloader under the same keyword names.
                    force: bool, when False (default) the file wrong_record.txt inside store_path
                        is removed before downloading; when True it is left in place.

                Returns:
                    None

                Example:
                    download5doi(doi_list='10.3389/feart.2021.698876')
                    download5doi(store_path='I:\\Delete\\ref_pdf', doi_list='10.3389/feart.2021.698876')
                    download5doi(store_path='I:\\Delete\\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
                    download5doi(store_path='I:\\Delete\\ref_pdf', txt_file='I:\\Delete\\ref_pdf\\wrong_record.txt')
                    download5doi(store_path='I:\\Delete\\ref_pdf', excel_file='I:\\Delete\\ref_pdf\\wrong_record.xlsx')
                    download5doi(store_path='I:\\Delete\\ref_pdf', excel_file='I:\\Delete\\ref_pdf\\wrong_record.xlsx', col_name='DOI')
                """
                store_dir = Path(str(store_path)) if store_path else Path.cwd()
                store_dir.mkdir(parents=True, exist_ok=True)
                store_path = str(store_dir)

                # Later sources override earlier ones: doi_list < txt_file < excel_file.
                if doi_list:
                    doi_list = ensure_list(doi_list)
                if txt_file:
                    doi_list = _read_txt(txt_file)
                if excel_file:
                    doi_list = _read_excel(excel_file, col_name)

                # Clean and de-duplicate while keeping the original order.
                # `doi_list or []` fixes the TypeError raised when no DOI source was supplied.
                stripped = (str(item).strip() for item in (doi_list or []))
                doi_list = list(dict.fromkeys(doi for doi in stripped if doi))
                if not doi_list:
                    print("[bold yellow]No DOI provided; nothing to download.")
                    return

                # Start from a clean wrong_record unless the caller asked to keep it.
                if not force:
                    remove(Path(store_path) / "wrong_record.txt")
                print(f"Downloading {len(doi_list)} PDF files...")
                for doi in track(doi_list, description="Downloading..."):
                    dl = _Downloader(
                        doi,
                        store_path,
                        min_size_kb=min_size_kb,
                        timeout_html=timeout_html,
                        timeout_pdf=timeout_pdf,
                        sleep_secs=sleep_secs,
                        tries_each_url=tries_each_url,
                        debug=debug,
                    )
                    # Optionally probe the mirrors before downloading.
                    if probe_mirrors:
                        dl._ensure_alive_mirrors()
                    dl.download_pdf()
         
     | 
| 
      
 369 
     | 
    
         
            +
             
     | 
| 
      
 370 
     | 
    
         
            +
             
     | 
| 
      
 371 
     | 
    
         
            +
            # ------------------------------- 合规替代方案(Open Access 优先) -------------------------------
         
     | 
| 
      
 372 
     | 
    
         
            +
            def _get_oa_pdf_url_from_unpaywall(doi: str, email: str | None) -> str | None:
         
     | 
| 
      
 373 
     | 
    
         
            +
                """
         
     | 
| 
      
 374 
     | 
    
         
            +
                通过 Unpaywall 获取可开放访问的 PDF 链接(若存在)。
         
     | 
| 
      
 375 
     | 
    
         
            +
                需要提供 email(Unpaywall 要求标识邮件)。
         
     | 
| 
      
 376 
     | 
    
         
            +
                返回 PDF URL 或 None。
         
     | 
| 
      
 377 
     | 
    
         
            +
                """
         
     | 
| 
      
 378 
     | 
    
         
            +
                if not email:
         
     | 
| 
      
 379 
     | 
    
         
            +
                    print("[bold yellow]Unpaywall 需要 email 参数;请提供 email 以查询 OA 链接。")
         
     | 
| 
      
 380 
     | 
    
         
            +
                    return None
         
     | 
| 
      
 381 
     | 
    
         
            +
                api = f"https://api.unpaywall.org/v2/{doi}?email={email}"
         
     | 
| 
      
 382 
     | 
    
         
            +
                try:
         
     | 
| 
      
 383 
     | 
    
         
            +
                    r = requests.get(api, timeout=15)
         
     | 
| 
      
 384 
     | 
    
         
            +
                    if r.status_code != 200:
         
     | 
| 
      
 385 
     | 
    
         
            +
                        print(f"[bold yellow]Unpaywall 查询失败: HTTP {r.status_code}")
         
     | 
| 
      
 386 
     | 
    
         
            +
                        return None
         
     | 
| 
      
 387 
     | 
    
         
            +
                    data = r.json()
         
     | 
| 
      
 388 
     | 
    
         
            +
                    loc = data.get("best_oa_location") or {}
         
     | 
| 
      
 389 
     | 
    
         
            +
                    url_for_pdf = loc.get("url_for_pdf") or loc.get("url")
         
     | 
| 
      
 390 
     | 
    
         
            +
                    if url_for_pdf and url_for_pdf.lower().endswith(".pdf"):
         
     | 
| 
      
 391 
     | 
    
         
            +
                        return url_for_pdf
         
     | 
| 
      
 392 
     | 
    
         
            +
                    # 有些 OA 链接是落在 landing page,再尝试从记录的所有位置挑选 pdf
         
     | 
| 
      
 393 
     | 
    
         
            +
                    for k in ("oa_locations", "oa_location"):
         
     | 
| 
      
 394 
     | 
    
         
            +
                        entries = data.get(k) or []
         
     | 
| 
      
 395 
     | 
    
         
            +
                        if isinstance(entries, dict):
         
     | 
| 
      
 396 
     | 
    
         
            +
                            entries = [entries]
         
     | 
| 
      
 397 
     | 
    
         
            +
                        for e in entries:
         
     | 
| 
      
 398 
     | 
    
         
            +
                            u = e.get("url_for_pdf") or e.get("url")
         
     | 
| 
      
 399 
     | 
    
         
            +
                            if u and ".pdf" in u.lower():
         
     | 
| 
      
 400 
     | 
    
         
            +
                                return u
         
     | 
| 
      
 401 
     | 
    
         
            +
                except Exception as e:
         
     | 
| 
      
 402 
     | 
    
         
            +
                    print(f"[bold yellow]Unpaywall 查询异常: {e}")
         
     | 
| 
      
 403 
     | 
    
         
            +
                return None
         
     | 
| 
      
 404 
     | 
    
         
            +
             
     | 
| 
      
 405 
     | 
    
         
            +
             
     | 
| 
      
 406 
     | 
    
         
            +
            def _download_pdf_from_url(url: str, dest_path: Path, headers: dict | None = None) -> bool:
         
     | 
| 
      
 407 
     | 
    
         
            +
                """
         
     | 
| 
      
 408 
     | 
    
         
            +
                给定合法的 PDF 下载 URL,下载保存到 dest_path。
         
     | 
| 
      
 409 
     | 
    
         
            +
                返回 True/False 表示是否成功。
         
     | 
| 
      
 410 
     | 
    
         
            +
                """
         
     | 
| 
      
 411 
     | 
    
         
            +
                headers = headers or {"User-Agent": str(get_ua()), "Accept": "application/pdf"}
         
     | 
| 
      
 412 
     | 
    
         
            +
                try:
         
     | 
| 
      
 413 
     | 
    
         
            +
                    with requests.get(url, headers=headers, stream=True, timeout=30) as r:
         
     | 
| 
      
 414 
     | 
    
         
            +
                        if r.status_code != 200 or "application/pdf" not in r.headers.get("Content-Type", "").lower():
         
     | 
| 
      
 415 
     | 
    
         
            +
                            # 仍可能是 PDF(某些服务器未正确设置头),尝试保存但标注提示
         
     | 
| 
      
 416 
     | 
    
         
            +
                            if r.status_code != 200:
         
     | 
| 
      
 417 
     | 
    
         
            +
                                print(f"[bold yellow]下载失败: HTTP {r.status_code}")
         
     | 
| 
      
 418 
     | 
    
         
            +
                                return False
         
     | 
| 
      
 419 
     | 
    
         
            +
                        dest_path.parent.mkdir(parents=True, exist_ok=True)
         
     | 
| 
      
 420 
     | 
    
         
            +
                        with open(dest_path, "wb") as f:
         
     | 
| 
      
 421 
     | 
    
         
            +
                            for chunk in r.iter_content(chunk_size=8192):
         
     | 
| 
      
 422 
     | 
    
         
            +
                                if chunk:
         
     | 
| 
      
 423 
     | 
    
         
            +
                                    f.write(chunk)
         
     | 
| 
      
 424 
     | 
    
         
            +
                    return True
         
     | 
| 
      
 425 
     | 
    
         
            +
                except Exception as e:
         
     | 
| 
      
 426 
     | 
    
         
            +
                    print(f"[bold yellow]下载异常: {e}")
         
     | 
| 
      
 427 
     | 
    
         
            +
                    return False
         
     | 
| 
      
 428 
     | 
    
         
            +
             
     | 
| 
      
 429 
     | 
    
         
            +
             
     | 
| 
      
 430 
     | 
    
         
            +
            def download5doi_via_unpaywall(
         
     | 
| 
      
 431 
     | 
    
         
            +
                store_path=None,
         
     | 
| 
      
 432 
     | 
    
         
            +
                doi_list=None,
         
     | 
| 
      
 433 
     | 
    
         
            +
                txt_file=None,
         
     | 
| 
      
 434 
     | 
    
         
            +
                excel_file=None,
         
     | 
| 
      
 435 
     | 
    
         
            +
                col_name=r"DOI",
         
     | 
| 
      
 436 
     | 
    
         
            +
                email: str | None = None,
         
     | 
| 
      
 437 
     | 
    
         
            +
            ):
         
     | 
| 
      
 438 
     | 
    
         
            +
                """
         
     | 
| 
      
 439 
     | 
    
         
            +
                优先使用 Unpaywall 获取开放访问(OA)的 PDF 并下载,避免非合规站点。
         
     | 
| 
      
 440 
     | 
    
         
            +
             
     | 
| 
      
 441 
     | 
    
         
            +
                参数:
         
     | 
| 
      
 442 
     | 
    
         
            +
                    store_path: 保存目录
         
     | 
| 
      
 443 
     | 
    
         
            +
                    doi_list/txt_file/excel_file/col_name: 同 download5doi
         
     | 
| 
      
 444 
     | 
    
         
            +
                    email: 用于访问 Unpaywall API 的邮箱(必填,否则无法查询)
         
     | 
| 
      
 445 
     | 
    
         
            +
                """
         
     | 
| 
      
 446 
     | 
    
         
            +
                if not store_path:
         
     | 
| 
      
 447 
     | 
    
         
            +
                    store_path = Path.cwd()
         
     | 
| 
      
 448 
     | 
    
         
            +
                else:
         
     | 
| 
      
 449 
     | 
    
         
            +
                    store_path = Path(str(store_path))
         
     | 
| 
      
 450 
     | 
    
         
            +
                store_path.mkdir(parents=True, exist_ok=True)
         
     | 
| 
      
 451 
     | 
    
         
            +
             
     | 
| 
      
 452 
     | 
    
         
            +
                if doi_list:
         
     | 
| 
      
 453 
     | 
    
         
            +
                    doi_list = ensure_list(doi_list)
         
     | 
| 
      
 454 
     | 
    
         
            +
                if txt_file:
         
     | 
| 
      
 455 
     | 
    
         
            +
                    doi_list = _read_txt(txt_file)
         
     | 
| 
      
 456 
     | 
    
         
            +
                if excel_file:
         
     | 
| 
      
 457 
     | 
    
         
            +
                    doi_list = _read_excel(excel_file, col_name)
         
     | 
| 
      
 458 
     | 
    
         
            +
             
     | 
| 
      
 459 
     | 
    
         
            +
                if not doi_list:
         
     | 
| 
      
 460 
     | 
    
         
            +
                    print("[bold yellow]未提供 DOI 列表。")
         
     | 
| 
      
 461 
     | 
    
         
            +
                    return
         
     | 
| 
      
 462 
     | 
    
         
            +
             
     | 
| 
      
 463 
     | 
    
         
            +
                print(f"[bold cyan]通过 Unpaywall 尝试下载 {len(doi_list)} 篇 OA PDF...")
         
     | 
| 
      
 464 
     | 
    
         
            +
                ok, miss = 0, 0
         
     | 
| 
      
 465 
     | 
    
         
            +
                for doi in track(doi_list, description="OA downloading..."):
         
     | 
| 
      
 466 
     | 
    
         
            +
                    # 规范化文件名
         
     | 
| 
      
 467 
     | 
    
         
            +
                    fname = re.sub(r'[/<>:"?*|]', "_", str(doi)) + ".pdf"
         
     | 
| 
      
 468 
     | 
    
         
            +
                    dest = store_path / fname
         
     | 
| 
      
 469 
     | 
    
         
            +
                    if dest.exists() and _get_file_size(dest, unit="KB") > 10:
         
     | 
| 
      
 470 
     | 
    
         
            +
                        ok += 1
         
     | 
| 
      
 471 
     | 
    
         
            +
                        continue
         
     | 
| 
      
 472 
     | 
    
         
            +
             
     | 
| 
      
 473 
     | 
    
         
            +
                    pdf_url = _get_oa_pdf_url_from_unpaywall(str(doi), email=email)
         
     | 
| 
      
 474 
     | 
    
         
            +
                    if not pdf_url:
         
     | 
| 
      
 475 
     | 
    
         
            +
                        miss += 1
         
     | 
| 
      
 476 
     | 
    
         
            +
                        print(f"[bold yellow]未找到 OA PDF: {doi}")
         
     | 
| 
      
 477 
     | 
    
         
            +
                        continue
         
     | 
| 
      
 478 
     | 
    
         
            +
             
     | 
| 
      
 479 
     | 
    
         
            +
                    if _download_pdf_from_url(pdf_url, dest):
         
     | 
| 
      
 480 
     | 
    
         
            +
                        size_kb = _get_file_size(dest, unit="KB")
         
     | 
| 
      
 481 
     | 
    
         
            +
                        if isinstance(size_kb, (int, float)) and size_kb < 10:
         
     | 
| 
      
 482 
     | 
    
         
            +
                            dest.unlink(missing_ok=True)
         
     | 
| 
      
 483 
     | 
    
         
            +
                            miss += 1
         
     | 
| 
      
 484 
     | 
    
         
            +
                            print(f"[bold yellow]文件过小,疑似异常,已删除: {dest}")
         
     | 
| 
      
 485 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 486 
     | 
    
         
            +
                            ok += 1
         
     | 
| 
      
 487 
     | 
    
         
            +
                            print(f"[bold green]已下载: {dest}")
         
     | 
| 
      
 488 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 489 
     | 
    
         
            +
                        miss += 1
         
     | 
| 
      
 490 
     | 
    
         
            +
             
     | 
| 
      
 491 
     | 
    
         
            +
                print(f"[bold]完成。成功 {ok} 篇,未获取 {miss} 篇(可能无 OA 版本或需机构访问)。")
         
     | 
| 
      
 492 
     | 
    
         
            +
             
     | 
| 
      
 493 
     | 
    
         
            +
             
     | 
| 
      
 494 
     | 
    
         
            +
            if __name__ == "__main__":
         
     | 
| 
      
 495 
     | 
    
         
            +
                store_path = r"F:\AAA-Delete\DOI_Reference\5\pdf"
         
     | 
| 
      
 496 
     | 
    
         
            +
                excel_file = r"F:\AAA-Delete\DOI_Reference\5\savedrecs.xls"
         
     | 
| 
      
 497 
     | 
    
         
            +
                download5doi(store_path, excel_file=excel_file)
         
     | 
| 
         @@ -0,0 +1,108 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            from rich import print
         
     | 
| 
      
 2 
     | 
    
         
            +
            import time
         
     | 
| 
      
 3 
     | 
    
         
            +
            import os
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            __all__ = ["os_command", "get_queue_node", "query_queue", "running_jobs", "submit_job"]
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            # 负责执行命令并返回输出
         
     | 
| 
      
 9 
     | 
    
         
            +
            def os_command(cmd):
                """Run *cmd* through the shell and return its captured standard output.

                Args:
                    cmd: Command line handed to the shell; because ``shell=True`` is
                        used, pipes and other shell syntax are interpreted.

                Returns:
                    The text written to stdout when the exit status is 0, otherwise
                    ``None``. Anything written to stderr is printed in both cases.
                """
                import subprocess

                print(f'🔍 执行命令: {cmd}')
                completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)

                # Show stderr even for successful commands so warnings are not lost.
                err_text = completed.stderr
                if err_text:
                    print(f'❌ 错误输出: {err_text.strip()}')

                # Exit status 0 is the only success case; everything else yields None.
                exit_code = completed.returncode
                if exit_code == 0:
                    return completed.stdout
                print(f'❌ 命令执行失败,退出码: {exit_code}')
                return None
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
            # 返回“队列名:节点数”的字典
         
     | 
| 
      
 25 
     | 
    
         
            +
            def get_queue_node():
                """Return a mapping of queue (partition) name to its idle node count.

                Runs ``sinfo | grep "idle"`` through ``os_command`` and parses every
                line whose first field is the queue name, fourth field is an integer
                node count and fifth field is exactly ``idle``.

                Returns:
                    dict: ``{queue_name: node_count}``. Empty when the command fails,
                    prints nothing, or no line matches the expected layout.
                """
                import re

                output = os_command('sinfo | grep "idle"')
                if not output:  # command failed or produced no output
                    return {}

                # Compiled once instead of on every iteration.
                # - ``(\S+?)\*?`` drops the trailing "*" that sinfo appends to the
                #   default partition, so the key matches the plain queue name.
                # - ``(?:\s+.*)?`` makes the node list optional, so a line that ends
                #   right after "idle" is still accepted.
                idle_line = re.compile(r"^(\S+?)\*?\s+\S+\s+\S+\s+(\d+)\s+idle(?:\s+.*)?$")

                queue_node_dict = {}
                for raw_line in output.splitlines():
                    match = idle_line.match(raw_line.strip())
                    if match:  # blank or unrelated lines simply do not match
                        queue_node_dict[match.group(1)] = int(match.group(2))

                return queue_node_dict
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
            def query_queue(need_node=1, queue_list =['dcu','bigmem','cpu_parallel','cpu_single']):
                """Pick the first queue in *queue_list* that has enough idle nodes.

                Args:
                    need_node: Minimum number of idle nodes the queue must report.
                    queue_list: Queue names to try, in order of preference.

                Returns:
                    The name of the first queue reported by ``get_queue_node`` with at
                    least *need_node* idle nodes, or ``None`` when none qualifies.
                """
                idle_nodes = get_queue_node()
                # Lazily walk the preference list; next() stops at the first hit.
                suitable = (
                    name
                    for name in queue_list
                    if name in idle_nodes and idle_nodes[name] >= need_node
                )
                return next(suitable, None)
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
            def running_jobs():
                """Return the first column of every ``squeue`` row after the header.

                Returns:
                    list[str]: One entry per non-blank output line following the first
                    (header) line; empty when ``squeue`` prints nothing.
                """
                # The context manager closes the pipe instead of leaving it to the
                # garbage collector.
                with os.popen('squeue') as pipe:
                    status = pipe.read()

                # Skip the header row; rows that are empty or whitespace-only split
                # into an empty list and are ignored rather than raising IndexError.
                rows = (line.split() for line in status.splitlines()[1:])
                return [fields[0] for fields in rows if fields]
         
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
      
 71 
     | 
    
         
            +
            def submit_job(working_dir, script_tmp='run.slurm', script_run='run.slurm', need_node=1, queue_tmp='<queue_name>', queue_list=['dcu', 'bigmem', 'cpu_parallel', 'cpu_single'], max_job=38):
                """Wait for capacity, then submit *script_run* with ``sbatch``.

                The process changes directory to *working_dir* and then loops until a
                submission succeeds:

                1. If ``running_jobs()`` reports *max_job* or more entries, wait 60 s.
                2. If ``query_queue`` finds no queue with *need_node* idle nodes,
                   wait 30 s.
                3. Otherwise call ``replace_content`` with the *queue_tmp* -> queue
                   mapping, *script_tmp*, *working_dir* and *script_run*, then run
                   ``sbatch``. Missing output, or output containing "error"/"failed"
                   (case-insensitive), triggers a 30 s wait and a retry.

                After a successful submission the function sleeps 10 s and returns
                ``None``.

                Args:
                    working_dir: Directory to switch into before submitting.
                    script_tmp: Template script name passed to ``replace_content``.
                    script_run: Script name passed to ``replace_content`` and ``sbatch``.
                    need_node: Minimum idle nodes required in the chosen queue.
                    queue_tmp: Placeholder text replaced by the chosen queue name.
                    queue_list: Queue names to try, in order of preference.
                    max_job: Submission is delayed while this many jobs are listed.
                """
                from .oa_file import replace_content
                import datetime

                os.chdir(working_dir)
                print(f'切换工作目录到: {working_dir}')

                while True:
                    active_ids = running_jobs()

                    # Guard 1: too many jobs already listed -> back off for a minute.
                    if active_ids and len(active_ids) >= max_job:
                        print(f'当前系统任务数:{len(active_ids)},等待60秒后重试!')
                        time.sleep(60)
                        continue

                    # Guard 2: no queue currently offers enough idle nodes.
                    queue = query_queue(need_node=need_node, queue_list=queue_list)
                    if not queue:
                        print('没有足够的计算资源,等待30秒后重试!')
                        time.sleep(30)
                        continue

                    replace_content(script_tmp, {f'{queue_tmp}': f"{queue}"}, False, f'{working_dir}', script_run)
                    print(f'找到计算资源,提交任务,队列:{queue}')
                    print(f'Time: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
                    content_sub = os_command(f"sbatch {script_run}")

                    # Guard 3: os_command returns None on failure, so test truthiness
                    # before doing any substring checks.
                    if not content_sub:
                        print('提交任务命令没有返回输出或返回了错误,等待30秒后重试!')
                        time.sleep(30)
                        continue

                    # Guard 4: the command exited cleanly but its output reports a problem.
                    lowered = content_sub.lower()
                    if 'error' in lowered or 'failed' in lowered:
                        print('提交任务时出现错误(从输出检测到 error/failed),等待30秒后重试!')
                        print(f'命令输出: {content_sub.strip()}')
                        time.sleep(30)
                        continue

                    print(f'提交任务成功,{content_sub.strip()}')
                    break

                print('等待10秒后,继续检查任务状态!')
                time.sleep(10)
         
     | 
| 
      
 106 
     | 
    
         
            +
             
     | 
| 
      
 107 
     | 
    
         
            +
            if __name__ == "__main__":
         
     | 
| 
      
 108 
     | 
    
         
            +
                pass
         
     | 
| 
         @@ -18,7 +18,7 @@ URL = "https://github.com/Industry-Pays/OAFuncs" 
     | 
|
| 
       18 
18 
     | 
    
         
             
            EMAIL = "liukun0312@stu.ouc.edu.cn"
         
     | 
| 
       19 
19 
     | 
    
         
             
            AUTHOR = "Kun Liu"
         
     | 
| 
       20 
20 
     | 
    
         
             
            REQUIRES_PYTHON = ">=3.10.0"  # 2025/03/13
         
     | 
| 
       21 
     | 
    
         
            -
            VERSION = "0.0.98. 
     | 
| 
      
 21 
     | 
    
         
            +
            VERSION = "0.0.98.46"
         
     | 
| 
       22 
22 
     | 
    
         | 
| 
       23 
23 
     | 
    
         
             
            # What packages are required for this module to be executed?
         
     | 
| 
       24 
24 
     | 
    
         
             
            REQUIRED = [
         
     | 
| 
         @@ -1,273 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            import os
         
     | 
| 
       2 
     | 
    
         
            -
            import re
         
     | 
| 
       3 
     | 
    
         
            -
            import time
         
     | 
| 
       4 
     | 
    
         
            -
            from pathlib import Path
         
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
            import pandas as pd
         
     | 
| 
       7 
     | 
    
         
            -
            import requests
         
     | 
| 
       8 
     | 
    
         
            -
            from rich import print
         
     | 
| 
       9 
     | 
    
         
            -
            from rich.progress import track
         
     | 
| 
       10 
     | 
    
         
            -
            from oafuncs.oa_down.user_agent import get_ua
         
     | 
| 
       11 
     | 
    
         
            -
            from oafuncs.oa_file import remove
         
     | 
| 
       12 
     | 
    
         
            -
            from oafuncs.oa_data import ensure_list
         
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
            __all__ = ["download5doi"]
         
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
            def _get_file_size(file_path, unit="KB"):
         
     | 
| 
       18 
     | 
    
         
            -
                # 检查文件是否存在
         
     | 
| 
       19 
     | 
    
         
            -
                if not os.path.exists(file_path):
         
     | 
| 
       20 
     | 
    
         
            -
                    return "文件不存在"
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
                # 获取文件大小(字节)
         
     | 
| 
       23 
     | 
    
         
            -
                file_size = os.path.getsize(file_path)
         
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
                # 单位转换字典
         
     | 
| 
       26 
     | 
    
         
            -
                unit_dict = {
         
     | 
| 
       27 
     | 
    
         
            -
                    "PB": 1024**5,
         
     | 
| 
       28 
     | 
    
         
            -
                    "TB": 1024**4,
         
     | 
| 
       29 
     | 
    
         
            -
                    "GB": 1024**3,
         
     | 
| 
       30 
     | 
    
         
            -
                    "MB": 1024**2,
         
     | 
| 
       31 
     | 
    
         
            -
                    "KB": 1024,
         
     | 
| 
       32 
     | 
    
         
            -
                }
         
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
                # 检查传入的单位是否合法
         
     | 
| 
       35 
     | 
    
         
            -
                if unit not in unit_dict:
         
     | 
| 
       36 
     | 
    
         
            -
                    return "单位不合法,请选择PB、TB、GB、MB、KB中的一个"
         
     | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
       38 
     | 
    
         
            -
                # 转换文件大小到指定单位
         
     | 
| 
       39 
     | 
    
         
            -
                converted_size = file_size / unit_dict[unit]
         
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
                return converted_size
         
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
       44 
     | 
    
         
            -
            class _Downloader:
         
     | 
| 
       45 
     | 
    
         
            -
                """
         
     | 
| 
       46 
     | 
    
         
            -
                根据doi下载文献pdf
         
     | 
| 
       47 
     | 
    
         
            -
                """
         
     | 
| 
       48 
     | 
    
         
            -
             
     | 
| 
       49 
     | 
    
         
            -
                def __init__(self, doi, store_path):
         
     | 
| 
       50 
     | 
    
         
            -
                    self.url_list = [
         
     | 
| 
       51 
     | 
    
         
            -
                        r"https://sci-hub.se",
         
     | 
| 
       52 
     | 
    
         
            -
                        r"https://sci-hub.ren",
         
     | 
| 
       53 
     | 
    
         
            -
                        r"https://sci-hub.st",
         
     | 
| 
       54 
     | 
    
         
            -
                        r"https://sci-hub.ru", # 最好用的一个网站
         
     | 
| 
       55 
     | 
    
         
            -
                        # ------------------------------------- 以下网站没验证
         
     | 
| 
       56 
     | 
    
         
            -
                        r"https://sci-hub.wf",
         
     | 
| 
       57 
     | 
    
         
            -
                        r"https://sci-hub.yt",
         
     | 
| 
       58 
     | 
    
         
            -
                        r"https://sci-hub.ee",
         
     | 
| 
       59 
     | 
    
         
            -
                        r"https://sci-hub.cat",
         
     | 
| 
       60 
     | 
    
         
            -
                        r"https://sci-hub.in",
         
     | 
| 
       61 
     | 
    
         
            -
                        r"https://www.pismin.com",
         
     | 
| 
       62 
     | 
    
         
            -
                        r"https://sci-hub.vkif.top",
         
     | 
| 
       63 
     | 
    
         
            -
                        r"https://www.bothonce.com",
         
     | 
| 
       64 
     | 
    
         
            -
                        r"https://sci-hub.et-fine.com",
         
     | 
| 
       65 
     | 
    
         
            -
                        r"https://sci-hub.hkvisa.net",
         
     | 
| 
       66 
     | 
    
         
            -
                        # r"https://sci-hub.3800808.com", # 这个只能手动保存
         
     | 
| 
       67 
     | 
    
         
            -
                        r"https://sci-hub.zidianzhan.net",
         
     | 
| 
       68 
     | 
    
         
            -
                        r"https://sci-hub.usualwant.com",
         
     | 
| 
       69 
     | 
    
         
            -
                    ]
         
     | 
| 
       70 
     | 
    
         
            -
                    self.base_url = None
         
     | 
| 
       71 
     | 
    
         
            -
                    self.url = None
         
     | 
| 
       72 
     | 
    
         
            -
                    self.doi = doi
         
     | 
| 
       73 
     | 
    
         
            -
                    self.pdf_url = None
         
     | 
| 
       74 
     | 
    
         
            -
                    self.pdf_path = None
         
     | 
| 
       75 
     | 
    
         
            -
                    self.headers = {"User-Agent": get_ua().encode("utf-8")}
         
     | 
| 
       76 
     | 
    
         
            -
                    # 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
         
     | 
| 
       77 
     | 
    
         
            -
                    # self.fname = doi.replace(r'/', '_') + '.pdf'
         
     | 
| 
       78 
     | 
    
         
            -
                    self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
         
     | 
| 
       79 
     | 
    
         
            -
                    self.store_path = Path(store_path)
         
     | 
| 
       80 
     | 
    
         
            -
                    self.fpath = self.store_path / self.fname
         
     | 
| 
       81 
     | 
    
         
            -
                    self.wrong_record_file = self.store_path / "wrong_record.txt"
         
     | 
| 
       82 
     | 
    
         
            -
                    self.sleep = 5
         
     | 
| 
       83 
     | 
    
         
            -
                    self.cookies = None
         
     | 
| 
       84 
     | 
    
         
            -
                    self.check_size = 50
         
     | 
| 
       85 
     | 
    
         
            -
                    self.url_index = 0
         
     | 
| 
       86 
     | 
    
         
            -
                    self.try_times_each_url_max = 3
         
     | 
| 
       87 
     | 
    
         
            -
                    self.try_times = 0
         
     | 
| 
       88 
     | 
    
         
            -
             
     | 
| 
       89 
     | 
    
         
            -
                def get_pdf_url(self):
         
     | 
| 
       90 
     | 
    
         
            -
                    print("[bold #E6E6FA]-" * 120)
         
     | 
| 
       91 
     | 
    
         
            -
                    print(f"DOI: {self.doi}")
         
     | 
| 
       92 
     | 
    
         
            -
                    print(f"Requesting: {self.url}...")
         
     | 
| 
       93 
     | 
    
         
            -
                    try:
         
     | 
| 
       94 
     | 
    
         
            -
                        response = requests.get(self.url, headers=self.headers)
         
     | 
| 
       95 
     | 
    
         
            -
                        if response.status_code == 200:
         
     | 
| 
       96 
     | 
    
         
            -
                            self.cookies = response.cookies
         
     | 
| 
       97 
     | 
    
         
            -
                            text = response.text.replace("\\", "")
         
     | 
| 
       98 
     | 
    
         
            -
                            # text = text.replace(' ', '')  # It is important to remove the space
         
     | 
| 
       99 
     | 
    
         
            -
                            # print(text)
         
     | 
| 
       100 
     | 
    
         
            -
                            pattern = re.compile(r'onclick = "location.href=\'(.*?\.pdf\?download=true)\'"')
         
     | 
| 
       101 
     | 
    
         
            -
                            match = pattern.search(text)
         
     | 
| 
       102 
     | 
    
         
            -
                            if match:
         
     | 
| 
       103 
     | 
    
         
            -
                                got_url = match.group(1)
         
     | 
| 
       104 
     | 
    
         
            -
                                if r"http" not in got_url:
         
     | 
| 
       105 
     | 
    
         
            -
                                    if got_url[:2] == "//":
         
     | 
| 
       106 
     | 
    
         
            -
                                        self.pdf_url = "https:" + got_url
         
     | 
| 
       107 
     | 
    
         
            -
                                    else:
         
     | 
| 
       108 
     | 
    
         
            -
                                        self.pdf_url = self.base_url + got_url
         
     | 
| 
       109 
     | 
    
         
            -
                                else:
         
     | 
| 
       110 
     | 
    
         
            -
                                    self.pdf_url = got_url
         
     | 
| 
       111 
     | 
    
         
            -
                                print(f"URL: {self.pdf_url}")
         
     | 
| 
       112 
     | 
    
         
            -
                            else:
         
     | 
| 
       113 
     | 
    
         
            -
                                print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
         
     | 
| 
       114 
     | 
    
         
            -
                                self.try_times = self.try_times_each_url_max + 1
         
     | 
| 
       115 
     | 
    
         
            -
                        else:
         
     | 
| 
       116 
     | 
    
         
            -
                            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
         
     | 
| 
       117 
     | 
    
         
            -
                            print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
         
     | 
| 
       118 
     | 
    
         
            -
                            self.try_times = self.try_times_each_url_max + 1
         
     | 
| 
       119 
     | 
    
         
            -
                    except Exception as e:
         
     | 
| 
       120 
     | 
    
         
            -
                        print(f"Failed to retrieve the webpage. Error: {e}")
         
     | 
| 
       121 
     | 
    
         
            -
                        self.try_times = self.try_times_each_url_max + 1
         
     | 
| 
       122 
     | 
    
         
            -
             
     | 
| 
       123 
     | 
    
         
            -
                def url_iterate(self):
         
     | 
| 
       124 
     | 
    
         
            -
                    if self.url_index >= len(self.url_list):
         
     | 
| 
       125 
     | 
    
         
            -
                        return
         
     | 
| 
       126 
     | 
    
         
            -
                    url = self.url_list[self.url_index]
         
     | 
| 
       127 
     | 
    
         
            -
                    self.base_url = url
         
     | 
| 
       128 
     | 
    
         
            -
                    self.url = url + "/" + self.doi
         
     | 
| 
       129 
     | 
    
         
            -
                    self.get_pdf_url()
         
     | 
| 
       130 
     | 
    
         
            -
                    # for url in self.url_list:
         
     | 
| 
       131 
     | 
    
         
            -
                    #     self.url = url + self.doi
         
     | 
| 
       132 
     | 
    
         
            -
                    #     self.get_pdf_url()
         
     | 
| 
       133 
     | 
    
         
            -
                    #     if self.pdf_url:
         
     | 
| 
       134 
     | 
    
         
            -
                    #         break
         
     | 
| 
       135 
     | 
    
         
            -
             
     | 
| 
       136 
     | 
    
         
            -
                def write_wrong_record(self):
         
     | 
| 
       137 
     | 
    
         
            -
                    # 先读取txt中的内容,如果已经存在则不再写入
         
     | 
| 
       138 
     | 
    
         
            -
                    if self.wrong_record_file.exists():
         
     | 
| 
       139 
     | 
    
         
            -
                        with open(self.wrong_record_file, "r") as f:
         
     | 
| 
       140 
     | 
    
         
            -
                            lines = f.readlines()
         
     | 
| 
       141 
     | 
    
         
            -
                        if self.doi in lines:
         
     | 
| 
       142 
     | 
    
         
            -
                            return
         
     | 
| 
       143 
     | 
    
         
            -
                    with open(self.wrong_record_file, "a") as f:
         
     | 
| 
       144 
     | 
    
         
            -
                        f.write(self.doi + "\n")
         
     | 
| 
       145 
     | 
    
         
            -
             
     | 
| 
       146 
     | 
    
         
            -
                def download_pdf(self):
                    """Download the PDF for this record into ``self.fpath``.

                    An existing file of at least ``self.check_size`` KB is kept and the
                    method returns immediately; a smaller one is deleted and fetched again.
                    The entries of ``self.url_list`` are then tried in order, starting at
                    index 0, with at most ``self.try_times_each_url_max`` attempts each.
                    When every entry has been used up, ``self.write_wrong_record()`` is
                    called and the method returns.

                    Side effects:
                        Resets and updates ``self.url_index`` and ``self.try_times``,
                        writes or deletes ``self.fpath``, and sleeps ``self.sleep``
                        seconds after every download attempt.

                    Returns:
                        None
                    """
                    if self.fpath.exists():
                        fsize = _get_file_size(self.fpath, unit="KB")
                        if fsize < self.check_size:
                            # delete the wrong file
                            os.remove(self.fpath)
                            print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
                        else:
                            print("[bold #E6E6FA]-" * 120)
                            print(f"[bold purple]The PDF file {self.fpath} already exists.")
                            return

                    self.url_index = 0
                    self.try_times = 0
                    while True:
                        # NOTE(review): url_iterate() is defined outside this block; it is
                        # assumed to refresh self.pdf_url for self.url_index -- confirm.
                        self.url_iterate()
                        if not self.pdf_url:
                            # No PDF link for this entry: move on without attempting a download.
                            self.url_index += 1
                            if self.url_index >= len(self.url_list):
                                print("Failed to download the PDF file.")
                                self.write_wrong_record()
                                return
                            self.try_times = 0
                            continue

                        self.try_times += 1
                        downloaded = False
                        # Set on a non-200 answer: retrying the same link is pointless.
                        skip_url = False
                        print(f"Downloading: {self.fname}...")
                        try:
                            # A timeout keeps a stalled server from blocking the whole batch.
                            response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=60)
                            if response.status_code == 200:
                                with open(self.fpath, "wb") as f:
                                    f.write(response.content)
                                fsize = _get_file_size(self.fpath, unit="KB")
                                if fsize < self.check_size:
                                    # delete the wrong file
                                    os.remove(self.fpath)
                                    print(f"[bold yellow]The PDF file {self.fpath} is only {fsize:.2f} KB. It will be deleted and retry.")
                                else:
                                    print(f"[bold green]Sucessful to download {self.fpath}")
                                    downloaded = True
                            else:
                                skip_url = True
                                print(f"Failed to download the PDF file. Status code: {response.status_code}")
                                print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not inlcude the PDF file.")
                        except Exception as e:
                            # Best effort: any network or file error counts as one failed attempt.
                            print(f"Failed to download the PDF file. Error: {e}")

                        time.sleep(self.sleep)
                        if downloaded:
                            # Return before the retry bookkeeping below, so a success on the
                            # last allowed attempt is never recorded as a failure.
                            return

                        if skip_url or self.try_times >= self.try_times_each_url_max:
                            self.url_index += 1
                            if self.url_index >= len(self.url_list):
                                print("\n[bold #CD5C5C]Failed to download the PDF file.")
                                self.write_wrong_record()
                                return
                            if not skip_url:
                                print(f"Tried {self.try_times} times for {self.url_list[self.url_index-1]}.")
                                print("Try another URL...")
                            # Give the next entry its full attempt budget; without this reset
                            # the counter stays at the limit and following entries are skipped.
                            self.try_times = 0
         
     | 
| 
       209 
     | 
    
         
            -
             
     | 
| 
       210 
     | 
    
         
            -
             
     | 
| 
       211 
     | 
    
         
            -
            def _read_excel(file, col_name=r"DOI"):
                """Return the non-missing values of one column of an Excel sheet.

                Parameters:
                    file: path (or any object accepted by ``pd.read_excel``) of the workbook.
                    col_name: str, header of the column to read. Default is 'DOI'.

                Returns:
                    list, the cells of ``col_name`` in sheet order with missing cells dropped.

                Raises:
                    KeyError: if ``col_name`` is not a column of the sheet.
                """
                df = pd.read_excel(file)
                if col_name not in df.columns:
                    raise KeyError(f"Column {col_name!r} not found in {file}; available columns: {list(df.columns)}")
                # dropna() removes every missing-value marker, not only those whose
                # string form happens to be "nan".
                return df[col_name].dropna().tolist()
         
     | 
| 
       217 
     | 
    
         
            -
             
     | 
| 
       218 
     | 
    
         
            -
             
     | 
| 
       219 
     | 
    
         
            -
            def _read_txt(file):
                """Read a text file and return its non-blank lines, stripped.

                The file is decoded as UTF-8 (a leading byte-order mark is discarded so it
                cannot end up inside the first entry). If the content is not valid UTF-8,
                the file is read again with the platform default encoding, which is what
                the plain ``open(file, "r")`` call used before.

                Parameters:
                    file: path of the text file, one entry per line.

                Returns:
                    list of str, surrounding whitespace removed, blank lines skipped.
                """

                def _clean(handle):
                    # Strip each line once and keep only the non-empty results.
                    return [text for text in (line.strip() for line in handle) if text]

                try:
                    with open(file, "r", encoding="utf-8-sig") as f:
                        return _clean(f)
                except UnicodeDecodeError:
                    with open(file, "r") as f:
                        return _clean(f)
         
     | 
| 
       225 
     | 
    
         
            -
             
     | 
| 
       226 
     | 
    
         
            -
             
     | 
| 
       227 
     | 
    
         
            -
            def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, col_name=r"DOI"):
                """
                Description:
                    Download PDF files by DOI.

                    The DOI source is chosen by precedence, not merged: `excel_file`
                    overrides `txt_file`, which overrides `doi_list`. If no source yields
                    any DOI, a notice is printed and nothing else is done (an existing
                    wrong_record.txt is left untouched).

                Parameters:
                    store_path: str, The path to store the PDF files. Created if missing;
                                defaults to the current working directory.
                    doi_list: list or str, The list of DOIs.
                    txt_file: str, The path of the txt file that contains the DOIs.
                    excel_file: str, The path of the excel file that contains the DOIs.
                    col_name: str, The column name of the DOIs in the excel file. Default is 'DOI'.

                Returns:
                    None

                Example:
                    download5doi(doi_list='10.3389/feart.2021.698876')
                    download5doi(store_path='I:\\Delete\\ref_pdf', doi_list='10.3389/feart.2021.698876')
                    download5doi(store_path='I:\\Delete\\ref_pdf', doi_list=['10.3389/feart.2021.698876', '10.3389/feart.2021.698876'])
                    download5doi(store_path='I:\\Delete\\ref_pdf', txt_file='I:\\Delete\\ref_pdf\\wrong_record.txt')
                    download5doi(store_path='I:\\Delete\\ref_pdf', excel_file='I:\\Delete\\ref_pdf\\wrong_record.xlsx')
                    download5doi(store_path='I:\\Delete\\ref_pdf', excel_file='I:\\Delete\\ref_pdf\\wrong_record.xlsx', col_name='DOI')
                """
                store_path = Path(str(store_path)) if store_path else Path.cwd()
                store_path.mkdir(parents=True, exist_ok=True)

                if doi_list:
                    doi_list = ensure_list(doi_list)
                if txt_file:
                    doi_list = _read_txt(txt_file)
                if excel_file:
                    doi_list = _read_excel(excel_file, col_name)

                if not doi_list:
                    # Previously this fell through to len(None) and raised TypeError.
                    print("[bold yellow]No DOI was provided; nothing to download.")
                    return

                # The sources are read before this point on purpose: txt_file may be the
                # wrong_record.txt of an earlier run, which is removed here.
                remove(store_path / "wrong_record.txt")
                print(f"Downloading {len(doi_list)} PDF files...")
                for doi in track(doi_list, description="Downloading..."):
                    _Downloader(doi, str(store_path)).download_pdf()
         
     | 
| 
       268 
     | 
    
         
            -
             
     | 
| 
       269 
     | 
    
         
            -
             
     | 
| 
       270 
     | 
    
         
            -
            # Manual run when the module is executed as a script; both paths are
            # hard-coded absolute Windows locations.
            if __name__ == "__main__":
                store_path = r"F:\AAA-Delete\DOI_Reference\5\pdf"
                excel_file = r"F:\AAA-Delete\DOI_Reference\5\savedrecs.xls"
                download5doi(store_path, excel_file=excel_file)
         
     | 
| 
         @@ -1,53 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            from rich import print
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
             
     | 
| 
       4 
     | 
    
         
            -
            __all__ = ["os_command", "get_queue_node"]
         
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
            # Run a command and return its standard output.
            def os_command(cmd):
                """Run ``cmd`` and return what it wrote to standard output.

                Parameters:
                    cmd: str or sequence of str. A string is handed to the shell, so
                         pipes and redirections work. A list/tuple is executed directly
                         without a shell, so its arguments need no quoting.

                Returns:
                    str with the captured standard output, or None when the command
                    could not be started or exited with a non-zero status.
                """
                import subprocess

                print(f'🔍 执行命令: {cmd}')
                # NOTE: a string command goes through the shell; never build it from
                # untrusted input -- pass an argument list instead.
                run_in_shell = isinstance(cmd, str)
                try:
                    result = subprocess.run(
                        cmd,
                        shell=run_in_shell,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True,
                        # Undecodable bytes in the output must not raise UnicodeDecodeError.
                        errors="replace",
                    )
                except OSError as exc:
                    # Raised when the program cannot be launched (e.g. executable missing).
                    print(f'❌ 错误输出: {exc}')
                    return None
                # Show stderr, if any, to help troubleshooting.
                if result.stderr:
                    print(f'❌ 错误输出: {result.stderr.strip()}')
                # A non-zero exit status means the command failed.
                if result.returncode != 0:
                    print(f'❌ 命令执行失败,退出码: {result.returncode}')
                    return None
                return result.stdout
         
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
            # Return a dict of "queue name: node count".
            def get_queue_node():
                """Return the idle node count per queue as reported by ``sinfo``.

                Each output line is expected to look like
                ``<queue> <col> <col> <count> idle <rest>``; lines of any other shape
                are ignored.

                Returns:
                    dict mapping the first column (queue name, kept exactly as printed)
                    to the node count as int. Empty when the command fails, prints
                    nothing, or no line matches.
                """
                import re

                # Run plain `sinfo` and filter here: with `sinfo | grep "idle"` the shell
                # exits with status 1 whenever no line matches, which os_command reports
                # as a failed command even though sinfo itself succeeded.
                output = os_command("sinfo")
                if not output:  # command failed or printed nothing
                    return {}

                # Capture only the queue name (group 1) and the node count (group 2);
                # the trailing .* swallows the node list. Compiled once, outside the loop.
                idle_row = re.compile(r"^(\S+)\s+\S+\s+\S+\s+(\d+)\s+idle\s+.*$")

                queue_node_dict = {}
                for raw_line in output.strip().split('\n'):
                    match = idle_row.match(raw_line.strip())
                    if match:
                        queue_node_dict[match.group(1)] = int(match.group(2))
                return queue_node_dict
         
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
       52 
     | 
    
         
            -
            # Placeholder: running this module as a script does nothing.
            if __name__ == "__main__":
                pass
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     |