oafuncs 0.0.98.45__py3-none-any.whl → 0.0.98.47__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oafuncs/oa_cmap.py +3 -0
 - oafuncs/oa_down/literature.py +141 -40
 - oafuncs/oa_linux.py +79 -2
 - {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/METADATA +1 -1
 - {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/RECORD +8 -8
 - {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/WHEEL +0 -0
 - {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/licenses/LICENSE.txt +0 -0
 - {oafuncs-0.0.98.45.dist-info → oafuncs-0.0.98.47.dist-info}/top_level.txt +0 -0
 
    
        oafuncs/oa_cmap.py
    CHANGED
    
    | 
         @@ -271,6 +271,9 @@ def get(colormap_name: Optional[str] = None, show_available: bool = False) -> Op 
     | 
|
| 
       271 
271 
     | 
    
         
             
                    "diverging_4": ["#5DADE2", "#A2D9F7", "#D6EAF8", "#F2F3F4", "#FADBD8", "#F1948A", "#E74C3C"],
         
     | 
| 
       272 
272 
     | 
    
         
             
                    # ----------------------------------------------------------------------------
         
     | 
| 
       273 
273 
     | 
    
         
             
                    "colorful_1": ["#6d00db", "#9800cb", "#F2003C", "#ff4500", "#ff7f00", "#FE28A2", "#FFC0CB", "#DDA0DD", "#40E0D0", "#1a66f2", "#00f7fb", "#8fff88", "#E3FF00"],
         
     | 
| 
      
 274 
     | 
    
         
            +
                    # ----------------------------------------------------------------------------
         
     | 
| 
      
 275 
     | 
    
         
            +
                    "increasing_1": ["#FFFFFF", "#E6F7FF", "#A5E6F8", "#049CD4", "#11B5A3", "#04BC4C", "#74CC54", "#D9DD5C", "#FB922E", "#FC2224", "#E51C18", "#8B0000"],
         
     | 
| 
      
 276 
     | 
    
         
            +
                    # ----------------------------------------------------------------------------
         
     | 
| 
       274 
277 
     | 
    
         
             
                }
         
     | 
| 
       275 
278 
     | 
    
         | 
| 
       276 
279 
     | 
    
         
             
                if show_available:
         
     | 
    
        oafuncs/oa_down/literature.py
    CHANGED
    
    | 
         @@ -2,6 +2,7 @@ import os 
     | 
|
| 
       2 
2 
     | 
    
         
             
            import re
         
     | 
| 
       3 
3 
     | 
    
         
             
            import time
         
     | 
| 
       4 
4 
     | 
    
         
             
            from pathlib import Path
         
     | 
| 
      
 5 
     | 
    
         
            +
            from urllib.parse import urljoin
         
     | 
| 
       5 
6 
     | 
    
         | 
| 
       6 
7 
     | 
    
         
             
            import pandas as pd
         
     | 
| 
       7 
8 
     | 
    
         
             
            import requests
         
     | 
| 
         @@ -46,75 +47,142 @@ class _Downloader: 
     | 
|
| 
       46 
47 
     | 
    
         
             
                根据doi下载文献pdf
         
     | 
| 
       47 
48 
     | 
    
         
             
                """
         
     | 
| 
       48 
49 
     | 
    
         | 
| 
       49 
     | 
    
         
            -
                 
     | 
| 
      
 50 
     | 
    
         
            +
                # 进程级缓存:首次探测后的可用镜像列表,后续复用
         
     | 
| 
      
 51 
     | 
    
         
            +
                _alive_mirrors_cache: list[str] | None = None
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
                def __init__(self, doi, store_path, *, min_size_kb=50, timeout_html=15, timeout_pdf=30, sleep_secs=5, tries_each_url=3, debug=False):
         
     | 
| 
       50 
54 
     | 
    
         
             
                    self.url_list = [
         
     | 
| 
       51 
55 
     | 
    
         
             
                        r"https://sci-hub.se",
         
     | 
| 
       52 
56 
     | 
    
         
             
                        r"https://sci-hub.ren",
         
     | 
| 
       53 
57 
     | 
    
         
             
                        r"https://sci-hub.st",
         
     | 
| 
       54 
     | 
    
         
            -
                        r"https://sci-hub.ru", 
     | 
| 
      
 58 
     | 
    
         
            +
                        r"https://sci-hub.ru",  # 最好用的一个网站
         
     | 
| 
       55 
59 
     | 
    
         
             
                        # ------------------------------------- 以下网站没验证
         
     | 
| 
       56 
     | 
    
         
            -
                        r"https://sci-hub.wf",
         
     | 
| 
       57 
     | 
    
         
            -
                        r"https://sci-hub.yt",
         
     | 
| 
       58 
     | 
    
         
            -
                        r"https://sci-hub.ee",
         
     | 
| 
       59 
     | 
    
         
            -
                        r"https://sci-hub.cat",
         
     | 
| 
       60 
60 
     | 
    
         
             
                        r"https://sci-hub.in",
         
     | 
| 
       61 
     | 
    
         
            -
                        r"https:// 
     | 
| 
       62 
     | 
    
         
            -
                        r"https://sci-hub.vkif.top",
         
     | 
| 
       63 
     | 
    
         
            -
                        r"https://www.bothonce.com",
         
     | 
| 
       64 
     | 
    
         
            -
                        r"https://sci-hub.et-fine.com",
         
     | 
| 
       65 
     | 
    
         
            -
                        r"https://sci-hub.hkvisa.net",
         
     | 
| 
       66 
     | 
    
         
            -
                        # r"https://sci-hub.3800808.com", # 这个只能手动保存
         
     | 
| 
       67 
     | 
    
         
            -
                        r"https://sci-hub.zidianzhan.net",
         
     | 
| 
       68 
     | 
    
         
            -
                        r"https://sci-hub.usualwant.com",
         
     | 
| 
      
 61 
     | 
    
         
            +
                        r"https://sci-hub.hlgczx.com/",
         
     | 
| 
       69 
62 
     | 
    
         
             
                    ]
         
     | 
| 
       70 
63 
     | 
    
         
             
                    self.base_url = None
         
     | 
| 
       71 
64 
     | 
    
         
             
                    self.url = None
         
     | 
| 
       72 
65 
     | 
    
         
             
                    self.doi = doi
         
     | 
| 
       73 
66 
     | 
    
         
             
                    self.pdf_url = None
         
     | 
| 
       74 
67 
     | 
    
         
             
                    self.pdf_path = None
         
     | 
| 
       75 
     | 
    
         
            -
                     
     | 
| 
      
 68 
     | 
    
         
            +
                    # requests 期望 header 值为 str,这里确保 UA 是字符串而不是 bytes
         
     | 
| 
      
 69 
     | 
    
         
            +
                    self.headers = {"User-Agent": str(get_ua())}
         
     | 
| 
       76 
70 
     | 
    
         
             
                    # 10.1175/1520-0493(1997)125<0742:IODAOO>2.0.CO;2.pdf
         
     | 
| 
       77 
71 
     | 
    
         
             
                    # self.fname = doi.replace(r'/', '_') + '.pdf'
         
     | 
| 
       78 
72 
     | 
    
         
             
                    self.fname = re.sub(r'[/<>:"?*|]', "_", doi) + ".pdf"
         
     | 
| 
       79 
73 
     | 
    
         
             
                    self.store_path = Path(store_path)
         
     | 
| 
       80 
74 
     | 
    
         
             
                    self.fpath = self.store_path / self.fname
         
     | 
| 
       81 
75 
     | 
    
         
             
                    self.wrong_record_file = self.store_path / "wrong_record.txt"
         
     | 
| 
       82 
     | 
    
         
            -
                    self.sleep =  
     | 
| 
      
 76 
     | 
    
         
            +
                    self.sleep = sleep_secs
         
     | 
| 
       83 
77 
     | 
    
         
             
                    self.cookies = None
         
     | 
| 
       84 
     | 
    
         
            -
                    self.check_size =  
     | 
| 
      
 78 
     | 
    
         
            +
                    self.check_size = max(1, int(min_size_kb))
         
     | 
| 
       85 
79 
     | 
    
         
             
                    self.url_index = 0
         
     | 
| 
       86 
     | 
    
         
            -
                    self.try_times_each_url_max =  
     | 
| 
      
 80 
     | 
    
         
            +
                    self.try_times_each_url_max = max(1, int(tries_each_url))
         
     | 
| 
       87 
81 
     | 
    
         
             
                    self.try_times = 0
         
     | 
| 
      
 82 
     | 
    
         
            +
                    self.timeout_html = max(5, int(timeout_html))
         
     | 
| 
      
 83 
     | 
    
         
            +
                    self.timeout_pdf = max(5, int(timeout_pdf))
         
     | 
| 
      
 84 
     | 
    
         
            +
                    self.debug = bool(debug)
         
     | 
| 
      
 85 
     | 
    
         
            +
             
     | 
| 
      
 86 
     | 
    
         
            +
                # ---------------- 镜像可用性探测 ----------------
         
     | 
| 
      
 87 
     | 
    
         
            +
                def _is_mirror_alive(self, base_url: str) -> bool:
         
     | 
| 
      
 88 
     | 
    
         
            +
                    """
         
     | 
| 
      
 89 
     | 
    
         
            +
                    仅检测镜像根路径是否可连通(HTTP 200 即认为可用)。
         
     | 
| 
      
 90 
     | 
    
         
            +
                    不访问具体 DOI,避免被动触发风控;只做连通性筛查。
         
     | 
| 
      
 91 
     | 
    
         
            +
                    """
         
     | 
| 
      
 92 
     | 
    
         
            +
                    try:
         
     | 
| 
      
 93 
     | 
    
         
            +
                        r = requests.get(base_url + "/", headers=self.headers, timeout=8, allow_redirects=True)
         
     | 
| 
      
 94 
     | 
    
         
            +
                        return 200 <= r.status_code < 400
         
     | 
| 
      
 95 
     | 
    
         
            +
                    except Exception:
         
     | 
| 
      
 96 
     | 
    
         
            +
                        return False
         
     | 
| 
      
 97 
     | 
    
         
            +
             
     | 
| 
      
 98 
     | 
    
         
            +
                def _ensure_alive_mirrors(self):
         
     | 
| 
      
 99 
     | 
    
         
            +
                    # 若已经有进程级缓存,直接复用
         
     | 
| 
      
 100 
     | 
    
         
            +
                    if _Downloader._alive_mirrors_cache is not None:
         
     | 
| 
      
 101 
     | 
    
         
            +
                        self.url_list = list(_Downloader._alive_mirrors_cache)
         
     | 
| 
      
 102 
     | 
    
         
            +
                        return
         
     | 
| 
      
 103 
     | 
    
         
            +
             
     | 
| 
      
 104 
     | 
    
         
            +
                    print(f"[bold cyan]Probing mirrors connectivity (first run)...")
         
     | 
| 
      
 105 
     | 
    
         
            +
                    alive = []
         
     | 
| 
      
 106 
     | 
    
         
            +
                    for base in self.url_list:
         
     | 
| 
      
 107 
     | 
    
         
            +
                        ok = self._is_mirror_alive(base)
         
     | 
| 
      
 108 
     | 
    
         
            +
                        status = "OK" if ok else "DOWN"
         
     | 
| 
      
 109 
     | 
    
         
            +
                        print(f"  [{status}] {base}")
         
     | 
| 
      
 110 
     | 
    
         
            +
                        if ok:
         
     | 
| 
      
 111 
     | 
    
         
            +
                            alive.append(base)
         
     | 
| 
      
 112 
     | 
    
         
            +
                    if alive:
         
     | 
| 
      
 113 
     | 
    
         
            +
                        _Downloader._alive_mirrors_cache = alive
         
     | 
| 
      
 114 
     | 
    
         
            +
                        self.url_list = alive
         
     | 
| 
      
 115 
     | 
    
         
            +
                        print(f"[bold cyan]Alive mirrors: {len(alive)}; pruned {len(set(self.url_list)) - len(alive) if self.url_list else 0}.")
         
     | 
| 
      
 116 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 117 
     | 
    
         
            +
                        print("[bold yellow]No mirror passed probe; keep original list for fallback attempts.")
         
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
      
 119 
     | 
    
         
            +
                def _extract_pdf_url_from_html(self, html: str) -> str | None:
         
     | 
| 
      
 120 
     | 
    
         
            +
                    """
         
     | 
| 
      
 121 
     | 
    
         
            +
                    从 Sci-Hub 页面 HTML 中尽可能稳健地提取 PDF 链接。
         
     | 
| 
      
 122 
     | 
    
         
            +
             
     | 
| 
      
 123 
     | 
    
         
            +
                    兼容多种模式:
         
     | 
| 
      
 124 
     | 
    
         
            +
                    - onclick="location.href='...pdf?download=true'"
         
     | 
| 
      
 125 
     | 
    
         
            +
                    - <iframe id="pdf" src="...pdf?...">
         
     | 
| 
      
 126 
     | 
    
         
            +
                    - <a ... href="...pdf?...">
         
     | 
| 
      
 127 
     | 
    
         
            +
                    - 其他出现 .pdf 的 src/href 场景
         
     | 
| 
      
 128 
     | 
    
         
            +
             
     | 
| 
      
 129 
     | 
    
         
            +
                    返回绝对 URL;若找不到返回 None。
         
     | 
| 
      
 130 
     | 
    
         
            +
                    """
         
     | 
| 
      
 131 
     | 
    
         
            +
                    text = html
         
     | 
| 
      
 132 
     | 
    
         
            +
             
     | 
| 
      
 133 
     | 
    
         
            +
                    # 先尝试常见 onclick 跳转
         
     | 
| 
      
 134 
     | 
    
         
            +
                    patterns = [
         
     | 
| 
      
 135 
     | 
    
         
            +
                        # onclick="location.href='...pdf?...'" 或 document.location
         
     | 
| 
      
 136 
     | 
    
         
            +
                        r"onclick\s*=\s*[\"']\s*(?:document\.)?location\.href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
         
     | 
| 
      
 137 
     | 
    
         
            +
                        # iframe id="pdf" src="...pdf?..."
         
     | 
| 
      
 138 
     | 
    
         
            +
                        r"<iframe[^>]+id\s*=\s*[\"']pdf[\"'][^>]+src\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
         
     | 
| 
      
 139 
     | 
    
         
            +
                        # 通用 a 标签 href
         
     | 
| 
      
 140 
     | 
    
         
            +
                        r"<a[^>]+href\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
         
     | 
| 
      
 141 
     | 
    
         
            +
                        # 通用任意 src/href
         
     | 
| 
      
 142 
     | 
    
         
            +
                        r"(?:src|href)\s*=\s*[\"']([^\"']+?\.pdf(?:[?#][^\"']*)?)[\"']",
         
     | 
| 
      
 143 
     | 
    
         
            +
                    ]
         
     | 
| 
      
 144 
     | 
    
         
            +
             
     | 
| 
      
 145 
     | 
    
         
            +
                    for pat in patterns:
         
     | 
| 
      
 146 
     | 
    
         
            +
                        m = re.search(pat, text, flags=re.IGNORECASE | re.DOTALL)
         
     | 
| 
      
 147 
     | 
    
         
            +
                        if m:
         
     | 
| 
      
 148 
     | 
    
         
            +
                            got_url = m.group(1)
         
     | 
| 
      
 149 
     | 
    
         
            +
                            # 规范化为绝对 URL
         
     | 
| 
      
 150 
     | 
    
         
            +
                            if got_url.startswith("//"):
         
     | 
| 
      
 151 
     | 
    
         
            +
                                return "https:" + got_url
         
     | 
| 
      
 152 
     | 
    
         
            +
                            if got_url.startswith("http://") or got_url.startswith("https://"):
         
     | 
| 
      
 153 
     | 
    
         
            +
                                return got_url
         
     | 
| 
      
 154 
     | 
    
         
            +
                            # 其余按相对路径处理
         
     | 
| 
      
 155 
     | 
    
         
            +
                            return urljoin(self.base_url + "/", got_url.lstrip("/"))
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
      
 157 
     | 
    
         
            +
                    return None
         
     | 
| 
       88 
158 
     | 
    
         | 
| 
       89 
159 
     | 
    
         
             
                def get_pdf_url(self):
         
     | 
| 
       90 
160 
     | 
    
         
             
                    print("[bold #E6E6FA]-" * 120)
         
     | 
| 
       91 
161 
     | 
    
         
             
                    print(f"DOI: {self.doi}")
         
     | 
| 
       92 
162 
     | 
    
         
             
                    print(f"Requesting: {self.url}...")
         
     | 
| 
       93 
163 
     | 
    
         
             
                    try:
         
     | 
| 
       94 
     | 
    
         
            -
                         
     | 
| 
      
 164 
     | 
    
         
            +
                        # 使用较小的超时时间避免长时间阻塞
         
     | 
| 
      
 165 
     | 
    
         
            +
                        response = requests.get(self.url, headers=self.headers, timeout=self.timeout_html)
         
     | 
| 
       95 
166 
     | 
    
         
             
                        if response.status_code == 200:
         
     | 
| 
       96 
167 
     | 
    
         
             
                            self.cookies = response.cookies
         
     | 
| 
       97 
     | 
    
         
            -
                            text = response.text 
     | 
| 
       98 
     | 
    
         
            -
                            #  
     | 
| 
       99 
     | 
    
         
            -
                             
     | 
| 
       100 
     | 
    
         
            -
             
     | 
| 
       101 
     | 
    
         
            -
                             
     | 
| 
       102 
     | 
    
         
            -
                            if  
     | 
| 
       103 
     | 
    
         
            -
                                 
     | 
| 
       104 
     | 
    
         
            -
             
     | 
| 
       105 
     | 
    
         
            -
                                    if got_url[:2] == "//":
         
     | 
| 
       106 
     | 
    
         
            -
                                        self.pdf_url = "https:" + got_url
         
     | 
| 
       107 
     | 
    
         
            -
                                    else:
         
     | 
| 
       108 
     | 
    
         
            -
                                        self.pdf_url = self.base_url + got_url
         
     | 
| 
      
 168 
     | 
    
         
            +
                            text = response.text
         
     | 
| 
      
 169 
     | 
    
         
            +
                            # 去除转义反斜杠,提升正则匹配成功率
         
     | 
| 
      
 170 
     | 
    
         
            +
                            text = text.replace("\\", "")
         
     | 
| 
      
 171 
     | 
    
         
            +
             
     | 
| 
      
 172 
     | 
    
         
            +
                            self.pdf_url = self._extract_pdf_url_from_html(text)
         
     | 
| 
      
 173 
     | 
    
         
            +
                            if self.pdf_url:
         
     | 
| 
      
 174 
     | 
    
         
            +
                                if self.debug:
         
     | 
| 
      
 175 
     | 
    
         
            +
                                    print(f"Found PDF link: {self.pdf_url}")
         
     | 
| 
       109 
176 
     | 
    
         
             
                                else:
         
     | 
| 
       110 
     | 
    
         
            -
                                    self.pdf_url 
     | 
| 
       111 
     | 
    
         
            -
                                print(f"URL: {self.pdf_url}")
         
     | 
| 
      
 177 
     | 
    
         
            +
                                    print(f"Found PDF link (masked): .../{Path(self.pdf_url).name}")
         
     | 
| 
       112 
178 
     | 
    
         
             
                            else:
         
     | 
| 
       113 
     | 
    
         
            -
                                print( 
     | 
| 
      
 179 
     | 
    
         
            +
                                print(
         
     | 
| 
      
 180 
     | 
    
         
            +
                                    f"[bold #AFEEEE]The website {self.url_list[self.url_index]} does not expose a detectable PDF link (pattern mismatch)."
         
     | 
| 
      
 181 
     | 
    
         
            +
                                )
         
     | 
| 
       114 
182 
     | 
    
         
             
                                self.try_times = self.try_times_each_url_max + 1
         
     | 
| 
       115 
183 
     | 
    
         
             
                        else:
         
     | 
| 
       116 
184 
     | 
    
         
             
                            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
         
     | 
| 
       117 
     | 
    
         
            -
                            print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not  
     | 
| 
      
 185 
     | 
    
         
            +
                            print(f"[bold #AFEEEE]The website {self.url_list[self.url_index]} do not include the PDF file (HTTP error).")
         
     | 
| 
       118 
186 
     | 
    
         
             
                            self.try_times = self.try_times_each_url_max + 1
         
     | 
| 
       119 
187 
     | 
    
         
             
                    except Exception as e:
         
     | 
| 
       120 
188 
     | 
    
         
             
                        print(f"Failed to retrieve the webpage. Error: {e}")
         
     | 
| 
         @@ -178,7 +246,7 @@ class _Downloader: 
     | 
|
| 
       178 
246 
     | 
    
         
             
                                return
         
     | 
| 
       179 
247 
     | 
    
         
             
                        print(f"Downloading: {self.fname}...")
         
     | 
| 
       180 
248 
     | 
    
         
             
                        try:
         
     | 
| 
       181 
     | 
    
         
            -
                            response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies)
         
     | 
| 
      
 249 
     | 
    
         
            +
                            response = requests.get(self.pdf_url, headers=self.headers, cookies=self.cookies, timeout=self.timeout_pdf)
         
     | 
| 
       182 
250 
     | 
    
         
             
                            if response.status_code == 200:
         
     | 
| 
       183 
251 
     | 
    
         
             
                                with open(self.fpath, "wb") as f:
         
     | 
| 
       184 
252 
     | 
    
         
             
                                    f.write(response.content)
         
     | 
| 
         @@ -224,7 +292,22 @@ def _read_txt(file): 
     | 
|
| 
       224 
292 
     | 
    
         
             
                return lines
         
     | 
| 
       225 
293 
     | 
    
         | 
| 
       226 
294 
     | 
    
         | 
| 
       227 
     | 
    
         
            -
            def download5doi( 
     | 
| 
      
 295 
     | 
    
         
            +
            def download5doi(
         
     | 
| 
      
 296 
     | 
    
         
            +
                store_path=None,
         
     | 
| 
      
 297 
     | 
    
         
            +
                doi_list=None,
         
     | 
| 
      
 298 
     | 
    
         
            +
                txt_file=None,
         
     | 
| 
      
 299 
     | 
    
         
            +
                excel_file=None,
         
     | 
| 
      
 300 
     | 
    
         
            +
                col_name=r"DOI",
         
     | 
| 
      
 301 
     | 
    
         
            +
                *,
         
     | 
| 
      
 302 
     | 
    
         
            +
                probe_mirrors: bool = True,
         
     | 
| 
      
 303 
     | 
    
         
            +
                min_size_kb: int = 50,
         
     | 
| 
      
 304 
     | 
    
         
            +
                timeout_html: int = 15,
         
     | 
| 
      
 305 
     | 
    
         
            +
                timeout_pdf: int = 30,
         
     | 
| 
      
 306 
     | 
    
         
            +
                tries_each_url: int = 3,
         
     | 
| 
      
 307 
     | 
    
         
            +
                sleep_secs: int = 5,
         
     | 
| 
      
 308 
     | 
    
         
            +
                force: bool = False,
         
     | 
| 
      
 309 
     | 
    
         
            +
                debug: bool = False,
         
     | 
| 
      
 310 
     | 
    
         
            +
            ):
         
     | 
| 
       228 
311 
     | 
    
         
             
                """
         
     | 
| 
       229 
312 
     | 
    
         
             
                Description:
         
     | 
| 
       230 
313 
     | 
    
         
             
                    Download PDF files by DOI.
         
     | 
| 
         @@ -260,11 +343,29 @@ def download5doi(store_path=None, doi_list=None, txt_file=None, excel_file=None, 
     | 
|
| 
       260 
343 
     | 
    
         
             
                    doi_list = _read_txt(txt_file)
         
     | 
| 
       261 
344 
     | 
    
         
             
                if excel_file:
         
     | 
| 
       262 
345 
     | 
    
         
             
                    doi_list = _read_excel(excel_file, col_name)
         
     | 
| 
       263 
     | 
    
         
            -
                 
     | 
| 
      
 346 
     | 
    
         
            +
                # 去重并清洗
         
     | 
| 
      
 347 
     | 
    
         
            +
                doi_list = [str(x).strip() for x in doi_list if str(x).strip()]
         
     | 
| 
      
 348 
     | 
    
         
            +
                doi_list = list(dict.fromkeys(doi_list))  # 保序去重
         
     | 
| 
      
 349 
     | 
    
         
            +
             
     | 
| 
      
 350 
     | 
    
         
            +
                # 只有在不是追加下载的场景下再清除 wrong_record
         
     | 
| 
      
 351 
     | 
    
         
            +
                if not force:
         
     | 
| 
      
 352 
     | 
    
         
            +
                    remove(Path(store_path) / "wrong_record.txt")
         
     | 
| 
       264 
353 
     | 
    
         
             
                print(f"Downloading {len(doi_list)} PDF files...")
         
     | 
| 
       265 
354 
     | 
    
         
             
                for doi in track(doi_list, description="Downloading..."):
         
     | 
| 
       266 
     | 
    
         
            -
                     
     | 
| 
       267 
     | 
    
         
            -
             
     | 
| 
      
 355 
     | 
    
         
            +
                    dl = _Downloader(
         
     | 
| 
      
 356 
     | 
    
         
            +
                        doi,
         
     | 
| 
      
 357 
     | 
    
         
            +
                        store_path,
         
     | 
| 
      
 358 
     | 
    
         
            +
                        min_size_kb=min_size_kb,
         
     | 
| 
      
 359 
     | 
    
         
            +
                        timeout_html=timeout_html,
         
     | 
| 
      
 360 
     | 
    
         
            +
                        timeout_pdf=timeout_pdf,
         
     | 
| 
      
 361 
     | 
    
         
            +
                        sleep_secs=sleep_secs,
         
     | 
| 
      
 362 
     | 
    
         
            +
                        tries_each_url=tries_each_url,
         
     | 
| 
      
 363 
     | 
    
         
            +
                        debug=debug,
         
     | 
| 
      
 364 
     | 
    
         
            +
                    )
         
     | 
| 
      
 365 
     | 
    
         
            +
                    # 是否进行镜像探测
         
     | 
| 
      
 366 
     | 
    
         
            +
                    if probe_mirrors:
         
     | 
| 
      
 367 
     | 
    
         
            +
                        dl._ensure_alive_mirrors()
         
     | 
| 
      
 368 
     | 
    
         
            +
                    dl.download_pdf()
         
     | 
| 
       268 
369 
     | 
    
         | 
| 
       269 
370 
     | 
    
         | 
| 
       270 
371 
     | 
    
         
             
            if __name__ == "__main__":
         
     | 
    
        oafuncs/oa_linux.py
    CHANGED
    
    | 
         @@ -1,7 +1,8 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            from rich import print
         
     | 
| 
      
 2 
     | 
    
         
            +
            import time
         
     | 
| 
      
 3 
     | 
    
         
            +
            import os
         
     | 
| 
       2 
4 
     | 
    
         | 
| 
       3 
     | 
    
         
            -
             
     | 
| 
       4 
     | 
    
         
            -
            __all__ = ["os_command", "get_queue_node"]
         
     | 
| 
      
 5 
     | 
    
         
            +
            __all__ = ["os_command", "get_queue_node", "query_queue", "running_jobs", "submit_job"]
         
     | 
| 
       5 
6 
     | 
    
         | 
| 
       6 
7 
     | 
    
         | 
| 
       7 
8 
     | 
    
         
             
            # 负责执行命令并返回输出
         
     | 
| 
         @@ -49,5 +50,81 @@ def get_queue_node(): 
     | 
|
| 
       49 
50 
     | 
    
         | 
| 
       50 
51 
     | 
    
         
             
                return queue_node_dict
         
     | 
| 
       51 
52 
     | 
    
         | 
| 
      
 53 
     | 
    
         
            +
            def query_queue(need_node=1, queue_list=['dcu', 'bigmem', 'cpu_parallel', 'cpu_single']):
                """Pick the first queue that can satisfy a node request.

                Queues are tried in the order given by ``queue_list``; a queue
                qualifies when it is a key of the mapping returned by
                ``get_queue_node()`` and its value is at least ``need_node``.

                Args:
                    need_node: Minimum value a queue's entry must reach
                        (presumably a count of free nodes -- confirm against
                        ``get_queue_node``).
                    queue_list: Queue names in order of preference. Only read,
                        never modified.

                Returns:
                    The name of the first qualifying queue, or ``None`` when no
                    queue qualifies.
                """
                available = get_queue_node()
                qualifying = (
                    name
                    for name in queue_list
                    if name in available and available[name] >= need_node
                )
                # next() with a default mirrors "first match, otherwise None".
                return next(qualifying, None)
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
            def running_jobs():
                """Return the job IDs currently listed by ``squeue``.

                Runs ``squeue`` through the shell, discards the first output line
                (presumably the column header row) and collects the first
                whitespace-separated field of every remaining non-blank line.

                Returns:
                    list[str]: One entry per listed job; empty when ``squeue``
                    prints nothing beyond its first line (or nothing at all).
                """
                # Use the pipe as a context manager so the underlying process and
                # file descriptor are released immediately, not whenever the
                # garbage collector gets around to it.
                with os.popen('squeue') as pipe:
                    status = pipe.read()

                ids = []
                for line in status.split('\n')[1:]:
                    fields = line.split()
                    # Skip empty *and* whitespace-only lines; indexing [0] on an
                    # empty field list would raise IndexError.
                    if fields:
                        ids.append(fields[0])
                return ids
         
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
      
 71 
     | 
    
         
            +
            def submit_job(working_dir=None, script_tmp='run.slurm', script_run='run.slurm', need_node=1, queue_tmp='<queue_name>', queue_list=['dcu', 'bigmem', 'cpu_parallel', 'cpu_single'], max_job=38, wait=False):
                """Submit a batch script with ``sbatch`` once resources allow, and return the job ID.

                The function changes the process working directory to
                ``working_dir``, then polls until (a) ``running_jobs()`` lists fewer
                than ``max_job`` jobs and (b) ``query_queue()`` reports a usable
                queue. It then calls ``replace_content`` with the template script,
                a ``{queue_tmp: queue}`` mapping, ``working_dir`` and ``script_run``
                (presumably rendering the template into the runnable script --
                confirm against ``oa_file.replace_content``) and runs
                ``sbatch script_run``. Failed submissions are retried every 30 s.

                Args:
                    working_dir: Directory to ``chdir`` into; defaults to the
                        current working directory.
                    script_tmp: Template script passed to ``replace_content``.
                    script_run: Script name passed to ``replace_content`` and then
                        to ``sbatch``.
                    need_node: Node requirement forwarded to ``query_queue``. When
                        greater than 1, ``'cpu_single'`` is excluded from the
                        candidate queues.
                    queue_tmp: Placeholder text to be replaced by the queue name.
                    queue_list: Candidate queue names in order of preference. The
                        list is copied, never modified.
                    max_job: Submission is deferred while at least this many jobs
                        are listed by ``running_jobs()``.
                    wait: If true, block (polling every 60 s) until the job no
                        longer appears in ``running_jobs()``.

                Returns:
                    str: The last whitespace-separated token of the ``sbatch``
                    output, taken to be the job ID.
                """
                from .oa_file import replace_content
                import datetime

                def _timestamp():
                    # Shared formatting for the two "Time: ..." progress messages.
                    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                if working_dir is None:
                    working_dir = os.getcwd()
                os.chdir(working_dir)
                print(f'切换工作目录到: {working_dir}')

                # Work on a private copy: removing from ``queue_list`` itself would
                # permanently alter the shared default list (and any list supplied
                # by the caller), silently dropping 'cpu_single' for later calls.
                candidates = list(queue_list)
                if need_node > 1 and 'cpu_single' in candidates:
                    candidates.remove('cpu_single')

                while True:
                    running_job = running_jobs()
                    if running_job and len(running_job) >= max_job:
                        print(f'当前系统任务数:{len(running_job)},等待60秒后重试!')
                        time.sleep(60)
                        continue

                    queue = query_queue(need_node=need_node, queue_list=candidates)
                    if not queue:
                        print('没有足够的计算资源,等待30秒后重试!')
                        time.sleep(30)
                        continue

                    replace_content(script_tmp, {f'{queue_tmp}': f"{queue}"}, False, f'{working_dir}', script_run)
                    print(f'找到计算资源,提交任务,队列:{queue}')
                    print(f'Time: {_timestamp()}')
                    content_sub = os_command(f"sbatch {script_run}")

                    # os_command returns None on failure; whitespace-only output is
                    # treated the same way because no job ID can be parsed from it.
                    if not content_sub or not content_sub.strip():
                        print('提交任务命令没有返回输出或返回了错误,等待30秒后重试!')
                        time.sleep(30)
                        continue

                    content_sub_lower = content_sub.lower()
                    if 'error' in content_sub_lower or 'failed' in content_sub_lower:
                        print('提交任务时出现错误(从输出检测到 error/failed),等待30秒后重试!')
                        print(f'命令输出: {content_sub.strip()}')
                        time.sleep(30)
                        continue

                    print(f'提交任务成功,{content_sub.strip()}')
                    job_id = content_sub.strip().split()[-1]
                    break

                print('等待10秒后,继续检查任务状态!')
                time.sleep(10)

                if wait:
                    # The job is considered finished once squeue stops listing it.
                    while job_id in running_jobs():
                        print(f'Time: {_timestamp()}')
                        print(f'任务{job_id}正在任务队列中...')
                        time.sleep(60)
                    print(f'任务{job_id}已完成!')
                else:
                    print(f'任务{job_id}已提交,不等待其完成,继续执行后续操作!')

                return job_id
         
     | 
| 
      
 128 
     | 
    
         
            +
             
     | 
| 
       52 
129 
     | 
    
         
             
            if __name__ == "__main__":
         
     | 
| 
       53 
130 
     | 
    
         
             
                pass
         
     | 
| 
         @@ -1,12 +1,12 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            oafuncs/__init__.py,sha256=G523BFVPxmODwq8j_88NYEiKbCzdQ3jfy51cmLeh7kM,1630
         
     | 
| 
       2 
     | 
    
         
            -
            oafuncs/oa_cmap.py,sha256= 
     | 
| 
      
 2 
     | 
    
         
            +
            oafuncs/oa_cmap.py,sha256=Mru5XvvBTfYNq8xjsBAGWppI7RGKzSh94glxP2SXomc,14221
         
     | 
| 
       3 
3 
     | 
    
         
             
            oafuncs/oa_data.py,sha256=CG2YHY_R6MFrPw3UznT4T8BE8yXdgBMnmdUAEdh9GAo,6506
         
     | 
| 
       4 
4 
     | 
    
         
             
            oafuncs/oa_date.py,sha256=aU2wVIWXyWoRiSQ9dg8sHvShFTxw86RrgbV3Q6tDjD4,6841
         
     | 
| 
       5 
5 
     | 
    
         
             
            oafuncs/oa_draw.py,sha256=zal0Y3RPpN0TCGN4Gw9qLtjQdT6V0ZqpSUBFVOPL0x4,13952
         
     | 
| 
       6 
6 
     | 
    
         
             
            oafuncs/oa_file.py,sha256=j9NOjxPOeAJsD5Zk4ODmFdVSSgr1CHVPvM1IHXy9RQA,17546
         
     | 
| 
       7 
7 
     | 
    
         
             
            oafuncs/oa_geo.py,sha256=UbzvUqgT2QP_9B7XSJRL1HDmGu0HnLC5nSP6ZrA5WH4,7177
         
     | 
| 
       8 
8 
     | 
    
         
             
            oafuncs/oa_help.py,sha256=0J5VaZX-cB0c090KxgmktQJBc0o00FsY-4wB8l5y00k,4178
         
     | 
| 
       9 
     | 
    
         
            -
            oafuncs/oa_linux.py,sha256= 
     | 
| 
      
 9 
     | 
    
         
            +
            oafuncs/oa_linux.py,sha256=reQYcjMff6mHuM5RXzoxM-4lgIYUcOoHp0u2fLY0cnU,5431
         
     | 
| 
       10 
10 
     | 
    
         
             
            oafuncs/oa_nc.py,sha256=j501NlTuvrDIwNLXbMfE7nPPXdbbL7u9PGDj2l5AtnI,16277
         
     | 
| 
       11 
11 
     | 
    
         
             
            oafuncs/oa_python.py,sha256=xYMQnM0cGq9xUCtcoMpnN0LG5Rc_s94tai5nC6CNJ3E,4831
         
     | 
| 
       12 
12 
     | 
    
         
             
            oafuncs/oa_tool.py,sha256=VHx15VqpbzNlVXh0-3nJqcDgLVaECMD1FvxJ_CrV39E,8046
         
     | 
| 
         @@ -26,7 +26,7 @@ oafuncs/oa_down/User_Agent-list.txt,sha256=pHaMlElMvZ8TG4vf4BqkZYKqe0JIGkr4kCN0l 
     | 
|
| 
       26 
26 
     | 
    
         
             
            oafuncs/oa_down/__init__.py,sha256=IT6oTqaxuV_mC6AwBut0HtkmnVtEu1MyX0x0oS7TKoA,218
         
     | 
| 
       27 
27 
     | 
    
         
             
            oafuncs/oa_down/hycom_3hourly.py,sha256=dFXSC_5o-Dic616KrLXir4tEHvCiZt8vGKPEYpXFMmA,57356
         
     | 
| 
       28 
28 
     | 
    
         
             
            oafuncs/oa_down/idm.py,sha256=vAhRjt_Sb-rKhzFShmSf29QcFTqsHpHXCvTSD1uSXyQ,1455
         
     | 
| 
       29 
     | 
    
         
            -
            oafuncs/oa_down/literature.py,sha256= 
     | 
| 
      
 29 
     | 
    
         
            +
            oafuncs/oa_down/literature.py,sha256=bqRSwYjsPO28WIohGIwPcj0xo9vAz5yhc8Ojx8ybh08,14924
         
     | 
| 
       30 
30 
     | 
    
         
             
            oafuncs/oa_down/read_proxy.py,sha256=HQpr-Mwn0Z8ICAuf63NUUY9p05E_uNWyWmOK46-73Ec,2866
         
     | 
| 
       31 
31 
     | 
    
         
             
            oafuncs/oa_down/test_ua.py,sha256=l8MCD6yU2W75zRPTDKUZTJhCWNF9lfk-MiSFqAqKH1M,1398
         
     | 
| 
       32 
32 
     | 
    
         
             
            oafuncs/oa_down/user_agent.py,sha256=LCVQUA60ukUqeJXgLktDHB2sh-ngk7AiX4sKj8w-X4A,416
         
     | 
| 
         @@ -39,8 +39,8 @@ oafuncs/oa_sign/__init__.py,sha256=JSx1fcWpmNhQBvX_Bmq3xysfSkkFMrjbJASxV_V6aqE,1 
     | 
|
| 
       39 
39 
     | 
    
         
             
            oafuncs/oa_sign/meteorological.py,sha256=3MSjy7HTcvz2zsITkjUMr_0Y027Gas1LFE9pk99990k,6110
         
     | 
| 
       40 
40 
     | 
    
         
             
            oafuncs/oa_sign/ocean.py,sha256=3uYEzaq-27yVy23IQoqy-clhWu1I_fhPFBAQyT-OF4M,5562
         
     | 
| 
       41 
41 
     | 
    
         
             
            oafuncs/oa_sign/scientific.py,sha256=moIl2MEY4uitbXoD596JmXookXGQtQsS-8_1NBBTx84,4689
         
     | 
| 
       42 
     | 
    
         
            -
            oafuncs-0.0.98. 
     | 
| 
       43 
     | 
    
         
            -
            oafuncs-0.0.98. 
     | 
| 
       44 
     | 
    
         
            -
            oafuncs-0.0.98. 
     | 
| 
       45 
     | 
    
         
            -
            oafuncs-0.0.98. 
     | 
| 
       46 
     | 
    
         
            -
            oafuncs-0.0.98. 
     | 
| 
      
 42 
     | 
    
         
            +
            oafuncs-0.0.98.47.dist-info/licenses/LICENSE.txt,sha256=rMtLpVg8sKiSlwClfR9w_Dd_5WubTQgoOzE2PDFxzs4,1074
         
     | 
| 
      
 43 
     | 
    
         
            +
            oafuncs-0.0.98.47.dist-info/METADATA,sha256=iJT9zUhIKuQGVu3zNAoQvKq9DyB6oMm-mlptyweqRR0,4446
         
     | 
| 
      
 44 
     | 
    
         
            +
            oafuncs-0.0.98.47.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
         
     | 
| 
      
 45 
     | 
    
         
            +
            oafuncs-0.0.98.47.dist-info/top_level.txt,sha256=bgC35QkXbN4EmPHEveg_xGIZ5i9NNPYWqtJqaKqTPsQ,8
         
     | 
| 
      
 46 
     | 
    
         
            +
            oafuncs-0.0.98.47.dist-info/RECORD,,
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     |