PyPI - py2ls - Versions diffs - 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl - Mend

py2ls-0.1.10.12-py3-none-any.whl → py2ls-0.2.7.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of py2ls might be problematic. Click here for more details.

Files changed (72)

py2ls/.DS_Store +0 -0
py2ls/.git/.DS_Store +0 -0
py2ls/.git/index +0 -0
py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
py2ls/.git/objects/.DS_Store +0 -0
py2ls/.git/refs/.DS_Store +0 -0
py2ls/ImageLoader.py +621 -0
py2ls/__init__.py +7 -5
py2ls/apptainer2ls.py +3940 -0
py2ls/batman.py +164 -42
py2ls/bio.py +2595 -0
py2ls/cell_image_clf.py +1632 -0
py2ls/container2ls.py +4635 -0
py2ls/corr.py +475 -0
py2ls/data/.DS_Store +0 -0
py2ls/data/email/email_html_template.html +88 -0
py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
py2ls/data/mygenes_fields_241022.txt +355 -0
py2ls/data/re_common_pattern.json +173 -0
py2ls/data/sns_info.json +74 -0
py2ls/data/styles/.DS_Store +0 -0
py2ls/data/styles/example/.DS_Store +0 -0
py2ls/data/styles/stylelib/.DS_Store +0 -0
py2ls/data/styles/stylelib/grid.mplstyle +15 -0
py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
py2ls/data/styles/stylelib/light.mplstyl +6 -0
py2ls/data/styles/stylelib/muted.mplstyle +6 -0
py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
py2ls/data/styles/stylelib/nature.mplstyle +31 -0
py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
py2ls/data/styles/stylelib/paper.mplstyle +290 -0
py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
py2ls/data/styles/stylelib/retro.mplstyle +4 -0
py2ls/data/styles/stylelib/sans.mplstyle +10 -0
py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
py2ls/data/styles/stylelib/science.mplstyle +48 -0
py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
py2ls/data/tiles.csv +146 -0
py2ls/data/usages_pd.json +1417 -0
py2ls/data/usages_sns.json +31 -0
py2ls/docker2ls.py +5446 -0
py2ls/ec2ls.py +61 -0
py2ls/fetch_update.py +145 -0
py2ls/ich2ls.py +1955 -296
py2ls/im2.py +8242 -0
py2ls/image_ml2ls.py +2100 -0
py2ls/ips.py +33909 -3418
py2ls/ml2ls.py +7700 -0
py2ls/mol.py +289 -0
py2ls/mount2ls.py +1307 -0
py2ls/netfinder.py +873 -351
py2ls/nl2ls.py +283 -0
py2ls/ocr.py +1581 -458
py2ls/plot.py +10394 -314
py2ls/rna2ls.py +311 -0
py2ls/ssh2ls.md +456 -0
py2ls/ssh2ls.py +5933 -0
py2ls/ssh2ls_v01.py +2204 -0
py2ls/stats.py +66 -172
py2ls/temp20251124.py +509 -0
py2ls/translator.py +2 -0
py2ls/utils/decorators.py +3564 -0
py2ls/utils_bio.py +3453 -0
{py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
{py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
{py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0

py2ls/netfinder.py CHANGED Viewed

@@ -1,37 +1,19 @@
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 import requests
-from requests.utils import dict_from_cookiejar
-from requests.exceptions import ChunkedEncodingError, ConnectionError
 import os
-from urllib.parse import urlparse, urljoin
-import base64
+from tqdm import tqdm
+import chardet
 import pandas as pd
-from collections import Counter
-import random
 import logging
-from time import sleep
-import stem.process
-from stem import Signal
-from stem.control import Controller
 import json
-from fake_useragent import UserAgent
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from webdriver_manager.chrome import ChromeDriverManager
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-from pprint import pp
-import mimetypes
-import io
-import matplotlib.pyplot as plt
-from PIL import Image
-from duckduckgo_search import DDGS
-from datetime import datetime
 import time
-from py2ls import ips
+from selenium.webdriver.common.by import By
+from . import ips
+import random
+try:
+    import scrapy
+except ImportError:
+    scrapy = None
 dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
 # Set up logging
@@ -48,20 +30,66 @@ CONTENT_PARSERS = {
     "text/xml": lambda text, parser: BeautifulSoup(text, parser),
     "text/plain": lambda text, parser: text.text,
 }
+# Fallback pool of common User-Agent strings
+fallback_user_agents = [
+    # Chrome (Windows)
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
+    # Firefox (Mac)
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0",
+    # Edge (Windows)
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203",
+    # Safari (Mac)
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
+    # Linux Chrome
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.90 Safari/537.36",
+    # Android Tablet (Samsung)
+    "Mozilla/5.0 (Linux; Android 9; SAMSUNG SM-T860) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/10.1 Chrome/71.0.3578.99 Safari/537.36",
+    # iPhone Safari
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Mobile/15E148 Safari/604.1",
+    # Android Mobile Chrome
+    "Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.154 Mobile Safari/537.36",
+    # iPad Safari
+    "Mozilla/5.0 (iPad; CPU OS 15_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Mobile/15E148 Safari/604.1",
+    # Opera (Windows)
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 OPR/86.0.4363.32",
+    # Brave (Mac)
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
+    # Vivaldi (Windows)
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Vivaldi/5.1.2567.49",
+    # Android Chrome OnePlus
+    "Mozilla/5.0 (Linux; Android 10; ONEPLUS A6010) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.74 Mobile Safari/537.36",
+    # Samsung Galaxy S22 Chrome
+    "Mozilla/5.0 (Linux; Android 12; SAMSUNG SM-S901B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
+    # Xiaomi MIUI Browser
+    "Mozilla/5.0 (Linux; Android 11; M2012K11AG) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.125 Mobile Safari/537.36",
+    # Desktop Safari on macOS Ventura
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
+]
 def user_agent(
     browsers=["chrome", "edge", "firefox", "safari"],
     platforms=["pc", "tablet"],
     verbose=False,
-    os=["windows", "macos", "linux"],
+    os_names=["windows", "macos", "linux"],
 ):
-    ua = UserAgent(browsers=browsers, platforms=platforms, os=os)
-    output_ua = ua.random
+    import warnings
+    import traceback
+    try:
+        from fake_useragent import UserAgent
+        ua = UserAgent(browsers=browsers, platforms=platforms, os=os_names)
+        output_ua = ua.random
+    except Exception as e:
+        warnings.warn("fake_useragent failed, using fallback list instead.\n" + str(e))
+        if verbose:
+            traceback.print_exc()
+        output_ua = random.choice(fallback_user_agents)
     if verbose:
-        print(output_ua)
-    return output_ua
+        print("Selected User-Agent:", output_ua)
+    return output_ua
 def get_tags(content, ascending=True):
     tag_names = set()
@@ -109,6 +137,8 @@ def get_attr(content, where=None, attr=None, **kwargs):
             else:
                 print(f"The attribute '{attr}' is not found in the elements.")
         else:
+            from pprint import pp
             print(f"Cannot find tag '{where}' in the content.")
             print("Available tags:")
             pp(all_tags)
@@ -136,8 +166,8 @@ def extract_text_from_content(
     def extract_text(element):
         texts = ""
-        if isinstance(element, str) and element.strip():
-            texts += element.strip()
+        if isinstance(element, NavigableString) and element.strip():
+            texts += element.strip() + " "
         elif hasattr(element, "children"):
             for child in element.children:
                 texts += extract_text(child)
@@ -192,6 +222,8 @@ def extract_text_from_content(
                 texts = ""
                 for tag in result_set:
                     texts = texts + " " + extract_text(tag) + " \n"
+                    # texts = texts + " " + tag.get_text(" ", strip=True)+ " \n"
                 text_list = [tx.strip() for tx in texts.split(" \n") if tx.strip()]
                 return text_list
             else:
@@ -237,6 +269,8 @@ def flatten_json(y):
 def get_proxy():
+    import random
     list_ = []
     headers = {"User-Agent": user_agent()}
     response = requests.get(
@@ -275,6 +309,8 @@ def get_cookies(url, login={"username": "your_username", "password": "your_passw
 ### 更加平滑地移动鼠标, 这样更容易反爬
 def scroll_smth_steps(driver, scroll_pause=0.5, min_step=200, max_step=600):
+    import random
     """Smoothly scrolls down the page to trigger lazy loading."""
     current_scroll_position = 0
     end_of_page = driver.execute_script("return document.body.scrollHeight")
@@ -327,13 +363,164 @@ def corr_by_kind(wait_until_kind):
         raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")
+def parse_cookies(cookies_str):
+    """
+    直接复制于browser,它可以负责转换成最终的dict
+    """
+    import re
+    cookies_dict = {}
+    # Split the string by newlines to get each cookie row
+    cookies_list = cookies_str.strip().split("\n")
+    for cookie in cookies_list:
+        # Use regular expression to capture name and value pairs
+        match = re.match(r"([a-zA-Z0-9_\-\.]+)\s+([^\s]+)", cookie)
+        if match:
+            cookie_name = match.group(1)
+            cookie_value = match.group(2)
+            cookies_dict[cookie_name] = cookie_value
+    return cookies_dict
+def fetch_scrapy(
+    url,
+    parser="html.parser",
+    cookies=None,
+    headers=None,
+    settings=None,
+):
+    """
+    Fetches content using Scrapy with proper reactor handling.
+    Args:
+        url (str): The URL to scrape.
+        parser (str): Parser for BeautifulSoup (e.g., "lxml", "html.parser").
+        cookies (dict): Cookies to pass in the request.
+        headers (dict): HTTP headers for the request.
+        settings (dict): Scrapy settings, if any.
+    Returns:
+        dict: Parsed content as a dictionary.
+    """
+    from scrapy.utils.project import get_project_settings
+    from scrapy.crawler import CrawlerRunner
+    from scrapy.signalmanager import dispatcher
+    from scrapy import signals
+    from twisted.internet import reactor, defer
+    from twisted.internet.error import ReactorNotRestartable
+    import scrapy
+    import logging
+    # Disable Scrapy's excessive logging
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+    logging.getLogger('twisted').setLevel(logging.WARNING)
+    # Container for scraped content
+    content = []
+    # Define the spider class inside the function
+    class FetchSpider(scrapy.Spider):
+        name = "fetch_spider"
+        def __init__(self, url=None, parser=None, cookies=None, headers=None, *args, **kwargs):
+            super(FetchSpider, self).__init__(*args, **kwargs)
+            self.start_urls = [url]
+            self.parser = parser
+            self.cookies = cookies
+            self.headers = headers
+        def start_requests(self):
+            for url in self.start_urls:
+                yield scrapy.Request(
+                    url,
+                    cookies=self.cookies,
+                    headers=self.headers,
+                    callback=self.parse
+                )
+        def parse(self, response):
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(response.text, self.parser)
+            yield {
+                "content": soup,
+                "url": response.url,
+                "status": response.status
+            }
+    # Callback function for item scraped signal
+    def handle_item(item, response, spider):
+        content.append(item)
+    # Scrapy settings
+    process_settings = settings or get_project_settings()
+    process_settings.update(
+        {
+            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "DOWNLOAD_DELAY": 1,
+            "COOKIES_ENABLED": bool(cookies),
+            "LOG_LEVEL": "ERROR",
+            "RETRY_ENABLED": False,
+            "HTTPERROR_ALLOW_ALL": True,
+        }
+    )
+    # Connect item scraped signal
+    dispatcher.connect(handle_item, signal=signals.item_scraped)
+    # Asynchronous Twisted function
+    @defer.inlineCallbacks
+    def crawl():
+        runner = CrawlerRunner(settings=process_settings)
+        yield runner.crawl(
+            FetchSpider,
+            url=url,
+            parser=parser,
+            cookies=cookies,
+            headers=headers,
+        )
+        reactor.stop()
+    # Handle reactor execution
+    try:
+        if not reactor.running:
+            crawl()
+            reactor.run(installSignalHandlers=0)
+        else:
+            # This case is problematic - reactor can't be restarted
+            raise RuntimeError("Reactor already running. Cannot run multiple crawls in same process.")
+    except ReactorNotRestartable:
+        raise RuntimeError("Scrapy reactor cannot be restarted. Create a new process for additional crawls.")
+    # Return the first scraped content or None if empty
+    return content[0] if content else None
+def _clean_temp():
+    import os
+    import shutil
+    import tempfile
+    from pathlib import Path
+    # Get the parent folder of the tempdir
+    temp_dir = Path(tempfile.gettempdir()).parent  # moves from /T to parent dir
+    for subdir in temp_dir.iterdir():
+        if subdir.is_dir():
+            for d in subdir.iterdir():
+                if "com.google.Chrome.code_sign_clone" in d.name:
+                    try:
+                        print(f"Removing: {d}")
+                        shutil.rmtree(d)
+                    except Exception as e:
+                        print(f"Error removing {d}: {e}")
 def fetch_all(
     url,
     parser="lxml",
     driver="request",  # request or selenium
     by=By.TAG_NAME,
     timeout=10,
-    retry=2,
+    retry=3,  # Increased default retries
     wait=0,
     wait_until=None,
     wait_until_kind=None,
@@ -347,221 +534,225 @@ def fetch_all(
     username_by=By.NAME,
     password_by=By.NAME,
     submit_by=By.NAME,
-    # capability='eager', # eager or none
-    proxy=None,  # Add proxy parameter
-    javascript=True,  # Add JavaScript option
-    disable_images=False,  # Add option to disable images
+    proxy=None,
+    javascript=True,
+    disable_images=False,
     iframe_name=None,
     login_dict=None,
-):  # Add option to handle iframe):  # lxml is faster, # parser="html.parser"
-    try:
-        # # Generate a random user-agent string
-        # response = requests.get(url)
-        # # get cookies
-        # cookie=dict_from_cookiejar(response.cookies)
-        # # get token from cookies
-        # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
-        # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
-        headers = {"User-Agent": user_agent()}
-        if "req" in driver.lower():
-            response = requests.get(
-                url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
-            )
+    cookies=None,
+    verify_ssl=True,  # Added SSL verification option
+    follow_redirects=True,  # Added redirect control
+):
+    """
+    Enhanced fetch function with better error handling and reliability.
+    Returns:
+        tuple: (content_type, parsed_content) or (None, None) on failure
+    """
+    def _parse_content(content, content_type, parser):
+        """Helper function to parse content with fallback"""
+        try:
+            if content_type in CONTENT_PARSERS:
+                return CONTENT_PARSERS[content_type](content, parser)
+            # Fallback parsing attempts
+            if content_type.startswith('text/'):
+                try:
+                    return BeautifulSoup(content, parser)
+                except:
+                    return content
+            return content
+        except Exception as e:
+            logger.warning(f"Content parsing failed: {e}")
+            return content
-            # If the response is a redirect, follow it
-            while response.is_redirect:
-                logger.info(f"Redirecting to: {response.headers['Location']}")
+    def _make_request(url, headers, cookies, timeout, verify_ssl, follow_redirects):
+        """Helper function for HTTP requests with retries"""
+        for attempt in range(retry):
+            try:
                 response = requests.get(
-                    response.headers["Location"],
+                    url,
                     headers=headers,
-                    proxies=proxies_glob,
-                    timeout=30,
+                    cookies=cookies,
+                    timeout=timeout,
                     stream=True,
+                    verify=verify_ssl,
+                    allow_redirects=follow_redirects
                 )
-            # Check for a 403 error
-            if response.status_code == 403:
-                logger.warning("403 Forbidden error. Retrying...")
-                # Retry the request after a short delay
-                sleep(random.uniform(1, 3))
-                response = requests.get(
-                    url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
-                )
-                # Raise an error if retry also fails
+                # Handle redirects manually if needed
+                if not follow_redirects and response.is_redirect:
+                    logger.info(f"Redirect detected to: {response.headers['Location']}")
+                    return None, None
                 response.raise_for_status()
-            # Raise an error for other HTTP status codes
-            response.raise_for_status()
-            # Get the content type
-            content_type = (
-                response.headers.get("content-type", "").split(";")[0].lower()
+                return response, None
+            except requests.RequestException as e:
+                logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                if attempt == retry - 1:
+                    return None, e
+                time.sleep(random.uniform(1, 3))
+    # Convert driver integer to string if needed
+    if isinstance(driver, int):
+        drivers = ["request", "selenium", "scrapy"]
+        try:
+            driver = drivers[driver]
+        except IndexError:
+            driver = "request"
+    headers = {"User-Agent": user_agent()}
+    # Prepare cookies
+    cookie_jar = None
+    if cookies:
+        from requests.cookies import RequestsCookieJar
+        cookie_jar = RequestsCookieJar()
+        if isinstance(cookies, str):
+            cookies = parse_cookies(cookies)
+        for name, value in cookies.items():
+            cookie_jar.set(name, value)
+    try:
+        if "req" in driver.lower():
+            response, error = _make_request(
+                url, headers, cookie_jar, timeout, verify_ssl, follow_redirects
             )
-            if response.encoding:
-                content = response.content.decode(response.encoding)
-            else:
-                content = None
-            # logger.info(f"Content type: {content_type}")
-            # Check if content type is supported
-            if content_type in CONTENT_PARSERS and content:
-                return content_type, CONTENT_PARSERS[content_type](content, parser)
-            else:
-                logger.warning("Unsupported content type")
+            if error:
                 return None, None
+            content_type = response.headers.get("content-type", "").split(";")[0].lower()
+            try:
+                detected = chardet.detect(response.content)
+                encoding = detected.get("encoding") or "utf-8"
+                content = response.content.decode(encoding, errors='replace')
+            except:
+                content = response.content.decode(response.encoding or 'utf-8', errors='replace')
+            return content_type, _parse_content(content, content_type, parser)
         elif "se" in driver.lower():
+            from selenium import webdriver
+            from selenium.webdriver.chrome.service import Service
+            from selenium.webdriver.chrome.options import Options
+            from webdriver_manager.chrome import ChromeDriverManager
+            from selenium.common.exceptions import WebDriverException
             chrome_options = Options()
             chrome_options.add_argument("--headless")
             chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-gpu")
             chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
             chrome_options.add_argument(f"user-agent={user_agent()}")
             if proxy:
                 chrome_options.add_argument(f"--proxy-server={proxy}")
             if disable_images:
-                prefs = {"profile.managed_default_content_settings.images": 2}
-                chrome_options.add_experimental_option("prefs", prefs)
-            # chrome_options.page_load_strategy = capability
-            service = Service(ChromeDriverManager().install())
-            # driver_path='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/127.0.6533.119/chromedriver-mac-arm64/chromedriver'
-            # service=Service(executable_path=driver_path)
-            driver_ = webdriver.Chrome(service=service, options=chrome_options)
-            # 隐式等等待
-            if 3 < wait < 5:
-                wait_ = random.uniform(3, 5)
-            elif 5 <= wait < 8:
-                wait_ = random.uniform(5, 8)
-            elif 8 <= wait < 12:
-                wait_ = random.uniform(8, 10)
-            else:
-                wait_ = 0
-            driver_.implicitly_wait(wait_)
-            if wait_until is not None and wait_until_kind is not None:
-                strategy = corr_by_kind(wait_until_kind)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((strategy, wait_until))
-                )
-            if login_url and login_dict:
-                cookies = get_cookies(url=login_url, login=login_dict)
-                driver_.get(url)
-                for cookie_name, cookie_value in cookies.items():
-                    driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            if not javascript:
-                driver_.execute_cdp_cmd(
-                    "Emulation.setScriptExecutionDisabled", {"value": True}
-                )
-            if login_url:
-                driver_.get(login_url)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((username_by, username_field))
-                ).send_keys(username)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((password_by, password_field))
-                ).send_keys(password)
-                WebDriverWait(driver_, timeout).until(
-                    EC.element_to_be_clickable((submit_by, submit_field))
-                ).click()
-            driver_.get(url)
-            if iframe_name:
-                iframe = WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((By.NAME, iframe_name))
+                chrome_options.add_experimental_option(
+                    "prefs", {"profile.managed_default_content_settings.images": 2}
                 )
-                driver_.switch_to.frame(iframe)
-            # WebDriverWait(driver, timeout).until(
-            #     EC.presence_of_element_located((by, where))
-            # )
-            # # scroll down the page by a certain number of pixels
-            scroll_smth_steps(driver_)
-            # 设置轮询
-            for attempt in range(scroll_try):
-                page_source = driver_.page_source
-                content = BeautifulSoup(page_source, "html.parser")
-                if content and content.find_all(by):
-                    break
-                sleep(
-                    random.uniform(2, 4)
-                )  # Wait for a random time before polling again
-            driver_.quit()
-            # content = BeautifulSoup(page_source, "html.parser")
-            if content:
-                return "text/html", content
-            else:
-                logger.warning("Selenium could not fetch content")
+            driver_instance = None
+            try:
+                # Try with latest ChromeDriver first
+                service = Service(ChromeDriverManager().install())
+                driver_instance = webdriver.Chrome(service=service, options=chrome_options)
+                # Configure wait times
+                if 3 < wait < 5:
+                    wait_time = random.uniform(3, 5)
+                elif 5 <= wait < 8:
+                    wait_time = random.uniform(5, 8)
+                elif 8 <= wait < 12:
+                    wait_time = random.uniform(8, 10)
+                else:
+                    wait_time = 0
+                driver_instance.implicitly_wait(wait_time)
+                # Handle login if needed
+                if login_url and login_dict:
+                    cookies = get_cookies(url=login_url, login=login_dict)
+                    driver_instance.get(url)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+                elif cookies:
+                    driver_instance.get(url)
+                    if isinstance(cookies, str):
+                        cookies = parse_cookies(cookies)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+                if not javascript:
+                    driver_instance.execute_cdp_cmd(
+                        "Emulation.setScriptExecutionDisabled", {"value": True}
+                    )
+                # Navigate to target URL
+                driver_instance.get(url)
+                # Handle iframes if needed
+                if iframe_name:
+                    iframe = WebDriverWait(driver_instance, timeout).until(
+                        EC.presence_of_element_located((By.NAME, iframe_name))
+                    )
+                    driver_instance.switch_to.frame(iframe)
+                # Scroll to trigger dynamic content
+                scroll_smth_steps(driver_instance)
+                # Get page source with retries
+                content = None
+                for attempt in range(scroll_try):
+                    try:
+                        page_source = driver_instance.page_source
+                        content = BeautifulSoup(page_source, parser)
+                        if content and content.find_all(by):
+                            break
+                    except Exception as e:
+                        logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                    time.sleep(random.uniform(1, 3))
+                try:
+                    _clean_temp()
+                except Exception as e:
+                    print(e)
+                return "text/html", content if content else None
+            except WebDriverException as e:
+                logger.error(f"Selenium error: {e}")
                 return None, None
-    except requests.RequestException as e:
-        logger.error(f"Error fetching URL '{url}': {e}")
+            finally:
+                if driver_instance:
+                    driver_instance.quit()
+        elif 'scr' in driver.lower():
+            settings = {
+                "USER_AGENT": user_agent(),
+                "DOWNLOAD_DELAY": 1,
+                "COOKIES_ENABLED": bool(cookies),
+                "LOG_LEVEL": "WARNING",
+                "RETRY_TIMES": retry,
+                "DOWNLOAD_TIMEOUT": timeout,
+            }
+            content = fetch_scrapy(
+                url,
+                parser=parser,
+                cookies=cookies,
+                headers=headers,
+                settings=settings
+            )
+            return parser, content
+    except Exception as e:
+        logger.error(f"Unexpected error in fetch_all: {e}")
         return None, None
+    return None, None
-# # Function to change Tor IP address
-# def renew_tor_ip():
-#     with Controller.from_port(port=9051) as controller:
-#         controller.authenticate()
-#         controller.signal(Signal.NEWNYM)
-# # Function to make requests through Tor
-# def make_tor_request(url, max_retries=3):
-#     renew_tor_ip()
-#     headers = {"User-Agent": user_agent()}
-#     session = requests.Session()
-#     session.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}
-#     for i in range(max_retries):
-#         try:
-#             response = session.get(url, headers=headers, timeout=10)
-#             if response.status_code == 200:
-#                 return response.text
-#         except requests.exceptions.RequestException as e:
-#             print(f"Error: {e}")
-#         time.sleep(2)  # Add a delay between retries
-#     return None
-# def find_links(url,driver='request'):
-#     links_href,cond_ex= [],["javascript:","mailto:","tel:","fax:"]
-#     content_type, soup = fetch_all(url,driver=driver)
-#     if soup:
-#         base_url = urlparse(url)
-#         # Extract links from both 'href' and 'src' attributes across relevant tags
-#         tags_with_links = ['a', 'img', 'script', 'link', 'iframe', 'embed','span']
-#         elements = []
-#         for tag in tags_with_links:
-#             elements.extend(soup.find_all(tag, href=True))
-#             elements.extend(soup.find_all(tag, src=True))
-#         for element in elements:
-#             link_href = element.get('href') or element.get('src')
-#             if link_href:
-#                 if link_href.startswith("//"):
-#                     link_href = "http:" + link_href
-#                 elif not link_href.startswith(("http", "https")):
-#                     link_href = urljoin(base_url.geturl(), link_href)
-#                 if all(exclusion not in link_href for exclusion in cond_ex):
-#                     links_href.append(link_href)
-#         return list(set(links_href))  # Remove duplicates
-#     elif url.split('.')[-1] in ['pdf']:
-#         return url
-#     else:
-#         return None
 def find_links(url, driver="request", booster=False):
+    from urllib.parse import urlparse, urljoin
     links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
     content_type, soup = fetch_all(url, driver=driver)
@@ -582,7 +773,7 @@ def find_links(url, driver="request", booster=False):
                 if all(exclusion not in link_href for exclusion in cond_ex):
                     links_href.append(link_href)
-        unique_links = list(set(links_href))  # Remove duplicates
+        unique_links = ips.unique(links_href)  # Remove duplicates
         if booster:
             for link in unique_links:
@@ -590,7 +781,7 @@ def find_links(url, driver="request", booster=False):
                     sub_links = find_links(link, driver=driver, booster=False)
                     if sub_links:
                         links_href.extend(sub_links)
-            links_href = list(set(links_href))  # Remove duplicates again
+            links_href = ips.unique(links_href)  # Remove duplicates again
         return links_href
@@ -602,6 +793,8 @@ def find_links(url, driver="request", booster=False):
 # To determine which links are related to target domains(e.g., pages) you are interested in
 def filter_links(links, contains="html", driver="requ", booster=False):
+    from urllib.parse import urlparse, urljoin
     filtered_links = []
     if isinstance(contains, str):
         contains = [contains]
@@ -614,10 +807,13 @@ def filter_links(links, contains="html", driver="requ", booster=False):
         )
         if condition:
             filtered_links.append(link)
-    return filtered_links
+    return ips.unique(filtered_links)
 def find_domain(links):
+    from urllib.parse import urlparse, urljoin
+    from collections import Counter
     if not links:
         return None
     domains = [urlparse(link).netloc for link in links]
@@ -672,6 +868,8 @@ def pdf_detector(url, contains=None, dir_save=None, booster=False):
             pdf_links = filter_links(links=links_all, contains=["pdf"])
     if pdf_links:
+        from pprint import pp
         pp(f"pdf detected{pdf_links}")
     else:
         print("no pdf file")
@@ -693,10 +891,9 @@ def pdf_detector(url, contains=None, dir_save=None, booster=False):
             idx += 1
         print(f"{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}")
 def downloader(
     url,
-    dir_save=dir_save,
+    dir_save=None,
     kind=[".pdf"],
     contains=None,
     rm_folder=False,
@@ -705,38 +902,157 @@ def downloader(
     timeout=30,
     n_try=3,
     timestamp=False,
+    chunk_size=8192,
+    retry_delay=2,
 ):
-    if verbose:
-        print(
-            "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
-        )
+    """
+    Enhanced file downloader with robust error handling and resume capability
+    Args:
+        url: URL or list of URLs to download
+        dir_save: Directory to save files (None for current directory)
+        kind: List of file extensions to filter for (e.g., ['.pdf', '.xls'])
+        contains: String that must be present in the filename
+        rm_folder: Whether to remove the target folder before downloading
+        booster: Whether to search for links on the page
+        verbose: Whether to print progress information
+        timeout: Connection timeout in seconds
+        n_try: Number of retry attempts
+        timestamp: Whether to add timestamp to filenames
+        chunk_size: Download chunk size in bytes
+        retry_delay: Delay between retries in seconds
+    """
+    import os
+    import time
+    import shutil
+    import requests
+    from requests.exceptions import (ChunkedEncodingError, ConnectionError,
+                                  RequestException, Timeout)
+    from urllib.parse import urlparse
+    from datetime import datetime
+    if verbose and ips.run_once_within():
+        print("usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)")
+    # -------------------- wget integration helper --------------------
+    def _wget_available():
+        """Check if wget exists on system"""
+        return shutil.which("wget") is not None
+    def _wget_download(url, out_path):
+        import subprocess
+        """Download a file using system wget with progress bar"""
+        os.makedirs(os.path.dirname(out_path), exist_ok=True)
+        try:
+            subprocess.run(
+                ["wget", "-c", "--show-progress", "--progress=bar:force", "-O", out_path, url],
+                check=True,
+            )
+            return True
+        except Exception as e:
+            if verbose:
+                print(f"wget download failed: {e}")
+            return False
+    # -----------------------------------------------------------------
     def fname_corrector(fname, ext):
+        """Ensure filename has correct extension"""
         if not ext.startswith("."):
             ext = "." + ext
-        if not fname.endswith("ext"):  # if not ext in fname:
-            fname = fname[: -len(ext)] + ext
+        if not fname.endswith(ext):
+            fname = os.path.splitext(fname)[0] + ext
+        if not any(fname[:-len(ext)]):
+            fname = datetime.now().strftime("%H%M%S") + ext
         return fname
     def check_and_modify_filename(directory, filename):
+        """Handle duplicate filenames by adding counter"""
         base, ext = os.path.splitext(filename)
         counter = 1
         new_filename = filename
         while os.path.exists(os.path.join(directory, new_filename)):
-            if counter <= 9:
-                counter_ = "0" + str(counter)
-            else:
-                counter_ = str(counter)
-            new_filename = f"{base}_{counter_}{ext}"
+            new_filename = f"{base}_{counter:02d}{ext}"
             counter += 1
         return new_filename
-    fpath_tmp, corrected_fname = None, None
+    def get_partial_file_size(filepath):
+        """Get size of partially downloaded file"""
+        try:
+            return os.path.getsize(filepath)
+        except OSError:
+            return 0
+    def download_with_resume(url, filepath, headers=None):
+        """Download with resume capability"""
+        headers = headers or {}
+        initial_size = get_partial_file_size(filepath)
+        if initial_size > 0:
+            headers['Range'] = f'bytes={initial_size}-'
+            mode = 'ab'
+        else:
+            mode = 'wb'
+        try:
+            with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
+                r.raise_for_status()
+                total_size = int(r.headers.get('content-length', 0)) + initial_size
+                with open(filepath, mode) as f, tqdm(
+                    total=total_size,
+                    unit='B',
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    initial=initial_size,
+                    desc=os.path.basename(filepath),
+                    disable=not verbose,
+                ) as progress:
+                    for chunk in r.iter_content(chunk_size=chunk_size):
+                        if chunk:  # filter out keep-alive chunks
+                            f.write(chunk)
+                            progress.update(len(chunk))
+            return True
+        except Exception as e:
+            if verbose:
+                print(f"Download error: {e}")
+            return False
+    dir_save = dir_save or "./"
+    filename = os.path.basename(urlparse(url).path)
+    save_path = os.path.join(dir_save, filename)
+    os.makedirs(dir_save, exist_ok=True)
+    # Handle FTP URLs
+    if isinstance(url, str) and url.startswith("ftp"):
+        import urllib.request
+        try:
+            urllib.request.urlretrieve(url, save_path)
+            if verbose:
+                print(f"Downloaded FTP file to: {save_path}")
+            return save_path
+        except Exception as e:
+            print(f"FTP download failed: {e}")
+            return None
+    if kind is None and _wget_available():
+        if verbose:
+            print(f"Using wget for download: {url}")
+        success = _wget_download(url, save_path)
+        if success:
+            if verbose:
+                print(f"Successfully downloaded via wget: {save_path}")
+            return save_path
+        else:
+            if verbose:
+                print("⚠️ wget failed, falling back to requests...")
+            kind = [".*"]  # dummy
+    # Process directory and file links
     if not isinstance(kind, list):
         kind = [kind]
     if isinstance(url, list):
+        results = []
         for url_ in url:
-            downloader(
+            results.append(downloader(
                 url_,
                 dir_save=dir_save,
                 kind=kind,
@@ -746,120 +1062,100 @@ def downloader(
                 timeout=timeout,
                 n_try=n_try,
                 timestamp=timestamp,
-            )
-            # sleep(random.uniform(1, 3))
-    for i, k in enumerate(kind):
-        if not k.startswith("."):
-            kind[i] = "." + kind[i]
+            ))
+        return results
+    # Normalize file extensions
+    kind = [k if k.startswith(".") else f".{k}" for k in kind]
+    # Find and filter links
     file_links_all = []
     for kind_ in kind:
-        if isinstance(contains, str):
-            contains = [contains]
-        if isinstance(url, str):
-            if any(ext in url for ext in kind):
-                file_links = [url]
-            else:
-                if booster:
-                    links_all = []
-                    if "http" in url:
-                        links_all = find_links(url)
-                else:
-                    links_all = url
-                if contains is not None:
-                    file_links = filter_links(links_all, contains=contains + kind_)
-                else:
-                    file_links = links_all  # filter_links(links_all, contains=kind_)
-        elif isinstance(url, list):
-            links_all = url
-            if contains is not None:
-                file_links = filter_links(links_all, contains=contains + kind_)
-            else:
-                file_links = filter_links(links_all, contains=kind_)
+        if isinstance(url, str) and any(ext in url for ext in kind):
+            file_links = [url]
         else:
-            links_all = find_links(url)
-            if contains is not None:
-                file_links = filter_links(links_all, contains=contains + kind_)
-            else:
-                file_links = filter_links(links_all, contains=kind_)
+            links_all = find_links(url) if booster else ([url] if isinstance(url, str) else url)
+            file_links = filter_links(
+                links_all,
+                contains=(contains + kind_) if contains else kind_
+            )
+        file_links = ips.unique(file_links)
         if verbose:
             if file_links:
                 print("Files detected:")
+                from pprint import pp
                 pp(file_links)
             else:
-                file_links = []
                 print("No files detected")
-        if isinstance(file_links, str):
-            file_links_all = [file_links]
-        elif isinstance(file_links, list):
-            file_links_all.extend(file_links)
-    if dir_save:
-        if rm_folder:
-            ips.rm_folder(dir_save)
-        # if verbose:
-        #     print(f"\n... attempting to download to local\n")
-        fnames = [file_link.split("/")[-1] for file_link in file_links_all]
+        if file_links:
+            file_links_all.extend(file_links if isinstance(file_links, list) else [file_links])
-        for idx, file_link in enumerate(file_links_all):
-            headers = {"User-Agent": user_agent()}
-            itry = 0  # Retry logic with exception handling
-            while itry < n_try:
-                try:
-                    # streaming to handle large files and reduce memory usage.
-                    response = requests.get(
-                        file_link, headers=headers, timeout=timeout, stream=True
-                    )
-                    if response.status_code == 200:
-                        ext = next(
-                            (ftype for ftype in kind if ftype in file_link), None
-                        )
-                        if ext is None:
-                            ext = kind_
-                        print("ehereerere", ext)
-                        if ext:
-                            corrected_fname = fname_corrector(fnames[idx], ext)
-                            corrected_fname = check_and_modify_filename(
-                                dir_save, corrected_fname
-                            )
-                            if timestamp:
-                                corrected_fname = (
-                                    datetime.now().strftime("%y%m%d_%H%M%S_")
-                                    + corrected_fname
-                                )
-                            fpath_tmp = os.path.join(dir_save, corrected_fname)
-                            with open(fpath_tmp, "wb") as file:
-                                for chunk in response.iter_content(chunk_size=8192):
-                                    if chunk:  # Filter out keep-alive chunks
-                                        file.write(chunk)
-                            if verbose:
-                                print(f"Done! {fnames[idx]}")
-                        else:
-                            if verbose:
-                                print(f"Unknown file type for {file_link}")
-                        break  # Exit the retry loop if successful
-                    else:
+    file_links_all = ips.unique(file_links_all)
+    if not file_links_all:
+        return None
+    # Prepare download directory
+    dir_save = dir_save or "./"
+    if rm_folder:
+        ips.rm_folder(dir_save)
+    os.makedirs(dir_save, exist_ok=True)
+    # Download files
+    results = []
+    for file_link in file_links_all:
+        headers = {
+            "User-Agent": user_agent(),
+            "Accept-Encoding": "identity"  # Disable compression for resume support
+        }
+        # Determine filename
+        filename = os.path.basename(urlparse(file_link).path)
+        ext = next((ftype for ftype in kind if ftype in filename), kind[0])
+        corrected_fname = fname_corrector(filename, ext)
+        corrected_fname = check_and_modify_filename(dir_save, corrected_fname)
+        if timestamp:
+            corrected_fname = datetime.now().strftime("%y%m%d_%H%M%S_") + corrected_fname
+        save_path = os.path.join(dir_save, corrected_fname)
+        # Download with retry logic
+        success = False
+        for attempt in range(n_try):
+            try:
+                if verbose:
+                    print(f"Downloading {file_link} (attempt {attempt + 1}/{n_try})")
+                if _wget_available():
+                    success = _wget_download(file_link, save_path)
+                    if success:
                         if verbose:
-                            print(
-                                f"Failed to download file: HTTP status code {response.status_code}"
-                            )
+                            print(f"Successfully downloaded via wget: {save_path}")
                         break
-                except (ChunkedEncodingError, ConnectionError) as e:
-                    print(f"Attempt {itry+1} failed: {e}. Retrying in a few seconds...")
-                    # time.sleep(random.uniform(0, 2))  # Random sleep to mitigate server issues
-                    if fpath_tmp and os.path.exists(fpath_tmp):
-                        os.remove(fpath_tmp)
-                    itry += 1
-            if itry == n_try:
-                print(f"Failed to download {file_link} after {n_try} attempts.")
-        # print(f"\n{len(fnames)} files were downloaded:")
-        if verbose:
-            if corrected_fname:
-                pp(corrected_fname)
-                print(f"\n\nsaved @:\n{dir_save}")
-            else:
-                pp(fnames)
+                if download_with_resume(file_link, save_path, headers):
+                    success = True
+                    if verbose:
+                        print(f"Successfully downloaded: {save_path}")
+                    break
+            except (ChunkedEncodingError, ConnectionError, Timeout, RequestException) as e:
+                if verbose:
+                    print(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < n_try - 1:
+                    time.sleep(retry_delay)
+        if success:
+            results.append(save_path)
+        else:
+            if verbose:
+                print(f"Failed to download {file_link} after {n_try} attempts")
+            # Clean up potentially corrupted file
+            if os.path.exists(save_path):
+                os.remove(save_path)
+            results.append(None)
+    return results if len(results) != 1 else results[0]
 def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
     """
@@ -872,9 +1168,14 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
     Returns:
         str: HTML content with updated image URLs pointing to local files.
     """
+    from urllib.parse import urlparse, urljoin
+    import base64
     if rm_folder:
         ips.rm_folder(dir_save)
     content_type, content = fetch_all(url, driver=driver)
+    if content_type is None:
+        content_type=""
     if "html" in content_type.lower():
         # Create the directory if it doesn't exist
         os.makedirs(dir_save, exist_ok=True)
@@ -937,6 +1238,9 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
 def svg_to_png(svg_file):
+    import io
+    from PIL import Image
     with WandImage(filename=svg_file, resolution=300) as img:
         img.format = "png"
         png_image = img.make_blob()
@@ -1002,10 +1306,22 @@ def fetch_selenium(
     iframe_name=None,  # Add option to handle iframe
     **kwargs,
 ):
+    import random
+    from selenium import webdriver
+    from selenium.webdriver.chrome.service import Service
+    from selenium.webdriver.common.by import By
+    from selenium.webdriver.chrome.options import Options
+    from selenium.webdriver.support.ui import WebDriverWait
+    from selenium.webdriver.support import expected_conditions as EC
+    from webdriver_manager.chrome import ChromeDriverManager
+    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
     chrome_options = Options()
     chrome_options.add_argument("--headless")
     chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-gpu")
     chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
     chrome_options.add_argument(f"user-agent={user_agent()}")
     if proxy:
         chrome_options.add_argument(f"--proxy-server={proxy}")
@@ -1061,7 +1377,7 @@ def fetch_selenium(
             if attempt == retry - 1:
                 logger.error("Failed to fetch the content after all retries")
                 return []
-        sleep(random.uniform(1, 3))
+        time.sleep(random.uniform(1, 3))
     # Return empty list if nothing found after all retries
     return []
@@ -1078,6 +1394,9 @@ def fetch(
     output="text",
     **kws,
 ):
+    import random
+    from urllib.parse import urlparse, urljoin
     if "xt" in output.lower():
         for attempt in range(retry):
             if verbose and attempt == 0:
@@ -1103,12 +1422,12 @@ def fetch(
             else:
                 if texts:
                     break
-                sleep(random.uniform(0.5, 1.5))
+                time.sleep(random.uniform(0.5, 1.5))
         if isinstance(texts, pd.core.frame.DataFrame):
             condition_ = [texts.empty, booster]
         else:
             condition_ = [not texts, booster]
-        if any(condition_):
+        if any(condition_):
             print("trying to use 'fetcher2'...")
             texts = fetch_selenium(
                 url=url, where=where, what=what, extend=extend, **kws
@@ -1116,6 +1435,7 @@ def fetch(
         if texts:
             return texts
         else:
+            print("got nothing")
             return fetch(
                 url,
                 where=where,
@@ -1429,6 +1749,8 @@ def isa(fpath, kind="img"):
 def is_image(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and mime_type.startswith("image"):
         return True
@@ -1437,6 +1759,8 @@ def is_image(fpath):
 def is_document(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type and (
         mime_type.startswith("text/")
@@ -1457,6 +1781,8 @@ def is_document(fpath):
 def is_zip(fpath):
+    import mimetypes
     mime_type, _ = mimetypes.guess_type(fpath)
     if mime_type == "application/zip":
         return True
@@ -1476,6 +1802,8 @@ def search(
 ):
     if "te" in kind.lower():
+        from duckduckgo_search import DDGS
         results = DDGS().text(query, max_results=limit)
         res = pd.DataFrame(results)
         res.rename(columns={"href": "links"}, inplace=True)
@@ -1493,6 +1821,8 @@ def search(
 def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
+    from duckduckgo_search import DDGS
     def is_in_any(str_candi_short, str_full, ignore_case=True):
         if isinstance(str_candi_short, str):
             str_candi_short = [str_candi_short]
@@ -1521,8 +1851,12 @@ def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
     model_valid = valid_mod_name(model)
     res = DDGS().chat(query, model=model_valid)
     if verbose:
+        from pprint import pp
         pp(res)
     if log:
+        from datetime import datetime
         dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
         res_ = f"###{dt_str}\n\n>{res}\n"
         os.makedirs(dir_save, exist_ok=True)
@@ -1542,3 +1876,191 @@ def ai(*args, **kwargs):
     if len(args) == 1 and isinstance(args[0], str):
         kwargs["query"] = args[0]
     return echo(**kwargs)
+#! get_ip()
+def get_ip(ip=None):
+    """
+    Return public IP details for this machine, or geolocation data for a given IP.
+    Usage:
+        from py2ls import netfinder as nt
+        ip = nt.get_ip()
+    Args:
+        ip: IP address to geolocate through ipinfo.io. When None, the public
+            IPv4/IPv6 addresses (api.ipify.org / api6.ipify.org) and the
+            geolocation (ipinfo.io) of the current connection are fetched instead.
+    Returns:
+        ip is None: dict with keys "ip4", "ip6" and "geolocation" ("N/A" when an
+            entry is missing), or None if the lookup raised an exception.
+        otherwise: the decoded JSON reply for that IP, or an {"error": ...} dict
+            once every retry has failed.
+    """
+    import requests
+    import time
+    import logging
+    from datetime import datetime, timedelta
+    # basicConfig() does nothing once the root logger has handlers, so only
+    # build the handlers when they will really be installed; constructing the
+    # FileHandler unconditionally would reopen "public_ip_log.log" on every
+    # call and never close it.
+    if not logging.getLogger().handlers:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s - %(levelname)s - %(message)s",
+            handlers=[
+                logging.StreamHandler(),
+                logging.FileHandler("public_ip_log.log"),  # Log to a file
+            ],
+        )
+    # NOTE(review): this dict is recreated on every get_ip() call, so the cache
+    # lookups below can never hit; it would have to live outside the function
+    # to take effect. Left as is to keep every call returning fresh data.
+    cache = {}
+    def fetch_json(url, retries, timeout, headers, attempt_label, error_label):
+        """
+        GET `url` and return the decoded JSON body, retrying with exponential backoff.
+        Returns an {"error": ...} dict instead of raising when all attempts fail.
+        """
+        for attempt in range(retries):
+            try:
+                response = requests.get(url, timeout=timeout, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.RequestException as e:
+                # requests.Timeout is a RequestException subclass, so timeouts
+                # are handled here too; a separate clause would be unreachable.
+                logging.error(f"{attempt_label} {attempt + 1} failed: {e}")
+                if attempt < retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logging.error("Max retries reached.")
+                    return {"error": f"Error fetching {error_label}: {e}"}
+        # Only reached when retries <= 0 (the loop body never ran)
+        return {"error": f"Failed to fetch {error_label} after retries"}
+    # Function to fetch IP addresses synchronously
+    def fetch_ip(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch the IP address with retries.
+        """
+        return fetch_json(url, retries, timeout, headers, "Attempt", "IP")
+    # Function to fetch geolocation synchronously
+    def fetch_geolocation(url, retries, timeout, headers):
+        """
+        Synchronous function to fetch geolocation data by IP address.
+        """
+        return fetch_json(
+            url, retries, timeout, headers, "Geolocation request attempt", "geolocation"
+        )
+    # Main function to get public IP and geolocation
+    def get_public_ip(
+        ip4=True,
+        ip6=True,
+        verbose=True,
+        retries=3,
+        timeout=5,
+        geolocation=True,
+        headers=None,
+        cache_duration=5,
+    ):
+        """
+        Synchronously fetches public IPv4 and IPv6 addresses, along with optional geolocation info.
+        """
+        def lookup(cache_key, label, wanted, fetcher, url):
+            """Return the unexpired cached payload for `cache_key`, fetching it when requested and missing."""
+            entry = cache.get(cache_key)
+            if entry is not None and datetime.now() < entry["expires"]:
+                logging.info(f"Cache hit for {label}, using cached data.")
+                data = entry["data"]
+            else:
+                data = None
+            if wanted and not data:
+                logging.info(f"Fetching {label}...")
+                data = fetcher(url, retries, timeout, headers)
+                cache[cache_key] = {
+                    "data": data,
+                    "expires": datetime.now() + timedelta(minutes=cache_duration),
+                }
+            return data
+        ip4_data = lookup(
+            "public_ip4", "IPv4", ip4, fetch_ip, "https://api.ipify.org?format=json"
+        )
+        ip6_data = lookup(
+            "public_ip6", "IPv6", ip6, fetch_ip, "https://api6.ipify.org?format=json"
+        )
+        geolocation_data = lookup(
+            "geolocation",
+            "Geolocation",
+            geolocation,
+            fetch_geolocation,
+            "https://ipinfo.io/json",
+        )
+        # Prepare the results
+        ip_info = {
+            "ip4": ip4_data.get("ip") if ip4_data else "N/A",
+            "ip6": ip6_data.get("ip") if ip6_data else "N/A",
+            "geolocation": geolocation_data if geolocation_data else "N/A",
+        }
+        # Verbose output if requested
+        if verbose:
+            print(f"Public IPv4: {ip_info['ip4']}")
+            print(f"Public IPv6: {ip_info['ip6']}")
+            print(f"Geolocation: {ip_info['geolocation']}")
+        return ip_info
+    # Function to get geolocation data by IP
+    def get_geolocation_by_ip(ip, retries=3, timeout=5, headers=None):
+        """
+        Fetches geolocation data for a given IP address.
+        """
+        url = f"https://ipinfo.io/{ip}/json"
+        return fetch_geolocation(url, retries, timeout, headers)
+    #! here starting get_ip()
+    headers = {"User-Agent": user_agent()}
+    if ip is None:
+        try:
+            ip_data = get_public_ip(headers=headers, verbose=True)
+        except Exception as e:
+            # Best effort: report the problem and signal failure with None
+            print(e)
+            ip_data = None
+        return ip_data
+    else:
+        geolocation_data = get_geolocation_by_ip(ip, headers=headers)
+        return geolocation_data

py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl

Potentially problematic release.

py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl