py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of py2ls might be problematic.

Files changed (72)
  1. py2ls/.DS_Store +0 -0
  2. py2ls/.git/.DS_Store +0 -0
  3. py2ls/.git/index +0 -0
  4. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  5. py2ls/.git/objects/.DS_Store +0 -0
  6. py2ls/.git/refs/.DS_Store +0 -0
  7. py2ls/ImageLoader.py +621 -0
  8. py2ls/__init__.py +7 -5
  9. py2ls/apptainer2ls.py +3940 -0
  10. py2ls/batman.py +164 -42
  11. py2ls/bio.py +2595 -0
  12. py2ls/cell_image_clf.py +1632 -0
  13. py2ls/container2ls.py +4635 -0
  14. py2ls/corr.py +475 -0
  15. py2ls/data/.DS_Store +0 -0
  16. py2ls/data/email/email_html_template.html +88 -0
  17. py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
  18. py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
  19. py2ls/data/mygenes_fields_241022.txt +355 -0
  20. py2ls/data/re_common_pattern.json +173 -0
  21. py2ls/data/sns_info.json +74 -0
  22. py2ls/data/styles/.DS_Store +0 -0
  23. py2ls/data/styles/example/.DS_Store +0 -0
  24. py2ls/data/styles/stylelib/.DS_Store +0 -0
  25. py2ls/data/styles/stylelib/grid.mplstyle +15 -0
  26. py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
  27. py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
  28. py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
  29. py2ls/data/styles/stylelib/light.mplstyl +6 -0
  30. py2ls/data/styles/stylelib/muted.mplstyle +6 -0
  31. py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
  32. py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
  33. py2ls/data/styles/stylelib/nature.mplstyle +31 -0
  34. py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
  35. py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
  36. py2ls/data/styles/stylelib/paper.mplstyle +290 -0
  37. py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
  38. py2ls/data/styles/stylelib/retro.mplstyle +4 -0
  39. py2ls/data/styles/stylelib/sans.mplstyle +10 -0
  40. py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
  41. py2ls/data/styles/stylelib/science.mplstyle +48 -0
  42. py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
  43. py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
  44. py2ls/data/tiles.csv +146 -0
  45. py2ls/data/usages_pd.json +1417 -0
  46. py2ls/data/usages_sns.json +31 -0
  47. py2ls/docker2ls.py +5446 -0
  48. py2ls/ec2ls.py +61 -0
  49. py2ls/fetch_update.py +145 -0
  50. py2ls/ich2ls.py +1955 -296
  51. py2ls/im2.py +8242 -0
  52. py2ls/image_ml2ls.py +2100 -0
  53. py2ls/ips.py +33909 -3418
  54. py2ls/ml2ls.py +7700 -0
  55. py2ls/mol.py +289 -0
  56. py2ls/mount2ls.py +1307 -0
  57. py2ls/netfinder.py +873 -351
  58. py2ls/nl2ls.py +283 -0
  59. py2ls/ocr.py +1581 -458
  60. py2ls/plot.py +10394 -314
  61. py2ls/rna2ls.py +311 -0
  62. py2ls/ssh2ls.md +456 -0
  63. py2ls/ssh2ls.py +5933 -0
  64. py2ls/ssh2ls_v01.py +2204 -0
  65. py2ls/stats.py +66 -172
  66. py2ls/temp20251124.py +509 -0
  67. py2ls/translator.py +2 -0
  68. py2ls/utils/decorators.py +3564 -0
  69. py2ls/utils_bio.py +3453 -0
  70. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
  71. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
  72. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/netfinder.py CHANGED
@@ -1,37 +1,19 @@
- from bs4 import BeautifulSoup
+ from bs4 import BeautifulSoup, NavigableString
  import requests
- from requests.utils import dict_from_cookiejar
- from requests.exceptions import ChunkedEncodingError, ConnectionError
  import os
- from urllib.parse import urlparse, urljoin
- import base64
+ from tqdm import tqdm
+ import chardet
  import pandas as pd
- from collections import Counter
- import random
  import logging
- from time import sleep
- import stem.process
- from stem import Signal
- from stem.control import Controller
  import json
- from fake_useragent import UserAgent
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.common.by import By
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from webdriver_manager.chrome import ChromeDriverManager
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- from pprint import pp
- import mimetypes
- import io
- import matplotlib.pyplot as plt
- from PIL import Image
- from duckduckgo_search import DDGS
- from datetime import datetime
  import time
- from py2ls import ips
+ from selenium.webdriver.common.by import By
+ from . import ips
+ import random
+ try:
+     import scrapy
+ except ImportError:
+     scrapy = None
 
  dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
  # Set up logging
@@ -48,20 +30,66 @@ CONTENT_PARSERS = {
      "text/xml": lambda text, parser: BeautifulSoup(text, parser),
      "text/plain": lambda text, parser: text.text,
  }
- 
- 
+ 
+ # Fallback pool of common User-Agent strings
+ fallback_user_agents = [
+     # Chrome (Windows)
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
+     # Firefox (Mac)
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0",
+     # Edge (Windows)
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203",
+     # Safari (Mac)
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
+     # Linux Chrome
+     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.90 Safari/537.36",
+     # Android Tablet (Samsung)
+     "Mozilla/5.0 (Linux; Android 9; SAMSUNG SM-T860) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/10.1 Chrome/71.0.3578.99 Safari/537.36",
+     # iPhone Safari
+     "Mozilla/5.0 (iPhone; CPU iPhone OS 16_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Mobile/15E148 Safari/604.1",
+     # Android Mobile Chrome
+     "Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.154 Mobile Safari/537.36",
+     # iPad Safari
+     "Mozilla/5.0 (iPad; CPU OS 15_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Mobile/15E148 Safari/604.1",
+     # Opera (Windows)
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 OPR/86.0.4363.32",
+     # Brave (Mac)
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
+     # Vivaldi (Windows)
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Vivaldi/5.1.2567.49",
+     # Android Chrome OnePlus
+     "Mozilla/5.0 (Linux; Android 10; ONEPLUS A6010) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.74 Mobile Safari/537.36",
+     # Samsung Galaxy S22 Chrome
+     "Mozilla/5.0 (Linux; Android 12; SAMSUNG SM-S901B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
+     # Xiaomi MIUI Browser
+     "Mozilla/5.0 (Linux; Android 11; M2012K11AG) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.125 Mobile Safari/537.36",
+     # Desktop Safari on macOS Ventura
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
+ ]
  def user_agent(
      browsers=["chrome", "edge", "firefox", "safari"],
      platforms=["pc", "tablet"],
      verbose=False,
-     os=["windows", "macos", "linux"],
+     os_names=["windows", "macos", "linux"],
  ):
-     ua = UserAgent(browsers=browsers, platforms=platforms, os=os)
-     output_ua = ua.random
+     import warnings
+     import traceback
+ 
+     try:
+         from fake_useragent import UserAgent
+ 
+         ua = UserAgent(browsers=browsers, platforms=platforms, os=os_names)
+         output_ua = ua.random
+     except Exception as e:
+         warnings.warn("fake_useragent failed, using fallback list instead.\n" + str(e))
+         if verbose:
+             traceback.print_exc()
+         output_ua = random.choice(fallback_user_agents)
+ 
      if verbose:
-         print(output_ua)
-     return output_ua
+         print("Selected User-Agent:", output_ua)
 
+     return output_ua
 
  def get_tags(content, ascending=True):
      tag_names = set()
@@ -109,6 +137,8 @@ def get_attr(content, where=None, attr=None, **kwargs):
          else:
              print(f"The attribute '{attr}' is not found in the elements.")
      else:
+         from pprint import pp
+ 
          print(f"Cannot find tag '{where}' in the content.")
          print("Available tags:")
          pp(all_tags)
@@ -136,8 +166,8 @@ def extract_text_from_content(
 
      def extract_text(element):
          texts = ""
-         if isinstance(element, str) and element.strip():
-             texts += element.strip()
+         if isinstance(element, NavigableString) and element.strip():
+             texts += element.strip() + " "
          elif hasattr(element, "children"):
              for child in element.children:
                  texts += extract_text(child)
@@ -192,6 +222,8 @@ def extract_text_from_content(
          texts = ""
          for tag in result_set:
              texts = texts + " " + extract_text(tag) + " \n"
+             # texts = texts + " " + tag.get_text(" ", strip=True)+ " \n"
+ 
          text_list = [tx.strip() for tx in texts.split(" \n") if tx.strip()]
          return text_list
      else:
@@ -237,6 +269,8 @@ def flatten_json(y):
 
 
  def get_proxy():
+     import random
+ 
      list_ = []
      headers = {"User-Agent": user_agent()}
      response = requests.get(
@@ -275,6 +309,8 @@ def get_cookies(url, login={"username": "your_username", "password": "your_passw
 
  ### Move the mouse more smoothly; this makes it easier to get past anti-scraping checks
  def scroll_smth_steps(driver, scroll_pause=0.5, min_step=200, max_step=600):
+     import random
+ 
      """Smoothly scrolls down the page to trigger lazy loading."""
      current_scroll_position = 0
      end_of_page = driver.execute_script("return document.body.scrollHeight")
@@ -327,13 +363,164 @@ def corr_by_kind(wait_until_kind):
          raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")
 
 
+ 
+ 
+ def parse_cookies(cookies_str):
+     """
+     Copied straight from the browser; this takes care of converting it into the final dict.
+     """
+     import re
+     cookies_dict = {}
+ 
+     # Split the string by newlines to get each cookie row
+     cookies_list = cookies_str.strip().split("\n")
+ 
+     for cookie in cookies_list:
+         # Use regular expression to capture name and value pairs
+         match = re.match(r"([a-zA-Z0-9_\-\.]+)\s+([^\s]+)", cookie)
+         if match:
+             cookie_name = match.group(1)
+             cookie_value = match.group(2)
+             cookies_dict[cookie_name] = cookie_value
+ 
+     return cookies_dict
+ def fetch_scrapy(
+     url,
+     parser="html.parser",
+     cookies=None,
+     headers=None,
+     settings=None,
+ ):
+     """
+     Fetches content using Scrapy with proper reactor handling.
+ 
+     Args:
+         url (str): The URL to scrape.
+         parser (str): Parser for BeautifulSoup (e.g., "lxml", "html.parser").
+         cookies (dict): Cookies to pass in the request.
+         headers (dict): HTTP headers for the request.
+         settings (dict): Scrapy settings, if any.
+ 
+     Returns:
+         dict: Parsed content as a dictionary.
+     """
+     from scrapy.utils.project import get_project_settings
+     from scrapy.crawler import CrawlerRunner
+     from scrapy.signalmanager import dispatcher
+     from scrapy import signals
+     from twisted.internet import reactor, defer
+     from twisted.internet.error import ReactorNotRestartable
+     import scrapy
+     import logging
+ 
+     # Disable Scrapy's excessive logging
+     logging.getLogger('scrapy').setLevel(logging.WARNING)
+     logging.getLogger('twisted').setLevel(logging.WARNING)
+ 
+     # Container for scraped content
+     content = []
+ 
+     # Define the spider class inside the function
+     class FetchSpider(scrapy.Spider):
+         name = "fetch_spider"
+ 
+         def __init__(self, url=None, parser=None, cookies=None, headers=None, *args, **kwargs):
+             super(FetchSpider, self).__init__(*args, **kwargs)
+             self.start_urls = [url]
+             self.parser = parser
+             self.cookies = cookies
+             self.headers = headers
+ 
+         def start_requests(self):
+             for url in self.start_urls:
+                 yield scrapy.Request(
+                     url,
+                     cookies=self.cookies,
+                     headers=self.headers,
+                     callback=self.parse
+                 )
+ 
+         def parse(self, response):
+             from bs4 import BeautifulSoup
+             soup = BeautifulSoup(response.text, self.parser)
+             yield {
+                 "content": soup,
+                 "url": response.url,
+                 "status": response.status
+             }
+ 
+     # Callback function for item scraped signal
+     def handle_item(item, response, spider):
+         content.append(item)
+ 
+     # Scrapy settings
+     process_settings = settings or get_project_settings()
+     process_settings.update(
+         {
+             "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+             "DOWNLOAD_DELAY": 1,
+             "COOKIES_ENABLED": bool(cookies),
+             "LOG_LEVEL": "ERROR",
+             "RETRY_ENABLED": False,
+             "HTTPERROR_ALLOW_ALL": True,
+         }
+     )
+ 
+     # Connect item scraped signal
+     dispatcher.connect(handle_item, signal=signals.item_scraped)
+ 
+     # Asynchronous Twisted function
+     @defer.inlineCallbacks
+     def crawl():
+         runner = CrawlerRunner(settings=process_settings)
+         yield runner.crawl(
+             FetchSpider,
+             url=url,
+             parser=parser,
+             cookies=cookies,
+             headers=headers,
+         )
+         reactor.stop()
+ 
+     # Handle reactor execution
+     try:
+         if not reactor.running:
+             crawl()
+             reactor.run(installSignalHandlers=0)
+         else:
+             # This case is problematic - reactor can't be restarted
+             raise RuntimeError("Reactor already running. Cannot run multiple crawls in same process.")
+     except ReactorNotRestartable:
+         raise RuntimeError("Scrapy reactor cannot be restarted. Create a new process for additional crawls.")
+ 
+     # Return the first scraped content or None if empty
+     return content[0] if content else None
+ 
+ def _clean_temp():
+     import os
+     import shutil
+     import tempfile
+     from pathlib import Path
+ 
+     # Get the parent folder of the tempdir
+     temp_dir = Path(tempfile.gettempdir()).parent # moves from /T to parent dir
+ 
+     for subdir in temp_dir.iterdir():
+         if subdir.is_dir():
+             for d in subdir.iterdir():
+                 if "com.google.Chrome.code_sign_clone" in d.name:
+                     try:
+                         print(f"Removing: {d}")
+                         shutil.rmtree(d)
+                     except Exception as e:
+                         print(f"Error removing {d}: {e}")
  def fetch_all(
      url,
      parser="lxml",
      driver="request", # request or selenium
      by=By.TAG_NAME,
      timeout=10,
-     retry=2,
+     retry=3, # Increased default retries
      wait=0,
      wait_until=None,
      wait_until_kind=None,
@@ -347,221 +534,225 @@ def fetch_all(
347
534
  username_by=By.NAME,
348
535
  password_by=By.NAME,
349
536
  submit_by=By.NAME,
350
- # capability='eager', # eager or none
351
- proxy=None, # Add proxy parameter
352
- javascript=True, # Add JavaScript option
353
- disable_images=False, # Add option to disable images
537
+ proxy=None,
538
+ javascript=True,
539
+ disable_images=False,
354
540
  iframe_name=None,
355
541
  login_dict=None,
356
- ): # Add option to handle iframe): # lxml is faster, # parser="html.parser"
357
- try:
358
- # # Generate a random user-agent string
359
- # response = requests.get(url)
360
- # # get cookies
361
- # cookie=dict_from_cookiejar(response.cookies)
362
- # # get token from cookies
363
- # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
364
- # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
365
-
366
- headers = {"User-Agent": user_agent()}
367
- if "req" in driver.lower():
368
- response = requests.get(
369
- url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
370
- )
542
+ cookies=None,
543
+ verify_ssl=True, # Added SSL verification option
544
+ follow_redirects=True, # Added redirect control
545
+ ):
546
+ """
547
+ Enhanced fetch function with better error handling and reliability.
548
+
549
+ Returns:
550
+ tuple: (content_type, parsed_content) or (None, None) on failure
551
+ """
552
+ def _parse_content(content, content_type, parser):
553
+ """Helper function to parse content with fallback"""
554
+ try:
555
+ if content_type in CONTENT_PARSERS:
556
+ return CONTENT_PARSERS[content_type](content, parser)
557
+
558
+ # Fallback parsing attempts
559
+ if content_type.startswith('text/'):
560
+ try:
561
+ return BeautifulSoup(content, parser)
562
+ except:
563
+ return content
564
+ return content
565
+ except Exception as e:
566
+ logger.warning(f"Content parsing failed: {e}")
567
+ return content
371
568
 
372
- # If the response is a redirect, follow it
373
- while response.is_redirect:
374
- logger.info(f"Redirecting to: {response.headers['Location']}")
569
+ def _make_request(url, headers, cookies, timeout, verify_ssl, follow_redirects):
570
+ """Helper function for HTTP requests with retries"""
571
+ for attempt in range(retry):
572
+ try:
375
573
  response = requests.get(
376
- response.headers["Location"],
574
+ url,
377
575
  headers=headers,
378
- proxies=proxies_glob,
379
- timeout=30,
576
+ cookies=cookies,
577
+ timeout=timeout,
380
578
  stream=True,
579
+ verify=verify_ssl,
580
+ allow_redirects=follow_redirects
381
581
  )
382
- # Check for a 403 error
383
- if response.status_code == 403:
384
- logger.warning("403 Forbidden error. Retrying...")
385
- # Retry the request after a short delay
386
- sleep(random.uniform(1, 3))
387
- response = requests.get(
388
- url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
389
- )
390
- # Raise an error if retry also fails
582
+
583
+ # Handle redirects manually if needed
584
+ if not follow_redirects and response.is_redirect:
585
+ logger.info(f"Redirect detected to: {response.headers['Location']}")
586
+ return None, None
587
+
391
588
  response.raise_for_status()
392
-
393
- # Raise an error for other HTTP status codes
394
- response.raise_for_status()
395
-
396
- # Get the content type
397
- content_type = (
398
- response.headers.get("content-type", "").split(";")[0].lower()
589
+ return response, None
590
+
591
+ except requests.RequestException as e:
592
+ logger.warning(f"Attempt {attempt + 1} failed: {e}")
593
+ if attempt == retry - 1:
594
+ return None, e
595
+ time.sleep(random.uniform(1, 3))
596
+
597
+ # Convert driver integer to string if needed
598
+ if isinstance(driver, int):
599
+ drivers = ["request", "selenium", "scrapy"]
600
+ try:
601
+ driver = drivers[driver]
602
+ except IndexError:
603
+ driver = "request"
604
+
605
+ headers = {"User-Agent": user_agent()}
606
+
607
+ # Prepare cookies
608
+ cookie_jar = None
609
+ if cookies:
610
+ from requests.cookies import RequestsCookieJar
611
+ cookie_jar = RequestsCookieJar()
612
+ if isinstance(cookies, str):
613
+ cookies = parse_cookies(cookies)
614
+ for name, value in cookies.items():
615
+ cookie_jar.set(name, value)
616
+
617
+ try:
618
+ if "req" in driver.lower():
619
+ response, error = _make_request(
620
+ url, headers, cookie_jar, timeout, verify_ssl, follow_redirects
399
621
  )
400
- if response.encoding:
401
- content = response.content.decode(response.encoding)
402
- else:
403
- content = None
404
- # logger.info(f"Content type: {content_type}")
405
-
406
- # Check if content type is supported
407
- if content_type in CONTENT_PARSERS and content:
408
- return content_type, CONTENT_PARSERS[content_type](content, parser)
409
- else:
410
- logger.warning("Unsupported content type")
622
+ if error:
411
623
  return None, None
624
+ content_type = response.headers.get("content-type", "").split(";")[0].lower()
625
+ try:
626
+ detected = chardet.detect(response.content)
627
+ encoding = detected.get("encoding") or "utf-8"
628
+ content = response.content.decode(encoding, errors='replace')
629
+ except:
630
+ content = response.content.decode(response.encoding or 'utf-8', errors='replace')
631
+
632
+ return content_type, _parse_content(content, content_type, parser)
633
+
412
634
  elif "se" in driver.lower():
635
+ from selenium import webdriver
636
+ from selenium.webdriver.chrome.service import Service
637
+ from selenium.webdriver.chrome.options import Options
638
+ from webdriver_manager.chrome import ChromeDriverManager
639
+ from selenium.common.exceptions import WebDriverException
640
+
413
641
  chrome_options = Options()
414
642
  chrome_options.add_argument("--headless")
415
643
  chrome_options.add_argument("--no-sandbox")
644
+ chrome_options.add_argument("--disable-gpu")
416
645
  chrome_options.add_argument("--disable-dev-shm-usage")
646
+ chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
417
647
  chrome_options.add_argument(f"user-agent={user_agent()}")
648
+
418
649
  if proxy:
419
650
  chrome_options.add_argument(f"--proxy-server={proxy}")
420
651
  if disable_images:
421
- prefs = {"profile.managed_default_content_settings.images": 2}
422
- chrome_options.add_experimental_option("prefs", prefs)
423
- # chrome_options.page_load_strategy = capability
424
-
425
- service = Service(ChromeDriverManager().install())
426
- # driver_path='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/127.0.6533.119/chromedriver-mac-arm64/chromedriver'
427
- # service=Service(executable_path=driver_path)
428
-
429
- driver_ = webdriver.Chrome(service=service, options=chrome_options)
430
-
431
- # 隐式等等待
432
- if 3 < wait < 5:
433
- wait_ = random.uniform(3, 5)
434
- elif 5 <= wait < 8:
435
- wait_ = random.uniform(5, 8)
436
- elif 8 <= wait < 12:
437
- wait_ = random.uniform(8, 10)
438
- else:
439
- wait_ = 0
440
- driver_.implicitly_wait(wait_)
441
-
442
- if wait_until is not None and wait_until_kind is not None:
443
- strategy = corr_by_kind(wait_until_kind)
444
- WebDriverWait(driver_, timeout).until(
445
- EC.presence_of_element_located((strategy, wait_until))
446
- )
447
- if login_url and login_dict:
448
- cookies = get_cookies(url=login_url, login=login_dict)
449
- driver_.get(url)
450
- for cookie_name, cookie_value in cookies.items():
451
- driver_.add_cookie({"name": cookie_name, "value": cookie_value})
452
-
453
- if not javascript:
454
- driver_.execute_cdp_cmd(
455
- "Emulation.setScriptExecutionDisabled", {"value": True}
456
- )
457
-
458
- if login_url:
459
- driver_.get(login_url)
460
- WebDriverWait(driver_, timeout).until(
461
- EC.presence_of_element_located((username_by, username_field))
462
- ).send_keys(username)
463
- WebDriverWait(driver_, timeout).until(
464
- EC.presence_of_element_located((password_by, password_field))
465
- ).send_keys(password)
466
- WebDriverWait(driver_, timeout).until(
467
- EC.element_to_be_clickable((submit_by, submit_field))
468
- ).click()
469
-
470
- driver_.get(url)
471
-
472
- if iframe_name:
473
- iframe = WebDriverWait(driver_, timeout).until(
474
- EC.presence_of_element_located((By.NAME, iframe_name))
652
+ chrome_options.add_experimental_option(
653
+ "prefs", {"profile.managed_default_content_settings.images": 2}
475
654
  )
476
- driver_.switch_to.frame(iframe)
477
-
478
- # WebDriverWait(driver, timeout).until(
479
- # EC.presence_of_element_located((by, where))
480
- # )
481
-
482
- # # scroll down the page by a certain number of pixels
483
- scroll_smth_steps(driver_)
484
-
485
- # 设置轮询
486
- for attempt in range(scroll_try):
487
- page_source = driver_.page_source
488
- content = BeautifulSoup(page_source, "html.parser")
489
- if content and content.find_all(by):
490
- break
491
- sleep(
492
- random.uniform(2, 4)
493
- ) # Wait for a random time before polling again
494
-
495
- driver_.quit()
496
-
497
- # content = BeautifulSoup(page_source, "html.parser")
498
- if content:
499
- return "text/html", content
500
- else:
501
- logger.warning("Selenium could not fetch content")
655
+
656
+ driver_instance = None
657
+ try:
658
+ # Try with latest ChromeDriver first
659
+ service = Service(ChromeDriverManager().install())
660
+ driver_instance = webdriver.Chrome(service=service, options=chrome_options)
661
+
662
+ # Configure wait times
663
+ if 3 < wait < 5:
664
+ wait_time = random.uniform(3, 5)
665
+ elif 5 <= wait < 8:
666
+ wait_time = random.uniform(5, 8)
667
+ elif 8 <= wait < 12:
668
+ wait_time = random.uniform(8, 10)
669
+ else:
670
+ wait_time = 0
671
+
672
+ driver_instance.implicitly_wait(wait_time)
673
+
674
+ # Handle login if needed
675
+ if login_url and login_dict:
676
+ cookies = get_cookies(url=login_url, login=login_dict)
677
+ driver_instance.get(url)
678
+ for name, value in cookies.items():
679
+ driver_instance.add_cookie({"name": name, "value": value})
680
+ elif cookies:
681
+ driver_instance.get(url)
682
+ if isinstance(cookies, str):
683
+ cookies = parse_cookies(cookies)
684
+ for name, value in cookies.items():
685
+ driver_instance.add_cookie({"name": name, "value": value})
686
+
687
+ if not javascript:
688
+ driver_instance.execute_cdp_cmd(
689
+ "Emulation.setScriptExecutionDisabled", {"value": True}
690
+ )
691
+
692
+ # Navigate to target URL
693
+ driver_instance.get(url)
694
+
695
+ # Handle iframes if needed
696
+ if iframe_name:
697
+ iframe = WebDriverWait(driver_instance, timeout).until(
698
+ EC.presence_of_element_located((By.NAME, iframe_name))
699
+ )
700
+ driver_instance.switch_to.frame(iframe)
701
+
702
+ # Scroll to trigger dynamic content
703
+ scroll_smth_steps(driver_instance)
704
+
705
+ # Get page source with retries
706
+ content = None
707
+ for attempt in range(scroll_try):
708
+ try:
709
+ page_source = driver_instance.page_source
710
+ content = BeautifulSoup(page_source, parser)
711
+ if content and content.find_all(by):
712
+ break
713
+ except Exception as e:
714
+ logger.warning(f"Attempt {attempt + 1} failed: {e}")
715
+ time.sleep(random.uniform(1, 3))
716
+ try:
717
+ _clean_temp()
718
+ except Exception as e:
719
+ print(e)
720
+ return "text/html", content if content else None
721
+
722
+ except WebDriverException as e:
723
+ logger.error(f"Selenium error: {e}")
502
724
  return None, None
503
- except requests.RequestException as e:
504
- logger.error(f"Error fetching URL '{url}': {e}")
725
+ finally:
726
+ if driver_instance:
727
+ driver_instance.quit()
728
+
729
+ elif 'scr' in driver.lower():
730
+ settings = {
731
+ "USER_AGENT": user_agent(),
732
+ "DOWNLOAD_DELAY": 1,
733
+ "COOKIES_ENABLED": bool(cookies),
734
+ "LOG_LEVEL": "WARNING",
735
+ "RETRY_TIMES": retry,
736
+ "DOWNLOAD_TIMEOUT": timeout,
737
+ }
738
+ content = fetch_scrapy(
739
+ url,
740
+ parser=parser,
741
+ cookies=cookies,
742
+ headers=headers,
743
+ settings=settings
744
+ )
745
+ return parser, content
746
+
747
+ except Exception as e:
748
+ logger.error(f"Unexpected error in fetch_all: {e}")
505
749
  return None, None
750
+
751
+ return None, None
506
752
 
507
-
508
- # # Function to change Tor IP address
509
- # def renew_tor_ip():
510
- # with Controller.from_port(port=9051) as controller:
511
- # controller.authenticate()
512
- # controller.signal(Signal.NEWNYM)
513
-
514
- # # Function to make requests through Tor
515
- # def make_tor_request(url, max_retries=3):
516
- # renew_tor_ip()
517
- # headers = {"User-Agent": user_agent()}
518
- # session = requests.Session()
519
- # session.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}
520
-
521
- # for i in range(max_retries):
522
- # try:
523
- # response = session.get(url, headers=headers, timeout=10)
524
- # if response.status_code == 200:
525
- # return response.text
526
- # except requests.exceptions.RequestException as e:
527
- # print(f"Error: {e}")
528
- # time.sleep(2) # Add a delay between retries
529
-
530
- # return None
531
-
532
-
533
- # def find_links(url,driver='request'):
534
- # links_href,cond_ex= [],["javascript:","mailto:","tel:","fax:"]
535
- # content_type, soup = fetch_all(url,driver=driver)
536
- # if soup:
537
- # base_url = urlparse(url)
538
-
539
- # # Extract links from both 'href' and 'src' attributes across relevant tags
540
- # tags_with_links = ['a', 'img', 'script', 'link', 'iframe', 'embed','span']
541
- # elements = []
542
- # for tag in tags_with_links:
543
- # elements.extend(soup.find_all(tag, href=True))
544
- # elements.extend(soup.find_all(tag, src=True))
545
-
546
- # for element in elements:
547
- # link_href = element.get('href') or element.get('src')
548
- # if link_href:
549
- # if link_href.startswith("//"):
550
- # link_href = "http:" + link_href
551
- # elif not link_href.startswith(("http", "https")):
552
- # link_href = urljoin(base_url.geturl(), link_href)
553
-
554
- # if all(exclusion not in link_href for exclusion in cond_ex):
555
- # links_href.append(link_href)
556
-
557
- # return list(set(links_href)) # Remove duplicates
558
-
559
-
560
- # elif url.split('.')[-1] in ['pdf']:
561
- # return url
562
- # else:
563
- # return None
564
753
  def find_links(url, driver="request", booster=False):
754
+ from urllib.parse import urlparse, urljoin
755
+
565
756
  links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
566
757
  content_type, soup = fetch_all(url, driver=driver)
567
758
 
@@ -582,7 +773,7 @@ def find_links(url, driver="request", booster=False):
              if all(exclusion not in link_href for exclusion in cond_ex):
                  links_href.append(link_href)
 
-     unique_links = list(set(links_href)) # Remove duplicates
+     unique_links = ips.unique(links_href) # Remove duplicates
 
      if booster:
          for link in unique_links:
@@ -590,7 +781,7 @@ def find_links(url, driver="request", booster=False):
              sub_links = find_links(link, driver=driver, booster=False)
              if sub_links:
                  links_href.extend(sub_links)
-         links_href = list(set(links_href)) # Remove duplicates again
+         links_href = ips.unique(links_href) # Remove duplicates again
 
      return links_href
 
@@ -602,6 +793,8 @@ def find_links(url, driver="request", booster=False):
 
  # To determine which links are related to target domains(e.g., pages) you are interested in
  def filter_links(links, contains="html", driver="requ", booster=False):
+     from urllib.parse import urlparse, urljoin
+ 
      filtered_links = []
      if isinstance(contains, str):
          contains = [contains]
@@ -614,10 +807,13 @@ def filter_links(links, contains="html", driver="requ", booster=False):
          )
          if condition:
              filtered_links.append(link)
-     return filtered_links
+     return ips.unique(filtered_links)
 
 
  def find_domain(links):
+     from urllib.parse import urlparse, urljoin
+     from collections import Counter
+ 
      if not links:
          return None
      domains = [urlparse(link).netloc for link in links]
@@ -672,6 +868,8 @@ def pdf_detector(url, contains=None, dir_save=None, booster=False):
      pdf_links = filter_links(links=links_all, contains=["pdf"])
 
      if pdf_links:
+         from pprint import pp
+ 
          pp(f"pdf detected{pdf_links}")
      else:
          print("no pdf file")
@@ -693,10 +891,9 @@ def pdf_detector(url, contains=None, dir_save=None, booster=False):
              idx += 1
      print(f"{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}")
 
- 
  def downloader(
      url,
-     dir_save=dir_save,
+     dir_save=None,
      kind=[".pdf"],
      contains=None,
      rm_folder=False,
@@ -705,38 +902,157 @@ def downloader(
705
902
  timeout=30,
706
903
  n_try=3,
707
904
  timestamp=False,
905
+ chunk_size=8192,
906
+ retry_delay=2,
708
907
  ):
709
- if verbose:
710
- print(
711
- "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
712
- )
908
+ """
909
+ Enhanced file downloader with robust error handling and resume capability
910
+
911
+ Args:
912
+ url: URL or list of URLs to download
913
+ dir_save: Directory to save files (None for current directory)
914
+ kind: List of file extensions to filter for (e.g., ['.pdf', '.xls'])
915
+ contains: String that must be present in the filename
916
+ rm_folder: Whether to remove the target folder before downloading
917
+ booster: Whether to search for links on the page
918
+ verbose: Whether to print progress information
919
+ timeout: Connection timeout in seconds
920
+ n_try: Number of retry attempts
921
+ timestamp: Whether to add timestamp to filenames
922
+ chunk_size: Download chunk size in bytes
923
+ retry_delay: Delay between retries in seconds
924
+ """
925
+ import os
926
+ import time
927
+ import shutil
928
+ import requests
929
+ from requests.exceptions import (ChunkedEncodingError, ConnectionError,
930
+ RequestException, Timeout)
931
+ from urllib.parse import urlparse
932
+ from datetime import datetime
933
+
934
+ if verbose and ips.run_once_within():
935
+ print("usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)")
936
+
937
+ # -------------------- wget integration helper --------------------
938
+ def _wget_available():
939
+ """Check if wget exists on system"""
940
+ return shutil.which("wget") is not None
941
+
942
+ def _wget_download(url, out_path):
943
+ import subprocess
944
+ """Download a file using system wget with progress bar"""
945
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
946
+ try:
947
+ subprocess.run(
948
+ ["wget", "-c", "--show-progress", "--progress=bar:force", "-O", out_path, url],
949
+ check=True,
950
+ )
951
+ return True
952
+ except Exception as e:
953
+ if verbose:
954
+ print(f"wget download failed: {e}")
955
+ return False
956
+ # -----------------------------------------------------------------
713
957
 
714
958
  def fname_corrector(fname, ext):
959
+ """Ensure filename has correct extension"""
715
960
  if not ext.startswith("."):
716
961
  ext = "." + ext
717
- if not fname.endswith("ext"): # if not ext in fname:
718
- fname = fname[: -len(ext)] + ext
962
+ if not fname.endswith(ext):
963
+ fname = os.path.splitext(fname)[0] + ext
964
+ if not any(fname[:-len(ext)]):
965
+ fname = datetime.now().strftime("%H%M%S") + ext
719
966
  return fname
720
967
 
721
968
  def check_and_modify_filename(directory, filename):
969
+ """Handle duplicate filenames by adding counter"""
722
970
  base, ext = os.path.splitext(filename)
723
971
  counter = 1
724
972
  new_filename = filename
725
973
  while os.path.exists(os.path.join(directory, new_filename)):
726
- if counter <= 9:
727
- counter_ = "0" + str(counter)
728
- else:
729
- counter_ = str(counter)
730
- new_filename = f"{base}_{counter_}{ext}"
974
+ new_filename = f"{base}_{counter:02d}{ext}"
731
975
  counter += 1
732
976
  return new_filename
733
977
 
734
- fpath_tmp, corrected_fname = None, None
978
+ def get_partial_file_size(filepath):
979
+ """Get size of partially downloaded file"""
980
+ try:
981
+ return os.path.getsize(filepath)
982
+ except OSError:
983
+ return 0
984
+
985
+ def download_with_resume(url, filepath, headers=None):
986
+ """Download with resume capability"""
987
+ headers = headers or {}
988
+ initial_size = get_partial_file_size(filepath)
989
+
990
+ if initial_size > 0:
991
+ headers['Range'] = f'bytes={initial_size}-'
992
+ mode = 'ab'
993
+ else:
994
+ mode = 'wb'
995
+
996
+ try:
997
+ with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
998
+ r.raise_for_status()
999
+ total_size = int(r.headers.get('content-length', 0)) + initial_size
1000
+
1001
+ with open(filepath, mode) as f, tqdm(
1002
+ total=total_size,
1003
+ unit='B',
1004
+ unit_scale=True,
1005
+ unit_divisor=1024,
1006
+ initial=initial_size,
1007
+ desc=os.path.basename(filepath),
1008
+ disable=not verbose,
1009
+ ) as progress:
1010
+ for chunk in r.iter_content(chunk_size=chunk_size):
1011
+ if chunk: # filter out keep-alive chunks
1012
+ f.write(chunk)
1013
+ progress.update(len(chunk))
1014
+ return True
1015
+ except Exception as e:
1016
+ if verbose:
1017
+ print(f"Download error: {e}")
1018
+ return False
1019
+
1020
+ dir_save = dir_save or "./"
1021
+ filename = os.path.basename(urlparse(url).path)
1022
+ save_path = os.path.join(dir_save, filename)
1023
+ os.makedirs(dir_save, exist_ok=True)
1024
+ # Handle FTP URLs
1025
+ if isinstance(url, str) and url.startswith("ftp"):
1026
+ import urllib.request
1027
+
1028
+ try:
1029
+ urllib.request.urlretrieve(url, save_path)
1030
+ if verbose:
1031
+ print(f"Downloaded FTP file to: {save_path}")
1032
+ return save_path
1033
+ except Exception as e:
1034
+ print(f"FTP download failed: {e}")
1035
+ return None
1036
+ if kind is None and _wget_available():
1037
+ if verbose:
1038
+ print(f"Using wget for download: {url}")
1039
+ success = _wget_download(url, save_path)
1040
+ if success:
1041
+ if verbose:
1042
+ print(f"Successfully downloaded via wget: {save_path}")
1043
+ return save_path
1044
+ else:
1045
+ if verbose:
1046
+ print("⚠️ wget failed, falling back to requests...")
1047
+ kind = [".*"] # dummy
1048
+ # Process directory and file links
735
1049
  if not isinstance(kind, list):
736
1050
  kind = [kind]
1051
+
737
1052
  if isinstance(url, list):
1053
+ results = []
738
1054
  for url_ in url:
739
- downloader(
1055
+ results.append(downloader(
740
1056
  url_,
741
1057
  dir_save=dir_save,
742
1058
  kind=kind,
@@ -746,120 +1062,100 @@ def downloader(
746
1062
  timeout=timeout,
747
1063
  n_try=n_try,
748
1064
  timestamp=timestamp,
749
- )
750
- # sleep(random.uniform(1, 3))
751
- for i, k in enumerate(kind):
752
- if not k.startswith("."):
753
- kind[i] = "." + kind[i]
1065
+ ))
1066
+ return results
1067
+
1068
+ # Normalize file extensions
1069
+ kind = [k if k.startswith(".") else f".{k}" for k in kind]
1070
+
1071
+ # Find and filter links
754
1072
  file_links_all = []
755
1073
  for kind_ in kind:
756
- if isinstance(contains, str):
757
- contains = [contains]
758
- if isinstance(url, str):
759
- if any(ext in url for ext in kind):
760
- file_links = [url]
761
- else:
762
- if booster:
763
- links_all = []
764
- if "http" in url:
765
- links_all = find_links(url)
766
- else:
767
- links_all = url
768
- if contains is not None:
769
- file_links = filter_links(links_all, contains=contains + kind_)
770
- else:
771
- file_links = links_all # filter_links(links_all, contains=kind_)
772
- elif isinstance(url, list):
773
- links_all = url
774
- if contains is not None:
775
- file_links = filter_links(links_all, contains=contains + kind_)
776
- else:
777
- file_links = filter_links(links_all, contains=kind_)
1074
+ if isinstance(url, str) and any(ext in url for ext in kind):
1075
+ file_links = [url]
778
1076
  else:
779
- links_all = find_links(url)
780
- if contains is not None:
781
- file_links = filter_links(links_all, contains=contains + kind_)
782
- else:
783
- file_links = filter_links(links_all, contains=kind_)
1077
+ links_all = find_links(url) if booster else ([url] if isinstance(url, str) else url)
1078
+ file_links = filter_links(
1079
+ links_all,
1080
+ contains=(contains + kind_) if contains else kind_
1081
+ )
1082
+
1083
+ file_links = ips.unique(file_links)
784
1084
  if verbose:
785
1085
  if file_links:
786
1086
  print("Files detected:")
1087
+ from pprint import pp
787
1088
  pp(file_links)
788
1089
  else:
789
- file_links = []
790
1090
  print("No files detected")
791
- if isinstance(file_links, str):
792
- file_links_all = [file_links]
793
- elif isinstance(file_links, list):
794
- file_links_all.extend(file_links)
795
- if dir_save:
796
- if rm_folder:
797
- ips.rm_folder(dir_save)
798
- # if verbose:
799
- # print(f"\n... attempting to download to local\n")
800
- fnames = [file_link.split("/")[-1] for file_link in file_links_all]
1091
+
1092
+ if file_links:
1093
+ file_links_all.extend(file_links if isinstance(file_links, list) else [file_links])
801
1094
 
802
- for idx, file_link in enumerate(file_links_all):
803
- headers = {"User-Agent": user_agent()}
804
- itry = 0 # Retry logic with exception handling
805
- while itry < n_try:
806
- try:
807
- # streaming to handle large files and reduce memory usage.
808
- response = requests.get(
809
- file_link, headers=headers, timeout=timeout, stream=True
810
- )
811
- if response.status_code == 200:
812
- ext = next(
813
- (ftype for ftype in kind if ftype in file_link), None
814
- )
815
- if ext is None:
816
- ext = kind_
817
- print("ehereerere", ext)
818
- if ext:
819
- corrected_fname = fname_corrector(fnames[idx], ext)
820
- corrected_fname = check_and_modify_filename(
821
- dir_save, corrected_fname
822
- )
823
- if timestamp:
824
- corrected_fname = (
825
- datetime.now().strftime("%y%m%d_%H%M%S_")
826
- + corrected_fname
827
- )
828
- fpath_tmp = os.path.join(dir_save, corrected_fname)
829
- with open(fpath_tmp, "wb") as file:
830
- for chunk in response.iter_content(chunk_size=8192):
831
- if chunk: # Filter out keep-alive chunks
832
- file.write(chunk)
833
- if verbose:
834
- print(f"Done! {fnames[idx]}")
835
- else:
836
- if verbose:
837
- print(f"Unknown file type for {file_link}")
838
- break # Exit the retry loop if successful
839
- else:
1095
+ file_links_all = ips.unique(file_links_all)
1096
+ if not file_links_all:
1097
+ return None
1098
+
1099
+ # Prepare download directory
1100
+ dir_save = dir_save or "./"
1101
+ if rm_folder:
1102
+ ips.rm_folder(dir_save)
1103
+ os.makedirs(dir_save, exist_ok=True)
1104
+
1105
+ # Download files
1106
+ results = []
1107
+ for file_link in file_links_all:
1108
+ headers = {
1109
+ "User-Agent": user_agent(),
1110
+ "Accept-Encoding": "identity" # Disable compression for resume support
1111
+ }
1112
+
1113
+ # Determine filename
1114
+ filename = os.path.basename(urlparse(file_link).path)
1115
+ ext = next((ftype for ftype in kind if ftype in filename), kind[0])
1116
+ corrected_fname = fname_corrector(filename, ext)
1117
+ corrected_fname = check_and_modify_filename(dir_save, corrected_fname)
1118
+
1119
+ if timestamp:
1120
+ corrected_fname = datetime.now().strftime("%y%m%d_%H%M%S_") + corrected_fname
1121
+
1122
+ save_path = os.path.join(dir_save, corrected_fname)
1123
+
1124
+ # Download with retry logic
1125
+ success = False
1126
+ for attempt in range(n_try):
1127
+ try:
1128
+ if verbose:
1129
+ print(f"Downloading {file_link} (attempt {attempt + 1}/{n_try})")
1130
+ if _wget_available():
1131
+ success = _wget_download(file_link, save_path)
1132
+ if success:
840
1133
  if verbose:
841
- print(
842
- f"Failed to download file: HTTP status code {response.status_code}"
843
- )
1134
+ print(f"Successfully downloaded via wget: {save_path}")
844
1135
  break
845
- except (ChunkedEncodingError, ConnectionError) as e:
846
- print(f"Attempt {itry+1} failed: {e}. Retrying in a few seconds...")
847
- # time.sleep(random.uniform(0, 2)) # Random sleep to mitigate server issues
848
- if fpath_tmp and os.path.exists(fpath_tmp):
849
- os.remove(fpath_tmp)
850
- itry += 1
851
-
852
- if itry == n_try:
853
- print(f"Failed to download {file_link} after {n_try} attempts.")
854
-
855
- # print(f"\n{len(fnames)} files were downloaded:")
856
- if verbose:
857
- if corrected_fname:
858
- pp(corrected_fname)
859
- print(f"\n\nsaved @:\n{dir_save}")
860
- else:
861
- pp(fnames)
1136
+ if download_with_resume(file_link, save_path, headers):
1137
+ success = True
1138
+ if verbose:
1139
+ print(f"Successfully downloaded: {save_path}")
1140
+ break
1141
+
1142
+ except (ChunkedEncodingError, ConnectionError, Timeout, RequestException) as e:
1143
+ if verbose:
1144
+ print(f"Attempt {attempt + 1} failed: {e}")
1145
+ if attempt < n_try - 1:
1146
+ time.sleep(retry_delay)
1147
+
1148
+ if success:
1149
+ results.append(save_path)
1150
+ else:
1151
+ if verbose:
1152
+ print(f"Failed to download {file_link} after {n_try} attempts")
1153
+ # Clean up potentially corrupted file
1154
+ if os.path.exists(save_path):
1155
+ os.remove(save_path)
1156
+ results.append(None)
862
1157
 
1158
+ return results if len(results) != 1 else results[0]
863
1159
 
864
1160
  def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
865
1161
  """
@@ -872,9 +1168,14 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
      Returns:
          str: HTML content with updated image URLs pointing to local files.
      """
+     from urllib.parse import urlparse, urljoin
+     import base64
+ 
      if rm_folder:
          ips.rm_folder(dir_save)
      content_type, content = fetch_all(url, driver=driver)
+     if content_type is None:
+         content_type=""
      if "html" in content_type.lower():
          # Create the directory if it doesn't exist
          os.makedirs(dir_save, exist_ok=True)
@@ -937,6 +1238,9 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
 
 
  def svg_to_png(svg_file):
+     import io
+     from PIL import Image
+ 
      with WandImage(filename=svg_file, resolution=300) as img:
          img.format = "png"
          png_image = img.make_blob()
@@ -1002,10 +1306,22 @@ def fetch_selenium(
      iframe_name=None, # Add option to handle iframe
      **kwargs,
  ):
+     import random
+     from selenium import webdriver
+     from selenium.webdriver.chrome.service import Service
+     from selenium.webdriver.common.by import By
+     from selenium.webdriver.chrome.options import Options
+     from selenium.webdriver.support.ui import WebDriverWait
+     from selenium.webdriver.support import expected_conditions as EC
+     from webdriver_manager.chrome import ChromeDriverManager
+     from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+ 
      chrome_options = Options()
      chrome_options.add_argument("--headless")
      chrome_options.add_argument("--no-sandbox")
+     chrome_options.add_argument("--disable-gpu")
      chrome_options.add_argument("--disable-dev-shm-usage")
+     chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
      chrome_options.add_argument(f"user-agent={user_agent()}")
      if proxy:
          chrome_options.add_argument(f"--proxy-server={proxy}")
@@ -1061,7 +1377,7 @@ def fetch_selenium(
          if attempt == retry - 1:
              logger.error("Failed to fetch the content after all retries")
              return []
-         sleep(random.uniform(1, 3))
+         time.sleep(random.uniform(1, 3))
      # Return empty list if nothing found after all retries
      return []
 
@@ -1078,6 +1394,9 @@ def fetch(
      output="text",
      **kws,
  ):
+     import random
+     from urllib.parse import urlparse, urljoin
+ 
      if "xt" in output.lower():
          for attempt in range(retry):
              if verbose and attempt == 0:
@@ -1103,12 +1422,12 @@ def fetch(
              else:
                  if texts:
                      break
-             sleep(random.uniform(0.5, 1.5))
+             time.sleep(random.uniform(0.5, 1.5))
          if isinstance(texts, pd.core.frame.DataFrame):
              condition_ = [texts.empty, booster]
          else:
              condition_ = [not texts, booster]
-         if any(condition_):
+         if any(condition_):
              print("trying to use 'fetcher2'...")
              texts = fetch_selenium(
                  url=url, where=where, what=what, extend=extend, **kws
@@ -1116,6 +1435,7 @@ def fetch(
          if texts:
              return texts
          else:
+             print("got nothing")
              return fetch(
                  url,
                  where=where,
@@ -1429,6 +1749,8 @@ def isa(fpath, kind="img"):
 
 
  def is_image(fpath):
+     import mimetypes
+ 
      mime_type, _ = mimetypes.guess_type(fpath)
      if mime_type and mime_type.startswith("image"):
          return True
@@ -1437,6 +1759,8 @@ def is_image(fpath):
 
 
  def is_document(fpath):
+     import mimetypes
+ 
      mime_type, _ = mimetypes.guess_type(fpath)
      if mime_type and (
          mime_type.startswith("text/")
@@ -1457,6 +1781,8 @@ def is_document(fpath):
 
 
  def is_zip(fpath):
+     import mimetypes
+ 
      mime_type, _ = mimetypes.guess_type(fpath)
      if mime_type == "application/zip":
          return True
@@ -1476,6 +1802,8 @@ def search(
  ):
 
      if "te" in kind.lower():
+         from duckduckgo_search import DDGS
+ 
          results = DDGS().text(query, max_results=limit)
          res = pd.DataFrame(results)
          res.rename(columns={"href": "links"}, inplace=True)
@@ -1493,6 +1821,8 @@
 
 
  def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
+     from duckduckgo_search import DDGS
+ 
      def is_in_any(str_candi_short, str_full, ignore_case=True):
          if isinstance(str_candi_short, str):
              str_candi_short = [str_candi_short]
@@ -1521,8 +1851,12 @@ def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
      model_valid = valid_mod_name(model)
      res = DDGS().chat(query, model=model_valid)
      if verbose:
+         from pprint import pp
+ 
          pp(res)
      if log:
+         from datetime import datetime
+ 
          dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
          res_ = f"###{dt_str}\n\n>{res}\n"
          os.makedirs(dir_save, exist_ok=True)
@@ -1542,3 +1876,191 @@ def ai(*args, **kwargs):
      if len(args) == 1 and isinstance(args[0], str):
          kwargs["query"] = args[0]
      return echo(**kwargs)
+ 
+ 
+ #! get_ip()
+ def get_ip(ip=None):
+     """
+     Usage:
+         from py2ls import netfinder as nt
+         ip = nt.get_ip()
+     """
+ 
+     import requests
+     import time
+     import logging
+     from datetime import datetime, timedelta
+ 
+     # Set up logging configuration
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s - %(levelname)s - %(message)s",
+         handlers=[
+             logging.StreamHandler(),
+             logging.FileHandler("public_ip_log.log"),  # Log to a file
+         ],
+     )
+ 
+     cache = {}
+ 
+     # Function to fetch IP addresses synchronously
+     def fetch_ip(url, retries, timeout, headers):
+         """
+         Synchronous function to fetch the IP address with retries.
+         """
+         for attempt in range(retries):
+             try:
+                 response = requests.get(url, timeout=timeout, headers=headers)
+                 response.raise_for_status()
+                 return response.json()
+             except requests.RequestException as e:
+                 logging.error(f"Attempt {attempt + 1} failed: {e}")
+                 if attempt < retries - 1:
+                     time.sleep(2**attempt)  # Exponential backoff
+                 else:
+                     logging.error("Max retries reached.")
+                     return {"error": f"Error fetching IP: {e}"}
+             except requests.Timeout:
+                 logging.error("Request timed out")
+                 time.sleep(2**attempt)
+         return {"error": "Failed to fetch IP after retries"}
+ 
+     # Function to fetch geolocation synchronously
+     def fetch_geolocation(url, retries, timeout, headers):
+         """
+         Synchronous function to fetch geolocation data by IP address.
+         """
+         for attempt in range(retries):
+             try:
+                 response = requests.get(url, timeout=timeout, headers=headers)
+                 response.raise_for_status()
+                 return response.json()
+             except requests.RequestException as e:
+                 logging.error(f"Geolocation request attempt {attempt + 1} failed: {e}")
+                 if attempt < retries - 1:
+                     time.sleep(2**attempt)  # Exponential backoff
+                 else:
+                     logging.error("Max retries reached.")
+                     return {"error": f"Error fetching geolocation: {e}"}
+             except requests.Timeout:
+                 logging.error("Geolocation request timed out")
+                 time.sleep(2**attempt)
+         return {"error": "Failed to fetch geolocation after retries"}
+ 
+     # Main function to get public IP and geolocation
+     def get_public_ip(
+         ip4=True,
+         ip6=True,
+         verbose=True,
+         retries=3,
+         timeout=5,
+         geolocation=True,
+         headers=None,
+         cache_duration=5,
+     ):
+         """
+         Synchronously fetches public IPv4 and IPv6 addresses, along with optional geolocation info.
+         """
+         # Use the cache if it's still valid
+         cache_key_ip4 = "public_ip4"
+         cache_key_ip6 = "public_ip6"
+         cache_key_geolocation = "geolocation"
+ 
+         if (
+             cache
+             and cache_key_ip4 in cache
+             and datetime.now() < cache[cache_key_ip4]["expires"]
+         ):
+             logging.info("Cache hit for IPv4, using cached data.")
+             ip4_data = cache[cache_key_ip4]["data"]
+         else:
+             ip4_data = None
+ 
+         if (
+             cache
+             and cache_key_ip6 in cache
+             and datetime.now() < cache[cache_key_ip6]["expires"]
+         ):
+             logging.info("Cache hit for IPv6, using cached data.")
+             ip6_data = cache[cache_key_ip6]["data"]
+         else:
+             ip6_data = None
+ 
+         if (
+             cache
+             and cache_key_geolocation in cache
+             and datetime.now() < cache[cache_key_geolocation]["expires"]
+         ):
+             logging.info("Cache hit for Geolocation, using cached data.")
+             geolocation_data = cache[cache_key_geolocation]["data"]
+         else:
+             geolocation_data = None
+ 
+         # Fetch IPv4 if requested
+         if ip4 and not ip4_data:
+             logging.info("Fetching IPv4...")
+             ip4_data = fetch_ip(
+                 "https://api.ipify.org?format=json", retries, timeout, headers
+             )
+             cache[cache_key_ip4] = {
+                 "data": ip4_data,
+                 "expires": datetime.now() + timedelta(minutes=cache_duration),
+             }
+ 
+         # Fetch IPv6 if requested
+         if ip6 and not ip6_data:
+             logging.info("Fetching IPv6...")
+             ip6_data = fetch_ip(
+                 "https://api6.ipify.org?format=json", retries, timeout, headers
+             )
+             cache[cache_key_ip6] = {
+                 "data": ip6_data,
+                 "expires": datetime.now() + timedelta(minutes=cache_duration),
+             }
+ 
+         # Fetch geolocation if requested
+         if geolocation and not geolocation_data:
+             logging.info("Fetching Geolocation...")
+             geolocation_data = fetch_geolocation(
+                 "https://ipinfo.io/json", retries, timeout, headers
+             )
+             cache[cache_key_geolocation] = {
+                 "data": geolocation_data,
+                 "expires": datetime.now() + timedelta(minutes=cache_duration),
+             }
+ 
+         # Prepare the results
+         ip_info = {
+             "ip4": ip4_data.get("ip") if ip4_data else "N/A",
+             "ip6": ip6_data.get("ip") if ip6_data else "N/A",
+             "geolocation": geolocation_data if geolocation_data else "N/A",
+         }
+ 
+         # Verbose output if requested
+         if verbose:
+             print(f"Public IPv4: {ip_info['ip4']}")
+             print(f"Public IPv6: {ip_info['ip6']}")
+             print(f"Geolocation: {ip_info['geolocation']}")
+ 
+         return ip_info
+ 
+     # Function to get geolocation data by IP
+     def get_geolocation_by_ip(ip, retries=3, timeout=5, headers=None):
+         """
+         Fetches geolocation data for a given IP address.
+         """
+         url = f"https://ipinfo.io/{ip}/json"
+         geolocation_data = fetch_geolocation(url, retries, timeout, headers)
+         return geolocation_data
+     #! here starting get_ip()
+     headers = {"User-Agent": user_agent()}
+     if ip is None:
+         try:
+             ip_data = get_public_ip(headers=headers, verbose=True)
+         except Exception as e:
+             print(e)
+             ip_data = None
+         return ip_data
+     else:
+         geolocation_data = get_geolocation_by_ip(ip, headers=headers)
+         return geolocation_data