py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of py2ls might be problematic. Click here for more details.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/ImageLoader.py +621 -0
- py2ls/__init__.py +7 -5
- py2ls/apptainer2ls.py +3940 -0
- py2ls/batman.py +164 -42
- py2ls/bio.py +2595 -0
- py2ls/cell_image_clf.py +1632 -0
- py2ls/container2ls.py +4635 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/email/email_html_template.html +88 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
- py2ls/data/mygenes_fields_241022.txt +355 -0
- py2ls/data/re_common_pattern.json +173 -0
- py2ls/data/sns_info.json +74 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/styles/stylelib/.DS_Store +0 -0
- py2ls/data/styles/stylelib/grid.mplstyle +15 -0
- py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
- py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
- py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
- py2ls/data/styles/stylelib/light.mplstyl +6 -0
- py2ls/data/styles/stylelib/muted.mplstyle +6 -0
- py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature.mplstyle +31 -0
- py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
- py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
- py2ls/data/styles/stylelib/paper.mplstyle +290 -0
- py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
- py2ls/data/styles/stylelib/retro.mplstyle +4 -0
- py2ls/data/styles/stylelib/sans.mplstyle +10 -0
- py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
- py2ls/data/styles/stylelib/science.mplstyle +48 -0
- py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
- py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
- py2ls/data/tiles.csv +146 -0
- py2ls/data/usages_pd.json +1417 -0
- py2ls/data/usages_sns.json +31 -0
- py2ls/docker2ls.py +5446 -0
- py2ls/ec2ls.py +61 -0
- py2ls/fetch_update.py +145 -0
- py2ls/ich2ls.py +1955 -296
- py2ls/im2.py +8242 -0
- py2ls/image_ml2ls.py +2100 -0
- py2ls/ips.py +33909 -3418
- py2ls/ml2ls.py +7700 -0
- py2ls/mol.py +289 -0
- py2ls/mount2ls.py +1307 -0
- py2ls/netfinder.py +873 -351
- py2ls/nl2ls.py +283 -0
- py2ls/ocr.py +1581 -458
- py2ls/plot.py +10394 -314
- py2ls/rna2ls.py +311 -0
- py2ls/ssh2ls.md +456 -0
- py2ls/ssh2ls.py +5933 -0
- py2ls/ssh2ls_v01.py +2204 -0
- py2ls/stats.py +66 -172
- py2ls/temp20251124.py +509 -0
- py2ls/translator.py +2 -0
- py2ls/utils/decorators.py +3564 -0
- py2ls/utils_bio.py +3453 -0
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/netfinder.py
CHANGED
|
@@ -1,37 +1,19 @@
|
|
|
1
|
-
from bs4 import BeautifulSoup
|
|
1
|
+
from bs4 import BeautifulSoup, NavigableString
|
|
2
2
|
import requests
|
|
3
|
-
from requests.utils import dict_from_cookiejar
|
|
4
|
-
from requests.exceptions import ChunkedEncodingError, ConnectionError
|
|
5
3
|
import os
|
|
6
|
-
from
|
|
7
|
-
import
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import chardet
|
|
8
6
|
import pandas as pd
|
|
9
|
-
from collections import Counter
|
|
10
|
-
import random
|
|
11
7
|
import logging
|
|
12
|
-
from time import sleep
|
|
13
|
-
import stem.process
|
|
14
|
-
from stem import Signal
|
|
15
|
-
from stem.control import Controller
|
|
16
8
|
import json
|
|
17
|
-
from fake_useragent import UserAgent
|
|
18
|
-
from selenium import webdriver
|
|
19
|
-
from selenium.webdriver.chrome.service import Service
|
|
20
|
-
from selenium.webdriver.common.by import By
|
|
21
|
-
from selenium.webdriver.chrome.options import Options
|
|
22
|
-
from selenium.webdriver.support.ui import WebDriverWait
|
|
23
|
-
from selenium.webdriver.support import expected_conditions as EC
|
|
24
|
-
from webdriver_manager.chrome import ChromeDriverManager
|
|
25
|
-
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
26
|
-
from pprint import pp
|
|
27
|
-
import mimetypes
|
|
28
|
-
import io
|
|
29
|
-
import matplotlib.pyplot as plt
|
|
30
|
-
from PIL import Image
|
|
31
|
-
from duckduckgo_search import DDGS
|
|
32
|
-
from datetime import datetime
|
|
33
9
|
import time
|
|
34
|
-
from
|
|
10
|
+
from selenium.webdriver.common.by import By
|
|
11
|
+
from . import ips
|
|
12
|
+
import random
|
|
13
|
+
try:
|
|
14
|
+
import scrapy
|
|
15
|
+
except ImportError:
|
|
16
|
+
scrapy = None
|
|
35
17
|
|
|
36
18
|
dir_save = "/Users/macjianfeng/Dropbox/Downloads/"
|
|
37
19
|
# Set up logging
|
|
@@ -48,20 +30,66 @@ CONTENT_PARSERS = {
|
|
|
48
30
|
"text/xml": lambda text, parser: BeautifulSoup(text, parser),
|
|
49
31
|
"text/plain": lambda text, parser: text.text,
|
|
50
32
|
}
|
|
51
|
-
|
|
52
|
-
|
|
33
|
+
|
|
34
|
+
# Fallback pool of common User-Agent strings
|
|
35
|
+
fallback_user_agents = [
|
|
36
|
+
# Chrome (Windows)
|
|
37
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
|
|
38
|
+
# Firefox (Mac)
|
|
39
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:106.0) Gecko/20100101 Firefox/106.0",
|
|
40
|
+
# Edge (Windows)
|
|
41
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203",
|
|
42
|
+
# Safari (Mac)
|
|
43
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
|
|
44
|
+
# Linux Chrome
|
|
45
|
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.90 Safari/537.36",
|
|
46
|
+
# Android Tablet (Samsung)
|
|
47
|
+
"Mozilla/5.0 (Linux; Android 9; SAMSUNG SM-T860) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/10.1 Chrome/71.0.3578.99 Safari/537.36",
|
|
48
|
+
# iPhone Safari
|
|
49
|
+
"Mozilla/5.0 (iPhone; CPU iPhone OS 16_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Mobile/15E148 Safari/604.1",
|
|
50
|
+
# Android Mobile Chrome
|
|
51
|
+
"Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.154 Mobile Safari/537.36",
|
|
52
|
+
# iPad Safari
|
|
53
|
+
"Mozilla/5.0 (iPad; CPU OS 15_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Mobile/15E148 Safari/604.1",
|
|
54
|
+
# Opera (Windows)
|
|
55
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 OPR/86.0.4363.32",
|
|
56
|
+
# Brave (Mac)
|
|
57
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
|
|
58
|
+
# Vivaldi (Windows)
|
|
59
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Vivaldi/5.1.2567.49",
|
|
60
|
+
# Android Chrome OnePlus
|
|
61
|
+
"Mozilla/5.0 (Linux; Android 10; ONEPLUS A6010) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.74 Mobile Safari/537.36",
|
|
62
|
+
# Samsung Galaxy S22 Chrome
|
|
63
|
+
"Mozilla/5.0 (Linux; Android 12; SAMSUNG SM-S901B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
|
|
64
|
+
# Xiaomi MIUI Browser
|
|
65
|
+
"Mozilla/5.0 (Linux; Android 11; M2012K11AG) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.125 Mobile Safari/537.36",
|
|
66
|
+
# Desktop Safari on macOS Ventura
|
|
67
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
|
|
68
|
+
]
|
|
53
69
|
def user_agent(
|
|
54
70
|
browsers=["chrome", "edge", "firefox", "safari"],
|
|
55
71
|
platforms=["pc", "tablet"],
|
|
56
72
|
verbose=False,
|
|
57
|
-
|
|
73
|
+
os_names=["windows", "macos", "linux"],
|
|
58
74
|
):
|
|
59
|
-
|
|
60
|
-
|
|
75
|
+
import warnings
|
|
76
|
+
import traceback
|
|
77
|
+
|
|
78
|
+
try:
|
|
79
|
+
from fake_useragent import UserAgent
|
|
80
|
+
|
|
81
|
+
ua = UserAgent(browsers=browsers, platforms=platforms, os=os_names)
|
|
82
|
+
output_ua = ua.random
|
|
83
|
+
except Exception as e:
|
|
84
|
+
warnings.warn("fake_useragent failed, using fallback list instead.\n" + str(e))
|
|
85
|
+
if verbose:
|
|
86
|
+
traceback.print_exc()
|
|
87
|
+
output_ua = random.choice(fallback_user_agents)
|
|
88
|
+
|
|
61
89
|
if verbose:
|
|
62
|
-
print(output_ua)
|
|
63
|
-
return output_ua
|
|
90
|
+
print("Selected User-Agent:", output_ua)
|
|
64
91
|
|
|
92
|
+
return output_ua
|
|
65
93
|
|
|
66
94
|
def get_tags(content, ascending=True):
|
|
67
95
|
tag_names = set()
|
|
@@ -109,6 +137,8 @@ def get_attr(content, where=None, attr=None, **kwargs):
|
|
|
109
137
|
else:
|
|
110
138
|
print(f"The attribute '{attr}' is not found in the elements.")
|
|
111
139
|
else:
|
|
140
|
+
from pprint import pp
|
|
141
|
+
|
|
112
142
|
print(f"Cannot find tag '{where}' in the content.")
|
|
113
143
|
print("Available tags:")
|
|
114
144
|
pp(all_tags)
|
|
@@ -136,8 +166,8 @@ def extract_text_from_content(
|
|
|
136
166
|
|
|
137
167
|
def extract_text(element):
|
|
138
168
|
texts = ""
|
|
139
|
-
if isinstance(element,
|
|
140
|
-
texts += element.strip()
|
|
169
|
+
if isinstance(element, NavigableString) and element.strip():
|
|
170
|
+
texts += element.strip() + " "
|
|
141
171
|
elif hasattr(element, "children"):
|
|
142
172
|
for child in element.children:
|
|
143
173
|
texts += extract_text(child)
|
|
@@ -192,6 +222,8 @@ def extract_text_from_content(
|
|
|
192
222
|
texts = ""
|
|
193
223
|
for tag in result_set:
|
|
194
224
|
texts = texts + " " + extract_text(tag) + " \n"
|
|
225
|
+
# texts = texts + " " + tag.get_text(" ", strip=True)+ " \n"
|
|
226
|
+
|
|
195
227
|
text_list = [tx.strip() for tx in texts.split(" \n") if tx.strip()]
|
|
196
228
|
return text_list
|
|
197
229
|
else:
|
|
@@ -237,6 +269,8 @@ def flatten_json(y):
|
|
|
237
269
|
|
|
238
270
|
|
|
239
271
|
def get_proxy():
|
|
272
|
+
import random
|
|
273
|
+
|
|
240
274
|
list_ = []
|
|
241
275
|
headers = {"User-Agent": user_agent()}
|
|
242
276
|
response = requests.get(
|
|
@@ -275,6 +309,8 @@ def get_cookies(url, login={"username": "your_username", "password": "your_passw
|
|
|
275
309
|
|
|
276
310
|
### 更加平滑地移动鼠标, 这样更容易反爬
|
|
277
311
|
def scroll_smth_steps(driver, scroll_pause=0.5, min_step=200, max_step=600):
|
|
312
|
+
import random
|
|
313
|
+
|
|
278
314
|
"""Smoothly scrolls down the page to trigger lazy loading."""
|
|
279
315
|
current_scroll_position = 0
|
|
280
316
|
end_of_page = driver.execute_script("return document.body.scrollHeight")
|
|
@@ -327,13 +363,164 @@ def corr_by_kind(wait_until_kind):
|
|
|
327
363
|
raise ValueError(f"Unsupported wait_until_kind: {wait_until_kind}")
|
|
328
364
|
|
|
329
365
|
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def parse_cookies(cookies_str):
|
|
369
|
+
"""
|
|
370
|
+
直接复制于browser,它可以负责转换成最终的dict
|
|
371
|
+
"""
|
|
372
|
+
import re
|
|
373
|
+
cookies_dict = {}
|
|
374
|
+
|
|
375
|
+
# Split the string by newlines to get each cookie row
|
|
376
|
+
cookies_list = cookies_str.strip().split("\n")
|
|
377
|
+
|
|
378
|
+
for cookie in cookies_list:
|
|
379
|
+
# Use regular expression to capture name and value pairs
|
|
380
|
+
match = re.match(r"([a-zA-Z0-9_\-\.]+)\s+([^\s]+)", cookie)
|
|
381
|
+
if match:
|
|
382
|
+
cookie_name = match.group(1)
|
|
383
|
+
cookie_value = match.group(2)
|
|
384
|
+
cookies_dict[cookie_name] = cookie_value
|
|
385
|
+
|
|
386
|
+
return cookies_dict
|
|
387
|
+
def fetch_scrapy(
|
|
388
|
+
url,
|
|
389
|
+
parser="html.parser",
|
|
390
|
+
cookies=None,
|
|
391
|
+
headers=None,
|
|
392
|
+
settings=None,
|
|
393
|
+
):
|
|
394
|
+
"""
|
|
395
|
+
Fetches content using Scrapy with proper reactor handling.
|
|
396
|
+
|
|
397
|
+
Args:
|
|
398
|
+
url (str): The URL to scrape.
|
|
399
|
+
parser (str): Parser for BeautifulSoup (e.g., "lxml", "html.parser").
|
|
400
|
+
cookies (dict): Cookies to pass in the request.
|
|
401
|
+
headers (dict): HTTP headers for the request.
|
|
402
|
+
settings (dict): Scrapy settings, if any.
|
|
403
|
+
|
|
404
|
+
Returns:
|
|
405
|
+
dict: Parsed content as a dictionary.
|
|
406
|
+
"""
|
|
407
|
+
from scrapy.utils.project import get_project_settings
|
|
408
|
+
from scrapy.crawler import CrawlerRunner
|
|
409
|
+
from scrapy.signalmanager import dispatcher
|
|
410
|
+
from scrapy import signals
|
|
411
|
+
from twisted.internet import reactor, defer
|
|
412
|
+
from twisted.internet.error import ReactorNotRestartable
|
|
413
|
+
import scrapy
|
|
414
|
+
import logging
|
|
415
|
+
|
|
416
|
+
# Disable Scrapy's excessive logging
|
|
417
|
+
logging.getLogger('scrapy').setLevel(logging.WARNING)
|
|
418
|
+
logging.getLogger('twisted').setLevel(logging.WARNING)
|
|
419
|
+
|
|
420
|
+
# Container for scraped content
|
|
421
|
+
content = []
|
|
422
|
+
|
|
423
|
+
# Define the spider class inside the function
|
|
424
|
+
class FetchSpider(scrapy.Spider):
|
|
425
|
+
name = "fetch_spider"
|
|
426
|
+
|
|
427
|
+
def __init__(self, url=None, parser=None, cookies=None, headers=None, *args, **kwargs):
|
|
428
|
+
super(FetchSpider, self).__init__(*args, **kwargs)
|
|
429
|
+
self.start_urls = [url]
|
|
430
|
+
self.parser = parser
|
|
431
|
+
self.cookies = cookies
|
|
432
|
+
self.headers = headers
|
|
433
|
+
|
|
434
|
+
def start_requests(self):
|
|
435
|
+
for url in self.start_urls:
|
|
436
|
+
yield scrapy.Request(
|
|
437
|
+
url,
|
|
438
|
+
cookies=self.cookies,
|
|
439
|
+
headers=self.headers,
|
|
440
|
+
callback=self.parse
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
def parse(self, response):
|
|
444
|
+
from bs4 import BeautifulSoup
|
|
445
|
+
soup = BeautifulSoup(response.text, self.parser)
|
|
446
|
+
yield {
|
|
447
|
+
"content": soup,
|
|
448
|
+
"url": response.url,
|
|
449
|
+
"status": response.status
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
# Callback function for item scraped signal
|
|
453
|
+
def handle_item(item, response, spider):
|
|
454
|
+
content.append(item)
|
|
455
|
+
|
|
456
|
+
# Scrapy settings
|
|
457
|
+
process_settings = settings or get_project_settings()
|
|
458
|
+
process_settings.update(
|
|
459
|
+
{
|
|
460
|
+
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
|
461
|
+
"DOWNLOAD_DELAY": 1,
|
|
462
|
+
"COOKIES_ENABLED": bool(cookies),
|
|
463
|
+
"LOG_LEVEL": "ERROR",
|
|
464
|
+
"RETRY_ENABLED": False,
|
|
465
|
+
"HTTPERROR_ALLOW_ALL": True,
|
|
466
|
+
}
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
# Connect item scraped signal
|
|
470
|
+
dispatcher.connect(handle_item, signal=signals.item_scraped)
|
|
471
|
+
|
|
472
|
+
# Asynchronous Twisted function
|
|
473
|
+
@defer.inlineCallbacks
|
|
474
|
+
def crawl():
|
|
475
|
+
runner = CrawlerRunner(settings=process_settings)
|
|
476
|
+
yield runner.crawl(
|
|
477
|
+
FetchSpider,
|
|
478
|
+
url=url,
|
|
479
|
+
parser=parser,
|
|
480
|
+
cookies=cookies,
|
|
481
|
+
headers=headers,
|
|
482
|
+
)
|
|
483
|
+
reactor.stop()
|
|
484
|
+
|
|
485
|
+
# Handle reactor execution
|
|
486
|
+
try:
|
|
487
|
+
if not reactor.running:
|
|
488
|
+
crawl()
|
|
489
|
+
reactor.run(installSignalHandlers=0)
|
|
490
|
+
else:
|
|
491
|
+
# This case is problematic - reactor can't be restarted
|
|
492
|
+
raise RuntimeError("Reactor already running. Cannot run multiple crawls in same process.")
|
|
493
|
+
except ReactorNotRestartable:
|
|
494
|
+
raise RuntimeError("Scrapy reactor cannot be restarted. Create a new process for additional crawls.")
|
|
495
|
+
|
|
496
|
+
# Return the first scraped content or None if empty
|
|
497
|
+
return content[0] if content else None
|
|
498
|
+
|
|
499
|
+
def _clean_temp():
|
|
500
|
+
import os
|
|
501
|
+
import shutil
|
|
502
|
+
import tempfile
|
|
503
|
+
from pathlib import Path
|
|
504
|
+
|
|
505
|
+
# Get the parent folder of the tempdir
|
|
506
|
+
temp_dir = Path(tempfile.gettempdir()).parent # moves from /T to parent dir
|
|
507
|
+
|
|
508
|
+
for subdir in temp_dir.iterdir():
|
|
509
|
+
if subdir.is_dir():
|
|
510
|
+
for d in subdir.iterdir():
|
|
511
|
+
if "com.google.Chrome.code_sign_clone" in d.name:
|
|
512
|
+
try:
|
|
513
|
+
print(f"Removing: {d}")
|
|
514
|
+
shutil.rmtree(d)
|
|
515
|
+
except Exception as e:
|
|
516
|
+
print(f"Error removing {d}: {e}")
|
|
330
517
|
def fetch_all(
|
|
331
518
|
url,
|
|
332
519
|
parser="lxml",
|
|
333
520
|
driver="request", # request or selenium
|
|
334
521
|
by=By.TAG_NAME,
|
|
335
522
|
timeout=10,
|
|
336
|
-
retry=
|
|
523
|
+
retry=3, # Increased default retries
|
|
337
524
|
wait=0,
|
|
338
525
|
wait_until=None,
|
|
339
526
|
wait_until_kind=None,
|
|
@@ -347,221 +534,225 @@ def fetch_all(
|
|
|
347
534
|
username_by=By.NAME,
|
|
348
535
|
password_by=By.NAME,
|
|
349
536
|
submit_by=By.NAME,
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
disable_images=False, # Add option to disable images
|
|
537
|
+
proxy=None,
|
|
538
|
+
javascript=True,
|
|
539
|
+
disable_images=False,
|
|
354
540
|
iframe_name=None,
|
|
355
541
|
login_dict=None,
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
542
|
+
cookies=None,
|
|
543
|
+
verify_ssl=True, # Added SSL verification option
|
|
544
|
+
follow_redirects=True, # Added redirect control
|
|
545
|
+
):
|
|
546
|
+
"""
|
|
547
|
+
Enhanced fetch function with better error handling and reliability.
|
|
548
|
+
|
|
549
|
+
Returns:
|
|
550
|
+
tuple: (content_type, parsed_content) or (None, None) on failure
|
|
551
|
+
"""
|
|
552
|
+
def _parse_content(content, content_type, parser):
|
|
553
|
+
"""Helper function to parse content with fallback"""
|
|
554
|
+
try:
|
|
555
|
+
if content_type in CONTENT_PARSERS:
|
|
556
|
+
return CONTENT_PARSERS[content_type](content, parser)
|
|
557
|
+
|
|
558
|
+
# Fallback parsing attempts
|
|
559
|
+
if content_type.startswith('text/'):
|
|
560
|
+
try:
|
|
561
|
+
return BeautifulSoup(content, parser)
|
|
562
|
+
except:
|
|
563
|
+
return content
|
|
564
|
+
return content
|
|
565
|
+
except Exception as e:
|
|
566
|
+
logger.warning(f"Content parsing failed: {e}")
|
|
567
|
+
return content
|
|
371
568
|
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
569
|
+
def _make_request(url, headers, cookies, timeout, verify_ssl, follow_redirects):
|
|
570
|
+
"""Helper function for HTTP requests with retries"""
|
|
571
|
+
for attempt in range(retry):
|
|
572
|
+
try:
|
|
375
573
|
response = requests.get(
|
|
376
|
-
|
|
574
|
+
url,
|
|
377
575
|
headers=headers,
|
|
378
|
-
|
|
379
|
-
timeout=
|
|
576
|
+
cookies=cookies,
|
|
577
|
+
timeout=timeout,
|
|
380
578
|
stream=True,
|
|
579
|
+
verify=verify_ssl,
|
|
580
|
+
allow_redirects=follow_redirects
|
|
381
581
|
)
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
|
|
389
|
-
)
|
|
390
|
-
# Raise an error if retry also fails
|
|
582
|
+
|
|
583
|
+
# Handle redirects manually if needed
|
|
584
|
+
if not follow_redirects and response.is_redirect:
|
|
585
|
+
logger.info(f"Redirect detected to: {response.headers['Location']}")
|
|
586
|
+
return None, None
|
|
587
|
+
|
|
391
588
|
response.raise_for_status()
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
589
|
+
return response, None
|
|
590
|
+
|
|
591
|
+
except requests.RequestException as e:
|
|
592
|
+
logger.warning(f"Attempt {attempt + 1} failed: {e}")
|
|
593
|
+
if attempt == retry - 1:
|
|
594
|
+
return None, e
|
|
595
|
+
time.sleep(random.uniform(1, 3))
|
|
596
|
+
|
|
597
|
+
# Convert driver integer to string if needed
|
|
598
|
+
if isinstance(driver, int):
|
|
599
|
+
drivers = ["request", "selenium", "scrapy"]
|
|
600
|
+
try:
|
|
601
|
+
driver = drivers[driver]
|
|
602
|
+
except IndexError:
|
|
603
|
+
driver = "request"
|
|
604
|
+
|
|
605
|
+
headers = {"User-Agent": user_agent()}
|
|
606
|
+
|
|
607
|
+
# Prepare cookies
|
|
608
|
+
cookie_jar = None
|
|
609
|
+
if cookies:
|
|
610
|
+
from requests.cookies import RequestsCookieJar
|
|
611
|
+
cookie_jar = RequestsCookieJar()
|
|
612
|
+
if isinstance(cookies, str):
|
|
613
|
+
cookies = parse_cookies(cookies)
|
|
614
|
+
for name, value in cookies.items():
|
|
615
|
+
cookie_jar.set(name, value)
|
|
616
|
+
|
|
617
|
+
try:
|
|
618
|
+
if "req" in driver.lower():
|
|
619
|
+
response, error = _make_request(
|
|
620
|
+
url, headers, cookie_jar, timeout, verify_ssl, follow_redirects
|
|
399
621
|
)
|
|
400
|
-
if
|
|
401
|
-
content = response.content.decode(response.encoding)
|
|
402
|
-
else:
|
|
403
|
-
content = None
|
|
404
|
-
# logger.info(f"Content type: {content_type}")
|
|
405
|
-
|
|
406
|
-
# Check if content type is supported
|
|
407
|
-
if content_type in CONTENT_PARSERS and content:
|
|
408
|
-
return content_type, CONTENT_PARSERS[content_type](content, parser)
|
|
409
|
-
else:
|
|
410
|
-
logger.warning("Unsupported content type")
|
|
622
|
+
if error:
|
|
411
623
|
return None, None
|
|
624
|
+
content_type = response.headers.get("content-type", "").split(";")[0].lower()
|
|
625
|
+
try:
|
|
626
|
+
detected = chardet.detect(response.content)
|
|
627
|
+
encoding = detected.get("encoding") or "utf-8"
|
|
628
|
+
content = response.content.decode(encoding, errors='replace')
|
|
629
|
+
except:
|
|
630
|
+
content = response.content.decode(response.encoding or 'utf-8', errors='replace')
|
|
631
|
+
|
|
632
|
+
return content_type, _parse_content(content, content_type, parser)
|
|
633
|
+
|
|
412
634
|
elif "se" in driver.lower():
|
|
635
|
+
from selenium import webdriver
|
|
636
|
+
from selenium.webdriver.chrome.service import Service
|
|
637
|
+
from selenium.webdriver.chrome.options import Options
|
|
638
|
+
from webdriver_manager.chrome import ChromeDriverManager
|
|
639
|
+
from selenium.common.exceptions import WebDriverException
|
|
640
|
+
|
|
413
641
|
chrome_options = Options()
|
|
414
642
|
chrome_options.add_argument("--headless")
|
|
415
643
|
chrome_options.add_argument("--no-sandbox")
|
|
644
|
+
chrome_options.add_argument("--disable-gpu")
|
|
416
645
|
chrome_options.add_argument("--disable-dev-shm-usage")
|
|
646
|
+
chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
|
|
417
647
|
chrome_options.add_argument(f"user-agent={user_agent()}")
|
|
648
|
+
|
|
418
649
|
if proxy:
|
|
419
650
|
chrome_options.add_argument(f"--proxy-server={proxy}")
|
|
420
651
|
if disable_images:
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
# chrome_options.page_load_strategy = capability
|
|
424
|
-
|
|
425
|
-
service = Service(ChromeDriverManager().install())
|
|
426
|
-
# driver_path='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/127.0.6533.119/chromedriver-mac-arm64/chromedriver'
|
|
427
|
-
# service=Service(executable_path=driver_path)
|
|
428
|
-
|
|
429
|
-
driver_ = webdriver.Chrome(service=service, options=chrome_options)
|
|
430
|
-
|
|
431
|
-
# 隐式等等待
|
|
432
|
-
if 3 < wait < 5:
|
|
433
|
-
wait_ = random.uniform(3, 5)
|
|
434
|
-
elif 5 <= wait < 8:
|
|
435
|
-
wait_ = random.uniform(5, 8)
|
|
436
|
-
elif 8 <= wait < 12:
|
|
437
|
-
wait_ = random.uniform(8, 10)
|
|
438
|
-
else:
|
|
439
|
-
wait_ = 0
|
|
440
|
-
driver_.implicitly_wait(wait_)
|
|
441
|
-
|
|
442
|
-
if wait_until is not None and wait_until_kind is not None:
|
|
443
|
-
strategy = corr_by_kind(wait_until_kind)
|
|
444
|
-
WebDriverWait(driver_, timeout).until(
|
|
445
|
-
EC.presence_of_element_located((strategy, wait_until))
|
|
446
|
-
)
|
|
447
|
-
if login_url and login_dict:
|
|
448
|
-
cookies = get_cookies(url=login_url, login=login_dict)
|
|
449
|
-
driver_.get(url)
|
|
450
|
-
for cookie_name, cookie_value in cookies.items():
|
|
451
|
-
driver_.add_cookie({"name": cookie_name, "value": cookie_value})
|
|
452
|
-
|
|
453
|
-
if not javascript:
|
|
454
|
-
driver_.execute_cdp_cmd(
|
|
455
|
-
"Emulation.setScriptExecutionDisabled", {"value": True}
|
|
456
|
-
)
|
|
457
|
-
|
|
458
|
-
if login_url:
|
|
459
|
-
driver_.get(login_url)
|
|
460
|
-
WebDriverWait(driver_, timeout).until(
|
|
461
|
-
EC.presence_of_element_located((username_by, username_field))
|
|
462
|
-
).send_keys(username)
|
|
463
|
-
WebDriverWait(driver_, timeout).until(
|
|
464
|
-
EC.presence_of_element_located((password_by, password_field))
|
|
465
|
-
).send_keys(password)
|
|
466
|
-
WebDriverWait(driver_, timeout).until(
|
|
467
|
-
EC.element_to_be_clickable((submit_by, submit_field))
|
|
468
|
-
).click()
|
|
469
|
-
|
|
470
|
-
driver_.get(url)
|
|
471
|
-
|
|
472
|
-
if iframe_name:
|
|
473
|
-
iframe = WebDriverWait(driver_, timeout).until(
|
|
474
|
-
EC.presence_of_element_located((By.NAME, iframe_name))
|
|
652
|
+
chrome_options.add_experimental_option(
|
|
653
|
+
"prefs", {"profile.managed_default_content_settings.images": 2}
|
|
475
654
|
)
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
)
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
655
|
+
|
|
656
|
+
driver_instance = None
|
|
657
|
+
try:
|
|
658
|
+
# Try with latest ChromeDriver first
|
|
659
|
+
service = Service(ChromeDriverManager().install())
|
|
660
|
+
driver_instance = webdriver.Chrome(service=service, options=chrome_options)
|
|
661
|
+
|
|
662
|
+
# Configure wait times
|
|
663
|
+
if 3 < wait < 5:
|
|
664
|
+
wait_time = random.uniform(3, 5)
|
|
665
|
+
elif 5 <= wait < 8:
|
|
666
|
+
wait_time = random.uniform(5, 8)
|
|
667
|
+
elif 8 <= wait < 12:
|
|
668
|
+
wait_time = random.uniform(8, 10)
|
|
669
|
+
else:
|
|
670
|
+
wait_time = 0
|
|
671
|
+
|
|
672
|
+
driver_instance.implicitly_wait(wait_time)
|
|
673
|
+
|
|
674
|
+
# Handle login if needed
|
|
675
|
+
if login_url and login_dict:
|
|
676
|
+
cookies = get_cookies(url=login_url, login=login_dict)
|
|
677
|
+
driver_instance.get(url)
|
|
678
|
+
for name, value in cookies.items():
|
|
679
|
+
driver_instance.add_cookie({"name": name, "value": value})
|
|
680
|
+
elif cookies:
|
|
681
|
+
driver_instance.get(url)
|
|
682
|
+
if isinstance(cookies, str):
|
|
683
|
+
cookies = parse_cookies(cookies)
|
|
684
|
+
for name, value in cookies.items():
|
|
685
|
+
driver_instance.add_cookie({"name": name, "value": value})
|
|
686
|
+
|
|
687
|
+
if not javascript:
|
|
688
|
+
driver_instance.execute_cdp_cmd(
|
|
689
|
+
"Emulation.setScriptExecutionDisabled", {"value": True}
|
|
690
|
+
)
|
|
691
|
+
|
|
692
|
+
# Navigate to target URL
|
|
693
|
+
driver_instance.get(url)
|
|
694
|
+
|
|
695
|
+
# Handle iframes if needed
|
|
696
|
+
if iframe_name:
|
|
697
|
+
iframe = WebDriverWait(driver_instance, timeout).until(
|
|
698
|
+
EC.presence_of_element_located((By.NAME, iframe_name))
|
|
699
|
+
)
|
|
700
|
+
driver_instance.switch_to.frame(iframe)
|
|
701
|
+
|
|
702
|
+
# Scroll to trigger dynamic content
|
|
703
|
+
scroll_smth_steps(driver_instance)
|
|
704
|
+
|
|
705
|
+
# Get page source with retries
|
|
706
|
+
content = None
|
|
707
|
+
for attempt in range(scroll_try):
|
|
708
|
+
try:
|
|
709
|
+
page_source = driver_instance.page_source
|
|
710
|
+
content = BeautifulSoup(page_source, parser)
|
|
711
|
+
if content and content.find_all(by):
|
|
712
|
+
break
|
|
713
|
+
except Exception as e:
|
|
714
|
+
logger.warning(f"Attempt {attempt + 1} failed: {e}")
|
|
715
|
+
time.sleep(random.uniform(1, 3))
|
|
716
|
+
try:
|
|
717
|
+
_clean_temp()
|
|
718
|
+
except Exception as e:
|
|
719
|
+
print(e)
|
|
720
|
+
return "text/html", content if content else None
|
|
721
|
+
|
|
722
|
+
except WebDriverException as e:
|
|
723
|
+
logger.error(f"Selenium error: {e}")
|
|
502
724
|
return None, None
|
|
503
|
-
|
|
504
|
-
|
|
725
|
+
finally:
|
|
726
|
+
if driver_instance:
|
|
727
|
+
driver_instance.quit()
|
|
728
|
+
|
|
729
|
+
elif 'scr' in driver.lower():
|
|
730
|
+
settings = {
|
|
731
|
+
"USER_AGENT": user_agent(),
|
|
732
|
+
"DOWNLOAD_DELAY": 1,
|
|
733
|
+
"COOKIES_ENABLED": bool(cookies),
|
|
734
|
+
"LOG_LEVEL": "WARNING",
|
|
735
|
+
"RETRY_TIMES": retry,
|
|
736
|
+
"DOWNLOAD_TIMEOUT": timeout,
|
|
737
|
+
}
|
|
738
|
+
content = fetch_scrapy(
|
|
739
|
+
url,
|
|
740
|
+
parser=parser,
|
|
741
|
+
cookies=cookies,
|
|
742
|
+
headers=headers,
|
|
743
|
+
settings=settings
|
|
744
|
+
)
|
|
745
|
+
return parser, content
|
|
746
|
+
|
|
747
|
+
except Exception as e:
|
|
748
|
+
logger.error(f"Unexpected error in fetch_all: {e}")
|
|
505
749
|
return None, None
|
|
750
|
+
|
|
751
|
+
return None, None
|
|
506
752
|
|
|
507
|
-
|
|
508
|
-
# # Function to change Tor IP address
|
|
509
|
-
# def renew_tor_ip():
|
|
510
|
-
# with Controller.from_port(port=9051) as controller:
|
|
511
|
-
# controller.authenticate()
|
|
512
|
-
# controller.signal(Signal.NEWNYM)
|
|
513
|
-
|
|
514
|
-
# # Function to make requests through Tor
|
|
515
|
-
# def make_tor_request(url, max_retries=3):
|
|
516
|
-
# renew_tor_ip()
|
|
517
|
-
# headers = {"User-Agent": user_agent()}
|
|
518
|
-
# session = requests.Session()
|
|
519
|
-
# session.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}
|
|
520
|
-
|
|
521
|
-
# for i in range(max_retries):
|
|
522
|
-
# try:
|
|
523
|
-
# response = session.get(url, headers=headers, timeout=10)
|
|
524
|
-
# if response.status_code == 200:
|
|
525
|
-
# return response.text
|
|
526
|
-
# except requests.exceptions.RequestException as e:
|
|
527
|
-
# print(f"Error: {e}")
|
|
528
|
-
# time.sleep(2) # Add a delay between retries
|
|
529
|
-
|
|
530
|
-
# return None
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
# def find_links(url,driver='request'):
|
|
534
|
-
# links_href,cond_ex= [],["javascript:","mailto:","tel:","fax:"]
|
|
535
|
-
# content_type, soup = fetch_all(url,driver=driver)
|
|
536
|
-
# if soup:
|
|
537
|
-
# base_url = urlparse(url)
|
|
538
|
-
|
|
539
|
-
# # Extract links from both 'href' and 'src' attributes across relevant tags
|
|
540
|
-
# tags_with_links = ['a', 'img', 'script', 'link', 'iframe', 'embed','span']
|
|
541
|
-
# elements = []
|
|
542
|
-
# for tag in tags_with_links:
|
|
543
|
-
# elements.extend(soup.find_all(tag, href=True))
|
|
544
|
-
# elements.extend(soup.find_all(tag, src=True))
|
|
545
|
-
|
|
546
|
-
# for element in elements:
|
|
547
|
-
# link_href = element.get('href') or element.get('src')
|
|
548
|
-
# if link_href:
|
|
549
|
-
# if link_href.startswith("//"):
|
|
550
|
-
# link_href = "http:" + link_href
|
|
551
|
-
# elif not link_href.startswith(("http", "https")):
|
|
552
|
-
# link_href = urljoin(base_url.geturl(), link_href)
|
|
553
|
-
|
|
554
|
-
# if all(exclusion not in link_href for exclusion in cond_ex):
|
|
555
|
-
# links_href.append(link_href)
|
|
556
|
-
|
|
557
|
-
# return list(set(links_href)) # Remove duplicates
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
# elif url.split('.')[-1] in ['pdf']:
|
|
561
|
-
# return url
|
|
562
|
-
# else:
|
|
563
|
-
# return None
|
|
564
753
|
def find_links(url, driver="request", booster=False):
|
|
754
|
+
from urllib.parse import urlparse, urljoin
|
|
755
|
+
|
|
565
756
|
links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
|
|
566
757
|
content_type, soup = fetch_all(url, driver=driver)
|
|
567
758
|
|
|
@@ -582,7 +773,7 @@ def find_links(url, driver="request", booster=False):
|
|
|
582
773
|
if all(exclusion not in link_href for exclusion in cond_ex):
|
|
583
774
|
links_href.append(link_href)
|
|
584
775
|
|
|
585
|
-
unique_links =
|
|
776
|
+
unique_links = ips.unique(links_href) # Remove duplicates
|
|
586
777
|
|
|
587
778
|
if booster:
|
|
588
779
|
for link in unique_links:
|
|
@@ -590,7 +781,7 @@ def find_links(url, driver="request", booster=False):
|
|
|
590
781
|
sub_links = find_links(link, driver=driver, booster=False)
|
|
591
782
|
if sub_links:
|
|
592
783
|
links_href.extend(sub_links)
|
|
593
|
-
links_href =
|
|
784
|
+
links_href = ips.unique(links_href) # Remove duplicates again
|
|
594
785
|
|
|
595
786
|
return links_href
|
|
596
787
|
|
|
@@ -602,6 +793,8 @@ def find_links(url, driver="request", booster=False):
|
|
|
602
793
|
|
|
603
794
|
# Determine which links relate to the target domains (e.g., pages) you are interested in
|
|
604
795
|
def filter_links(links, contains="html", driver="requ", booster=False):
|
|
796
|
+
from urllib.parse import urlparse, urljoin
|
|
797
|
+
|
|
605
798
|
filtered_links = []
|
|
606
799
|
if isinstance(contains, str):
|
|
607
800
|
contains = [contains]
|
|
@@ -614,10 +807,13 @@ def filter_links(links, contains="html", driver="requ", booster=False):
|
|
|
614
807
|
)
|
|
615
808
|
if condition:
|
|
616
809
|
filtered_links.append(link)
|
|
617
|
-
return filtered_links
|
|
810
|
+
return ips.unique(filtered_links)
|
|
618
811
|
|
|
619
812
|
|
|
620
813
|
def find_domain(links):
|
|
814
|
+
from urllib.parse import urlparse, urljoin
|
|
815
|
+
from collections import Counter
|
|
816
|
+
|
|
621
817
|
if not links:
|
|
622
818
|
return None
|
|
623
819
|
domains = [urlparse(link).netloc for link in links]
|
|
@@ -672,6 +868,8 @@ def pdf_detector(url, contains=None, dir_save=None, booster=False):
|
|
|
672
868
|
pdf_links = filter_links(links=links_all, contains=["pdf"])
|
|
673
869
|
|
|
674
870
|
if pdf_links:
|
|
871
|
+
from pprint import pp
|
|
872
|
+
|
|
675
873
|
pp(f"pdf detected{pdf_links}")
|
|
676
874
|
else:
|
|
677
875
|
print("no pdf file")
|
|
@@ -693,10 +891,9 @@ def pdf_detector(url, contains=None, dir_save=None, booster=False):
|
|
|
693
891
|
idx += 1
|
|
694
892
|
print(f"{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}")
|
|
695
893
|
|
|
696
|
-
|
|
697
894
|
def downloader(
|
|
698
895
|
url,
|
|
699
|
-
dir_save=
|
|
896
|
+
dir_save=None,
|
|
700
897
|
kind=[".pdf"],
|
|
701
898
|
contains=None,
|
|
702
899
|
rm_folder=False,
|
|
@@ -705,38 +902,157 @@ def downloader(
|
|
|
705
902
|
timeout=30,
|
|
706
903
|
n_try=3,
|
|
707
904
|
timestamp=False,
|
|
905
|
+
chunk_size=8192,
|
|
906
|
+
retry_delay=2,
|
|
708
907
|
):
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
908
|
+
"""
|
|
909
|
+
Enhanced file downloader with robust error handling and resume capability
|
|
910
|
+
|
|
911
|
+
Args:
|
|
912
|
+
url: URL or list of URLs to download
|
|
913
|
+
dir_save: Directory to save files (None for current directory)
|
|
914
|
+
kind: List of file extensions to filter for (e.g., ['.pdf', '.xls'])
|
|
915
|
+
contains: String that must be present in the filename
|
|
916
|
+
rm_folder: Whether to remove the target folder before downloading
|
|
917
|
+
booster: Whether to search for links on the page
|
|
918
|
+
verbose: Whether to print progress information
|
|
919
|
+
timeout: Connection timeout in seconds
|
|
920
|
+
n_try: Number of retry attempts
|
|
921
|
+
timestamp: Whether to add timestamp to filenames
|
|
922
|
+
chunk_size: Download chunk size in bytes
|
|
923
|
+
retry_delay: Delay between retries in seconds
|
|
924
|
+
"""
|
|
925
|
+
import os
|
|
926
|
+
import time
|
|
927
|
+
import shutil
|
|
928
|
+
import requests
|
|
929
|
+
from requests.exceptions import (ChunkedEncodingError, ConnectionError,
|
|
930
|
+
RequestException, Timeout)
|
|
931
|
+
from urllib.parse import urlparse
|
|
932
|
+
from datetime import datetime
|
|
933
|
+
|
|
934
|
+
if verbose and ips.run_once_within():
|
|
935
|
+
print("usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)")
|
|
936
|
+
|
|
937
|
+
# -------------------- wget integration helper --------------------
|
|
938
|
+
def _wget_available():
|
|
939
|
+
"""Check if wget exists on system"""
|
|
940
|
+
return shutil.which("wget") is not None
|
|
941
|
+
|
|
942
|
+
def _wget_download(url, out_path):
    """
    Download a file using system wget with progress bar.

    Args:
        url: address handed to wget.
        out_path: destination file; its parent directory is created if needed.

    Returns:
        True when wget exits successfully, False when launching wget or the
        download itself fails (the error is printed when the enclosing
        ``verbose`` flag is set).
    """
    import subprocess

    # dirname() is "" for a bare file name, and os.makedirs("") raises.
    target_dir = os.path.dirname(out_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    try:
        subprocess.run(
            # "--" ends option parsing so a URL starting with "-" cannot be
            # mistaken for a wget option.
            ["wget", "-c", "--show-progress", "--progress=bar:force", "-O", out_path, "--", url],
            check=True,
        )
        return True
    except Exception as e:
        # Best effort by design: the caller falls back to requests on False.
        if verbose:
            print(f"wget download failed: {e}")
        return False
|
|
956
|
+
# -----------------------------------------------------------------
|
|
713
957
|
|
|
714
958
|
def fname_corrector(fname, ext):
    """
    Return *fname* ending in extension *ext*.

    A leading dot is added to *ext* when missing. A file name with a different
    extension has it replaced, and a name that would consist of the extension
    only gets the current time (HHMMSS) as its stem.
    """
    suffix = ext if ext.startswith(".") else "." + ext
    if fname.endswith(suffix):
        corrected = fname
    else:
        corrected = os.path.splitext(fname)[0] + suffix
    stem = corrected[: -len(suffix)]
    if not stem:
        # Nothing left in front of the extension: fall back to a time-based name.
        corrected = datetime.now().strftime("%H%M%S") + suffix
    return corrected
|
|
720
967
|
|
|
721
968
|
def check_and_modify_filename(directory, filename):
    """
    Return a file name that does not yet exist inside *directory*.

    *filename* is returned unchanged when it is free; otherwise a two-digit
    counter (``_01``, ``_02``, ...) is inserted before the extension until an
    unused name is found.
    """
    stem, extension = os.path.splitext(filename)
    candidate = filename
    attempt = 0
    while os.path.exists(os.path.join(directory, candidate)):
        attempt += 1
        candidate = f"{stem}_{attempt:02d}{extension}"
    return candidate
|
|
733
977
|
|
|
734
|
-
|
|
978
|
+
def get_partial_file_size(filepath):
    """Return the size in bytes of *filepath*, or 0 when it cannot be stat'ed."""
    try:
        size_on_disk = os.stat(filepath).st_size
    except OSError:
        # Missing or unreadable file: treat it as "nothing downloaded yet".
        size_on_disk = 0
    return size_on_disk
|
|
984
|
+
|
|
985
|
+
def download_with_resume(url, filepath, headers=None):
    """
    Download *url* to *filepath*, resuming a partial file when possible.

    Args:
        url: address fetched with a streamed ``requests.get``.
        filepath: destination path; an existing non-empty file is treated as
            a partial download and a ``Range`` request is sent for the rest.
        headers: optional request headers. The mapping is copied, so the
            caller's dict is never modified.

    Returns:
        True when the response body was written without an exception,
        False otherwise (the error is printed when the enclosing ``verbose``
        flag is set).

    Relies on ``timeout``, ``chunk_size`` and ``verbose`` from the enclosing
    ``downloader`` call, and on ``requests`` / ``tqdm`` being in scope.
    """
    # Copy so the Range header does not leak into the caller's dict.
    request_headers = dict(headers or {})
    initial_size = get_partial_file_size(filepath)

    if initial_size > 0:
        request_headers['Range'] = f'bytes={initial_size}-'

    try:
        with requests.get(url, headers=request_headers, stream=True, timeout=timeout) as r:
            r.raise_for_status()

            # Only 206 Partial Content means the server honoured the Range
            # request. Any other success status carries the complete body,
            # so appending it would corrupt the file: start over instead.
            if initial_size > 0 and r.status_code != 206:
                initial_size = 0
            mode = 'ab' if initial_size > 0 else 'wb'

            total_size = int(r.headers.get('content-length', 0)) + initial_size

            with open(filepath, mode) as f, tqdm(
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
                initial=initial_size,
                desc=os.path.basename(filepath),
                disable=not verbose,
            ) as progress:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
                        progress.update(len(chunk))
        return True
    except Exception as e:
        # Best effort by design: the caller decides whether to retry.
        if verbose:
            print(f"Download error: {e}")
        return False
|
|
1019
|
+
|
|
1020
|
+
dir_save = dir_save or "./"
|
|
1021
|
+
filename = os.path.basename(urlparse(url).path)
|
|
1022
|
+
save_path = os.path.join(dir_save, filename)
|
|
1023
|
+
os.makedirs(dir_save, exist_ok=True)
|
|
1024
|
+
# Handle FTP URLs
|
|
1025
|
+
if isinstance(url, str) and url.startswith("ftp"):
|
|
1026
|
+
import urllib.request
|
|
1027
|
+
|
|
1028
|
+
try:
|
|
1029
|
+
urllib.request.urlretrieve(url, save_path)
|
|
1030
|
+
if verbose:
|
|
1031
|
+
print(f"Downloaded FTP file to: {save_path}")
|
|
1032
|
+
return save_path
|
|
1033
|
+
except Exception as e:
|
|
1034
|
+
print(f"FTP download failed: {e}")
|
|
1035
|
+
return None
|
|
1036
|
+
if kind is None and _wget_available():
|
|
1037
|
+
if verbose:
|
|
1038
|
+
print(f"Using wget for download: {url}")
|
|
1039
|
+
success = _wget_download(url, save_path)
|
|
1040
|
+
if success:
|
|
1041
|
+
if verbose:
|
|
1042
|
+
print(f"Successfully downloaded via wget: {save_path}")
|
|
1043
|
+
return save_path
|
|
1044
|
+
else:
|
|
1045
|
+
if verbose:
|
|
1046
|
+
print("⚠️ wget failed, falling back to requests...")
|
|
1047
|
+
kind = [".*"] # dummy
|
|
1048
|
+
# Process directory and file links
|
|
735
1049
|
if not isinstance(kind, list):
|
|
736
1050
|
kind = [kind]
|
|
1051
|
+
|
|
737
1052
|
if isinstance(url, list):
|
|
1053
|
+
results = []
|
|
738
1054
|
for url_ in url:
|
|
739
|
-
downloader(
|
|
1055
|
+
results.append(downloader(
|
|
740
1056
|
url_,
|
|
741
1057
|
dir_save=dir_save,
|
|
742
1058
|
kind=kind,
|
|
@@ -746,120 +1062,100 @@ def downloader(
|
|
|
746
1062
|
timeout=timeout,
|
|
747
1063
|
n_try=n_try,
|
|
748
1064
|
timestamp=timestamp,
|
|
749
|
-
)
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
1065
|
+
))
|
|
1066
|
+
return results
|
|
1067
|
+
|
|
1068
|
+
# Normalize file extensions
|
|
1069
|
+
kind = [k if k.startswith(".") else f".{k}" for k in kind]
|
|
1070
|
+
|
|
1071
|
+
# Find and filter links
|
|
754
1072
|
file_links_all = []
|
|
755
1073
|
for kind_ in kind:
|
|
756
|
-
if isinstance(
|
|
757
|
-
|
|
758
|
-
if isinstance(url, str):
|
|
759
|
-
if any(ext in url for ext in kind):
|
|
760
|
-
file_links = [url]
|
|
761
|
-
else:
|
|
762
|
-
if booster:
|
|
763
|
-
links_all = []
|
|
764
|
-
if "http" in url:
|
|
765
|
-
links_all = find_links(url)
|
|
766
|
-
else:
|
|
767
|
-
links_all = url
|
|
768
|
-
if contains is not None:
|
|
769
|
-
file_links = filter_links(links_all, contains=contains + kind_)
|
|
770
|
-
else:
|
|
771
|
-
file_links = links_all # filter_links(links_all, contains=kind_)
|
|
772
|
-
elif isinstance(url, list):
|
|
773
|
-
links_all = url
|
|
774
|
-
if contains is not None:
|
|
775
|
-
file_links = filter_links(links_all, contains=contains + kind_)
|
|
776
|
-
else:
|
|
777
|
-
file_links = filter_links(links_all, contains=kind_)
|
|
1074
|
+
if isinstance(url, str) and any(ext in url for ext in kind):
|
|
1075
|
+
file_links = [url]
|
|
778
1076
|
else:
|
|
779
|
-
links_all = find_links(url)
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
1077
|
+
links_all = find_links(url) if booster else ([url] if isinstance(url, str) else url)
|
|
1078
|
+
file_links = filter_links(
|
|
1079
|
+
links_all,
|
|
1080
|
+
contains=(contains + kind_) if contains else kind_
|
|
1081
|
+
)
|
|
1082
|
+
|
|
1083
|
+
file_links = ips.unique(file_links)
|
|
784
1084
|
if verbose:
|
|
785
1085
|
if file_links:
|
|
786
1086
|
print("Files detected:")
|
|
1087
|
+
from pprint import pp
|
|
787
1088
|
pp(file_links)
|
|
788
1089
|
else:
|
|
789
|
-
file_links = []
|
|
790
1090
|
print("No files detected")
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
file_links_all.extend(file_links)
|
|
795
|
-
if dir_save:
|
|
796
|
-
if rm_folder:
|
|
797
|
-
ips.rm_folder(dir_save)
|
|
798
|
-
# if verbose:
|
|
799
|
-
# print(f"\n... attempting to download to local\n")
|
|
800
|
-
fnames = [file_link.split("/")[-1] for file_link in file_links_all]
|
|
1091
|
+
|
|
1092
|
+
if file_links:
|
|
1093
|
+
file_links_all.extend(file_links if isinstance(file_links, list) else [file_links])
|
|
801
1094
|
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
1095
|
+
file_links_all = ips.unique(file_links_all)
|
|
1096
|
+
if not file_links_all:
|
|
1097
|
+
return None
|
|
1098
|
+
|
|
1099
|
+
# Prepare download directory
|
|
1100
|
+
dir_save = dir_save or "./"
|
|
1101
|
+
if rm_folder:
|
|
1102
|
+
ips.rm_folder(dir_save)
|
|
1103
|
+
os.makedirs(dir_save, exist_ok=True)
|
|
1104
|
+
|
|
1105
|
+
# Download files
|
|
1106
|
+
results = []
|
|
1107
|
+
for file_link in file_links_all:
|
|
1108
|
+
headers = {
|
|
1109
|
+
"User-Agent": user_agent(),
|
|
1110
|
+
"Accept-Encoding": "identity" # Disable compression for resume support
|
|
1111
|
+
}
|
|
1112
|
+
|
|
1113
|
+
# Determine filename
|
|
1114
|
+
filename = os.path.basename(urlparse(file_link).path)
|
|
1115
|
+
ext = next((ftype for ftype in kind if ftype in filename), kind[0])
|
|
1116
|
+
corrected_fname = fname_corrector(filename, ext)
|
|
1117
|
+
corrected_fname = check_and_modify_filename(dir_save, corrected_fname)
|
|
1118
|
+
|
|
1119
|
+
if timestamp:
|
|
1120
|
+
corrected_fname = datetime.now().strftime("%y%m%d_%H%M%S_") + corrected_fname
|
|
1121
|
+
|
|
1122
|
+
save_path = os.path.join(dir_save, corrected_fname)
|
|
1123
|
+
|
|
1124
|
+
# Download with retry logic
|
|
1125
|
+
success = False
|
|
1126
|
+
for attempt in range(n_try):
|
|
1127
|
+
try:
|
|
1128
|
+
if verbose:
|
|
1129
|
+
print(f"Downloading {file_link} (attempt {attempt + 1}/{n_try})")
|
|
1130
|
+
if _wget_available():
|
|
1131
|
+
success = _wget_download(file_link, save_path)
|
|
1132
|
+
if success:
|
|
840
1133
|
if verbose:
|
|
841
|
-
print(
|
|
842
|
-
f"Failed to download file: HTTP status code {response.status_code}"
|
|
843
|
-
)
|
|
1134
|
+
print(f"Successfully downloaded via wget: {save_path}")
|
|
844
1135
|
break
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
1136
|
+
if download_with_resume(file_link, save_path, headers):
|
|
1137
|
+
success = True
|
|
1138
|
+
if verbose:
|
|
1139
|
+
print(f"Successfully downloaded: {save_path}")
|
|
1140
|
+
break
|
|
1141
|
+
|
|
1142
|
+
except (ChunkedEncodingError, ConnectionError, Timeout, RequestException) as e:
|
|
1143
|
+
if verbose:
|
|
1144
|
+
print(f"Attempt {attempt + 1} failed: {e}")
|
|
1145
|
+
if attempt < n_try - 1:
|
|
1146
|
+
time.sleep(retry_delay)
|
|
1147
|
+
|
|
1148
|
+
if success:
|
|
1149
|
+
results.append(save_path)
|
|
1150
|
+
else:
|
|
1151
|
+
if verbose:
|
|
1152
|
+
print(f"Failed to download {file_link} after {n_try} attempts")
|
|
1153
|
+
# Clean up potentially corrupted file
|
|
1154
|
+
if os.path.exists(save_path):
|
|
1155
|
+
os.remove(save_path)
|
|
1156
|
+
results.append(None)
|
|
862
1157
|
|
|
1158
|
+
return results if len(results) != 1 else results[0]
|
|
863
1159
|
|
|
864
1160
|
def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
|
|
865
1161
|
"""
|
|
@@ -872,9 +1168,14 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
|
|
|
872
1168
|
Returns:
|
|
873
1169
|
str: HTML content with updated image URLs pointing to local files.
|
|
874
1170
|
"""
|
|
1171
|
+
from urllib.parse import urlparse, urljoin
|
|
1172
|
+
import base64
|
|
1173
|
+
|
|
875
1174
|
if rm_folder:
|
|
876
1175
|
ips.rm_folder(dir_save)
|
|
877
1176
|
content_type, content = fetch_all(url, driver=driver)
|
|
1177
|
+
if content_type is None:
|
|
1178
|
+
content_type=""
|
|
878
1179
|
if "html" in content_type.lower():
|
|
879
1180
|
# Create the directory if it doesn't exist
|
|
880
1181
|
os.makedirs(dir_save, exist_ok=True)
|
|
@@ -937,6 +1238,9 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
|
|
|
937
1238
|
|
|
938
1239
|
|
|
939
1240
|
def svg_to_png(svg_file):
|
|
1241
|
+
import io
|
|
1242
|
+
from PIL import Image
|
|
1243
|
+
|
|
940
1244
|
with WandImage(filename=svg_file, resolution=300) as img:
|
|
941
1245
|
img.format = "png"
|
|
942
1246
|
png_image = img.make_blob()
|
|
@@ -1002,10 +1306,22 @@ def fetch_selenium(
|
|
|
1002
1306
|
iframe_name=None, # Add option to handle iframe
|
|
1003
1307
|
**kwargs,
|
|
1004
1308
|
):
|
|
1309
|
+
import random
|
|
1310
|
+
from selenium import webdriver
|
|
1311
|
+
from selenium.webdriver.chrome.service import Service
|
|
1312
|
+
from selenium.webdriver.common.by import By
|
|
1313
|
+
from selenium.webdriver.chrome.options import Options
|
|
1314
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
1315
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
1316
|
+
from webdriver_manager.chrome import ChromeDriverManager
|
|
1317
|
+
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
1318
|
+
|
|
1005
1319
|
chrome_options = Options()
|
|
1006
1320
|
chrome_options.add_argument("--headless")
|
|
1007
1321
|
chrome_options.add_argument("--no-sandbox")
|
|
1322
|
+
chrome_options.add_argument("--disable-gpu")
|
|
1008
1323
|
chrome_options.add_argument("--disable-dev-shm-usage")
|
|
1324
|
+
chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
|
|
1009
1325
|
chrome_options.add_argument(f"user-agent={user_agent()}")
|
|
1010
1326
|
if proxy:
|
|
1011
1327
|
chrome_options.add_argument(f"--proxy-server={proxy}")
|
|
@@ -1061,7 +1377,7 @@ def fetch_selenium(
|
|
|
1061
1377
|
if attempt == retry - 1:
|
|
1062
1378
|
logger.error("Failed to fetch the content after all retries")
|
|
1063
1379
|
return []
|
|
1064
|
-
sleep(random.uniform(1, 3))
|
|
1380
|
+
time.sleep(random.uniform(1, 3))
|
|
1065
1381
|
# Return empty list if nothing found after all retries
|
|
1066
1382
|
return []
|
|
1067
1383
|
|
|
@@ -1078,6 +1394,9 @@ def fetch(
|
|
|
1078
1394
|
output="text",
|
|
1079
1395
|
**kws,
|
|
1080
1396
|
):
|
|
1397
|
+
import random
|
|
1398
|
+
from urllib.parse import urlparse, urljoin
|
|
1399
|
+
|
|
1081
1400
|
if "xt" in output.lower():
|
|
1082
1401
|
for attempt in range(retry):
|
|
1083
1402
|
if verbose and attempt == 0:
|
|
@@ -1103,12 +1422,12 @@ def fetch(
|
|
|
1103
1422
|
else:
|
|
1104
1423
|
if texts:
|
|
1105
1424
|
break
|
|
1106
|
-
sleep(random.uniform(0.5, 1.5))
|
|
1425
|
+
time.sleep(random.uniform(0.5, 1.5))
|
|
1107
1426
|
if isinstance(texts, pd.core.frame.DataFrame):
|
|
1108
1427
|
condition_ = [texts.empty, booster]
|
|
1109
1428
|
else:
|
|
1110
1429
|
condition_ = [not texts, booster]
|
|
1111
|
-
if any(condition_):
|
|
1430
|
+
if any(condition_):
|
|
1112
1431
|
print("trying to use 'fetcher2'...")
|
|
1113
1432
|
texts = fetch_selenium(
|
|
1114
1433
|
url=url, where=where, what=what, extend=extend, **kws
|
|
@@ -1116,6 +1435,7 @@ def fetch(
|
|
|
1116
1435
|
if texts:
|
|
1117
1436
|
return texts
|
|
1118
1437
|
else:
|
|
1438
|
+
print("got nothing")
|
|
1119
1439
|
return fetch(
|
|
1120
1440
|
url,
|
|
1121
1441
|
where=where,
|
|
@@ -1429,6 +1749,8 @@ def isa(fpath, kind="img"):
|
|
|
1429
1749
|
|
|
1430
1750
|
|
|
1431
1751
|
def is_image(fpath):
|
|
1752
|
+
import mimetypes
|
|
1753
|
+
|
|
1432
1754
|
mime_type, _ = mimetypes.guess_type(fpath)
|
|
1433
1755
|
if mime_type and mime_type.startswith("image"):
|
|
1434
1756
|
return True
|
|
@@ -1437,6 +1759,8 @@ def is_image(fpath):
|
|
|
1437
1759
|
|
|
1438
1760
|
|
|
1439
1761
|
def is_document(fpath):
|
|
1762
|
+
import mimetypes
|
|
1763
|
+
|
|
1440
1764
|
mime_type, _ = mimetypes.guess_type(fpath)
|
|
1441
1765
|
if mime_type and (
|
|
1442
1766
|
mime_type.startswith("text/")
|
|
@@ -1457,6 +1781,8 @@ def is_document(fpath):
|
|
|
1457
1781
|
|
|
1458
1782
|
|
|
1459
1783
|
def is_zip(fpath):
|
|
1784
|
+
import mimetypes
|
|
1785
|
+
|
|
1460
1786
|
mime_type, _ = mimetypes.guess_type(fpath)
|
|
1461
1787
|
if mime_type == "application/zip":
|
|
1462
1788
|
return True
|
|
@@ -1476,6 +1802,8 @@ def search(
|
|
|
1476
1802
|
):
|
|
1477
1803
|
|
|
1478
1804
|
if "te" in kind.lower():
|
|
1805
|
+
from duckduckgo_search import DDGS
|
|
1806
|
+
|
|
1479
1807
|
results = DDGS().text(query, max_results=limit)
|
|
1480
1808
|
res = pd.DataFrame(results)
|
|
1481
1809
|
res.rename(columns={"href": "links"}, inplace=True)
|
|
@@ -1493,6 +1821,8 @@ def search(
|
|
|
1493
1821
|
|
|
1494
1822
|
|
|
1495
1823
|
def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
|
|
1824
|
+
from duckduckgo_search import DDGS
|
|
1825
|
+
|
|
1496
1826
|
def is_in_any(str_candi_short, str_full, ignore_case=True):
|
|
1497
1827
|
if isinstance(str_candi_short, str):
|
|
1498
1828
|
str_candi_short = [str_candi_short]
|
|
@@ -1521,8 +1851,12 @@ def echo(query, model="gpt", verbose=True, log=True, dir_save=dir_save):
|
|
|
1521
1851
|
model_valid = valid_mod_name(model)
|
|
1522
1852
|
res = DDGS().chat(query, model=model_valid)
|
|
1523
1853
|
if verbose:
|
|
1854
|
+
from pprint import pp
|
|
1855
|
+
|
|
1524
1856
|
pp(res)
|
|
1525
1857
|
if log:
|
|
1858
|
+
from datetime import datetime
|
|
1859
|
+
|
|
1526
1860
|
dt_str = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d_%H:%M:%S")
|
|
1527
1861
|
res_ = f"###{dt_str}\n\n>{res}\n"
|
|
1528
1862
|
os.makedirs(dir_save, exist_ok=True)
|
|
@@ -1542,3 +1876,191 @@ def ai(*args, **kwargs):
|
|
|
1542
1876
|
if len(args) == 1 and isinstance(args[0], str):
|
|
1543
1877
|
kwargs["query"] = args[0]
|
|
1544
1878
|
return echo(**kwargs)
|
|
1879
|
+
|
|
1880
|
+
|
|
1881
|
+
#! get_ip()
|
|
1882
|
+
def get_ip(ip=None):
    """
    Return this machine's public IP information, or geolocation data for *ip*.

    Usage:
        from py2ls import netfinder as nt
        ip = nt.get_ip()

    Args:
        ip: IP address to look up on ipinfo.io. When None, the public IPv4 and
            IPv6 addresses (via ipify) and the geolocation (via ipinfo.io) of
            the current machine are fetched instead.

    Returns:
        With ``ip=None``: a dict with the keys "ip4", "ip6" and "geolocation",
        or None when the lookup raised an exception.
        Otherwise: the JSON payload returned for *ip*, or an
        ``{"error": ...}`` dict once all retries are exhausted.
    """

    import requests
    import time
    import logging
    from datetime import datetime, timedelta

    # Set up logging only while the root logger is still unconfigured.
    # basicConfig() ignores its handlers otherwise, but building the
    # FileHandler on every call would still reopen the log file each time.
    if not logging.getLogger().handlers:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            handlers=[
                logging.StreamHandler(),
                logging.FileHandler("public_ip_log.log"),  # Log to a file
            ],
        )

    # Stored on the function object so entries survive between calls; a plain
    # local dict would be empty on every call and could never produce a hit.
    cache = get_ip.__dict__.setdefault("_cache", {})

    def _from_cache(key, label):
        """Return the unexpired cached payload for *key*, or None."""
        entry = cache.get(key)
        if entry and datetime.now() < entry["expires"]:
            logging.info(f"Cache hit for {label}, using cached data.")
            return entry["data"]
        return None

    def _to_cache(key, data, cache_duration):
        """Remember a successful payload for *cache_duration* minutes."""
        # Failed lookups are not cached so that the next call retries them.
        if isinstance(data, dict) and "error" in data:
            return
        cache[key] = {
            "data": data,
            "expires": datetime.now() + timedelta(minutes=cache_duration),
        }

    def _fetch_json(url, retries, timeout, headers, attempt_label, subject):
        """GET *url* and return its JSON, retrying with exponential backoff."""
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=timeout, headers=headers)
                response.raise_for_status()
                return response.json()
            # requests.Timeout is a RequestException subclass, so this single
            # handler also covers timeouts.
            except requests.RequestException as e:
                logging.error(f"{attempt_label} {attempt + 1} failed: {e}")
                if attempt < retries - 1:
                    time.sleep(2**attempt)  # Exponential backoff
                else:
                    logging.error("Max retries reached.")
                    return {"error": f"Error fetching {subject}: {e}"}
        # Only reached when retries <= 0 and no request was attempted.
        return {"error": f"Failed to fetch {subject} after retries"}

    # Function to fetch IP addresses synchronously
    def fetch_ip(url, retries, timeout, headers):
        """
        Synchronous function to fetch the IP address with retries.
        """
        return _fetch_json(url, retries, timeout, headers, "Attempt", "IP")

    # Function to fetch geolocation synchronously
    def fetch_geolocation(url, retries, timeout, headers):
        """
        Synchronous function to fetch geolocation data by IP address.
        """
        return _fetch_json(
            url, retries, timeout, headers, "Geolocation request attempt", "geolocation"
        )

    # Main function to get public IP and geolocation
    def get_public_ip(
        ip4=True,
        ip6=True,
        verbose=True,
        retries=3,
        timeout=5,
        geolocation=True,
        headers=None,
        cache_duration=5,
    ):
        """
        Synchronously fetches public IPv4 and IPv6 addresses, along with optional geolocation info.
        """
        cache_key_ip4 = "public_ip4"
        cache_key_ip6 = "public_ip6"
        cache_key_geolocation = "geolocation"

        # Use the cache if it's still valid
        ip4_data = _from_cache(cache_key_ip4, "IPv4")
        ip6_data = _from_cache(cache_key_ip6, "IPv6")
        geolocation_data = _from_cache(cache_key_geolocation, "Geolocation")

        # Fetch IPv4 if requested
        if ip4 and not ip4_data:
            logging.info("Fetching IPv4...")
            ip4_data = fetch_ip(
                "https://api.ipify.org?format=json", retries, timeout, headers
            )
            _to_cache(cache_key_ip4, ip4_data, cache_duration)

        # Fetch IPv6 if requested
        if ip6 and not ip6_data:
            logging.info("Fetching IPv6...")
            ip6_data = fetch_ip(
                "https://api6.ipify.org?format=json", retries, timeout, headers
            )
            _to_cache(cache_key_ip6, ip6_data, cache_duration)

        # Fetch geolocation if requested
        if geolocation and not geolocation_data:
            logging.info("Fetching Geolocation...")
            geolocation_data = fetch_geolocation(
                "https://ipinfo.io/json", retries, timeout, headers
            )
            _to_cache(cache_key_geolocation, geolocation_data, cache_duration)

        # Prepare the results
        ip_info = {
            "ip4": ip4_data.get("ip") if ip4_data else "N/A",
            "ip6": ip6_data.get("ip") if ip6_data else "N/A",
            "geolocation": geolocation_data if geolocation_data else "N/A",
        }

        # Verbose output if requested
        if verbose:
            print(f"Public IPv4: {ip_info['ip4']}")
            print(f"Public IPv6: {ip_info['ip6']}")
            print(f"Geolocation: {ip_info['geolocation']}")

        return ip_info

    # Function to get geolocation data by IP
    def get_geolocation_by_ip(ip, retries=3, timeout=5, headers=None):
        """
        Fetches geolocation data for a given IP address.
        """
        url = f"https://ipinfo.io/{ip}/json"
        geolocation_data = fetch_geolocation(url, retries, timeout, headers)
        return geolocation_data

    #! here starting get_ip()
    headers = {"User-Agent": user_agent()}
    if ip is None:
        try:
            ip_data = get_public_ip(headers=headers, verbose=True)
        except Exception as e:
            # Best effort: report the problem and signal failure with None.
            print(e)
            ip_data = None
        return ip_data
    else:
        geolocation_data = get_geolocation_by_ip(ip, headers=headers)
        return geolocation_data
|