py2ls 0.2.5.9__py3-none-any.whl → 0.2.5.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/im2.py +67 -0
- py2ls/ips.py +5101 -650
- py2ls/ips_lab.py +17172 -0
- py2ls/netfinder.py +501 -207
- py2ls/plot.py +65 -56
- {py2ls-0.2.5.9.dist-info → py2ls-0.2.5.12.dist-info}/METADATA +1 -1
- {py2ls-0.2.5.9.dist-info → py2ls-0.2.5.12.dist-info}/RECORD +9 -7
- {py2ls-0.2.5.9.dist-info → py2ls-0.2.5.12.dist-info}/WHEEL +1 -1
py2ls/netfinder.py
CHANGED
@@ -332,7 +332,6 @@ def parse_cookies(cookies_str):
         cookies_dict[cookie_name] = cookie_value
 
     return cookies_dict
-
 def fetch_scrapy(
     url,
     parser="html.parser",
@@ -358,23 +357,60 @@ def fetch_scrapy(
     from scrapy.signalmanager import dispatcher
     from scrapy import signals
     from twisted.internet import reactor, defer
+    from twisted.internet.error import ReactorNotRestartable
     import scrapy
+    import logging
+
+    # Disable Scrapy's excessive logging
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+    logging.getLogger('twisted').setLevel(logging.WARNING)
 
     # Container for scraped content
     content = []
 
+    # Define the spider class inside the function
+    class FetchSpider(scrapy.Spider):
+        name = "fetch_spider"
+
+        def __init__(self, url=None, parser=None, cookies=None, headers=None, *args, **kwargs):
+            super(FetchSpider, self).__init__(*args, **kwargs)
+            self.start_urls = [url]
+            self.parser = parser
+            self.cookies = cookies
+            self.headers = headers
+
+        def start_requests(self):
+            for url in self.start_urls:
+                yield scrapy.Request(
+                    url,
+                    cookies=self.cookies,
+                    headers=self.headers,
+                    callback=self.parse
+                )
+
+        def parse(self, response):
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(response.text, self.parser)
+            yield {
+                "content": soup,
+                "url": response.url,
+                "status": response.status
+            }
+
     # Callback function for item scraped signal
     def handle_item(item, response, spider):
-        content.append(item
+        content.append(item)
 
     # Scrapy settings
     process_settings = settings or get_project_settings()
     process_settings.update(
         {
-            "USER_AGENT": "
-            "DOWNLOAD_DELAY": 1,
+            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "DOWNLOAD_DELAY": 1,
             "COOKIES_ENABLED": bool(cookies),
-            "LOG_LEVEL": "ERROR",
+            "LOG_LEVEL": "ERROR",
+            "RETRY_ENABLED": False,
+            "HTTPERROR_ALLOW_ALL": True,
         }
     )
 
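With the spider now defined inside fetch_scrapy() and its results routed through the item_scraped signal into `content`, the function returns the first yielded item: a dict with "content" (a BeautifulSoup document built in FetchSpider.parse), "url", and "status". A minimal usage sketch, with an illustrative URL that is not part of the diff:

    # sketch: one crawl per process (see the reactor handling in the next hunk)
    result = fetch_scrapy("https://example.com", parser="html.parser")
    if result is not None:
        soup = result["content"]          # BeautifulSoup object from FetchSpider.parse
        print(result["status"], result["url"])
        print(soup.title.get_text() if soup.title else "no <title> found")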
@@ -394,26 +430,292 @@ def fetch_scrapy(
         )
         reactor.stop()
 
-    #
-
-
-
-
-
-
-
+    # Handle reactor execution
+    try:
+        if not reactor.running:
+            crawl()
+            reactor.run(installSignalHandlers=0)
+        else:
+            # This case is problematic - reactor can't be restarted
+            raise RuntimeError("Reactor already running. Cannot run multiple crawls in same process.")
+    except ReactorNotRestartable:
+        raise RuntimeError("Scrapy reactor cannot be restarted. Create a new process for additional crawls.")
 
     # Return the first scraped content or None if empty
     return content[0] if content else None
-
-
+
+# def fetch_all(
+#     url,
+#     parser="lxml",
+#     driver="request", # request or selenium
+#     by=By.TAG_NAME,
+#     timeout=10,
+#     retry=2,
+#     wait=0,
+#     wait_until=None,
+#     wait_until_kind=None,
+#     scroll_try=3,
+#     login_url=None,
+#     username=None,
+#     password=None,
+#     username_field="username",
+#     password_field="password",
+#     submit_field="submit",
+#     username_by=By.NAME,
+#     password_by=By.NAME,
+#     submit_by=By.NAME,
+#     # capability='eager', # eager or none
+#     proxy=None, # Add proxy parameter
+#     javascript=True, # Add JavaScript option
+#     disable_images=False, # Add option to disable images
+#     iframe_name=None,
+#     login_dict=None,
+#     cookies=None, # Add cookies parameter
+# ): # Add option to handle iframe): # lxml is faster, # parser="html.parser"
+#     try:
+#         # # Generate a random user-agent string
+#         # response = requests.get(url)
+#         # # get cookies
+#         # cookie=dict_from_cookiejar(response.cookies)
+#         # # get token from cookies
+#         # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
+#         # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
+
+#         headers = {"User-Agent": user_agent()}
+#         if isinstance(driver, int):
+#             drivers=["request", "selenium","scrapy"]
+#             driver=drivers[driver]
+#         if "req" in driver.lower():
+#             # response = requests.get(
+#             #     url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
+#             # )
+
+#             # Handle cookies for requests
+#             if cookies:
+#                 from requests.cookies import RequestsCookieJar
+#                 cookie_jar = RequestsCookieJar()
+#                 if isinstance(cookies, str):
+#                     cookies=parse_cookies(cookies)
+#                 for cookie_name, cookie_value in cookies.items():
+#                     cookie_jar.set(cookie_name, cookie_value)
+#                 response = requests.get(
+#                     url, headers=headers, cookies=cookie_jar, timeout=timeout, stream=True
+#                 )
+#             else:
+#                 response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+
+#             # If the response is a redirect, follow it
+#             while response.is_redirect:
+#                 logger.info(f"Redirecting to: {response.headers['Location']}")
+#                 response = requests.get(
+#                     response.headers["Location"],
+#                     headers=headers,
+#                     proxies=proxies_glob,
+#                     timeout=timeout,
+#                     stream=True,
+#                 )
+#             # Check for a 403 error
+#             if response.status_code == 403:
+#                 logger.warning("403 Forbidden error. Retrying...")
+#                 # Retry the request after a short delay
+#                 time.sleep(random.uniform(1, 3))
+#                 response = requests.get(
+#                     url, headers=headers, proxies=proxies_glob, timeout=timeout, stream=True
+#                 )
+#                 # Raise an error if retry also fails
+#                 response.raise_for_status()
+
+#             # Raise an error for other HTTP status codes
+#             response.raise_for_status()
+
+#             # Get the content type
+#             content_type = (
+#                 response.headers.get("content-type", "").split(";")[0].lower()
+#             )
+#             if response.encoding:
+#                 content = response.content.decode(response.encoding)
+#             else:
+#                 content = None
+#             # logger.info(f"Content type: {content_type}")
+
+#             # Check if content type is supported
+#             if content_type in CONTENT_PARSERS and content:
+#                 return content_type, CONTENT_PARSERS[content_type](content, parser)
+#             else:
+#                 logger.warning("Unsupported content type")
+#                 return None, None
+#         elif "se" in driver.lower():
+#             import random
+#             from selenium import webdriver
+#             from selenium.webdriver.chrome.service import Service
+#             from selenium.webdriver.common.by import By
+#             from selenium.webdriver.chrome.options import Options
+#             from selenium.webdriver.support.ui import WebDriverWait
+#             from selenium.webdriver.support import expected_conditions as EC
+#             from webdriver_manager.chrome import ChromeDriverManager
+#             from selenium.webdriver.common.desired_capabilities import (
+#                 DesiredCapabilities,
+#             )
+
+#             chrome_options = Options()
+#             chrome_options.add_argument("--headless")
+#             chrome_options.add_argument("--no-sandbox")
+#             chrome_options.add_argument("--disable-dev-shm-usage")
+#             chrome_options.add_argument(f"user-agent={user_agent()}")
+#             if proxy:
+#                 chrome_options.add_argument(f"--proxy-server={proxy}")
+#             if disable_images:
+#                 prefs = {"profile.managed_default_content_settings.images": 2}
+#                 chrome_options.add_experimental_option("prefs", prefs)
+#             # chrome_options.page_load_strategy = capability
+#             try:
+#                 # Try to install ChromeDriver using webdriver-manager
+
+#                 # driver_pah='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/129.0.6668.100/chromedriver-mac-arm64/chromedriver'
+#                 # service=Service(executable_path=driver_path)
+
+#                 service = Service(ChromeDriverManager().install())
+#                 driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#             except Exception as e:
+#                 print(f"Error occurred: {e}")
+#                 print("Attempting to reinstall webdriver-manager...")
+#                 try:
+#                     service = Service(ChromeDriverManager().install())
+#                     driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#                 except Exception as reinstall_error:
+#                     print(
+#                         f"Reinstallation failed: {reinstall_error}\n之前发生过类似的问题, 更新了webdriver-manager以后得到解决"
+#                     )
+#                     try:
+#                         ips.upgrade("webdriver-manager", uninstall=True)
+#                         service = Service(ChromeDriverManager().install())
+#                         driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#                     except Exception as e:
+#                         print(
+#                             f"Reinstallation failed: {reinstall_error}\n之前发生过类似的问题, 但是更新了'webdriver-manager'之后依然没有解决"
+#                         )
+
+#             # 隐式等等待
+#             if 3 < wait < 5:
+#                 wait_ = random.uniform(3, 5)
+#             elif 5 <= wait < 8:
+#                 wait_ = random.uniform(5, 8)
+#             elif 8 <= wait < 12:
+#                 wait_ = random.uniform(8, 10)
+#             else:
+#                 wait_ = 0
+#             driver_.implicitly_wait(wait_)
+
+#             if wait_until is not None and wait_until_kind is not None:
+#                 strategy = corr_by_kind(wait_until_kind)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((strategy, wait_until))
+#                 )
+#             if login_url and login_dict:
+#                 cookies = get_cookies(url=login_url, login=login_dict)
+#                 driver_.get(url)
+#                 for cookie_name, cookie_value in cookies.items():
+#                     driver_.add_cookie({"name": cookie_name, "value": cookie_value})
+#             else:
+#                 if cookies:
+#                     driver_.get(url)
+#                     if isinstance(cookies, str):
+#                         cookies=parse_cookies(cookies)
+#                     for cookie_name, cookie_value in cookies.items():
+#                         driver_.add_cookie({"name": cookie_name, "value": cookie_value})
+#             if not javascript:
+#                 driver_.execute_cdp_cmd(
+#                     "Emulation.setScriptExecutionDisabled", {"value": True}
+#                 )
+
+#             if login_url:
+#                 driver_.get(login_url)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((username_by, username_field))
+#                 ).send_keys(username)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((password_by, password_field))
+#                 ).send_keys(password)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.element_to_be_clickable((submit_by, submit_field))
+#                 ).click()
+
+#             driver_.get(url)
+
+#             if iframe_name:
+#                 iframe = WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((By.NAME, iframe_name))
+#                 )
+#                 driver_.switch_to.frame(iframe)
+
+#             # WebDriverWait(driver, timeout).until(
+#             #     EC.presence_of_element_located((by, where))
+#             # )
+
+#             # # scroll down the page by a certain number of pixels
+#             scroll_smth_steps(driver_)
+
+#             # 设置轮询
+#             for attempt in range(scroll_try):
+#                 page_source = driver_.page_source
+#                 content = BeautifulSoup(page_source, "html.parser")
+#                 if content and content.find_all(by):
+#                     break
+#                 time.sleep(
+#                     random.uniform(2, 4)
+#                 ) # Wait for a random time before polling again
+
+#             driver_.quit()
+
+#             # content = BeautifulSoup(page_source, "html.parser")
+#             if content:
+#                 return "text/html", content
+#             else:
+#                 logger.warning("Selenium could not fetch content")
+#                 return None, None
+#         elif 'scr' in driver.lower():
+#             settings = {
+#                 "USER_AGENT": user_agent(),
+#                 "DOWNLOAD_DELAY": 1, # Prevent overloading the server
+#                 "COOKIES_ENABLED": True if cookies else False,
+#                 "LOG_LEVEL": "WARNING", # Reduce log verbosity
+#             }
+#             content=fetch_scrapy(url,
+#                                  parser=parser,
+#                                  cookies=cookies,
+#                                  headers=headers,
+#                                  settings=settings)
+#             return parser, content
+
+#     except requests.RequestException as e:
+#         logger.error(f"Error fetching URL '{url}': {e}")
+#         return None, None
+
+def _clean_temp():
+    import os
+    import shutil
+    import tempfile
+    from pathlib import Path
+
+    # Get the parent folder of the tempdir
+    temp_dir = Path(tempfile.gettempdir()).parent # moves from /T to parent dir
+
+    for subdir in temp_dir.iterdir():
+        if subdir.is_dir():
+            for d in subdir.iterdir():
+                if "com.google.Chrome.code_sign_clone" in d.name:
+                    try:
+                        print(f"Removing: {d}")
+                        shutil.rmtree(d)
+                    except Exception as e:
+                        print(f"Error removing {d}: {e}")
 def fetch_all(
     url,
     parser="lxml",
     driver="request", # request or selenium
     by=By.TAG_NAME,
     timeout=10,
-    retry=
+    retry=3, # Increased default retries
     wait=0,
     wait_until=None,
     wait_until_kind=None,
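The reactor handling added above makes the constraint explicit: Twisted's reactor can only be started once per process, so a second call to fetch_scrapy() in the same interpreter now raises RuntimeError instead of hanging. A sketch of one common workaround, not part of this release, is to push each crawl into its own short-lived process (names below are illustrative; the import assumes the packaged module path):

    import multiprocessing as mp
    from py2ls.netfinder import fetch_scrapy

    def _crawl_once(url, queue):
        result = fetch_scrapy(url)
        # BeautifulSoup trees do not always pickle cleanly, so send plain data back
        queue.put(None if result is None else
                  {"html": str(result["content"]), "url": result["url"], "status": result["status"]})

    def fetch_scrapy_fresh(url):
        queue = mp.Queue()
        proc = mp.Process(target=_crawl_once, args=(url, queue))
        proc.start()
        proc.join()
        return queue.get() if not queue.empty() else None

Each call to fetch_scrapy_fresh() gets a brand-new reactor, which is what the new RuntimeError message asks for.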
@@ -427,231 +729,218 @@ def fetch_all(
     username_by=By.NAME,
     password_by=By.NAME,
     submit_by=By.NAME,
-
-
-
-    disable_images=False, # Add option to disable images
+    proxy=None,
+    javascript=True,
+    disable_images=False,
     iframe_name=None,
     login_dict=None,
-    cookies=None,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    url, headers=headers, cookies=cookie_jar, timeout=timeout, stream=True
-                )
-            else:
-                response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+    cookies=None,
+    verify_ssl=True, # Added SSL verification option
+    follow_redirects=True, # Added redirect control
+):
+    """
+    Enhanced fetch function with better error handling and reliability.
+
+    Returns:
+        tuple: (content_type, parsed_content) or (None, None) on failure
+    """
+    def _parse_content(content, content_type, parser):
+        """Helper function to parse content with fallback"""
+        try:
+            if content_type in CONTENT_PARSERS:
+                return CONTENT_PARSERS[content_type](content, parser)
+
+            # Fallback parsing attempts
+            if content_type.startswith('text/'):
+                try:
+                    return BeautifulSoup(content, parser)
+                except:
+                    return content
+            return content
+        except Exception as e:
+            logger.warning(f"Content parsing failed: {e}")
+            return content
 
-
-
-
+    def _make_request(url, headers, cookies, timeout, verify_ssl, follow_redirects):
+        """Helper function for HTTP requests with retries"""
+        for attempt in range(retry):
+            try:
                 response = requests.get(
-
+                    url,
                     headers=headers,
-
+                    cookies=cookies,
                     timeout=timeout,
                     stream=True,
+                    verify=verify_ssl,
+                    allow_redirects=follow_redirects
                 )
-
-
-
-
-
-
-                    url, headers=headers, proxies=proxies_glob, timeout=timeout, stream=True
-                )
-                # Raise an error if retry also fails
+
+                # Handle redirects manually if needed
+                if not follow_redirects and response.is_redirect:
+                    logger.info(f"Redirect detected to: {response.headers['Location']}")
+                    return None, None
+
                 response.raise_for_status()
-
-
-
-
-
-
-
+                return response, None
+
+            except requests.RequestException as e:
+                logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                if attempt == retry - 1:
+                    return None, e
+                time.sleep(random.uniform(1, 3))
+
+    # Convert driver integer to string if needed
+    if isinstance(driver, int):
+        drivers = ["request", "selenium", "scrapy"]
+        try:
+            driver = drivers[driver]
+        except IndexError:
+            driver = "request"
+
+    headers = {"User-Agent": user_agent()}
+
+    # Prepare cookies
+    cookie_jar = None
+    if cookies:
+        from requests.cookies import RequestsCookieJar
+        cookie_jar = RequestsCookieJar()
+        if isinstance(cookies, str):
+            cookies = parse_cookies(cookies)
+        for name, value in cookies.items():
+            cookie_jar.set(name, value)
+
+    try:
+        if "req" in driver.lower():
+            response, error = _make_request(
+                url, headers, cookie_jar, timeout, verify_ssl, follow_redirects
            )
-            if
-                content = response.content.decode(response.encoding)
-            else:
-                content = None
-            # logger.info(f"Content type: {content_type}")
-
-            # Check if content type is supported
-            if content_type in CONTENT_PARSERS and content:
-                return content_type, CONTENT_PARSERS[content_type](content, parser)
-            else:
-                logger.warning("Unsupported content type")
+            if error:
                 return None, None
+
+            content_type = response.headers.get("content-type", "").split(";")[0].lower()
+            content = response.content.decode(response.encoding or 'utf-8', errors='replace')
+
+            return content_type, _parse_content(content, content_type, parser)
+
         elif "se" in driver.lower():
-            import random
            from selenium import webdriver
            from selenium.webdriver.chrome.service import Service
-            from selenium.webdriver.common.by import By
            from selenium.webdriver.chrome.options import Options
-            from selenium.webdriver.support.ui import WebDriverWait
-            from selenium.webdriver.support import expected_conditions as EC
            from webdriver_manager.chrome import ChromeDriverManager
-            from selenium.
-
-            )
-
+            from selenium.common.exceptions import WebDriverException
+
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
            chrome_options.add_argument(f"user-agent={user_agent()}")
+
            if proxy:
                chrome_options.add_argument(f"--proxy-server={proxy}")
            if disable_images:
-
-
-
+                chrome_options.add_experimental_option(
+                    "prefs", {"profile.managed_default_content_settings.images": 2}
+                )
+
+            driver_instance = None
            try:
-                # Try
-
-                # driver_pah='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/129.0.6668.100/chromedriver-mac-arm64/chromedriver'
-                # service=Service(executable_path=driver_path)
-
+                # Try with latest ChromeDriver first
                service = Service(ChromeDriverManager().install())
-
-
-
-
-
-
-
-
-
-
+                driver_instance = webdriver.Chrome(service=service, options=chrome_options)
+
+                # Configure wait times
+                if 3 < wait < 5:
+                    wait_time = random.uniform(3, 5)
+                elif 5 <= wait < 8:
+                    wait_time = random.uniform(5, 8)
+                elif 8 <= wait < 12:
+                    wait_time = random.uniform(8, 10)
+                else:
+                    wait_time = 0
+
+                driver_instance.implicitly_wait(wait_time)
+
+                # Handle login if needed
+                if login_url and login_dict:
+                    cookies = get_cookies(url=login_url, login=login_dict)
+                    driver_instance.get(url)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+                elif cookies:
+                    driver_instance.get(url)
+                    if isinstance(cookies, str):
+                        cookies = parse_cookies(cookies)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+
+                if not javascript:
+                    driver_instance.execute_cdp_cmd(
+                        "Emulation.setScriptExecutionDisabled", {"value": True}
+                    )
+
+                # Navigate to target URL
+                driver_instance.get(url)
+
+                # Handle iframes if needed
+                if iframe_name:
+                    iframe = WebDriverWait(driver_instance, timeout).until(
+                        EC.presence_of_element_located((By.NAME, iframe_name))
                    )
+                    driver_instance.switch_to.frame(iframe)
+
+                # Scroll to trigger dynamic content
+                scroll_smth_steps(driver_instance)
+
+                # Get page source with retries
+                content = None
+                for attempt in range(scroll_try):
                    try:
-
-
-
+                        page_source = driver_instance.page_source
+                        content = BeautifulSoup(page_source, parser)
+                        if content and content.find_all(by):
+                            break
                    except Exception as e:
-
-
-
-
-
-
-
-
-
-
-                wait_ = random.uniform(8, 10)
-            else:
-                wait_ = 0
-            driver_.implicitly_wait(wait_)
-
-            if wait_until is not None and wait_until_kind is not None:
-                strategy = corr_by_kind(wait_until_kind)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((strategy, wait_until))
-                )
-            if login_url and login_dict:
-                cookies = get_cookies(url=login_url, login=login_dict)
-                driver_.get(url)
-                for cookie_name, cookie_value in cookies.items():
-                    driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            else:
-                if cookies:
-                    driver_.get(url)
-                    if isinstance(cookies, str):
-                        cookies=parse_cookies(cookies)
-                    for cookie_name, cookie_value in cookies.items():
-                        driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            if not javascript:
-                driver_.execute_cdp_cmd(
-                    "Emulation.setScriptExecutionDisabled", {"value": True}
-                )
-
-            if login_url:
-                driver_.get(login_url)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((username_by, username_field))
-                ).send_keys(username)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((password_by, password_field))
-                ).send_keys(password)
-                WebDriverWait(driver_, timeout).until(
-                    EC.element_to_be_clickable((submit_by, submit_field))
-                ).click()
-
-            driver_.get(url)
-
-            if iframe_name:
-                iframe = WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((By.NAME, iframe_name))
-                )
-                driver_.switch_to.frame(iframe)
-
-            # WebDriverWait(driver, timeout).until(
-            #     EC.presence_of_element_located((by, where))
-            # )
-
-            # # scroll down the page by a certain number of pixels
-            scroll_smth_steps(driver_)
-
-            # 设置轮询
-            for attempt in range(scroll_try):
-                page_source = driver_.page_source
-                content = BeautifulSoup(page_source, "html.parser")
-                if content and content.find_all(by):
-                    break
-                time.sleep(
-                    random.uniform(2, 4)
-                ) # Wait for a random time before polling again
-
-            driver_.quit()
-
-            # content = BeautifulSoup(page_source, "html.parser")
-            if content:
-                return "text/html", content
-            else:
-                logger.warning("Selenium could not fetch content")
+                        logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                        time.sleep(random.uniform(1, 3))
+                try:
+                    _clean_temp()
+                except Exception as e:
+                    print(e)
+                return "text/html", content if content else None
+
+            except WebDriverException as e:
+                logger.error(f"Selenium error: {e}")
                return None, None
+            finally:
+                if driver_instance:
+                    driver_instance.quit()
+
        elif 'scr' in driver.lower():
            settings = {
                "USER_AGENT": user_agent(),
-                "DOWNLOAD_DELAY": 1,
-                "COOKIES_ENABLED":
-                "LOG_LEVEL": "WARNING",
+                "DOWNLOAD_DELAY": 1,
+                "COOKIES_ENABLED": bool(cookies),
+                "LOG_LEVEL": "WARNING",
+                "RETRY_TIMES": retry,
+                "DOWNLOAD_TIMEOUT": timeout,
            }
-            content=fetch_scrapy(
-
-
-
-
-
-
-
-
+            content = fetch_scrapy(
+                url,
+                parser=parser,
+                cookies=cookies,
+                headers=headers,
+                settings=settings
+            )
+            return parser, content
+
+    except Exception as e:
+        logger.error(f"Unexpected error in fetch_all: {e}")
        return None, None
-
-
+
+    return None, None
+
 # # Function to change Tor IP address
 # def renew_tor_ip():
 #     with Controller.from_port(port=9051) as controller:
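Taken together, the rewrite turns fetch_all() into a single entry point that retries transient HTTP failures, optionally disables SSL verification or redirect following, and still dispatches to Selenium or Scrapy by name or index. A hedged usage sketch (the URL and prints are illustrative, not from the package):

    # sketch: driver may be "request"/"selenium"/"scrapy" or 0/1/2
    content_type, content = fetch_all(
        "https://example.com",
        driver="request",
        retry=3,
        verify_ssl=True,
        follow_redirects=True,
    )
    if content_type is None:
        print("every attempt failed; see the logged warnings")
    elif "html" in content_type:
        # HTML goes through CONTENT_PARSERS / BeautifulSoup, so tags are queryable
        print(content.find("title"))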
@@ -1050,6 +1339,8 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
     if rm_folder:
         ips.rm_folder(dir_save)
     content_type, content = fetch_all(url, driver=driver)
+    if content_type is None:
+        content_type=""
     if "html" in content_type.lower():
         # Create the directory if it doesn't exist
         os.makedirs(dir_save, exist_ok=True)
@@ -1193,7 +1484,9 @@ def fetch_selenium(
     chrome_options = Options()
     chrome_options.add_argument("--headless")
     chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-gpu")
     chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
     chrome_options.add_argument(f"user-agent={user_agent()}")
     if proxy:
         chrome_options.add_argument(f"--proxy-server={proxy}")
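The same two flags added to fetch_all() appear here: "--disable-gpu" is commonly paired with headless mode to sidestep GPU-related startup issues on some platforms, and "--user-data-dir" pins the browser to a persistent profile under ~/selenium_profile so cookies and cache survive between runs (two concurrent sessions sharing one profile directory can conflict). A condensed sketch of the resulting option set, outside the diff:

    import os
    from selenium.webdriver.chrome.options import Options

    opts = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-gpu", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    opts.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')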
@@ -1299,7 +1592,7 @@ def fetch(
         condition_ = [texts.empty, booster]
     else:
         condition_ = [not texts, booster]
-    if any(condition_):
+    if any(condition_):
         print("trying to use 'fetcher2'...")
         texts = fetch_selenium(
             url=url, where=where, what=what, extend=extend, **kws
@@ -1307,6 +1600,7 @@ def fetch(
     if texts:
         return texts
     else:
+        print("got nothing")
         return fetch(
             url,
             where=where,