py2ls 0.2.5.10__py3-none-any.whl → 0.2.5.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/index +0 -0
- py2ls/ich2ls.py +1955 -296
- py2ls/im2.py +67 -0
- py2ls/ips.py +5913 -801
- py2ls/ips_lab.py +17172 -0
- py2ls/netfinder.py +511 -210
- py2ls/plot.py +13 -7
- py2ls/stats.py +1 -144
- {py2ls-0.2.5.10.dist-info → py2ls-0.2.5.14.dist-info}/METADATA +1 -1
- {py2ls-0.2.5.10.dist-info → py2ls-0.2.5.14.dist-info}/RECORD +11 -9
- {py2ls-0.2.5.10.dist-info → py2ls-0.2.5.14.dist-info}/WHEEL +0 -0
py2ls/netfinder.py
CHANGED
@@ -1,7 +1,8 @@
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 import scrapy
 import requests
 import os
+import chardet
 import pandas as pd
 import logging
 import json
@@ -116,8 +117,8 @@ def extract_text_from_content(

     def extract_text(element):
         texts = ""
-        if isinstance(element,
-            texts += element.strip()
+        if isinstance(element, NavigableString) and element.strip():
+            texts += element.strip() + " "
         elif hasattr(element, "children"):
             for child in element.children:
                 texts += extract_text(child)
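
The rewritten extract_text only accumulates NavigableString leaves and appends a trailing space per fragment, so text from adjacent tags no longer runs together. A minimal standalone sketch of that behaviour (only bs4 is assumed; the HTML snippet is made up and this is not code from the package):

    from bs4 import BeautifulSoup, NavigableString

    def extract_text(element):
        # Collect text from leaf nodes only; NavigableString is bs4's leaf text type.
        texts = ""
        if isinstance(element, NavigableString) and element.strip():
            texts += element.strip() + " "
        elif hasattr(element, "children"):
            for child in element.children:
                texts += extract_text(child)
        return texts

    html = "<div><p>Hello <b>world</b></p><p>again</p></div>"  # hypothetical sample
    soup = BeautifulSoup(html, "html.parser")
    print(extract_text(soup))  # -> "Hello world again "
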
@@ -172,6 +173,8 @@ def extract_text_from_content(
         texts = ""
         for tag in result_set:
             texts = texts + " " + extract_text(tag) + " \n"
+            # texts = texts + " " + tag.get_text(" ", strip=True)+ " \n"
+
         text_list = [tx.strip() for tx in texts.split(" \n") if tx.strip()]
         return text_list
     else:
@@ -332,7 +335,6 @@ def parse_cookies(cookies_str):
         cookies_dict[cookie_name] = cookie_value

     return cookies_dict
-
 def fetch_scrapy(
     url,
     parser="html.parser",
@@ -358,23 +360,60 @@ def fetch_scrapy(
     from scrapy.signalmanager import dispatcher
     from scrapy import signals
     from twisted.internet import reactor, defer
+    from twisted.internet.error import ReactorNotRestartable
     import scrapy
+    import logging
+
+    # Disable Scrapy's excessive logging
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+    logging.getLogger('twisted').setLevel(logging.WARNING)

     # Container for scraped content
     content = []

+    # Define the spider class inside the function
+    class FetchSpider(scrapy.Spider):
+        name = "fetch_spider"
+
+        def __init__(self, url=None, parser=None, cookies=None, headers=None, *args, **kwargs):
+            super(FetchSpider, self).__init__(*args, **kwargs)
+            self.start_urls = [url]
+            self.parser = parser
+            self.cookies = cookies
+            self.headers = headers
+
+        def start_requests(self):
+            for url in self.start_urls:
+                yield scrapy.Request(
+                    url,
+                    cookies=self.cookies,
+                    headers=self.headers,
+                    callback=self.parse
+                )
+
+        def parse(self, response):
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(response.text, self.parser)
+            yield {
+                "content": soup,
+                "url": response.url,
+                "status": response.status
+            }
+
     # Callback function for item scraped signal
     def handle_item(item, response, spider):
-        content.append(item
+        content.append(item)

     # Scrapy settings
     process_settings = settings or get_project_settings()
     process_settings.update(
         {
-            "USER_AGENT": "
-            "DOWNLOAD_DELAY": 1,
+            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "DOWNLOAD_DELAY": 1,
             "COOKIES_ENABLED": bool(cookies),
-            "LOG_LEVEL": "ERROR",
+            "LOG_LEVEL": "ERROR",
+            "RETRY_ENABLED": False,
+            "HTTPERROR_ALLOW_ALL": True,
         }
     )

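
fetch_scrapy now builds its FetchSpider inline, collects items through the item_scraped signal, and (in the next hunk) refuses to run once the Twisted reactor is already running, because a reactor cannot be restarted in the same process. As a hedged illustration of how a caller might work around that one-crawl-per-process limit, the sketch below isolates each crawl in a child process; fetch_scrapy's signature is taken from this diff, while the wrapper names and the `from py2ls import netfinder` import path are assumptions:

    import multiprocessing as mp

    def _crawl_once(url, queue):
        # A fresh process gets a fresh Twisted reactor, so repeated calls
        # never hit ReactorNotRestartable in the parent process.
        from py2ls import netfinder  # assumed import path for this module
        result = netfinder.fetch_scrapy(url, parser="html.parser")
        # Stringify so the result pickles cleanly across the process boundary.
        queue.put(str(result) if result is not None else None)

    def fetch_scrapy_isolated(url):
        queue = mp.Queue()
        proc = mp.Process(target=_crawl_once, args=(url, queue))
        proc.start()
        proc.join()
        return queue.get() if not queue.empty() else None

    if __name__ == "__main__":
        print(fetch_scrapy_isolated("https://example.com"))
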
@@ -394,26 +433,292 @@ def fetch_scrapy(
         )
         reactor.stop()

-    #
-
-
-
-
-
-
-
+    # Handle reactor execution
+    try:
+        if not reactor.running:
+            crawl()
+            reactor.run(installSignalHandlers=0)
+        else:
+            # This case is problematic - reactor can't be restarted
+            raise RuntimeError("Reactor already running. Cannot run multiple crawls in same process.")
+    except ReactorNotRestartable:
+        raise RuntimeError("Scrapy reactor cannot be restarted. Create a new process for additional crawls.")

     # Return the first scraped content or None if empty
     return content[0] if content else None
-
-
+
+# def fetch_all(
+#     url,
+#     parser="lxml",
+#     driver="request", # request or selenium
+#     by=By.TAG_NAME,
+#     timeout=10,
+#     retry=2,
+#     wait=0,
+#     wait_until=None,
+#     wait_until_kind=None,
+#     scroll_try=3,
+#     login_url=None,
+#     username=None,
+#     password=None,
+#     username_field="username",
+#     password_field="password",
+#     submit_field="submit",
+#     username_by=By.NAME,
+#     password_by=By.NAME,
+#     submit_by=By.NAME,
+#     # capability='eager', # eager or none
+#     proxy=None, # Add proxy parameter
+#     javascript=True, # Add JavaScript option
+#     disable_images=False, # Add option to disable images
+#     iframe_name=None,
+#     login_dict=None,
+#     cookies=None, # Add cookies parameter
+# ): # Add option to handle iframe): # lxml is faster, # parser="html.parser"
+#     try:
+#         # # Generate a random user-agent string
+#         # response = requests.get(url)
+#         # # get cookies
+#         # cookie=dict_from_cookiejar(response.cookies)
+#         # # get token from cookies
+#         # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
+#         # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
+
+#         headers = {"User-Agent": user_agent()}
+#         if isinstance(driver, int):
+#             drivers=["request", "selenium","scrapy"]
+#             driver=drivers[driver]
+#         if "req" in driver.lower():
+#             # response = requests.get(
+#             #     url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
+#             # )
+
+#             # Handle cookies for requests
+#             if cookies:
+#                 from requests.cookies import RequestsCookieJar
+#                 cookie_jar = RequestsCookieJar()
+#                 if isinstance(cookies, str):
+#                     cookies=parse_cookies(cookies)
+#                 for cookie_name, cookie_value in cookies.items():
+#                     cookie_jar.set(cookie_name, cookie_value)
+#                 response = requests.get(
+#                     url, headers=headers, cookies=cookie_jar, timeout=timeout, stream=True
+#                 )
+#             else:
+#                 response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+
+#             # If the response is a redirect, follow it
+#             while response.is_redirect:
+#                 logger.info(f"Redirecting to: {response.headers['Location']}")
+#                 response = requests.get(
+#                     response.headers["Location"],
+#                     headers=headers,
+#                     proxies=proxies_glob,
+#                     timeout=timeout,
+#                     stream=True,
+#                 )
+#             # Check for a 403 error
+#             if response.status_code == 403:
+#                 logger.warning("403 Forbidden error. Retrying...")
+#                 # Retry the request after a short delay
+#                 time.sleep(random.uniform(1, 3))
+#                 response = requests.get(
+#                     url, headers=headers, proxies=proxies_glob, timeout=timeout, stream=True
+#                 )
+#                 # Raise an error if retry also fails
+#                 response.raise_for_status()
+
+#             # Raise an error for other HTTP status codes
+#             response.raise_for_status()
+
+#             # Get the content type
+#             content_type = (
+#                 response.headers.get("content-type", "").split(";")[0].lower()
+#             )
+#             if response.encoding:
+#                 content = response.content.decode(response.encoding)
+#             else:
+#                 content = None
+#             # logger.info(f"Content type: {content_type}")
+
+#             # Check if content type is supported
+#             if content_type in CONTENT_PARSERS and content:
+#                 return content_type, CONTENT_PARSERS[content_type](content, parser)
+#             else:
+#                 logger.warning("Unsupported content type")
+#                 return None, None
+#         elif "se" in driver.lower():
+#             import random
+#             from selenium import webdriver
+#             from selenium.webdriver.chrome.service import Service
+#             from selenium.webdriver.common.by import By
+#             from selenium.webdriver.chrome.options import Options
+#             from selenium.webdriver.support.ui import WebDriverWait
+#             from selenium.webdriver.support import expected_conditions as EC
+#             from webdriver_manager.chrome import ChromeDriverManager
+#             from selenium.webdriver.common.desired_capabilities import (
+#                 DesiredCapabilities,
+#             )
+
+#             chrome_options = Options()
+#             chrome_options.add_argument("--headless")
+#             chrome_options.add_argument("--no-sandbox")
+#             chrome_options.add_argument("--disable-dev-shm-usage")
+#             chrome_options.add_argument(f"user-agent={user_agent()}")
+#             if proxy:
+#                 chrome_options.add_argument(f"--proxy-server={proxy}")
+#             if disable_images:
+#                 prefs = {"profile.managed_default_content_settings.images": 2}
+#                 chrome_options.add_experimental_option("prefs", prefs)
+#             # chrome_options.page_load_strategy = capability
+#             try:
+#                 # Try to install ChromeDriver using webdriver-manager
+
+#                 # driver_pah='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/129.0.6668.100/chromedriver-mac-arm64/chromedriver'
+#                 # service=Service(executable_path=driver_path)
+
+#                 service = Service(ChromeDriverManager().install())
+#                 driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#             except Exception as e:
+#                 print(f"Error occurred: {e}")
+#                 print("Attempting to reinstall webdriver-manager...")
+#                 try:
+#                     service = Service(ChromeDriverManager().install())
+#                     driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#                 except Exception as reinstall_error:
+#                     print(
+#                         f"Reinstallation failed: {reinstall_error}\n之前发生过类似的问题, 更新了webdriver-manager以后得到解决"
+#                     )
+#                     try:
+#                         ips.upgrade("webdriver-manager", uninstall=True)
+#                         service = Service(ChromeDriverManager().install())
+#                         driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#                     except Exception as e:
+#                         print(
+#                             f"Reinstallation failed: {reinstall_error}\n之前发生过类似的问题, 但是更新了'webdriver-manager'之后依然没有解决"
+#                         )
+
+#             # 隐式等等待
+#             if 3 < wait < 5:
+#                 wait_ = random.uniform(3, 5)
+#             elif 5 <= wait < 8:
+#                 wait_ = random.uniform(5, 8)
+#             elif 8 <= wait < 12:
+#                 wait_ = random.uniform(8, 10)
+#             else:
+#                 wait_ = 0
+#             driver_.implicitly_wait(wait_)
+
+#             if wait_until is not None and wait_until_kind is not None:
+#                 strategy = corr_by_kind(wait_until_kind)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((strategy, wait_until))
+#                 )
+#             if login_url and login_dict:
+#                 cookies = get_cookies(url=login_url, login=login_dict)
+#                 driver_.get(url)
+#                 for cookie_name, cookie_value in cookies.items():
+#                     driver_.add_cookie({"name": cookie_name, "value": cookie_value})
+#             else:
+#                 if cookies:
+#                     driver_.get(url)
+#                     if isinstance(cookies, str):
+#                         cookies=parse_cookies(cookies)
+#                     for cookie_name, cookie_value in cookies.items():
+#                         driver_.add_cookie({"name": cookie_name, "value": cookie_value})
+#             if not javascript:
+#                 driver_.execute_cdp_cmd(
+#                     "Emulation.setScriptExecutionDisabled", {"value": True}
+#                 )
+
+#             if login_url:
+#                 driver_.get(login_url)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((username_by, username_field))
+#                 ).send_keys(username)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((password_by, password_field))
+#                 ).send_keys(password)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.element_to_be_clickable((submit_by, submit_field))
+#                 ).click()
+
+#             driver_.get(url)
+
+#             if iframe_name:
+#                 iframe = WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((By.NAME, iframe_name))
+#                 )
+#                 driver_.switch_to.frame(iframe)
+
+#             # WebDriverWait(driver, timeout).until(
+#             #     EC.presence_of_element_located((by, where))
+#             # )
+
+#             # # scroll down the page by a certain number of pixels
+#             scroll_smth_steps(driver_)
+
+#             # 设置轮询
+#             for attempt in range(scroll_try):
+#                 page_source = driver_.page_source
+#                 content = BeautifulSoup(page_source, "html.parser")
+#                 if content and content.find_all(by):
+#                     break
+#                 time.sleep(
+#                     random.uniform(2, 4)
+#                 ) # Wait for a random time before polling again
+
+#             driver_.quit()
+
+#             # content = BeautifulSoup(page_source, "html.parser")
+#             if content:
+#                 return "text/html", content
+#             else:
+#                 logger.warning("Selenium could not fetch content")
+#                 return None, None
+#         elif 'scr' in driver.lower():
+#             settings = {
+#                 "USER_AGENT": user_agent(),
+#                 "DOWNLOAD_DELAY": 1, # Prevent overloading the server
+#                 "COOKIES_ENABLED": True if cookies else False,
+#                 "LOG_LEVEL": "WARNING", # Reduce log verbosity
+#             }
+#             content=fetch_scrapy(url,
+#                                  parser=parser,
+#                                  cookies=cookies,
+#                                  headers=headers,
+#                                  settings=settings)
+#             return parser, content
+
+#     except requests.RequestException as e:
+#         logger.error(f"Error fetching URL '{url}': {e}")
+#         return None, None
+
+def _clean_temp():
+    import os
+    import shutil
+    import tempfile
+    from pathlib import Path
+
+    # Get the parent folder of the tempdir
+    temp_dir = Path(tempfile.gettempdir()).parent # moves from /T to parent dir
+
+    for subdir in temp_dir.iterdir():
+        if subdir.is_dir():
+            for d in subdir.iterdir():
+                if "com.google.Chrome.code_sign_clone" in d.name:
+                    try:
+                        print(f"Removing: {d}")
+                        shutil.rmtree(d)
+                    except Exception as e:
+                        print(f"Error removing {d}: {e}")
 def fetch_all(
     url,
     parser="lxml",
     driver="request", # request or selenium
     by=By.TAG_NAME,
     timeout=10,
-    retry=
+    retry=3, # Increased default retries
     wait=0,
     wait_until=None,
     wait_until_kind=None,
@@ -427,231 +732,222 @@ def fetch_all(
     username_by=By.NAME,
     password_by=By.NAME,
     submit_by=By.NAME,
-
-
-
-    disable_images=False, # Add option to disable images
+    proxy=None,
+    javascript=True,
+    disable_images=False,
     iframe_name=None,
     login_dict=None,
-    cookies=None,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    url, headers=headers, cookies=cookie_jar, timeout=timeout, stream=True
-                )
-            else:
-                response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+    cookies=None,
+    verify_ssl=True, # Added SSL verification option
+    follow_redirects=True, # Added redirect control
+):
+    """
+    Enhanced fetch function with better error handling and reliability.
+
+    Returns:
+        tuple: (content_type, parsed_content) or (None, None) on failure
+    """
+    def _parse_content(content, content_type, parser):
+        """Helper function to parse content with fallback"""
+        try:
+            if content_type in CONTENT_PARSERS:
+                return CONTENT_PARSERS[content_type](content, parser)
+
+            # Fallback parsing attempts
+            if content_type.startswith('text/'):
+                try:
+                    return BeautifulSoup(content, parser)
+                except:
+                    return content
+            return content
+        except Exception as e:
+            logger.warning(f"Content parsing failed: {e}")
+            return content

-
-
-
+    def _make_request(url, headers, cookies, timeout, verify_ssl, follow_redirects):
+        """Helper function for HTTP requests with retries"""
+        for attempt in range(retry):
+            try:
                 response = requests.get(
-
+                    url,
                     headers=headers,
-
+                    cookies=cookies,
                     timeout=timeout,
                     stream=True,
+                    verify=verify_ssl,
+                    allow_redirects=follow_redirects
                 )
-
-
-
-
-
-
-                    url, headers=headers, proxies=proxies_glob, timeout=timeout, stream=True
-                )
-                # Raise an error if retry also fails
+
+                # Handle redirects manually if needed
+                if not follow_redirects and response.is_redirect:
+                    logger.info(f"Redirect detected to: {response.headers['Location']}")
+                    return None, None
+
                 response.raise_for_status()
-
-
-
-
-
-
-
+                return response, None
+
+            except requests.RequestException as e:
+                logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                if attempt == retry - 1:
+                    return None, e
+                time.sleep(random.uniform(1, 3))
+
+    # Convert driver integer to string if needed
+    if isinstance(driver, int):
+        drivers = ["request", "selenium", "scrapy"]
+        try:
+            driver = drivers[driver]
+        except IndexError:
+            driver = "request"
+
+    headers = {"User-Agent": user_agent()}
+
+    # Prepare cookies
+    cookie_jar = None
+    if cookies:
+        from requests.cookies import RequestsCookieJar
+        cookie_jar = RequestsCookieJar()
+        if isinstance(cookies, str):
+            cookies = parse_cookies(cookies)
+        for name, value in cookies.items():
+            cookie_jar.set(name, value)
+
+    try:
+        if "req" in driver.lower():
+            response, error = _make_request(
+                url, headers, cookie_jar, timeout, verify_ssl, follow_redirects
             )
-            if
-                content = response.content.decode(response.encoding)
-            else:
-                content = None
-            # logger.info(f"Content type: {content_type}")
-
-            # Check if content type is supported
-            if content_type in CONTENT_PARSERS and content:
-                return content_type, CONTENT_PARSERS[content_type](content, parser)
-            else:
-                logger.warning("Unsupported content type")
+            if error:
                 return None, None
+            content_type = response.headers.get("content-type", "").split(";")[0].lower()
+            try:
+                detected = chardet.detect(response.content)
+                encoding = detected.get("encoding") or "utf-8"
+                content = response.content.decode(encoding, errors='replace')
+            except:
+                content = response.content.decode(response.encoding or 'utf-8', errors='replace')
+
+            return content_type, _parse_content(content, content_type, parser)
+
         elif "se" in driver.lower():
-            import random
             from selenium import webdriver
             from selenium.webdriver.chrome.service import Service
-            from selenium.webdriver.common.by import By
             from selenium.webdriver.chrome.options import Options
-            from selenium.webdriver.support.ui import WebDriverWait
-            from selenium.webdriver.support import expected_conditions as EC
             from webdriver_manager.chrome import ChromeDriverManager
-            from selenium.
-
-            )
-
+            from selenium.common.exceptions import WebDriverException
+
             chrome_options = Options()
             chrome_options.add_argument("--headless")
             chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-gpu")
             chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
             chrome_options.add_argument(f"user-agent={user_agent()}")
+
             if proxy:
                 chrome_options.add_argument(f"--proxy-server={proxy}")
             if disable_images:
-
-
-
+                chrome_options.add_experimental_option(
+                    "prefs", {"profile.managed_default_content_settings.images": 2}
+                )
+
+            driver_instance = None
             try:
-                # Try
-
-                # driver_pah='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/129.0.6668.100/chromedriver-mac-arm64/chromedriver'
-                # service=Service(executable_path=driver_path)
-
+                # Try with latest ChromeDriver first
                 service = Service(ChromeDriverManager().install())
-
-
-
-
-
-
-
-
-
-
+                driver_instance = webdriver.Chrome(service=service, options=chrome_options)
+
+                # Configure wait times
+                if 3 < wait < 5:
+                    wait_time = random.uniform(3, 5)
+                elif 5 <= wait < 8:
+                    wait_time = random.uniform(5, 8)
+                elif 8 <= wait < 12:
+                    wait_time = random.uniform(8, 10)
+                else:
+                    wait_time = 0
+
+                driver_instance.implicitly_wait(wait_time)
+
+                # Handle login if needed
+                if login_url and login_dict:
+                    cookies = get_cookies(url=login_url, login=login_dict)
+                    driver_instance.get(url)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+                elif cookies:
+                    driver_instance.get(url)
+                    if isinstance(cookies, str):
+                        cookies = parse_cookies(cookies)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+
+                if not javascript:
+                    driver_instance.execute_cdp_cmd(
+                        "Emulation.setScriptExecutionDisabled", {"value": True}
+                    )
+
+                # Navigate to target URL
+                driver_instance.get(url)
+
+                # Handle iframes if needed
+                if iframe_name:
+                    iframe = WebDriverWait(driver_instance, timeout).until(
+                        EC.presence_of_element_located((By.NAME, iframe_name))
                     )
+                    driver_instance.switch_to.frame(iframe)
+
+                # Scroll to trigger dynamic content
+                scroll_smth_steps(driver_instance)
+
+                # Get page source with retries
+                content = None
+                for attempt in range(scroll_try):
                     try:
-
-
-
+                        page_source = driver_instance.page_source
+                        content = BeautifulSoup(page_source, parser)
+                        if content and content.find_all(by):
+                            break
                     except Exception as e:
-
-
-
-
-
-
-
-
-
-
-                wait_ = random.uniform(8, 10)
-            else:
-                wait_ = 0
-            driver_.implicitly_wait(wait_)
-
-            if wait_until is not None and wait_until_kind is not None:
-                strategy = corr_by_kind(wait_until_kind)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((strategy, wait_until))
-                )
-            if login_url and login_dict:
-                cookies = get_cookies(url=login_url, login=login_dict)
-                driver_.get(url)
-                for cookie_name, cookie_value in cookies.items():
-                    driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            else:
-                if cookies:
-                    driver_.get(url)
-                    if isinstance(cookies, str):
-                        cookies=parse_cookies(cookies)
-                    for cookie_name, cookie_value in cookies.items():
-                        driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            if not javascript:
-                driver_.execute_cdp_cmd(
-                    "Emulation.setScriptExecutionDisabled", {"value": True}
-                )
-
-            if login_url:
-                driver_.get(login_url)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((username_by, username_field))
-                ).send_keys(username)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((password_by, password_field))
-                ).send_keys(password)
-                WebDriverWait(driver_, timeout).until(
-                    EC.element_to_be_clickable((submit_by, submit_field))
-                ).click()
-
-            driver_.get(url)
-
-            if iframe_name:
-                iframe = WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((By.NAME, iframe_name))
-                )
-                driver_.switch_to.frame(iframe)
-
-            # WebDriverWait(driver, timeout).until(
-            #     EC.presence_of_element_located((by, where))
-            # )
-
-            # # scroll down the page by a certain number of pixels
-            scroll_smth_steps(driver_)
-
-            # 设置轮询
-            for attempt in range(scroll_try):
-                page_source = driver_.page_source
-                content = BeautifulSoup(page_source, "html.parser")
-                if content and content.find_all(by):
-                    break
-                time.sleep(
-                    random.uniform(2, 4)
-                ) # Wait for a random time before polling again
-
-            driver_.quit()
-
-            # content = BeautifulSoup(page_source, "html.parser")
-            if content:
-                return "text/html", content
-            else:
-                logger.warning("Selenium could not fetch content")
+                        logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                        time.sleep(random.uniform(1, 3))
+                try:
+                    _clean_temp()
+                except Exception as e:
+                    print(e)
+                return "text/html", content if content else None
+
+            except WebDriverException as e:
+                logger.error(f"Selenium error: {e}")
                 return None, None
+            finally:
+                if driver_instance:
+                    driver_instance.quit()
+
         elif 'scr' in driver.lower():
             settings = {
                 "USER_AGENT": user_agent(),
-                "DOWNLOAD_DELAY": 1,
-                "COOKIES_ENABLED":
-                "LOG_LEVEL": "WARNING",
+                "DOWNLOAD_DELAY": 1,
+                "COOKIES_ENABLED": bool(cookies),
+                "LOG_LEVEL": "WARNING",
+                "RETRY_TIMES": retry,
+                "DOWNLOAD_TIMEOUT": timeout,
             }
-            content=fetch_scrapy(
-
-
-
-
-
-
-
-
+            content = fetch_scrapy(
+                url,
+                parser=parser,
+                cookies=cookies,
+                headers=headers,
+                settings=settings
+            )
+            return parser, content
+
+    except Exception as e:
+        logger.error(f"Unexpected error in fetch_all: {e}")
        return None, None
-
-
+
+    return None, None
+
 # # Function to change Tor IP address
 # def renew_tor_ip():
 #     with Controller.from_port(port=9051) as controller:
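
The request branch of fetch_all now sniffs the encoding with chardet before falling back to the declared response encoding, replacing undecodable bytes instead of raising. A minimal sketch of the same decode-with-fallback idea on a raw byte payload (standalone illustration using chardet's public detect() API, not the package's code):

    import chardet

    def decode_bytes(raw, declared=None):
        # chardet.detect returns {"encoding": ..., "confidence": ...}; encoding may be None.
        detected = chardet.detect(raw)
        encoding = detected.get("encoding") or declared or "utf-8"
        # errors="replace" keeps malformed byte sequences from raising.
        return raw.decode(encoding, errors="replace")

    print(decode_bytes("Grüße aus Köln".encode("latin-1")))  # decodes despite non-UTF-8 bytes
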
@@ -1050,6 +1346,8 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
     if rm_folder:
         ips.rm_folder(dir_save)
     content_type, content = fetch_all(url, driver=driver)
+    if content_type is None:
+        content_type=""
     if "html" in content_type.lower():
         # Create the directory if it doesn't exist
         os.makedirs(dir_save, exist_ok=True)
@@ -1193,7 +1491,9 @@ def fetch_selenium(
     chrome_options = Options()
     chrome_options.add_argument("--headless")
     chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-gpu")
     chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
     chrome_options.add_argument(f"user-agent={user_agent()}")
     if proxy:
         chrome_options.add_argument(f"--proxy-server={proxy}")
@@ -1299,7 +1599,7 @@ def fetch(
         condition_ = [texts.empty, booster]
     else:
         condition_ = [not texts, booster]
-    if any(condition_):
+    if any(condition_):
         print("trying to use 'fetcher2'...")
         texts = fetch_selenium(
             url=url, where=where, what=what, extend=extend, **kws
@@ -1307,6 +1607,7 @@ def fetch(
         if texts:
             return texts
         else:
+            print("got nothing")
             return fetch(
                 url,
                 where=where,