py2ls 0.2.5.9__py3-none-any.whl → 0.2.5.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/netfinder.py CHANGED
@@ -332,7 +332,6 @@ def parse_cookies(cookies_str):
         cookies_dict[cookie_name] = cookie_value
 
     return cookies_dict
-
 def fetch_scrapy(
     url,
     parser="html.parser",
@@ -358,23 +357,60 @@ def fetch_scrapy(
     from scrapy.signalmanager import dispatcher
     from scrapy import signals
     from twisted.internet import reactor, defer
+    from twisted.internet.error import ReactorNotRestartable
     import scrapy
+    import logging
+
+    # Disable Scrapy's excessive logging
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+    logging.getLogger('twisted').setLevel(logging.WARNING)
 
     # Container for scraped content
     content = []
 
+    # Define the spider class inside the function
+    class FetchSpider(scrapy.Spider):
+        name = "fetch_spider"
+
+        def __init__(self, url=None, parser=None, cookies=None, headers=None, *args, **kwargs):
+            super(FetchSpider, self).__init__(*args, **kwargs)
+            self.start_urls = [url]
+            self.parser = parser
+            self.cookies = cookies
+            self.headers = headers
+
+        def start_requests(self):
+            for url in self.start_urls:
+                yield scrapy.Request(
+                    url,
+                    cookies=self.cookies,
+                    headers=self.headers,
+                    callback=self.parse
+                )
+
+        def parse(self, response):
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(response.text, self.parser)
+            yield {
+                "content": soup,
+                "url": response.url,
+                "status": response.status
+            }
+
     # Callback function for item scraped signal
     def handle_item(item, response, spider):
-        content.append(item["content"])
+        content.append(item)
 
     # Scrapy settings
     process_settings = settings or get_project_settings()
     process_settings.update(
         {
-            "USER_AGENT": "CustomUserAgent/1.0",  # Use a custom user agent
-            "DOWNLOAD_DELAY": 1,  # Prevent overloading servers
+            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "DOWNLOAD_DELAY": 1,
             "COOKIES_ENABLED": bool(cookies),
-            "LOG_LEVEL": "ERROR",  # Minimize log verbosity
+            "LOG_LEVEL": "ERROR",
+            "RETRY_ENABLED": False,
+            "HTTPERROR_ALLOW_ALL": True,
         }
     )
 
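The FetchSpider introduced in this hunk only yields items; something must collect them, which is the job of handle_item once it is connected to Scrapy's item_scraped signal. The connect call itself sits in fetch_scrapy's unchanged lines, so the following is only a minimal sketch of the pattern, with CrawlerProcess standing in for whatever runner the module actually uses:

    from scrapy import signals
    from scrapy.crawler import CrawlerProcess
    from scrapy.signalmanager import dispatcher

    content = []

    def handle_item(item, response, spider):
        # Collect the dict yielded by FetchSpider.parse
        content.append(item)

    dispatcher.connect(handle_item, signal=signals.item_scraped)
    process = CrawlerProcess(settings={"LOG_LEVEL": "ERROR"})
    process.crawl(FetchSpider, url="https://example.com", parser="html.parser")
    process.start()  # blocks until the crawl finishes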
@@ -394,26 +430,292 @@ def fetch_scrapy(
         )
         reactor.stop()
 
-    # Start the reactor if not already running
-    if not reactor.running:
-        crawl()
-        reactor.run()  # Blocks until the crawl finishes
-    else:
-        # Run the crawl if the reactor is already running
-        d = crawl()
-        d.addBoth(lambda _: reactor.stop())
+    # Handle reactor execution
+    try:
+        if not reactor.running:
+            crawl()
+            reactor.run(installSignalHandlers=0)
+        else:
+            # This case is problematic - reactor can't be restarted
+            raise RuntimeError("Reactor already running. Cannot run multiple crawls in same process.")
+    except ReactorNotRestartable:
+        raise RuntimeError("Scrapy reactor cannot be restarted. Create a new process for additional crawls.")
 
     # Return the first scraped content or None if empty
     return content[0] if content else None
-
-
+
+# def fetch_all(
+#     url,
+#     parser="lxml",
+#     driver="request", # request or selenium
+#     by=By.TAG_NAME,
+#     timeout=10,
+#     retry=2,
+#     wait=0,
+#     wait_until=None,
+#     wait_until_kind=None,
+#     scroll_try=3,
+#     login_url=None,
+#     username=None,
+#     password=None,
+#     username_field="username",
+#     password_field="password",
+#     submit_field="submit",
+#     username_by=By.NAME,
+#     password_by=By.NAME,
+#     submit_by=By.NAME,
+#     # capability='eager', # eager or none
+#     proxy=None, # Add proxy parameter
+#     javascript=True, # Add JavaScript option
+#     disable_images=False, # Add option to disable images
+#     iframe_name=None,
+#     login_dict=None,
+#     cookies=None, # Add cookies parameter
+# ): # Add option to handle iframe): # lxml is faster, # parser="html.parser"
+#     try:
+#         # # Generate a random user-agent string
+#         # response = requests.get(url)
+#         # # get cookies
+#         # cookie=dict_from_cookiejar(response.cookies)
+#         # # get token from cookies
+#         # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
+#         # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
+
+#         headers = {"User-Agent": user_agent()}
+#         if isinstance(driver, int):
+#             drivers=["request", "selenium","scrapy"]
+#             driver=drivers[driver]
+#         if "req" in driver.lower():
+#             # response = requests.get(
+#             #     url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
+#             # )
+
+#             # Handle cookies for requests
+#             if cookies:
+#                 from requests.cookies import RequestsCookieJar
+#                 cookie_jar = RequestsCookieJar()
+#                 if isinstance(cookies, str):
+#                     cookies=parse_cookies(cookies)
+#                 for cookie_name, cookie_value in cookies.items():
+#                     cookie_jar.set(cookie_name, cookie_value)
+#                 response = requests.get(
+#                     url, headers=headers, cookies=cookie_jar, timeout=timeout, stream=True
+#                 )
+#             else:
+#                 response = requests.get(url, headers=headers, timeout=timeout, stream=True)

+#             # If the response is a redirect, follow it
+#             while response.is_redirect:
+#                 logger.info(f"Redirecting to: {response.headers['Location']}")
+#                 response = requests.get(
+#                     response.headers["Location"],
+#                     headers=headers,
+#                     proxies=proxies_glob,
+#                     timeout=timeout,
+#                     stream=True,
+#                 )
+#             # Check for a 403 error
+#             if response.status_code == 403:
+#                 logger.warning("403 Forbidden error. Retrying...")
+#                 # Retry the request after a short delay
+#                 time.sleep(random.uniform(1, 3))
+#                 response = requests.get(
+#                     url, headers=headers, proxies=proxies_glob, timeout=timeout, stream=True
+#                 )
+#                 # Raise an error if retry also fails
+#                 response.raise_for_status()

+#             # Raise an error for other HTTP status codes
+#             response.raise_for_status()

+#             # Get the content type
+#             content_type = (
+#                 response.headers.get("content-type", "").split(";")[0].lower()
+#             )
+#             if response.encoding:
+#                 content = response.content.decode(response.encoding)
+#             else:
+#                 content = None
+#             # logger.info(f"Content type: {content_type}")

+#             # Check if content type is supported
+#             if content_type in CONTENT_PARSERS and content:
+#                 return content_type, CONTENT_PARSERS[content_type](content, parser)
+#             else:
+#                 logger.warning("Unsupported content type")
+#                 return None, None
+#         elif "se" in driver.lower():
+#             import random
+#             from selenium import webdriver
+#             from selenium.webdriver.chrome.service import Service
+#             from selenium.webdriver.common.by import By
+#             from selenium.webdriver.chrome.options import Options
+#             from selenium.webdriver.support.ui import WebDriverWait
+#             from selenium.webdriver.support import expected_conditions as EC
+#             from webdriver_manager.chrome import ChromeDriverManager
+#             from selenium.webdriver.common.desired_capabilities import (
+#                 DesiredCapabilities,
+#             )

+#             chrome_options = Options()
+#             chrome_options.add_argument("--headless")
+#             chrome_options.add_argument("--no-sandbox")
+#             chrome_options.add_argument("--disable-dev-shm-usage")
+#             chrome_options.add_argument(f"user-agent={user_agent()}")
+#             if proxy:
+#                 chrome_options.add_argument(f"--proxy-server={proxy}")
+#             if disable_images:
+#                 prefs = {"profile.managed_default_content_settings.images": 2}
+#                 chrome_options.add_experimental_option("prefs", prefs)
+#             # chrome_options.page_load_strategy = capability
+#             try:
+#                 # Try to install ChromeDriver using webdriver-manager

+#                 # driver_pah='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/129.0.6668.100/chromedriver-mac-arm64/chromedriver'
+#                 # service=Service(executable_path=driver_path)

+#                 service = Service(ChromeDriverManager().install())
+#                 driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#             except Exception as e:
+#                 print(f"Error occurred: {e}")
+#                 print("Attempting to reinstall webdriver-manager...")
+#                 try:
+#                     service = Service(ChromeDriverManager().install())
+#                     driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#                 except Exception as reinstall_error:
+#                     print(
+#                         f"Reinstallation failed: {reinstall_error}\nA similar problem occurred before and was solved after updating webdriver-manager"
+#                     )
+#                     try:
+#                         ips.upgrade("webdriver-manager", uninstall=True)
+#                         service = Service(ChromeDriverManager().install())
+#                         driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#                     except Exception as e:
+#                         print(
+#                             f"Reinstallation failed: {reinstall_error}\nA similar problem occurred before, but updating 'webdriver-manager' still did not solve it"
+#                         )

+#             # Implicit wait
+#             if 3 < wait < 5:
+#                 wait_ = random.uniform(3, 5)
+#             elif 5 <= wait < 8:
+#                 wait_ = random.uniform(5, 8)
+#             elif 8 <= wait < 12:
+#                 wait_ = random.uniform(8, 10)
+#             else:
+#                 wait_ = 0
+#             driver_.implicitly_wait(wait_)

+#             if wait_until is not None and wait_until_kind is not None:
+#                 strategy = corr_by_kind(wait_until_kind)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((strategy, wait_until))
+#                 )
+#             if login_url and login_dict:
+#                 cookies = get_cookies(url=login_url, login=login_dict)
+#                 driver_.get(url)
+#                 for cookie_name, cookie_value in cookies.items():
+#                     driver_.add_cookie({"name": cookie_name, "value": cookie_value})
+#             else:
+#                 if cookies:
+#                     driver_.get(url)
+#                     if isinstance(cookies, str):
+#                         cookies=parse_cookies(cookies)
+#                     for cookie_name, cookie_value in cookies.items():
+#                         driver_.add_cookie({"name": cookie_name, "value": cookie_value})
+#             if not javascript:
+#                 driver_.execute_cdp_cmd(
+#                     "Emulation.setScriptExecutionDisabled", {"value": True}
+#                 )

+#             if login_url:
+#                 driver_.get(login_url)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((username_by, username_field))
+#                 ).send_keys(username)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((password_by, password_field))
+#                 ).send_keys(password)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.element_to_be_clickable((submit_by, submit_field))
+#                 ).click()

+#             driver_.get(url)

+#             if iframe_name:
+#                 iframe = WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((By.NAME, iframe_name))
+#                 )
+#                 driver_.switch_to.frame(iframe)

+#             # WebDriverWait(driver, timeout).until(
+#             #     EC.presence_of_element_located((by, where))
+#             # )

+#             # # scroll down the page by a certain number of pixels
+#             scroll_smth_steps(driver_)

+#             # Set up polling
+#             for attempt in range(scroll_try):
+#                 page_source = driver_.page_source
+#                 content = BeautifulSoup(page_source, "html.parser")
+#                 if content and content.find_all(by):
+#                     break
+#                 time.sleep(
+#                     random.uniform(2, 4)
+#                 ) # Wait for a random time before polling again

+#             driver_.quit()

+#             # content = BeautifulSoup(page_source, "html.parser")
+#             if content:
+#                 return "text/html", content
+#             else:
+#                 logger.warning("Selenium could not fetch content")
+#                 return None, None
+#         elif 'scr' in driver.lower():
+#             settings = {
+#                 "USER_AGENT": user_agent(),
+#                 "DOWNLOAD_DELAY": 1, # Prevent overloading the server
+#                 "COOKIES_ENABLED": True if cookies else False,
+#                 "LOG_LEVEL": "WARNING", # Reduce log verbosity
+#             }
+#             content=fetch_scrapy(url,
+#                                  parser=parser,
+#                                  cookies=cookies,
+#                                  headers=headers,
+#                                  settings=settings)
+#             return parser, content

+#     except requests.RequestException as e:
+#         logger.error(f"Error fetching URL '{url}': {e}")
+#         return None, None

+def _clean_temp():
+    import os
+    import shutil
+    import tempfile
+    from pathlib import Path
+
+    # Get the parent folder of the tempdir
+    temp_dir = Path(tempfile.gettempdir()).parent # moves from /T to parent dir
+
+    for subdir in temp_dir.iterdir():
+        if subdir.is_dir():
+            for d in subdir.iterdir():
+                if "com.google.Chrome.code_sign_clone" in d.name:
+                    try:
+                        print(f"Removing: {d}")
+                        shutil.rmtree(d)
+                    except Exception as e:
+                        print(f"Error removing {d}: {e}")
 def fetch_all(
     url,
     parser="lxml",
     driver="request",  # request or selenium
     by=By.TAG_NAME,
     timeout=10,
-    retry=2,
+    retry=3,  # Increased default retries
     wait=0,
     wait_until=None,
     wait_until_kind=None,
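Twisted's reactor is one-shot: reactor.run() cannot be called twice in the same interpreter, which is exactly what the new ReactorNotRestartable handling above surfaces instead of failing obscurely. Its error message suggests a new process per crawl; a minimal sketch of that workaround (fetch_scrapy_once is a hypothetical helper, not part of the package):

    import multiprocessing

    def _crawl_worker(url, queue):
        from py2ls.netfinder import fetch_scrapy
        item = fetch_scrapy(url, parser="html.parser")
        # Queue transport pickles the payload, so send the raw HTML string,
        # not the BeautifulSoup object inside the item dict.
        queue.put(str(item["content"]) if item else None)

    def fetch_scrapy_once(url):
        queue = multiprocessing.Queue()
        worker = multiprocessing.Process(target=_crawl_worker, args=(url, queue))
        worker.start()
        html = queue.get()  # blocks until the crawl finishes
        worker.join()
        return html

Each call spins up a fresh interpreter with a fresh reactor, so repeated crawls no longer trip the RuntimeError raised above.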
@@ -427,231 +729,218 @@ def fetch_all(
     username_by=By.NAME,
     password_by=By.NAME,
     submit_by=By.NAME,
-    # capability='eager', # eager or none
-    proxy=None,  # Add proxy parameter
-    javascript=True,  # Add JavaScript option
-    disable_images=False,  # Add option to disable images
+    proxy=None,
+    javascript=True,
+    disable_images=False,
     iframe_name=None,
     login_dict=None,
-    cookies=None,  # Add cookies parameter
-):  # Add option to handle iframe): # lxml is faster, # parser="html.parser"
-    try:
-        # # Generate a random user-agent string
-        # response = requests.get(url)
-        # # get cookies
-        # cookie=dict_from_cookiejar(response.cookies)
-        # # get token from cookies
-        # scrf_token=re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
-        # headers = {"User-Agent": user_agent(), "X-CSRF-Token":scrf_token}
-
-        headers = {"User-Agent": user_agent()}
-        if "req" in driver.lower():
-            # response = requests.get(
-            #     url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
-            # )
-
-            # Handle cookies for requests
-            if cookies:
-                from requests.cookies import RequestsCookieJar
-                cookie_jar = RequestsCookieJar()
-                if isinstance(cookies, str):
-                    cookies=parse_cookies(cookies)
-                for cookie_name, cookie_value in cookies.items():
-                    cookie_jar.set(cookie_name, cookie_value)
-                response = requests.get(
-                    url, headers=headers, cookies=cookie_jar, timeout=timeout, stream=True
-                )
-            else:
-                response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+    cookies=None,
+    verify_ssl=True,  # Added SSL verification option
+    follow_redirects=True,  # Added redirect control
+):
+    """
+    Enhanced fetch function with better error handling and reliability.
+
+    Returns:
+        tuple: (content_type, parsed_content) or (None, None) on failure
+    """
+    def _parse_content(content, content_type, parser):
+        """Helper function to parse content with fallback"""
+        try:
+            if content_type in CONTENT_PARSERS:
+                return CONTENT_PARSERS[content_type](content, parser)
+
+            # Fallback parsing attempts
+            if content_type.startswith('text/'):
+                try:
+                    return BeautifulSoup(content, parser)
+                except:
+                    return content
+            return content
+        except Exception as e:
+            logger.warning(f"Content parsing failed: {e}")
+            return content
 
467
- # If the response is a redirect, follow it
468
- while response.is_redirect:
469
- logger.info(f"Redirecting to: {response.headers['Location']}")
764
+ def _make_request(url, headers, cookies, timeout, verify_ssl, follow_redirects):
765
+ """Helper function for HTTP requests with retries"""
766
+ for attempt in range(retry):
767
+ try:
470
768
  response = requests.get(
471
- response.headers["Location"],
769
+ url,
472
770
  headers=headers,
473
- proxies=proxies_glob,
771
+ cookies=cookies,
474
772
  timeout=timeout,
475
773
  stream=True,
774
+ verify=verify_ssl,
775
+ allow_redirects=follow_redirects
476
776
  )
477
- # Check for a 403 error
478
- if response.status_code == 403:
479
- logger.warning("403 Forbidden error. Retrying...")
480
- # Retry the request after a short delay
481
- time.sleep(random.uniform(1, 3))
482
- response = requests.get(
483
- url, headers=headers, proxies=proxies_glob, timeout=timeout, stream=True
484
- )
485
- # Raise an error if retry also fails
777
+
778
+ # Handle redirects manually if needed
779
+ if not follow_redirects and response.is_redirect:
780
+ logger.info(f"Redirect detected to: {response.headers['Location']}")
781
+ return None, None
782
+
486
783
  response.raise_for_status()
487
-
488
- # Raise an error for other HTTP status codes
489
- response.raise_for_status()
490
-
491
- # Get the content type
492
- content_type = (
493
- response.headers.get("content-type", "").split(";")[0].lower()
784
+ return response, None
785
+
786
+ except requests.RequestException as e:
787
+ logger.warning(f"Attempt {attempt + 1} failed: {e}")
788
+ if attempt == retry - 1:
789
+ return None, e
790
+ time.sleep(random.uniform(1, 3))
791
+
792
+ # Convert driver integer to string if needed
793
+ if isinstance(driver, int):
794
+ drivers = ["request", "selenium", "scrapy"]
795
+ try:
796
+ driver = drivers[driver]
797
+ except IndexError:
798
+ driver = "request"
799
+
800
+ headers = {"User-Agent": user_agent()}
801
+
802
+ # Prepare cookies
803
+ cookie_jar = None
804
+ if cookies:
805
+ from requests.cookies import RequestsCookieJar
806
+ cookie_jar = RequestsCookieJar()
807
+ if isinstance(cookies, str):
808
+ cookies = parse_cookies(cookies)
809
+ for name, value in cookies.items():
810
+ cookie_jar.set(name, value)
811
+
812
+ try:
813
+ if "req" in driver.lower():
814
+ response, error = _make_request(
815
+ url, headers, cookie_jar, timeout, verify_ssl, follow_redirects
494
816
  )
495
- if response.encoding:
496
- content = response.content.decode(response.encoding)
497
- else:
498
- content = None
499
- # logger.info(f"Content type: {content_type}")
500
-
501
- # Check if content type is supported
502
- if content_type in CONTENT_PARSERS and content:
503
- return content_type, CONTENT_PARSERS[content_type](content, parser)
504
- else:
505
- logger.warning("Unsupported content type")
817
+ if error:
506
818
  return None, None
819
+
820
+ content_type = response.headers.get("content-type", "").split(";")[0].lower()
821
+ content = response.content.decode(response.encoding or 'utf-8', errors='replace')
822
+
823
+ return content_type, _parse_content(content, content_type, parser)
824
+
         elif "se" in driver.lower():
-            import random
             from selenium import webdriver
             from selenium.webdriver.chrome.service import Service
-            from selenium.webdriver.common.by import By
             from selenium.webdriver.chrome.options import Options
-            from selenium.webdriver.support.ui import WebDriverWait
-            from selenium.webdriver.support import expected_conditions as EC
             from webdriver_manager.chrome import ChromeDriverManager
-            from selenium.webdriver.common.desired_capabilities import (
-                DesiredCapabilities,
-            )
-
+            from selenium.common.exceptions import WebDriverException
+
             chrome_options = Options()
             chrome_options.add_argument("--headless")
             chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-gpu")
             chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
            chrome_options.add_argument(f"user-agent={user_agent()}")
+
             if proxy:
                 chrome_options.add_argument(f"--proxy-server={proxy}")
             if disable_images:
-                prefs = {"profile.managed_default_content_settings.images": 2}
-                chrome_options.add_experimental_option("prefs", prefs)
-            # chrome_options.page_load_strategy = capability
+                chrome_options.add_experimental_option(
+                    "prefs", {"profile.managed_default_content_settings.images": 2}
+                )
+
+            driver_instance = None
             try:
-                # Try to install ChromeDriver using webdriver-manager
-
-                # driver_pah='/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/129.0.6668.100/chromedriver-mac-arm64/chromedriver'
-                # service=Service(executable_path=driver_path)
-
+                # Try with latest ChromeDriver first
                 service = Service(ChromeDriverManager().install())
-                driver_ = webdriver.Chrome(service=service, options=chrome_options)
-            except Exception as e:
-                print(f"Error occurred: {e}")
-                print("Attempting to reinstall webdriver-manager...")
-                try:
-                    service = Service(ChromeDriverManager().install())
-                    driver_ = webdriver.Chrome(service=service, options=chrome_options)
-                except Exception as reinstall_error:
-                    print(
-                        f"Reinstallation failed: {reinstall_error}\nA similar problem occurred before and was solved after updating webdriver-manager"
+                driver_instance = webdriver.Chrome(service=service, options=chrome_options)
+
+                # Configure wait times
+                if 3 < wait < 5:
+                    wait_time = random.uniform(3, 5)
+                elif 5 <= wait < 8:
+                    wait_time = random.uniform(5, 8)
+                elif 8 <= wait < 12:
+                    wait_time = random.uniform(8, 10)
+                else:
+                    wait_time = 0
+
+                driver_instance.implicitly_wait(wait_time)
+
+                # Handle login if needed
+                if login_url and login_dict:
+                    cookies = get_cookies(url=login_url, login=login_dict)
+                    driver_instance.get(url)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+                elif cookies:
+                    driver_instance.get(url)
+                    if isinstance(cookies, str):
+                        cookies = parse_cookies(cookies)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+
+                if not javascript:
+                    driver_instance.execute_cdp_cmd(
+                        "Emulation.setScriptExecutionDisabled", {"value": True}
+                    )
+
+                # Navigate to target URL
+                driver_instance.get(url)
+
+                # Handle iframes if needed
+                if iframe_name:
+                    iframe = WebDriverWait(driver_instance, timeout).until(
+                        EC.presence_of_element_located((By.NAME, iframe_name))
                     )
+                    driver_instance.switch_to.frame(iframe)
+
+                # Scroll to trigger dynamic content
+                scroll_smth_steps(driver_instance)
+
+                # Get page source with retries
+                content = None
+                for attempt in range(scroll_try):
                     try:
-                        ips.upgrade("webdriver-manager", uninstall=True)
-                        service = Service(ChromeDriverManager().install())
-                        driver_ = webdriver.Chrome(service=service, options=chrome_options)
+                        page_source = driver_instance.page_source
+                        content = BeautifulSoup(page_source, parser)
+                        if content and content.find_all(by):
+                            break
                     except Exception as e:
-                        print(
-                            f"Reinstallation failed: {reinstall_error}\nA similar problem occurred before, but updating 'webdriver-manager' still did not solve it"
-                        )
-
-            # Implicit wait
-            if 3 < wait < 5:
-                wait_ = random.uniform(3, 5)
-            elif 5 <= wait < 8:
-                wait_ = random.uniform(5, 8)
-            elif 8 <= wait < 12:
-                wait_ = random.uniform(8, 10)
-            else:
-                wait_ = 0
-            driver_.implicitly_wait(wait_)
-
-            if wait_until is not None and wait_until_kind is not None:
-                strategy = corr_by_kind(wait_until_kind)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((strategy, wait_until))
-                )
-            if login_url and login_dict:
-                cookies = get_cookies(url=login_url, login=login_dict)
-                driver_.get(url)
-                for cookie_name, cookie_value in cookies.items():
-                    driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            else:
-                if cookies:
-                    driver_.get(url)
-                    if isinstance(cookies, str):
-                        cookies=parse_cookies(cookies)
-                    for cookie_name, cookie_value in cookies.items():
-                        driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            if not javascript:
-                driver_.execute_cdp_cmd(
-                    "Emulation.setScriptExecutionDisabled", {"value": True}
-                )
-
-            if login_url:
-                driver_.get(login_url)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((username_by, username_field))
-                ).send_keys(username)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((password_by, password_field))
-                ).send_keys(password)
-                WebDriverWait(driver_, timeout).until(
-                    EC.element_to_be_clickable((submit_by, submit_field))
-                ).click()
-
-            driver_.get(url)
-
-            if iframe_name:
-                iframe = WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((By.NAME, iframe_name))
-                )
-                driver_.switch_to.frame(iframe)
-
-            # WebDriverWait(driver, timeout).until(
-            #     EC.presence_of_element_located((by, where))
-            # )
-
-            # # scroll down the page by a certain number of pixels
-            scroll_smth_steps(driver_)
-
-            # Set up polling
-            for attempt in range(scroll_try):
-                page_source = driver_.page_source
-                content = BeautifulSoup(page_source, "html.parser")
-                if content and content.find_all(by):
-                    break
-                time.sleep(
-                    random.uniform(2, 4)
-                )  # Wait for a random time before polling again
-
-            driver_.quit()
-
-            # content = BeautifulSoup(page_source, "html.parser")
-            if content:
-                return "text/html", content
-            else:
-                logger.warning("Selenium could not fetch content")
+                        logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                        time.sleep(random.uniform(1, 3))
+                try:
+                    _clean_temp()
+                except Exception as e:
+                    print(e)
+                return "text/html", content if content else None
+
+            except WebDriverException as e:
+                logger.error(f"Selenium error: {e}")
                 return None, None
+            finally:
+                if driver_instance:
+                    driver_instance.quit()
+
         elif 'scr' in driver.lower():
             settings = {
                 "USER_AGENT": user_agent(),
-                "DOWNLOAD_DELAY": 1,  # Prevent overloading the server
-                "COOKIES_ENABLED": True if cookies else False,
-                "LOG_LEVEL": "WARNING",  # Reduce log verbosity
+                "DOWNLOAD_DELAY": 1,
+                "COOKIES_ENABLED": bool(cookies),
+                "LOG_LEVEL": "WARNING",
+                "RETRY_TIMES": retry,
+                "DOWNLOAD_TIMEOUT": timeout,
             }
-            content=fetch_scrapy(url,
-                                 parser=parser,
-                                 cookies=cookies,
-                                 headers=headers,
-                                 settings=settings)
-            return parser, content
-
-    except requests.RequestException as e:
-        logger.error(f"Error fetching URL '{url}': {e}")
+            content = fetch_scrapy(
+                url,
+                parser=parser,
+                cookies=cookies,
+                headers=headers,
+                settings=settings
+            )
+            return parser, content
+
+    except Exception as e:
+        logger.error(f"Unexpected error in fetch_all: {e}")
         return None, None
-
-
+
+    return None, None
+
 # # Function to change Tor IP address
 # def renew_tor_ip():
 #     with Controller.from_port(port=9051) as controller:
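The rewritten fetch_all centralizes retries in _make_request and exposes two new knobs, verify_ssl and follow_redirects. A hedged usage sketch (the URL and argument values are placeholders, not from the package):

    from py2ls.netfinder import fetch_all

    content_type, content = fetch_all(
        "https://example.com",
        driver="request",        # or an index: 0=request, 1=selenium, 2=scrapy
        retry=3,
        verify_ssl=False,        # new: tolerate self-signed certificates
        follow_redirects=False,  # new: report redirects instead of following them
    )
    if content_type is None:
        print("fetch failed after retries")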
@@ -1050,6 +1339,8 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
     if rm_folder:
         ips.rm_folder(dir_save)
     content_type, content = fetch_all(url, driver=driver)
+    if content_type is None:
+        content_type=""
     if "html" in content_type.lower():
         # Create the directory if it doesn't exist
         os.makedirs(dir_save, exist_ok=True)
@@ -1193,7 +1484,9 @@ def fetch_selenium(
     chrome_options = Options()
     chrome_options.add_argument("--headless")
     chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-gpu")
     chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
     chrome_options.add_argument(f"user-agent={user_agent()}")
     if proxy:
         chrome_options.add_argument(f"--proxy-server={proxy}")
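Both fetch_all and fetch_selenium now bake in the same fixed --user-data-dir (~/selenium_profile). Chrome refuses to share one profile directory across concurrent instances, so parallel calls can fail with "user data directory is already in use". A hedged alternative sketch (tempfile usage is an assumption, not part of the package):

    import tempfile

    # One throwaway profile per run; safe for concurrent headless Chromes.
    profile_dir = tempfile.mkdtemp(prefix="selenium_profile_")
    chrome_options.add_argument(f"--user-data-dir={profile_dir}")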
@@ -1299,7 +1592,7 @@ def fetch(
             condition_ = [texts.empty, booster]
         else:
             condition_ = [not texts, booster]
-        if any(condition_):
+        if any(condition_):
             print("trying to use 'fetcher2'...")
             texts = fetch_selenium(
                 url=url, where=where, what=what, extend=extend, **kws
@@ -1307,6 +1600,7 @@ def fetch(
         if texts:
             return texts
         else:
+            print("got nothing")
             return fetch(
                 url,
                 where=where,