py2ls 0.2.5.10__py3-none-any.whl → 0.2.5.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/netfinder.py CHANGED
@@ -1,7 +1,8 @@
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 import scrapy
 import requests
 import os
+import chardet
 import pandas as pd
 import logging
 import json
@@ -116,8 +117,8 @@ def extract_text_from_content(

     def extract_text(element):
         texts = ""
-        if isinstance(element, str) and element.strip():
-            texts += element.strip()
+        if isinstance(element, NavigableString) and element.strip():
+            texts += element.strip() + " "
         elif hasattr(element, "children"):
             for child in element.children:
                 texts += extract_text(child)
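
The change above is subtle but meaningful: `NavigableString` is BeautifulSoup's type for text nodes (a `str` subclass), so the new `isinstance` check targets exactly the nodes that carry text, and the appended `" "` keeps text from adjacent nodes from running together. A minimal sketch of the new behavior; the sample HTML is ours, not the package's:

```python
from bs4 import BeautifulSoup, NavigableString

def extract_text(element):
    texts = ""
    if isinstance(element, NavigableString) and element.strip():
        texts += element.strip() + " "  # trailing space keeps words apart
    elif hasattr(element, "children"):
        for child in element.children:
            texts += extract_text(child)
    return texts

soup = BeautifulSoup("<p>Hello <b>world</b>!</p>", "html.parser")
print(extract_text(soup))  # -> "Hello world ! "
```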
@@ -172,6 +173,8 @@ def extract_text_from_content(
         texts = ""
         for tag in result_set:
             texts = texts + " " + extract_text(tag) + " \n"
+            # texts = texts + " " + tag.get_text(" ", strip=True) + " \n"
+
         text_list = [tx.strip() for tx in texts.split(" \n") if tx.strip()]
         return text_list
     else:
@@ -332,7 +335,6 @@ def parse_cookies(cookies_str):
            cookies_dict[cookie_name] = cookie_value

    return cookies_dict
-
 def fetch_scrapy(
     url,
     parser="html.parser",
@@ -358,23 +360,60 @@ def fetch_scrapy(
     from scrapy.signalmanager import dispatcher
     from scrapy import signals
     from twisted.internet import reactor, defer
+    from twisted.internet.error import ReactorNotRestartable
     import scrapy
+    import logging
+
+    # Disable Scrapy's excessive logging
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+    logging.getLogger('twisted').setLevel(logging.WARNING)

     # Container for scraped content
     content = []

+    # Define the spider class inside the function
+    class FetchSpider(scrapy.Spider):
+        name = "fetch_spider"
+
+        def __init__(self, url=None, parser=None, cookies=None, headers=None, *args, **kwargs):
+            super(FetchSpider, self).__init__(*args, **kwargs)
+            self.start_urls = [url]
+            self.parser = parser
+            self.cookies = cookies
+            self.headers = headers
+
+        def start_requests(self):
+            for url in self.start_urls:
+                yield scrapy.Request(
+                    url,
+                    cookies=self.cookies,
+                    headers=self.headers,
+                    callback=self.parse
+                )
+
+        def parse(self, response):
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(response.text, self.parser)
+            yield {
+                "content": soup,
+                "url": response.url,
+                "status": response.status
+            }
+
     # Callback function for item scraped signal
     def handle_item(item, response, spider):
-        content.append(item["content"])
+        content.append(item)

     # Scrapy settings
     process_settings = settings or get_project_settings()
     process_settings.update(
         {
-            "USER_AGENT": "CustomUserAgent/1.0",  # Use a custom user agent
-            "DOWNLOAD_DELAY": 1,  # Prevent overloading servers
+            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "DOWNLOAD_DELAY": 1,
             "COOKIES_ENABLED": bool(cookies),
-            "LOG_LEVEL": "ERROR",  # Minimize log verbosity
+            "LOG_LEVEL": "ERROR",
+            "RETRY_ENABLED": False,
+            "HTTPERROR_ALLOW_ALL": True,
         }
     )

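Two things changed in this hunk: the spider now yields a whole dict (`content`, `url`, `status`), and `handle_item` stores that dict rather than just `item["content"]`, so callers also get the final URL and HTTP status. The `dispatcher.connect(...)` wiring itself sits outside this hunk; a self-contained sketch of the same collect-items-via-`item_scraped` pattern (the spider, URL, and settings below are illustrative, not the package's):

```python
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.signalmanager import dispatcher

content = []  # collects every item any spider yields

def handle_item(item, response, spider):
    content.append(item)  # keep the full dict, as the new code does

class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield {"url": response.url, "status": response.status}

dispatcher.connect(handle_item, signal=signals.item_scraped)
process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
process.crawl(DemoSpider)
process.start()  # blocks until the crawl finishes; the reactor cannot be restarted
print(content)   # e.g. [{'url': 'https://example.com', 'status': 200}]
```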
@@ -394,26 +433,292 @@ def fetch_scrapy(
         )
         reactor.stop()

-    # Start the reactor if not already running
-    if not reactor.running:
-        crawl()
-        reactor.run()  # Blocks until the crawl finishes
-    else:
-        # Run the crawl if the reactor is already running
-        d = crawl()
-        d.addBoth(lambda _: reactor.stop())
+    # Handle reactor execution
+    try:
+        if not reactor.running:
+            crawl()
+            reactor.run(installSignalHandlers=0)
+        else:
+            # This case is problematic - reactor can't be restarted
+            raise RuntimeError("Reactor already running. Cannot run multiple crawls in same process.")
+    except ReactorNotRestartable:
+        raise RuntimeError("Scrapy reactor cannot be restarted. Create a new process for additional crawls.")

     # Return the first scraped content or None if empty
     return content[0] if content else None
-
-
+
+# def fetch_all(
+#     url,
+#     parser="lxml",
+#     driver="request",  # request or selenium
+#     by=By.TAG_NAME,
+#     timeout=10,
+#     retry=2,
+#     wait=0,
+#     wait_until=None,
+#     wait_until_kind=None,
+#     scroll_try=3,
+#     login_url=None,
+#     username=None,
+#     password=None,
+#     username_field="username",
+#     password_field="password",
+#     submit_field="submit",
+#     username_by=By.NAME,
+#     password_by=By.NAME,
+#     submit_by=By.NAME,
+#     # capability='eager',  # eager or none
+#     proxy=None,  # Add proxy parameter
+#     javascript=True,  # Add JavaScript option
+#     disable_images=False,  # Add option to disable images
+#     iframe_name=None,
+#     login_dict=None,
+#     cookies=None,  # Add cookies parameter
+# ):  # handle iframe option; lxml is faster than "html.parser"
+#     try:
+#         # # Generate a random user-agent string
+#         # response = requests.get(url)
+#         # # get cookies
+#         # cookie = dict_from_cookiejar(response.cookies)
+#         # # get token from cookies
+#         # scrf_token = re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
+#         # headers = {"User-Agent": user_agent(), "X-CSRF-Token": scrf_token}
+
+#         headers = {"User-Agent": user_agent()}
+#         if isinstance(driver, int):
+#             drivers = ["request", "selenium", "scrapy"]
+#             driver = drivers[driver]
+#         if "req" in driver.lower():
+#             # response = requests.get(
+#             #     url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
+#             # )
+
+#             # Handle cookies for requests
+#             if cookies:
+#                 from requests.cookies import RequestsCookieJar
+#                 cookie_jar = RequestsCookieJar()
+#                 if isinstance(cookies, str):
+#                     cookies = parse_cookies(cookies)
+#                 for cookie_name, cookie_value in cookies.items():
+#                     cookie_jar.set(cookie_name, cookie_value)
+#                 response = requests.get(
+#                     url, headers=headers, cookies=cookie_jar, timeout=timeout, stream=True
+#                 )
+#             else:
+#                 response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+
+#             # If the response is a redirect, follow it
+#             while response.is_redirect:
+#                 logger.info(f"Redirecting to: {response.headers['Location']}")
+#                 response = requests.get(
+#                     response.headers["Location"],
+#                     headers=headers,
+#                     proxies=proxies_glob,
+#                     timeout=timeout,
+#                     stream=True,
+#                 )
+#             # Check for a 403 error
+#             if response.status_code == 403:
+#                 logger.warning("403 Forbidden error. Retrying...")
+#                 # Retry the request after a short delay
+#                 time.sleep(random.uniform(1, 3))
+#                 response = requests.get(
+#                     url, headers=headers, proxies=proxies_glob, timeout=timeout, stream=True
+#                 )
+#                 # Raise an error if retry also fails
+#                 response.raise_for_status()
+
+#             # Raise an error for other HTTP status codes
+#             response.raise_for_status()
+
+#             # Get the content type
+#             content_type = (
+#                 response.headers.get("content-type", "").split(";")[0].lower()
+#             )
+#             if response.encoding:
+#                 content = response.content.decode(response.encoding)
+#             else:
+#                 content = None
+#             # logger.info(f"Content type: {content_type}")
+
+#             # Check if content type is supported
+#             if content_type in CONTENT_PARSERS and content:
+#                 return content_type, CONTENT_PARSERS[content_type](content, parser)
+#             else:
+#                 logger.warning("Unsupported content type")
+#                 return None, None
+#         elif "se" in driver.lower():
+#             import random
+#             from selenium import webdriver
+#             from selenium.webdriver.chrome.service import Service
+#             from selenium.webdriver.common.by import By
+#             from selenium.webdriver.chrome.options import Options
+#             from selenium.webdriver.support.ui import WebDriverWait
+#             from selenium.webdriver.support import expected_conditions as EC
+#             from webdriver_manager.chrome import ChromeDriverManager
+#             from selenium.webdriver.common.desired_capabilities import (
+#                 DesiredCapabilities,
+#             )
+
+#             chrome_options = Options()
+#             chrome_options.add_argument("--headless")
+#             chrome_options.add_argument("--no-sandbox")
+#             chrome_options.add_argument("--disable-dev-shm-usage")
+#             chrome_options.add_argument(f"user-agent={user_agent()}")
+#             if proxy:
+#                 chrome_options.add_argument(f"--proxy-server={proxy}")
+#             if disable_images:
+#                 prefs = {"profile.managed_default_content_settings.images": 2}
+#                 chrome_options.add_experimental_option("prefs", prefs)
+#             # chrome_options.page_load_strategy = capability
+#             try:
+#                 # Try to install ChromeDriver using webdriver-manager
+
+#                 # driver_path = '/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/129.0.6668.100/chromedriver-mac-arm64/chromedriver'
+#                 # service = Service(executable_path=driver_path)
+
+#                 service = Service(ChromeDriverManager().install())
+#                 driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#             except Exception as e:
+#                 print(f"Error occurred: {e}")
+#                 print("Attempting to reinstall webdriver-manager...")
+#                 try:
+#                     service = Service(ChromeDriverManager().install())
+#                     driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#                 except Exception as reinstall_error:
+#                     print(
+#                         f"Reinstallation failed: {reinstall_error}\nA similar problem occurred before and was resolved by updating webdriver-manager"
+#                     )
+#                     try:
+#                         ips.upgrade("webdriver-manager", uninstall=True)
+#                         service = Service(ChromeDriverManager().install())
+#                         driver_ = webdriver.Chrome(service=service, options=chrome_options)
+#                     except Exception as e:
+#                         print(
+#                             f"Reinstallation failed: {reinstall_error}\nA similar problem occurred before, but updating 'webdriver-manager' did not resolve it this time"
+#                         )
+
+#             # Implicit wait
+#             if 3 < wait < 5:
+#                 wait_ = random.uniform(3, 5)
+#             elif 5 <= wait < 8:
+#                 wait_ = random.uniform(5, 8)
+#             elif 8 <= wait < 12:
+#                 wait_ = random.uniform(8, 10)
+#             else:
+#                 wait_ = 0
+#             driver_.implicitly_wait(wait_)
+
+#             if wait_until is not None and wait_until_kind is not None:
+#                 strategy = corr_by_kind(wait_until_kind)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((strategy, wait_until))
+#                 )
+#             if login_url and login_dict:
+#                 cookies = get_cookies(url=login_url, login=login_dict)
+#                 driver_.get(url)
+#                 for cookie_name, cookie_value in cookies.items():
+#                     driver_.add_cookie({"name": cookie_name, "value": cookie_value})
+#             else:
+#                 if cookies:
+#                     driver_.get(url)
+#                     if isinstance(cookies, str):
+#                         cookies = parse_cookies(cookies)
+#                     for cookie_name, cookie_value in cookies.items():
+#                         driver_.add_cookie({"name": cookie_name, "value": cookie_value})
+#             if not javascript:
+#                 driver_.execute_cdp_cmd(
+#                     "Emulation.setScriptExecutionDisabled", {"value": True}
+#                 )
+
+#             if login_url:
+#                 driver_.get(login_url)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((username_by, username_field))
+#                 ).send_keys(username)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((password_by, password_field))
+#                 ).send_keys(password)
+#                 WebDriverWait(driver_, timeout).until(
+#                     EC.element_to_be_clickable((submit_by, submit_field))
+#                 ).click()
+
+#             driver_.get(url)
+
+#             if iframe_name:
+#                 iframe = WebDriverWait(driver_, timeout).until(
+#                     EC.presence_of_element_located((By.NAME, iframe_name))
+#                 )
+#                 driver_.switch_to.frame(iframe)
+
+#             # WebDriverWait(driver, timeout).until(
+#             #     EC.presence_of_element_located((by, where))
+#             # )
+
+#             # # scroll down the page by a certain number of pixels
+#             scroll_smth_steps(driver_)
+
+#             # Set up polling
+#             for attempt in range(scroll_try):
+#                 page_source = driver_.page_source
+#                 content = BeautifulSoup(page_source, "html.parser")
+#                 if content and content.find_all(by):
+#                     break
+#                 time.sleep(
+#                     random.uniform(2, 4)
+#                 )  # Wait for a random time before polling again
+
+#             driver_.quit()
+
+#             # content = BeautifulSoup(page_source, "html.parser")
+#             if content:
+#                 return "text/html", content
+#             else:
+#                 logger.warning("Selenium could not fetch content")
+#                 return None, None
+#         elif 'scr' in driver.lower():
+#             settings = {
+#                 "USER_AGENT": user_agent(),
+#                 "DOWNLOAD_DELAY": 1,  # Prevent overloading the server
+#                 "COOKIES_ENABLED": True if cookies else False,
+#                 "LOG_LEVEL": "WARNING",  # Reduce log verbosity
+#             }
+#             content = fetch_scrapy(url,
+#                                    parser=parser,
+#                                    cookies=cookies,
+#                                    headers=headers,
+#                                    settings=settings)
+#             return parser, content
+
+#     except requests.RequestException as e:
+#         logger.error(f"Error fetching URL '{url}': {e}")
+#         return None, None
+
+def _clean_temp():
+    import os
+    import shutil
+    import tempfile
+    from pathlib import Path
+
+    # Get the parent folder of the tempdir
+    temp_dir = Path(tempfile.gettempdir()).parent  # moves from /T to parent dir
+
+    for subdir in temp_dir.iterdir():
+        if subdir.is_dir():
+            for d in subdir.iterdir():
+                if "com.google.Chrome.code_sign_clone" in d.name:
+                    try:
+                        print(f"Removing: {d}")
+                        shutil.rmtree(d)
+                    except Exception as e:
+                        print(f"Error removing {d}: {e}")
 def fetch_all(
     url,
     parser="lxml",
     driver="request",  # request or selenium
     by=By.TAG_NAME,
     timeout=10,
-    retry=2,
+    retry=3,  # Increased default retries
     wait=0,
     wait_until=None,
     wait_until_kind=None,
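
The two `RuntimeError`s above encode a real Twisted constraint: the reactor is global to the process and cannot be restarted once stopped, so a second `fetch_scrapy` call in the same interpreter will fail. One way to follow the error message's own advice ("create a new process for additional crawls") is to run each crawl in a child process. A hedged sketch, assuming `fetch_scrapy` is importable from `py2ls.netfinder` and that only picklable data is sent back:

```python
import multiprocessing as mp

def _crawl_worker(url, queue):
    # Import inside the child so each crawl gets a fresh Twisted reactor
    from py2ls.netfinder import fetch_scrapy
    item = fetch_scrapy(url)
    # Send back only picklable fields (the soup object may not pickle cleanly)
    queue.put(str(item["content"]) if item else None)

def fetch_in_subprocess(url, timeout=120):
    queue = mp.Queue()
    proc = mp.Process(target=_crawl_worker, args=(url, queue))
    proc.start()
    proc.join(timeout)
    return queue.get() if not queue.empty() else None

if __name__ == "__main__":
    html = fetch_in_subprocess("https://example.com")
```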
@@ -427,231 +732,222 @@ def fetch_all(
     username_by=By.NAME,
     password_by=By.NAME,
     submit_by=By.NAME,
-    # capability='eager',  # eager or none
-    proxy=None,  # Add proxy parameter
-    javascript=True,  # Add JavaScript option
-    disable_images=False,  # Add option to disable images
+    proxy=None,
+    javascript=True,
+    disable_images=False,
     iframe_name=None,
     login_dict=None,
-    cookies=None,  # Add cookies parameter
-):  # handle iframe option; lxml is faster than "html.parser"
-    try:
-        # # Generate a random user-agent string
-        # response = requests.get(url)
-        # # get cookies
-        # cookie = dict_from_cookiejar(response.cookies)
-        # # get token from cookies
-        # scrf_token = re.findall(r'csrf-token=(.*?);', response.headers.get('Set-Cookie'))[0]
-        # headers = {"User-Agent": user_agent(), "X-CSRF-Token": scrf_token}
-
-        headers = {"User-Agent": user_agent()}
-        if "req" in driver.lower():
-            # response = requests.get(
-            #     url, headers=headers, proxies=proxies_glob, timeout=30, stream=True
-            # )
-
-            # Handle cookies for requests
-            if cookies:
-                from requests.cookies import RequestsCookieJar
-                cookie_jar = RequestsCookieJar()
-                if isinstance(cookies, str):
-                    cookies = parse_cookies(cookies)
-                for cookie_name, cookie_value in cookies.items():
-                    cookie_jar.set(cookie_name, cookie_value)
-                response = requests.get(
-                    url, headers=headers, cookies=cookie_jar, timeout=timeout, stream=True
-                )
-            else:
-                response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+    cookies=None,
+    verify_ssl=True,  # Added SSL verification option
+    follow_redirects=True,  # Added redirect control
+):
+    """
+    Enhanced fetch function with better error handling and reliability.
+
+    Returns:
+        tuple: (content_type, parsed_content) or (None, None) on failure
+    """
+    def _parse_content(content, content_type, parser):
+        """Helper function to parse content with fallback"""
+        try:
+            if content_type in CONTENT_PARSERS:
+                return CONTENT_PARSERS[content_type](content, parser)
+
+            # Fallback parsing attempts
+            if content_type.startswith('text/'):
+                try:
+                    return BeautifulSoup(content, parser)
+                except:
+                    return content
+            return content
+        except Exception as e:
+            logger.warning(f"Content parsing failed: {e}")
+            return content

-            # If the response is a redirect, follow it
-            while response.is_redirect:
-                logger.info(f"Redirecting to: {response.headers['Location']}")
+    def _make_request(url, headers, cookies, timeout, verify_ssl, follow_redirects):
+        """Helper function for HTTP requests with retries"""
+        for attempt in range(retry):
+            try:
                 response = requests.get(
-                    response.headers["Location"],
+                    url,
                     headers=headers,
-                    proxies=proxies_glob,
+                    cookies=cookies,
                     timeout=timeout,
                     stream=True,
+                    verify=verify_ssl,
+                    allow_redirects=follow_redirects
                 )
-            # Check for a 403 error
-            if response.status_code == 403:
-                logger.warning("403 Forbidden error. Retrying...")
-                # Retry the request after a short delay
-                time.sleep(random.uniform(1, 3))
-                response = requests.get(
-                    url, headers=headers, proxies=proxies_glob, timeout=timeout, stream=True
-                )
-                # Raise an error if retry also fails
+
+                # Handle redirects manually if needed
+                if not follow_redirects and response.is_redirect:
+                    logger.info(f"Redirect detected to: {response.headers['Location']}")
+                    return None, None
+
                 response.raise_for_status()
-
-            # Raise an error for other HTTP status codes
-            response.raise_for_status()
-
-            # Get the content type
-            content_type = (
-                response.headers.get("content-type", "").split(";")[0].lower()
+                return response, None
+
+            except requests.RequestException as e:
+                logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                if attempt == retry - 1:
+                    return None, e
+                time.sleep(random.uniform(1, 3))
+
+    # Convert driver integer to string if needed
+    if isinstance(driver, int):
+        drivers = ["request", "selenium", "scrapy"]
+        try:
+            driver = drivers[driver]
+        except IndexError:
+            driver = "request"
+
+    headers = {"User-Agent": user_agent()}
+
+    # Prepare cookies
+    cookie_jar = None
+    if cookies:
+        from requests.cookies import RequestsCookieJar
+        cookie_jar = RequestsCookieJar()
+        if isinstance(cookies, str):
+            cookies = parse_cookies(cookies)
+        for name, value in cookies.items():
+            cookie_jar.set(name, value)
+
+    try:
+        if "req" in driver.lower():
+            response, error = _make_request(
+                url, headers, cookie_jar, timeout, verify_ssl, follow_redirects
             )
-            if response.encoding:
-                content = response.content.decode(response.encoding)
-            else:
-                content = None
-            # logger.info(f"Content type: {content_type}")
-
-            # Check if content type is supported
-            if content_type in CONTENT_PARSERS and content:
-                return content_type, CONTENT_PARSERS[content_type](content, parser)
-            else:
-                logger.warning("Unsupported content type")
+            if error:
                 return None, None
+            content_type = response.headers.get("content-type", "").split(";")[0].lower()
+            try:
+                detected = chardet.detect(response.content)
+                encoding = detected.get("encoding") or "utf-8"
+                content = response.content.decode(encoding, errors='replace')
+            except:
+                content = response.content.decode(response.encoding or 'utf-8', errors='replace')
+
+            return content_type, _parse_content(content, content_type, parser)
+
         elif "se" in driver.lower():
-            import random
             from selenium import webdriver
             from selenium.webdriver.chrome.service import Service
-            from selenium.webdriver.common.by import By
             from selenium.webdriver.chrome.options import Options
-            from selenium.webdriver.support.ui import WebDriverWait
-            from selenium.webdriver.support import expected_conditions as EC
             from webdriver_manager.chrome import ChromeDriverManager
-            from selenium.webdriver.common.desired_capabilities import (
-                DesiredCapabilities,
-            )
-
+            from selenium.common.exceptions import WebDriverException
+
             chrome_options = Options()
             chrome_options.add_argument("--headless")
             chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-gpu")
             chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
             chrome_options.add_argument(f"user-agent={user_agent()}")
+
             if proxy:
                 chrome_options.add_argument(f"--proxy-server={proxy}")
             if disable_images:
-                prefs = {"profile.managed_default_content_settings.images": 2}
-                chrome_options.add_experimental_option("prefs", prefs)
-            # chrome_options.page_load_strategy = capability
+                chrome_options.add_experimental_option(
+                    "prefs", {"profile.managed_default_content_settings.images": 2}
+                )
+
+            driver_instance = None
             try:
-                # Try to install ChromeDriver using webdriver-manager
-
-                # driver_path = '/Users/macjianfeng/.wdm/drivers/chromedriver/mac64/129.0.6668.100/chromedriver-mac-arm64/chromedriver'
-                # service = Service(executable_path=driver_path)
-
+                # Try with latest ChromeDriver first
                 service = Service(ChromeDriverManager().install())
-                driver_ = webdriver.Chrome(service=service, options=chrome_options)
-            except Exception as e:
-                print(f"Error occurred: {e}")
-                print("Attempting to reinstall webdriver-manager...")
-                try:
-                    service = Service(ChromeDriverManager().install())
-                    driver_ = webdriver.Chrome(service=service, options=chrome_options)
-                except Exception as reinstall_error:
-                    print(
-                        f"Reinstallation failed: {reinstall_error}\nA similar problem occurred before and was resolved by updating webdriver-manager"
+                driver_instance = webdriver.Chrome(service=service, options=chrome_options)
+
+                # Configure wait times
+                if 3 < wait < 5:
+                    wait_time = random.uniform(3, 5)
+                elif 5 <= wait < 8:
+                    wait_time = random.uniform(5, 8)
+                elif 8 <= wait < 12:
+                    wait_time = random.uniform(8, 10)
+                else:
+                    wait_time = 0
+
+                driver_instance.implicitly_wait(wait_time)
+
+                # Handle login if needed
+                if login_url and login_dict:
+                    cookies = get_cookies(url=login_url, login=login_dict)
+                    driver_instance.get(url)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+                elif cookies:
+                    driver_instance.get(url)
+                    if isinstance(cookies, str):
+                        cookies = parse_cookies(cookies)
+                    for name, value in cookies.items():
+                        driver_instance.add_cookie({"name": name, "value": value})
+
+                if not javascript:
+                    driver_instance.execute_cdp_cmd(
+                        "Emulation.setScriptExecutionDisabled", {"value": True}
+                    )
+
+                # Navigate to target URL
+                driver_instance.get(url)
+
+                # Handle iframes if needed
+                if iframe_name:
+                    iframe = WebDriverWait(driver_instance, timeout).until(
+                        EC.presence_of_element_located((By.NAME, iframe_name))
                     )
+                    driver_instance.switch_to.frame(iframe)
+
+                # Scroll to trigger dynamic content
+                scroll_smth_steps(driver_instance)
+
+                # Get page source with retries
+                content = None
+                for attempt in range(scroll_try):
                     try:
-                        ips.upgrade("webdriver-manager", uninstall=True)
-                        service = Service(ChromeDriverManager().install())
-                        driver_ = webdriver.Chrome(service=service, options=chrome_options)
+                        page_source = driver_instance.page_source
+                        content = BeautifulSoup(page_source, parser)
+                        if content and content.find_all(by):
+                            break
                     except Exception as e:
-                        print(
-                            f"Reinstallation failed: {reinstall_error}\nA similar problem occurred before, but updating 'webdriver-manager' did not resolve it this time"
-                        )
-
-            # Implicit wait
-            if 3 < wait < 5:
-                wait_ = random.uniform(3, 5)
-            elif 5 <= wait < 8:
-                wait_ = random.uniform(5, 8)
-            elif 8 <= wait < 12:
-                wait_ = random.uniform(8, 10)
-            else:
-                wait_ = 0
-            driver_.implicitly_wait(wait_)
-
-            if wait_until is not None and wait_until_kind is not None:
-                strategy = corr_by_kind(wait_until_kind)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((strategy, wait_until))
-                )
-            if login_url and login_dict:
-                cookies = get_cookies(url=login_url, login=login_dict)
-                driver_.get(url)
-                for cookie_name, cookie_value in cookies.items():
-                    driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            else:
-                if cookies:
-                    driver_.get(url)
-                    if isinstance(cookies, str):
-                        cookies = parse_cookies(cookies)
-                    for cookie_name, cookie_value in cookies.items():
-                        driver_.add_cookie({"name": cookie_name, "value": cookie_value})
-            if not javascript:
-                driver_.execute_cdp_cmd(
-                    "Emulation.setScriptExecutionDisabled", {"value": True}
-                )
-
-            if login_url:
-                driver_.get(login_url)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((username_by, username_field))
-                ).send_keys(username)
-                WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((password_by, password_field))
-                ).send_keys(password)
-                WebDriverWait(driver_, timeout).until(
-                    EC.element_to_be_clickable((submit_by, submit_field))
-                ).click()
-
-            driver_.get(url)
-
-            if iframe_name:
-                iframe = WebDriverWait(driver_, timeout).until(
-                    EC.presence_of_element_located((By.NAME, iframe_name))
-                )
-                driver_.switch_to.frame(iframe)
-
-            # WebDriverWait(driver, timeout).until(
-            #     EC.presence_of_element_located((by, where))
-            # )
-
-            # # scroll down the page by a certain number of pixels
-            scroll_smth_steps(driver_)
-
-            # Set up polling
-            for attempt in range(scroll_try):
-                page_source = driver_.page_source
-                content = BeautifulSoup(page_source, "html.parser")
-                if content and content.find_all(by):
-                    break
-                time.sleep(
-                    random.uniform(2, 4)
-                )  # Wait for a random time before polling again
-
-            driver_.quit()
-
-            # content = BeautifulSoup(page_source, "html.parser")
-            if content:
-                return "text/html", content
-            else:
-                logger.warning("Selenium could not fetch content")
+                        logger.warning(f"Attempt {attempt + 1} failed: {e}")
+                        time.sleep(random.uniform(1, 3))
+                try:
+                    _clean_temp()
+                except Exception as e:
+                    print(e)
+                return "text/html", content if content else None
+
+            except WebDriverException as e:
+                logger.error(f"Selenium error: {e}")
                 return None, None
+            finally:
+                if driver_instance:
+                    driver_instance.quit()
+
         elif 'scr' in driver.lower():
             settings = {
                 "USER_AGENT": user_agent(),
-                "DOWNLOAD_DELAY": 1,  # Prevent overloading the server
-                "COOKIES_ENABLED": True if cookies else False,
-                "LOG_LEVEL": "WARNING",  # Reduce log verbosity
+                "DOWNLOAD_DELAY": 1,
+                "COOKIES_ENABLED": bool(cookies),
+                "LOG_LEVEL": "WARNING",
+                "RETRY_TIMES": retry,
+                "DOWNLOAD_TIMEOUT": timeout,
             }
-            content = fetch_scrapy(url,
-                                   parser=parser,
-                                   cookies=cookies,
-                                   headers=headers,
-                                   settings=settings)
-            return parser, content
-
-    except requests.RequestException as e:
-        logger.error(f"Error fetching URL '{url}': {e}")
+            content = fetch_scrapy(
+                url,
+                parser=parser,
+                cookies=cookies,
+                headers=headers,
+                settings=settings
+            )
+            return parser, content
+
+    except Exception as e:
+        logger.error(f"Unexpected error in fetch_all: {e}")
         return None, None
-
-
+
+    return None, None
+
 # # Function to change Tor IP address
 # def renew_tor_ip():
 #     with Controller.from_port(port=9051) as controller:
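
A detail from the rewritten `"req"` branch worth pulling out: decoding now goes through `chardet` with `errors='replace'` instead of trusting `response.encoding`, so a wrong or missing charset header degrades the output rather than raising `UnicodeDecodeError`. The fallback chain in isolation (the sample bytes are ours):

```python
import chardet

raw = "café, naïve".encode("latin-1")           # bytes with no reliable charset header
detected = chardet.detect(raw)                   # e.g. {'encoding': 'ISO-8859-1', 'confidence': ...}
encoding = detected.get("encoding") or "utf-8"   # fall back to UTF-8 if detection fails
text = raw.decode(encoding, errors="replace")    # never raises; bad bytes become U+FFFD
print(encoding, text)
```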
@@ -1050,6 +1346,8 @@ def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=
     if rm_folder:
         ips.rm_folder(dir_save)
     content_type, content = fetch_all(url, driver=driver)
+    if content_type is None:
+        content_type = ""
     if "html" in content_type.lower():
         # Create the directory if it doesn't exist
         os.makedirs(dir_save, exist_ok=True)
@@ -1193,7 +1491,9 @@ def fetch_selenium(
     chrome_options = Options()
     chrome_options.add_argument("--headless")
     chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-gpu")
     chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument(f'--user-data-dir={os.path.expanduser("~/selenium_profile")}')
     chrome_options.add_argument(f"user-agent={user_agent()}")
     if proxy:
         chrome_options.add_argument(f"--proxy-server={proxy}")
@@ -1299,7 +1599,7 @@ def fetch(
         condition_ = [texts.empty, booster]
     else:
         condition_ = [not texts, booster]
-    if any(condition_):
+    if any(condition_):
         print("trying to use 'fetcher2'...")
         texts = fetch_selenium(
             url=url, where=where, what=what, extend=extend, **kws
@@ -1307,6 +1607,7 @@ def fetch(
     if texts:
         return texts
     else:
+        print("got nothing")
         return fetch(
             url,
             where=where,