abstract-webtools 0.1.6.141__py3-none-any.whl → 0.1.6.143__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ import os
2
+ #from ..abstract_webtools import urlManager
3
+ from .urlManager import *
4
+ from urllib.parse import urlparse
5
+ from abstract_utilities import *
6
+ from selenium import webdriver
7
+ from selenium.webdriver.chrome.options import Options
8
+ import logging
9
+ import urllib3
10
+ from abstract_security import get_env_value
11
# Suppress urllib3 warnings and debug logs
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Suppress Selenium logs
logging.getLogger("selenium").setLevel(logging.WARNING)


# Setup Chrome options (shared by every driver this module creates)
chrome_options = Options()
# Only override the browser binary when CHROME_BINARY is actually configured.
# Assigning a missing value would fail at import time on recent Selenium
# releases, which reject non-string binary locations; leaving the default
# lets Selenium locate Chrome on its own.
# NOTE(review): assumes get_env_value returns a falsy value (e.g. None) when
# the key is absent -- confirm against abstract_security.
_chrome_binary = get_env_value('CHROME_BINARY')
if _chrome_binary:
    chrome_options.binary_location = _chrome_binary
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-software-rasterizer")
chrome_options.add_argument("--disable-extensions")
# NOTE(review): a fixed debugging port is shared by every browser started
# with these options; concurrent drivers may conflict on it -- confirm intent.
chrome_options.add_argument("--remote-debugging-port=9222")
29
+
30
+
31
+
32
class SingletonMeta(type):
    """Metaclass that gives each class using it exactly one shared instance.

    The first call to the class constructs the instance normally; every
    later call returns that same object without running construction again,
    so arguments passed on later calls are ignored.
    """

    # Maps each class created with this metaclass to its single instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the existing instance of ``cls``, creating it on first use."""
        registry = cls._instances
        if cls in registry:
            return registry[cls]
        created = super().__call__(*args, **kwargs)
        registry[cls] = created
        return created
39
+
40
class seleniumManager(metaclass=SingletonMeta):
    """Singleton that maps one site's URLs to local paths and Chrome drivers.

    The class is created through ``SingletonMeta``, so only the first
    ``seleniumManager(url)`` call configures the instance. Later calls return
    the same object and their ``url`` argument does not change it.
    """

    def __init__(self, url):
        """Record the scheme/domain of ``url`` and create the site directory.

        Args:
            url: Absolute URL whose scheme and network location define the
                site this manager works with.
        """
        if not hasattr(self, 'initialized'):  # Prevent reinitialization
            self.initialized = True
            parsed_url = urlparse(url)
            self.domain = parsed_url.netloc
            self.scheme = parsed_url.scheme
            # Include the "://" separator so base_url is a usable URL prefix,
            # matching how get_with_netloc builds absolute URLs.
            self.base_url = f"{self.scheme}://{self.domain}"
            # Local mirror directory: <current working directory>/<domain>.
            self.site_dir = os.path.join(os.getcwd(), self.domain)
            os.makedirs(self.site_dir, exist_ok=True)
            self.drivers = {}    # url -> webdriver instance opened for it
            self.page_type = []  # one entry appended per get_url_to_path hit

    def get_url_to_path(self, url):
        """Translate a URL on this site into a path under ``site_dir``.

        Every directory component of the URL path is created on disk; the
        final component is joined on but not created.

        Args:
            url: URL to translate (converted with ``str`` first).

        Returns:
            The local path, or ``None`` when the URL's network location is
            not this manager's domain.
        """
        # presumably eatAll strips the listed characters from the ends of the
        # string -- confirm against abstract_utilities.
        url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
        parsed_url = urlparse(url)
        if parsed_url.netloc != self.domain:
            return None

        paths = parsed_url.path.split('/')
        leaf = paths[-1]
        dir_path = self.site_dir
        for path in paths[:-1]:
            dir_path = os.path.join(dir_path, path)
            os.makedirs(dir_path, exist_ok=True)

        # Same result as the original one-line conditional expression, written
        # out the way Python parses it: the first recorded type comes from the
        # file extension (default 'html'); every later entry repeats the
        # previous one.
        # NOTE(review): the extension of later URLs is never consulted, and
        # splitext yields '.ext' while the default has no dot -- confirm intent.
        if not self.page_type:
            page_type = os.path.splitext(leaf)[-1] or 'html'
        else:
            page_type = self.page_type[-1]
        self.page_type.append(page_type)

        return os.path.join(dir_path, leaf)

    def saved_url_check(self, url):
        """Return the local path for ``url``; see ``get_url_to_path``."""
        return self.get_url_to_path(url)

    def get_with_netloc(self, url):
        """Return ``url`` unchanged if it has a network location, else make it absolute.

        URLs without a network location are treated as paths relative to the
        root of this manager's site.
        """
        parsed_url = urlparse(url)
        if parsed_url.netloc == '':
            # Drop leading slashes so "/page" does not become "domain//page".
            relative = url.strip().lstrip('/')
            url = f"{self.scheme}://{self.domain}/{relative}"
        return url

    def get_driver(self, url):
        """Return the Chrome driver for ``url``, opening one on first request.

        Args:
            url: Page to load; also the cache key.

        Returns:
            The cached webdriver for ``url``.

        Raises:
            KeyError: if ``url`` is falsy and therefore has no cached driver.
        """
        if url and url not in self.drivers:
            driver = webdriver.Chrome(options=chrome_options)
            try:
                driver.get(url)
            except Exception:
                # Do not leak the browser or cache a driver whose page never
                # loaded; release it and let the caller see the failure.
                driver.quit()
                raise
            self.drivers[url] = driver
        return self.drivers[url]
85
def normalize_url(url, base_url=None):
    """
    Normalize and resolve relative URLs, ensuring proper domain and format.

    Args:
        url: Absolute or relative URL; any ``#fragment`` is dropped.
        base_url: ``scheme://domain`` prefix to resolve against. When omitted,
            it is built from the scheme and domain of the shared
            ``seleniumManager`` instance.

    Returns:
        The resolved URL, or ``None`` if it does not start with ``base_url``.
    """
    # Imported locally because this module only imports ``urlparse``
    # explicitly; ``urljoin`` would otherwise have to arrive via a star import.
    from urllib.parse import urljoin

    if base_url is None:
        manager = seleniumManager(url)
        # Built from scheme and domain directly so the "://" separator is
        # always present.
        base_url = f"{manager.scheme}://{manager.domain}"

    # If URL starts with the base URL repeated, remove the extra part
    if url.startswith(base_url):
        url = url[len(base_url):]

    # Resolve the URL against the base URL
    normalized_url = urljoin(base_url, url.split('#')[0])

    # Ensure only URLs belonging to the base domain are kept
    if not normalized_url.startswith(base_url):
        return None

    return normalized_url
103
+ # Function to get Selenium page source
104
def get_selenium_source(url):
    """Return the page source for ``url`` as rendered by a Selenium driver.

    The URL is first passed through ``urlManager``; if that yields a truthy
    ``url`` attribute, its string form is used, otherwise the argument is
    used as given. The driver is not quit here: it stays cached on the
    ``seleniumManager`` instance for further interactions.
    """
    url_mgr = urlManager(url)
    target = str(url_mgr.url) if url_mgr.url else url
    browser = seleniumManager(target).get_driver(target)
    # Get page source
    return browser.page_source
@@ -10,12 +10,3 @@ class SSLManager:
10
10
  def get_context(self):
11
11
  return ssl_.create_urllib3_context(ciphers=self.ciphers, cert_reqs=self.certification, options=self.ssl_options)
12
12
 
13
class SSLManagerSingleton:
    """Holds one shared ``SSLManager``, rebuilt whenever the requested settings differ."""

    # The currently cached SSLManager, or None before the first request.
    _instance = None

    @staticmethod
    def get_instance(ciphers=None, ssl_options_list=None, certification=None):
        """Return the cached ``SSLManager``, replacing it if settings changed.

        A new manager is built when none exists yet, or when any of the
        cached manager's cipher string, SSL option list or certification
        setting differs from the arguments.
        """
        current = SSLManagerSingleton._instance
        # Short-circuit order matches the original checks: existence first,
        # then ciphers, options and certification.
        needs_rebuild = (
            current is None
            or current.cipher_manager.ciphers_string != ciphers
            or current.ssl_options_list != ssl_options_list
            or current.certification != certification
        )
        if needs_rebuild:
            SSLManagerSingleton._instance = SSLManager(ciphers=ciphers, ssl_options_list=ssl_options_list, certification=certification)
        return SSLManagerSingleton._instance
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.141
3
+ Version: 0.1.6.143
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -21,8 +21,9 @@ abstract_webtools/managers/dynamicRateLimiter.py,sha256=ycn5VQEPnmxjNMew4IVh-t5t
21
21
  abstract_webtools/managers/get_test.py,sha256=nISrhUGdyvRv18wTGoifGhizBFoHeK0N3FymMASloFw,825
22
22
  abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
23
23
  abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
24
+ abstract_webtools/managers/seleneumManager.py,sha256=1toMSoIPZmKwU88FMDTJl0DL398Zg_7uH-O1QqJpZC4,4184
24
25
  abstract_webtools/managers/seleniumManager.py,sha256=RRpA1_oOnZuzzQ4S6VX7tDFcI31E_mOou2CZOOZH6yI,4274
25
- abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
26
+ abstract_webtools/managers/sslManager.py,sha256=I9YUqJo8_KwLOwfBTAoSfzKSfR4Vtjw1HQXsXRnCV-g,641
26
27
  abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
27
28
  abstract_webtools/managers/userAgentManager.py,sha256=cUaOlcCTzftVBCp9ZHwMXR9IB1wAE-03YSVwUBaIFLM,2514
28
29
  abstract_webtools/managers/videoDownloader.py,sha256=mKXhKYNnJwPaiqAsHvFTBGLdXFgR3wdV0G1OIimiKbE,15424
@@ -45,7 +46,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=75gwqVXIRwgVqzATBC-
45
46
  abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
46
47
  abstract_webtools/managers/urlManager/urlManager (Copy).py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
47
48
  abstract_webtools/managers/urlManager/urlManager.py,sha256=vY4KQXtcrlC2YtlultxQpVe581l5kAuT5VGA0WrI16g,8945
48
- abstract_webtools-0.1.6.141.dist-info/METADATA,sha256=B2mQsvg0AVXNBUQADJ9ttFvCh0YleyRUjvFmcPfCpNo,7289
49
- abstract_webtools-0.1.6.141.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
50
- abstract_webtools-0.1.6.141.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
51
- abstract_webtools-0.1.6.141.dist-info/RECORD,,
49
+ abstract_webtools-0.1.6.143.dist-info/METADATA,sha256=E_BIW_Q1IeDMeemA0jKQF4nlGmwnPvIJTCVbyFplnTA,7289
50
+ abstract_webtools-0.1.6.143.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
51
+ abstract_webtools-0.1.6.143.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
52
+ abstract_webtools-0.1.6.143.dist-info/RECORD,,