abstract-webtools 0.1.6.142__py3-none-any.whl → 0.1.6.143__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/managers/seleneumManager.py +116 -0
- abstract_webtools/managers/sslManager.py +0 -9
- {abstract_webtools-0.1.6.142.dist-info → abstract_webtools-0.1.6.143.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.142.dist-info → abstract_webtools-0.1.6.143.dist-info}/RECORD +6 -5
- {abstract_webtools-0.1.6.142.dist-info → abstract_webtools-0.1.6.143.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.142.dist-info → abstract_webtools-0.1.6.143.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,116 @@
|
|
1
|
+
import os
|
2
|
+
#from ..abstract_webtools import urlManager
|
3
|
+
from .urlManager import *
|
4
|
+
from urllib.parse import urlparse
|
5
|
+
from abstract_utilities import *
|
6
|
+
from selenium import webdriver
|
7
|
+
from selenium.webdriver.chrome.options import Options
|
8
|
+
import logging
|
9
|
+
import urllib3
|
10
|
+
from abstract_security import get_env_value
|
11
|
+
# Suppress urllib3 warnings and debug logs
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Suppress Selenium logs
logging.getLogger("selenium").setLevel(logging.WARNING)

# Chrome options shared by every driver created in this module.
chrome_options = Options()
# Only override the browser binary when CHROME_BINARY is actually configured.
# Selenium's binary_location setter rejects non-string values, so assigning a
# missing value (presumably None from get_env_value -- confirm) would make
# importing this module fail.
_chrome_binary = get_env_value('CHROME_BINARY')
if _chrome_binary:
    chrome_options.binary_location = str(_chrome_binary)
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-software-rasterizer")
chrome_options.add_argument("--disable-extensions")
# NOTE(review): this fixed port is passed to every Chrome instance started
# with these options; it may conflict when several drivers run at once.
chrome_options.add_argument("--remote-debugging-port=9222")
|
29
|
+
|
30
|
+
|
31
|
+
|
32
|
+
class SingletonMeta(type):
    """Metaclass that hands out one shared instance per class.

    The first call to a class using this metaclass constructs the instance
    with the given arguments; every later call returns that same object and
    ignores its arguments.
    """

    # Maps each class to the single instance created for it.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the cached instance of ``cls``, constructing it on first use."""
        registry = cls._instances
        if cls in registry:
            return registry[cls]
        registry[cls] = super().__call__(*args, **kwargs)
        return registry[cls]
|
39
|
+
|
40
|
+
class seleniumManager(metaclass=SingletonMeta):
    """Process-wide manager for Selenium drivers and on-disk page paths.

    Because the class uses ``SingletonMeta``, only the first URL passed to
    the constructor determines ``domain``, ``scheme``, ``base_url`` and
    ``site_dir``; later constructor calls return the same instance.
    """

    def __init__(self, url):
        """Record the site described by ``url`` and create its directory.

        Args:
            url: Any URL on the target site; its scheme and network location
                define the site root.
        """
        if not hasattr(self, 'initialized'):  # Prevent reinitialization
            self.initialized = True
            parsed_url = urlparse(url)
            self.domain = parsed_url.netloc
            self.scheme = parsed_url.scheme
            # The separator is required: without "://" the value is not a
            # usable URL (e.g. "httpsexample.com") and prefix checks or
            # urljoin() calls against it cannot work.
            self.base_url = f"{self.scheme}://{self.domain}"
            # Pages for this site are stored under <cwd>/<domain>.
            self.site_dir = os.path.join(os.getcwd(), self.domain)
            os.makedirs(self.site_dir, exist_ok=True)
            self.drivers = {}    # url -> webdriver instance
            self.page_type = []  # one entry appended per resolved path

    def get_url_to_path(self, url):
        """Map a URL on this site to a path under ``site_dir``.

        Intermediate directories are created on disk as a side effect.

        Args:
            url: URL to map; it is cleaned with ``eatAll`` first (presumably
                trimming the listed characters from its ends -- confirm).

        Returns:
            The local path for the URL, or ``None`` when the URL's network
            location differs from this manager's domain.
        """
        url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/'])
        parsed_url = urlparse(url)
        if parsed_url.netloc != self.domain:
            return None
        paths = parsed_url.path.split('/')
        dir_path = self.site_dir
        for path in paths[:-1]:
            dir_path = os.path.join(dir_path, path)
            os.makedirs(dir_path, exist_ok=True)
        # NOTE(review): only the first call inspects the file extension;
        # every later call repeats the previously recorded type. This keeps
        # the original expression's precedence -- confirm it is intended.
        if len(self.page_type) == 0:
            page_type = os.path.splitext(paths[-1])[-1] or 'html'
        else:
            page_type = self.page_type[-1]
        self.page_type.append(page_type)
        return os.path.join(dir_path, paths[-1])

    def saved_url_check(self, url):
        """Return the local path for ``url`` (see ``get_url_to_path``)."""
        return self.get_url_to_path(url)

    def get_with_netloc(self, url):
        """Return ``url`` qualified with this site's scheme and domain.

        URLs that already carry a network location are returned unchanged.
        """
        parsed_url = urlparse(url)
        if parsed_url.netloc == '':
            # Drop leading slashes so a root-relative path such as "/about"
            # does not become "scheme://domain//about".
            relative = url.strip().lstrip('/')
            url = f"{self.scheme}://{self.domain}/{relative}"
        return url

    def get_driver(self, url):
        """Return the Chrome driver for ``url``, creating it on first use.

        A new driver is started with the module-level ``chrome_options`` and
        navigated to ``url`` the first time that URL is requested; later
        requests reuse it.

        Raises:
            KeyError: If ``url`` is falsy and no driver is stored for it.
        """
        if url and url not in self.drivers:
            driver = webdriver.Chrome(options=chrome_options)
            self.drivers[url] = driver
            driver.get(url)
        return self.drivers[url]
|
85
|
+
def normalize_url(url, base_url=None):
    """
    Normalize and resolve relative URLs, ensuring proper domain and format.

    Args:
        url: Absolute or relative URL to normalize.
        base_url: Site root (``scheme://host``) to resolve against and to
            filter by. When omitted or empty, the ``base_url`` of the shared
            ``seleniumManager`` is used; the manager is only constructed in
            that case.

    Returns:
        The resolved URL without its fragment, or ``None`` when the result
        does not start with ``base_url``.
    """
    # Honour an explicitly supplied base instead of always overwriting it.
    if not base_url:
        base_url = seleniumManager(url).base_url

    # If URL starts with the base URL repeated, remove the extra part
    if url.startswith(base_url):
        url = url[len(base_url):]

    # Resolve the URL against the base URL, dropping any fragment
    normalized_url = urljoin(base_url, url.split('#')[0])

    # Ensure only URLs belonging to the base domain are kept
    if not normalized_url.startswith(base_url):
        return None

    return normalized_url
|
103
|
+
# Function to get Selenium page source
|
104
|
+
def get_selenium_source(url):
    """Return the page source for ``url`` as loaded by the shared Selenium driver.

    The URL is first passed through ``urlManager``; when that yields no
    usable URL the function returns ``None``. The driver is left running so
    it can be reused for further interactions.
    """
    checked = urlManager(url)
    if not checked.url:
        return None
    target = str(checked.url)
    browser = seleniumManager(target).get_driver(target)
    # Don't quit the driver here; it stays cached on the manager.
    return browser.page_source
|
@@ -10,12 +10,3 @@ class SSLManager:
|
|
10
10
|
def get_context(self):
|
11
11
|
return ssl_.create_urllib3_context(ciphers=self.ciphers, cert_reqs=self.certification, options=self.ssl_options)
|
12
12
|
|
13
|
-
class SSLManagerSingleton:
|
14
|
-
_instance = None
|
15
|
-
@staticmethod
|
16
|
-
def get_instance(ciphers=None, ssl_options_list=None, certification=None):
|
17
|
-
if SSLManagerSingleton._instance is None:
|
18
|
-
SSLManagerSingleton._instance = SSLManager(ciphers=ciphers, ssl_options_list=ssl_options_list, certification=certification)
|
19
|
-
elif SSLManagerSingleton._instance.cipher_manager.ciphers_string != ciphers or SSLManagerSingleton._instance.ssl_options_list !=ssl_options_list or SSLManagerSingleton._instance.certification !=certification:
|
20
|
-
SSLManagerSingleton._instance = SSLManager(ciphers=ciphers, ssl_options_list=ssl_options_list, certification=certification)
|
21
|
-
return SSLManagerSingleton._instance
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.6.142
|
3
|
+
Version: 0.1.6.143
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -21,8 +21,9 @@ abstract_webtools/managers/dynamicRateLimiter.py,sha256=ycn5VQEPnmxjNMew4IVh-t5t
|
|
21
21
|
abstract_webtools/managers/get_test.py,sha256=nISrhUGdyvRv18wTGoifGhizBFoHeK0N3FymMASloFw,825
|
22
22
|
abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
|
23
23
|
abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
|
24
|
+
abstract_webtools/managers/seleneumManager.py,sha256=1toMSoIPZmKwU88FMDTJl0DL398Zg_7uH-O1QqJpZC4,4184
|
24
25
|
abstract_webtools/managers/seleniumManager.py,sha256=RRpA1_oOnZuzzQ4S6VX7tDFcI31E_mOou2CZOOZH6yI,4274
|
25
|
-
abstract_webtools/managers/sslManager.py,sha256=
|
26
|
+
abstract_webtools/managers/sslManager.py,sha256=I9YUqJo8_KwLOwfBTAoSfzKSfR4Vtjw1HQXsXRnCV-g,641
|
26
27
|
abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
|
27
28
|
abstract_webtools/managers/userAgentManager.py,sha256=cUaOlcCTzftVBCp9ZHwMXR9IB1wAE-03YSVwUBaIFLM,2514
|
28
29
|
abstract_webtools/managers/videoDownloader.py,sha256=mKXhKYNnJwPaiqAsHvFTBGLdXFgR3wdV0G1OIimiKbE,15424
|
@@ -45,7 +46,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=75gwqVXIRwgVqzATBC-
|
|
45
46
|
abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
|
46
47
|
abstract_webtools/managers/urlManager/urlManager (Copy).py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
|
47
48
|
abstract_webtools/managers/urlManager/urlManager.py,sha256=vY4KQXtcrlC2YtlultxQpVe581l5kAuT5VGA0WrI16g,8945
|
48
|
-
abstract_webtools-0.1.6.
|
49
|
-
abstract_webtools-0.1.6.
|
50
|
-
abstract_webtools-0.1.6.
|
51
|
-
abstract_webtools-0.1.6.
|
49
|
+
abstract_webtools-0.1.6.143.dist-info/METADATA,sha256=E_BIW_Q1IeDMeemA0jKQF4nlGmwnPvIJTCVbyFplnTA,7289
|
50
|
+
abstract_webtools-0.1.6.143.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
51
|
+
abstract_webtools-0.1.6.143.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
52
|
+
abstract_webtools-0.1.6.143.dist-info/RECORD,,
|
File without changes
|
{abstract_webtools-0.1.6.142.dist-info → abstract_webtools-0.1.6.143.dist-info}/top_level.txt
RENAMED
File without changes
|