abstract-webtools 0.1.6.40__py3-none-any.whl → 0.1.6.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/managers/seleniumManager.py +53 -62
- {abstract_webtools-0.1.6.40.dist-info → abstract_webtools-0.1.6.41.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.40.dist-info → abstract_webtools-0.1.6.41.dist-info}/RECORD +6 -6
- {abstract_webtools-0.1.6.40.dist-info → abstract_webtools-0.1.6.41.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.6.40.dist-info → abstract_webtools-0.1.6.41.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.40.dist-info → abstract_webtools-0.1.6.41.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,35 @@
|
|
1
1
|
import os
|
2
|
+
from ..abstract_webtools import *
|
3
|
+
from .urlManager import *
|
2
4
|
from urllib.parse import urlparse
|
5
|
+
from abstract_utilities import *
|
3
6
|
from selenium import webdriver
|
4
7
|
from selenium.webdriver.chrome.options import Options
|
5
|
-
from selenium.webdriver.chrome.service import Service
|
6
|
-
from webdriver_manager.chrome import ChromeDriverManager # For automatic ChromeDriver installation
|
7
8
|
import logging
|
8
9
|
import urllib3
|
9
|
-
from ..abstract_webtools import * # Assuming this is a valid import
|
10
|
-
from .urlManager import *
|
11
10
|
|
12
11
|
# Suppress urllib3 warnings and debug logs
|
13
12
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
14
13
|
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
14
|
+
|
15
|
+
# Suppress Selenium logs
|
15
16
|
logging.getLogger("selenium").setLevel(logging.WARNING)
|
16
17
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
18
|
+
import os
|
19
|
+
from selenium import webdriver
|
20
|
+
from selenium.webdriver.chrome.options import Options
|
21
|
+
|
22
|
+
# Setup Chrome options
|
23
|
+
chrome_options = Options()
|
24
|
+
chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
|
25
|
+
chrome_options.add_argument("--headless") # Run in headless mode
|
26
|
+
chrome_options.add_argument("--no-sandbox")
|
27
|
+
chrome_options.add_argument("--disable-dev-shm-usage")
|
28
|
+
chrome_options.add_argument("--disable-gpu")
|
29
|
+
chrome_options.add_argument("--disable-software-rasterizer")
|
30
|
+
chrome_options.add_argument("--disable-extensions")
|
31
|
+
chrome_options.add_argument("--remote-debugging-port=9222")
|
32
|
+
|
27
33
|
|
28
34
|
class SingletonMeta(type):
|
29
35
|
_instances = {}
|
@@ -33,21 +39,21 @@ class SingletonMeta(type):
|
|
33
39
|
cls._instances[cls] = instance
|
34
40
|
return cls._instances[cls]
|
35
41
|
|
36
|
-
class
|
42
|
+
class seleniumManager(metaclass=SingletonMeta):
|
37
43
|
def __init__(self, url):
|
38
44
|
if not hasattr(self, 'initialized'): # Prevent reinitialization
|
39
45
|
self.initialized = True
|
40
46
|
parsed_url = urlparse(url)
|
41
47
|
self.domain = parsed_url.netloc
|
42
|
-
self.scheme = parsed_url.scheme
|
43
|
-
self.base_url
|
48
|
+
self.scheme = parsed_url.scheme
|
49
|
+
self.base_url= f"{self.scheme}{self.domain}"
|
44
50
|
self.site_dir = os.path.join(os.getcwd(), self.domain)
|
45
51
|
os.makedirs(self.site_dir, exist_ok=True)
|
46
52
|
self.drivers = {}
|
47
53
|
self.page_type = []
|
48
|
-
|
54
|
+
|
49
55
|
def get_url_to_path(self, url):
|
50
|
-
url = eatAll(str(url), ['',
|
56
|
+
url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
|
51
57
|
parsed_url = urlparse(url)
|
52
58
|
if parsed_url.netloc == self.domain:
|
53
59
|
paths = parsed_url.path.split('/')
|
@@ -55,74 +61,59 @@ class SeleniumManager(metaclass=SingletonMeta):
|
|
55
61
|
for path in paths[:-1]:
|
56
62
|
dir_path = os.path.join(dir_path, path)
|
57
63
|
os.makedirs(dir_path, exist_ok=True)
|
58
|
-
self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if
|
64
|
+
self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
|
65
|
+
|
59
66
|
dir_path = os.path.join(dir_path, paths[-1])
|
60
67
|
return dir_path
|
61
68
|
|
62
69
|
def saved_url_check(self, url):
|
63
|
-
|
70
|
+
path = self.get_url_to_path(url)
|
71
|
+
return path
|
64
72
|
|
65
73
|
def get_with_netloc(self, url):
|
66
74
|
parsed_url = urlparse(url)
|
67
|
-
if
|
68
|
-
url = f"{self.scheme}://{self.domain}/{url.strip(
|
75
|
+
if parsed_url.netloc == '':
|
76
|
+
url = f"{self.scheme}://{self.domain}/{url.strip()}"
|
69
77
|
return url
|
70
78
|
|
71
79
|
def get_driver(self, url):
|
72
80
|
if url and url not in self.drivers:
|
73
|
-
# Set up Chrome options
|
74
81
|
chrome_options = Options()
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
# Specify Chrome binary location if needed (optional, comment out if not applicable)
|
79
|
-
# chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
|
80
|
-
|
81
|
-
# Automatically install and use ChromeDriver
|
82
|
-
service = Service(ChromeDriverManager().install())
|
83
|
-
driver = webdriver.Chrome(service=service, options=chrome_options)
|
84
|
-
|
82
|
+
chrome_options.add_argument("--headless")
|
83
|
+
driver = webdriver.Chrome(options=chrome_options)
|
85
84
|
self.drivers[url] = driver
|
86
85
|
driver.get(url)
|
87
86
|
return self.drivers[url]
|
88
|
-
|
89
|
-
def quit_driver(self, url):
|
90
|
-
"""Clean up a specific driver instance."""
|
91
|
-
if url in self.drivers:
|
92
|
-
self.drivers[url].quit()
|
93
|
-
del self.drivers[url]
|
94
|
-
|
95
|
-
def quit_all_drivers(self):
|
96
|
-
"""Clean up all driver instances."""
|
97
|
-
for driver in self.drivers.values():
|
98
|
-
driver.quit()
|
99
|
-
self.drivers.clear()
|
100
|
-
|
101
87
|
def normalize_url(url, base_url=None):
|
102
|
-
"""
|
103
|
-
|
104
|
-
|
88
|
+
"""
|
89
|
+
Normalize and resolve relative URLs, ensuring proper domain and format.
|
90
|
+
"""
|
91
|
+
# If URL starts with the base URL repeated, remove the extra part
|
92
|
+
manager = seleniumManager(url)
|
93
|
+
base_url = manager.base_url
|
105
94
|
if url.startswith(base_url):
|
106
95
|
url = url[len(base_url):]
|
96
|
+
|
97
|
+
# Resolve the URL against the base URL
|
107
98
|
normalized_url = urljoin(base_url, url.split('#')[0])
|
99
|
+
|
100
|
+
# Ensure only URLs belonging to the base domain are kept
|
108
101
|
if not normalized_url.startswith(base_url):
|
109
102
|
return None
|
110
|
-
return normalized_url
|
111
103
|
|
104
|
+
return normalized_url
|
105
|
+
# Function to get Selenium page source
|
112
106
|
def get_selenium_source(url):
|
113
|
-
|
114
|
-
url_mgr = urlManager(url) # Assuming urlManager is defined elsewhere
|
107
|
+
url_mgr = urlManager(url)
|
115
108
|
if url_mgr.url:
|
116
109
|
url = str(url_mgr.url)
|
117
|
-
manager =
|
110
|
+
manager = seleniumManager(url)
|
118
111
|
driver = manager.get_driver(url)
|
119
112
|
try:
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
113
|
+
# Get page source
|
114
|
+
page_source = driver.page_source
|
115
|
+
return page_source
|
116
|
+
finally:
|
117
|
+
# Don't quit the driver unless you're done with all interactions
|
118
|
+
pass
|
125
119
|
|
126
|
-
# Ensure cleanup on program exit (optional)
|
127
|
-
import atexit
|
128
|
-
atexit.register(lambda: SeleniumManager(url="").quit_all_drivers()) # Cleanup all drivers on exit
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.6.
|
3
|
+
Version: 0.1.6.41
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -17,7 +17,7 @@ abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rg
|
|
17
17
|
abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
|
18
18
|
abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
|
19
19
|
abstract_webtools/managers/requestManager.py,sha256=zXD31WAYghV1OjnTQzRQnQGqZz6_J4mjHTdNLnBop_0,17343
|
20
|
-
abstract_webtools/managers/seleniumManager.py,sha256=
|
20
|
+
abstract_webtools/managers/seleniumManager.py,sha256=qSY8gH3N5YJIMwE_Alj9HNQRip_PziIo4_T9AZE_FQo,4273
|
21
21
|
abstract_webtools/managers/soupManager.py,sha256=-_mRCWlyzfKlF64UU53WXBmCvJ98jQ4GyHh8S8Pw3xs,17198
|
22
22
|
abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
|
23
23
|
abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
|
@@ -35,8 +35,8 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
|
|
35
35
|
abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
|
36
36
|
abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
|
37
37
|
abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
|
38
|
-
abstract_webtools-0.1.6.
|
39
|
-
abstract_webtools-0.1.6.
|
40
|
-
abstract_webtools-0.1.6.
|
41
|
-
abstract_webtools-0.1.6.
|
42
|
-
abstract_webtools-0.1.6.
|
38
|
+
abstract_webtools-0.1.6.41.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
|
39
|
+
abstract_webtools-0.1.6.41.dist-info/METADATA,sha256=euYP6UF8duXUhTchwl2pkfy0FAYcK8coOLdh0PRZLDs,16051
|
40
|
+
abstract_webtools-0.1.6.41.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
41
|
+
abstract_webtools-0.1.6.41.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
42
|
+
abstract_webtools-0.1.6.41.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|