abstract-webtools 0.1.6.39__py3-none-any.whl → 0.1.6.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/managers/seleniumManager.py +62 -53
- {abstract_webtools-0.1.6.39.dist-info → abstract_webtools-0.1.6.40.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.39.dist-info → abstract_webtools-0.1.6.40.dist-info}/RECORD +6 -6
- {abstract_webtools-0.1.6.39.dist-info → abstract_webtools-0.1.6.40.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.6.39.dist-info → abstract_webtools-0.1.6.40.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.39.dist-info → abstract_webtools-0.1.6.40.dist-info}/top_level.txt +0 -0
@@ -1,35 +1,29 @@
|
|
1
1
|
import os
|
2
|
-
from ..abstract_webtools import *
|
3
|
-
from .urlManager import *
|
4
2
|
from urllib.parse import urlparse
|
5
|
-
from abstract_utilities import *
|
6
3
|
from selenium import webdriver
|
7
4
|
from selenium.webdriver.chrome.options import Options
|
5
|
+
from selenium.webdriver.chrome.service import Service
|
6
|
+
from webdriver_manager.chrome import ChromeDriverManager # For automatic ChromeDriver installation
|
8
7
|
import logging
|
9
8
|
import urllib3
|
9
|
+
from ..abstract_webtools import * # Assuming this is a valid import
|
10
|
+
from .urlManager import *
|
10
11
|
|
11
12
|
# Suppress urllib3 warnings and debug logs
|
12
13
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
13
14
|
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
14
|
-
|
15
|
-
# Suppress Selenium logs
|
16
15
|
logging.getLogger("selenium").setLevel(logging.WARNING)
|
17
16
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
#
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
chrome_options.add_argument("--disable-gpu")
|
29
|
-
chrome_options.add_argument("--disable-software-rasterizer")
|
30
|
-
chrome_options.add_argument("--disable-extensions")
|
31
|
-
chrome_options.add_argument("--remote-debugging-port=9222")
|
32
|
-
|
17
|
+
# Default Chrome options (can be overridden)
|
18
|
+
DEFAULT_CHROME_OPTIONS = [
|
19
|
+
"--headless", # Run in headless mode
|
20
|
+
"--no-sandbox",
|
21
|
+
"--disable-dev-shm-usage", # Avoid memory issues on servers
|
22
|
+
"--disable-gpu",
|
23
|
+
"--disable-software-rasterizer",
|
24
|
+
"--disable-extensions",
|
25
|
+
"--remote-debugging-port=9222"
|
26
|
+
]
|
33
27
|
|
34
28
|
class SingletonMeta(type):
|
35
29
|
_instances = {}
|
@@ -39,21 +33,21 @@ class SingletonMeta(type):
|
|
39
33
|
cls._instances[cls] = instance
|
40
34
|
return cls._instances[cls]
|
41
35
|
|
42
|
-
class seleniumManager(metaclass=SingletonMeta):
|
36
|
+
class SeleniumManager(metaclass=SingletonMeta):
|
43
37
|
def __init__(self, url):
|
44
38
|
if not hasattr(self, 'initialized'): # Prevent reinitialization
|
45
39
|
self.initialized = True
|
46
40
|
parsed_url = urlparse(url)
|
47
41
|
self.domain = parsed_url.netloc
|
48
|
-
self.scheme = parsed_url.scheme
|
49
|
-
self.base_url= f"{self.scheme}{self.domain}"
|
42
|
+
self.scheme = parsed_url.scheme or "https" # Default to https if scheme is missing
|
43
|
+
self.base_url = f"{self.scheme}://{self.domain}"
|
50
44
|
self.site_dir = os.path.join(os.getcwd(), self.domain)
|
51
45
|
os.makedirs(self.site_dir, exist_ok=True)
|
52
46
|
self.drivers = {}
|
53
47
|
self.page_type = []
|
54
|
-
|
48
|
+
|
55
49
|
def get_url_to_path(self, url):
|
56
|
-
url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
|
50
|
+
url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/']) # Assuming eatAll is defined elsewhere
|
57
51
|
parsed_url = urlparse(url)
|
58
52
|
if parsed_url.netloc == self.domain:
|
59
53
|
paths = parsed_url.path.split('/')
|
@@ -61,59 +55,74 @@ class seleniumManager(metaclass=SingletonMeta):
|
|
61
55
|
for path in paths[:-1]:
|
62
56
|
dir_path = os.path.join(dir_path, path)
|
63
57
|
os.makedirs(dir_path, exist_ok=True)
|
64
|
-
self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if
|
65
|
-
|
58
|
+
self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if not self.page_type else self.page_type[-1])
|
66
59
|
dir_path = os.path.join(dir_path, paths[-1])
|
67
60
|
return dir_path
|
68
61
|
|
69
62
|
def saved_url_check(self, url):
|
70
|
-
|
71
|
-
return path
|
63
|
+
return self.get_url_to_path(url)
|
72
64
|
|
73
65
|
def get_with_netloc(self, url):
|
74
66
|
parsed_url = urlparse(url)
|
75
|
-
if parsed_url.netloc
|
76
|
-
url = f"{self.scheme}://{self.domain}/{url.strip()}"
|
67
|
+
if not parsed_url.netloc:
|
68
|
+
url = f"{self.scheme}://{self.domain}/{url.strip('/')}"
|
77
69
|
return url
|
78
70
|
|
79
71
|
def get_driver(self, url):
|
80
72
|
if url and url not in self.drivers:
|
73
|
+
# Set up Chrome options
|
81
74
|
chrome_options = Options()
|
82
|
-
|
83
|
-
|
75
|
+
for option in DEFAULT_CHROME_OPTIONS:
|
76
|
+
chrome_options.add_argument(option)
|
77
|
+
|
78
|
+
# Specify Chrome binary location if needed (optional, comment out if not applicable)
|
79
|
+
# chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
|
80
|
+
|
81
|
+
# Automatically install and use ChromeDriver
|
82
|
+
service = Service(ChromeDriverManager().install())
|
83
|
+
driver = webdriver.Chrome(service=service, options=chrome_options)
|
84
|
+
|
84
85
|
self.drivers[url] = driver
|
85
86
|
driver.get(url)
|
86
87
|
return self.drivers[url]
|
88
|
+
|
89
|
+
def quit_driver(self, url):
|
90
|
+
"""Clean up a specific driver instance."""
|
91
|
+
if url in self.drivers:
|
92
|
+
self.drivers[url].quit()
|
93
|
+
del self.drivers[url]
|
94
|
+
|
95
|
+
def quit_all_drivers(self):
|
96
|
+
"""Clean up all driver instances."""
|
97
|
+
for driver in self.drivers.values():
|
98
|
+
driver.quit()
|
99
|
+
self.drivers.clear()
|
100
|
+
|
87
101
|
def normalize_url(url, base_url=None):
|
88
|
-
"""
|
89
|
-
|
90
|
-
|
91
|
-
# If URL starts with the base URL repeated, remove the extra part
|
92
|
-
manager = seleniumManager(url)
|
93
|
-
base_url = manager.base_url
|
102
|
+
"""Normalize and resolve relative URLs."""
|
103
|
+
manager = SeleniumManager(url)
|
104
|
+
base_url = manager.base_url if base_url is None else base_url
|
94
105
|
if url.startswith(base_url):
|
95
106
|
url = url[len(base_url):]
|
96
|
-
|
97
|
-
# Resolve the URL against the base URL
|
98
107
|
normalized_url = urljoin(base_url, url.split('#')[0])
|
99
|
-
|
100
|
-
# Ensure only URLs belonging to the base domain are kept
|
101
108
|
if not normalized_url.startswith(base_url):
|
102
109
|
return None
|
103
|
-
|
104
110
|
return normalized_url
|
105
|
-
|
111
|
+
|
106
112
|
def get_selenium_source(url):
|
107
|
-
|
113
|
+
"""Fetch page source using Selenium."""
|
114
|
+
url_mgr = urlManager(url) # Assuming urlManager is defined elsewhere
|
108
115
|
if url_mgr.url:
|
109
116
|
url = str(url_mgr.url)
|
110
|
-
manager = seleniumManager(url)
|
117
|
+
manager = SeleniumManager(url)
|
111
118
|
driver = manager.get_driver(url)
|
112
119
|
try:
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
pass
|
120
|
+
return driver.page_source
|
121
|
+
except Exception as e:
|
122
|
+
logging.error(f"Error fetching page source for {url}: {e}")
|
123
|
+
return None
|
124
|
+
# Note: Driver is not quit here to maintain Singleton behavior
|
119
125
|
|
126
|
+
# Ensure cleanup on program exit (optional)
|
127
|
+
import atexit
|
128
|
+
atexit.register(lambda: SeleniumManager(url="").quit_all_drivers()) # Cleanup all drivers on exit
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.6.39
|
3
|
+
Version: 0.1.6.40
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -17,7 +17,7 @@ abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rg
|
|
17
17
|
abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
|
18
18
|
abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
|
19
19
|
abstract_webtools/managers/requestManager.py,sha256=zXD31WAYghV1OjnTQzRQnQGqZz6_J4mjHTdNLnBop_0,17343
|
20
|
-
abstract_webtools/managers/seleniumManager.py,sha256=
|
20
|
+
abstract_webtools/managers/seleniumManager.py,sha256=B7X6nTfxs1eHFDo7LKB1N5LhDytZQzHPgJjna2c2j6E,5017
|
21
21
|
abstract_webtools/managers/soupManager.py,sha256=-_mRCWlyzfKlF64UU53WXBmCvJ98jQ4GyHh8S8Pw3xs,17198
|
22
22
|
abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
|
23
23
|
abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
|
@@ -35,8 +35,8 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
|
|
35
35
|
abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
|
36
36
|
abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
|
37
37
|
abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
|
38
|
-
abstract_webtools-0.1.6.
|
39
|
-
abstract_webtools-0.1.6.
|
40
|
-
abstract_webtools-0.1.6.
|
41
|
-
abstract_webtools-0.1.6.
|
42
|
-
abstract_webtools-0.1.6.
|
38
|
+
abstract_webtools-0.1.6.40.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
|
39
|
+
abstract_webtools-0.1.6.40.dist-info/METADATA,sha256=kKCrv_8-h4JuBeI1YGfB-sWkaQp-H5LS5IyPfw1ooUs,16051
|
40
|
+
abstract_webtools-0.1.6.40.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
41
|
+
abstract_webtools-0.1.6.40.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
42
|
+
abstract_webtools-0.1.6.40.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|