abstract-webtools 0.1.6.136__py3-none-any.whl → 0.1.6.138__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/main.py +0 -0
- abstract_webtools/managers/seleniumManager.py +1 -1
- abstract_webtools/managers/urlManager/urlManager (Copy).py +220 -0
- abstract_webtools/managers/urlManager/urlManager.py +102 -73
- abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.138.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.138.dist-info}/RECORD +7 -13
- {abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.138.dist-info}/top_level.txt +0 -0
- abstract_webtools/__pycache__/abstract_webtools.cpython-312.pyc +0 -0
- abstract_webtools/abstract_userpit.py +0 -169
- abstract_webtools/managers/linkManager.py +0 -189
- abstract_webtools/managers/requestManager.py +0 -353
- abstract_webtools/managers/soupManager.py +0 -362
- abstract_webtools/managers/urlManager.py +0 -230
- abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4 +0 -0
- {abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.138.dist-info}/WHEEL +0 -0
abstract_webtools/main.py
CHANGED
File without changes

abstract_webtools/managers/seleniumManager.py
CHANGED
@@ -21,7 +21,7 @@ from selenium.webdriver.chrome.options import Options
 
 # Setup Chrome options
 chrome_options = Options()
-chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
+#chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
 chrome_options.add_argument("--headless") # Run in headless mode
 chrome_options.add_argument("--no-sandbox")
 chrome_options.add_argument("--disable-dev-shm-usage")
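
The seleniumManager.py change comments out a hard-coded, machine-specific Chrome binary path, letting Selenium fall back to its own browser discovery. A minimal sketch of a more portable alternative, assuming the deployment exports a CHROME_BIN environment variable (a hypothetical name, not part of this package):

import os
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
# Pin the browser binary only when the environment provides one;
# otherwise Selenium Manager resolves a suitable Chrome on its own.
chrome_bin = os.environ.get("CHROME_BIN")
if chrome_bin:
    chrome_options.binary_location = chrome_bin
chrome_options.add_argument("--headless")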
abstract_webtools/managers/urlManager/urlManager (Copy).py
ADDED
@@ -0,0 +1,220 @@
+import re
+import urllib.parse
+import requests
+from urllib.parse import urlparse, urlunparse, urljoin
+
+from ...abstract_webtools import *
+
+class urlManager:
+    """
+    Revised urlManager for managing and cleaning URLs.
+
+    It splits URLs into their components, normalizes them (trimming spaces, lowercasing
+    scheme and domain, removing default ports, and cleaning up paths), and then creates
+    a list of potential variants (with/without www, http/https) so that a valid version
+    can be determined.
+    """
+    def __init__(self, url=None, session=None):
+        url = url or 'www.example.com'
+        self._url = url
+        self.session = session or requests
+        self.clean_urls = self.clean_url(url)
+        self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
+        self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+        self.all_urls = []
+
+    def url_to_pieces(self, url):
+        """
+        Split a URL into protocol, domain, path, and query components.
+        Uses urlparse for robustness.
+        """
+        try:
+            parsed = urlparse(url)
+            protocol = parsed.scheme if parsed.scheme else None
+            domain = parsed.netloc if parsed.netloc else None
+            path = parsed.path or ""
+            query = parsed.query or ""
+        except Exception as e:
+            print(f'The URL {url} was not reachable: {e}')
+            protocol, domain, path, query = None, None, "", ""
+        return protocol, domain, path, query
+
+    def clean_url(self, url=None) -> list:
+        """
+        Normalize and clean the URL, then return a list of potential URL variants.
+
+        This method:
+          - Strips whitespace.
+          - Adds a scheme (defaults to https) if missing.
+          - Lowercases the scheme and domain.
+          - Removes default ports.
+          - Cleans up the path (removing duplicate slashes and trailing slash).
+          - Generates variants with and without 'www', and with both http and https.
+        """
+        url = url or self._url
+        url = url.strip()
+        # Ensure the URL has a scheme
+        if not re.match(r'https?://', url):
+            url = 'https://' + url
+
+        parsed = urlparse(url)
+        scheme = parsed.scheme.lower()
+        netloc = parsed.netloc.lower()
+        # Remove default port numbers if present
+        if ':' in netloc:
+            host, port = netloc.split(':', 1)
+            if (scheme == "http" and port == "80") or (scheme == "https" and port == "443"):
+                netloc = host
+
+        # Normalize the path: remove duplicate slashes and a trailing slash
+        path = re.sub(r'//+', '/', parsed.path).rstrip('/')
+
+        # Rebuild the cleaned URL without query or fragment
+        cleaned_url = urlunparse((scheme, netloc, path, '', '', ''))
+
+        variants = []
+        # Add the primary variant
+        variants.append(cleaned_url)
+        # Generate a variant with/without 'www'
+        if netloc.startswith('www.'):
+            no_www = netloc[4:]
+            variants.append(urlunparse((scheme, no_www, path, '', '', '')))
+        else:
+            variants.append(urlunparse((scheme, f"www.{netloc}", path, '', '', '')))
+
+        # Also generate variants with the alternate scheme
+        alt_scheme = 'http' if scheme == 'https' else 'https'
+        for variant in list(variants):
+            parsed_variant = urlparse(variant)
+            alt_variant = urlunparse((alt_scheme, parsed_variant.netloc, parsed_variant.path, '', '', ''))
+            variants.append(alt_variant)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_variants = []
+        for v in variants:
+            if v not in seen:
+                unique_variants.append(v)
+                seen.add(v)
+        return unique_variants
+
+    def get_correct_url(self, url=None, clean_urls=None) -> str:
+        """
+        Attempts each URL variant by making an HTTP GET request.
+        Returns the first variant that returns a 200 OK response.
+        """
+        if url is None and clean_urls is None:
+            url = self._url
+            clean_urls = self.clean_urls
+        if url is not None and clean_urls is None:
+            clean_urls = self.clean_url(url)
+        elif url is None and clean_urls is not None:
+            url = self._url
+
+        for candidate in clean_urls:
+            try:
+                response = self.session.get(candidate, timeout=5)
+                if response.status_code == 200:
+                    return candidate
+            except requests.exceptions.RequestException as e:
+                print(f"Failed to reach {candidate}: {e}")
+        return None
+
+    def update_url(self, url):
+        """
+        Update the URL and refresh related attributes.
+        """
+        self._url = url
+        self.clean_urls = self.clean_url(url)
+        self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
+        self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+        self.all_urls = []
+
+    def get_domain(self, url=None):
+        url = url or self.url
+        return urlparse(url).netloc
+
+    def url_join(self, base_url, path):
+        """
+        Joins a base URL with a relative path.
+        """
+        base_url = base_url.strip().rstrip('/')
+        path = path.strip().lstrip('/')
+        return f"{base_url}/{path}"
+
+    @property
+    def url(self):
+        return self._url
+
+    @url.setter
+    def url(self, new_url):
+        self._url = new_url
+
+    def is_valid_url(self, url=None):
+        """
+        Check if the given URL is valid.
+        """
+        url = url or self.url
+        parsed = urlparse(url)
+        return bool(parsed.scheme) and bool(parsed.netloc)
+
+    def make_valid(self, href, url=None):
+        """
+        Validate a href. If it's not already valid, join it with the base URL.
+        """
+        if self.is_valid_url(href):
+            return href
+        base = url or self.url
+        new_link = urljoin(base, href)
+        if self.is_valid_url(new_link):
+            return new_link
+        return False
+
+    def get_relative_href(self, base, href):
+        """
+        For a relative href, join it with the base URL and strip any query or fragment.
+        """
+        joined = urljoin(base, href)
+        parsed = urlparse(joined)
+        clean_href = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
+        return clean_href
+
+    def url_basename(self, url=None):
+        url = url or self.url
+        path = urlparse(url).path
+        return path.strip('/').split('/')[-1]
+
+    def base_url(self, url=None):
+        url = url or self.url
+        match = re.match(r'https?://[^?#/]+/', url)
+        if match:
+            return match.group()
+        return None
+
+    def urljoin(self, base, path):
+        return urljoin(base, path)
+
+class urlManagerSingleton:
+    _instance = None
+
+    @staticmethod
+    def get_instance(url=None, session=requests):
+        if urlManagerSingleton._instance is None:
+            urlManagerSingleton._instance = urlManager(url, session=session)
+        elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
+            urlManagerSingleton._instance = urlManager(url, session=session)
+        return urlManagerSingleton._instance
+
+def get_url(url=None, url_mgr=None):
+    if not url and not url_mgr:
+        return None
+    if url:
+        url_mgr = urlManager(url)
+    return url_mgr.url
+
+def get_url_mgr(url=None, url_mgr=None):
+    if url_mgr is None and url:
+        url_mgr = urlManager(url=url)
+    if url_mgr and url is None:
+        url = url_mgr.url
+    return url_mgr
abstract_webtools/managers/urlManager/urlManager.py
CHANGED
@@ -1,9 +1,9 @@
 import re
-import urllib.parse
+import logging
 import requests
 from urllib.parse import urlparse, urlunparse, urljoin
 
-from ...abstract_webtools import *
+logging.basicConfig(level=logging.INFO)
 
 class urlManager:
     """
@@ -13,30 +13,38 @@ class urlManager:
     scheme and domain, removing default ports, and cleaning up paths), and then creates
     a list of potential variants (with/without www, http/https) so that a valid version
     can be determined.
+
+    Now handles url=None gracefully: sets internals to None/empty and methods return None or empty values without errors.
     """
     def __init__(self, url=None, session=None):
-        url = url or 'www.example.com'
-        self._url = url
-        self.session = session or requests
-        self.clean_urls = self.clean_url(url)
-        self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
-        self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
-        self.all_urls = []
-
+        self._url = url # Allow None
+        self.session = session or requests.Session()
+        if self._url is None:
+            self.clean_urls = []
+            self.url = None
+            self.protocol = None
+            self.domain = None
+            self.path = ""
+            self.query = ""
+            self.all_urls = []
+        else:
+            self.clean_urls = self.clean_url()
+            self.url = self.get_correct_url() or self._url
+            self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+            self.all_urls = []
+
     def url_to_pieces(self, url):
         """
         Split a URL into protocol, domain, path, and query components.
         Uses urlparse for robustness.
         """
-        try:
-            parsed = urlparse(url)
-            protocol = parsed.scheme if parsed.scheme else None
-            domain = parsed.netloc if parsed.netloc else None
-            path = parsed.path or ""
-            query = parsed.query or ""
-        except Exception as e:
-            print(f'The URL {url} was not reachable: {e}')
-            protocol, domain, path, query = None, None, "", ""
+        if url is None:
+            return None, None, "", ""
+        parsed = urlparse(url)
+        protocol = parsed.scheme or None
+        domain = parsed.netloc or None
+        path = parsed.path or ""
+        query = parsed.query or ""
         return protocol, domain, path, query
 
     def clean_url(self, url=None) -> list:
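
The practical effect of the new constructor, sketched below. This assumes urlManager is importable from abstract_webtools.managers.urlManager (the 26-byte __init__.py in the RECORD suggests a re-export, but that is an assumption); constructing with a real URL would still issue network probes, so the sketch sticks to the url=None path:

from abstract_webtools.managers.urlManager import urlManager

mgr = urlManager()            # previously defaulted to 'www.example.com'
assert mgr.url is None        # None-safe internals instead of a fake default
assert mgr.clean_urls == []
assert mgr.domain is None and mgr.path == ""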
@@ -48,15 +56,19 @@
         - Adds a scheme (defaults to https) if missing.
         - Lowercases the scheme and domain.
         - Removes default ports.
-        - Cleans up the path (removing duplicate slashes and trailing slash).
+        - Cleans up the path (removing duplicate slashes and trailing slash if not a file-like path).
+        - Preserves params and query; strips fragment.
         - Generates variants with and without 'www', and with both http and https.
         """
-        url = url or self._url
+        url = (url or self._url) # Use self._url if url None
+        if url is None:
+            return []
         url = url.strip()
+        if not url:
+            return []
         # Ensure the URL has a scheme
-        if not re.match(r'https?://', url):
+        if not re.match(r'https?://', url, re.IGNORECASE):
             url = 'https://' + url
-
         parsed = urlparse(url)
         scheme = parsed.scheme.lower()
         netloc = parsed.netloc.lower()
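
The re.IGNORECASE flag fixes a real bug: an uppercase scheme previously failed the match and had a second 'https://' glued onto the front. A quick illustration:

import re

url = 'HTTPS://Example.com'
if not re.match(r'https?://', url):                  # old check: no match on uppercase
    url = 'https://' + url
print(url)  # https://HTTPS://Example.com  (malformed, doubled scheme)

url = 'HTTPS://Example.com'
if not re.match(r'https?://', url, re.IGNORECASE):   # new check: matches
    url = 'https://' + url
print(url)  # HTTPS://Example.com  (left intact)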
@@ -66,58 +78,55 @@
             if (scheme == "http" and port == "80") or (scheme == "https" and port == "443"):
                 netloc = host
 
-        # Normalize the path: remove duplicate slashes and a trailing slash
-        path = re.sub(r'//+', '/', parsed.path).rstrip('/')
+        # Normalize the path: remove duplicate slashes; rstrip '/' only if path isn't root or file-like
+        path = re.sub(r'//+', '/', parsed.path)
+        if path != '/' and '.' not in path.split('/')[-1]: # Fixed: check if last segment has '.' for file-like
+            path = path.rstrip('/')
 
-        # Rebuild the cleaned URL without query or fragment
-        cleaned_url = urlunparse((scheme, netloc, path, '', '', ''))
+        # Rebuild the cleaned URL, preserving params and query, stripping fragment
+        cleaned_url = urlunparse((scheme, netloc, path, parsed.params, parsed.query, ''))
 
-        variants = []
-        # Add the primary variant
-        variants.append(cleaned_url)
+        variants = [cleaned_url]
         # Generate a variant with/without 'www'
         if netloc.startswith('www.'):
             no_www = netloc[4:]
-            variants.append(urlunparse((scheme, no_www, path, '', '', '')))
+            variants.append(urlunparse((scheme, no_www, path, parsed.params, parsed.query, '')))
         else:
-            variants.append(urlunparse((scheme, f"www.{netloc}", path, '', '', '')))
+            variants.append(urlunparse((scheme, f"www.{netloc}", path, parsed.params, parsed.query, '')))
 
-        # Also generate variants with the alternate scheme
+        # Generate variants with the alternate scheme
         alt_scheme = 'http' if scheme == 'https' else 'https'
         for variant in list(variants):
             parsed_variant = urlparse(variant)
-            alt_variant = urlunparse((alt_scheme, parsed_variant.netloc, parsed_variant.path, '', '', ''))
+            alt_variant = urlunparse((alt_scheme, parsed_variant.netloc, parsed_variant.path, parsed_variant.params, parsed_variant.query, ''))
             variants.append(alt_variant)
 
         # Remove duplicates while preserving order
         seen = set()
-        unique_variants = []
-        for v in variants:
-            if v not in seen:
-                unique_variants.append(v)
-                seen.add(v)
+        unique_variants = [v for v in variants if v not in seen and not seen.add(v)]
+
+        # Sort to prefer HTTPS variants first
+        unique_variants.sort(key=lambda v: (not v.startswith('https'), v))
         return unique_variants
 
     def get_correct_url(self, url=None, clean_urls=None) -> str:
         """
-        Attempts each URL variant by making an HTTP GET request.
+        Attempts each URL variant by making an HTTP HEAD request (lighter than GET).
         Returns the first variant that returns a 200 OK response.
         """
-        if url is None and clean_urls is None:
-            url = self._url
-            clean_urls = self.clean_urls
-        if url is not None and clean_urls is None:
+        if self._url is None:
+            return None
+        clean_urls = clean_urls or self.clean_urls
+        url = url or self._url
+        if not clean_urls:
             clean_urls = self.clean_url(url)
-        elif url is None and clean_urls is not None:
-            url = self._url
-
         for candidate in clean_urls:
             try:
-                response = self.session.get(candidate, timeout=5)
+                response = self.session.head(candidate, timeout=5, allow_redirects=True)
                 if response.status_code == 200:
                     return candidate
             except requests.exceptions.RequestException as e:
-                print(f"Failed to reach {candidate}: {e}")
+                logging.info(f"Failed to reach {candidate}: {e}")
         return None
 
     def update_url(self, url):
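
Three details in this hunk are worth unpacking. The deduplication one-liner works because set.add returns None, so `not seen.add(v)` is always true and merely mutates the set as a side effect. The probe switch from GET to HEAD avoids downloading response bodies, though some servers answer HEAD with 405 where GET would return 200, so a GET fallback can still be worth adding. And the new trailing-slash rule keeps file-like paths intact; a standalone sketch of just that rule:

import re

def normalize_path(path):
    # Collapse duplicate slashes, then strip the trailing slash only when
    # the last segment does not look like a file name (no '.' in it).
    path = re.sub(r'//+', '/', path)
    if path != '/' and '.' not in path.split('/')[-1]:
        path = path.rstrip('/')
    return path

print(normalize_path('/docs//guide/'))        # '/docs/guide'
print(normalize_path('/downloads/file.txt'))  # '/downloads/file.txt' (kept)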
@@ -125,23 +134,26 @@
         Update the URL and refresh related attributes.
         """
         self._url = url
-        self.clean_urls = self.clean_url(url)
-        self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
-        self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
-        self.all_urls = []
+        if self._url is None:
+            self.clean_urls = []
+            self.url = None
+            self.protocol = None
+            self.domain = None
+            self.path = ""
+            self.query = ""
+            self.all_urls = []
+        else:
+            self.clean_urls = self.clean_url(url)
+            self.url = self.get_correct_url() or url
+            self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+            self.all_urls = []
 
     def get_domain(self, url=None):
+        if self._url is None and url is None:
+            return None
         url = url or self.url
         return urlparse(url).netloc
 
-    def url_join(self, base_url, path):
-        """
-        Joins a base URL with a relative path.
-        """
-        base_url = base_url.strip().rstrip('/')
-        path = path.strip().lstrip('/')
-        return f"{base_url}/{path}"
-
     @property
     def url(self):
         return self._url
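
With the hand-rolled url_join removed, the stdlib-backed urljoin method further down is the remaining joiner. One behavior worth knowing: urljoin is not plain concatenation, and a base path without a trailing slash loses its last segment:

from urllib.parse import urljoin

print(urljoin('https://example.com/docs/', 'guide'))  # https://example.com/docs/guide
print(urljoin('https://example.com/docs', 'guide'))   # https://example.com/guide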
@@ -154,51 +166,68 @@
         """
         Check if the given URL is valid.
         """
+        if url is None and self._url is None:
+            return False
         url = url or self.url
+        if url is None:
+            return False
         parsed = urlparse(url)
         return bool(parsed.scheme) and bool(parsed.netloc)
-
+
     def make_valid(self, href, url=None):
         """
         Validate a href. If it's not already valid, join it with the base URL.
         """
+        if self._url is None and url is None:
+            return None
         if self.is_valid_url(href):
             return href
         base = url or self.url
+        if base is None:
+            return None
         new_link = urljoin(base, href)
         if self.is_valid_url(new_link):
             return new_link
-        return False
+        return None
 
     def get_relative_href(self, base, href):
         """
         For a relative href, join it with the base URL and strip any query or fragment.
         """
+        if base is None:
+            return None
         joined = urljoin(base, href)
         parsed = urlparse(joined)
         clean_href = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
         return clean_href
 
     def url_basename(self, url=None):
+        if self._url is None and url is None:
+            return ""
         url = url or self.url
+        if url is None:
+            return ""
         path = urlparse(url).path
         return path.strip('/').split('/')[-1]
 
     def base_url(self, url=None):
+        if self._url is None and url is None:
+            return None
         url = url or self.url
-        match = re.match(r'https?://[^?#/]+/', url)
-        if match:
-            return match.group()
-        return None
+        if url is None:
+            return None
+        parsed = urlparse(url)
+        return urlunparse((parsed.scheme, parsed.netloc, '/', '', '', ''))
 
     def urljoin(self, base, path):
+        if base is None:
+            return None
         return urljoin(base, path)
 
 class urlManagerSingleton:
     _instance = None
-
     @staticmethod
-    def get_instance(url=None, session=requests):
+    def get_instance(url=None, session=requests.Session()):
         if urlManagerSingleton._instance is None:
             urlManagerSingleton._instance = urlManager(url, session=session)
         elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
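
One footnote on the singleton: a default argument such as session=requests.Session() is evaluated once at definition time, so every get_instance() call that omits session shares (and compares equal to) that single Session object. A common alternative, sketched here as an assumption rather than the package's actual code, evaluates the default inside the body:

@staticmethod
def get_instance(url=None, session=None):
    # Evaluate the default per call instead of at definition time.
    session = session or requests.Session()
    ...

Note this would interact with the inequality check above: a fresh per-call default never compares equal to the stored session, so the instance would be recreated on every call unless that check were adjusted too.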
@@ -208,13 +237,13 @@
 def get_url(url=None, url_mgr=None):
     if not url and not url_mgr:
         return None
-    if url:
+    if url_mgr is None and url is not None:
         url_mgr = urlManager(url)
-    return url_mgr.url
+    return url_mgr.url if url_mgr else None
 
 def get_url_mgr(url=None, url_mgr=None):
-    if url_mgr is None and url:
-        url_mgr = urlManager(url=url)
+    if url_mgr is None:
+        url_mgr = urlManager(url=url) # Always create instance, even if url=None
     if url_mgr and url is None:
         url = url_mgr.url
     return url_mgr
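
With these guards the module-level helpers degrade gracefully when nothing is supplied; a short usage sketch (same import-path assumption as earlier):

from abstract_webtools.managers.urlManager import get_url, get_url_mgr

mgr = get_url_mgr()           # now returns a urlManager even when url is None
print(get_url(url_mgr=mgr))   # None instead of an AttributeError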
abstract_webtools/soup_gui.py
CHANGED
File without changes

{abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.138.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.136
+Version: 0.1.6.138
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.138.dist-info}/RECORD
RENAMED
@@ -1,5 +1,4 @@
 abstract_webtools/__init__.py,sha256=ubqE7spnA8YUcPkXP1-niLIuza07_GGIq2ww3d2Gdsk,118
-abstract_webtools/abstract_userpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-P2w1u6ng,6462
 abstract_webtools/abstract_usurpit.py,sha256=2idbYXLFhXh8VPfdYgWICNH8dehnZRCdt4U5sTsVxo4,9663
 abstract_webtools/abstract_webtools.py,sha256=3NzGmJlZvrdVtEcUi2K5iUgWr1822IBPhIN9us2e2t0,3859
 abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
@@ -11,7 +10,6 @@ abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,132
 abstract_webtools/soup_gui.py,sha256=n95YAps1R6DpMwR4UbthSqQby0C5WHUa9tsW-f2qpLg,5184
 abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE70B8,10441
 abstract_webtools/url_grabber_new.py,sha256=xb23qo4anOY0Ax3CAfaHJ8s5VEz61Sinh-XpEDFW7Is,3621
-abstract_webtools/__pycache__/abstract_webtools.cpython-312.pyc,sha256=Rb2nPDCUG6i7nEs-I128lozwKteIVXzZxygV-zJVALs,4606
 abstract_webtools/managers/__init__.py,sha256=9pgy52NB-ONxLqoCRF52GZ6G7GM6Uc0-fgA1HvKcwxc,407
 abstract_webtools/managers/allss\.py,sha256=IBhlyRQHfK-BtwUnSEbIPqlI1MtZ8-XsdaHv0b91HQ0,269
 abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
@@ -21,15 +19,11 @@ abstract_webtools/managers/curlMgr.py,sha256=ghi0QsSAxjZu3HALFST5Kv_262XhHSAPGlQ
 abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
 abstract_webtools/managers/dynamicRateLimiter.py,sha256=ycn5VQEPnmxjNMew4IVh-t5t43jhM39uypoOK9bJDDg,7662
 abstract_webtools/managers/get_test.py,sha256=nISrhUGdyvRv18wTGoifGhizBFoHeK0N3FymMASloFw,825
-abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
 abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
 abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
-abstract_webtools/managers/
-abstract_webtools/managers/seleniumManager.py,sha256=qSY8gH3N5YJIMwE_Alj9HNQRip_PziIo4_T9AZE_FQo,4273
-abstract_webtools/managers/soupManager.py,sha256=-_mRCWlyzfKlF64UU53WXBmCvJ98jQ4GyHh8S8Pw3xs,17198
+abstract_webtools/managers/seleniumManager.py,sha256=RRpA1_oOnZuzzQ4S6VX7tDFcI31E_mOou2CZOOZH6yI,4274
 abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
 abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
-abstract_webtools/managers/urlManager.py,sha256=Dvf-TiSo5j_YjZS2Eq6lFfbhveneD6NA_wEE0xUXy_E,8858
 abstract_webtools/managers/userAgentManager.py,sha256=cUaOlcCTzftVBCp9ZHwMXR9IB1wAE-03YSVwUBaIFLM,2514
 abstract_webtools/managers/videoDownloader.py,sha256=mKXhKYNnJwPaiqAsHvFTBGLdXFgR3wdV0G1OIimiKbE,15424
 abstract_webtools/managers/videoDownloader2.py,sha256=v3H6akdhvVWGrB-r35m3cp_-aKkNWadpfCiMylOnv6w,12748
@@ -43,9 +37,9 @@ abstract_webtools/managers/soupManager/__init__.py,sha256=mqfXfqM9sWlYpOkoXUqtBo
 abstract_webtools/managers/soupManager/asoueces.py,sha256=OaXqolZl0dI7b09NYwJ3Wnhuxf89ahZ1GjsOqy0GXfk,3506
 abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2sIkg-bHxBt2mKpYMyZd-nJjLQ,17201
 abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
-abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
-abstract_webtools/managers/
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools/managers/urlManager/urlManager (Copy).py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
+abstract_webtools/managers/urlManager/urlManager.py,sha256=LG8WiEsf16vMzum48D5rmbRNK6VzYG4FyOTr2FJiOEc,9133
+abstract_webtools-0.1.6.138.dist-info/METADATA,sha256=wBxaIqQkImZwBOPVoOzJYc2UAp3jxtuL7y9SWULLiUY,7289
+abstract_webtools-0.1.6.138.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+abstract_webtools-0.1.6.138.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.138.dist-info/RECORD,,
{abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.138.dist-info}/top_level.txt
RENAMED
File without changes