abstract-webtools 0.1.6.136__tar.gz → 0.1.6.138__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/setup.py +1 -1
  3. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/seleniumManager.py +1 -1
  4. abstract_webtools-0.1.6.138/src/abstract_webtools/managers/urlManager/urlManager.py +249 -0
  5. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
  6. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools.egg-info/SOURCES.txt +1 -0
  7. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  8. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools.egg-info/requires.txt +0 -0
  9. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools.egg-info/top_level.txt +0 -0
  10. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/README.md +0 -0
  11. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/pyproject.toml +0 -0
  12. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/setup.cfg +0 -0
  13. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/__init__.py +0 -0
  14. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/abstract_usurpit.py +0 -0
  15. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/abstract_webtools.py +0 -0
  16. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/big_user_agent_list.py +0 -0
  17. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/domain_identifier.py +0 -0
  18. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/extention_list.py +0 -0
  19. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/find_dirs.py +0 -0
  20. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/k2s_downloader.py +0 -0
  21. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/main.py +0 -0
  22. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/__init__.py +0 -0
  23. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/allss//.py" +0 -0
  24. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/cipherManager.py +0 -0
  25. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/clownworld/__init__.py +0 -0
  26. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py +0 -0
  27. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/crawlManager.py +0 -0
  28. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
  29. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/curlMgr.py +0 -0
  30. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/domainManager.py +0 -0
  31. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  32. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/get_test.py +0 -0
  33. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
  34. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
  35. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  36. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/networkManager.py +0 -0
  37. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
  38. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/requestManager/requestManager.py +0 -0
  39. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
  40. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
  41. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
  42. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/sslManager.py +0 -0
  43. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  44. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
  45. /abstract_webtools-0.1.6.136/src/abstract_webtools/managers/urlManager/urlManager.py → /abstract_webtools-0.1.6.138/src/abstract_webtools/managers/urlManager/urlManager (Copy).py +0 -0
  46. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  47. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  48. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
  49. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/soup_gui.py +0 -0
  50. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/url_grabber.py +0 -0
  51. {abstract_webtools-0.1.6.136 → abstract_webtools-0.1.6.138}/src/abstract_webtools/url_grabber_new.py +0 -0
--- abstract_webtools-0.1.6.136/PKG-INFO
+++ abstract_webtools-0.1.6.138/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.136
+Version: 0.1.6.138
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
--- abstract_webtools-0.1.6.136/setup.py
+++ abstract_webtools-0.1.6.138/setup.py
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.136',
+    version='0.1.6.138',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
--- abstract_webtools-0.1.6.136/src/abstract_webtools/managers/seleniumManager.py
+++ abstract_webtools-0.1.6.138/src/abstract_webtools/managers/seleniumManager.py
@@ -21,7 +21,7 @@ from selenium.webdriver.chrome.options import Options
 
 # Setup Chrome options
 chrome_options = Options()
-chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
+#chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
 chrome_options.add_argument("--headless")  # Run in headless mode
 chrome_options.add_argument("--no-sandbox")
 chrome_options.add_argument("--disable-dev-shm-usage")
--- /dev/null
+++ abstract_webtools-0.1.6.138/src/abstract_webtools/managers/urlManager/urlManager.py
@@ -0,0 +1,249 @@
+import re
+import logging
+import requests
+from urllib.parse import urlparse, urlunparse, urljoin
+
+logging.basicConfig(level=logging.INFO)
+
+class urlManager:
+    """
+    Revised urlManager for managing and cleaning URLs.
+
+    It splits URLs into their components, normalizes them (trimming spaces, lowercasing
+    scheme and domain, removing default ports, and cleaning up paths), and then creates
+    a list of potential variants (with/without www, http/https) so that a valid version
+    can be determined.
+
+    Now handles url=None gracefully: sets internals to None/empty and methods return None or empty values without errors.
+    """
+    def __init__(self, url=None, session=None):
+        self._url = url  # Allow None
+        self.session = session or requests.Session()
+        if self._url is None:
+            self.clean_urls = []
+            self.url = None
+            self.protocol = None
+            self.domain = None
+            self.path = ""
+            self.query = ""
+            self.all_urls = []
+        else:
+            self.clean_urls = self.clean_url()
+            self.url = self.get_correct_url() or self._url
+            self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+            self.all_urls = []
+
+    def url_to_pieces(self, url):
+        """
+        Split a URL into protocol, domain, path, and query components.
+        Uses urlparse for robustness.
+        """
+        if url is None:
+            return None, None, "", ""
+        parsed = urlparse(url)
+        protocol = parsed.scheme or None
+        domain = parsed.netloc or None
+        path = parsed.path or ""
+        query = parsed.query or ""
+        return protocol, domain, path, query
+
+    def clean_url(self, url=None) -> list:
+        """
+        Normalize and clean the URL, then return a list of potential URL variants.
+
+        This method:
+          - Strips whitespace.
+          - Adds a scheme (defaults to https) if missing.
+          - Lowercases the scheme and domain.
+          - Removes default ports.
+          - Cleans up the path (removing duplicate slashes and trailing slash if not a file-like path).
+          - Preserves params and query; strips fragment.
+          - Generates variants with and without 'www', and with both http and https.
+        """
+        url = (url or self._url)  # Use self._url if url is None
+        if url is None:
+            return []
+        url = url.strip()
+        if not url:
+            return []
+        # Ensure the URL has a scheme
+        if not re.match(r'https?://', url, re.IGNORECASE):
+            url = 'https://' + url
+        parsed = urlparse(url)
+        scheme = parsed.scheme.lower()
+        netloc = parsed.netloc.lower()
+        # Remove default port numbers if present
+        if ':' in netloc:
+            host, port = netloc.split(':', 1)
+            if (scheme == "http" and port == "80") or (scheme == "https" and port == "443"):
+                netloc = host
+
+        # Normalize the path: remove duplicate slashes; rstrip '/' only if path isn't root or file-like
+        path = re.sub(r'//+', '/', parsed.path)
+        if path != '/' and '.' not in path.split('/')[-1]:  # Fixed: check if last segment has '.' for file-like
+            path = path.rstrip('/')
+
+        # Rebuild the cleaned URL, preserving params and query, stripping fragment
+        cleaned_url = urlunparse((scheme, netloc, path, parsed.params, parsed.query, ''))
+
+        variants = [cleaned_url]
+        # Generate a variant with/without 'www'
+        if netloc.startswith('www.'):
+            no_www = netloc[4:]
+            variants.append(urlunparse((scheme, no_www, path, parsed.params, parsed.query, '')))
+        else:
+            variants.append(urlunparse((scheme, f"www.{netloc}", path, parsed.params, parsed.query, '')))
+
+        # Generate variants with the alternate scheme
+        alt_scheme = 'http' if scheme == 'https' else 'https'
+        for variant in list(variants):
+            parsed_variant = urlparse(variant)
+            alt_variant = urlunparse((alt_scheme, parsed_variant.netloc, parsed_variant.path, parsed_variant.params, parsed_variant.query, ''))
+            variants.append(alt_variant)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_variants = [v for v in variants if v not in seen and not seen.add(v)]
+
+        # Sort to prefer HTTPS variants first
+        unique_variants.sort(key=lambda v: (not v.startswith('https'), v))
+        return unique_variants
+
+    def get_correct_url(self, url=None, clean_urls=None) -> str:
+        """
+        Attempts each URL variant by making an HTTP HEAD request (lighter than GET).
+        Returns the first variant that returns a 200 OK response.
+        """
+        if self._url is None:
+            return None
+        clean_urls = clean_urls or self.clean_urls
+        url = url or self._url
+        if not clean_urls:
+            clean_urls = self.clean_url(url)
+        for candidate in clean_urls:
+            try:
+                response = self.session.head(candidate, timeout=5, allow_redirects=True)
+                if response.status_code == 200:
+                    return candidate
+            except requests.exceptions.RequestException as e:
+                logging.info(f"Failed to reach {candidate}: {e}")
+        return None
+
+    def update_url(self, url):
+        """
+        Update the URL and refresh related attributes.
+        """
+        self._url = url
+        if self._url is None:
+            self.clean_urls = []
+            self.url = None
+            self.protocol = None
+            self.domain = None
+            self.path = ""
+            self.query = ""
+            self.all_urls = []
+        else:
+            self.clean_urls = self.clean_url(url)
+            self.url = self.get_correct_url() or url
+            self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+            self.all_urls = []
+
+    def get_domain(self, url=None):
+        if self._url is None and url is None:
+            return None
+        url = url or self.url
+        return urlparse(url).netloc
+
+    @property
+    def url(self):
+        return self._url
+
+    @url.setter
+    def url(self, new_url):
+        self._url = new_url
+
+    def is_valid_url(self, url=None):
+        """
+        Check if the given URL is valid.
+        """
+        if url is None and self._url is None:
+            return False
+        url = url or self.url
+        if url is None:
+            return False
+        parsed = urlparse(url)
+        return bool(parsed.scheme) and bool(parsed.netloc)
+
+    def make_valid(self, href, url=None):
+        """
+        Validate a href. If it's not already valid, join it with the base URL.
+        """
+        if self._url is None and url is None:
+            return None
+        if self.is_valid_url(href):
+            return href
+        base = url or self.url
+        if base is None:
+            return None
+        new_link = urljoin(base, href)
+        if self.is_valid_url(new_link):
+            return new_link
+        return None
+
+    def get_relative_href(self, base, href):
+        """
+        For a relative href, join it with the base URL and strip any query or fragment.
+        """
+        if base is None:
+            return None
+        joined = urljoin(base, href)
+        parsed = urlparse(joined)
+        clean_href = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
+        return clean_href
+
+    def url_basename(self, url=None):
+        if self._url is None and url is None:
+            return ""
+        url = url or self.url
+        if url is None:
+            return ""
+        path = urlparse(url).path
+        return path.strip('/').split('/')[-1]
+
+    def base_url(self, url=None):
+        if self._url is None and url is None:
+            return None
+        url = url or self.url
+        if url is None:
+            return None
+        parsed = urlparse(url)
+        return urlunparse((parsed.scheme, parsed.netloc, '/', '', '', ''))
+
+    def urljoin(self, base, path):
+        if base is None:
+            return None
+        return urljoin(base, path)
+
+class urlManagerSingleton:
+    _instance = None
+    @staticmethod
+    def get_instance(url=None, session=requests.Session()):
+        if urlManagerSingleton._instance is None:
+            urlManagerSingleton._instance = urlManager(url, session=session)
+        elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
+            urlManagerSingleton._instance = urlManager(url, session=session)
+        return urlManagerSingleton._instance
+
+def get_url(url=None, url_mgr=None):
+    if not url and not url_mgr:
+        return None
+    if url_mgr is None and url is not None:
+        url_mgr = urlManager(url)
+    return url_mgr.url if url_mgr else None
+
+def get_url_mgr(url=None, url_mgr=None):
+    if url_mgr is None:
+        url_mgr = urlManager(url=url)  # Always create instance, even if url=None
+    if url_mgr and url is None:
+        url = url_mgr.url
+    return url_mgr
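For context, a short usage sketch of the new module (not part of the diff). It assumes the module path shown above is importable; the URL is a stand-in, and get_correct_url() issues real HTTP HEAD requests, so results depend on network access:

from abstract_webtools.managers.urlManager.urlManager import urlManager, get_url_mgr

mgr = urlManager("example.com/docs/")
print(mgr.clean_urls)               # HTTPS-first variants: with/without www, http/https
print(mgr.url)                      # First variant answering HEAD with 200, else the raw input
print(mgr.protocol, mgr.domain, mgr.path)

# url=None is handled gracefully: attributes are None/empty, nothing raises
empty = get_url_mgr()
print(empty.url, empty.clean_urls)  # -> None []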
--- abstract_webtools-0.1.6.136/src/abstract_webtools.egg-info/PKG-INFO
+++ abstract_webtools-0.1.6.138/src/abstract_webtools.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.136
+Version: 0.1.6.138
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
--- abstract_webtools-0.1.6.136/src/abstract_webtools.egg-info/SOURCES.txt
+++ abstract_webtools-0.1.6.138/src/abstract_webtools.egg-info/SOURCES.txt
@@ -46,4 +46,5 @@ src/abstract_webtools/managers/soupManager/__init__.py
 src/abstract_webtools/managers/soupManager/asoueces.py
 src/abstract_webtools/managers/soupManager/soupManager.py
 src/abstract_webtools/managers/urlManager/__init__.py
+src/abstract_webtools/managers/urlManager/urlManager (Copy).py
 src/abstract_webtools/managers/urlManager/urlManager.py