abstract-webtools 0.1.6.38__py3-none-any.whl → 0.1.6.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,35 +1,29 @@
1
1
  import os
2
- from ..abstract_webtools import *
3
- from .urlManager import *
4
2
  from urllib.parse import urlparse
5
- from abstract_utilities import *
6
3
  from selenium import webdriver
7
4
  from selenium.webdriver.chrome.options import Options
5
+ from selenium.webdriver.chrome.service import Service
6
+ from webdriver_manager.chrome import ChromeDriverManager # For automatic ChromeDriver installation
8
7
  import logging
9
8
  import urllib3
9
+ from ..abstract_webtools import * # Assuming this is a valid import
10
+ from .urlManager import *
10
11
 
11
12
  # Suppress urllib3 warnings and debug logs
12
13
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
13
14
  logging.getLogger("urllib3").setLevel(logging.WARNING)
14
-
15
- # Suppress Selenium logs
16
15
  logging.getLogger("selenium").setLevel(logging.WARNING)
17
16
 
18
- import os
19
- from selenium import webdriver
20
- from selenium.webdriver.chrome.options import Options
21
-
22
- # Setup Chrome options
23
- chrome_options = Options()
24
- chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
25
- chrome_options.add_argument("--headless") # Run in headless mode
26
- chrome_options.add_argument("--no-sandbox")
27
- chrome_options.add_argument("--disable-dev-shm-usage")
28
- chrome_options.add_argument("--disable-gpu")
29
- chrome_options.add_argument("--disable-software-rasterizer")
30
- chrome_options.add_argument("--disable-extensions")
31
- chrome_options.add_argument("--remote-debugging-port=9222")
32
-
17
+ # Default Chrome options (can be overridden)
18
+ DEFAULT_CHROME_OPTIONS = [
19
+ "--headless", # Run in headless mode
20
+ "--no-sandbox",
21
+ "--disable-dev-shm-usage", # Avoid memory issues on servers
22
+ "--disable-gpu",
23
+ "--disable-software-rasterizer",
24
+ "--disable-extensions",
25
+ "--remote-debugging-port=9222"
26
+ ]
33
27
 
34
28
  class SingletonMeta(type):
35
29
  _instances = {}
@@ -39,21 +33,21 @@ class SingletonMeta(type):
39
33
  cls._instances[cls] = instance
40
34
  return cls._instances[cls]
41
35
 
42
- class seleniumManager(metaclass=SingletonMeta):
36
+ class SeleniumManager(metaclass=SingletonMeta):
43
37
  def __init__(self, url):
44
38
  if not hasattr(self, 'initialized'): # Prevent reinitialization
45
39
  self.initialized = True
46
40
  parsed_url = urlparse(url)
47
41
  self.domain = parsed_url.netloc
48
- self.scheme = parsed_url.scheme
49
- self.base_url= f"{self.scheme}{self.domain}"
42
+ self.scheme = parsed_url.scheme or "https" # Default to https if scheme is missing
43
+ self.base_url = f"{self.scheme}://{self.domain}"
50
44
  self.site_dir = os.path.join(os.getcwd(), self.domain)
51
45
  os.makedirs(self.site_dir, exist_ok=True)
52
46
  self.drivers = {}
53
47
  self.page_type = []
54
-
48
+
55
49
  def get_url_to_path(self, url):
56
- url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
50
+ url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/']) # Assuming eatAll is defined elsewhere
57
51
  parsed_url = urlparse(url)
58
52
  if parsed_url.netloc == self.domain:
59
53
  paths = parsed_url.path.split('/')
@@ -61,59 +55,74 @@ class seleniumManager(metaclass=SingletonMeta):
61
55
  for path in paths[:-1]:
62
56
  dir_path = os.path.join(dir_path, path)
63
57
  os.makedirs(dir_path, exist_ok=True)
64
- self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
65
-
58
+ self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if not self.page_type else self.page_type[-1])
66
59
  dir_path = os.path.join(dir_path, paths[-1])
67
60
  return dir_path
68
61
 
69
62
  def saved_url_check(self, url):
70
- path = self.get_url_to_path(url)
71
- return path
63
+ return self.get_url_to_path(url)
72
64
 
73
65
  def get_with_netloc(self, url):
74
66
  parsed_url = urlparse(url)
75
- if parsed_url.netloc == '':
76
- url = f"{self.scheme}://{self.domain}/{url.strip()}"
67
+ if not parsed_url.netloc:
68
+ url = f"{self.scheme}://{self.domain}/{url.strip('/')}"
77
69
  return url
78
70
 
79
71
  def get_driver(self, url):
80
72
  if url and url not in self.drivers:
73
+ # Set up Chrome options
81
74
  chrome_options = Options()
82
- chrome_options.add_argument("--headless")
83
- driver = webdriver.Chrome(options=chrome_options)
75
+ for option in DEFAULT_CHROME_OPTIONS:
76
+ chrome_options.add_argument(option)
77
+
78
+ # Specify Chrome binary location if needed (optional, comment out if not applicable)
79
+ # chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
80
+
81
+ # Automatically install and use ChromeDriver
82
+ service = Service(ChromeDriverManager().install())
83
+ driver = webdriver.Chrome(service=service, options=chrome_options)
84
+
84
85
  self.drivers[url] = driver
85
86
  driver.get(url)
86
87
  return self.drivers[url]
88
+
89
+ def quit_driver(self, url):
90
+ """Clean up a specific driver instance."""
91
+ if url in self.drivers:
92
+ self.drivers[url].quit()
93
+ del self.drivers[url]
94
+
95
+ def quit_all_drivers(self):
96
+ """Clean up all driver instances."""
97
+ for driver in self.drivers.values():
98
+ driver.quit()
99
+ self.drivers.clear()
100
+
87
101
  def normalize_url(url, base_url=None):
88
- """
89
- Normalize and resolve relative URLs, ensuring proper domain and format.
90
- """
91
- # If URL starts with the base URL repeated, remove the extra part
92
- manager = seleniumManager(url)
93
- base_url = manager.base_url
102
+ """Normalize and resolve relative URLs."""
103
+ manager = SeleniumManager(url)
104
+ base_url = manager.base_url if base_url is None else base_url
94
105
  if url.startswith(base_url):
95
106
  url = url[len(base_url):]
96
-
97
- # Resolve the URL against the base URL
98
107
  normalized_url = urljoin(base_url, url.split('#')[0])
99
-
100
- # Ensure only URLs belonging to the base domain are kept
101
108
  if not normalized_url.startswith(base_url):
102
109
  return None
103
-
104
110
  return normalized_url
105
- # Function to get Selenium page source
111
+
106
112
  def get_selenium_source(url):
107
- url_mgr = urlManager(url)
113
+ """Fetch page source using Selenium."""
114
+ url_mgr = urlManager(url) # Assuming urlManager is defined elsewhere
108
115
  if url_mgr.url:
109
116
  url = str(url_mgr.url)
110
- manager = seleniumManager(url)
117
+ manager = SeleniumManager(url)
111
118
  driver = manager.get_driver(url)
112
119
  try:
113
- # Get page source
114
- page_source = driver.page_source
115
- return page_source
116
- finally:
117
- # Don't quit the driver unless you're done with all interactions
118
- pass
120
+ return driver.page_source
121
+ except Exception as e:
122
+ logging.error(f"Error fetching page source for {url}: {e}")
123
+ return None
124
+ # Note: Driver is not quit here to maintain Singleton behavior
119
125
 
126
+ # Ensure cleanup on program exit (optional)
127
+ import atexit
128
+ atexit.register(lambda: SeleniumManager(url="").quit_all_drivers()) # Cleanup all drivers on exit
@@ -0,0 +1,135 @@
1
+ text = """{"title": "NoviSoul
2
+ novissbm@gmail.com", "href": "http://www.youtube.com/signin?authuser=0&next=%2Fwatch%3Fv%3DEaIYRM1yrM4&action_handle_signin=true", "description": ""},
3
+ {"title": "Sign in", "href": "https://accounts.google.com/ServiceLogin?continue=http%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26hl%3Den_GB%26next%3D%252Fwatch%253Fv%253DEaIYRM1yrM4%26nomobiletemp%3D1&uilel=3&service=youtube&passive=true&hl=en_GB", "description": ""},
4
+ {"title": "Sign up", "href": "http://www.youtube.com/signup?next=%2Fwatch%3Fv%3DEaIYRM1yrM4", "description": ""},
5
+ {"title": "9:58
6
+
7
+
8
+
9
+
10
+
11
+ Physics of Free Energy Deviceby Eugene Jeong
12
+
13
+ 336,881 views", "href": "http://www.youtube.com/watch?v=EB-jWfzkz_E", "description": ""},
14
+ {"title": "4:49
15
+
16
+
17
+
18
+
19
+
20
+ [www.witts.ws] Self-Running 40kW (40,000 Watt) Fuelless Generator (1 of 3)by wits2014
21
+
22
+ 488,638 views", "href": "http://www.youtube.com/watch?v=LFu-s6ZmGyE", "description": ""},
23
+ {"title": "2:33
24
+
25
+
26
+
27
+
28
+
29
+ Free Energy - Evidence of military antigravity technologyby DoubleMarkez
30
+
31
+ 390,020 views", "href": "http://www.youtube.com/watch?v=qljY-YfFaPc", "description": ""},
32
+ {"title": "15:01
33
+
34
+
35
+
36
+
37
+
38
+ APEX 2013 SSBM L10 Shroomed VS CT EMP Mew2Kingby Jason AxelrodRecommended for you", "href": "http://www.youtube.com/watch?v=pc7v49k5FhY", "description": ""},
39
+ {"title": "161
40
+
41
+
42
+ videos
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+ Play all
59
+
60
+
61
+
62
+ washby dle3276", "href": "http://www.youtube.com/watch?v=AmcSt5hU4qA&list=PL4517CA6C6244A844", "description": ""},
63
+ {"title": "10:31
64
+
65
+
66
+
67
+
68
+
69
+ Pyramid Magnet - free energy - english subtitleby MrTermsof
70
+
71
+ 616,081 views", "href": "http://www.youtube.com/watch?v=pMbHswNoGWM", "description": ""},
72
+ {"title": "4:11
73
+
74
+
75
+
76
+
77
+
78
+ My all new newman motor 1.(TheDaftman)by theDaftman
79
+
80
+ 1,147,470 views", "href": "http://www.youtube.com/watch?v=dL4B_DNBtvc", "description": ""},
81
+ {"title": "2:18
82
+
83
+
84
+
85
+
86
+
87
+ Is there free energy in magnets?by aetherix01
88
+
89
+ 371,642 views", "href": "http://www.youtube.com/watch?v=vrn5B9a8aOk", "description": ""},
90
+ {"title": "3:00
91
+
92
+
93
+
94
+
95
+
96
+ The Most Dangerous Video On The Internet - Trevor Paglenby killuminati63
97
+
98
+ 585,755 views", "href": "http://www.youtube.com/watch?v=9xEuhEHDJM8", "description": ""},
99
+ {"title": "2:18
100
+
101
+
102
+
103
+
104
+
105
+ Free Energy - Magnet Motorby ATBootstrap
106
+
107
+ 358,641 views", "href": "http://www.youtube.com/watch?v=hfkwCE3BeBs", "description": ""},
108
+ {"title": "2:38
109
+
110
+
111
+
112
+
113
+
114
+ 100% free energy generator is easy to buildby LifeHack2012
115
+
116
+ 238,092 views", "href": "http://www.youtube.com/watch?v=GEUyhhMEs7U", "description": ""},
117
+ {"title": "3:41
118
+
119
+
120
+
121
+
122
+
123
+ 5KW free energy –±–µ—Å—Ç–æ–ø–ª–∏–≤–Ω—ã–π –≥–µ–Ω–µ—Ä–∞—Ç–æ—Ä Kapanadze –ö–∞–ø–∞–Ω–∞–¥–∑–µby Alexander Frolov
124
+
125
+ 488,213 views", "href": "http://www.youtube.com/watch?v=uxQ99R4gOWY", "description": ""},""".split('\n')
126
+ sources = ' '.join([te for te in text if te])
127
+ while True:
128
+ if ' ' in sources:
129
+ sources = sources.replace(' ',' ').replace('\t',' ')
130
+ else:
131
+ break
132
+ sources = sources.replace('}, {','},{').replace('},{','},\n{')
133
+ input(sources)
134
+
135
+
@@ -1,230 +1,220 @@
1
+ import re
2
+ import urllib.parse
3
+ import requests
4
+ from urllib.parse import urlparse, urlunparse, urljoin
5
+
1
6
  from ...abstract_webtools import *
7
+
2
8
  class urlManager:
3
9
  """
4
- urlManager is a class for managing URLs, including cleaning, validating, and finding the correct version.
5
-
6
- Args:
7
- url (str or None): The URL to manage (default is None).
8
- session (requests.Session): A custom requests session (default is the requests module's session).
9
-
10
- Attributes:
11
- session (requests.Session): The requests session used for making HTTP requests.
12
- clean_urls (list): List of cleaned URL variations.
13
- url (str): The current URL.
14
- protocol (str): The protocol part of the URL (e.g., "https").
15
- domain (str): The domain part of the URL (e.g., "example.com").
16
- path (str): The path part of the URL (e.g., "/path/to/resource").
17
- query (str): The query part of the URL (e.g., "?param=value").
18
- all_urls (list): List of all URLs (not used in the provided code).
19
-
20
- Methods:
21
- url_to_pieces(url): Split a URL into its protocol, domain, path, and query components.
22
- clean_url(url): Return a list of potential URL versions with and without 'www' and 'http(s)'.
23
- get_correct_url(url): Get the correct version of the URL from possible variations.
24
- update_url(url): Update the URL and related attributes.
25
- get_domain(url): Get the domain name from a URL.
26
- url_join(url, path): Join a base URL with a path.
27
- is_valid_url(url): Check if a URL is valid.
28
- make_valid(href, url): Make a URL valid by joining it with a base URL.
29
- get_relative_href(url, href): Get the relative href URL by joining it with a base URL.
30
-
31
- Note:
32
- - The urlManager class provides methods for managing URLs, including cleaning and validating them.
33
- - It also includes methods for joining and validating relative URLs.
10
+ Revised urlManager for managing and cleaning URLs.
11
+
12
+ It splits URLs into their components, normalizes them (trimming spaces, lowercasing
13
+ scheme and domain, removing default ports, and cleaning up paths), and then creates
14
+ a list of potential variants (with/without www, http/https) so that a valid version
15
+ can be determined.
34
16
  """
35
-
36
17
  def __init__(self, url=None, session=None):
37
- """
38
- Initialize a urlManager instance.
39
-
40
- Args:
41
- url (str or None): The URL to manage (default is None).
42
- session (requests.Session): A custom requests session (default is the requests module's session).
43
- """
44
18
  url = url or 'www.example.com'
45
- self._url=url
46
- self.url = url
47
- self.session= session or requests
48
- self.clean_urls = self.clean_url(url=url)
49
- self.url = self.get_correct_url(clean_urls=self.clean_urls)
50
- url_pieces = self.url_to_pieces(url=self.url)
51
- self.protocol,self.domain,self.path,self.query=url_pieces
19
+ self._url = url
20
+ self.session = session or requests
21
+ self.clean_urls = self.clean_url(url)
22
+ self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
23
+ self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
52
24
  self.all_urls = []
25
+
53
26
  def url_to_pieces(self, url):
54
-
27
+ """
28
+ Split a URL into protocol, domain, path, and query components.
29
+ Uses urlparse for robustness.
30
+ """
55
31
  try:
56
- match = re.match(r'^(https?)?://?([^/]+)(/[^?]+)?(\?.+)?', url)
57
- if match:
58
- protocol = match.group(1) if match.group(1) else None
59
- domain = match.group(2) if match.group(1) else None
60
- path = match.group(3) if match.group(3) else "" # Handle None
61
- query = match.group(4) if match.group(4) else "" # Handle None
62
- except:
63
- print(f'the url {url} was not reachable')
64
- protocol,domain,path,query=None,None,"",""
32
+ parsed = urlparse(url)
33
+ protocol = parsed.scheme if parsed.scheme else None
34
+ domain = parsed.netloc if parsed.netloc else None
35
+ path = parsed.path or ""
36
+ query = parsed.query or ""
37
+ except Exception as e:
38
+ print(f'The URL {url} was not reachable: {e}')
39
+ protocol, domain, path, query = None, None, "", ""
65
40
  return protocol, domain, path, query
66
41
 
67
- def clean_url(self,url=None) -> list:
68
- """
69
- Given a URL, return a list with potential URL versions including with and without 'www.',
70
- and with 'http://' and 'https://'.
71
- """
72
- url = url or self.url
73
- urls=[]
74
- if url:
75
- # Remove http:// or https:// prefix
76
- cleaned = url.replace("http://", "").replace("https://", "")
77
- no_subdomain = cleaned.replace("www.", "", 1)
78
-
79
- urls = [
80
- f"https://{cleaned}",
81
- f"http://{cleaned}",
82
- ]
83
-
84
- # Add variants without 'www' if it was present
85
- if cleaned != no_subdomain:
86
- urls.extend([
87
- f"https://{no_subdomain}",
88
- f"http://{no_subdomain}",
89
- ])
90
-
91
- # Add variants with 'www' if it wasn't present
92
- else:
93
- urls.extend([
94
- f"https://www.{cleaned}",
95
- f"http://www.{cleaned}",
96
- ])
97
-
98
- return urls
99
-
100
- def get_correct_url(self,url=None,clean_urls=None) -> (str or None):
101
- """
102
- Gets the correct URL from the possible variations by trying each one with an HTTP request.
103
-
104
- Args:
105
- url (str): The URL to find the correct version of.
106
- session (type(requests.Session), optional): The requests session to use for making HTTP requests.
107
- Defaults to requests.
108
-
109
- Returns:
110
- str: The correct version of the URL if found, or None if none of the variations are valid.
111
- """
112
- self.url = url
113
- if url==None and clean_urls != None:
114
- if self.url:
115
- url=self.url or clean_urls[0]
116
- if url!=None and clean_urls==None:
117
- clean_urls=self.clean_url(url)
118
- elif url==None and clean_urls==None:
119
- url=self.url
120
- clean_urls=self.clean_urls
121
- # Get the correct URL from the possible variations
122
- for url in clean_urls:
42
+ def clean_url(self, url=None) -> list:
43
+ """
44
+ Normalize and clean the URL, then return a list of potential URL variants.
45
+
46
+ This method:
47
+ - Strips whitespace.
48
+ - Adds a scheme (defaults to https) if missing.
49
+ - Lowercases the scheme and domain.
50
+ - Removes default ports.
51
+ - Cleans up the path (removing duplicate slashes and trailing slash).
52
+ - Generates variants with and without 'www', and with both http and https.
53
+ """
54
+ url = url or self._url
55
+ url = url.strip()
56
+ # Ensure the URL has a scheme
57
+ if not re.match(r'https?://', url):
58
+ url = 'https://' + url
59
+
60
+ parsed = urlparse(url)
61
+ scheme = parsed.scheme.lower()
62
+ netloc = parsed.netloc.lower()
63
+ # Remove default port numbers if present
64
+ if ':' in netloc:
65
+ host, port = netloc.split(':', 1)
66
+ if (scheme == "http" and port == "80") or (scheme == "https" and port == "443"):
67
+ netloc = host
68
+
69
+ # Normalize the path: remove duplicate slashes and a trailing slash
70
+ path = re.sub(r'//+', '/', parsed.path).rstrip('/')
71
+
72
+ # Rebuild the cleaned URL without query or fragment
73
+ cleaned_url = urlunparse((scheme, netloc, path, '', '', ''))
74
+
75
+ variants = []
76
+ # Add the primary variant
77
+ variants.append(cleaned_url)
78
+ # Generate a variant with/without 'www'
79
+ if netloc.startswith('www.'):
80
+ no_www = netloc[4:]
81
+ variants.append(urlunparse((scheme, no_www, path, '', '', '')))
82
+ else:
83
+ variants.append(urlunparse((scheme, f"www.{netloc}", path, '', '', '')))
84
+
85
+ # Also generate variants with the alternate scheme
86
+ alt_scheme = 'http' if scheme == 'https' else 'https'
87
+ for variant in list(variants):
88
+ parsed_variant = urlparse(variant)
89
+ alt_variant = urlunparse((alt_scheme, parsed_variant.netloc, parsed_variant.path, '', '', ''))
90
+ variants.append(alt_variant)
91
+
92
+ # Remove duplicates while preserving order
93
+ seen = set()
94
+ unique_variants = []
95
+ for v in variants:
96
+ if v not in seen:
97
+ unique_variants.append(v)
98
+ seen.add(v)
99
+ return unique_variants
100
+
101
+ def get_correct_url(self, url=None, clean_urls=None) -> str:
102
+ """
103
+ Attempts each URL variant by making an HTTP GET request.
104
+ Returns the first variant that returns a 200 OK response.
105
+ """
106
+ if url is None and clean_urls is None:
107
+ url = self._url
108
+ clean_urls = self.clean_urls
109
+ if url is not None and clean_urls is None:
110
+ clean_urls = self.clean_url(url)
111
+ elif url is None and clean_urls is not None:
112
+ url = self._url
113
+
114
+ for candidate in clean_urls:
123
115
  try:
124
- source = self.session.get(url)
125
- return url
116
+ response = self.session.get(candidate, timeout=5)
117
+ if response.status_code == 200:
118
+ return candidate
126
119
  except requests.exceptions.RequestException as e:
127
- print(e)
120
+ print(f"Failed to reach {candidate}: {e}")
128
121
  return None
129
- def update_url(self,url):
130
- # These methods seem essential for setting up the urlManager object.
131
- self.url = url
132
- self.clean_urls = self.clean_url()
133
- self.correct_url = self.get_correct_url()
134
- self.url =self.correct_url
135
- self.protocol,self.domain,self.path,self.query=self.url_to_pieces(url=self.url)
122
+
123
+ def update_url(self, url):
124
+ """
125
+ Update the URL and refresh related attributes.
126
+ """
127
+ self._url = url
128
+ self.clean_urls = self.clean_url(url)
129
+ self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
130
+ self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
136
131
  self.all_urls = []
137
- def get_domain(self,url=None):
138
- url = url or self.url
132
+
133
+ def get_domain(self, url=None):
134
+ url = url or self.url
139
135
  return urlparse(url).netloc
140
- def url_join(self,url,path):
141
- url = eatOuter(url,['/'])
142
- path = eatInner(path,['/'])
143
- slash=''
144
- if path[0] not in ['?','&']:
145
- slash = '/'
146
- url = url+slash+path
147
- return url
136
+
137
+ def url_join(self, base_url, path):
138
+ """
139
+ Joins a base URL with a relative path.
140
+ """
141
+ base_url = base_url.strip().rstrip('/')
142
+ path = path.strip().lstrip('/')
143
+ return f"{base_url}/{path}"
144
+
148
145
  @property
149
146
  def url(self):
150
147
  return self._url
148
+
151
149
  @url.setter
152
150
  def url(self, new_url):
153
151
  self._url = new_url
154
- def is_valid_url(self,url=None):
152
+
153
+ def is_valid_url(self, url=None):
155
154
  """
156
155
  Check if the given URL is valid.
157
156
  """
158
- url = url or self.url
157
+ url = url or self.url
159
158
  parsed = urlparse(url)
160
- return bool(parsed.netloc) and bool(parsed.scheme)
159
+ return bool(parsed.scheme) and bool(parsed.netloc)
161
160
 
162
- def make_valid(self,href,url=None):
163
- def is_valid_url(url):
164
- url = url or self.url
165
- """
166
- Check if the given URL is valid.
167
- """
168
- parsed = urlparse(url)
169
- return bool(parsed.netloc) and bool(parsed.scheme)
170
- if is_valid_url(href):
161
+ def make_valid(self, href, url=None):
162
+ """
163
+ Validate a href. If it's not already valid, join it with the base URL.
164
+ """
165
+ if self.is_valid_url(href):
171
166
  return href
172
- new_link=urljoin(url,href)
173
- if is_valid_url(new_link):
167
+ base = url or self.url
168
+ new_link = urljoin(base, href)
169
+ if self.is_valid_url(new_link):
174
170
  return new_link
175
171
  return False
176
-
177
- def get_relative_href(self,url,href):
178
- # join the URL if it's relative (not an absolute link)
179
- url = url or self.url
180
- href = urljoin(url, href)
181
- parsed_href = urlparse(href)
182
- # remove URL GET parameters, URL fragments, etc.
183
- href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
184
- return href
185
- def url_basename(self,url=None):
186
- url = url or self.url
187
- path = urllib.parse.urlparse(url).path
172
+
173
+ def get_relative_href(self, base, href):
174
+ """
175
+ For a relative href, join it with the base URL and strip any query or fragment.
176
+ """
177
+ joined = urljoin(base, href)
178
+ parsed = urlparse(joined)
179
+ clean_href = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
180
+ return clean_href
181
+
182
+ def url_basename(self, url=None):
183
+ url = url or self.url
184
+ path = urlparse(url).path
188
185
  return path.strip('/').split('/')[-1]
189
186
 
187
+ def base_url(self, url=None):
188
+ url = url or self.url
189
+ match = re.match(r'https?://[^?#/]+/', url)
190
+ if match:
191
+ return match.group()
192
+ return None
193
+
194
+ def urljoin(self, base, path):
195
+ return urljoin(base, path)
190
196
 
191
- def base_url(self,url=None):
192
- url = url or self.url
193
- return re.match(r'https?://[^?#]+/', url).group()
194
-
195
-
196
- def urljoin(self,base, path):
197
- if isinstance(path, bytes):
198
- path = path.decode()
199
- if not isinstance(path, str) or not path:
200
- return None
201
- if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
202
- return path
203
- if isinstance(base, bytes):
204
- base = base.decode()
205
- if not isinstance(base, str) or not re.match(
206
- r'^(?:https?:)?//', base):
207
- return None
208
- return urllib.parse.urljoin(base, path)
209
197
  class urlManagerSingleton:
210
198
  _instance = None
199
+
211
200
  @staticmethod
212
- def get_instance(url=None,session=requests):
201
+ def get_instance(url=None, session=requests):
213
202
  if urlManagerSingleton._instance is None:
214
- urlManagerSingleton._instance = urlManager(url,session=session)
203
+ urlManagerSingleton._instance = urlManager(url, session=session)
215
204
  elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
216
- urlManagerSingleton._instance = urlManager(url,session=session)
205
+ urlManagerSingleton._instance = urlManager(url, session=session)
217
206
  return urlManagerSingleton._instance
218
207
 
219
- def get_url(url=None,url_mgr=None):
208
+ def get_url(url=None, url_mgr=None):
220
209
  if not url and not url_mgr:
221
210
  return None
222
211
  if url:
223
212
  url_mgr = urlManager(url)
224
213
  return url_mgr.url
225
- def get_url_mgr(url=None,url_mgr=None):
226
- if url_mgr == None and url:
214
+
215
+ def get_url_mgr(url=None, url_mgr=None):
216
+ if url_mgr is None and url:
227
217
  url_mgr = urlManager(url=url)
228
- if url_mgr and url == None:
218
+ if url_mgr and url is None:
229
219
  url = url_mgr.url
230
- return url_mgr
220
+ return url_mgr
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.38
3
+ Version: 0.1.6.40
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -17,7 +17,7 @@ abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rg
17
17
  abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
18
18
  abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
19
19
  abstract_webtools/managers/requestManager.py,sha256=zXD31WAYghV1OjnTQzRQnQGqZz6_J4mjHTdNLnBop_0,17343
20
- abstract_webtools/managers/seleniumManager.py,sha256=qSY8gH3N5YJIMwE_Alj9HNQRip_PziIo4_T9AZE_FQo,4273
20
+ abstract_webtools/managers/seleniumManager.py,sha256=B7X6nTfxs1eHFDo7LKB1N5LhDytZQzHPgJjna2c2j6E,5017
21
21
  abstract_webtools/managers/soupManager.py,sha256=-_mRCWlyzfKlF64UU53WXBmCvJ98jQ4GyHh8S8Pw3xs,17198
22
22
  abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
23
23
  abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
@@ -30,12 +30,13 @@ abstract_webtools/managers/linkManager/linkManager.py,sha256=roxOzOELca0rOlcMaJk
30
30
  abstract_webtools/managers/requestManager/__init__.py,sha256=z2qGtweEoO_OKr959LGxVXEMu1hu7PIkmh89BEh5TI8,30
31
31
  abstract_webtools/managers/requestManager/requestManager.py,sha256=MrPJAXRNDXjwE_BzJF3xwraT54IxVNmPU4eHhDgNmbE,17351
32
32
  abstract_webtools/managers/soupManager/__init__.py,sha256=mqfXfqM9sWlYpOkoXUqtBoVvk2KQx1862NnmRVJwGtY,27
33
+ abstract_webtools/managers/soupManager/asoueces.py,sha256=OaXqolZl0dI7b09NYwJ3Wnhuxf89ahZ1GjsOqy0GXfk,3506
33
34
  abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2sIkg-bHxBt2mKpYMyZd-nJjLQ,17201
34
35
  abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
35
- abstract_webtools/managers/urlManager/urlManager.py,sha256=-HH6TEBXvCU2CfBdy3mwqn0eSqEZyWlrbF8B0XOj1LU,8859
36
+ abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
36
37
  abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
37
- abstract_webtools-0.1.6.38.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
38
- abstract_webtools-0.1.6.38.dist-info/METADATA,sha256=wtJTD9Lcdho-o-q_2fW6Ds1QfyIbZNbeWgCQSvLRMZo,16051
39
- abstract_webtools-0.1.6.38.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
40
- abstract_webtools-0.1.6.38.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
41
- abstract_webtools-0.1.6.38.dist-info/RECORD,,
38
+ abstract_webtools-0.1.6.40.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
39
+ abstract_webtools-0.1.6.40.dist-info/METADATA,sha256=kKCrv_8-h4JuBeI1YGfB-sWkaQp-H5LS5IyPfw1ooUs,16051
40
+ abstract_webtools-0.1.6.40.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
41
+ abstract_webtools-0.1.6.40.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
42
+ abstract_webtools-0.1.6.40.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.2)
2
+ Generator: setuptools (76.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5