abstract-webtools 0.1.6.38__tar.gz → 0.1.6.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {abstract_webtools-0.1.6.38/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.40}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/setup.py +1 -1
  3. abstract_webtools-0.1.6.40/src/abstract_webtools/managers/seleniumManager.py +128 -0
  4. abstract_webtools-0.1.6.40/src/abstract_webtools/managers/soupManager/asoueces.py +135 -0
  5. abstract_webtools-0.1.6.40/src/abstract_webtools/managers/urlManager/urlManager.py +220 -0
  6. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
  7. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/SOURCES.txt +1 -0
  8. abstract_webtools-0.1.6.38/src/abstract_webtools/managers/seleniumManager.py +0 -119
  9. abstract_webtools-0.1.6.38/src/abstract_webtools/managers/urlManager/urlManager.py +0 -230
  10. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/LICENSE +0 -0
  11. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/README.md +0 -0
  12. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/pyproject.toml +0 -0
  13. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/setup.cfg +0 -0
  14. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/__init__.py +0 -0
  15. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/abstract_webtools.py +0 -0
  16. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/big_user_agent_list.py +0 -0
  17. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/main.py +0 -0
  18. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/__init__.py +0 -0
  19. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/cipherManager.py +0 -0
  20. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/crawlManager.py +0 -0
  21. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
  22. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/curlMgr.py +0 -0
  23. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/domainManager.py +0 -0
  24. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  25. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/get_test.py +0 -0
  26. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
  27. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
  28. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  29. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/networkManager.py +0 -0
  30. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
  31. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/requestManager/requestManager.py +0 -0
  32. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
  33. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
  34. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/sslManager.py +0 -0
  35. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  36. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
  37. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  38. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  39. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
  40. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/soup_gui.py +0 -0
  41. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/url_grabber.py +0 -0
  42. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/url_grabber_new.py +0 -0
  43. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  44. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/requires.txt +0 -0
  45. {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: abstract_webtools
- Version: 0.1.6.38
+ Version: 0.1.6.40
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
      long_description = fh.read()
  setuptools.setup(
      name='abstract_webtools',
-     version='0.1.6.38',
+     version='0.1.6.40',
      author='putkoff',
      author_email='partners@abstractendeavors.com',
      description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
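
The setup.py and PKG-INFO hunks carry only the version bump. To pick it up from PyPI (the project is published as abstract-webtools, per the header above):

    pip install --upgrade abstract-webtools==0.1.6.40
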
@@ -0,0 +1,128 @@
+ import os
+ from urllib.parse import urlparse
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager # For automatic ChromeDriver installation
+ import logging
+ import urllib3
+ from ..abstract_webtools import * # Assuming this is a valid import
+ from .urlManager import *
+
+ # Suppress urllib3 warnings and debug logs
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("selenium").setLevel(logging.WARNING)
+
+ # Default Chrome options (can be overridden)
+ DEFAULT_CHROME_OPTIONS = [
+     "--headless", # Run in headless mode
+     "--no-sandbox",
+     "--disable-dev-shm-usage", # Avoid memory issues on servers
+     "--disable-gpu",
+     "--disable-software-rasterizer",
+     "--disable-extensions",
+     "--remote-debugging-port=9222"
+ ]
+
+ class SingletonMeta(type):
+     _instances = {}
+     def __call__(cls, *args, **kwargs):
+         if cls not in cls._instances:
+             instance = super().__call__(*args, **kwargs)
+             cls._instances[cls] = instance
+         return cls._instances[cls]
+
+ class SeleniumManager(metaclass=SingletonMeta):
+     def __init__(self, url):
+         if not hasattr(self, 'initialized'): # Prevent reinitialization
+             self.initialized = True
+             parsed_url = urlparse(url)
+             self.domain = parsed_url.netloc
+             self.scheme = parsed_url.scheme or "https" # Default to https if scheme is missing
+             self.base_url = f"{self.scheme}://{self.domain}"
+             self.site_dir = os.path.join(os.getcwd(), self.domain)
+             os.makedirs(self.site_dir, exist_ok=True)
+             self.drivers = {}
+             self.page_type = []
+
+     def get_url_to_path(self, url):
+         url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/']) # Assuming eatAll is defined elsewhere
+         parsed_url = urlparse(url)
+         if parsed_url.netloc == self.domain:
+             paths = parsed_url.path.split('/')
+             dir_path = self.site_dir
+             for path in paths[:-1]:
+                 dir_path = os.path.join(dir_path, path)
+                 os.makedirs(dir_path, exist_ok=True)
+             self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if not self.page_type else self.page_type[-1])
+             dir_path = os.path.join(dir_path, paths[-1])
+             return dir_path
+
+     def saved_url_check(self, url):
+         return self.get_url_to_path(url)
+
+     def get_with_netloc(self, url):
+         parsed_url = urlparse(url)
+         if not parsed_url.netloc:
+             url = f"{self.scheme}://{self.domain}/{url.strip('/')}"
+         return url
+
+     def get_driver(self, url):
+         if url and url not in self.drivers:
+             # Set up Chrome options
+             chrome_options = Options()
+             for option in DEFAULT_CHROME_OPTIONS:
+                 chrome_options.add_argument(option)
+
+             # Specify Chrome binary location if needed (optional, comment out if not applicable)
+             # chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
+
+             # Automatically install and use ChromeDriver
+             service = Service(ChromeDriverManager().install())
+             driver = webdriver.Chrome(service=service, options=chrome_options)
+
+             self.drivers[url] = driver
+             driver.get(url)
+         return self.drivers[url]
+
+     def quit_driver(self, url):
+         """Clean up a specific driver instance."""
+         if url in self.drivers:
+             self.drivers[url].quit()
+             del self.drivers[url]
+
+     def quit_all_drivers(self):
+         """Clean up all driver instances."""
+         for driver in self.drivers.values():
+             driver.quit()
+         self.drivers.clear()
+
+ def normalize_url(url, base_url=None):
+     """Normalize and resolve relative URLs."""
+     manager = SeleniumManager(url)
+     base_url = manager.base_url if base_url is None else base_url
+     if url.startswith(base_url):
+         url = url[len(base_url):]
+     normalized_url = urljoin(base_url, url.split('#')[0])
+     if not normalized_url.startswith(base_url):
+         return None
+     return normalized_url
+
+ def get_selenium_source(url):
+     """Fetch page source using Selenium."""
+     url_mgr = urlManager(url) # Assuming urlManager is defined elsewhere
+     if url_mgr.url:
+         url = str(url_mgr.url)
+     manager = SeleniumManager(url)
+     driver = manager.get_driver(url)
+     try:
+         return driver.page_source
+     except Exception as e:
+         logging.error(f"Error fetching page source for {url}: {e}")
+         return None
+     # Note: Driver is not quit here to maintain Singleton behavior
+
+ # Ensure cleanup on program exit (optional)
+ import atexit
+ atexit.register(lambda: SeleniumManager(url="").quit_all_drivers()) # Cleanup all drivers on exit
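
A minimal usage sketch for the new module, assuming it is importable as abstract_webtools.managers.seleniumManager (matching the file path above), a local Chrome install, and network access for webdriver_manager to download a matching ChromeDriver; this is an illustration, not code shipped in the package:

    from abstract_webtools.managers.seleniumManager import SeleniumManager, get_selenium_source

    # Fetch rendered HTML; the manager keeps one headless driver per URL.
    html = get_selenium_source("https://example.com")
    if html:
        print(html[:200])

    # SingletonMeta returns the same instance regardless of arguments, so this
    # reaches the manager created above and closes all of its drivers.
    SeleniumManager("https://example.com").quit_all_drivers()
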
@@ -0,0 +1,135 @@
+ text = """{"title": "NoviSoul
+ novissbm@gmail.com", "href": "http://www.youtube.com/signin?authuser=0&next=%2Fwatch%3Fv%3DEaIYRM1yrM4&action_handle_signin=true", "description": ""},
+ {"title": "Sign in", "href": "https://accounts.google.com/ServiceLogin?continue=http%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26hl%3Den_GB%26next%3D%252Fwatch%253Fv%253DEaIYRM1yrM4%26nomobiletemp%3D1&uilel=3&service=youtube&passive=true&hl=en_GB", "description": ""},
+ {"title": "Sign up", "href": "http://www.youtube.com/signup?next=%2Fwatch%3Fv%3DEaIYRM1yrM4", "description": ""},
+ {"title": "9:58
+
+
+
+
+
+ Physics of Free Energy Deviceby Eugene Jeong
+
+ 336,881 views", "href": "http://www.youtube.com/watch?v=EB-jWfzkz_E", "description": ""},
+ {"title": "4:49
+
+
+
+
+
+ [www.witts.ws] Self-Running 40kW (40,000 Watt) Fuelless Generator (1 of 3)by wits2014
+
+ 488,638 views", "href": "http://www.youtube.com/watch?v=LFu-s6ZmGyE", "description": ""},
+ {"title": "2:33
+
+
+
+
+
+ Free Energy - Evidence of military antigravity technologyby DoubleMarkez
+
+ 390,020 views", "href": "http://www.youtube.com/watch?v=qljY-YfFaPc", "description": ""},
+ {"title": "15:01
+
+
+
+
+
+ APEX 2013 SSBM L10 Shroomed VS CT EMP Mew2Kingby Jason AxelrodRecommended for you", "href": "http://www.youtube.com/watch?v=pc7v49k5FhY", "description": ""},
+ {"title": "161
+
+
+ videos
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Play all
+
+
+
+ washby dle3276", "href": "http://www.youtube.com/watch?v=AmcSt5hU4qA&list=PL4517CA6C6244A844", "description": ""},
+ {"title": "10:31
+
+
+
+
+
+ Pyramid Magnet - free energy - english subtitleby MrTermsof
+
+ 616,081 views", "href": "http://www.youtube.com/watch?v=pMbHswNoGWM", "description": ""},
+ {"title": "4:11
+
+
+
+
+
+ My all new newman motor 1.(TheDaftman)by theDaftman
+
+ 1,147,470 views", "href": "http://www.youtube.com/watch?v=dL4B_DNBtvc", "description": ""},
+ {"title": "2:18
+
+
+
+
+
+ Is there free energy in magnets?by aetherix01
+
+ 371,642 views", "href": "http://www.youtube.com/watch?v=vrn5B9a8aOk", "description": ""},
+ {"title": "3:00
+
+
+
+
+
+ The Most Dangerous Video On The Internet - Trevor Paglenby killuminati63
+
+ 585,755 views", "href": "http://www.youtube.com/watch?v=9xEuhEHDJM8", "description": ""},
+ {"title": "2:18
+
+
+
+
+
+ Free Energy - Magnet Motorby ATBootstrap
+
+ 358,641 views", "href": "http://www.youtube.com/watch?v=hfkwCE3BeBs", "description": ""},
+ {"title": "2:38
+
+
+
+
+
+ 100% free energy generator is easy to buildby LifeHack2012
+
+ 238,092 views", "href": "http://www.youtube.com/watch?v=GEUyhhMEs7U", "description": ""},
+ {"title": "3:41
+
+
+
+
+
+ 5KW free energy бестопливный генератор Kapanadze Капанадзеby Alexander Frolov
+
+ 488,213 views", "href": "http://www.youtube.com/watch?v=uxQ99R4gOWY", "description": ""},""".split('\n')
+ sources = ' '.join([te for te in text if te])
+ while True:
+     if '  ' in sources:
+         sources = sources.replace('  ',' ').replace('\t',' ')
+     else:
+         break
+ sources = sources.replace('}, {','},{').replace('},{','},\n{')
+ input(sources)
+
+
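
The scratch data above is collapsed with a replace loop (repeatedly folding double spaces until none remain). For reference, a single regex substitution has roughly the same effect; this is a standalone sketch using only the standard library, not code from the package:

    import re

    def collapse_ws(s: str) -> str:
        # Tabs and runs of spaces become a single space, like the while-loop above.
        return re.sub(r'[ \t]+', ' ', s)
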
@@ -0,0 +1,220 @@
+ import re
+ import urllib.parse
+ import requests
+ from urllib.parse import urlparse, urlunparse, urljoin
+
+ from ...abstract_webtools import *
+
+ class urlManager:
+     """
+     Revised urlManager for managing and cleaning URLs.
+
+     It splits URLs into their components, normalizes them (trimming spaces, lowercasing
+     scheme and domain, removing default ports, and cleaning up paths), and then creates
+     a list of potential variants (with/without www, http/https) so that a valid version
+     can be determined.
+     """
+     def __init__(self, url=None, session=None):
+         url = url or 'www.example.com'
+         self._url = url
+         self.session = session or requests
+         self.clean_urls = self.clean_url(url)
+         self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
+         self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+         self.all_urls = []
+
+     def url_to_pieces(self, url):
+         """
+         Split a URL into protocol, domain, path, and query components.
+         Uses urlparse for robustness.
+         """
+         try:
+             parsed = urlparse(url)
+             protocol = parsed.scheme if parsed.scheme else None
+             domain = parsed.netloc if parsed.netloc else None
+             path = parsed.path or ""
+             query = parsed.query or ""
+         except Exception as e:
+             print(f'The URL {url} was not reachable: {e}')
+             protocol, domain, path, query = None, None, "", ""
+         return protocol, domain, path, query
+
+     def clean_url(self, url=None) -> list:
+         """
+         Normalize and clean the URL, then return a list of potential URL variants.
+
+         This method:
+           - Strips whitespace.
+           - Adds a scheme (defaults to https) if missing.
+           - Lowercases the scheme and domain.
+           - Removes default ports.
+           - Cleans up the path (removing duplicate slashes and trailing slash).
+           - Generates variants with and without 'www', and with both http and https.
+         """
+         url = url or self._url
+         url = url.strip()
+         # Ensure the URL has a scheme
+         if not re.match(r'https?://', url):
+             url = 'https://' + url
+
+         parsed = urlparse(url)
+         scheme = parsed.scheme.lower()
+         netloc = parsed.netloc.lower()
+         # Remove default port numbers if present
+         if ':' in netloc:
+             host, port = netloc.split(':', 1)
+             if (scheme == "http" and port == "80") or (scheme == "https" and port == "443"):
+                 netloc = host
+
+         # Normalize the path: remove duplicate slashes and a trailing slash
+         path = re.sub(r'//+', '/', parsed.path).rstrip('/')
+
+         # Rebuild the cleaned URL without query or fragment
+         cleaned_url = urlunparse((scheme, netloc, path, '', '', ''))
+
+         variants = []
+         # Add the primary variant
+         variants.append(cleaned_url)
+         # Generate a variant with/without 'www'
+         if netloc.startswith('www.'):
+             no_www = netloc[4:]
+             variants.append(urlunparse((scheme, no_www, path, '', '', '')))
+         else:
+             variants.append(urlunparse((scheme, f"www.{netloc}", path, '', '', '')))
+
+         # Also generate variants with the alternate scheme
+         alt_scheme = 'http' if scheme == 'https' else 'https'
+         for variant in list(variants):
+             parsed_variant = urlparse(variant)
+             alt_variant = urlunparse((alt_scheme, parsed_variant.netloc, parsed_variant.path, '', '', ''))
+             variants.append(alt_variant)
+
+         # Remove duplicates while preserving order
+         seen = set()
+         unique_variants = []
+         for v in variants:
+             if v not in seen:
+                 unique_variants.append(v)
+                 seen.add(v)
+         return unique_variants
+
+     def get_correct_url(self, url=None, clean_urls=None) -> str:
+         """
+         Attempts each URL variant by making an HTTP GET request.
+         Returns the first variant that returns a 200 OK response.
+         """
+         if url is None and clean_urls is None:
+             url = self._url
+             clean_urls = self.clean_urls
+         if url is not None and clean_urls is None:
+             clean_urls = self.clean_url(url)
+         elif url is None and clean_urls is not None:
+             url = self._url
+
+         for candidate in clean_urls:
+             try:
+                 response = self.session.get(candidate, timeout=5)
+                 if response.status_code == 200:
+                     return candidate
+             except requests.exceptions.RequestException as e:
+                 print(f"Failed to reach {candidate}: {e}")
+         return None
+
+     def update_url(self, url):
+         """
+         Update the URL and refresh related attributes.
+         """
+         self._url = url
+         self.clean_urls = self.clean_url(url)
+         self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
+         self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+         self.all_urls = []
+
+     def get_domain(self, url=None):
+         url = url or self.url
+         return urlparse(url).netloc
+
+     def url_join(self, base_url, path):
+         """
+         Joins a base URL with a relative path.
+         """
+         base_url = base_url.strip().rstrip('/')
+         path = path.strip().lstrip('/')
+         return f"{base_url}/{path}"
+
+     @property
+     def url(self):
+         return self._url
+
+     @url.setter
+     def url(self, new_url):
+         self._url = new_url
+
+     def is_valid_url(self, url=None):
+         """
+         Check if the given URL is valid.
+         """
+         url = url or self.url
+         parsed = urlparse(url)
+         return bool(parsed.scheme) and bool(parsed.netloc)
+
+     def make_valid(self, href, url=None):
+         """
+         Validate a href. If it's not already valid, join it with the base URL.
+         """
+         if self.is_valid_url(href):
+             return href
+         base = url or self.url
+         new_link = urljoin(base, href)
+         if self.is_valid_url(new_link):
+             return new_link
+         return False
+
+     def get_relative_href(self, base, href):
+         """
+         For a relative href, join it with the base URL and strip any query or fragment.
+         """
+         joined = urljoin(base, href)
+         parsed = urlparse(joined)
+         clean_href = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
+         return clean_href
+
+     def url_basename(self, url=None):
+         url = url or self.url
+         path = urlparse(url).path
+         return path.strip('/').split('/')[-1]
+
+     def base_url(self, url=None):
+         url = url or self.url
+         match = re.match(r'https?://[^?#/]+/', url)
+         if match:
+             return match.group()
+         return None
+
+     def urljoin(self, base, path):
+         return urljoin(base, path)
+
+ class urlManagerSingleton:
+     _instance = None
+
+     @staticmethod
+     def get_instance(url=None, session=requests):
+         if urlManagerSingleton._instance is None:
+             urlManagerSingleton._instance = urlManager(url, session=session)
+         elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
+             urlManagerSingleton._instance = urlManager(url, session=session)
+         return urlManagerSingleton._instance
+
+ def get_url(url=None, url_mgr=None):
+     if not url and not url_mgr:
+         return None
+     if url:
+         url_mgr = urlManager(url)
+     return url_mgr.url
+
+ def get_url_mgr(url=None, url_mgr=None):
+     if url_mgr is None and url:
+         url_mgr = urlManager(url=url)
+     if url_mgr and url is None:
+         url = url_mgr.url
+     return url_mgr
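
The variant generation in the revised clean_url can be exercised without the HTTP probing that __init__ triggers by bypassing the constructor; a hedged sketch (constructing via __new__ is an illustration trick here, not package API):

    from abstract_webtools.managers.urlManager.urlManager import urlManager

    mgr = urlManager.__new__(urlManager)  # skip __init__ to avoid network requests
    mgr._url = 'www.Example.com:443/docs//intro/'
    print(mgr.clean_url())
    # Expected, by reading clean_url above:
    # ['https://www.example.com/docs/intro',
    #  'https://example.com/docs/intro',
    #  'http://www.example.com/docs/intro',
    #  'http://example.com/docs/intro']
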
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: abstract_webtools
- Version: 0.1.6.38
+ Version: 0.1.6.40
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -36,6 +36,7 @@ src/abstract_webtools/managers/linkManager/linkManager.py
  src/abstract_webtools/managers/requestManager/__init__.py
  src/abstract_webtools/managers/requestManager/requestManager.py
  src/abstract_webtools/managers/soupManager/__init__.py
+ src/abstract_webtools/managers/soupManager/asoueces.py
  src/abstract_webtools/managers/soupManager/soupManager.py
  src/abstract_webtools/managers/urlManager/__init__.py
  src/abstract_webtools/managers/urlManager/urlManager.py
@@ -1,119 +0,0 @@
- import os
- from ..abstract_webtools import *
- from .urlManager import *
- from urllib.parse import urlparse
- from abstract_utilities import *
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- import logging
- import urllib3
-
- # Suppress urllib3 warnings and debug logs
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
- logging.getLogger("urllib3").setLevel(logging.WARNING)
-
- # Suppress Selenium logs
- logging.getLogger("selenium").setLevel(logging.WARNING)
-
- import os
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
-
- # Setup Chrome options
- chrome_options = Options()
- chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
- chrome_options.add_argument("--headless") # Run in headless mode
- chrome_options.add_argument("--no-sandbox")
- chrome_options.add_argument("--disable-dev-shm-usage")
- chrome_options.add_argument("--disable-gpu")
- chrome_options.add_argument("--disable-software-rasterizer")
- chrome_options.add_argument("--disable-extensions")
- chrome_options.add_argument("--remote-debugging-port=9222")
-
-
- class SingletonMeta(type):
-     _instances = {}
-     def __call__(cls, *args, **kwargs):
-         if cls not in cls._instances:
-             instance = super().__call__(*args, **kwargs)
-             cls._instances[cls] = instance
-         return cls._instances[cls]
-
- class seleniumManager(metaclass=SingletonMeta):
-     def __init__(self, url):
-         if not hasattr(self, 'initialized'): # Prevent reinitialization
-             self.initialized = True
-             parsed_url = urlparse(url)
-             self.domain = parsed_url.netloc
-             self.scheme = parsed_url.scheme
-             self.base_url= f"{self.scheme}{self.domain}"
-             self.site_dir = os.path.join(os.getcwd(), self.domain)
-             os.makedirs(self.site_dir, exist_ok=True)
-             self.drivers = {}
-             self.page_type = []
-
-     def get_url_to_path(self, url):
-         url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
-         parsed_url = urlparse(url)
-         if parsed_url.netloc == self.domain:
-             paths = parsed_url.path.split('/')
-             dir_path = self.site_dir
-             for path in paths[:-1]:
-                 dir_path = os.path.join(dir_path, path)
-                 os.makedirs(dir_path, exist_ok=True)
-             self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
-
-             dir_path = os.path.join(dir_path, paths[-1])
-             return dir_path
-
-     def saved_url_check(self, url):
-         path = self.get_url_to_path(url)
-         return path
-
-     def get_with_netloc(self, url):
-         parsed_url = urlparse(url)
-         if parsed_url.netloc == '':
-             url = f"{self.scheme}://{self.domain}/{url.strip()}"
-         return url
-
-     def get_driver(self, url):
-         if url and url not in self.drivers:
-             chrome_options = Options()
-             chrome_options.add_argument("--headless")
-             driver = webdriver.Chrome(options=chrome_options)
-             self.drivers[url] = driver
-             driver.get(url)
-         return self.drivers[url]
- def normalize_url(url, base_url=None):
-     """
-     Normalize and resolve relative URLs, ensuring proper domain and format.
-     """
-     # If URL starts with the base URL repeated, remove the extra part
-     manager = seleniumManager(url)
-     base_url = manager.base_url
-     if url.startswith(base_url):
-         url = url[len(base_url):]
-
-     # Resolve the URL against the base URL
-     normalized_url = urljoin(base_url, url.split('#')[0])
-
-     # Ensure only URLs belonging to the base domain are kept
-     if not normalized_url.startswith(base_url):
-         return None
-
-     return normalized_url
- # Function to get Selenium page source
- def get_selenium_source(url):
-     url_mgr = urlManager(url)
-     if url_mgr.url:
-         url = str(url_mgr.url)
-     manager = seleniumManager(url)
-     driver = manager.get_driver(url)
-     try:
-         # Get page source
-         page_source = driver.page_source
-         return page_source
-     finally:
-         # Don't quit the driver unless you're done with all interactions
-         pass
-
@@ -1,230 +0,0 @@
- from ...abstract_webtools import *
- class urlManager:
-     """
-     urlManager is a class for managing URLs, including cleaning, validating, and finding the correct version.
-
-     Args:
-         url (str or None): The URL to manage (default is None).
-         session (requests.Session): A custom requests session (default is the requests module's session).
-
-     Attributes:
-         session (requests.Session): The requests session used for making HTTP requests.
-         clean_urls (list): List of cleaned URL variations.
-         url (str): The current URL.
-         protocol (str): The protocol part of the URL (e.g., "https").
-         domain (str): The domain part of the URL (e.g., "example.com").
-         path (str): The path part of the URL (e.g., "/path/to/resource").
-         query (str): The query part of the URL (e.g., "?param=value").
-         all_urls (list): List of all URLs (not used in the provided code).
-
-     Methods:
-         url_to_pieces(url): Split a URL into its protocol, domain, path, and query components.
-         clean_url(url): Return a list of potential URL versions with and without 'www' and 'http(s)'.
-         get_correct_url(url): Get the correct version of the URL from possible variations.
-         update_url(url): Update the URL and related attributes.
-         get_domain(url): Get the domain name from a URL.
-         url_join(url, path): Join a base URL with a path.
-         is_valid_url(url): Check if a URL is valid.
-         make_valid(href, url): Make a URL valid by joining it with a base URL.
-         get_relative_href(url, href): Get the relative href URL by joining it with a base URL.
-
-     Note:
-         - The urlManager class provides methods for managing URLs, including cleaning and validating them.
-         - It also includes methods for joining and validating relative URLs.
-     """
-
-     def __init__(self, url=None, session=None):
-         """
-         Initialize a urlManager instance.
-
-         Args:
-             url (str or None): The URL to manage (default is None).
-             session (requests.Session): A custom requests session (default is the requests module's session).
-         """
-         url = url or 'www.example.com'
-         self._url=url
-         self.url = url
-         self.session= session or requests
-         self.clean_urls = self.clean_url(url=url)
-         self.url = self.get_correct_url(clean_urls=self.clean_urls)
-         url_pieces = self.url_to_pieces(url=self.url)
-         self.protocol,self.domain,self.path,self.query=url_pieces
-         self.all_urls = []
-     def url_to_pieces(self, url):
-
-         try:
-             match = re.match(r'^(https?)?://?([^/]+)(/[^?]+)?(\?.+)?', url)
-             if match:
-                 protocol = match.group(1) if match.group(1) else None
-                 domain = match.group(2) if match.group(1) else None
-                 path = match.group(3) if match.group(3) else "" # Handle None
-                 query = match.group(4) if match.group(4) else "" # Handle None
-         except:
-             print(f'the url {url} was not reachable')
-             protocol,domain,path,query=None,None,"",""
-         return protocol, domain, path, query
-
-     def clean_url(self,url=None) -> list:
-         """
-         Given a URL, return a list with potential URL versions including with and without 'www.',
-         and with 'http://' and 'https://'.
-         """
-         url = url or self.url
-         urls=[]
-         if url:
-             # Remove http:// or https:// prefix
-             cleaned = url.replace("http://", "").replace("https://", "")
-             no_subdomain = cleaned.replace("www.", "", 1)
-
-             urls = [
-                 f"https://{cleaned}",
-                 f"http://{cleaned}",
-             ]
-
-             # Add variants without 'www' if it was present
-             if cleaned != no_subdomain:
-                 urls.extend([
-                     f"https://{no_subdomain}",
-                     f"http://{no_subdomain}",
-                 ])
-
-             # Add variants with 'www' if it wasn't present
-             else:
-                 urls.extend([
-                     f"https://www.{cleaned}",
-                     f"http://www.{cleaned}",
-                 ])
-
-         return urls
-
-     def get_correct_url(self,url=None,clean_urls=None) -> (str or None):
-         """
-         Gets the correct URL from the possible variations by trying each one with an HTTP request.
-
-         Args:
-             url (str): The URL to find the correct version of.
-             session (type(requests.Session), optional): The requests session to use for making HTTP requests.
-                 Defaults to requests.
-
-         Returns:
-             str: The correct version of the URL if found, or None if none of the variations are valid.
-         """
-         self.url = url
-         if url==None and clean_urls != None:
-             if self.url:
-                 url=self.url or clean_urls[0]
-         if url!=None and clean_urls==None:
-             clean_urls=self.clean_url(url)
-         elif url==None and clean_urls==None:
-             url=self.url
-             clean_urls=self.clean_urls
-         # Get the correct URL from the possible variations
-         for url in clean_urls:
-             try:
-                 source = self.session.get(url)
-                 return url
-             except requests.exceptions.RequestException as e:
-                 print(e)
-         return None
-     def update_url(self,url):
-         # These methods seem essential for setting up the urlManager object.
-         self.url = url
-         self.clean_urls = self.clean_url()
-         self.correct_url = self.get_correct_url()
-         self.url =self.correct_url
-         self.protocol,self.domain,self.path,self.query=self.url_to_pieces(url=self.url)
-         self.all_urls = []
-     def get_domain(self,url=None):
-         url = url or self.url
-         return urlparse(url).netloc
-     def url_join(self,url,path):
-         url = eatOuter(url,['/'])
-         path = eatInner(path,['/'])
-         slash=''
-         if path[0] not in ['?','&']:
-             slash = '/'
-         url = url+slash+path
-         return url
-     @property
-     def url(self):
-         return self._url
-     @url.setter
-     def url(self, new_url):
-         self._url = new_url
-     def is_valid_url(self,url=None):
-         """
-         Check if the given URL is valid.
-         """
-         url = url or self.url
-         parsed = urlparse(url)
-         return bool(parsed.netloc) and bool(parsed.scheme)
-
-     def make_valid(self,href,url=None):
-         def is_valid_url(url):
-             url = url or self.url
-             """
-             Check if the given URL is valid.
-             """
-             parsed = urlparse(url)
-             return bool(parsed.netloc) and bool(parsed.scheme)
-         if is_valid_url(href):
-             return href
-         new_link=urljoin(url,href)
-         if is_valid_url(new_link):
-             return new_link
-         return False
-
-     def get_relative_href(self,url,href):
-         # join the URL if it's relative (not an absolute link)
-         url = url or self.url
-         href = urljoin(url, href)
-         parsed_href = urlparse(href)
-         # remove URL GET parameters, URL fragments, etc.
-         href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
-         return href
-     def url_basename(self,url=None):
-         url = url or self.url
-         path = urllib.parse.urlparse(url).path
-         return path.strip('/').split('/')[-1]
-
-
-     def base_url(self,url=None):
-         url = url or self.url
-         return re.match(r'https?://[^?#]+/', url).group()
-
-
-     def urljoin(self,base, path):
-         if isinstance(path, bytes):
-             path = path.decode()
-         if not isinstance(path, str) or not path:
-             return None
-         if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
-             return path
-         if isinstance(base, bytes):
-             base = base.decode()
-         if not isinstance(base, str) or not re.match(
-                 r'^(?:https?:)?//', base):
-             return None
-         return urllib.parse.urljoin(base, path)
- class urlManagerSingleton:
-     _instance = None
-     @staticmethod
-     def get_instance(url=None,session=requests):
-         if urlManagerSingleton._instance is None:
-             urlManagerSingleton._instance = urlManager(url,session=session)
-         elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
-             urlManagerSingleton._instance = urlManager(url,session=session)
-         return urlManagerSingleton._instance
-
- def get_url(url=None,url_mgr=None):
-     if not url and not url_mgr:
-         return None
-     if url:
-         url_mgr = urlManager(url)
-     return url_mgr.url
- def get_url_mgr(url=None,url_mgr=None):
-     if url_mgr == None and url:
-         url_mgr = urlManager(url=url)
-     if url_mgr and url == None:
-         url = url_mgr.url
-     return url_mgr
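
One concrete reason for the rewrite: the regex in the removed url_to_pieces fails outright on scheme-less input, which the urlparse-based replacement handles after prepending a scheme. A quick standalone illustration, not package code:

    import re
    from urllib.parse import urlparse

    url = "example.com/path"  # no scheme
    old = re.match(r'^(https?)?://?([^/]+)(/[^?]+)?(\?.+)?', url)
    print(old)  # None: the old pattern requires ':/' after the optional scheme
    print(urlparse("https://" + url).netloc)  # 'example.com'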