abstract-webtools 0.1.6.38__py3-none-any.whl → 0.1.6.40__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
- abstract_webtools/managers/seleniumManager.py +62 -53
- abstract_webtools/managers/soupManager/asoueces.py +135 -0
- abstract_webtools/managers/urlManager/urlManager.py +173 -183
- {abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/RECORD +8 -7
- {abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/WHEEL +1 -1
- {abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/seleniumManager.py
@@ -1,35 +1,29 @@
 import os
-from ..abstract_webtools import *
-from .urlManager import *
 from urllib.parse import urlparse
-from abstract_utilities import *
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager  # For automatic ChromeDriver installation
 import logging
 import urllib3
+from ..abstract_webtools import *  # Assuming this is a valid import
+from .urlManager import *
 
 # Suppress urllib3 warnings and debug logs
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 logging.getLogger("urllib3").setLevel(logging.WARNING)
-
-# Suppress Selenium logs
 logging.getLogger("selenium").setLevel(logging.WARNING)
 
-
-
-
-
-#
-
-
-
-
-
-chrome_options.add_argument("--disable-gpu")
-chrome_options.add_argument("--disable-software-rasterizer")
-chrome_options.add_argument("--disable-extensions")
-chrome_options.add_argument("--remote-debugging-port=9222")
-
+# Default Chrome options (can be overridden)
+DEFAULT_CHROME_OPTIONS = [
+    "--headless",  # Run in headless mode
+    "--no-sandbox",
+    "--disable-dev-shm-usage",  # Avoid memory issues on servers
+    "--disable-gpu",
+    "--disable-software-rasterizer",
+    "--disable-extensions",
+    "--remote-debugging-port=9222"
+]
 
 class SingletonMeta(type):
     _instances = {}
@@ -39,21 +33,21 @@ class SingletonMeta(type):
             cls._instances[cls] = instance
         return cls._instances[cls]
 
-class seleniumManager(metaclass=SingletonMeta):
+class SeleniumManager(metaclass=SingletonMeta):
     def __init__(self, url):
         if not hasattr(self, 'initialized'):  # Prevent reinitialization
             self.initialized = True
             parsed_url = urlparse(url)
             self.domain = parsed_url.netloc
-            self.scheme = parsed_url.scheme
-            self.base_url= f"{self.scheme}{self.domain}"
+            self.scheme = parsed_url.scheme or "https"  # Default to https if scheme is missing
+            self.base_url = f"{self.scheme}://{self.domain}"
             self.site_dir = os.path.join(os.getcwd(), self.domain)
             os.makedirs(self.site_dir, exist_ok=True)
             self.drivers = {}
             self.page_type = []
-
+
     def get_url_to_path(self, url):
-        url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
+        url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/'])  # Assuming eatAll is defined elsewhere
         parsed_url = urlparse(url)
         if parsed_url.netloc == self.domain:
             paths = parsed_url.path.split('/')
@@ -61,59 +55,74 @@ class seleniumManager(metaclass=SingletonMeta):
             for path in paths[:-1]:
                 dir_path = os.path.join(dir_path, path)
                 os.makedirs(dir_path, exist_ok=True)
-            self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if
-
+            self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if not self.page_type else self.page_type[-1])
             dir_path = os.path.join(dir_path, paths[-1])
             return dir_path
 
     def saved_url_check(self, url):
-
-        return path
+        return self.get_url_to_path(url)
 
     def get_with_netloc(self, url):
         parsed_url = urlparse(url)
-        if parsed_url.netloc
-            url = f"{self.scheme}://{self.domain}/{url.strip()}"
+        if not parsed_url.netloc:
+            url = f"{self.scheme}://{self.domain}/{url.strip('/')}"
         return url
 
     def get_driver(self, url):
         if url and url not in self.drivers:
+            # Set up Chrome options
             chrome_options = Options()
-
-
+            for option in DEFAULT_CHROME_OPTIONS:
+                chrome_options.add_argument(option)
+
+            # Specify Chrome binary location if needed (optional, comment out if not applicable)
+            # chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
+
+            # Automatically install and use ChromeDriver
+            service = Service(ChromeDriverManager().install())
+            driver = webdriver.Chrome(service=service, options=chrome_options)
+
             self.drivers[url] = driver
             driver.get(url)
         return self.drivers[url]
+
+    def quit_driver(self, url):
+        """Clean up a specific driver instance."""
+        if url in self.drivers:
+            self.drivers[url].quit()
+            del self.drivers[url]
+
+    def quit_all_drivers(self):
+        """Clean up all driver instances."""
+        for driver in self.drivers.values():
+            driver.quit()
+        self.drivers.clear()
+
 def normalize_url(url, base_url=None):
-    """
-
-
-    # If URL starts with the base URL repeated, remove the extra part
-    manager = seleniumManager(url)
-    base_url = manager.base_url
+    """Normalize and resolve relative URLs."""
+    manager = SeleniumManager(url)
+    base_url = manager.base_url if base_url is None else base_url
     if url.startswith(base_url):
         url = url[len(base_url):]
-
-    # Resolve the URL against the base URL
     normalized_url = urljoin(base_url, url.split('#')[0])
-
-    # Ensure only URLs belonging to the base domain are kept
     if not normalized_url.startswith(base_url):
         return None
-
     return normalized_url
-
+
 def get_selenium_source(url):
-
+    """Fetch page source using Selenium."""
+    url_mgr = urlManager(url)  # Assuming urlManager is defined elsewhere
     if url_mgr.url:
         url = str(url_mgr.url)
-        manager = seleniumManager(url)
+        manager = SeleniumManager(url)
         driver = manager.get_driver(url)
         try:
-
-
-
-
-
-            pass
+            return driver.page_source
+        except Exception as e:
+            logging.error(f"Error fetching page source for {url}: {e}")
+            return None
+        # Note: Driver is not quit here to maintain Singleton behavior
 
+# Ensure cleanup on program exit (optional)
+import atexit
+atexit.register(lambda: SeleniumManager(url="").quit_all_drivers())  # Cleanup all drivers on exit
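Taken together, the seleniumManager.py changes move the Chrome flags from loose module-level statements into DEFAULT_CHROME_OPTIONS, rename the class to SeleniumManager, default the scheme to https, and add quit_driver/quit_all_drivers plus an atexit hook. A minimal usage sketch of the new API as it appears in this diff (the import path is assumed from the package layout above; the URLs are illustrative):

    from abstract_webtools.managers.seleniumManager import SeleniumManager, get_selenium_source

    # get_selenium_source() resolves the URL via urlManager, reuses the
    # singleton manager, and returns driver.page_source (or None on error).
    html = get_selenium_source("https://example.com/docs/index.html")

    # Drivers are cached per URL in manager.drivers; the new cleanup methods
    # release them explicitly instead of waiting for the atexit hook.
    manager = SeleniumManager("https://example.com")
    manager.quit_driver("https://example.com/docs/index.html")
    manager.quit_all_drivers()

Because SingletonMeta caches one instance per class, the constructor arguments only take effect on first instantiation; later calls, including the atexit lambda, return the existing instance.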
abstract_webtools/managers/soupManager/asoueces.py
@@ -0,0 +1,135 @@
+text = """{"title": "NoviSoul
+novissbm@gmail.com", "href": "http://www.youtube.com/signin?authuser=0&next=%2Fwatch%3Fv%3DEaIYRM1yrM4&action_handle_signin=true", "description": ""},
+{"title": "Sign in", "href": "https://accounts.google.com/ServiceLogin?continue=http%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26hl%3Den_GB%26next%3D%252Fwatch%253Fv%253DEaIYRM1yrM4%26nomobiletemp%3D1&uilel=3&service=youtube&passive=true&hl=en_GB", "description": ""},
+{"title": "Sign up", "href": "http://www.youtube.com/signup?next=%2Fwatch%3Fv%3DEaIYRM1yrM4", "description": ""},
+{"title": "9:58
+
+
+
+
+
+Physics of Free Energy Deviceby Eugene Jeong
+
+336,881 views", "href": "http://www.youtube.com/watch?v=EB-jWfzkz_E", "description": ""},
+{"title": "4:49
+
+
+
+
+
+[www.witts.ws] Self-Running 40kW (40,000 Watt) Fuelless Generator (1 of 3)by wits2014
+
+488,638 views", "href": "http://www.youtube.com/watch?v=LFu-s6ZmGyE", "description": ""},
+{"title": "2:33
+
+
+
+
+
+Free Energy - Evidence of military antigravity technologyby DoubleMarkez
+
+390,020 views", "href": "http://www.youtube.com/watch?v=qljY-YfFaPc", "description": ""},
+{"title": "15:01
+
+
+
+
+
+APEX 2013 SSBM L10 Shroomed VS CT EMP Mew2Kingby Jason AxelrodRecommended for you", "href": "http://www.youtube.com/watch?v=pc7v49k5FhY", "description": ""},
+{"title": "161
+
+
+videos
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Play all
+
+
+
+washby dle3276", "href": "http://www.youtube.com/watch?v=AmcSt5hU4qA&list=PL4517CA6C6244A844", "description": ""},
+{"title": "10:31
+
+
+
+
+
+Pyramid Magnet - free energy - english subtitleby MrTermsof
+
+616,081 views", "href": "http://www.youtube.com/watch?v=pMbHswNoGWM", "description": ""},
+{"title": "4:11
+
+
+
+
+
+My all new newman motor 1.(TheDaftman)by theDaftman
+
+1,147,470 views", "href": "http://www.youtube.com/watch?v=dL4B_DNBtvc", "description": ""},
+{"title": "2:18
+
+
+
+
+
+Is there free energy in magnets?by aetherix01
+
+371,642 views", "href": "http://www.youtube.com/watch?v=vrn5B9a8aOk", "description": ""},
+{"title": "3:00
+
+
+
+
+
+The Most Dangerous Video On The Internet - Trevor Paglenby killuminati63
+
+585,755 views", "href": "http://www.youtube.com/watch?v=9xEuhEHDJM8", "description": ""},
+{"title": "2:18
+
+
+
+
+
+Free Energy - Magnet Motorby ATBootstrap
+
+358,641 views", "href": "http://www.youtube.com/watch?v=hfkwCE3BeBs", "description": ""},
+{"title": "2:38
+
+
+
+
+
+100% free energy generator is easy to buildby LifeHack2012
+
+238,092 views", "href": "http://www.youtube.com/watch?v=GEUyhhMEs7U", "description": ""},
+{"title": "3:41
+
+
+
+
+
+5KW free energy бестопливный генератор Kapanadze Капанадзеby Alexander Frolov
+
+488,213 views", "href": "http://www.youtube.com/watch?v=uxQ99R4gOWY", "description": ""},""".split('\n')
+sources = ' '.join([te for te in text if te])
+while True:
+    if '  ' in sources:
+        sources = sources.replace('  ', ' ').replace('\t', ' ')
+    else:
+        break
+sources = sources.replace('}, {', '},{').replace('},{', '},\n{')
+input(sources)
+
+
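The new asoueces.py looks like a scratch script: it embeds a block of scraped YouTube link records, collapses runs of whitespace by repeated replacement, and reprints one record per line. A regular expression does the same collapsing in one pass; a small equivalent sketch (the function name is illustrative, not part of the package):

    import re

    def normalize_sources(raw: str) -> str:
        # Join non-empty lines, collapse tabs and space runs to single spaces,
        # then break the flattened text back into one {...} record per line.
        flat = ' '.join(line for line in raw.split('\n') if line)
        flat = re.sub(r'[ \t]+', ' ', flat)
        return flat.replace('}, {', '},{').replace('},{', '},\n{')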
abstract_webtools/managers/urlManager/urlManager.py
@@ -1,230 +1,220 @@
+import re
+import urllib.parse
+import requests
+from urllib.parse import urlparse, urlunparse, urljoin
+
 from ...abstract_webtools import *
+
 class urlManager:
     """
-    urlManager
-
-
-
-
-
-    Attributes:
-        session (requests.Session): The requests session used for making HTTP requests.
-        clean_urls (list): List of cleaned URL variations.
-        url (str): The current URL.
-        protocol (str): The protocol part of the URL (e.g., "https").
-        domain (str): The domain part of the URL (e.g., "example.com").
-        path (str): The path part of the URL (e.g., "/path/to/resource").
-        query (str): The query part of the URL (e.g., "?param=value").
-        all_urls (list): List of all URLs (not used in the provided code).
-
-    Methods:
-        url_to_pieces(url): Split a URL into its protocol, domain, path, and query components.
-        clean_url(url): Return a list of potential URL versions with and without 'www' and 'http(s)'.
-        get_correct_url(url): Get the correct version of the URL from possible variations.
-        update_url(url): Update the URL and related attributes.
-        get_domain(url): Get the domain name from a URL.
-        url_join(url, path): Join a base URL with a path.
-        is_valid_url(url): Check if a URL is valid.
-        make_valid(href, url): Make a URL valid by joining it with a base URL.
-        get_relative_href(url, href): Get the relative href URL by joining it with a base URL.
-
-    Note:
-        - The urlManager class provides methods for managing URLs, including cleaning and validating them.
-        - It also includes methods for joining and validating relative URLs.
+    Revised urlManager for managing and cleaning URLs.
+
+    It splits URLs into their components, normalizes them (trimming spaces, lowercasing
+    scheme and domain, removing default ports, and cleaning up paths), and then creates
+    a list of potential variants (with/without www, http/https) so that a valid version
+    can be determined.
     """
-
     def __init__(self, url=None, session=None):
-        """
-        Initialize a urlManager instance.
-
-        Args:
-            url (str or None): The URL to manage (default is None).
-            session (requests.Session): A custom requests session (default is the requests module's session).
-        """
         url = url or 'www.example.com'
-        self._url=url
-        self.
-        self.
-        self.
-        self.
-        url_pieces = self.url_to_pieces(url=self.url)
-        self.protocol,self.domain,self.path,self.query=url_pieces
+        self._url = url
+        self.session = session or requests
+        self.clean_urls = self.clean_url(url)
+        self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
+        self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
         self.all_urls = []
+
     def url_to_pieces(self, url):
-
+        """
+        Split a URL into protocol, domain, path, and query components.
+        Uses urlparse for robustness.
+        """
        try:
-
-            if
-
-
-
-
-
-
-            protocol,domain,path,query=None,None,"",""
+            parsed = urlparse(url)
+            protocol = parsed.scheme if parsed.scheme else None
+            domain = parsed.netloc if parsed.netloc else None
+            path = parsed.path or ""
+            query = parsed.query or ""
+        except Exception as e:
+            print(f'The URL {url} was not reachable: {e}')
+            protocol, domain, path, query = None, None, "", ""
         return protocol, domain, path, query
 
-    def clean_url(self,url=None) -> list:
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def clean_url(self, url=None) -> list:
+        """
+        Normalize and clean the URL, then return a list of potential URL variants.
+
+        This method:
+          - Strips whitespace.
+          - Adds a scheme (defaults to https) if missing.
+          - Lowercases the scheme and domain.
+          - Removes default ports.
+          - Cleans up the path (removing duplicate slashes and trailing slash).
+          - Generates variants with and without 'www', and with both http and https.
+        """
+        url = url or self._url
+        url = url.strip()
+        # Ensure the URL has a scheme
+        if not re.match(r'https?://', url):
+            url = 'https://' + url
+
+        parsed = urlparse(url)
+        scheme = parsed.scheme.lower()
+        netloc = parsed.netloc.lower()
+        # Remove default port numbers if present
+        if ':' in netloc:
+            host, port = netloc.split(':', 1)
+            if (scheme == "http" and port == "80") or (scheme == "https" and port == "443"):
+                netloc = host
+
+        # Normalize the path: remove duplicate slashes and a trailing slash
+        path = re.sub(r'//+', '/', parsed.path).rstrip('/')
+
+        # Rebuild the cleaned URL without query or fragment
+        cleaned_url = urlunparse((scheme, netloc, path, '', '', ''))
+
+        variants = []
+        # Add the primary variant
+        variants.append(cleaned_url)
+        # Generate a variant with/without 'www'
+        if netloc.startswith('www.'):
+            no_www = netloc[4:]
+            variants.append(urlunparse((scheme, no_www, path, '', '', '')))
+        else:
+            variants.append(urlunparse((scheme, f"www.{netloc}", path, '', '', '')))
+
+        # Also generate variants with the alternate scheme
+        alt_scheme = 'http' if scheme == 'https' else 'https'
+        for variant in list(variants):
+            parsed_variant = urlparse(variant)
+            alt_variant = urlunparse((alt_scheme, parsed_variant.netloc, parsed_variant.path, '', '', ''))
+            variants.append(alt_variant)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_variants = []
+        for v in variants:
+            if v not in seen:
+                unique_variants.append(v)
+                seen.add(v)
+        return unique_variants
+
+    def get_correct_url(self, url=None, clean_urls=None) -> str:
+        """
+        Attempts each URL variant by making an HTTP GET request.
+        Returns the first variant that returns a 200 OK response.
+        """
+        if url is None and clean_urls is None:
+            url = self._url
+            clean_urls = self.clean_urls
+        if url is not None and clean_urls is None:
+            clean_urls = self.clean_url(url)
+        elif url is None and clean_urls is not None:
+            url = self._url
+
+        for candidate in clean_urls:
             try:
-
-
+                response = self.session.get(candidate, timeout=5)
+                if response.status_code == 200:
+                    return candidate
             except requests.exceptions.RequestException as e:
-                print(e)
+                print(f"Failed to reach {candidate}: {e}")
         return None
-
-
-
-
-
-        self.
-        self.
+
+    def update_url(self, url):
+        """
+        Update the URL and refresh related attributes.
+        """
+        self._url = url
+        self.clean_urls = self.clean_url(url)
+        self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
+        self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
         self.all_urls = []
-
-
+
+    def get_domain(self, url=None):
+        url = url or self.url
         return urlparse(url).netloc
-
-
-
-
-
-
-
-        return
+
+    def url_join(self, base_url, path):
+        """
+        Joins a base URL with a relative path.
+        """
+        base_url = base_url.strip().rstrip('/')
+        path = path.strip().lstrip('/')
+        return f"{base_url}/{path}"
+
     @property
     def url(self):
         return self._url
+
     @url.setter
     def url(self, new_url):
         self._url = new_url
-
+
+    def is_valid_url(self, url=None):
         """
         Check if the given URL is valid.
         """
-        url = url or self.url
+        url = url or self.url
         parsed = urlparse(url)
-        return bool(parsed.
+        return bool(parsed.scheme) and bool(parsed.netloc)
 
-    def make_valid(self,href,url=None):
-
-
-
-
-        """
-        parsed = urlparse(url)
-        return bool(parsed.netloc) and bool(parsed.scheme)
-        if is_valid_url(href):
+    def make_valid(self, href, url=None):
+        """
+        Validate a href. If it's not already valid, join it with the base URL.
+        """
+        if self.is_valid_url(href):
             return href
-
-
+        base = url or self.url
+        new_link = urljoin(base, href)
+        if self.is_valid_url(new_link):
             return new_link
         return False
-
-    def get_relative_href(self,
-
-
-
-
-
-
-        return
-
-
-
+
+    def get_relative_href(self, base, href):
+        """
+        For a relative href, join it with the base URL and strip any query or fragment.
+        """
+        joined = urljoin(base, href)
+        parsed = urlparse(joined)
+        clean_href = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
+        return clean_href
+
+    def url_basename(self, url=None):
+        url = url or self.url
+        path = urlparse(url).path
         return path.strip('/').split('/')[-1]
 
+    def base_url(self, url=None):
+        url = url or self.url
+        match = re.match(r'https?://[^?#/]+/', url)
+        if match:
+            return match.group()
+        return None
+
+    def urljoin(self, base, path):
+        return urljoin(base, path)
 
-    def base_url(self,url=None):
-        url = url or self.url
-        return re.match(r'https?://[^?#]+/', url).group()
-
-
-    def urljoin(self,base, path):
-        if isinstance(path, bytes):
-            path = path.decode()
-        if not isinstance(path, str) or not path:
-            return None
-        if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
-            return path
-        if isinstance(base, bytes):
-            base = base.decode()
-        if not isinstance(base, str) or not re.match(
-                r'^(?:https?:)?//', base):
-            return None
-        return urllib.parse.urljoin(base, path)
 class urlManagerSingleton:
     _instance = None
+
     @staticmethod
-    def get_instance(url=None,session=requests):
+    def get_instance(url=None, session=requests):
         if urlManagerSingleton._instance is None:
-            urlManagerSingleton._instance = urlManager(url,session=session)
+            urlManagerSingleton._instance = urlManager(url, session=session)
         elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
-            urlManagerSingleton._instance = urlManager(url,session=session)
+            urlManagerSingleton._instance = urlManager(url, session=session)
         return urlManagerSingleton._instance
 
-def get_url(url=None,url_mgr=None):
+def get_url(url=None, url_mgr=None):
     if not url and not url_mgr:
         return None
     if url:
         url_mgr = urlManager(url)
     return url_mgr.url
-
-
+
+def get_url_mgr(url=None, url_mgr=None):
+    if url_mgr is None and url:
         url_mgr = urlManager(url=url)
-    if url_mgr and url
+    if url_mgr and url is None:
         url = url_mgr.url
-    return url_mgr
+    return url_mgr
{abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: abstract_webtools
-Version: 0.1.6.38
+Version: 0.1.6.40
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/RECORD
@@ -17,7 +17,7 @@ abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rg
 abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
 abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
 abstract_webtools/managers/requestManager.py,sha256=zXD31WAYghV1OjnTQzRQnQGqZz6_J4mjHTdNLnBop_0,17343
-abstract_webtools/managers/seleniumManager.py,sha256=
+abstract_webtools/managers/seleniumManager.py,sha256=B7X6nTfxs1eHFDo7LKB1N5LhDytZQzHPgJjna2c2j6E,5017
 abstract_webtools/managers/soupManager.py,sha256=-_mRCWlyzfKlF64UU53WXBmCvJ98jQ4GyHh8S8Pw3xs,17198
 abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
 abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
@@ -30,12 +30,13 @@ abstract_webtools/managers/linkManager/linkManager.py,sha256=roxOzOELca0rOlcMaJk
 abstract_webtools/managers/requestManager/__init__.py,sha256=z2qGtweEoO_OKr959LGxVXEMu1hu7PIkmh89BEh5TI8,30
 abstract_webtools/managers/requestManager/requestManager.py,sha256=MrPJAXRNDXjwE_BzJF3xwraT54IxVNmPU4eHhDgNmbE,17351
 abstract_webtools/managers/soupManager/__init__.py,sha256=mqfXfqM9sWlYpOkoXUqtBoVvk2KQx1862NnmRVJwGtY,27
+abstract_webtools/managers/soupManager/asoueces.py,sha256=OaXqolZl0dI7b09NYwJ3Wnhuxf89ahZ1GjsOqy0GXfk,3506
 abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2sIkg-bHxBt2mKpYMyZd-nJjLQ,17201
 abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
-abstract_webtools/managers/urlManager/urlManager.py,sha256
+abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
 abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools-0.1.6.40.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
+abstract_webtools-0.1.6.40.dist-info/METADATA,sha256=kKCrv_8-h4JuBeI1YGfB-sWkaQp-H5LS5IyPfw1ooUs,16051
+abstract_webtools-0.1.6.40.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+abstract_webtools-0.1.6.40.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.40.dist-info/RECORD,,
{abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/LICENSE: file without changes
{abstract_webtools-0.1.6.38.dist-info → abstract_webtools-0.1.6.40.dist-info}/top_level.txt: file without changes
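For reference, each RECORD row has the form path,sha256=<digest>,<size>, where the digest is the urlsafe base64-encoded SHA-256 of the file with the '=' padding stripped (per PEP 376 and the wheel spec), which is why only the two rewritten modules and the new asoueces.py receive new hashes above. A small sketch of how such a row can be recomputed (the path is illustrative):

    import base64
    import hashlib

    def record_hash(path: str) -> str:
        # urlsafe base64 of the SHA-256 digest, '=' padding removed,
        # as used in wheel RECORD files.
        with open(path, 'rb') as f:
            digest = hashlib.sha256(f.read()).digest()
        return base64.urlsafe_b64encode(digest).rstrip(b'=').decode('ascii')

Running record_hash over seleniumManager.py as shipped in the 0.1.6.40 wheel should reproduce the B7X6... value recorded above.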