abstract-webtools 0.1.6.38__tar.gz → 0.1.6.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {abstract_webtools-0.1.6.38/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.40}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/setup.py +1 -1
- abstract_webtools-0.1.6.40/src/abstract_webtools/managers/seleniumManager.py +128 -0
- abstract_webtools-0.1.6.40/src/abstract_webtools/managers/soupManager/asoueces.py +135 -0
- abstract_webtools-0.1.6.40/src/abstract_webtools/managers/urlManager/urlManager.py +220 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/SOURCES.txt +1 -0
- abstract_webtools-0.1.6.38/src/abstract_webtools/managers/seleniumManager.py +0 -119
- abstract_webtools-0.1.6.38/src/abstract_webtools/managers/urlManager/urlManager.py +0 -230
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/LICENSE +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/README.md +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/pyproject.toml +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/setup.cfg +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/__init__.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/abstract_webtools.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/__init__.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/cipherManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/crawlManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/curlMgr.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/domainManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/get_test.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/mySocketClient.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/networkManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/requestManager/requestManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/sslManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/userAgentManager.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/videoDownloader.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/url_grabber.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/requires.txt +0 -0
- {abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.6.38/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.40}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: abstract_webtools
-Version: 0.1.6.38
+Version: 0.1.6.40
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/setup.py
RENAMED
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.38',
+    version='0.1.6.40',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
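Aside from the file additions and removals below, the release is a straight version bump; assuming the package is published to PyPI under the name shown in the title, picking it up is a plain upgrade:

pip install --upgrade abstract-webtools==0.1.6.40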
abstract_webtools-0.1.6.40/src/abstract_webtools/managers/seleniumManager.py
ADDED
@@ -0,0 +1,128 @@
+import os
+from urllib.parse import urlparse
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager  # For automatic ChromeDriver installation
+import logging
+import urllib3
+from ..abstract_webtools import *  # Assuming this is a valid import
+from .urlManager import *
+
+# Suppress urllib3 warnings and debug logs
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("selenium").setLevel(logging.WARNING)
+
+# Default Chrome options (can be overridden)
+DEFAULT_CHROME_OPTIONS = [
+    "--headless",  # Run in headless mode
+    "--no-sandbox",
+    "--disable-dev-shm-usage",  # Avoid memory issues on servers
+    "--disable-gpu",
+    "--disable-software-rasterizer",
+    "--disable-extensions",
+    "--remote-debugging-port=9222"
+]
+
+class SingletonMeta(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
+class SeleniumManager(metaclass=SingletonMeta):
+    def __init__(self, url):
+        if not hasattr(self, 'initialized'):  # Prevent reinitialization
+            self.initialized = True
+            parsed_url = urlparse(url)
+            self.domain = parsed_url.netloc
+            self.scheme = parsed_url.scheme or "https"  # Default to https if scheme is missing
+            self.base_url = f"{self.scheme}://{self.domain}"
+            self.site_dir = os.path.join(os.getcwd(), self.domain)
+            os.makedirs(self.site_dir, exist_ok=True)
+            self.drivers = {}
+            self.page_type = []
+
+    def get_url_to_path(self, url):
+        url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/'])  # Assuming eatAll is defined elsewhere
+        parsed_url = urlparse(url)
+        if parsed_url.netloc == self.domain:
+            paths = parsed_url.path.split('/')
+            dir_path = self.site_dir
+            for path in paths[:-1]:
+                dir_path = os.path.join(dir_path, path)
+                os.makedirs(dir_path, exist_ok=True)
+            self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if not self.page_type else self.page_type[-1])
+            dir_path = os.path.join(dir_path, paths[-1])
+            return dir_path
+
+    def saved_url_check(self, url):
+        return self.get_url_to_path(url)
+
+    def get_with_netloc(self, url):
+        parsed_url = urlparse(url)
+        if not parsed_url.netloc:
+            url = f"{self.scheme}://{self.domain}/{url.strip('/')}"
+        return url
+
+    def get_driver(self, url):
+        if url and url not in self.drivers:
+            # Set up Chrome options
+            chrome_options = Options()
+            for option in DEFAULT_CHROME_OPTIONS:
+                chrome_options.add_argument(option)
+
+            # Specify Chrome binary location if needed (optional, comment out if not applicable)
+            # chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
+
+            # Automatically install and use ChromeDriver
+            service = Service(ChromeDriverManager().install())
+            driver = webdriver.Chrome(service=service, options=chrome_options)
+
+            self.drivers[url] = driver
+            driver.get(url)
+        return self.drivers[url]
+
+    def quit_driver(self, url):
+        """Clean up a specific driver instance."""
+        if url in self.drivers:
+            self.drivers[url].quit()
+            del self.drivers[url]
+
+    def quit_all_drivers(self):
+        """Clean up all driver instances."""
+        for driver in self.drivers.values():
+            driver.quit()
+        self.drivers.clear()
+
+def normalize_url(url, base_url=None):
+    """Normalize and resolve relative URLs."""
+    manager = SeleniumManager(url)
+    base_url = manager.base_url if base_url is None else base_url
+    if url.startswith(base_url):
+        url = url[len(base_url):]
+    normalized_url = urljoin(base_url, url.split('#')[0])
+    if not normalized_url.startswith(base_url):
+        return None
+    return normalized_url
+
+def get_selenium_source(url):
+    """Fetch page source using Selenium."""
+    url_mgr = urlManager(url)  # Assuming urlManager is defined elsewhere
+    if url_mgr.url:
+        url = str(url_mgr.url)
+        manager = SeleniumManager(url)
+        driver = manager.get_driver(url)
+        try:
+            return driver.page_source
+        except Exception as e:
+            logging.error(f"Error fetching page source for {url}: {e}")
+            return None
+        # Note: Driver is not quit here to maintain Singleton behavior
+
+# Ensure cleanup on program exit (optional)
+import atexit
+atexit.register(lambda: SeleniumManager(url="").quit_all_drivers())  # Cleanup all drivers on exit
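For readers skimming the diff: the new module keeps one headless Chrome instance per URL behind a singleton and now resolves the driver binary through webdriver-manager instead of a hard-coded path. A minimal usage sketch, assuming Chrome and the webdriver-manager package are installed (the import path follows the file's location in SOURCES.txt; the URL is just an example):

from abstract_webtools.managers.seleniumManager import SeleniumManager, get_selenium_source

html = get_selenium_source("https://www.example.com")        # first call launches headless Chrome
html_again = get_selenium_source("https://www.example.com")  # reuses the driver cached for this URL
SeleniumManager("https://www.example.com").quit_all_drivers()  # explicit cleanup; atexit also covers exit

One design consequence worth noting: because SingletonMeta caches the first instance, the constructor argument only takes effect on the first call; every later SeleniumManager(...) returns that original instance, whatever URL is passed.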
abstract_webtools-0.1.6.40/src/abstract_webtools/managers/soupManager/asoueces.py
ADDED
@@ -0,0 +1,135 @@
+text = """{"title": "NoviSoul
+novissbm@gmail.com", "href": "http://www.youtube.com/signin?authuser=0&next=%2Fwatch%3Fv%3DEaIYRM1yrM4&action_handle_signin=true", "description": ""},
+{"title": "Sign in", "href": "https://accounts.google.com/ServiceLogin?continue=http%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26hl%3Den_GB%26next%3D%252Fwatch%253Fv%253DEaIYRM1yrM4%26nomobiletemp%3D1&uilel=3&service=youtube&passive=true&hl=en_GB", "description": ""},
+{"title": "Sign up", "href": "http://www.youtube.com/signup?next=%2Fwatch%3Fv%3DEaIYRM1yrM4", "description": ""},
+{"title": "9:58
+
+
+
+
+
+Physics of Free Energy Deviceby Eugene Jeong
+
+336,881 views", "href": "http://www.youtube.com/watch?v=EB-jWfzkz_E", "description": ""},
+{"title": "4:49
+
+
+
+
+
+[www.witts.ws] Self-Running 40kW (40,000 Watt) Fuelless Generator (1 of 3)by wits2014
+
+488,638 views", "href": "http://www.youtube.com/watch?v=LFu-s6ZmGyE", "description": ""},
+{"title": "2:33
+
+
+
+
+
+Free Energy - Evidence of military antigravity technologyby DoubleMarkez
+
+390,020 views", "href": "http://www.youtube.com/watch?v=qljY-YfFaPc", "description": ""},
+{"title": "15:01
+
+
+
+
+
+APEX 2013 SSBM L10 Shroomed VS CT EMP Mew2Kingby Jason AxelrodRecommended for you", "href": "http://www.youtube.com/watch?v=pc7v49k5FhY", "description": ""},
+{"title": "161
+
+
+videos
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Play all
+
+
+
+washby dle3276", "href": "http://www.youtube.com/watch?v=AmcSt5hU4qA&list=PL4517CA6C6244A844", "description": ""},
+{"title": "10:31
+
+
+
+
+
+Pyramid Magnet - free energy - english subtitleby MrTermsof
+
+616,081 views", "href": "http://www.youtube.com/watch?v=pMbHswNoGWM", "description": ""},
+{"title": "4:11
+
+
+
+
+
+My all new newman motor 1.(TheDaftman)by theDaftman
+
+1,147,470 views", "href": "http://www.youtube.com/watch?v=dL4B_DNBtvc", "description": ""},
+{"title": "2:18
+
+
+
+
+
+Is there free energy in magnets?by aetherix01
+
+371,642 views", "href": "http://www.youtube.com/watch?v=vrn5B9a8aOk", "description": ""},
+{"title": "3:00
+
+
+
+
+
+The Most Dangerous Video On The Internet - Trevor Paglenby killuminati63
+
+585,755 views", "href": "http://www.youtube.com/watch?v=9xEuhEHDJM8", "description": ""},
+{"title": "2:18
+
+
+
+
+
+Free Energy - Magnet Motorby ATBootstrap
+
+358,641 views", "href": "http://www.youtube.com/watch?v=hfkwCE3BeBs", "description": ""},
+{"title": "2:38
+
+
+
+
+
+100% free energy generator is easy to buildby LifeHack2012
+
+238,092 views", "href": "http://www.youtube.com/watch?v=GEUyhhMEs7U", "description": ""},
+{"title": "3:41
+
+
+
+
+
+5KW free energy бестопливный генератор Kapanadze Капанадзеby Alexander Frolov
+
+488,213 views", "href": "http://www.youtube.com/watch?v=uxQ99R4gOWY", "description": ""},""".split('\n')
+sources = ' '.join([te for te in text if te])
+while True:
+    if '  ' in sources:
+        sources = sources.replace('  ', ' ').replace('\t', ' ')
+    else:
+        break
+sources = sources.replace('}, {', '},{').replace('},{', '},\n{')
+input(sources)
+
+
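The new asoueces.py reads like a scratch or debug script rather than library code (note the input(sources) at the end): it flattens a pasted blob of scraped YouTube link text into one JSON-like record per line. The replace-until-stable loop it uses to squeeze runs of blanks can be written as a single regex substitution; a small equivalent sketch (equivalent in the usual case where tabs appear alongside repeated spaces, since the original loop only keeps running while a double space remains):

import re

def collapse_ws(s: str) -> str:
    # Collapse any run of spaces/tabs to a single space in one pass,
    # which is what the while-loop in asoueces.py converges to.
    return re.sub(r'[ \t]+', ' ', s)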
abstract_webtools-0.1.6.40/src/abstract_webtools/managers/urlManager/urlManager.py
ADDED
@@ -0,0 +1,220 @@
+import re
+import urllib.parse
+import requests
+from urllib.parse import urlparse, urlunparse, urljoin
+
+from ...abstract_webtools import *
+
+class urlManager:
+    """
+    Revised urlManager for managing and cleaning URLs.
+
+    It splits URLs into their components, normalizes them (trimming spaces, lowercasing
+    scheme and domain, removing default ports, and cleaning up paths), and then creates
+    a list of potential variants (with/without www, http/https) so that a valid version
+    can be determined.
+    """
+    def __init__(self, url=None, session=None):
+        url = url or 'www.example.com'
+        self._url = url
+        self.session = session or requests
+        self.clean_urls = self.clean_url(url)
+        self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
+        self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+        self.all_urls = []
+
+    def url_to_pieces(self, url):
+        """
+        Split a URL into protocol, domain, path, and query components.
+        Uses urlparse for robustness.
+        """
+        try:
+            parsed = urlparse(url)
+            protocol = parsed.scheme if parsed.scheme else None
+            domain = parsed.netloc if parsed.netloc else None
+            path = parsed.path or ""
+            query = parsed.query or ""
+        except Exception as e:
+            print(f'The URL {url} was not reachable: {e}')
+            protocol, domain, path, query = None, None, "", ""
+        return protocol, domain, path, query
+
+    def clean_url(self, url=None) -> list:
+        """
+        Normalize and clean the URL, then return a list of potential URL variants.
+
+        This method:
+          - Strips whitespace.
+          - Adds a scheme (defaults to https) if missing.
+          - Lowercases the scheme and domain.
+          - Removes default ports.
+          - Cleans up the path (removing duplicate slashes and trailing slash).
+          - Generates variants with and without 'www', and with both http and https.
+        """
+        url = url or self._url
+        url = url.strip()
+        # Ensure the URL has a scheme
+        if not re.match(r'https?://', url):
+            url = 'https://' + url
+
+        parsed = urlparse(url)
+        scheme = parsed.scheme.lower()
+        netloc = parsed.netloc.lower()
+        # Remove default port numbers if present
+        if ':' in netloc:
+            host, port = netloc.split(':', 1)
+            if (scheme == "http" and port == "80") or (scheme == "https" and port == "443"):
+                netloc = host
+
+        # Normalize the path: remove duplicate slashes and a trailing slash
+        path = re.sub(r'//+', '/', parsed.path).rstrip('/')
+
+        # Rebuild the cleaned URL without query or fragment
+        cleaned_url = urlunparse((scheme, netloc, path, '', '', ''))
+
+        variants = []
+        # Add the primary variant
+        variants.append(cleaned_url)
+        # Generate a variant with/without 'www'
+        if netloc.startswith('www.'):
+            no_www = netloc[4:]
+            variants.append(urlunparse((scheme, no_www, path, '', '', '')))
+        else:
+            variants.append(urlunparse((scheme, f"www.{netloc}", path, '', '', '')))
+
+        # Also generate variants with the alternate scheme
+        alt_scheme = 'http' if scheme == 'https' else 'https'
+        for variant in list(variants):
+            parsed_variant = urlparse(variant)
+            alt_variant = urlunparse((alt_scheme, parsed_variant.netloc, parsed_variant.path, '', '', ''))
+            variants.append(alt_variant)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_variants = []
+        for v in variants:
+            if v not in seen:
+                unique_variants.append(v)
+                seen.add(v)
+        return unique_variants
+
+    def get_correct_url(self, url=None, clean_urls=None) -> str:
+        """
+        Attempts each URL variant by making an HTTP GET request.
+        Returns the first variant that returns a 200 OK response.
+        """
+        if url is None and clean_urls is None:
+            url = self._url
+            clean_urls = self.clean_urls
+        if url is not None and clean_urls is None:
+            clean_urls = self.clean_url(url)
+        elif url is None and clean_urls is not None:
+            url = self._url
+
+        for candidate in clean_urls:
+            try:
+                response = self.session.get(candidate, timeout=5)
+                if response.status_code == 200:
+                    return candidate
+            except requests.exceptions.RequestException as e:
+                print(f"Failed to reach {candidate}: {e}")
+        return None
+
+    def update_url(self, url):
+        """
+        Update the URL and refresh related attributes.
+        """
+        self._url = url
+        self.clean_urls = self.clean_url(url)
+        self.url = self.get_correct_url(clean_urls=self.clean_urls) or url
+        self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+        self.all_urls = []
+
+    def get_domain(self, url=None):
+        url = url or self.url
+        return urlparse(url).netloc
+
+    def url_join(self, base_url, path):
+        """
+        Joins a base URL with a relative path.
+        """
+        base_url = base_url.strip().rstrip('/')
+        path = path.strip().lstrip('/')
+        return f"{base_url}/{path}"
+
+    @property
+    def url(self):
+        return self._url
+
+    @url.setter
+    def url(self, new_url):
+        self._url = new_url
+
+    def is_valid_url(self, url=None):
+        """
+        Check if the given URL is valid.
+        """
+        url = url or self.url
+        parsed = urlparse(url)
+        return bool(parsed.scheme) and bool(parsed.netloc)
+
+    def make_valid(self, href, url=None):
+        """
+        Validate a href. If it's not already valid, join it with the base URL.
+        """
+        if self.is_valid_url(href):
+            return href
+        base = url or self.url
+        new_link = urljoin(base, href)
+        if self.is_valid_url(new_link):
+            return new_link
+        return False
+
+    def get_relative_href(self, base, href):
+        """
+        For a relative href, join it with the base URL and strip any query or fragment.
+        """
+        joined = urljoin(base, href)
+        parsed = urlparse(joined)
+        clean_href = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))
+        return clean_href
+
+    def url_basename(self, url=None):
+        url = url or self.url
+        path = urlparse(url).path
+        return path.strip('/').split('/')[-1]
+
+    def base_url(self, url=None):
+        url = url or self.url
+        match = re.match(r'https?://[^?#/]+/', url)
+        if match:
+            return match.group()
+        return None
+
+    def urljoin(self, base, path):
+        return urljoin(base, path)
+
+class urlManagerSingleton:
+    _instance = None
+
+    @staticmethod
+    def get_instance(url=None, session=requests):
+        if urlManagerSingleton._instance is None:
+            urlManagerSingleton._instance = urlManager(url, session=session)
+        elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
+            urlManagerSingleton._instance = urlManager(url, session=session)
+        return urlManagerSingleton._instance
+
+def get_url(url=None, url_mgr=None):
+    if not url and not url_mgr:
+        return None
+    if url:
+        url_mgr = urlManager(url)
+    return url_mgr.url
+
+def get_url_mgr(url=None, url_mgr=None):
+    if url_mgr is None and url:
+        url_mgr = urlManager(url=url)
+    if url_mgr and url is None:
+        url = url_mgr.url
+    return url_mgr
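The substantive change in this rewrite is clean_url: it now normalizes the URL (default scheme, lowercasing, default-port and duplicate-slash removal) and emits an ordered, de-duplicated variant list, where the old version only string-swapped prefixes. A quick sketch of what it produces, assuming the import path matches the new file location (note that constructing the manager fires GET probes via get_correct_url, so this touches the network):

from abstract_webtools.managers.urlManager.urlManager import urlManager

mgr = urlManager("Example.com/a//b/")
print(mgr.clean_urls)
# Per clean_url's logic, the variants come out as:
# ['https://example.com/a/b', 'https://www.example.com/a/b',
#  'http://example.com/a/b', 'http://www.example.com/a/b']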
{abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40/src/abstract_webtools.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: abstract_webtools
-Version: 0.1.6.38
+Version: 0.1.6.40
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.38 → abstract_webtools-0.1.6.40}/src/abstract_webtools.egg-info/SOURCES.txt
RENAMED
@@ -36,6 +36,7 @@ src/abstract_webtools/managers/linkManager/linkManager.py
 src/abstract_webtools/managers/requestManager/__init__.py
 src/abstract_webtools/managers/requestManager/requestManager.py
 src/abstract_webtools/managers/soupManager/__init__.py
+src/abstract_webtools/managers/soupManager/asoueces.py
 src/abstract_webtools/managers/soupManager/soupManager.py
 src/abstract_webtools/managers/urlManager/__init__.py
 src/abstract_webtools/managers/urlManager/urlManager.py
abstract_webtools-0.1.6.38/src/abstract_webtools/managers/seleniumManager.py
DELETED
@@ -1,119 +0,0 @@
-import os
-from ..abstract_webtools import *
-from .urlManager import *
-from urllib.parse import urlparse
-from abstract_utilities import *
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-import logging
-import urllib3
-
-# Suppress urllib3 warnings and debug logs
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-logging.getLogger("urllib3").setLevel(logging.WARNING)
-
-# Suppress Selenium logs
-logging.getLogger("selenium").setLevel(logging.WARNING)
-
-import os
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-
-# Setup Chrome options
-chrome_options = Options()
-chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
-chrome_options.add_argument("--headless")  # Run in headless mode
-chrome_options.add_argument("--no-sandbox")
-chrome_options.add_argument("--disable-dev-shm-usage")
-chrome_options.add_argument("--disable-gpu")
-chrome_options.add_argument("--disable-software-rasterizer")
-chrome_options.add_argument("--disable-extensions")
-chrome_options.add_argument("--remote-debugging-port=9222")
-
-
-class SingletonMeta(type):
-    _instances = {}
-    def __call__(cls, *args, **kwargs):
-        if cls not in cls._instances:
-            instance = super().__call__(*args, **kwargs)
-            cls._instances[cls] = instance
-        return cls._instances[cls]
-
-class seleniumManager(metaclass=SingletonMeta):
-    def __init__(self, url):
-        if not hasattr(self, 'initialized'):  # Prevent reinitialization
-            self.initialized = True
-            parsed_url = urlparse(url)
-            self.domain = parsed_url.netloc
-            self.scheme = parsed_url.scheme
-            self.base_url= f"{self.scheme}{self.domain}"
-            self.site_dir = os.path.join(os.getcwd(), self.domain)
-            os.makedirs(self.site_dir, exist_ok=True)
-            self.drivers = {}
-            self.page_type = []
-
-    def get_url_to_path(self, url):
-        url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
-        parsed_url = urlparse(url)
-        if parsed_url.netloc == self.domain:
-            paths = parsed_url.path.split('/')
-            dir_path = self.site_dir
-            for path in paths[:-1]:
-                dir_path = os.path.join(dir_path, path)
-                os.makedirs(dir_path, exist_ok=True)
-            self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
-
-            dir_path = os.path.join(dir_path, paths[-1])
-            return dir_path
-
-    def saved_url_check(self, url):
-        path = self.get_url_to_path(url)
-        return path
-
-    def get_with_netloc(self, url):
-        parsed_url = urlparse(url)
-        if parsed_url.netloc == '':
-            url = f"{self.scheme}://{self.domain}/{url.strip()}"
-        return url
-
-    def get_driver(self, url):
-        if url and url not in self.drivers:
-            chrome_options = Options()
-            chrome_options.add_argument("--headless")
-            driver = webdriver.Chrome(options=chrome_options)
-            self.drivers[url] = driver
-            driver.get(url)
-        return self.drivers[url]
-def normalize_url(url, base_url=None):
-    """
-    Normalize and resolve relative URLs, ensuring proper domain and format.
-    """
-    # If URL starts with the base URL repeated, remove the extra part
-    manager = seleniumManager(url)
-    base_url = manager.base_url
-    if url.startswith(base_url):
-        url = url[len(base_url):]
-
-    # Resolve the URL against the base URL
-    normalized_url = urljoin(base_url, url.split('#')[0])
-
-    # Ensure only URLs belonging to the base domain are kept
-    if not normalized_url.startswith(base_url):
-        return None
-
-    return normalized_url
-# Function to get Selenium page source
-def get_selenium_source(url):
-    url_mgr = urlManager(url)
-    if url_mgr.url:
-        url = str(url_mgr.url)
-        manager = seleniumManager(url)
-        driver = manager.get_driver(url)
-        try:
-            # Get page source
-            page_source = driver.page_source
-            return page_source
-        finally:
-            # Don't quit the driver unless you're done with all interactions
-            pass
-
abstract_webtools-0.1.6.38/src/abstract_webtools/managers/urlManager/urlManager.py
DELETED
@@ -1,230 +0,0 @@
-from ...abstract_webtools import *
-class urlManager:
-    """
-    urlManager is a class for managing URLs, including cleaning, validating, and finding the correct version.
-
-    Args:
-        url (str or None): The URL to manage (default is None).
-        session (requests.Session): A custom requests session (default is the requests module's session).
-
-    Attributes:
-        session (requests.Session): The requests session used for making HTTP requests.
-        clean_urls (list): List of cleaned URL variations.
-        url (str): The current URL.
-        protocol (str): The protocol part of the URL (e.g., "https").
-        domain (str): The domain part of the URL (e.g., "example.com").
-        path (str): The path part of the URL (e.g., "/path/to/resource").
-        query (str): The query part of the URL (e.g., "?param=value").
-        all_urls (list): List of all URLs (not used in the provided code).
-
-    Methods:
-        url_to_pieces(url): Split a URL into its protocol, domain, path, and query components.
-        clean_url(url): Return a list of potential URL versions with and without 'www' and 'http(s)'.
-        get_correct_url(url): Get the correct version of the URL from possible variations.
-        update_url(url): Update the URL and related attributes.
-        get_domain(url): Get the domain name from a URL.
-        url_join(url, path): Join a base URL with a path.
-        is_valid_url(url): Check if a URL is valid.
-        make_valid(href, url): Make a URL valid by joining it with a base URL.
-        get_relative_href(url, href): Get the relative href URL by joining it with a base URL.
-
-    Note:
-        - The urlManager class provides methods for managing URLs, including cleaning and validating them.
-        - It also includes methods for joining and validating relative URLs.
-    """
-
-    def __init__(self, url=None, session=None):
-        """
-        Initialize a urlManager instance.
-
-        Args:
-            url (str or None): The URL to manage (default is None).
-            session (requests.Session): A custom requests session (default is the requests module's session).
-        """
-        url = url or 'www.example.com'
-        self._url=url
-        self.url = url
-        self.session= session or requests
-        self.clean_urls = self.clean_url(url=url)
-        self.url = self.get_correct_url(clean_urls=self.clean_urls)
-        url_pieces = self.url_to_pieces(url=self.url)
-        self.protocol,self.domain,self.path,self.query=url_pieces
-        self.all_urls = []
-    def url_to_pieces(self, url):
-
-        try:
-            match = re.match(r'^(https?)?://?([^/]+)(/[^?]+)?(\?.+)?', url)
-            if match:
-                protocol = match.group(1) if match.group(1) else None
-                domain = match.group(2) if match.group(1) else None
-                path = match.group(3) if match.group(3) else ""  # Handle None
-                query = match.group(4) if match.group(4) else ""  # Handle None
-        except:
-            print(f'the url {url} was not reachable')
-            protocol,domain,path,query=None,None,"",""
-        return protocol, domain, path, query
-
-    def clean_url(self,url=None) -> list:
-        """
-        Given a URL, return a list with potential URL versions including with and without 'www.',
-        and with 'http://' and 'https://'.
-        """
-        url = url or self.url
-        urls=[]
-        if url:
-            # Remove http:// or https:// prefix
-            cleaned = url.replace("http://", "").replace("https://", "")
-            no_subdomain = cleaned.replace("www.", "", 1)
-
-            urls = [
-                f"https://{cleaned}",
-                f"http://{cleaned}",
-            ]
-
-            # Add variants without 'www' if it was present
-            if cleaned != no_subdomain:
-                urls.extend([
-                    f"https://{no_subdomain}",
-                    f"http://{no_subdomain}",
-                ])
-
-            # Add variants with 'www' if it wasn't present
-            else:
-                urls.extend([
-                    f"https://www.{cleaned}",
-                    f"http://www.{cleaned}",
-                ])
-
-        return urls
-
-    def get_correct_url(self,url=None,clean_urls=None) -> (str or None):
-        """
-        Gets the correct URL from the possible variations by trying each one with an HTTP request.
-
-        Args:
-            url (str): The URL to find the correct version of.
-            session (type(requests.Session), optional): The requests session to use for making HTTP requests.
-                Defaults to requests.
-
-        Returns:
-            str: The correct version of the URL if found, or None if none of the variations are valid.
-        """
-        self.url = url
-        if url==None and clean_urls != None:
-            if self.url:
-                url=self.url or clean_urls[0]
-        if url!=None and clean_urls==None:
-            clean_urls=self.clean_url(url)
-        elif url==None and clean_urls==None:
-            url=self.url
-            clean_urls=self.clean_urls
-        # Get the correct URL from the possible variations
-        for url in clean_urls:
-            try:
-                source = self.session.get(url)
-                return url
-            except requests.exceptions.RequestException as e:
-                print(e)
-        return None
-    def update_url(self,url):
-        # These methods seem essential for setting up the urlManager object.
-        self.url = url
-        self.clean_urls = self.clean_url()
-        self.correct_url = self.get_correct_url()
-        self.url =self.correct_url
-        self.protocol,self.domain,self.path,self.query=self.url_to_pieces(url=self.url)
-        self.all_urls = []
-    def get_domain(self,url=None):
-        url = url or self.url
-        return urlparse(url).netloc
-    def url_join(self,url,path):
-        url = eatOuter(url,['/'])
-        path = eatInner(path,['/'])
-        slash=''
-        if path[0] not in ['?','&']:
-            slash = '/'
-        url = url+slash+path
-        return url
-    @property
-    def url(self):
-        return self._url
-    @url.setter
-    def url(self, new_url):
-        self._url = new_url
-    def is_valid_url(self,url=None):
-        """
-        Check if the given URL is valid.
-        """
-        url = url or self.url
-        parsed = urlparse(url)
-        return bool(parsed.netloc) and bool(parsed.scheme)
-
-    def make_valid(self,href,url=None):
-        def is_valid_url(url):
-            url = url or self.url
-            """
-            Check if the given URL is valid.
-            """
-            parsed = urlparse(url)
-            return bool(parsed.netloc) and bool(parsed.scheme)
-        if is_valid_url(href):
-            return href
-        new_link=urljoin(url,href)
-        if is_valid_url(new_link):
-            return new_link
-        return False
-
-    def get_relative_href(self,url,href):
-        # join the URL if it's relative (not an absolute link)
-        url = url or self.url
-        href = urljoin(url, href)
-        parsed_href = urlparse(href)
-        # remove URL GET parameters, URL fragments, etc.
-        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
-        return href
-    def url_basename(self,url=None):
-        url = url or self.url
-        path = urllib.parse.urlparse(url).path
-        return path.strip('/').split('/')[-1]
-
-
-    def base_url(self,url=None):
-        url = url or self.url
-        return re.match(r'https?://[^?#]+/', url).group()
-
-
-    def urljoin(self,base, path):
-        if isinstance(path, bytes):
-            path = path.decode()
-        if not isinstance(path, str) or not path:
-            return None
-        if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
-            return path
-        if isinstance(base, bytes):
-            base = base.decode()
-        if not isinstance(base, str) or not re.match(
-                r'^(?:https?:)?//', base):
-            return None
-        return urllib.parse.urljoin(base, path)
-class urlManagerSingleton:
-    _instance = None
-    @staticmethod
-    def get_instance(url=None,session=requests):
-        if urlManagerSingleton._instance is None:
-            urlManagerSingleton._instance = urlManager(url,session=session)
-        elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
-            urlManagerSingleton._instance = urlManager(url,session=session)
-        return urlManagerSingleton._instance
-
-def get_url(url=None,url_mgr=None):
-    if not url and not url_mgr:
-        return None
-    if url:
-        url_mgr = urlManager(url)
-    return url_mgr.url
-def get_url_mgr(url=None,url_mgr=None):
-    if url_mgr == None and url:
-        url_mgr = urlManager(url=url)
-    if url_mgr and url == None:
-        url = url_mgr.url
-    return url_mgr
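Setting the two get_correct_url implementations side by side: the deleted one accepted the first variant whose GET merely didn't raise (any status code, no timeout), while the new one demands a 200 response and caps each probe at 5 seconds, falling back to the raw input URL when nothing qualifies. A sketch that makes the difference visible with a stub session (the stub is illustrative, not part of the package):

from abstract_webtools.managers.urlManager.urlManager import urlManager

class StubSession:
    """Resolves every URL but always answers 404."""
    def get(self, url, timeout=None):
        class Resp:
            status_code = 404
        return Resp()

mgr = urlManager("example.com", session=StubSession())
print(mgr.url)  # new code: no 200 anywhere, so it falls back to 'example.com';
                # the old code would have accepted 'https://example.com' outright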