abstract-webtools 0.1.6.15__py3-none-any.whl → 0.1.6.17__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- abstract_webtools/managers/crawlManager.py +4 -4
- abstract_webtools/managers/urlManager.py +19 -14
- {abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/RECORD +7 -7
- {abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/crawlManager.py

```diff
@@ -15,9 +15,8 @@ class crawlManager:
 
     def get_new_source_and_url(self, url=None):
         """Fetches new source code and response for a given URL."""
-
-
-        self.req_mgr.set_url(url)
+        url = url
+        self.req_mgr = get_req_mgr(url=url)
         self.source_code = self.req_mgr.source_code
         self.response = self.req_mgr.response
 
```
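The practical effect is that crawlManager no longer mutates its existing request manager in place; it builds a fresh one with get_req_mgr(url=url) on every call (the added `url = url` line is a leftover no-op from the edit). A minimal usage sketch, assuming the constructor takes a url keyword and that get_req_mgr returns an object exposing source_code and response; neither is confirmed by package documentation:

```python
from abstract_webtools.managers.crawlManager import crawlManager

# Constructor signature and the get_req_mgr return shape are assumptions
# taken from the diff context above.
crawler = crawlManager(url="https://example.com")

# Re-point the crawler at another page: a fresh request manager is built
# and the cached source and response are refreshed in one call.
crawler.get_new_source_and_url(url="https://example.com/about")
print(crawler.response)     # response from the rebuilt request manager
print(crawler.source_code)  # page source fetched for the new URL
```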
```diff
@@ -119,8 +118,9 @@ class crawlManager:
 
         return meta_info
 
-    def generate_sitemap(self):
+    def generate_sitemap(self,url=None):
         """Generates a sitemap.xml file with URLs, images, change frequency, and priority."""
+        url = url or self.url
         urls = self.get_all_website_links()
         with open('sitemap.xml', 'w', encoding='utf-8') as f:
             f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
```
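generate_sitemap now takes an optional url that falls back to self.url, the same pattern applied throughout this release; the output path remains hard-coded to sitemap.xml in the working directory. A usage sketch under the same assumed constructor signature as above:

```python
from abstract_webtools.managers.crawlManager import crawlManager

crawler = crawlManager(url="https://example.com")   # signature assumed
crawler.generate_sitemap()                          # crawls crawler.url
crawler.generate_sitemap(url="https://example.com/blog")  # per-call override
# Both calls write sitemap.xml into the current working directory.
```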
abstract_webtools/managers/urlManager.py

```diff
@@ -41,8 +41,9 @@ class urlManager:
             url (str or None): The URL to manage (default is None).
             session (requests.Session): A custom requests session (default is the requests module's session).
         """
-
-        self.
+        url = url or 'www.example.com'
+        self._url=url
+        self.url = url
         self.session= session or requests
         self.clean_urls = self.clean_url(url=url)
         self.url = self.get_correct_url(clean_urls=self.clean_urls)
```
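The constructor now guarantees a non-None URL ('www.example.com' when none is given) and seeds the _url backing field before the url property is assigned, so the @url.setter shown further down always has a value to work with. A sketch of the resulting behavior, assuming the signature urlManager(url=None, session=None) implied by the docstring:

```python
from abstract_webtools.managers.urlManager import urlManager

mgr = urlManager()                         # no URL: falls back to 'www.example.com'
mgr2 = urlManager(url="example.com/path")  # signature assumed from the docstring

# After __init__, .url holds whichever candidate get_correct_url()
# selected from the clean_url() variants.
print(mgr2.url)
```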
```diff
@@ -68,8 +69,7 @@ class urlManager:
         Given a URL, return a list with potential URL versions including with and without 'www.',
         and with 'http://' and 'https://'.
         """
-
-        url=self.url
+        url = url or self.url
         urls=[]
         if url:
             # Remove http:// or https:// prefix
```
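Here the method stops ignoring its argument: `url = url or self.url` replaces the unconditional `url=self.url`, so callers can expand an arbitrary URL while still defaulting to the managed one. Per the docstring, the expansion covers the scheme and 'www.' variants; an illustration, where the exact order and formatting of the candidates are assumptions rather than confirmed output:

```python
from abstract_webtools.managers.urlManager import urlManager

# clean_urls is populated in __init__ via self.clean_url(url=url).
candidates = urlManager(url="example.com").clean_urls
# plausibly: ['https://example.com', 'https://www.example.com',
#             'http://example.com',  'http://www.example.com']
```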
```diff
@@ -134,7 +134,8 @@ class urlManager:
         self.url =self.correct_url
         self.protocol,self.domain,self.path,self.query=self.url_to_pieces(url=self.url)
         self.all_urls = []
-    def get_domain(self,url):
+    def get_domain(self,url=None):
+        url = url or self.url
         return urlparse(url).netloc
     def url_join(self,url,path):
         url = eatOuter(url,['/'])
```
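get_domain picks up the same optional-argument pattern. Since it is a thin wrapper over urlparse(url).netloc, its behavior is easy to predict:

```python
from abstract_webtools.managers.urlManager import urlManager

mgr = urlManager(url="https://example.com/a/b")
mgr.get_domain()                       # 'example.com', falls back to self.url
mgr.get_domain("https://other.org/x")  # 'other.org'
mgr.get_domain("no-scheme.com")        # '' : without a scheme, urlparse
                                       # puts the host in .path, not .netloc
```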
```diff
@@ -150,16 +151,17 @@ class urlManager:
     @url.setter
     def url(self, new_url):
         self._url = new_url
-
-    def is_valid_url(url):
+    def is_valid_url(self,url=None):
         """
         Check if the given URL is valid.
         """
+        url = url or self.url
         parsed = urlparse(url)
         return bool(parsed.netloc) and bool(parsed.scheme)
-
-    def make_valid(href,url):
+
+    def make_valid(self,href,url=None):
         def is_valid_url(url):
+            url = url or self.url
             """
             Check if the given URL is valid.
             """
```
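This fixes a genuine bug: is_valid_url and make_valid were previously defined inside the class without self, so calling them as instance methods would raise a TypeError (the instance itself was consumed as the first positional argument). Both are now proper methods with the self.url fallback, although the `url = url or self.url` line added inside make_valid's nested is_valid_url sits above that function's docstring, which is harmless but unconventional. Expected behavior, given the urlparse check shown in the hunk:

```python
from abstract_webtools.managers.urlManager import urlManager

mgr = urlManager(url="https://example.com")
mgr.is_valid_url()               # True: self.url has both scheme and netloc
mgr.is_valid_url("not a url")    # False: urlparse finds neither part
mgr.is_valid_url("example.com")  # False: no scheme, so netloc is empty
```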
```diff
@@ -171,24 +173,27 @@ class urlManager:
         if is_valid_url(new_link):
             return new_link
         return False
-
-    def get_relative_href(url,href):
+
+    def get_relative_href(self,url,href):
         # join the URL if it's relative (not an absolute link)
+        url = url or self.url
         href = urljoin(url, href)
         parsed_href = urlparse(href)
         # remove URL GET parameters, URL fragments, etc.
         href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
         return href
-    def url_basename(url):
+    def url_basename(self,url=None):
+        url = url or self.url
         path = urllib.parse.urlparse(url).path
         return path.strip('/').split('/')[-1]
 
 
-    def base_url(url):
+    def base_url(self,url=None):
+        url = url or self.url
         return re.match(r'https?://[^?#]+/', url).group()
 
 
-    def urljoin(base, path):
+    def urljoin(self,base, path):
         if isinstance(path, bytes):
             path = path.decode()
         if not isinstance(path, str) or not path:
```
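The remaining helpers (get_relative_href, url_basename, base_url, and the class's own urljoin) get the same conversion to instance methods. Two wrinkles: get_relative_href keeps url as a required positional parameter even though it now also falls back to self.url, and the bare urljoin(url, href) call inside it still resolves to the module-level import rather than the new method. A short sketch of the converted helpers, assuming standard urllib.parse semantics:

```python
from abstract_webtools.managers.urlManager import urlManager

mgr = urlManager(url="https://example.com/docs/")

# Relative hrefs are joined against the page URL, then stripped of
# query string and fragment:
mgr.get_relative_href("https://example.com/docs/", "../img/logo.png?v=2")
# -> 'https://example.com/img/logo.png'

mgr.url_basename("https://example.com/a/b/c.html")  # -> 'c.html'
mgr.base_url("https://example.com/a/b?q=1")         # -> 'https://example.com/a/'
```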
{abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.6.15
+Version: 0.1.6.17
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
```
{abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/RECORD

```diff
@@ -7,7 +7,7 @@ abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE7
 abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f5GoZmw0,3454
 abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
 abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
-abstract_webtools/managers/crawlManager.py,sha256=
+abstract_webtools/managers/crawlManager.py,sha256=RVRWiceEKuLSPIWtPYGGz85zRxamwOOgsMLIhJBU14Q,7908
 abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
 abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
 abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
```
```diff
@@ -18,11 +18,11 @@ abstract_webtools/managers/seleniumManager.py,sha256=qSY8gH3N5YJIMwE_Alj9HNQRip_
 abstract_webtools/managers/soupManager.py,sha256=7nDB_QKneGjyTZUzchfbdHNvxxYiTyIn8AHon8ObTSY,17148
 abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
 abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
-abstract_webtools/managers/urlManager.py,sha256=
+abstract_webtools/managers/urlManager.py,sha256=Dvf-TiSo5j_YjZS2Eq6lFfbhveneD6NA_wEE0xUXy_E,8858
 abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
 abstract_webtools/managers/videoDownloader.py,sha256=6G_aLc05BTMUYUWc7iqYtHF_BaR7DnCNK_NJ-QnjsYY,10531
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools-0.1.6.17.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
+abstract_webtools-0.1.6.17.dist-info/METADATA,sha256=L9KnzarSqKPuWx_jpndNebg2haRXUAc1XAwzVj09RyI,15858
+abstract_webtools-0.1.6.17.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+abstract_webtools-0.1.6.17.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.17.dist-info/RECORD,,
```
{abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/LICENSE

File without changes

{abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/WHEEL

File without changes

{abstract_webtools-0.1.6.15.dist-info → abstract_webtools-0.1.6.17.dist-info}/top_level.txt

File without changes