abstract-webtools 0.1.5.99__py3-none-any.whl → 0.1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/managers/crawlManager.py +20 -3
- {abstract_webtools-0.1.5.99.dist-info → abstract_webtools-0.1.6.0.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.5.99.dist-info → abstract_webtools-0.1.6.0.dist-info}/RECORD +6 -6
- {abstract_webtools-0.1.5.99.dist-info → abstract_webtools-0.1.6.0.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.5.99.dist-info → abstract_webtools-0.1.6.0.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.5.99.dist-info → abstract_webtools-0.1.6.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,21 @@
|
|
1
1
|
from .soupManager import *
|
2
|
+
|
3
|
+
def normalize_url(url, base_url):
|
4
|
+
"""
|
5
|
+
Normalize and resolve relative URLs, ensuring proper domain and format.
|
6
|
+
"""
|
7
|
+
# If URL starts with the base URL repeated, remove the extra part
|
8
|
+
if url.startswith(base_url):
|
9
|
+
url = url[len(base_url):]
|
10
|
+
|
11
|
+
# Resolve the URL against the base URL
|
12
|
+
normalized_url = urljoin(base_url, url.split('#')[0])
|
13
|
+
|
14
|
+
# Ensure only URLs belonging to the base domain are kept
|
15
|
+
if not normalized_url.startswith(base_url):
|
16
|
+
return None
|
17
|
+
|
18
|
+
return normalized_url
|
2
19
|
class CrawlManager():
|
3
20
|
def __init__(self,url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
|
4
21
|
self.url=url
|
@@ -56,7 +73,7 @@ class CrawlManager():
|
|
56
73
|
"""
|
57
74
|
all_urls=[self.url_mgr.url]
|
58
75
|
domain = self.url_mgr.domain
|
59
|
-
all_attribs =
|
76
|
+
all_attribs = get_attribs(self.url_mgr.url)
|
60
77
|
for href in all_attribs.get('href',[]):
|
61
78
|
if href == "" or href is None:
|
62
79
|
# href empty tag
|
@@ -146,7 +163,7 @@ class CrawlManager():
|
|
146
163
|
# Fetch the title if available
|
147
164
|
meta_tags = soup_mgr.find_all("meta")
|
148
165
|
url = eatAll(str(url),['',' ','\n','\t','\\','/'])
|
149
|
-
attribs =
|
166
|
+
attribs = get_attribs(url)
|
150
167
|
soup = get_soup(url)
|
151
168
|
|
152
169
|
for meta_tag in meta_tags:
|
@@ -177,7 +194,7 @@ class CrawlManager():
|
|
177
194
|
string += f' <url>\n <loc>{url}</loc>\n'
|
178
195
|
preprocess=[]
|
179
196
|
self.get_new_source_and_url(url=url)
|
180
|
-
links =
|
197
|
+
links = get_attribs(url)
|
181
198
|
images = [link for link in links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
|
182
199
|
|
183
200
|
for img in images:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.5.99
|
3
|
+
Version: 0.1.6.0
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -7,7 +7,7 @@ abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE7
|
|
7
7
|
abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f5GoZmw0,3454
|
8
8
|
abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
|
9
9
|
abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
|
10
|
-
abstract_webtools/managers/crawlManager.py,sha256=
|
10
|
+
abstract_webtools/managers/crawlManager.py,sha256=6i7mydDrJoFKLrw1hOLXAW782MSuj57IGaTS3Pwcy1k,11127
|
11
11
|
abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
|
12
12
|
abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
|
13
13
|
abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
|
@@ -21,8 +21,8 @@ abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_
|
|
21
21
|
abstract_webtools/managers/urlManager.py,sha256=XqMrCM84BeWEfWtHc_8UFpT91ZtG-okzdKdCuC49vsA,8678
|
22
22
|
abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
|
23
23
|
abstract_webtools/managers/videoDownloader.py,sha256=6G_aLc05BTMUYUWc7iqYtHF_BaR7DnCNK_NJ-QnjsYY,10531
|
24
|
-
abstract_webtools-0.1.
|
25
|
-
abstract_webtools-0.1.
|
26
|
-
abstract_webtools-0.1.
|
27
|
-
abstract_webtools-0.1.
|
28
|
-
abstract_webtools-0.1.5.99.dist-info/RECORD,,
|
24
|
+
abstract_webtools-0.1.6.0.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
|
25
|
+
abstract_webtools-0.1.6.0.dist-info/METADATA,sha256=V7pW8-JWmyVtLlcEGXCc3jT4tcGas2vkkls4-NUFx6w,15857
|
26
|
+
abstract_webtools-0.1.6.0.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
27
|
+
abstract_webtools-0.1.6.0.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
28
|
+
abstract_webtools-0.1.6.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|