abstract-webtools 0.1.5.99__py3-none-any.whl → 0.1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,22 @@
1
1
  from .soupManager import *
2
- class CrawlManager():
2
+
3
+ def normalize_url(url, base_url):
4
+ """
5
+ Normalize and resolve relative URLs, ensuring proper domain and format.
6
+ """
7
+ # If URL starts with the base URL repeated, remove the extra part
8
+ if url.startswith(base_url):
9
+ url = url[len(base_url):]
10
+
11
+ # Resolve the URL against the base URL
12
+ normalized_url = urljoin(base_url, url.split('#')[0])
13
+
14
+ # Ensure only URLs belonging to the base domain are kept
15
+ if not normalized_url.startswith(base_url):
16
+ return None
17
+
18
+ return normalized_url
19
+ class crawlManager():
3
20
  def __init__(self,url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
4
21
  self.url=url
5
22
  self.source_code=source_code
@@ -56,7 +73,7 @@ class CrawlManager():
56
73
  """
57
74
  all_urls=[self.url_mgr.url]
58
75
  domain = self.url_mgr.domain
59
- all_attribs = get_all_attribute_values(url=self.url_mgr.url,url_mgr=self.url_mgr)
76
+ all_attribs = get_attribs(self.url_mgr.url)
60
77
  for href in all_attribs.get('href',[]):
61
78
  if href == "" or href is None:
62
79
  # href empty tag
@@ -146,7 +163,7 @@ class CrawlManager():
146
163
  # Fetch the title if available
147
164
  meta_tags = soup_mgr.find_all("meta")
148
165
  url = eatAll(str(url),['',' ','\n','\t','\\','/'])
149
- attribs = get_all_attribute_values(url)
166
+ attribs = get_attribs(url)
150
167
  soup = get_soup(url)
151
168
 
152
169
  for meta_tag in meta_tags:
@@ -177,7 +194,7 @@ class CrawlManager():
177
194
  string += f' <url>\n <loc>{url}</loc>\n'
178
195
  preprocess=[]
179
196
  self.get_new_source_and_url(url=url)
180
- links = get_all_attribute_values(url)
197
+ links = get_attribs(url)
181
198
  images = [link for link in links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
182
199
 
183
200
  for img in images:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: abstract_webtools
3
- Version: 0.1.5.99
3
+ Version: 0.1.6.1
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -7,7 +7,7 @@ abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE7
7
7
  abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f5GoZmw0,3454
8
8
  abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
9
9
  abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
10
- abstract_webtools/managers/crawlManager.py,sha256=m5vuZsB7EndZ5DZQ7gXS_B4HsRfIURMrye42knp35BM,10668
10
+ abstract_webtools/managers/crawlManager.py,sha256=E9AT_01cY-3wAudC7sPED1qRKEFk-7WQPmPg5lD9hCA,11127
11
11
  abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
12
12
  abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
13
13
  abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
@@ -21,8 +21,8 @@ abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_
21
21
  abstract_webtools/managers/urlManager.py,sha256=XqMrCM84BeWEfWtHc_8UFpT91ZtG-okzdKdCuC49vsA,8678
22
22
  abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
23
23
  abstract_webtools/managers/videoDownloader.py,sha256=6G_aLc05BTMUYUWc7iqYtHF_BaR7DnCNK_NJ-QnjsYY,10531
24
- abstract_webtools-0.1.5.99.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
25
- abstract_webtools-0.1.5.99.dist-info/METADATA,sha256=D53IoHFui8dJQ6rBjjye2-Ujte-GcVlREwlh6-DMf6E,15858
26
- abstract_webtools-0.1.5.99.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
27
- abstract_webtools-0.1.5.99.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
28
- abstract_webtools-0.1.5.99.dist-info/RECORD,,
24
+ abstract_webtools-0.1.6.1.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
25
+ abstract_webtools-0.1.6.1.dist-info/METADATA,sha256=joH9ZQdN-2xBPb2Pcg5x4q6UwZUnJkN0597155uOfQA,15857
26
+ abstract_webtools-0.1.6.1.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
27
+ abstract_webtools-0.1.6.1.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
28
+ abstract_webtools-0.1.6.1.dist-info/RECORD,,