abstract-webtools 0.1.6.3__py3-none-any.whl → 0.1.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,21 +1,6 @@
1
1
  from .soupManager import *
2
2
 
3
def normalize_url(url, base_url):
    """
    Resolve ``url`` against ``base_url`` and keep it only if it stays on that site.

    Args:
        url: Absolute or relative link. Any ``#fragment`` is dropped.
        base_url: Site root the link is resolved against.

    Returns:
        The resolved URL without its fragment, or ``None`` when ``url`` is
        ``None`` or the resolved URL does not belong to ``base_url``.
    """
    # Local import keeps the block self-contained; the surrounding module
    # only shows a star import, so the available names cannot be confirmed.
    from urllib.parse import urljoin, urlparse

    if url is None:
        return None

    # Only strip the base when it is genuinely duplicated ("<base><base>/x").
    # Stripping any single prefix match would turn a look-alike host such as
    # "https://example.com.evil.com/a" into a bogus path on the base site.
    while base_url and url.startswith(base_url * 2):
        url = url[len(base_url):]

    # Resolve the URL against the base URL, ignoring the fragment.
    normalized_url = urljoin(base_url, url.split('#')[0])

    # Keep the original prefix restriction ...
    if not normalized_url.startswith(base_url):
        return None

    # ... and additionally require the same host, because a plain string
    # prefix also matches hosts that merely begin with the base host.
    base_host = urlparse(base_url).netloc
    if base_host and urlparse(normalized_url).netloc != base_host:
        return None

    return normalized_url
19
4
  class crawlManager():
20
5
  def __init__(self,url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
21
6
  self.url=url
@@ -46,6 +46,7 @@ class seleniumManager(metaclass=SingletonMeta):
46
46
  parsed_url = urlparse(url)
47
47
  self.domain = parsed_url.netloc
48
48
  self.scheme = parsed_url.scheme
49
+ self.base_url= f"{self.scheme}{self.domain}"
49
50
  self.site_dir = os.path.join(os.getcwd(), self.domain)
50
51
  os.makedirs(self.site_dir, exist_ok=True)
51
52
  self.drivers = {}
@@ -83,7 +84,24 @@ class seleniumManager(metaclass=SingletonMeta):
83
84
  self.drivers[url] = driver
84
85
  driver.get(url)
85
86
  return self.drivers[url]
87
def normalize_url(url, base_url=None):
    """
    Normalize and resolve relative URLs, ensuring proper domain and format.

    Args:
        url: Absolute or relative link. Any ``#fragment`` is dropped.
        base_url: Site root to resolve against. When omitted, the root is
            taken from the ``seleniumManager`` instance for ``url``.

    Returns:
        The resolved URL without its fragment, or ``None`` when ``url`` is
        ``None`` or the resolved URL does not belong to the base site.
    """
    from urllib.parse import urljoin, urlparse

    if url is None:
        return None

    if base_url is None:
        # Only consult the manager when the caller gave no base: constructing
        # it has side effects (it creates a per-site directory).
        manager = seleniumManager(url)
        if manager.scheme and manager.domain:
            # manager.base_url is assembled without the "://" separator, so
            # rebuild the root from its parts instead of trusting it.
            base_url = f"{manager.scheme}://{manager.domain}"
        else:
            base_url = manager.base_url

    # Only strip the base when it is genuinely duplicated ("<base><base>/x").
    # Stripping any single prefix match would mangle look-alike hosts such
    # as "https://example.com.evil.com/a" into a path on the base site.
    while base_url and url.startswith(base_url * 2):
        url = url[len(base_url):]

    # Resolve the URL against the base URL, ignoring the fragment.
    normalized_url = urljoin(base_url, url.split('#')[0])

    # Ensure only URLs belonging to the base site are kept: same prefix and,
    # because a string prefix also matches longer host names, the same host.
    if not normalized_url.startswith(base_url):
        return None
    base_host = urlparse(base_url).netloc
    if base_host and urlparse(normalized_url).netloc != base_host:
        return None

    return normalized_url
87
105
  # Function to get Selenium page source
88
106
  def get_selenium_source(url):
89
107
  url_mgr = urlManager(url)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.3
3
+ Version: 0.1.6.4
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -7,22 +7,22 @@ abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE7
7
7
  abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f5GoZmw0,3454
8
8
  abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
9
9
  abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
10
- abstract_webtools/managers/crawlManager.py,sha256=NBHyyoKOcy77-0yJGfwaeTD6UZhEJx-k47qIvnLdG-s,11166
10
+ abstract_webtools/managers/crawlManager.py,sha256=vpNFw7ws-ce94XZc3yzvFOcAIknGX_LwAB5nTctxIQs,10645
11
11
  abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
12
12
  abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
13
13
  abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
14
14
  abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
15
15
  abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
16
16
  abstract_webtools/managers/requestManager.py,sha256=YksYgRivMMuZNOzyL5vaoXv9MLhgYeuLxO_UJiqPGWw,17312
17
- abstract_webtools/managers/seleniumManager.py,sha256=CtQQYtDrFfgp8ujC6i5SCe0b_hgIA1K68io0aO4igoM,3623
17
+ abstract_webtools/managers/seleniumManager.py,sha256=qSY8gH3N5YJIMwE_Alj9HNQRip_PziIo4_T9AZE_FQo,4273
18
18
  abstract_webtools/managers/soupManager.py,sha256=7nSaq7OHaimc8602BihAXCA2ra0dbsB26d4yJzsRARc,16548
19
19
  abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
20
20
  abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
21
21
  abstract_webtools/managers/urlManager.py,sha256=XqMrCM84BeWEfWtHc_8UFpT91ZtG-okzdKdCuC49vsA,8678
22
22
  abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
23
23
  abstract_webtools/managers/videoDownloader.py,sha256=6G_aLc05BTMUYUWc7iqYtHF_BaR7DnCNK_NJ-QnjsYY,10531
24
- abstract_webtools-0.1.6.3.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
25
- abstract_webtools-0.1.6.3.dist-info/METADATA,sha256=i8w_enqqacZOgS8uS-l-Vyz5MhnzgJ68ZIrg3M9x0Xg,15857
26
- abstract_webtools-0.1.6.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
27
- abstract_webtools-0.1.6.3.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
28
- abstract_webtools-0.1.6.3.dist-info/RECORD,,
24
+ abstract_webtools-0.1.6.4.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
25
+ abstract_webtools-0.1.6.4.dist-info/METADATA,sha256=JdSW5nzlcxthd2T54jsAaXQ8LAcGpZYIg0Bz2J0ct_k,15857
26
+ abstract_webtools-0.1.6.4.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
27
+ abstract_webtools-0.1.6.4.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
28
+ abstract_webtools-0.1.6.4.dist-info/RECORD,,