abstract-webtools 0.1.6.60__py3-none-any.whl → 0.1.6.61__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- abstract_webtools/abstract_usurpit.py +15 -15
- {abstract_webtools-0.1.6.60.dist-info → abstract_webtools-0.1.6.61.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.60.dist-info → abstract_webtools-0.1.6.61.dist-info}/RECORD +5 -5
- {abstract_webtools-0.1.6.60.dist-info → abstract_webtools-0.1.6.61.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.60.dist-info → abstract_webtools-0.1.6.61.dist-info}/top_level.txt +0 -0
@@ -53,12 +53,14 @@ def save_page(url, content,output_dir):
|
|
53
53
|
with open(page_full_path, 'w', encoding='utf-8') as f:
|
54
54
|
f.write(content)
|
55
55
|
print(f"Saved page: {page_full_path}")
|
56
|
-
def save_asset(asset_url, base_url,output_dir,downloaded_assets,session):
|
56
|
+
def save_asset(asset_url, base_url,output_dir,downloaded_assets=None,session=None):
|
57
57
|
"""
|
58
58
|
Download and save assets like images, CSS, JS files.
|
59
59
|
"""
|
60
|
+
session=requests.Session()
|
61
|
+
downloaded_assets = downloaded_assets or set()
|
60
62
|
asset_url = normalize_url(asset_url, base_url)
|
61
|
-
if asset_url in downloaded_assets:
|
63
|
+
if asset_url in list(downloaded_assets):
|
62
64
|
return
|
63
65
|
downloaded_assets.add(asset_url)
|
64
66
|
|
@@ -102,18 +104,18 @@ class usurpManager():
|
|
102
104
|
"Access-Control-Allow-Origin": "*"})
|
103
105
|
|
104
106
|
def process_page(self,url, depth, base_domain):
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
107
|
+
"""
|
108
|
+
Process a single page: download assets, save HTML, and crawl links.
|
109
|
+
"""
|
110
|
+
print(url)
|
111
|
+
if url in self.visited_pages or depth > self.MAX_DEPTH:
|
112
|
+
return
|
113
|
+
self.visited_pages.add(url)
|
114
|
+
|
115
|
+
|
114
116
|
# Fetch the page content
|
115
117
|
response = self.session.get(url)
|
116
|
-
response.raise_for_status()
|
118
|
+
#response.raise_for_status()
|
117
119
|
content = response.text
|
118
120
|
|
119
121
|
# Use your get_soup_mgr function to get the soup and attributes
|
@@ -150,8 +152,6 @@ class usurpManager():
|
|
150
152
|
time.sleep(self.WAIT_BETWEEN_REQUESTS)
|
151
153
|
self.process_page(normalized_link, depth + 1, base_domain)
|
152
154
|
|
153
|
-
except Exception as e:
|
154
|
-
print(f"Failed to process page {url}: {e}")
|
155
155
|
|
156
156
|
def main(self):
|
157
157
|
# Ensure output directory exists
|
@@ -163,7 +163,7 @@ class usurpManager():
|
|
163
163
|
self.process_page(self.BASE_URL, 0, base_domain)
|
164
164
|
print("Website copying completed.")
|
165
165
|
def test_download(url=None,directory=None):
|
166
|
-
url=url or 'https://
|
166
|
+
url=url or 'https://www.youtube.com/watch?v=jRGrNDV2mKc&list=RDMMjRGrNDV2mKc&start_radio=1'
|
167
167
|
output_dir= directory or os.path.join(os.getcwd(),'testit')
|
168
168
|
os.makedirs(output_dir,exist_ok=True)
|
169
169
|
site_mgr = usurpManager(url,output_dir)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.6.60
|
3
|
+
Version: 0.1.6.61
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -1,6 +1,6 @@
|
|
1
1
|
abstract_webtools/__init__.py,sha256=zNMp-9f0Q6BXWxR-tgHrEqKP8GeXw9z7VYzbqIeEydo,132
|
2
2
|
abstract_webtools/abstract_userpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-P2w1u6ng,6462
|
3
|
-
abstract_webtools/abstract_usurpit.py,sha256=
|
3
|
+
abstract_webtools/abstract_usurpit.py,sha256=7PDUb5LNETjvU1rhfJaToKLIKmSXRkcJAmM4wOX7PsQ,7170
|
4
4
|
abstract_webtools/abstract_webtools.py,sha256=3NzGmJlZvrdVtEcUi2K5iUgWr1822IBPhIN9us2e2t0,3859
|
5
5
|
abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
|
6
6
|
abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
|
@@ -37,7 +37,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
|
|
37
37
|
abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
|
38
38
|
abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
|
39
39
|
abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
|
40
|
-
abstract_webtools-0.1.6.
|
41
|
-
abstract_webtools-0.1.6.60.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
|
42
|
-
abstract_webtools-0.1.6.60.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
43
|
-
abstract_webtools-0.1.6.60.dist-info/RECORD,,
|
40
|
+
abstract_webtools-0.1.6.61.dist-info/METADATA,sha256=QENrp5W8V0PoOXOrU0YEi093XXyaoQRmBMJMXmVko2k,16029
|
41
|
+
abstract_webtools-0.1.6.61.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
|
42
|
+
abstract_webtools-0.1.6.61.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
43
|
+
abstract_webtools-0.1.6.61.dist-info/RECORD,,
|
File without changes
|
File without changes
|