abstract-webtools 0.1.6.119__py3-none-any.whl → 0.1.6.121__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/abstract_usurpit.py +72 -43
- {abstract_webtools-0.1.6.119.dist-info → abstract_webtools-0.1.6.121.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.119.dist-info → abstract_webtools-0.1.6.121.dist-info}/RECORD +5 -5
- {abstract_webtools-0.1.6.119.dist-info → abstract_webtools-0.1.6.121.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.119.dist-info → abstract_webtools-0.1.6.121.dist-info}/top_level.txt +0 -0
abstract_webtools/abstract_usurpit.py
CHANGED

@@ -53,7 +53,7 @@ def is_valid_url(url, base_domain):
     """
     parsed = urlparse(url)
     return parsed.scheme in ('http', 'https') and parsed.netloc == base_domain
-def save_page(url, content,output_dir):
+def get_save_page_path(url, output_dir):
     """
     Save HTML page to local directory.
     """
@@ -66,12 +66,22 @@ def save_page(url, content,output_dir):
         page_path += '.html'

     page_full_path = os.path.join(output_dir, page_path)
-
-
-
-
-
-
+    return page_full_path
+def save_page(url, content,output_dir):
+    page_full_path = get_save_page_path(url=url,
+                                        output_dir=output_dir)
+    if page_full_path:
+        dirname = os.path.dirname(page_full_path)
+        os.makedirs(dirname, exist_ok=True)
+
+        with open(page_full_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        print(f"Saved page: {page_full_path}")
+def get_asset_path(asset_url,
+                   base_url,
+                   output_dir,
+                   downloaded_assets=None,
+                   session=None):
     """
     Download and save assets like images, CSS, JS files.
     """
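
The hunk above splits save_page in two: get_save_page_path is now a pure path computation, and save_page handles directory creation and the write. A minimal usage sketch, assuming both helpers are importable from abstract_webtools.abstract_usurpit (the URL and output directory are invented for illustration):

    # Sketch only: names come from the diff; arguments are illustrative.
    from abstract_webtools.abstract_usurpit import get_save_page_path, save_page

    url = "https://example.com/docs/intro"  # illustrative page URL
    output_dir = "./mirror"                 # illustrative mirror root

    # Pure path computation -- nothing touches the filesystem yet.
    target = get_save_page_path(url, output_dir)

    # save_page recomputes the path, makes parent dirs, and writes the HTML.
    save_page(url, "<html><body>stub</body></html>", output_dir)

Separating the two lets callers check for an existing file before fetching, which the usurpManager hunk further down relies on.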
@@ -88,17 +98,29 @@ def save_asset(asset_url, base_url,output_dir,downloaded_assets=None,session=None):
         return  # Skip if asset path is empty

     asset_full_path = os.path.join(output_dir, asset_path)
-
-
-
-
-
-
-
-
-
-
-
+    return asset_full_path
+def save_asset(asset_url,
+               base_url,
+               output_dir,
+               downloaded_assets=None,
+               session=None):
+    asset_full_path = get_asset_path(asset_url=asset_url,
+                                     base_url=base_url,
+                                     output_dir=output_dir,
+                                     downloaded_assets=downloaded_assets,
+                                     session=session)
+    if asset_full_path:
+        os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
+
+        try:
+            response = session.get(asset_url, stream=True)
+            response.raise_for_status()
+            with open(asset_full_path, 'wb') as f:
+                shutil.copyfileobj(response.raw, f)
+            print(f"Saved asset: {asset_full_path}")
+        except Exception as e:
+            print(f"Failed to save asset {asset_url}: {e}")
+    return downloaded_assets
 class usurpManager():
     def __init__(self,url,output_dir=None,max_depth=None,wait_between_requests=None,operating_system=None, browser=None, version=None,user_agent=None,website_bot=None):
         self.url = url
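
save_asset is refactored the same way: get_asset_path resolves the local target (carrying the downloaded_assets bookkeeping through), while save_asset streams the response body to disk. A hedged call-pattern sketch; the session setup and asset URL are invented, not taken from the package:

    # Sketch only: exercises the refactored helper with made-up arguments.
    import requests
    from abstract_webtools.abstract_usurpit import save_asset

    session = requests.Session()
    downloaded_assets = save_asset(
        asset_url="https://example.com/static/app.js",  # illustrative asset
        base_url="https://example.com",
        output_dir="./mirror",
        downloaded_assets=None,  # the diff accepts and returns this value
        session=session,
    )

Using stream=True together with shutil.copyfileobj(response.raw, f) copies the body in chunks rather than holding the whole asset in memory.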
@@ -135,34 +157,41 @@ class usurpManager():
         response = self.session.get(url)
         #response.raise_for_status()
         content = response.text
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-if
-
-
-
-
-
-
-
-
+        page_full_path = get_save_page_path(url=url,
+                                            output_dir=self.OUTPUT_DIR)
+        if not os.path.exists(page_full_path):
+            # Use your get_soup_mgr function to get the soup and attributes
+            soup_mgr = get_soup_mgr(url=url)
+            soup = soup_mgr.soup
+            all_attributes = soup_mgr.get_all_attribute_values()
+            # Now you can use all_attributes as needed
+            get_asset_path(asset_url=full_asset_url,
+                           base_url=self.url,
+                           output_dir=self.OUTPUT_DIR,
+                           downloaded_assets=self.downloaded_assets,
+                           session=self.session)
+            # Update asset links to local paths
+            for tag in soup.find_all(['img', 'script', 'link']):
+                attr = 'src' if tag.name != 'link' else 'href'
+                asset_url = tag.get(attr)
+                if asset_url:
+                    full_asset_url = normalize_url(asset_url, url)
+                    parsed_asset_url = urlparse(full_asset_url)
+
+                    if is_valid_url(full_asset_url, base_domain):
+                        self.downloaded_assets = save_asset(full_asset_url, self.url,self.OUTPUT_DIR,self.downloaded_assets,self.session)
+                        # Update tag to point to the local asset
+                        local_asset_path = '/' + parsed_asset_url.path.lstrip('/')
+                        tag[attr] = local_asset_path
+
+            # Save the modified page
+            save_page(url, str(soup),self.OUTPUT_DIR)
+        else:
+            print(f"skippinng {page_full_path} because it already exists")
         # Use your linkManager to find all domain links
         link_mgr = linkManager(url=url)
         all_domains = link_mgr.find_all_domain()
-
+
         # Process each domain link
         for link_url in make_list(all_domains):
             normalized_link = normalize_url(link_url, url)
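
The usurpManager change gates the whole fetch-and-rewrite pass on os.path.exists, so re-runs skip pages already on disk, then rewrites img/script/link URLs to root-relative local paths before saving. A self-contained sketch of that rewrite loop, using stock BeautifulSoup and urljoin in place of the package's soup manager and normalize_url (an approximation, not the package's exact code):

    # Sketch of the link-rewrite step, approximated with stock libraries.
    from urllib.parse import urljoin, urlparse
    from bs4 import BeautifulSoup

    def rewrite_asset_links(html: str, page_url: str) -> str:
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(["img", "script", "link"]):
            attr = "src" if tag.name != "link" else "href"  # link tags use href
            asset_url = tag.get(attr)
            if asset_url:
                full_asset_url = urljoin(page_url, asset_url)
                # Point the tag at the asset's root-relative local path.
                tag[attr] = "/" + urlparse(full_asset_url).path.lstrip("/")
        return str(soup)

Note that, as released, the bare get_asset_path(...) call above the tag loop references full_asset_url before any assignment; the per-tag save_asset call inside the loop is what actually downloads assets.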
{abstract_webtools-0.1.6.119.dist-info → abstract_webtools-0.1.6.121.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.119
+Version: 0.1.6.121
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.119.dist-info → abstract_webtools-0.1.6.121.dist-info}/RECORD
RENAMED

@@ -1,6 +1,6 @@
 abstract_webtools/__init__.py,sha256=zNMp-9f0Q6BXWxR-tgHrEqKP8GeXw9z7VYzbqIeEydo,132
 abstract_webtools/abstract_userpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-P2w1u6ng,6462
-abstract_webtools/abstract_usurpit.py,sha256=
+abstract_webtools/abstract_usurpit.py,sha256=IfCERQn6uvp5KuzAuxQY55qaLaZ9DN9gZvUGsUQRjss,9409
 abstract_webtools/abstract_webtools.py,sha256=3NzGmJlZvrdVtEcUi2K5iUgWr1822IBPhIN9us2e2t0,3859
 abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
 abstract_webtools/domain_identifier.py,sha256=AvWlGD7C19rySa_J_Brxi3kz43LMWvGsshuuZNg7MvI,3320
@@ -42,7 +42,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
 abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
 abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
 abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools-0.1.6.121.dist-info/METADATA,sha256=qgke70iRJYCmxi1Rqe_DZ9-CaTFSy3paHMPr1miR3ls,7289
+abstract_webtools-0.1.6.121.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+abstract_webtools-0.1.6.121.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.121.dist-info/RECORD,,
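
For reference, each RECORD row has the form path,sha256=<urlsafe-base64 digest without padding>,<size in bytes>, which is why only abstract_usurpit.py and the dist-info entries change between these versions. A short sketch of recomputing an entry for verification (the file path is illustrative):

    # Recompute a wheel RECORD entry in the PEP 376 / wheel-spec style.
    import base64, hashlib

    def record_entry(path: str) -> str:
        with open(path, "rb") as f:
            data = f.read()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return f"{path},sha256={digest.decode()},{len(data)}"

The RECORD file itself is listed with empty hash and size fields (the trailing ,,) because it cannot contain its own digest.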
{abstract_webtools-0.1.6.119.dist-info → abstract_webtools-0.1.6.121.dist-info}/WHEEL
RENAMED
File without changes

{abstract_webtools-0.1.6.119.dist-info → abstract_webtools-0.1.6.121.dist-info}/top_level.txt
RENAMED
File without changes