abstract-webtools 0.1.6.121-py3-none-any.whl → 0.1.6.122-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/__pycache__/abstract_webtools.cpython-312.pyc +0 -0
- abstract_webtools/abstract_usurpit.py +91 -63
- abstract_webtools/url_grabber_new.py +95 -45
- {abstract_webtools-0.1.6.121.dist-info → abstract_webtools-0.1.6.122.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.121.dist-info → abstract_webtools-0.1.6.122.dist-info}/RECORD +7 -6
- {abstract_webtools-0.1.6.121.dist-info → abstract_webtools-0.1.6.122.dist-info}/WHEEL +1 -1
- {abstract_webtools-0.1.6.121.dist-info → abstract_webtools-0.1.6.122.dist-info}/top_level.txt +0 -0
abstract_webtools/__pycache__/abstract_webtools.cpython-312.pyc
Binary file
abstract_webtools/abstract_usurpit.py

@@ -6,7 +6,20 @@ import shutil
 import time
 from abstract_webtools import *
 from abstract_utilities import *
-
+def get_abs_path():
+    return os.path.abspath(__file__)
+
+def get_abs_dir():
+    abs_path = get_abs_path()
+    return os.path.dirname(abs_path)
+def join_abs_path(path):
+    abs_dir = get_abs_dir()
+    return os.path.join(abs_dir,path)
+def get_rel_dir():
+    return os.getcwd()
+def join_rel_path(path):
+    rel_dir = get_rel_dir()
+    return os.path.join(rel_dir,path)
 # Import your custom classes/functions
 # from your_module import linkManager, get_soup_mgr
 def make_directory(directory=None,path=None):
@@ -16,6 +29,29 @@ def make_directory(directory=None,path=None):
     directory = os.path.join(base_dir,path)
     os.makedirs(directory,exist_ok=True)
     return directory
+def get_paths(*paths):
+    all_paths = []
+    for path in paths:
+        all_paths+=path.split('/')
+    return all_paths
+def makeAllDirs(*paths):
+    full_path= ''
+    paths = get_paths(*paths)
+    for i,path in enumerate(paths):
+        if i == 0:
+            full_path = path
+            if not full_path.startswith('/'):
+                full_path = join_rel_path(full_path)
+        else:
+            full_path = os.path.join(full_path,path)
+        os.makedirs(full_path,exist_ok=True)
+    return full_path
+def currate_full_path(full_path):
+    dirname = os.path.dirname(full_path)
+    basename = os.path.basename(full_path)
+    full_dirname = makeAllDirs(dirname)
+    full_path = os.path.join(full_dirname,basename)
+    return full_path
 def get_domain_name_from_url(url):
     parsed_url = urlparse(url)
     netloc = parsed_url.netloc
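
A minimal usage sketch of the added helpers (illustrative only, not part of the package diff; the import location is assumed and the path is hypothetical). makeAllDirs() creates each path segment in turn, rooting relative paths at os.getcwd(), and currate_full_path() ensures a file's parent directories exist before the file is written:

    from abstract_webtools.abstract_usurpit import currate_full_path  # assumed import path

    # Creates ./usurped/example.com/assets/ under the current directory
    # and returns that directory joined back with the basename.
    full_path = currate_full_path('usurped/example.com/assets/site.css')
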
@@ -70,9 +106,10 @@ def get_save_page_path(url, output_dir):
 def save_page(url, content,output_dir):
     page_full_path = get_save_page_path(url=url,
                                         output_dir=output_dir)
+    page_full_path = currate_full_path(page_full_path)
     if page_full_path:
         dirname = os.path.dirname(page_full_path)
-
+
 
         with open(page_full_path, 'w', encoding='utf-8') as f:
             f.write(content)
@@ -144,60 +181,56 @@ class usurpManager():
         "Access-Control-Allow-Origin": "*"})
 
     def process_page(self,url, depth, base_domain):
-        [previous process_page body (old lines 147-196) not rendered in this diff]
-            normalized_link = normalize_url(link_url, url)
-            if is_valid_url(normalized_link, base_domain):
-                time.sleep(self.WAIT_BETWEEN_REQUESTS)
-                self.process_page(normalized_link, depth + 1, base_domain)
+        """
+        Process a single page: download assets, save HTML, and crawl links.
+        """
+        print(url)
+        if url in self.visited_pages or depth > self.MAX_DEPTH:
+            return
+        self.visited_pages.add(url)
+
+
+        # Fetch the page content
+        response = self.session.get(url)
+        #response.raise_for_status()
+        content = response.text
+
+        # Use your get_soup_mgr function to get the soup and attributes
+        soup_mgr = get_soup_mgr(url=url)
+        soup = soup_mgr.soup
+        all_attributes = soup_mgr.get_all_attribute_values()
+        # Now you can use all_attributes as needed
+
+        # Update asset links to local paths
+        for tag in soup.find_all(['img', 'script', 'link']):
+            attr = 'src' if tag.name != 'link' else 'href'
+            asset_url = tag.get(attr)
+            if asset_url:
+                full_asset_url = normalize_url(asset_url, url)
+                parsed_asset_url = urlparse(full_asset_url)
+
+                if is_valid_url(full_asset_url, base_domain):
+                    self.downloaded_assets = save_asset(full_asset_url,
+                                                        self.url,
+                                                        self.OUTPUT_DIR,
+                                                        self.downloaded_assets,
+                                                        self.session)
+                    # Update tag to point to the local asset
+                    local_asset_path = '/' + parsed_asset_url.path.lstrip('/')
+                    tag[attr] = local_asset_path
+
+        # Save the modified page
+        save_page(url, str(soup),self.OUTPUT_DIR)
+        # Use your linkManager to find all domain links
+        link_mgr = linkManager(url=url)
+        all_domains = link_mgr.find_all_domain()
+
+        # Process each domain link
+        for link_url in make_list(all_domains):
+            normalized_link = normalize_url(link_url, url)
+            if is_valid_url(normalized_link, base_domain):
+                time.sleep(self.WAIT_BETWEEN_REQUESTS)
+                self.process_page(normalized_link, depth + 1, base_domain)
 
 
     def main(self):
@@ -209,14 +242,9 @@ class usurpManager():
 
         self.process_page(self.BASE_URL, 0, base_domain)
         print("Website copying completed.")
-
-url=url or 'https://www.youtube.com/watch?v=jRGrNDV2mKc&list=RDMMjRGrNDV2mKc&start_radio=1'
-
-output_dir= directory or get_directory_from_url(url) or os.path.join(os.getcwd(),'testit')
-os.makedirs(output_dir,exist_ok=True)
-site_mgr = usurpManager(url,output_dir)
-
+
 def usurpit(url,output_dir=None,max_depth=None,wait_between_requests=None,operating_system=None, browser=None, version=None,user_agent=None,website_bot=None):
     output_dir = get_domain_name_from_url(url) or make_directory(path='usurped')
     site_mgr = usurpManager(url,output_dir=output_dir,max_depth=max_depth,wait_between_requests=wait_between_requests,operating_system=operating_system, browser=browser, version=version,user_agent=user_agent,website_bot=website_bot)
     site_mgr.main()
+
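
A minimal invocation sketch of the new usurpit() entry point added above (illustrative only, not part of the diff; the import path and example URL are assumptions). Per the code above, output_dir is derived inside usurpit() from the URL's domain name, falling back to a ./usurped directory:

    from abstract_webtools.abstract_usurpit import usurpit  # assumed import path

    # Crawl a site and save its pages and assets locally.
    usurpit('https://example.com', max_depth=2, wait_between_requests=1)
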
abstract_webtools/url_grabber_new.py

@@ -1,45 +1,95 @@
-[previous url_grabber_new.py contents (45 lines) not rendered in this diff]
+from PyQt5 import QtWidgets, QtCore
+from abstract_webtools import urlManager, requestManager, SoupManager, LinkManager
+from abstract_gui import get_user_agents, get_cipher_list
+
+class UrlGrabberWidget(QtWidgets.QWidget):
+    def __init__(self, initial_url="https://example.com", parent=None):
+        super().__init__(parent)
+        self.initial_url = initial_url
+        self.setup_ui()
+        self.setup_logic()
+        self.init_managers()
+
+    def setup_ui(self):
+        layout = QtWidgets.QVBoxLayout(self)
+
+        # URL input and grab button
+        self.url_input = QtWidgets.QLineEdit(self.initial_url)
+        self.status_label = QtWidgets.QLabel("Status: Unknown")
+        self.grab_btn = QtWidgets.QPushButton("Grab URL")
+
+        url_layout = QtWidgets.QHBoxLayout()
+        url_layout.addWidget(QtWidgets.QLabel("URL:"))
+        url_layout.addWidget(self.url_input)
+        url_layout.addWidget(self.grab_btn)
+
+        # User agent input
+        self.user_agent_box = QtWidgets.QComboBox()
+        self.user_agent_box.addItems(get_user_agents())
+
+        # Source output
+        self.source_code_edit = QtWidgets.QPlainTextEdit()
+        self.source_code_edit.setReadOnly(True)
+
+        # Soup output
+        self.soup_result_edit = QtWidgets.QPlainTextEdit()
+
+        # Action buttons
+        self.action_btn = QtWidgets.QPushButton("Parse")
+        self.get_text_btn = QtWidgets.QPushButton("Get All Text")
+        self.send_btn = QtWidgets.QPushButton("Send Soup")
+
+        # Assemble layout
+        layout.addLayout(url_layout)
+        layout.addWidget(self.status_label)
+        layout.addWidget(QtWidgets.QLabel("User-Agent:"))
+        layout.addWidget(self.user_agent_box)
+        layout.addWidget(QtWidgets.QLabel("Source Code:"))
+        layout.addWidget(self.source_code_edit)
+        layout.addWidget(QtWidgets.QLabel("Soup Result:"))
+        layout.addWidget(self.soup_result_edit)
+
+        btn_layout = QtWidgets.QHBoxLayout()
+        btn_layout.addWidget(self.action_btn)
+        btn_layout.addWidget(self.get_text_btn)
+        btn_layout.addWidget(self.send_btn)
+        layout.addLayout(btn_layout)
+
+        self.setLayout(layout)
+
+    def setup_logic(self):
+        self.grab_btn.clicked.connect(self.grab_url)
+        self.action_btn.clicked.connect(self.parse_html)
+        self.get_text_btn.clicked.connect(self.get_all_text)
+        self.send_btn.clicked.connect(self.send_soup)
+
+    def init_managers(self):
+        self.url_mgr = urlManager(url=self.initial_url)
+        self.request_mgr = None
+        self.soup_mgr = None
+        self.link_mgr = None
+
+    def grab_url(self):
+        url = self.url_input.text().strip()
+        self.url_mgr = urlManager(url=url)
+        self.request_mgr = requestManager(url_mgr=self.url_mgr)
+        if self.request_mgr.source_code:
+            self.soup_mgr = SoupManager(url_mgr=self.url_mgr, request_mgr=self.request_mgr)
+            self.link_mgr = LinkManager(url_mgr=self.url_mgr, request_mgr=self.request_mgr, soup_mgr=self.soup_mgr)
+            self.status_label.setText("Status: Success")
+            self.source_code_edit.setPlainText(self.request_mgr.source_code)
+        else:
+            self.status_label.setText("Status: Failed")
+
+    def parse_html(self):
+        if self.soup_mgr:
+            self.soup_result_edit.setPlainText(self.soup_mgr.soup)
+
+    def get_all_text(self):
+        if self.soup_mgr:
+            self.soup_result_edit.setPlainText(self.soup_mgr.extract_text_sections())
+
+    def send_soup(self):
+        soup = self.soup_result_edit.toPlainText()
+        print("Soup sent:", soup[:300])  # or emit a signal
+
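
A minimal launcher sketch for the rewritten UrlGrabberWidget (illustrative only, not part of the diff; assumes PyQt5 is installed and that the module imports shown above resolve in your environment):

    import sys
    from PyQt5 import QtWidgets
    from abstract_webtools.url_grabber_new import UrlGrabberWidget  # module shown above

    # Standard PyQt5 boilerplate: create the application, show the widget, run the event loop.
    app = QtWidgets.QApplication(sys.argv)
    widget = UrlGrabberWidget(initial_url="https://example.com")
    widget.show()
    sys.exit(app.exec_())
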
{abstract_webtools-0.1.6.121.dist-info → abstract_webtools-0.1.6.122.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.121
+Version: 0.1.6.122
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.121.dist-info → abstract_webtools-0.1.6.122.dist-info}/RECORD

@@ -1,6 +1,6 @@
 abstract_webtools/__init__.py,sha256=zNMp-9f0Q6BXWxR-tgHrEqKP8GeXw9z7VYzbqIeEydo,132
 abstract_webtools/abstract_userpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-P2w1u6ng,6462
-abstract_webtools/abstract_usurpit.py,sha256=
+abstract_webtools/abstract_usurpit.py,sha256=2idbYXLFhXh8VPfdYgWICNH8dehnZRCdt4U5sTsVxo4,9663
 abstract_webtools/abstract_webtools.py,sha256=3NzGmJlZvrdVtEcUi2K5iUgWr1822IBPhIN9us2e2t0,3859
 abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
 abstract_webtools/domain_identifier.py,sha256=AvWlGD7C19rySa_J_Brxi3kz43LMWvGsshuuZNg7MvI,3320

@@ -10,7 +10,8 @@ abstract_webtools/k2s_downloader.py,sha256=t0tCKAfDNQGn9tKh3eg0XVU0bY-MmYITwJa3A
 abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
 abstract_webtools/soup_gui.py,sha256=n95YAps1R6DpMwR4UbthSqQby0C5WHUa9tsW-f2qpLg,5184
 abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE70B8,10441
-abstract_webtools/url_grabber_new.py,sha256=
+abstract_webtools/url_grabber_new.py,sha256=xb23qo4anOY0Ax3CAfaHJ8s5VEz61Sinh-XpEDFW7Is,3621
+abstract_webtools/__pycache__/abstract_webtools.cpython-312.pyc,sha256=Rb2nPDCUG6i7nEs-I128lozwKteIVXzZxygV-zJVALs,4606
 abstract_webtools/managers/__init__.py,sha256=9pgy52NB-ONxLqoCRF52GZ6G7GM6Uc0-fgA1HvKcwxc,407
 abstract_webtools/managers/allss\.py,sha256=IBhlyRQHfK-BtwUnSEbIPqlI1MtZ8-XsdaHv0b91HQ0,269
 abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621

@@ -42,7 +43,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
 abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
 abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
 abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools-0.1.6.122.dist-info/METADATA,sha256=eCQxVrpP4p0xz9SbZt3JjNS9dZ5RZ6gd0nnRy0wPQpM,7289
+abstract_webtools-0.1.6.122.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+abstract_webtools-0.1.6.122.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.122.dist-info/RECORD,,
{abstract_webtools-0.1.6.121.dist-info → abstract_webtools-0.1.6.122.dist-info}/top_level.txt
RENAMED
File without changes