abstract-webtools 0.1.6.120__py3-none-any.whl → 0.1.6.122__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
abstract_webtools/abstract_usurpit.py

@@ -6,7 +6,20 @@ import shutil
  import time
  from abstract_webtools import *
  from abstract_utilities import *
-
+ def get_abs_path():
+     return os.path.abspath(__file__)
+
+ def get_abs_dir():
+     abs_path = get_abs_path()
+     return os.path.dirname(abs_path)
+ def join_abs_path(path):
+     abs_dir = get_abs_dir()
+     return os.path.join(abs_dir,path)
+ def get_rel_dir():
+     return os.getcwd()
+ def join_rel_path(path):
+     rel_dir = get_rel_dir()
+     return os.path.join(rel_dir,path)
  # Import your custom classes/functions
  # from your_module import linkManager, get_soup_mgr
  def make_directory(directory=None,path=None):
@@ -16,6 +29,29 @@ def make_directory(directory=None,path=None):
      directory = os.path.join(base_dir,path)
      os.makedirs(directory,exist_ok=True)
      return directory
+ def get_paths(*paths):
+     all_paths = []
+     for path in paths:
+         all_paths+=path.split('/')
+     return all_paths
+ def makeAllDirs(*paths):
+     full_path= ''
+     paths = get_paths(*paths)
+     for i,path in enumerate(paths):
+         if i == 0:
+             full_path = path
+             if not full_path.startswith('/'):
+                 full_path = join_rel_path(full_path)
+         else:
+             full_path = os.path.join(full_path,path)
+         os.makedirs(full_path,exist_ok=True)
+     return full_path
+ def currate_full_path(full_path):
+     dirname = os.path.dirname(full_path)
+     basename = os.path.basename(full_path)
+     full_dirname = makeAllDirs(dirname)
+     full_path = os.path.join(full_dirname,basename)
+     return full_path
  def get_domain_name_from_url(url):
      parsed_url = urlparse(url)
      netloc = parsed_url.netloc
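
The helpers added above anchor relative paths at the working directory and create every intermediate directory of a target path before it is used. A rough usage sketch, assuming the functions are importable from abstract_webtools.abstract_usurpit exactly as added in this hunk (the paths shown are illustrative only):

    import os
    # Hypothetical import path; these are module-level functions in abstract_usurpit.py.
    from abstract_webtools.abstract_usurpit import join_rel_path, makeAllDirs, currate_full_path

    # join_rel_path simply joins a relative path onto os.getcwd().
    print(join_rel_path('usurped/example.com'))

    # makeAllDirs splits its arguments on '/' and calls os.makedirs on each prefix,
    # returning the deepest directory it created.
    assets_dir = makeAllDirs('usurped', 'example.com/assets')

    # For a relative file path, currate_full_path creates the parent directories
    # and returns the absolute file path, ready for open(page_path, 'w').
    page_path = currate_full_path('usurped/example.com/index.html')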
@@ -70,9 +106,10 @@ def get_save_page_path(url, output_dir):
  def save_page(url, content,output_dir):
      page_full_path = get_save_page_path(url=url,
                                          output_dir=output_dir)
+     page_full_path = currate_full_path(page_full_path)
      if page_full_path:
          dirname = os.path.dirname(page_full_path)
-         os.makedirs(dirname, exist_ok=True)
+
  
          with open(page_full_path, 'w', encoding='utf-8') as f:
              f.write(content)
@@ -144,59 +181,56 @@ class usurpManager():
                               "Access-Control-Allow-Origin": "*"})
  
      def process_page(self,url, depth, base_domain):
-         """
-         Process a single page: download assets, save HTML, and crawl links.
-         """
-         print(url)
-         if url in self.visited_pages or depth > self.MAX_DEPTH:
-             return
-         self.visited_pages.add(url)
-
-
-         # Fetch the page content
-         response = self.session.get(url)
-         #response.raise_for_status()
-         content = response.text
-         page_full_path = get_save_page_path(url=url,
-                                             output_dir=self.OUTPUT_DIR)
-         if not os.path.exists(page_full_path):
-             # Use your get_soup_mgr function to get the soup and attributes
-             soup_mgr = get_soup_mgr(url=url)
-             soup = soup_mgr.soup
-             all_attributes = soup_mgr.get_all_attribute_values()
-             # Now you can use all_attributes as needed
-             get_asset_path(asset_url=full_asset_url,
-                            base_url=self.url,
-                            output_dir=self.OUTPUT_DIR,
-                            downloaded_assets=self.downloaded_assets,
-                            session=self.session)
-             # Update asset links to local paths
-             for tag in soup.find_all(['img', 'script', 'link']):
-                 attr = 'src' if tag.name != 'link' else 'href'
-                 asset_url = tag.get(attr)
-                 if asset_url:
-                     full_asset_url = normalize_url(asset_url, url)
-                     parsed_asset_url = urlparse(full_asset_url)
-
-                     if is_valid_url(full_asset_url, base_domain):
-                         self.downloaded_assets = save_asset(full_asset_url, self.url,self.OUTPUT_DIR,self.downloaded_assets,self.session)
-                         # Update tag to point to the local asset
-                         local_asset_path = '/' + parsed_asset_url.path.lstrip('/')
-                         tag[attr] = local_asset_path
-
-             # Save the modified page
-             save_page(url, str(soup),self.OUTPUT_DIR)
-
-             # Use your linkManager to find all domain links
-             link_mgr = linkManager(url=url)
-             all_domains = link_mgr.find_all_domain()
-
-             # Process each domain link
-             for link_url in make_list(all_domains):
-                 normalized_link = normalize_url(link_url, url)
-                 if is_valid_url(normalized_link, base_domain):
-                     time.sleep(self.WAIT_BETWEEN_REQUESTS)
-                     self.process_page(normalized_link, depth + 1, base_domain)
+         """
+         Process a single page: download assets, save HTML, and crawl links.
+         """
+         print(url)
+         if url in self.visited_pages or depth > self.MAX_DEPTH:
+             return
+         self.visited_pages.add(url)
+
+
+         # Fetch the page content
+         response = self.session.get(url)
+         #response.raise_for_status()
+         content = response.text
+
+         # Use your get_soup_mgr function to get the soup and attributes
+         soup_mgr = get_soup_mgr(url=url)
+         soup = soup_mgr.soup
+         all_attributes = soup_mgr.get_all_attribute_values()
+         # Now you can use all_attributes as needed
+
+         # Update asset links to local paths
+         for tag in soup.find_all(['img', 'script', 'link']):
+             attr = 'src' if tag.name != 'link' else 'href'
+             asset_url = tag.get(attr)
+             if asset_url:
+                 full_asset_url = normalize_url(asset_url, url)
+                 parsed_asset_url = urlparse(full_asset_url)
+
+                 if is_valid_url(full_asset_url, base_domain):
+                     self.downloaded_assets = save_asset(full_asset_url,
+                                                         self.url,
+                                                         self.OUTPUT_DIR,
+                                                         self.downloaded_assets,
+                                                         self.session)
+                     # Update tag to point to the local asset
+                     local_asset_path = '/' + parsed_asset_url.path.lstrip('/')
+                     tag[attr] = local_asset_path
+
+         # Save the modified page
+         save_page(url, str(soup),self.OUTPUT_DIR)
+         # Use your linkManager to find all domain links
+         link_mgr = linkManager(url=url)
+         all_domains = link_mgr.find_all_domain()
+
+         # Process each domain link
+         for link_url in make_list(all_domains):
+             normalized_link = normalize_url(link_url, url)
+             if is_valid_url(normalized_link, base_domain):
+                 time.sleep(self.WAIT_BETWEEN_REQUESTS)
+                 self.process_page(normalized_link, depth + 1, base_domain)
  
  
      def main(self):
@@ -208,14 +242,9 @@ class usurpManager():
  
          self.process_page(self.BASE_URL, 0, base_domain)
          print("Website copying completed.")
-     def test_download(url=None,directory=None):
-         url=url or 'https://www.youtube.com/watch?v=jRGrNDV2mKc&list=RDMMjRGrNDV2mKc&start_radio=1'
-
-         output_dir= directory or get_directory_from_url(url) or os.path.join(os.getcwd(),'testit')
-         os.makedirs(output_dir,exist_ok=True)
-         site_mgr = usurpManager(url,output_dir)
-
+
  def usurpit(url,output_dir=None,max_depth=None,wait_between_requests=None,operating_system=None, browser=None, version=None,user_agent=None,website_bot=None):
      output_dir = get_domain_name_from_url(url) or make_directory(path='usurped')
      site_mgr = usurpManager(url,output_dir=output_dir,max_depth=max_depth,wait_between_requests=wait_between_requests,operating_system=operating_system, browser=browser, version=version,user_agent=user_agent,website_bot=website_bot)
      site_mgr.main()
+
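
With the ad-hoc test_download helper removed, usurpit remains the module-level entry point for copying a site. A minimal invocation sketch, based only on the signature shown in the context lines above (the keyword values are illustrative, not documented defaults):

    # Hypothetical import path; usurpit is defined in abstract_usurpit.py above.
    from abstract_webtools.abstract_usurpit import usurpit

    # Only url is required; the remaining keywords default to None and are
    # forwarded to usurpManager as shown in the diff.
    usurpit(
        'https://example.com',
        max_depth=2,                 # recursion limit for process_page
        wait_between_requests=1.0,   # delay between page fetches
    )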
abstract_webtools/url_grabber_new.py

@@ -1,45 +1,95 @@
- from abstract_gui import AbstractWindowManager,make_component
- from abstract_webtools import UserAgentManager,UrlManager,SafeRequest,SoupManager,LinkManager,CipherManager
-
- class GuiGrabber:
-     def __init__(self,url="www.example.com"):
-         self.window_mgr = AbstractWindowManager()
-         self.window_name = self.window_mgr.add_window(title="Gui_Grabber",layout=[],event_handlers=[self.while_window])
-         self.url = url
-         self.parse_type_choices = ['html.parser', 'lxml', 'html5lib']
-         self.window_mgr.while_window()
-     def layout(event,values,window):
-         # Add a dropdown for selecting BeautifulSoup parsing capabilities
-         make_component("theme",'LightGrey1')
-         layout = [[make_component("Text",'URL:', size=(8, 1)),
-                    make_component("Input",url, key='-URL-',enable_events=True),
-                    make_component("Text",'status:'),
-                    make_component("Text",'200',key="-STATUS_CODE-"),
-                    make_component("Text",f'success: {self.url} is valid',key="-URL_WARNING-"),
-                    make_component("Button",'Grab URL',key='-GRAB_URL-',visible=True)],
-                   [make_component("Checkbox",'Custom User-Agent', default=False, key='-CUSTOMUA-', enable_events=True)],
-                   [make_component("Text",'User-Agent:', size=(8, 1)),
-                    make_component("Combo",get_user_agents(), default_value='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36', key='-USERAGENT-', disabled=False)],
-                   [self.get_cypher_checks()],
-                   [make_component("Button",'Grab URL'),
-                    make_component("Button",'Action'),
-                    make_component("Button",'Get All Text')],
-                   [make_component("Text",'Parsing Capabilities:', size=(15, 1)),
-                    make_component("DropDown",parse_type_choices, default_value='html.parser', key='-parse_type-',enable_events=True)],
-                   [get_multi_line({"key":'-SOURCECODE-'})],
-                   [make_component("Text",'find soup:'),[[
-                    make_component("Checkbox",'',default=True,key='-SOUP_TAG_BOOL-',enable_events=True),
-                    make_component("Combo",[], size=(15, 1),key='-SOUP_TAG-',enable_events=True)],
-                    [make_component("Checkbox",'',default=False,key='-SOUP_ATTRIBUTE_BOOL-',enable_events=True),
-                    make_component("Combo",[], size=(15, 1),key='-SOUP_ATTRIBUTE-',enable_events=True)],
-                    [make_component("Checkbox",'',default=False,key='-SOUP_ATTRIBUTE_1_BOOL-',enable_events=True),
-                    make_component("Combo",[], size=(15, 1),key='-SOUP_ATTRIBUTE_1-',enable_events=True)],
-                    [make_component("Checkbox",'',default=False,key='-SOUP_ATTRIBUTE_2_BOOL-',enable_events=True),
-                    make_component("Combo",[], size=(15, 1),key='-SOUP_ATTRIBUTE_2-',enable_events=True)],
-                    make_component("Input",key='-SOUP_VALUES_INPUT-'),
-                    make_component("Button",'get soup'),
-                    make_component("Button",'all soup'),
-                    make_component("Button",'Send Soup')]],
-                   [get_multi_line({"key":"-FIND_ALL_OUTPUT-"})]]
-         return layout
- GuiGrabber()
+ from PyQt5 import QtWidgets, QtCore
+ from abstract_webtools import urlManager, requestManager, SoupManager, LinkManager
+ from abstract_gui import get_user_agents, get_cipher_list
+
+ class UrlGrabberWidget(QtWidgets.QWidget):
+     def __init__(self, initial_url="https://example.com", parent=None):
+         super().__init__(parent)
+         self.initial_url = initial_url
+         self.setup_ui()
+         self.setup_logic()
+         self.init_managers()
+
+     def setup_ui(self):
+         layout = QtWidgets.QVBoxLayout(self)
+
+         # URL input and grab button
+         self.url_input = QtWidgets.QLineEdit(self.initial_url)
+         self.status_label = QtWidgets.QLabel("Status: Unknown")
+         self.grab_btn = QtWidgets.QPushButton("Grab URL")
+
+         url_layout = QtWidgets.QHBoxLayout()
+         url_layout.addWidget(QtWidgets.QLabel("URL:"))
+         url_layout.addWidget(self.url_input)
+         url_layout.addWidget(self.grab_btn)
+
+         # User agent input
+         self.user_agent_box = QtWidgets.QComboBox()
+         self.user_agent_box.addItems(get_user_agents())
+
+         # Source output
+         self.source_code_edit = QtWidgets.QPlainTextEdit()
+         self.source_code_edit.setReadOnly(True)
+
+         # Soup output
+         self.soup_result_edit = QtWidgets.QPlainTextEdit()
+
+         # Action buttons
+         self.action_btn = QtWidgets.QPushButton("Parse")
+         self.get_text_btn = QtWidgets.QPushButton("Get All Text")
+         self.send_btn = QtWidgets.QPushButton("Send Soup")
+
+         # Assemble layout
+         layout.addLayout(url_layout)
+         layout.addWidget(self.status_label)
+         layout.addWidget(QtWidgets.QLabel("User-Agent:"))
+         layout.addWidget(self.user_agent_box)
+         layout.addWidget(QtWidgets.QLabel("Source Code:"))
+         layout.addWidget(self.source_code_edit)
+         layout.addWidget(QtWidgets.QLabel("Soup Result:"))
+         layout.addWidget(self.soup_result_edit)
+
+         btn_layout = QtWidgets.QHBoxLayout()
+         btn_layout.addWidget(self.action_btn)
+         btn_layout.addWidget(self.get_text_btn)
+         btn_layout.addWidget(self.send_btn)
+         layout.addLayout(btn_layout)
+
+         self.setLayout(layout)
+
+     def setup_logic(self):
+         self.grab_btn.clicked.connect(self.grab_url)
+         self.action_btn.clicked.connect(self.parse_html)
+         self.get_text_btn.clicked.connect(self.get_all_text)
+         self.send_btn.clicked.connect(self.send_soup)
+
+     def init_managers(self):
+         self.url_mgr = urlManager(url=self.initial_url)
+         self.request_mgr = None
+         self.soup_mgr = None
+         self.link_mgr = None
+
+     def grab_url(self):
+         url = self.url_input.text().strip()
+         self.url_mgr = urlManager(url=url)
+         self.request_mgr = requestManager(url_mgr=self.url_mgr)
+         if self.request_mgr.source_code:
+             self.soup_mgr = SoupManager(url_mgr=self.url_mgr, request_mgr=self.request_mgr)
+             self.link_mgr = LinkManager(url_mgr=self.url_mgr, request_mgr=self.request_mgr, soup_mgr=self.soup_mgr)
+             self.status_label.setText("Status: Success")
+             self.source_code_edit.setPlainText(self.request_mgr.source_code)
+         else:
+             self.status_label.setText("Status: Failed")
+
+     def parse_html(self):
+         if self.soup_mgr:
+             self.soup_result_edit.setPlainText(self.soup_mgr.soup)
+
+     def get_all_text(self):
+         if self.soup_mgr:
+             self.soup_result_edit.setPlainText(self.soup_mgr.extract_text_sections())
+
+     def send_soup(self):
+         soup = self.soup_result_edit.toPlainText()
+         print("Soup sent:", soup[:300])  # or emit a signal
+
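
Unlike the old module, which instantiated GuiGrabber() at import time, the rewritten file defines only a widget class, so it needs a host application to run. A minimal embedding sketch, assuming only the public PyQt5 API plus the UrlGrabberWidget shown above (import path taken from the RECORD entry; not an example shipped with the package):

    import sys
    from PyQt5 import QtWidgets
    # Hypothetical usage; UrlGrabberWidget is defined in url_grabber_new.py above.
    from abstract_webtools.url_grabber_new import UrlGrabberWidget

    app = QtWidgets.QApplication(sys.argv)
    widget = UrlGrabberWidget(initial_url="https://example.com")
    widget.show()          # standalone window; could also be added to a larger layout
    sys.exit(app.exec_())  # PyQt5 event loop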
abstract_webtools-0.1.6.122.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: abstract_webtools
- Version: 0.1.6.120
+ Version: 0.1.6.122
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
abstract_webtools-0.1.6.122.dist-info/RECORD

@@ -1,6 +1,6 @@
  abstract_webtools/__init__.py,sha256=zNMp-9f0Q6BXWxR-tgHrEqKP8GeXw9z7VYzbqIeEydo,132
  abstract_webtools/abstract_userpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-P2w1u6ng,6462
- abstract_webtools/abstract_usurpit.py,sha256=LTt5wXzKaZKoobqElSzQfIIjUj-e0ATkIFGzQ6Qvn1Y,9313
+ abstract_webtools/abstract_usurpit.py,sha256=2idbYXLFhXh8VPfdYgWICNH8dehnZRCdt4U5sTsVxo4,9663
  abstract_webtools/abstract_webtools.py,sha256=3NzGmJlZvrdVtEcUi2K5iUgWr1822IBPhIN9us2e2t0,3859
  abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
  abstract_webtools/domain_identifier.py,sha256=AvWlGD7C19rySa_J_Brxi3kz43LMWvGsshuuZNg7MvI,3320
@@ -10,7 +10,8 @@ abstract_webtools/k2s_downloader.py,sha256=t0tCKAfDNQGn9tKh3eg0XVU0bY-MmYITwJa3A
  abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
  abstract_webtools/soup_gui.py,sha256=n95YAps1R6DpMwR4UbthSqQby0C5WHUa9tsW-f2qpLg,5184
  abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE70B8,10441
- abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f5GoZmw0,3454
+ abstract_webtools/url_grabber_new.py,sha256=xb23qo4anOY0Ax3CAfaHJ8s5VEz61Sinh-XpEDFW7Is,3621
+ abstract_webtools/__pycache__/abstract_webtools.cpython-312.pyc,sha256=Rb2nPDCUG6i7nEs-I128lozwKteIVXzZxygV-zJVALs,4606
  abstract_webtools/managers/__init__.py,sha256=9pgy52NB-ONxLqoCRF52GZ6G7GM6Uc0-fgA1HvKcwxc,407
  abstract_webtools/managers/allss\.py,sha256=IBhlyRQHfK-BtwUnSEbIPqlI1MtZ8-XsdaHv0b91HQ0,269
  abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
@@ -42,7 +43,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
  abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
  abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
  abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
- abstract_webtools-0.1.6.120.dist-info/METADATA,sha256=51NMbY8xPY9HbZrJUuycBo1mswqdEXUwpj1NuBFp6K4,7289
- abstract_webtools-0.1.6.120.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
- abstract_webtools-0.1.6.120.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
- abstract_webtools-0.1.6.120.dist-info/RECORD,,
+ abstract_webtools-0.1.6.122.dist-info/METADATA,sha256=eCQxVrpP4p0xz9SbZt3JjNS9dZ5RZ6gd0nnRy0wPQpM,7289
+ abstract_webtools-0.1.6.122.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ abstract_webtools-0.1.6.122.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+ abstract_webtools-0.1.6.122.dist-info/RECORD,,
abstract_webtools-0.1.6.122.dist-info/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.8.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any