abstract-webtools 0.1.5.86__tar.gz → 0.1.5.88__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {abstract_webtools-0.1.5.86/src/abstract_webtools.egg-info → abstract_webtools-0.1.5.88}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/setup.py +1 -1
  3. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/crawlManager.py +82 -22
  4. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/requestManager.py +6 -1
  5. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/seleniumManager.py +4 -3
  6. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/soupManager.py +53 -16
  7. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/urlManager.py +4 -4
  8. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/url_grabber.py +33 -33
  9. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
  10. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/LICENSE +0 -0
  11. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/README.md +0 -0
  12. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/pyproject.toml +0 -0
  13. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/setup.cfg +0 -0
  14. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/__init__.py +0 -0
  15. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/abstract_webtools.py +0 -0
  16. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/big_user_agent_list.py +0 -0
  17. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/main.py +0 -0
  18. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/__init__.py +0 -0
  19. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/cipherManager.py +0 -0
  20. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/domainManager.py +0 -0
  21. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  22. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/linkManager.py +0 -0
  23. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  24. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/networkManager.py +0 -0
  25. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/sslManager.py +0 -0
  26. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  27. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  28. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  29. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/soup_gui.py +0 -0
  30. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools/url_grabber_new.py +0 -0
  31. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
  32. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  33. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools.egg-info/requires.txt +0 -0
  34. {abstract_webtools-0.1.5.86 → abstract_webtools-0.1.5.88}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: abstract_webtools
- Version: 0.1.5.86
+ Version: 0.1.5.88
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
  long_description = fh.read()
  setuptools.setup(
  name='abstract_webtools',
- version='0.1.5.86',
+ version='0.1.5.88',
  author='putkoff',
  author_email='partners@abstractendeavors.com',
  description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
@@ -1,14 +1,17 @@
- class CrawlManager:
- def __init__(self,url=None,source_code=None,parse_type="html.parser"):
+ from .soupManager import *
+ class CrawlManager():
+ def __init__(self,url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
  self.url=url
  self.source_code=source_code
  self.parse_type=parse_type
- get_new_source_and_url(self,url)
+ self.url_mgr = url_mgr or urlManager(url=self.url)
+ self.req_mgr = requestManager(url_mgr=self.url_mgr)
+ self.get_new_source_and_url(url)
  def get_new_source_and_url(self,url=None):
  if url == None:
  url = self.url
- self.response = self.request_mgr.response
- self.source_code=self.request_mgr.source_code
+ self.response = self.req_mgr.response
+ self.source_code=self.req_mgr.source_code
  def get_classes_and_meta_info():
  class_name_1,class_name_2, class_value = 'meta','class','property','og:image'
  attrs = 'href','src'
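The constructor change above means a CrawlManager now builds its own urlManager and requestManager when none are passed in. A minimal usage sketch, assuming the package is installed and the class is importable from the module path shown in the files-changed list (constructing it performs a live HTTP fetch):

# Hedged usage sketch: the import path is taken from the file list above and
# assumes the installed package exposes CrawlManager there.
from abstract_webtools.managers.crawlManager import CrawlManager

crawler = CrawlManager(url="https://www.example.com")
print(crawler.url_mgr.url)              # URL held by the internally built url manager
print(len(crawler.source_code or ""))   # page source fetched via the internal req_mgr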
@@ -41,7 +44,36 @@ class CrawlManager:
  agg_js['external_links']=external_links

  return agg_js
+ def get_all_website_links(self,tag="a",attr="href") -> list:
+ """
+ Returns all URLs that are found on the specified URL and belong to the same website.

+ Args:
+ url (str): The URL to search for links.
+
+ Returns:
+ list: A list of URLs that belong to the same website as the specified URL.
+ """
+ all_urls=[self.url_mgr.url]
+ domain = self.url_mgr.domain
+ all_attribs = get_attribs(self.url_mgr.url)
+ for href in all_attribs.get('href',[]):
+ if href == "" or href is None:
+ # href empty tag
+ continue
+ href=self.url_mgr.get_relative_href(self.url_mgr.url,href)
+ if not self.url_mgr.is_valid_url(href):
+ # not a valid URL
+ continue
+ if href in all_urls:
+ # already in the set
+ continue
+ if domain not in href:
+ # external link
+ continue
+ all_urls.append(href)
+
+ return all_urls

  def correct_xml(xml_string):
  # Parse the XML string
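The new get_all_website_links keeps only hrefs that resolve to the same domain as the starting URL, after normalizing relative links and skipping empty or invalid ones. A self-contained sketch of that filtering idea using only the standard library (get_attribs and get_relative_href are the package's own helpers and are not reproduced here):

# Same-domain link filtering, illustrated with urllib.parse on a fixed href list.
from urllib.parse import urljoin, urlparse

def same_site_links(base_url, hrefs):
    base_domain = urlparse(base_url).netloc
    found = [base_url]
    for href in hrefs:
        if not href:                         # skip empty href attributes
            continue
        absolute = urljoin(base_url, href)   # resolve relative links
        parsed = urlparse(absolute)
        if parsed.scheme not in ("http", "https"):
            continue                         # not a crawlable URL
        if parsed.netloc != base_domain:
            continue                         # external link
        if absolute not in found:
            found.append(absolute)           # keep first occurrence only
    return found

print(same_site_links("https://www.example.com/",
                      ["/about", "contact.html", "https://other.org/x", ""]))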
@@ -58,21 +90,21 @@ class CrawlManager:
  return corrected_xml


- def determine_values(self):
+ def determine_values(self,url=None):
  # This is just a mockup. In a real application, you'd analyze the URL or its content.
-
+ url = url or self.url
  # Assuming a blog site
- if 'blog' in self.url:
- if '2023' in self.url: # Assuming it's a current year article
+ if 'blog' in url:
+ if '2023' in url: # Assuming it's a current year article
  return ('weekly', '0.8')
  else:
  return ('monthly', '0.6')
- elif 'contact' in self.url:
+ elif 'contact' in url:
  return ('yearly', '0.3')
  else: # Homepage or main categories
  return ('weekly', '1.0')
- def crawl(url, max_depth=3, depth=1):
-
+ def crawl(self,url, max_depth=3, depth=1):
+ visited=set()
  if depth > max_depth:
  return []

@@ -82,8 +114,8 @@ class CrawlManager:
  visited.add(url)

  try:
-
- links = [a['href'] for a in self.soup.find_all('a', href=True)]
+ soup = get_soup(url)
+ links = [a['href'] for a in soup.find_all('a', href=True)]
  valid_links = []

  for link in links:
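crawl is now an instance method that fetches each page with get_soup and recurses until max_depth is reached. A self-contained sketch of the same depth-limited, visited-set pattern, run against an in-memory link graph so it needs no network access (an illustration of the pattern, not a line-for-line copy of the method):

# Depth-limited recursive crawl over a fake link graph.
FAKE_SITE = {
    "/": ["/blog", "/contact"],
    "/blog": ["/blog/2023-post", "/"],
    "/blog/2023-post": ["/blog"],
    "/contact": [],
}

def crawl(url, max_depth=3, depth=1, visited=None):
    visited = visited if visited is not None else set()
    if depth > max_depth or url in visited:
        return []
    visited.add(url)
    collected = [url]
    for link in FAKE_SITE.get(url, []):   # stand-in for soup.find_all('a', href=True)
        collected += crawl(link, max_depth, depth + 1, visited)
    return collected

print(crawl("/"))   # ['/', '/blog', '/blog/2023-post', '/contact']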
@@ -107,11 +139,24 @@ class CrawlManager:

  # Define or import required functions here, like get_all_website_links, determine_values,
  # discover_classes_and_meta_images, and extract_links_from_url.
- def get_meta_info(self):
-
+ def get_meta_info(self,url=None):
+ url = url or self.url
+ soup_mgr = soupManager(url=url)
  meta_info = {}
  # Fetch the title if available
- title_tag = parse_title()
+ meta_tags = soup_mgr.find_all("meta")
+ url = eatAll(str(url),['',' ','\n','\t','\\','/'])
+ attribs = get_attribs(url)
+ soup = get_soup(url)
+
+ for meta_tag in meta_tags:
+ for attr, values in meta_tag.attrs.items():
+
+ if attr not in meta_tag:
+ meta_tag[attr] = []
+ if values not in meta_tag[attr]:
+ meta_tag[attr].append(values)
+ title_tag = soup.find_all("title")
  if title_tag:
  meta_info["title"] = title_tag
  # Fetch meta tags
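The rewritten get_meta_info gathers the <title> tags and the attributes of every <meta> tag through a soupManager. A self-contained illustration of that collection step with BeautifulSoup on an inline document, so it runs offline (soupManager, get_soup, get_attribs and eatAll are package helpers and are not reproduced here):

# Collect the title text and every meta-tag attribute into one dictionary.
from bs4 import BeautifulSoup

html = """
<html><head>
  <title>Example Page</title>
  <meta property="og:image" content="https://www.example.com/img/logo.png">
  <meta name="description" content="A small demo page">
</head><body></body></html>
"""
soup = BeautifulSoup(html, "html.parser")

meta_info = {}
title_tags = soup.find_all("title")
if title_tags:
    meta_info["title"] = title_tags[0].get_text()
for meta_tag in soup.find_all("meta"):
    for attr, value in meta_tag.attrs.items():
        meta_info.setdefault(attr, []).append(value)

print(meta_info)   # title plus lists of 'property', 'name' and 'content' values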
@@ -128,13 +173,14 @@ class CrawlManager:
  with open('sitemap.xml', 'w', encoding='utf-8') as f:
  string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n'

- for url in self.all_site_links:
+ for url in self.get_all_website_links():
  string += f' <url>\n <loc>{url}</loc>\n'
  preprocess=[]
  self.get_new_source_and_url(url=url)
- links = extract_links_from_url(url)
-
- for img in links['images']:
+ links = get_attribs(url)
+ images = [link for link in links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
+
+ for img in images:
  if str(img).lower() not in preprocess:
  try:
  escaped_img = img.replace('&', '&amp;')
@@ -144,7 +190,7 @@ class CrawlManager:
  except:
  pass
  preprocess.append(str(img).lower())
- frequency, priority = determine_values(url)
+ frequency, priority = self.determine_values(url)
  string += f' <changefreq>{frequency}</changefreq>\n'
  string += f' <priority>{priority}</priority>\n'
  string += f' </url>\n'
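The sitemap generator now derives its URL list from get_all_website_links() and its changefreq/priority pair from self.determine_values(url). An offline sketch of the string it assembles; the <image:image> entries are an assumption based on the xmlns:image namespace declared above, and the URLs and heuristics are illustrative only:

# Build a sitemap-style <urlset> string with per-URL changefreq/priority.
pages = {
    "https://www.example.com/": [],
    "https://www.example.com/blog/2023-notes": ["https://www.example.com/img/a.png"],
}

def changefreq_priority(url):
    # Mirrors the blog/contact heuristics shown in determine_values above.
    if "blog" in url:
        return ("weekly", "0.8") if "2023" in url else ("monthly", "0.6")
    if "contact" in url:
        return ("yearly", "0.3")
    return ("weekly", "1.0")

xml = ('<?xml version="1.0" encoding="UTF-8"?>\n'
       '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
       'xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n')
for url, images in pages.items():
    xml += f'  <url>\n    <loc>{url}</loc>\n'
    for img in images:
        escaped_img = img.replace("&", "&amp;")   # escape ampersands, as above
        xml += f'    <image:image>\n      <image:loc>{escaped_img}</image:loc>\n    </image:image>\n'
    freq, prio = changefreq_priority(url)
    xml += f'    <changefreq>{freq}</changefreq>\n    <priority>{prio}</priority>\n  </url>\n'
xml += '</urlset>\n'
print(xml)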
@@ -176,6 +222,7 @@ class CrawlManager:
  print("\nExternal Links:")
  for ext_link in links['external_links']:
  print(f"\t{ext_link}")
+
  class CrawlManagerSingleton():
  _instance = None
  @staticmethod
@@ -185,3 +232,16 @@ class CrawlManagerSingleton():
  elif parse_type != CrawlManagerSingleton._instance.parse_type or url != CrawlManagerSingleton._instance.url or source_code != CrawlManagerSingleton._instance.source_code:
  CrawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
  return CrawlManagerSingleton._instance
+ def get_crawl_mgr(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
+ url = get_url(url=url,url_mgr=url_mgr)
+ url_mgr = get_url(url=url,url_mgr=url_mgr)
+ req_mgr=get_req_mgr(url=url,url_mgr=url_mgr,source_code=source_code)
+ source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr)
+ soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,parse_type=parse_type)
+ crawl_mgr = CrawlManager(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
+ return crawl_mgr
+ def get_domain_crawl(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser",max_depth=3, depth=1):
+ crawl_mgr = get_crawl_mgr(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
+ url = get_url(url=url,url_mgr=url_mgr)
+ all_domain_links = crawl_mgr.crawl(url=url, max_depth=max_depth, depth=depth)
+ return all_domain_links
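get_crawl_mgr and get_domain_crawl are new module-level conveniences that wire the url, request and soup managers together before crawling. A hedged usage sketch, assuming the installed package exposes the helper from crawlManager as shown above (the call issues live HTTP requests while crawling):

# Hedged usage sketch of the new convenience helper; import path assumed from
# the files-changed list above.
from abstract_webtools.managers.crawlManager import get_domain_crawl

links = get_domain_crawl(url="https://www.example.com", max_depth=2)
print(links)   # same-domain URLs discovered within two levels of the start page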
@@ -343,6 +343,11 @@ class SafeRequestSingleton:
  def get_req_mgr(req_mgr=None,url=None,url_mgr=None,source_code=None):
  if req_mgr:
  url_mgr = req_mgr.url_mgr
+ url = get_url(url=url,url_mgr=url_mgr)
  url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
- req_mgr = req_mgr or requestManager(url_mgr=url_mgr,source_code=source_code)
+ req_mgr = req_mgr or requestManager(url_mgr=url_mgr,url=url,source_code=source_code)
  return req_mgr
+ def get_source(url=None,url_mgr=None,source_code=None):
+ # Placeholder for actual implementation.
+ req_mgr = get_req_mgr(req_mgr=req_mgr,url=url,url_mgr=url_mgr,source_code=source_code)
+ return req_mgr.source_code
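get_req_mgr now resolves the URL from whichever of url/url_mgr is supplied before building a requestManager, and the new get_source chains through it to return the fetched page source. An independent, self-contained sketch of that resolve-then-build pattern using requests; the class and function names below are illustrative, not the package's API:

# Resolve-then-build chaining: accept either a URL or an existing manager,
# normalize to a manager, then hand back its source text.
import requests

class SimpleRequestManager:
    def __init__(self, url):
        self.url = url
        self.response = requests.get(url, timeout=10)
        self.source_code = self.response.text

def get_request_manager(url=None, req_mgr=None):
    # Reuse an existing manager when given one, otherwise build from the URL.
    return req_mgr or SimpleRequestManager(url)

def get_source(url=None, req_mgr=None):
    return get_request_manager(url=url, req_mgr=req_mgr).source_code

print(get_source("https://www.example.com")[:60])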
@@ -60,9 +60,10 @@ class domainManager(metaclass=SingletonMeta):
  for path in paths[:-1]:
  dir_path = os.path.join(dir_path, path)
  os.makedirs(dir_path, exist_ok=True)
- self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
- dir_path = os.path.join(dir_path, paths[-1])
- return dir_path
+ self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
+
+ dir_path = os.path.join(dir_path, paths[-1])
+ return dir_path

  def saved_url_check(self, url):
  path = self.get_url_to_path(url)
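The hunk above adjusts how domainManager maps a URL onto a local directory tree, recording the page type from the final path segment's extension (defaulting to 'html'). A self-contained sketch of that mapping idea; the function name and base directory below are illustrative, not the package's API:

# Map a URL to a nested local path, creating parent directories along the way.
import os
from urllib.parse import urlparse

def url_to_path(url, base_dir="saved_pages"):
    parsed = urlparse(url)
    paths = [p for p in parsed.path.split('/') if p] or ['index.html']
    dir_path = os.path.join(base_dir, parsed.netloc)
    for path in paths[:-1]:
        dir_path = os.path.join(dir_path, path)
        os.makedirs(dir_path, exist_ok=True)
    page_type = os.path.splitext(paths[-1])[-1] or 'html'   # default when no extension
    return os.path.join(dir_path, paths[-1]), page_type

print(url_to_path("https://www.example.com/docs/guide"))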
@@ -51,6 +51,11 @@ class soupManager:
  source_code = str(source_code)
  self.source_code = source_code
  self.soup= BeautifulSoup(self.source_code, self.parse_type)
+ self.all_tags_and_attribute_names = self.get_all_tags_and_attribute_names()
+ self.all_tags = self.all_tags_and_attribute_names.get('tags')
+ self.all_attribute_names = self.all_tags_and_attribute_names.get('attributes')
+ self.all_tags_and_attributes = self.all_tags + self.all_attribute_names
+
  self._all_links_data = None
  self._meta_tags_data = None
  def re_initialize(self):
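soupManager now caches the document's tag names and attribute names at construction time. An offline illustration of what those cached collections contain, using BeautifulSoup directly:

# Enumerate every tag name and attribute name present in a parsed document.
from bs4 import BeautifulSoup

html = '<div id="main" class="wrap"><a href="/home">Home</a><img src="x.png" alt=""></div>'
soup = BeautifulSoup(html, "html.parser")

tags, attributes = set(), set()
for tag in soup.find_all(True):          # True matches every tag
    tags.add(tag.name)
    attributes.update(tag.attrs.keys())

print({"tags": sorted(tags), "attributes": sorted(attributes)})
# {'tags': ['a', 'div', 'img'], 'attributes': ['alt', 'class', 'href', 'id', 'src']}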
@@ -106,7 +111,7 @@ class soupManager:
  """
  all_urls=[self.url_mgr.url]
  domain = self.url_mgr.domain
- all_desired=self.get_all_desired_soup(tag=tag,attr=attr)
+ all_desired=self.get_all_website_links(tag=tag,attr=attr)
  for tag in all_desired:
  href = tag.attrs.get(attr)
  if href == "" or href is None:
@@ -274,22 +279,50 @@ class soupManager:
  attribute_names_list = list(attribute_names)
  return {"tags":tag_names_list,"attributes":attribute_names_list}

- def get_all_attribute_values(self):
- attribute_values={}
- get_all = self.find_tags_by_attributes()
- for tag in get_all: # True matches all tags
- for attr, value in tag.attrs.items():
- # If attribute is not yet in the dictionary, add it with an empty set
- if attr not in attribute_values:
- attribute_values[attr] = set()
- # If the attribute value is a list (e.g., class), extend the set with the list
- if isinstance(value, list):
- attribute_values[attr].update(value)
- else:
- attribute_values[attr].add(value)
+ def get_all_attribute_values(self, tag_list=None):
+ """
+ Collects all attribute values for each specified tag or all tags if none are specified.
+
+ Parameters:
+ - tag_list: List of specific tags to retrieve attributes from, e.g., ['script', 'img'].
+ If None, retrieves attributes for all tags.
+
+ Returns:
+ - attribute_values: Dictionary where each key is an attribute and the value is a list of unique values for that attribute.
+ """
+ attribute_values = {}
+ tag_list = tag_list or self.all_tags_and_attributes
+ # Get all tags matching tag_list criteria
+ for tag_name in tag_list:
+ for tag in self.soup.find_all(tag_name):
+ for attr, value in tag.attrs.items():
+ if attr not in attribute_values:
+ attribute_values[attr] = set()
+
+ # Add attribute values
+ if isinstance(value, list):
+ attribute_values[attr].update(value)
+ else:
+ attribute_values[attr].add(value)
+
+ # Convert each set to a list for consistency
  for attr, values in attribute_values.items():
  attribute_values[attr] = list(values)
+
+ # Capture JavaScript URLs inside <script> tags
+ attribute_values['script_links'] = self.get_js_links()
+
  return attribute_values
+
+ def get_js_links(self):
+ """Extract URLs embedded in JavaScript within <script> tags."""
+ js_links = []
+ script_tags = self.soup.find_all('script')
+ for script in script_tags:
+ # Find URLs in the JavaScript code
+ urls_in_js = re.findall(r'["\'](https?://[^"\']+|/[^"\']+)["\']', script.get_text())
+ js_links.extend(urls_in_js)
+ return list(set(js_links)) # Remove duplicates

  @property
  def url(self):
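get_js_links pulls quoted absolute and root-relative URLs out of <script> text with a regular expression, and get_all_attribute_values now appends them under the 'script_links' key. A self-contained check of that same regex on an inline document, runnable offline:

# Apply the URL-extraction regex from get_js_links to inline script text.
import re
from bs4 import BeautifulSoup

html = """
<script>
  fetch("https://api.example.com/data");
  const img = '/assets/logo.svg';
  const name = "not a url";
</script>
"""
soup = BeautifulSoup(html, "html.parser")

js_links = []
for script in soup.find_all("script"):
    js_links.extend(re.findall(r'["\'](https?://[^"\']+|/[^"\']+)["\']', script.get_text()))

print(sorted(set(js_links)))   # ['/assets/logo.svg', 'https://api.example.com/data']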
@@ -309,6 +342,10 @@ class SoupManagerSingleton():
  return SoupManagerSingleton._instance
  def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None):
  url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
- req_mgr = get_req_mgr(url_mgr=url_mgr,source_code=source_code)
- soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr)
+ url = get_url(url=url,url_mgr=url_mgr)
+ req_mgr = get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
+ soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code)
  return soup_mgr
+ def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None):
+ soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
+ return soup_mgr.get_all_attribute_values(tags_list=tags_list)
@@ -218,8 +218,8 @@ def get_url(url=None,url_mgr=None):
  url_mgr = urlManager(url)
  return url_mgr.url
  def get_url_mgr(url=None,url_mgr=None):
- if not url and not url_mgr:
- return None
- if url:
- url_mgr = urlManager(url)
+ if url_mgr == None and url:
+ url_mgr = urlManager(url=url)
+ if url_mgr and url == None:
+ url = url_mgr.url
  return url_mgr
@@ -25,7 +25,7 @@ def get_attrs(values):
  else:
  tags_js['attribute']=tags_js['attribute'][0]
  return tags_js
- def get_user_agent_manager(user_agent=None):
+ def get_user_agent_mgr(user_agent=None):
  return UserAgentManager(user_agent=user_agent)
  def get_cipher_list():
  return CipherManager().get_default_ciphers()
@@ -128,78 +128,78 @@ def process_url(window,values):
  if warn_url=='' or warn_url == None:
  update_status(window,warn,warn_url,response_code,valid)
  return False
- temp_url=UrlManager(url=warn_url).url
+ temp_url=urlManager(url=warn_url).url
  if temp_url:
  valid='valid'
- response_code = SafeRequest(url=temp_mgr).response.status_code
+ response_code = requestManager(url=temp_mgr).response.status_code
  warn = 'success'
  warn_url = temp_mgr
  update_status(window,warn,warn_url,response_code,valid)
  return temp_mgr
  update_status(window,warn,warn_url,response_code,valid)
  return False
- def update_url(url_manager,request_manager,soup_manager,link_manager,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
+ def update_url(url_mgr,request_mgr,soup_mgr,link_mgr,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
  ciphers = CipherManager(cipher_list=cipher_list).ciphers_string
- request_manager = SafeRequest(url_manager=url_manager,ciphers=ciphers,user_agent=get_user_agents()[0])
- if request_manager.source_code:
- soup_manager= SoupManager(url_manager=url_manager,request_manager=request_manager)
- link_manager= LinkManager(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager)
- window['-URL-'].update(value=url_manager.url)
- window['-CIPHERS_OUTPUT-'].update(value=request_manager.ciphers)
- return update_source_code(url_manager,request_manager,soup_manager,link_manager,values)
+ request_mgr = requestManager(url_mgr=url_mgr,ciphers=ciphers,user_agent=get_user_agents()[0])
+ if request_mgr.source_code:
+ soup_mgr= SoupManager(url_mgr=url_mgr,request_mgr=request_mgr)
+ link_mgr= LinkManager(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr)
+ window['-URL-'].update(value=url_mgr.url)
+ window['-CIPHERS_OUTPUT-'].update(value=request_mgr.ciphers)
+ return update_source_code(url_mgr,request_mgr,soup_mgr,link_mgr,values)
  else:
- return url_manager,request_manager,soup_manager,link_manager
- def update_source_code(url_manager,request_manager,soup_manager,link_manager,values):
+ return url_mgr,request_mgr,soup_mgr,link_mgr
+ def update_source_code(url_mgr,request_mgr,soup_mgr,link_mgr,values):
  parse_type = values['-parse_type-']
- if parse_type != soup_manager.parse_type:
- soup_manager.update_parse_type(parse_type=parse_type)
- all_tags=soup_manager.get_all_tags_and_attribute_names()
- window['-SOURCECODE-'].update(value=soup_manager.soup)
+ if parse_type != soup_mgr.parse_type:
+ soup_mgr.update_parse_type(parse_type=parse_type)
+ all_tags=soup_mgr.get_all_tags_and_attribute_names()
+ window['-SOURCECODE-'].update(value=soup_mgr.soup)
  if values['-SOUP_TAG-'] != all_tags['tags']:
  window['-SOUP_TAG-'].update(values=all_tags['tags'],value=all_tags['tags'][0])
  if values['-SOUP_ATTRIBUTE-'] != all_tags['attributes']:
  window['-SOUP_ATTRIBUTE-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
  window['-SOUP_ATTRIBUTE_1-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
  window['-SOUP_ATTRIBUTE_2-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
- return url_manager,request_manager,soup_manager,link_manager
+ return url_mgr,request_mgr,soup_mgr,link_mgr
  def url_grabber_while(window,initial_url="www.example.com"):
  return_data=None
  url_grab = False
- url_manager=UrlManager(url=initial_url)
- request_manager = SafeRequest(url_manager=url_manager)
- soup_manager= SoupManager(url_manager=url_manager,request_manager=request_manager)
- link_manager= LinkManager(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager)
+ url_mgr=urlManager(url=initial_url)
+ request_mgr = requestManager(url_mgr=url_mgr)
+ soup_mgr= SoupManager(url_mgr=url_mgr,request_mgr=request_mgr)
+ link_mgr= LinkManager(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr)
  while True:
  event, values = window.read()
  if event == sg.WINDOW_CLOSED:
  break
  if event=='-GRAB_URL-' or not url_grab:
  url=values['-URL-']
- if UrlManager(url=url).url:
- if url != url_manager.url or url == initial_url:
- url_manager = UrlManager(url=url)
+ if urlManager(url=url).url:
+ if url != url_mgr.url or url == initial_url:
+ url_mgr = urlManager(url=url)

- url_manager,request_manager,soup_manager,link_manager=update_url(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager,link_manager=link_manager,values=values)
- window['-URL-'].update(value=url_manager.url)
+ url_mgr,request_mgr,soup_mgr,link_mgr=update_url(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr,link_mgr=link_mgr,values=values)
+ window['-URL-'].update(value=url_mgr.url)
  url_grab=True
  if event == 'get soup':
  tags_js = get_attrs(values)
- all_desired=soup_manager.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
+ all_desired=soup_mgr.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
  window['-FIND_ALL_OUTPUT-'].update(value=all_desired)
  if event == '-CUSTOMUA-':
  window['-SOURCECODE-'].update(disabled=values['-CUSTOMUA-'])
  if not values['-CUSTOMUA-']:
- window['-USERAGENT-'].update(value=user_agent_manager.user_agent_header)
+ window['-USERAGENT-'].update(value=user_agent_mgr.user_agent_header)
  window['-USERAGENT-'].update(disabled=True)
  else:
  window['-USERAGENT-'].update(disabled=False)
  if event=='Get All Text':
- window['-FIND_ALL_OUTPUT-'].update(value=soup_manager.extract_text_sections())
+ window['-FIND_ALL_OUTPUT-'].update(value=soup_mgr.extract_text_sections())
  if event == 'Action':
  parse_type = values['-parse_type-']
- if parse_type != soup_manager.parse_type:
- soup_manager.update_parse_type(parse_type=parse_type)
- window['-SOURCECODE-'].update(value=soup_manager.soup)
+ if parse_type != soup_mgr.parse_type:
+ soup_mgr.update_parse_type(parse_type=parse_type)
+ window['-SOURCECODE-'].update(value=soup_mgr.soup)
  elif event == 'Send Soup':
  return_data = values['-FIND_ALL_OUTPUT-']
  break
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: abstract_webtools
- Version: 0.1.5.86
+ Version: 0.1.5.88
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff