abstract-webtools 0.1.6.10__py3-none-any.whl → 0.1.6.11__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,231 +1,150 @@
- from .soupManager import *
+ from abstract_webtools import * #.soupManager import *


- class crawlManager():
-     def __init__(self,url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
-         self.url=url
-         self.source_code=source_code
-         self.parse_type=parse_type
+ from urllib.parse import urlparse, urljoin
+ import os
+ import xml.etree.ElementTree as ET
+ from bs4 import BeautifulSoup
+
+ class CrawlManager:
+     def __init__(self, url=None, req_mgr=None, url_mgr=None, source_code=None, parse_type="html.parser"):
+         self.url = url
+         self.source_code = source_code
+         self.parse_type = parse_type
          self.url_mgr = url_mgr or urlManager(url=self.url)
-         self.req_mgr = requestManager(url_mgr=self.url_mgr)
+         self.req_mgr = req_mgr or requestManager(url_mgr=self.url_mgr)
          self.get_new_source_and_url(url)
-     def get_new_source_and_url(self,url=None):
-         if url == None:
+
+     def get_new_source_and_url(self, url=None):
+         """Fetches new source code and response for a given URL."""
+         if url is None:
              url = self.url
+         self.req_mgr.set_url(url)
+         self.source_code = self.req_mgr.source_code
          self.response = self.req_mgr.response
-         self.source_code=self.req_mgr.source_code
-     def get_classes_and_meta_info():
-         class_name_1,class_name_2, class_value = 'meta','class','property','og:image'
-         attrs = 'href','src'
-         unique_classes, images=discover_classes_and_images(self,tag_name,class_name_1,class_name_2,class_value,attrs)
-         return unique_classes, images
-     def extract_links_from_url(self,url=None):
-         """
-         Extracts all href and src links from a given URL's source code.

-         Args:
-             base_url (str): The URL from which to extract links.
+     def get_classes_and_meta_info(self):
+         """Returns unique classes and image links from meta tags."""
+         tag_name = 'meta'
+         class_name_1, class_name_2 = 'class', 'property'
+         class_value = 'og:image'
+         attrs = ['href', 'src']
+         unique_classes, images = discover_classes_and_images(self, tag_name, class_name_1, class_name_2, class_value, attrs)
+         return unique_classes, images

-         Returns:
-             dict: Dictionary containing image links and external links under the parent page.
-         """
+     def extract_links_from_url(self, url=None):
+         """Extracts all href and src links from a given URL's source code."""
          url = url or self.url_mgr.url
-         soup_mgr = get_soup_mgr(url=url)
-         agg_js = {'images':[],'external_links':[]}
+         soup = BeautifulSoup(self.source_code, self.parse_type)
+         links = {'images': [], 'external_links': []}
+
+         if self.response:
+             for attr in ['href', 'src']:
+                 for tag in soup.find_all(attrs={attr: True}):
+                     link = tag.get(attr)
+                     if link:
+                         absolute_link = urljoin(url, link)
+                         if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')):
+                             links['images'].append(absolute_link)
+                         elif urlparse(absolute_link).netloc != urlparse(url).netloc:
+                             links['external_links'].append(absolute_link)

-         if self.response != None:
-             attrs = 'href','src'
-             href_links,src_links='',''
-             links = [href_links,src_links]
-             for i,each in enumerate(attrs):
-                 links[i]= [a[attr[i]] for a in soup_mgr.get_find_all_with_attributes(attrs[i])]
-             # Convert all links to absolute links
-             absolute_links = [(url, link) for link in links[0] + links[1]]
-             # Separate images and external links
-             images = [link for link in absolute_links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
-             external_links = [link for link in absolute_links if urlparse(link).netloc != urlparse(url).netloc]
-             agg_js['images']=images
-             agg_js['external_links']=external_links
-
-         return agg_js
-     def get_all_website_links(self,tag="a",attr="href") -> list:
-         """
-         Returns all URLs that are found on the specified URL and belong to the same website.
-
-         Args:
-             url (str): The URL to search for links.
-
-         Returns:
-             list: A list of URLs that belong to the same website as the specified URL.
-         """
-         all_urls=[self.url_mgr.url]
+         return links
+
+     def get_all_website_links(self):
+         """Finds all internal links on the website that belong to the same domain."""
+         all_urls = [self.url_mgr.url]
          domain = self.url_mgr.domain
          all_attribs = get_all_attribute_values(self.url_mgr.url)
-         for href in all_attribs.get('href',[]):
-             if href == "" or href is None:
-                 # href empty tag
-                 continue
-             href=self.url_mgr.get_relative_href(self.url_mgr.url,href)
-             if not self.url_mgr.is_valid_url(href):
-                 # not a valid URL
-                 continue
-             if href in all_urls:
-                 # already in the set
-                 continue
-             if domain not in href:
-                 # external link
+
+         for href in all_attribs.get('href', []):
+             if not href or not self.url_mgr.is_valid_url(href):
                  continue
-             all_urls.append(href)
-
+             full_url = urljoin(self.url_mgr.url, href)
+             if domain in full_url and full_url not in all_urls:
+                 all_urls.append(full_url)
+
          return all_urls

-     def correct_xml(xml_string):
-         # Parse the XML string
+     def correct_xml(self, xml_string):
+         """Corrects XML by encoding special characters in <image:loc> tags."""
          root = ET.fromstring(xml_string)
-
-         # Loop through each <image:loc> element and correct its text if needed
          for image_loc in root.findall(".//image:loc", namespaces={'image': 'http://www.google.com/schemas/sitemap-image/1.1'}):
-             # Replace '&' with '&amp;' in the element's text
              if '&' in image_loc.text:
                  image_loc.text = image_loc.text.replace('&', '&amp;')
+         return ET.tostring(root, encoding='utf-8').decode('utf-8')

-         # Convert the corrected XML back to string
-         corrected_xml = ET.tostring(root, encoding='utf-8').decode('utf-8')
-         return corrected_xml
-
-
-     def determine_values(self,url=None):
-         # This is just a mockup. In a real application, you'd analyze the URL or its content.
+     def determine_values(self, url=None):
+         """Determines frequency and priority based on URL type."""
          url = url or self.url
-         # Assuming a blog site
          if 'blog' in url:
-             if '2023' in url: # Assuming it's a current year article
-                 return ('weekly', '0.8')
-             else:
-                 return ('monthly', '0.6')
+             return ('weekly', '0.8') if '2023' in url else ('monthly', '0.6')
          elif 'contact' in url:
              return ('yearly', '0.3')
-         else: # Homepage or main categories
-             return ('weekly', '1.0')
-     def crawl(self,url, max_depth=3, depth=1):
-         visited=set()
-         if depth > max_depth:
-             return []
+         return ('weekly', '1.0')

-         if url in visited:
+     def crawl(self, url, max_depth=3, depth=1, visited=None):
+         """Recursively crawls the site up to max_depth and returns valid internal links."""
+         visited = visited or set()
+         if depth > max_depth or url in visited:
              return []

          visited.add(url)
-
          try:
              soup = get_soup(url)
-             links = [a['href'] for a in soup.find_all('a', href=True)]
-             valid_links = []
-
-             for link in links:
-                 parsed_link = urlparse(link)
-                 base_url = "{}://{}".format(parsed_link.scheme, parsed_link.netloc)
-
-                 if base_url == url: # Avoiding external URLs
-                     final_link = urljoin(url, parsed_link.path)
-                     if final_link not in valid_links:
-                         valid_links.append(final_link)
-
-             for link in valid_links:
-                 crawl(link, max_depth, depth+1)
-
-             return valid_links
-
+             links = []
+             for tag in soup.find_all('a', href=True):
+                 link = urljoin(url, tag['href'])
+                 if urlparse(link).netloc == urlparse(url).netloc and link not in visited:
+                     links.append(link)
+                     self.crawl(link, max_depth, depth + 1, visited)
+             return links
          except Exception as e:
              print(f"Error crawling {url}: {e}")
              return []

-
-     # Define or import required functions here, like get_all_website_links, determine_values,
-     # discover_classes_and_meta_images, and extract_links_from_url.
-     def get_meta_info(self,url=None):
+     def get_meta_info(self, url=None):
+         """Fetches metadata, including title and meta tags, from the page."""
          url = url or self.url
-         soup_mgr = soupManager(url=url)
-         meta_info = {}
-         # Fetch the title if available
-         meta_tags = soup_mgr.find_all("meta")
-         url = eatAll(str(url),['',' ','\n','\t','\\','/'])
-         attribs = get_all_attribute_values(url)
-         soup = get_soup(url)
+         soup = BeautifulSoup(self.source_code, self.parse_type)
+         meta_info = {"title": None, "meta_tags": {}}

-         for meta_tag in meta_tags:
-             for attr, values in meta_tag.attrs.items():
-
-                 if attr not in meta_tag:
-                     meta_tag[attr] = []
-                 if values not in meta_tag[attr]:
-                     meta_tag[attr].append(values)
-         title_tag = soup.find_all("title")
+         title_tag = soup.find("title")
          if title_tag:
-             meta_info["title"] = title_tag
-         # Fetch meta tags
-         for meta_tag in soup.find_all('meta'):
-             name = meta_tag.get('name') or meta_tag.get('property')
-             if name:
-                 content = meta_tag.get('content')
-                 if content:
-                     meta_info[name] = content
+             meta_info["title"] = title_tag.text
+
+         for meta in soup.find_all('meta'):
+             name = meta.get('name') or meta.get('property')
+             content = meta.get('content')
+             if name and content:
+                 meta_info["meta_tags"][name] = content

          return meta_info
-     def generate_sitemap(self,domain):
-
+
+     def generate_sitemap(self):
+         """Generates a sitemap.xml file with URLs, images, change frequency, and priority."""
+         urls = self.get_all_website_links()
          with open('sitemap.xml', 'w', encoding='utf-8') as f:
-             string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n'
-
-             for url in self.get_all_website_links():
-                 string += f' <url>\n <loc>{url}</loc>\n'
-                 preprocess=[]
-                 self.get_new_source_and_url(url=url)
-                 links = get_all_attribute_values(url)
-                 images = [link for link in links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
+             f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+             f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" ')
+             f.write('xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n')

-                 for img in images:
-                     if str(img).lower() not in preprocess:
-                         try:
-                             escaped_img = img.replace('&', '&amp;')
-
-                             str_write = f' <image:image>\n <image:loc>{escaped_img}</image:loc>\n </image:image>\n'
-                             string += str_write
-                         except:
-                             pass
-                         preprocess.append(str(img).lower())
+             for url in urls:
+                 f.write(f' <url>\n <loc>{url}</loc>\n')
                  frequency, priority = self.determine_values(url)
-                 string += f' <changefreq>{frequency}</changefreq>\n'
-                 string += f' <priority>{priority}</priority>\n'
-                 string += f' </url>\n'
-
-             string += '</urlset>\n'
-             f.write(string)
-             # Output summary
-             print(f'Sitemap saved to sitemap.xml with {len(urls)} URLs.')
+                 f.write(f' <changefreq>{frequency}</changefreq>\n')
+                 f.write(f' <priority>{priority}</priority>\n')
+
+                 images = [img for img in self.extract_links_from_url(url)['images']]
+                 for img in images:
+                     escaped_img = img.replace('&', '&amp;')
+                     f.write(f' <image:image>\n <image:loc>{escaped_img}</image:loc>\n </image:image>\n')
+
+                 f.write(' </url>\n')
+
+             f.write('</urlset>\n')

-             # Output class and link details
-             for url in urls:
-                 print(f"\nDetails for {url}:")
-                 classes, meta_img_refs = discover_classes_and_meta_images(url)
-
-                 print("\nClasses with href or src attributes:")
-                 for class_name in classes:
-                     print(f"\t{class_name}")
-
-                 print("\nMeta Image References:")
-                 for img_ref in meta_img_refs:
-                     print(f"\t{img_ref}")
-
-                 links = extract_links_from_url(url)
-
-                 print("\nImages:")
-                 for img in links['images']:
-                     print(f"\t{img}")
-
-                 print("\nExternal Links:")
-                 for ext_link in links['external_links']:
-                     print(f"\t{ext_link}")
+         print(f'Sitemap saved to sitemap.xml with {len(urls)} URLs.')

  class crawlManagerSingleton():
      _instance = None
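
For readers comparing the two implementations: the new `extract_links_from_url` above resolves every `href`/`src` value against the page URL and then classifies it either as an image (by file extension) or as an external link (by comparing netlocs). The standalone sketch below mirrors that classification logic using only `bs4` and `urllib`; the sample HTML and `page_url` are invented for illustration and are not part of the package.

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# Invented sample data, used only to exercise the classification rules.
page_url = "https://example.com/articles/"
html = """
<a href="/articles/one.html">one</a>
<img src="/static/logo.png">
<a href="https://other.site/page">elsewhere</a>
"""

soup = BeautifulSoup(html, "html.parser")
links = {"images": [], "external_links": []}

for attr in ("href", "src"):
    for tag in soup.find_all(attrs={attr: True}):
        link = tag.get(attr)
        if not link:
            continue
        absolute_link = urljoin(page_url, link)  # resolve relative paths against the page URL
        if link.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp")):
            links["images"].append(absolute_link)  # image, judged by extension
        elif urlparse(absolute_link).netloc != urlparse(page_url).netloc:
            links["external_links"].append(absolute_link)  # different domain -> external

print(links)
# {'images': ['https://example.com/static/logo.png'], 'external_links': ['https://other.site/page']}
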
@@ -237,8 +156,9 @@ class crawlManagerSingleton():
              crawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
          return crawlManagerSingleton._instance
  def get_crawl_mgr(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
+
+     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
      url = get_url(url=url,url_mgr=url_mgr)
-     url_mgr = get_url(url=url,url_mgr=url_mgr)
      req_mgr=get_req_mgr(url=url,url_mgr=url_mgr,source_code=source_code)
      source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr)
      soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,parse_type=parse_type)
@@ -249,3 +169,4 @@ def get_domain_crawl(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_t
      url = get_url(url=url,url_mgr=url_mgr)
      all_domain_links = crawl_mgr.crawl(url=url, max_depth=max_depth, depth=depth)
      return all_domain_links
+ get_domain_crawl(url='https://www.tradingview.com/symbols/SOLUSD/')
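
The new module-level call above exercises `get_domain_crawl` directly at import time. Assuming the helpers defined in this file (`get_crawl_mgr`, `get_domain_crawl`) are re-exported by `abstract_webtools`, as the wildcard import at the top of the file suggests but this diff does not confirm, a typical call site might look like the sketch below; the URL is illustrative and `get_crawl_mgr` is assumed to return the `CrawlManager` shown above.

# Hedged sketch: names, signatures, and return types are assumed from this diff, not verified against the package.
from abstract_webtools import get_crawl_mgr, get_domain_crawl

crawl_mgr = get_crawl_mgr(url="https://example.com")
internal_links = crawl_mgr.get_all_website_links()  # same-domain URLs discovered on the page
crawl_mgr.generate_sitemap()                        # writes sitemap.xml to the working directory

all_links = get_domain_crawl(url="https://example.com", max_depth=2)  # recursive crawl wrapper
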
@@ -87,7 +87,8 @@ class requestManager:
                   timeout = None,
                   last_request_time=None,
                   max_retries=None,
-                  request_wait_limit=None):
+                  request_wait_limit=
+                  None):
          self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
          self.url=get_url(url=url,url_mgr=self.url_mgr)
          self._url_mgr = self.url_mgr
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: abstract_webtools
- Version: 0.1.6.10
+ Version: 0.1.6.11
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -7,13 +7,13 @@ abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE7
  abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f5GoZmw0,3454
  abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
  abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
- abstract_webtools/managers/crawlManager.py,sha256=_Pci_Rm1jZzFjFle_PerkecrZZ_unBxcK2HPEmS98HM,10736
+ abstract_webtools/managers/crawlManager.py,sha256=ZcCDw_i8OEvfAyRvNu-Krx3kTigrFAFO4ZKMOhiXd9o,7967
  abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
  abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
  abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
  abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
  abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
- abstract_webtools/managers/requestManager.py,sha256=YksYgRivMMuZNOzyL5vaoXv9MLhgYeuLxO_UJiqPGWw,17312
+ abstract_webtools/managers/requestManager.py,sha256=ko07C9igeTr_KtfeijaO126WRVyDw6jkvJhBHlFdwho,17330
  abstract_webtools/managers/seleniumManager.py,sha256=qSY8gH3N5YJIMwE_Alj9HNQRip_PziIo4_T9AZE_FQo,4273
  abstract_webtools/managers/soupManager.py,sha256=7nDB_QKneGjyTZUzchfbdHNvxxYiTyIn8AHon8ObTSY,17148
  abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
@@ -21,8 +21,8 @@ abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_
  abstract_webtools/managers/urlManager.py,sha256=XqMrCM84BeWEfWtHc_8UFpT91ZtG-okzdKdCuC49vsA,8678
  abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
  abstract_webtools/managers/videoDownloader.py,sha256=6G_aLc05BTMUYUWc7iqYtHF_BaR7DnCNK_NJ-QnjsYY,10531
- abstract_webtools-0.1.6.10.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
- abstract_webtools-0.1.6.10.dist-info/METADATA,sha256=vVVUtiXTGSf_Z2RGB7qE2LBFPxOOG_7dNxoR_JeoXwY,15858
- abstract_webtools-0.1.6.10.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
- abstract_webtools-0.1.6.10.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
- abstract_webtools-0.1.6.10.dist-info/RECORD,,
+ abstract_webtools-0.1.6.11.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
+ abstract_webtools-0.1.6.11.dist-info/METADATA,sha256=X43t8XSI9MPYjaEZQqgCrWgMpty6ZSyrjm317YE4VQ4,15858
+ abstract_webtools-0.1.6.11.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ abstract_webtools-0.1.6.11.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+ abstract_webtools-0.1.6.11.dist-info/RECORD,,