abstract-webtools 0.1.6.10__py3-none-any.whl → 0.1.6.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/managers/crawlManager.py +107 -186
- abstract_webtools/managers/requestManager.py +2 -1
- {abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/RECORD +7 -7
- {abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/crawlManager.py

@@ -1,231 +1,150 @@
-from
+from abstract_webtools import * #.soupManager import *
 
 
-
-
-
-
-
+from urllib.parse import urlparse, urljoin
+import os
+import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
+
+class CrawlManager:
+    def __init__(self, url=None, req_mgr=None, url_mgr=None, source_code=None, parse_type="html.parser"):
+        self.url = url
+        self.source_code = source_code
+        self.parse_type = parse_type
         self.url_mgr = url_mgr or urlManager(url=self.url)
-        self.req_mgr = requestManager(url_mgr=self.url_mgr)
+        self.req_mgr = req_mgr or requestManager(url_mgr=self.url_mgr)
         self.get_new_source_and_url(url)
-
-
+
+    def get_new_source_and_url(self, url=None):
+        """Fetches new source code and response for a given URL."""
+        if url is None:
             url = self.url
+        self.req_mgr.set_url(url)
+        self.source_code = self.req_mgr.source_code
         self.response = self.req_mgr.response
-        self.source_code=self.req_mgr.source_code
-    def get_classes_and_meta_info():
-        class_name_1,class_name_2, class_value = 'meta','class','property','og:image'
-        attrs = 'href','src'
-        unique_classes, images=discover_classes_and_images(self,tag_name,class_name_1,class_name_2,class_value,attrs)
-        return unique_classes, images
-    def extract_links_from_url(self,url=None):
-        """
-        Extracts all href and src links from a given URL's source code.
 
-
-
+    def get_classes_and_meta_info(self):
+        """Returns unique classes and image links from meta tags."""
+        tag_name = 'meta'
+        class_name_1, class_name_2 = 'class', 'property'
+        class_value = 'og:image'
+        attrs = ['href', 'src']
+        unique_classes, images = discover_classes_and_images(self, tag_name, class_name_1, class_name_2, class_value, attrs)
+        return unique_classes, images
 
-
-
-        """
+    def extract_links_from_url(self, url=None):
+        """Extracts all href and src links from a given URL's source code."""
         url = url or self.url_mgr.url
-
-
+        soup = BeautifulSoup(self.source_code, self.parse_type)
+        links = {'images': [], 'external_links': []}
+
+        if self.response:
+            for attr in ['href', 'src']:
+                for tag in soup.find_all(attrs={attr: True}):
+                    link = tag.get(attr)
+                    if link:
+                        absolute_link = urljoin(url, link)
+                        if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')):
+                            links['images'].append(absolute_link)
+                        elif urlparse(absolute_link).netloc != urlparse(url).netloc:
+                            links['external_links'].append(absolute_link)
 
-
-
-
-
-
-        links[i]= [a[attr[i]] for a in soup_mgr.get_find_all_with_attributes(attrs[i])]
-        # Convert all links to absolute links
-        absolute_links = [(url, link) for link in links[0] + links[1]]
-        # Separate images and external links
-        images = [link for link in absolute_links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
-        external_links = [link for link in absolute_links if urlparse(link).netloc != urlparse(url).netloc]
-        agg_js['images']=images
-        agg_js['external_links']=external_links
-
-        return agg_js
-    def get_all_website_links(self,tag="a",attr="href") -> list:
-        """
-        Returns all URLs that are found on the specified URL and belong to the same website.
-
-        Args:
-            url (str): The URL to search for links.
-
-        Returns:
-            list: A list of URLs that belong to the same website as the specified URL.
-        """
-        all_urls=[self.url_mgr.url]
+        return links
+
+    def get_all_website_links(self):
+        """Finds all internal links on the website that belong to the same domain."""
+        all_urls = [self.url_mgr.url]
         domain = self.url_mgr.domain
         all_attribs = get_all_attribute_values(self.url_mgr.url)
-
-
-
-                continue
-            href=self.url_mgr.get_relative_href(self.url_mgr.url,href)
-            if not self.url_mgr.is_valid_url(href):
-                # not a valid URL
-                continue
-            if href in all_urls:
-                # already in the set
-                continue
-            if domain not in href:
-                # external link
+
+        for href in all_attribs.get('href', []):
+            if not href or not self.url_mgr.is_valid_url(href):
                 continue
-
-
+            full_url = urljoin(self.url_mgr.url, href)
+            if domain in full_url and full_url not in all_urls:
+                all_urls.append(full_url)
+
         return all_urls
 
-    def correct_xml(xml_string):
-
+    def correct_xml(self, xml_string):
+        """Corrects XML by encoding special characters in <image:loc> tags."""
         root = ET.fromstring(xml_string)
-
-        # Loop through each <image:loc> element and correct its text if needed
         for image_loc in root.findall(".//image:loc", namespaces={'image': 'http://www.google.com/schemas/sitemap-image/1.1'}):
-            # Replace '&' with '&amp;' in the element's text
             if '&' in image_loc.text:
                 image_loc.text = image_loc.text.replace('&', '&amp;')
+        return ET.tostring(root, encoding='utf-8').decode('utf-8')
 
-
-
-        return corrected_xml
-
-
-    def determine_values(self,url=None):
-        # This is just a mockup. In a real application, you'd analyze the URL or its content.
+    def determine_values(self, url=None):
+        """Determines frequency and priority based on URL type."""
         url = url or self.url
-        # Assuming a blog site
         if 'blog' in url:
-            if '2023' in url
-                return ('weekly', '0.8')
-            else:
-                return ('monthly', '0.6')
+            return ('weekly', '0.8') if '2023' in url else ('monthly', '0.6')
         elif 'contact' in url:
             return ('yearly', '0.3')
-
-        return ('weekly', '1.0')
-    def crawl(self,url, max_depth=3, depth=1):
-        visited=set()
-        if depth > max_depth:
-            return []
+        return ('weekly', '1.0')
 
-
+    def crawl(self, url, max_depth=3, depth=1, visited=None):
+        """Recursively crawls the site up to max_depth and returns valid internal links."""
+        visited = visited or set()
+        if depth > max_depth or url in visited:
             return []
 
         visited.add(url)
-
         try:
             soup = get_soup(url)
-            links = [
-
-
-
-
-
-
-            if base_url == url: # Avoiding external URLs
-                final_link = urljoin(url, parsed_link.path)
-                if final_link not in valid_links:
-                    valid_links.append(final_link)
-
-            for link in valid_links:
-                crawl(link, max_depth, depth+1)
-
-            return valid_links
-
+            links = []
+            for tag in soup.find_all('a', href=True):
+                link = urljoin(url, tag['href'])
+                if urlparse(link).netloc == urlparse(url).netloc and link not in visited:
+                    links.append(link)
+                    self.crawl(link, max_depth, depth + 1, visited)
+            return links
         except Exception as e:
             print(f"Error crawling {url}: {e}")
             return []
 
-
-
-    # discover_classes_and_meta_images, and extract_links_from_url.
-    def get_meta_info(self,url=None):
+    def get_meta_info(self, url=None):
+        """Fetches metadata, including title and meta tags, from the page."""
         url = url or self.url
-
-        meta_info = {}
-        # Fetch the title if available
-        meta_tags = soup_mgr.find_all("meta")
-        url = eatAll(str(url),['',' ','\n','\t','\\','/'])
-        attribs = get_all_attribute_values(url)
-        soup = get_soup(url)
+        soup = BeautifulSoup(self.source_code, self.parse_type)
+        meta_info = {"title": None, "meta_tags": {}}
 
-
-        for attr, values in meta_tag.attrs.items():
-
-            if attr not in meta_tag:
-                meta_tag[attr] = []
-            if values not in meta_tag[attr]:
-                meta_tag[attr].append(values)
-        title_tag = soup.find_all("title")
+        title_tag = soup.find("title")
         if title_tag:
-            meta_info["title"] = title_tag
-
-        for
-            name =
-
-
-
-            meta_info[name] = content
+            meta_info["title"] = title_tag.text
+
+        for meta in soup.find_all('meta'):
+            name = meta.get('name') or meta.get('property')
+            content = meta.get('content')
+            if name and content:
+                meta_info["meta_tags"][name] = content
 
         return meta_info
-
-
+
+    def generate_sitemap(self):
+        """Generates a sitemap.xml file with URLs, images, change frequency, and priority."""
+        urls = self.get_all_website_links()
         with open('sitemap.xml', 'w', encoding='utf-8') as f:
-
-
-
-            string += f'  <url>\n    <loc>{url}</loc>\n'
-            preprocess=[]
-            self.get_new_source_and_url(url=url)
-            links = get_all_attribute_values(url)
-            images = [link for link in links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
+            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+            f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" ')
+            f.write('xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n')
 
-
-
-            try:
-                escaped_img = img.replace('&', '&amp;')
-
-                str_write = f'    <image:image>\n      <image:loc>{escaped_img}</image:loc>\n    </image:image>\n'
-                string += str_write
-            except:
-                pass
-            preprocess.append(str(img).lower())
+            for url in urls:
+                f.write(f'  <url>\n    <loc>{url}</loc>\n')
                 frequency, priority = self.determine_values(url)
-
-
-
-
-
-
-
-
+                f.write(f'    <changefreq>{frequency}</changefreq>\n')
+                f.write(f'    <priority>{priority}</priority>\n')
+
+                images = [img for img in self.extract_links_from_url(url)['images']]
+                for img in images:
+                    escaped_img = img.replace('&', '&amp;')
+                    f.write(f'    <image:image>\n      <image:loc>{escaped_img}</image:loc>\n    </image:image>\n')
+
+                f.write('  </url>\n')
+
+            f.write('</urlset>\n')
 
-
-        for url in urls:
-            print(f"\nDetails for {url}:")
-            classes, meta_img_refs = discover_classes_and_meta_images(url)
-
-            print("\nClasses with href or src attributes:")
-            for class_name in classes:
-                print(f"\t{class_name}")
-
-            print("\nMeta Image References:")
-            for img_ref in meta_img_refs:
-                print(f"\t{img_ref}")
-
-            links = extract_links_from_url(url)
-
-            print("\nImages:")
-            for img in links['images']:
-                print(f"\t{img}")
-
-            print("\nExternal Links:")
-            for ext_link in links['external_links']:
-                print(f"\t{ext_link}")
+        print(f'Sitemap saved to sitemap.xml with {len(urls)} URLs.')
 
 class crawlManagerSingleton():
     _instance = None
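Taken together, the rewritten class reads as a small self-contained crawler: the constructor wires up a urlManager and requestManager and fetches the page, and the new methods return plain dicts and lists. A minimal usage sketch follows; the import path and the example URL are assumptions for illustration, not taken from the diff:

```python
# Sketch only: the import path and URL are illustrative assumptions.
from abstract_webtools.managers.crawlManager import CrawlManager

mgr = CrawlManager(url="https://example.com")   # constructor fetches the page via requestManager
links = mgr.extract_links_from_url()            # {'images': [...], 'external_links': [...]}
meta = mgr.get_meta_info()                      # {'title': ..., 'meta_tags': {...}}
mgr.generate_sitemap()                          # writes sitemap.xml to the current working directory

print(meta["title"])
print(len(links["images"]), "images,", len(links["external_links"]), "external links")
```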
@@ -237,8 +156,9 @@ class crawlManagerSingleton():
             crawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
         return crawlManagerSingleton._instance
 def get_crawl_mgr(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
+
+    url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
     url = get_url(url=url,url_mgr=url_mgr)
-    url_mgr = get_url(url=url,url_mgr=url_mgr)
     req_mgr=get_req_mgr(url=url,url_mgr=url_mgr,source_code=source_code)
     source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr)
     soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,parse_type=parse_type)
@@ -249,3 +169,4 @@ def get_domain_crawl(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_t
     url = get_url(url=url,url_mgr=url_mgr)
     all_domain_links = crawl_mgr.crawl(url=url, max_depth=max_depth, depth=depth)
     return all_domain_links
+get_domain_crawl(url='https://www.tradingview.com/symbols/SOLUSD/')
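This last hunk also adds a module-level `get_domain_crawl(...)` call, so importing the module now performs a crawl of the TradingView URL at import time. A hedged sketch of calling the helper directly; the `max_depth` and `depth` keywords are assumed from the hunk body (the full signature is truncated in the header), and the URL is illustrative:

```python
# Sketch only: max_depth/depth keywords and the URL are assumptions based on the hunk body.
from abstract_webtools.managers.crawlManager import get_domain_crawl

internal_links = get_domain_crawl(url="https://example.com", max_depth=2, depth=1)
for link in internal_links:
    print(link)
```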
abstract_webtools/managers/requestManager.py

@@ -87,7 +87,8 @@ class requestManager:
                  timeout = None,
                  last_request_time=None,
                  max_retries=None,
-                 request_wait_limit=
+                 request_wait_limit=
+                 None):
         self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
         self.url=get_url(url=url,url_mgr=self.url_mgr)
         self._url_mgr = self.url_mgr
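The hunk above only changes how the `request_wait_limit=` default is wrapped in the `requestManager` constructor signature; the neighboring keyword arguments are visible in the context lines and all default to None. A minimal construction sketch with illustrative values; the import path and the semantics of `timeout` are assumptions:

```python
# Sketch only: argument values, the import path, and timeout semantics are assumptions.
from abstract_webtools.managers.requestManager import requestManager

req = requestManager(
    url="https://example.com",
    timeout=10,               # keywords shown in the hunk default to None
    max_retries=3,
    request_wait_limit=1.5,   # the parameter whose default this hunk rewraps
)
print(req.source_code[:200] if req.source_code else "no source fetched")
```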
{abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.6.10
+Version: 0.1.6.11
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/RECORD

@@ -7,13 +7,13 @@ abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE7
 abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f5GoZmw0,3454
 abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
 abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
-abstract_webtools/managers/crawlManager.py,sha256=
+abstract_webtools/managers/crawlManager.py,sha256=ZcCDw_i8OEvfAyRvNu-Krx3kTigrFAFO4ZKMOhiXd9o,7967
 abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
 abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
 abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
 abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
 abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
-abstract_webtools/managers/requestManager.py,sha256=
+abstract_webtools/managers/requestManager.py,sha256=ko07C9igeTr_KtfeijaO126WRVyDw6jkvJhBHlFdwho,17330
 abstract_webtools/managers/seleniumManager.py,sha256=qSY8gH3N5YJIMwE_Alj9HNQRip_PziIo4_T9AZE_FQo,4273
 abstract_webtools/managers/soupManager.py,sha256=7nDB_QKneGjyTZUzchfbdHNvxxYiTyIn8AHon8ObTSY,17148
 abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
@@ -21,8 +21,8 @@ abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_
 abstract_webtools/managers/urlManager.py,sha256=XqMrCM84BeWEfWtHc_8UFpT91ZtG-okzdKdCuC49vsA,8678
 abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
 abstract_webtools/managers/videoDownloader.py,sha256=6G_aLc05BTMUYUWc7iqYtHF_BaR7DnCNK_NJ-QnjsYY,10531
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools-0.1.6.11.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
+abstract_webtools-0.1.6.11.dist-info/METADATA,sha256=X43t8XSI9MPYjaEZQqgCrWgMpty6ZSyrjm317YE4VQ4,15858
+abstract_webtools-0.1.6.11.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+abstract_webtools-0.1.6.11.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.11.dist-info/RECORD,,
{abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/LICENSE: File without changes
{abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/WHEEL: File without changes
{abstract_webtools-0.1.6.10.dist-info → abstract_webtools-0.1.6.11.dist-info}/top_level.txt: File without changes