abstract-webtools 0.1.5.8__py3-none-any.whl → 0.1.5.82__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- abstract_webtools/__init__.py +2 -1
- abstract_webtools/abstract_webtools.py +11 -1772
- abstract_webtools/managers/__init__.py +14 -0
- abstract_webtools/managers/cipherManager.py +38 -0
- abstract_webtools/managers/crawlManager.py +187 -0
- abstract_webtools/managers/domainManager.py +48 -0
- abstract_webtools/managers/dynamicRateLimiter.py +138 -0
- abstract_webtools/managers/linkManager.py +189 -0
- abstract_webtools/managers/mySocketClient.py +46 -0
- abstract_webtools/managers/networkManager.py +15 -0
- abstract_webtools/managers/requestManager.py +348 -0
- abstract_webtools/managers/seleniumManager.py +85 -0
- abstract_webtools/managers/soupManager.py +313 -0
- abstract_webtools/managers/sslManager.py +21 -0
- abstract_webtools/managers/tlsAdapter.py +27 -0
- abstract_webtools/managers/urlManager.py +225 -0
- abstract_webtools/managers/userAgentManager.py +42 -0
- abstract_webtools/managers/videoDownloader.py +205 -0
- abstract_webtools/url_grabber.py +1 -1
- {abstract_webtools-0.1.5.8.dist-info → abstract_webtools-0.1.5.82.dist-info}/METADATA +6 -6
- abstract_webtools-0.1.5.82.dist-info/RECORD +28 -0
- {abstract_webtools-0.1.5.8.dist-info → abstract_webtools-0.1.5.82.dist-info}/WHEEL +1 -1
- abstract_webtools-0.1.5.8.dist-info/RECORD +0 -12
- {abstract_webtools-0.1.5.8.dist-info → abstract_webtools-0.1.5.82.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.5.8.dist-info → abstract_webtools-0.1.5.82.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/__init__.py
@@ -0,0 +1,14 @@
+from .cipherManager import *
+from .crawlManager import *
+from .dynamicRateLimiter import *
+from .linkManager import *
+from .mySocketClient import *
+from .networkManager import *
+from .requestManager import *
+from .soupManager import *
+from .sslManager import *
+from .tlsAdapter import *
+from .urlManager import *
+from .userAgentManager import *
+from .videoDownloader import *
+from .seleniumManager import *
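Because the package `__init__` re-exports every manager module, downstream code can pull any manager from a single namespace. A minimal sketch using classes defined in the hunks below:

    from abstract_webtools.managers import CipherManager, CrawlManagerSingleton

    ciphers = CipherManager().ciphers_string                          # default cipher string
    crawler = CrawlManagerSingleton.get_instance(url="https://example.com")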
abstract_webtools/managers/cipherManager.py
@@ -0,0 +1,38 @@
+class CipherManager:
+    @staticmethod
+    def get_default_ciphers() -> list:
+        return [
+            "ECDHE-RSA-AES256-GCM-SHA384", "ECDHE-ECDSA-AES256-GCM-SHA384",
+            "ECDHE-RSA-AES256-SHA384", "ECDHE-ECDSA-AES256-SHA384",
+            "ECDHE-RSA-AES256-SHA", "ECDHE-ECDSA-AES256-SHA",
+            "ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-SHA256",
+            "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES128-SHA256",
+            "AES256-SHA", "AES128-SHA"
+        ]
+
+    def __init__(self, cipher_list=None):
+        if cipher_list is None:
+            cipher_list = self.get_default_ciphers()
+        self.cipher_list = cipher_list
+        self.create_list()
+        self.ciphers_string = self.add_string_list()
+    def add_string_list(self):
+        if len(self.cipher_list) == 0:
+            return ''
+        return ','.join(self.cipher_list)
+    def create_list(self):
+        # Normalize cipher_list: None -> empty list, comma-separated string -> list.
+        if self.cipher_list is None:
+            self.cipher_list = []
+        elif isinstance(self.cipher_list, str):
+            self.cipher_list = self.cipher_list.split(',')
+        if isinstance(self.cipher_list, str):
+            self.cipher_list = [self.cipher_list]
+class CipherManagerSingleton:
+    _instance = None
+    @staticmethod
+    def get_instance(cipher_list=None):
+        if CipherManagerSingleton._instance is None:
+            CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
+        elif CipherManagerSingleton._instance.cipher_list != cipher_list:
+            CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
+        return CipherManagerSingleton._instance
abstract_webtools/managers/crawlManager.py
@@ -0,0 +1,187 @@
+from ..abstract_webtools import *   # assumed import; sibling manager modules follow this pattern
+from .requestManager import *
+class CrawlManager:
+    def __init__(self, url=None, source_code=None, parse_type="html.parser"):
+        self.url = url
+        self.source_code = source_code
+        self.parse_type = parse_type
+        self.visited = set()         # used by crawl(); never initialized in the released code
+        self.all_site_links = []     # used by generate_sitemap(); never initialized in the released code
+        self.get_new_source_and_url(url)
+    def get_new_source_and_url(self, url=None):
+        if url is None:
+            url = self.url
+        self.request_mgr = requestManager(url=url)  # assumed constructor; the class ships in requestManager.py
+        self.response = self.request_mgr.response
+        self.source_code = self.request_mgr.source_code
+        self.soup = BeautifulSoup(self.source_code, self.parse_type)  # assumed available via the package root import
+    def get_classes_and_meta_info(self):
+        tag_name, class_name_1, class_name_2, class_value = 'meta', 'class', 'property', 'og:image'
+        attrs = 'href', 'src'
+        unique_classes, images = self.discover_classes_and_images(tag_name, class_name_1, class_name_2, class_value, attrs)
+        return unique_classes, images
+    def extract_links_from_url(self):
+        """
+        Extracts all href and src links from the current page's source code.
+
+        Returns:
+            dict: Dictionary containing image links and external links under the parent page.
+        """
+        agg_js = {'images': [], 'external_links': []}
+        if self.response is not None:
+            attrs = 'href', 'src'
+            links = [[], []]
+            for i, each in enumerate(attrs):
+                links[i] = [a[attrs[i]] for a in self.get_find_all_with_attributes(attrs[i])]
+            # Convert all links to absolute links
+            absolute_links = [urljoin(self.url, link) for link in links[0] + links[1]]
+            # Separate images and external links
+            images = [link for link in absolute_links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
+            external_links = [link for link in absolute_links if urlparse(link).netloc != urlparse(self.url).netloc]
+            agg_js['images'] = images
+            agg_js['external_links'] = external_links
+        return agg_js
+
+    @staticmethod
+    def correct_xml(xml_string):
+        # Parse the XML string
+        root = ET.fromstring(xml_string)
+        # Loop through each <image:loc> element and correct its text if needed
+        for image_loc in root.findall(".//image:loc", namespaces={'image': 'http://www.google.com/schemas/sitemap-image/1.1'}):
+            # Escape bare '&' as '&amp;' in the element's text
+            if '&' in image_loc.text:
+                image_loc.text = image_loc.text.replace('&', '&amp;')
+        # Convert the corrected XML back to a string
+        corrected_xml = ET.tostring(root, encoding='utf-8').decode('utf-8')
+        return corrected_xml
+
+    def determine_values(self):
+        # This is just a mockup. In a real application, you'd analyze the URL or its content.
+        # Assuming a blog site
+        if 'blog' in self.url:
+            if '2023' in self.url:  # Assuming it's a current-year article
+                return ('weekly', '0.8')
+            else:
+                return ('monthly', '0.6')
+        elif 'contact' in self.url:
+            return ('yearly', '0.3')
+        else:  # Homepage or main categories
+            return ('weekly', '1.0')
+    def crawl(self, url, max_depth=3, depth=1):
+        if depth > max_depth:
+            return []
+        if url in self.visited:
+            return []
+        self.visited.add(url)
+        try:
+            links = [a['href'] for a in self.soup.find_all('a', href=True)]
+            valid_links = []
+            for link in links:
+                parsed_link = urlparse(link)
+                base_url = "{}://{}".format(parsed_link.scheme, parsed_link.netloc)
+                if base_url == url:  # Avoiding external URLs
+                    final_link = urljoin(url, parsed_link.path)
+                    if final_link not in valid_links:
+                        valid_links.append(final_link)
+            for link in valid_links:
+                self.crawl(link, max_depth, depth + 1)
+            return valid_links
+        except Exception as e:
+            print(f"Error crawling {url}: {e}")
+            return []
+
+    # Define or import required functions here, like get_all_website_links, determine_values,
+    # discover_classes_and_meta_images, and extract_links_from_url.
+    def get_meta_info(self):
+        meta_info = {}
+        # Fetch the title if available
+        title_tag = parse_title()
+        if title_tag:
+            meta_info["title"] = title_tag
+        # Fetch meta tags
+        for meta_tag in self.soup.find_all('meta'):
+            name = meta_tag.get('name') or meta_tag.get('property')
+            if name:
+                content = meta_tag.get('content')
+                if content:
+                    meta_info[name] = content
+        return meta_info
+    def generate_sitemap(self, domain):
+        with open('sitemap.xml', 'w', encoding='utf-8') as f:
+            string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n'
+            for url in self.all_site_links:
+                string += f'  <url>\n    <loc>{url}</loc>\n'
+                preprocess = []
+                self.get_new_source_and_url(url=url)
+                links = self.extract_links_from_url()
+                for img in links['images']:
+                    if str(img).lower() not in preprocess:
+                        try:
+                            escaped_img = img.replace('&', '&amp;')
+                            str_write = f'    <image:image>\n      <image:loc>{escaped_img}</image:loc>\n    </image:image>\n'
+                            string += str_write
+                        except Exception:
+                            pass
+                        preprocess.append(str(img).lower())
+                frequency, priority = self.determine_values()
+                string += f'    <changefreq>{frequency}</changefreq>\n'
+                string += f'    <priority>{priority}</priority>\n'
+                string += f'  </url>\n'
+            string += '</urlset>\n'
+            f.write(string)
+        # Output summary
+        print(f'Sitemap saved to sitemap.xml with {len(self.all_site_links)} URLs.')
+
+        # Output class and link details
+        for url in self.all_site_links:
+            print(f"\nDetails for {url}:")
+            classes, meta_img_refs = discover_classes_and_meta_images(url)
+
+            print("\nClasses with href or src attributes:")
+            for class_name in classes:
+                print(f"\t{class_name}")
+
+            print("\nMeta Image References:")
+            for img_ref in meta_img_refs:
+                print(f"\t{img_ref}")
+
+            links = self.extract_links_from_url()
+
+            print("\nImages:")
+            for img in links['images']:
+                print(f"\t{img}")
+
+            print("\nExternal Links:")
+            for ext_link in links['external_links']:
+                print(f"\t{ext_link}")
+class CrawlManagerSingleton:
+    _instance = None
+    @staticmethod
+    def get_instance(url=None, source_code=None, parse_type="html.parser"):
+        if CrawlManagerSingleton._instance is None:
+            CrawlManagerSingleton._instance = CrawlManager(url=url, parse_type=parse_type, source_code=source_code)
+        elif parse_type != CrawlManagerSingleton._instance.parse_type or url != CrawlManagerSingleton._instance.url or source_code != CrawlManagerSingleton._instance.source_code:
+            CrawlManagerSingleton._instance = CrawlManager(url=url, parse_type=parse_type, source_code=source_code)
+        return CrawlManagerSingleton._instance
abstract_webtools/managers/domainManager.py
@@ -0,0 +1,48 @@
+from ..abstract_webtools import *
+class domainManager(metaclass=SingletonMeta):
+    def __init__(self, url):
+        if not hasattr(self, 'initialized'):  # Prevent reinitialization
+            self.initialized = True
+            parsed_url = urlparse(url)
+            self.domain = parsed_url.netloc
+            self.scheme = parsed_url.scheme
+            self.drivers = {}    # read and written by get_driver(); never initialized in the released code
+            self.page_type = []  # appended to by get_url_to_path(); never initialized in the released code
+    def get_url_to_path(self, url):
+        url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/'])
+        parsed_url = urlparse(url)
+        if 'data:image' in url:
+            input(url)  # debug pause left in the released code
+        if parsed_url.netloc == self.domain:
+            paths = parsed_url.path.split('/')
+            dir_path = self.site_dir  # site_dir must be assigned by the caller; it is never set in __init__
+            for path in paths[:-1]:
+                dir_path = os.path.join(dir_path, path)
+                os.makedirs(dir_path, exist_ok=True)
+            #if 'svg' in url:
+            #    input(url)
+            #    dir_path = get_image_name('contents',directory=dir_path,ext='png',url=item_url)
+
+            self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
+
+            dir_path = os.path.join(dir_path, paths[-1])
+            return dir_path
+
+    def saved_url_check(self, url):
+        path = self.get_url_to_path(url)
+        return path
+
+    def get_with_netloc(self, url):
+        parsed_url = urlparse(url)
+        if parsed_url.netloc == '':
+            url = f"{self.scheme}://{self.domain}/{url.strip()}"
+        return url
+
+    def get_driver(self, url):
+        if url and url not in self.drivers:
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            driver = webdriver.Chrome(options=chrome_options)
+            self.drivers[url] = driver
+            driver.get(url)
+        return self.drivers[url]
abstract_webtools/managers/dynamicRateLimiter.py
@@ -0,0 +1,138 @@
+from ..abstract_webtools import *  # assumed: provides requests, time, and get_time_stamp, as in sibling managers
+class DynamicRateLimiterManager:
+    def __init__(self, service_name='ethereum', low_limit=10, high_limit=30, limit_epoch=60, starting_tokens=10, epoch_cycle_adjustment=True):
+        # The limit arguments are forwarded to add_service so the singleton below can pass them through.
+        self.services = {}
+        self.service_name = service_name
+        self.add_service(service_name, low_limit=low_limit, high_limit=high_limit, limit_epoch=limit_epoch, starting_tokens=starting_tokens, epoch_cycle_adjustment=epoch_cycle_adjustment)
+
+    def add_service(self, service_name="default", low_limit=10, high_limit=30, limit_epoch=60, starting_tokens=10, epoch_cycle_adjustment=True):
+        if service_name in self.services:
+            print(f"Service {service_name} already exists!")
+            return
+        self.services[service_name] = DynamicRateLimiter(low_limit=low_limit, high_limit=high_limit, limit_epoch=limit_epoch, starting_tokens=starting_tokens, epoch_cycle_adjustment=epoch_cycle_adjustment)
+
+    def request(self, request_url, service_name=None):
+        service_name = service_name or self.service_name
+        if service_name not in self.services:
+            self.add_service(service_name)
+
+        limiter = self.services[service_name]
+
+        while True:
+            if limiter.request():
+                response = requests.get(request_url)  # Actual request
+                if response.status_code == 200:
+                    limiter.request_tracker(True)
+                    return response.json()
+                elif response.status_code == 429:
+                    limiter.request_tracker(False)
+                    print(f"Rate limited by {service_name}. Adjusting limit and retrying...")
+                    time.sleep(limiter.get_sleep()["current_sleep"])
+                else:
+                    print(f"Unexpected response: {response.status_code}. Message: {response.text}")
+                    return None
+            else:
+                print(f"Rate limit reached for {service_name}. Waiting for the next epoch...")
+                time.sleep(limiter.get_sleep()["current_sleep"])
+
+    def log_request(self, service_name, success):
+        print(f"[{service_name}] Request {'succeeded' if success else 'denied'}. Current tokens: {self.services[service_name].get_current_tokens()}")
+
+class DynamicRateLimiter:
+    def __init__(self, low_limit, high_limit, limit_epoch, starting_tokens=None, epoch_cycle_adjustment: int = None):
+        self.low_limit = low_limit
+        self.high_limit = high_limit
+        self.limit_epoch = limit_epoch  # in seconds
+        self.request_status_json = {"succesful": [], "unsuccesful": [], "last_requested": get_time_stamp(), "first_requested": get_time_stamp(), "epoch_left": self.limit_epoch, "last_fail": get_time_stamp(), "count_since_fail": 0}
+        self.current_limit = starting_tokens or low_limit  # Default to low_limit if starting_tokens isn't provided
+        self.epoch_cycle_adjustment = epoch_cycle_adjustment
+        # Additional attributes for tracking adjustment logic
+        self.last_adjusted_time = get_time_stamp()
+        self.successful_epochs_since_last_adjustment = 0
+        self.request_count_in_current_epoch = 0
+
+    def _refill_tokens(self):
+        time_since_last_request = get_time_stamp() - self.request_status_json["last_requested"]
+        new_tokens = (time_since_last_request / self.limit_epoch) * self.current_limit  # computed but unused in this version
+        self.tokens = min(self.current_limit, self.get_current_tokens())
+    def request_tracker(self, success):
+        if success:
+            self.request_status_json["succesful"].append(get_time_stamp())
+        else:
+            self.request_status_json["unsuccesful"].append(get_time_stamp())
+            self.request_status_json["last_fail"] = get_time_stamp()
+            self.request_status_json["count_since_fail"] = 0
+            self.adjust_limit()
+        self.request_status_json["last_requested"] = get_time_stamp()
+    def calculate_tokens(self):
+        # Keep only the timestamps that fall inside the current epoch window.
+        successful = []
+        for each in self.request_status_json["succesful"]:
+            if (get_time_stamp() - each) < self.limit_epoch:
+                successful.append(each)
+        self.request_status_json["succesful"] = successful
+        unsuccessful = []
+        for each in self.request_status_json["unsuccesful"]:
+            if (get_time_stamp() - each) < self.limit_epoch:
+                unsuccessful.append(each)
+        self.request_status_json["unsuccesful"] = unsuccessful
+        if len(successful) == 0 and len(unsuccessful) == 0:
+            pass
+        elif len(successful) != 0 and len(unsuccessful) == 0:
+            self.request_status_json["first_requested"] = successful[0]
+        elif len(successful) == 0 and len(unsuccessful) != 0:
+            self.request_status_json["first_requested"] = unsuccessful[0]
+        else:
+            self.request_status_json["first_requested"] = min(unsuccessful[0], successful[0])
+        self.request_status_json["epoch_left"] = self.limit_epoch - (self.request_status_json["last_requested"] - self.request_status_json["first_requested"])
+        return self.request_status_json
+    def get_current_tokens(self):
+        self.request_status_json = self.calculate_tokens()
+        total_requests = len(self.request_status_json["succesful"]) + len(self.request_status_json["unsuccesful"])
+        return max(0, self.current_limit - total_requests)
+    def get_sleep(self):
+        self.request_status_json = self.calculate_tokens()
+        self.request_status_json["current_sleep"] = self.request_status_json["epoch_left"] / max(1, self.get_current_tokens())
+        return self.request_status_json
+    def request(self):
+        self._refill_tokens()
+        if self.tokens > 0:
+            return True  # The request can be made
+        else:
+            if self.tokens == 0:
+                self.request_status_json["count_since_fail"] += 1
+                if self.epoch_cycle_adjustment is not None:
+                    if self.request_status_json["count_since_fail"] >= self.epoch_cycle_adjustment:
+                        self.current_limit = min(self.current_limit + 1, self.high_limit)
+            return False  # The request cannot be made
+    def _adjust_limit(self):
+        current_time = get_time_stamp()
+        if current_time - self.last_adjusted_time >= self.limit_epoch:
+            if len(self.calculate_tokens()["succesful"]) >= self.tokens:  # was clear_epoch(), which is not defined in this file
+                # We hit the rate limit this epoch, decrease our limit
+                self.tokens = max(1, self.tokens - 1)
+            else:
+                self.successful_epochs_since_last_adjustment += 1
+                if self.successful_epochs_since_last_adjustment >= 5:
+                    # We've had 5 successful epochs, increase our limit
+                    self.current_limit = min(self.high_limit, self.tokens + 1)
+                    self.successful_epochs_since_last_adjustment = 0
+
+            # Reset our counters for the new epoch
+            self.last_adjusted_time = current_time
+            self.request_count_in_current_epoch = 0
+    def adjust_limit(self):
+        # Set the tokens to the count of successful requests within the epoch
+        self.tokens = len(self.calculate_tokens()["succesful"])
+
+        # Adjust the current_limit down to that count
+        self.current_limit = self.tokens
+
+        # Log the adjustment
+        print(f"Adjusted tokens to: {self.tokens} and current_limit to: {self.current_limit}")
+class DynamicRateLimiterManagerSingleton:
+    _instance = None
+    @staticmethod
+    def get_instance(service_name="default", low_limit=10, high_limit=30, limit_epoch=60, starting_tokens=10, epoch_cycle_adjustment=True):
+        if DynamicRateLimiterManagerSingleton._instance is None:
+            DynamicRateLimiterManagerSingleton._instance = DynamicRateLimiterManager(service_name=service_name, low_limit=low_limit, high_limit=high_limit, limit_epoch=limit_epoch, starting_tokens=starting_tokens, epoch_cycle_adjustment=epoch_cycle_adjustment)
+        return DynamicRateLimiterManagerSingleton._instance
abstract_webtools/managers/linkManager.py
@@ -0,0 +1,189 @@
+from ..abstract_webtools import *
+from .urlManager import *
+from .requestManager import *
+from .soupManager import *
+def get_url(url=None, url_mgr=None):
+    if not url and not url_mgr:
+        return None
+    if url:
+        url_mgr = urlManager(url)
+    return url_mgr.url
+def get_url_mgr(url=None, url_mgr=None):
+    if not url and not url_mgr:
+        return None
+    if url:
+        url_mgr = urlManager(url)
+    return url_mgr
+class linkManager:
+    """
+    linkManager manages and extracts links and image links from a web page.
+
+    Args:
+        url (str or None): The URL of the web page (default is None).
+        source_code (str or None): The source code of the web page (default is None).
+        url_mgr (urlManager or None): An instance of urlManager (default is None).
+        req_mgr (requestManager or None): An instance of requestManager (default is None).
+        soup_manager (SoupManager or None): An instance of SoupManager (default is None).
+        image_link_tags (str): HTML tags that identify image links (default is 'img').
+        img_link_attrs (str): HTML attributes that identify image link URLs (default is 'src').
+        link_tags (str): HTML tags that identify links (default is 'a').
+        link_attrs (str): HTML attributes that identify link URLs (default is 'href').
+        strict_order_tags (bool): Whether tags and attributes must be matched in strict order (default is False).
+        img_attr_value_desired (list or None): Desired attribute values for image links (default is None).
+        img_attr_value_undesired (list or None): Undesired attribute values for image links (default is None).
+        link_attr_value_desired (list or None): Desired attribute values for links (default is None).
+        link_attr_value_undesired (list or None): Undesired attribute values for links (default is None).
+        associated_data_attr (list): HTML attributes to associate with the extracted links (default is ["data-title", 'alt', 'title']).
+        get_img (list): HTML attributes used to identify associated images (default is ["data-title", 'alt', 'title']).
+
+    Methods:
+        re_initialize(): Reinitialize the linkManager with the current settings.
+        update_url_mgr(url_mgr): Update the URL manager with a new instance.
+        update_url(url): Update the URL and reinitialize.
+        update_source_code(source_code): Update the source code and reinitialize.
+        update_soup_manager(soup_manager): Update the SoupManager and reinitialize.
+        update_desired(...): Update the desired settings and reinitialize.
+        find_all_desired(...): Find all desired links or image links matching the criteria.
+        find_all_domain(): Find all same-domain URLs among the extracted links.
+
+    Note:
+        - The class helps manage and extract links and image links from web pages.
+        - It provides flexibility in specifying criteria for link extraction.
+    """
+    def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_manager=None, image_link_tags='img', img_link_attrs='src', link_tags='a', link_attrs='href', strict_order_tags=False, img_attr_value_desired=None, img_attr_value_undesired=None, link_attr_value_desired=None, link_attr_value_undesired=None, associated_data_attr=["data-title", 'alt', 'title'], get_img=["data-title", 'alt', 'title']):
+        self.url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
+        self.url = self.url_mgr.url
+        self.req_mgr = get_req_mgr(req_mgr=req_mgr, url=self.url, url_mgr=self.url_mgr, source_code=source_code)
+        self.source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
+        self.soup_mgr = get_soup_mgr(req_mgr=self.req_mgr, url_mgr=self.url_mgr, source_code=source_code)
+
+        self.strict_order_tags = strict_order_tags
+        self.image_link_tags = image_link_tags
+        self.img_link_attrs = img_link_attrs
+        self.link_tags = link_tags
+        self.link_attrs = link_attrs
+        self.img_attr_value_desired = img_attr_value_desired
+        self.img_attr_value_undesired = img_attr_value_undesired
+        self.link_attr_value_desired = link_attr_value_desired
+        self.link_attr_value_undesired = link_attr_value_undesired
+        self.associated_data_attr = associated_data_attr
+        self.get_img = get_img
+        self.all_desired_image_links = self.find_all_desired_links(tag=self.image_link_tags,
+                                                                   attr=self.img_link_attrs,
+                                                                   attr_value_desired=self.img_attr_value_desired,
+                                                                   attr_value_undesired=self.img_attr_value_undesired)
+        self.all_desired_links = self.find_all_desired_links(tag=self.link_tags,
+                                                             attr=self.link_attrs,
+                                                             attr_value_desired=self.link_attr_value_desired,
+                                                             attr_value_undesired=self.link_attr_value_undesired,
+                                                             associated_data_attr=self.associated_data_attr,
+                                                             get_img=get_img)
+    def re_initialize(self):
+        self.all_desired_image_links = self.find_all_desired_links(tag=self.image_link_tags, attr=self.img_link_attrs, strict_order_tags=self.strict_order_tags, attr_value_desired=self.img_attr_value_desired, attr_value_undesired=self.img_attr_value_undesired)
+        self.all_desired_links = self.find_all_desired_links(tag=self.link_tags, attr=self.link_attrs, strict_order_tags=self.strict_order_tags, attr_value_desired=self.link_attr_value_desired, attr_value_undesired=self.link_attr_value_undesired, associated_data_attr=self.associated_data_attr, get_img=self.get_img)
+    def update_url_mgr(self, url_mgr):
+        self.url_mgr = url_mgr
+        self.url = self.url_mgr.url
+        self.req_mgr.update_url_mgr(url_mgr=self.url_mgr)
+        self.soup_mgr.update_url_mgr(url_mgr=self.url_mgr)
+        self.source_code = self.soup_mgr.source_code
+        self.re_initialize()
+    def update_url(self, url):
+        self.url = url
+        self.url_mgr.update_url(url=self.url)
+        self.url = self.url_mgr.url
+        self.req_mgr.update_url(url=self.url)
+        self.soup_mgr.update_url(url=self.url)
+        self.source_code = self.soup_mgr.source_code
+        self.re_initialize()
+    def update_source_code(self, source_code):
+        self.source_code = source_code
+        if self.source_code != self.soup_mgr.source_code:
+            self.soup_mgr.update_source_code(source_code=self.source_code)
+        self.re_initialize()
+    def update_soup_manager(self, soup_manager):
+        self.soup_mgr = soup_manager
+        self.source_code = self.soup_mgr.source_code
+        self.re_initialize()
+    def update_desired(self, img_attr_value_desired=None, img_attr_value_undesired=None, link_attr_value_desired=None, link_attr_value_undesired=None, image_link_tags=None, img_link_attrs=None, link_tags=None, link_attrs=None, strict_order_tags=None, associated_data_attr=None, get_img=None):
+        self.strict_order_tags = strict_order_tags or self.strict_order_tags
+        self.img_attr_value_desired = img_attr_value_desired or self.img_attr_value_desired
+        self.img_attr_value_undesired = img_attr_value_undesired or self.img_attr_value_undesired
+        self.link_attr_value_desired = link_attr_value_desired or self.link_attr_value_desired
+        self.link_attr_value_undesired = link_attr_value_undesired or self.link_attr_value_undesired
+        self.image_link_tags = image_link_tags or self.image_link_tags
+        self.img_link_attrs = img_link_attrs or self.img_link_attrs
+        self.link_tags = link_tags or self.link_tags
+        self.link_attrs = link_attrs or self.link_attrs
+        self.associated_data_attr = associated_data_attr or self.associated_data_attr
+        self.get_img = get_img or self.get_img
+        self.re_initialize()
+    def find_all_desired(self, tag='img', attr='src', strict_order_tags=False, attr_value_desired=None, attr_value_undesired=None, associated_data_attr=None, get_img=None):
+        def make_list(obj):
+            if isinstance(obj, list) or obj is None:
+                return obj
+            return [obj]
+        def get_desired_value(attr, attr_value_desired=None, attr_value_undesired=None):
+            if attr_value_desired:
+                for value in attr_value_desired:
+                    if value not in attr:
+                        return False
+            if attr_value_undesired:
+                for value in attr_value_undesired:
+                    if value in attr:
+                        return False
+            return True
+        attr_value_desired, attr_value_undesired, associated_data_attr, tags, attribs = make_list(attr_value_desired), make_list(attr_value_undesired), make_list(associated_data_attr), make_list(tag), make_list(attr)
+        desired_ls = []
+        associated_data = []
+        for i, tag in enumerate(tags):
+            attribs_list = attribs
+            if strict_order_tags:
+                if len(attribs) <= i:
+                    attribs_list = [None]
+                else:
+                    attribs_list = make_list(attribs[i])
+            for attr in attribs_list:
+                for component in self.soup_mgr.soup.find_all(tag):
+                    if attr in component.attrs and get_desired_value(attr=component[attr], attr_value_desired=attr_value_desired, attr_value_undesired=attr_value_undesired):
+                        if component[attr] not in desired_ls:
+                            desired_ls.append(component[attr])
+                            associated_data.append({"value": component[attr]})
+                            if associated_data_attr:
+                                for data in associated_data_attr:
+                                    if data in component.attrs:
+                                        associated_data[-1][data] = component.attrs[data]
+                                        if get_img and component.attrs[data]:
+                                            if data in get_img and len(component.attrs[data]) != 0:
+                                                for each in self.soup_mgr.soup.find_all('img'):
+                                                    if 'alt' in each.attrs:
+                                                        if each.attrs['alt'] == component.attrs[data] and 'src' in each.attrs:
+                                                            associated_data[-1]['image'] = each.attrs['src']
+        desired_ls.append(associated_data)  # the last element carries the associated-attribute dicts
+        return desired_ls
+    def find_all_domain(self):
+        domain = urlparse(self.url_mgr.url).netloc
+        domains_ls = [self.url_mgr.url]
+        for url in self.all_desired_links[:-1]:
+            if self.url_mgr.is_valid_url(url):
+                parse = urlparse(url)
+                comp_domain = parse.netloc
+                if url not in domains_ls and comp_domain == domain:
+                    domains_ls.append(url)
+        return domains_ls
+    def find_all_desired_links(self, tag='img', attr='src', attr_value_desired=None, strict_order_tags=False, attr_value_undesired=None, associated_data_attr=None, all_desired=None, get_img=None):
+        all_desired = all_desired or self.find_all_desired(tag=tag, attr=attr, strict_order_tags=strict_order_tags, attr_value_desired=attr_value_desired, attr_value_undesired=attr_value_undesired, associated_data_attr=associated_data_attr, get_img=get_img)
+        associated_attrs = all_desired[-1]
+        valid_associated_attrs = []
+        desired_links = []
+        for i, attr in enumerate(all_desired[:-1]):
+            valid_attr = self.url_mgr.make_valid(attr, self.url_mgr.protocol + '://' + self.url_mgr.domain)
+            if valid_attr:
+                desired_links.append(valid_attr)
+                valid_associated_attrs.append(associated_attrs[i])
+                valid_associated_attrs[-1]["link"] = valid_attr
+        desired_links.append(valid_associated_attrs)
+        return desired_links
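A short usage sketch; the trailing element of each result list holds the associated-attribute dicts, so the links themselves are everything before it:

    from abstract_webtools.managers import linkManager

    lm = linkManager(url="https://example.com")
    page_links = lm.all_desired_links[:-1]
    image_links = lm.all_desired_image_links[:-1]
    same_domain = lm.find_all_domain()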
abstract_webtools/managers/mySocketClient.py
@@ -0,0 +1,46 @@
+import json
+import socket
+class MySocketClient:
+    def __init__(self, ip_address=None, port=None, domain=None):
+        self.sock = None  # created on demand by toggle_sock(); the released code left this statement bare
+        self.ip_address = ip_address
+        self.port = port
+        self.domain = domain
+    def receive_data(self):
+        chunks = []
+        while True:
+            chunk = self.sock.recv(4096)
+            if chunk:
+                chunks.append(chunk)
+            else:
+                break
+        return b''.join(chunks).decode('utf-8')
+    def _parse_socket_response_as_json(self, data, *args, **kwargs):
+        # Slice out the outermost JSON object embedded in the raw response.
+        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
+    def process_data(self):
+        data = self.receive_data()
+        return self._parse_socket_response_as_json(data)
+    def _parse_json(self, json_string):
+        return json.loads(json_string)
+    def get_ip(self, domain=None):
+        try:
+            return socket.gethostbyname(domain if domain is not None else self.domain)
+        except socket.gaierror:
+            return None
+    def get_host_name(self, ip_address=None):
+        return socket.gethostbyaddr(ip_address if ip_address is not None else self.ip_address)
+    def toggle_sock(self):
+        # Close an open socket, or open and connect a new one.
+        if self.sock is not None:
+            self.sock.close()
+            self.sock = None
+        else:
+            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            if self.ip_address and self.port:  # the released code referenced undefined host/port names here
+                self.sock.connect((self.ip_address, self.port))
+class MySocketClientSingleton:
+    _instance = None
+    @staticmethod
+    def get_instance(ip_address='localhost', port=22, domain="example.com"):
+        if MySocketClientSingleton._instance is None:
+            MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address, port=port, domain=domain)
+        elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or MySocketClientSingleton._instance.domain != domain:
+            MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address, port=port, domain=domain)
+        return MySocketClientSingleton._instance
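A hedged sketch of the intended flow (host and port are placeholders):

    from abstract_webtools.managers.mySocketClient import MySocketClientSingleton

    client = MySocketClientSingleton.get_instance(ip_address="127.0.0.1", port=9000, domain="example.com")
    client.toggle_sock()             # opens and connects the socket
    payload = client.process_data()  # reads until EOF, parses the embedded JSON object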