abstract-webtools 0.1.5.81__tar.gz → 0.1.5.82__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/PKG-INFO +5 -1
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/setup.py +1 -1
- abstract_webtools-0.1.5.82/src/abstract_webtools/__init__.py +3 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/abstract_webtools.py +100 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/__init__.py +14 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/cipherManager.py +38 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/crawlManager.py +187 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/domainManager.py +48 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/dynamicRateLimiter.py +138 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/linkManager.py +189 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/mySocketClient.py +46 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/networkManager.py +15 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/requestManager.py +348 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/seleniumManager.py +85 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/soupManager.py +313 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/sslManager.py +21 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/tlsAdapter.py +27 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/urlManager.py +225 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/userAgentManager.py +42 -0
- abstract_webtools-0.1.5.82/src/abstract_webtools/managers/videoDownloader.py +205 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools/url_grabber.py +1 -1
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools.egg-info/PKG-INFO +6 -2
- abstract_webtools-0.1.5.82/src/abstract_webtools.egg-info/SOURCES.txt +33 -0
- abstract_webtools-0.1.5.81/src/abstract_webtools/__init__.py +0 -2
- abstract_webtools-0.1.5.81/src/abstract_webtools/abstract_webtools.py +0 -1852
- abstract_webtools-0.1.5.81/src/abstract_webtools.egg-info/SOURCES.txt +0 -17
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/LICENSE +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/README.md +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/pyproject.toml +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/setup.cfg +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools.egg-info/requires.txt +2 -2
- {abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.5.81
+Version: 0.1.5.82
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
@@ -13,6 +13,10 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: abstract_utilities>=0.2.2.30
+Requires-Dist: PySimpleGUI>=4.60.5
+Requires-Dist: urllib3>=2.0.4
+Requires-Dist: requests>=2.31.0
 
 # Abstract WebTools
 Provides utilities for inspecting and parsing web content, including React components and URL utilities, with enhanced capabilities for managing HTTP requests and TLS configurations.
{abstract_webtools-0.1.5.81 → abstract_webtools-0.1.5.82}/setup.py

@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.5.81',
+    version='0.1.5.82',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
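
The four new Requires-Dist entries are the only metadata change besides the version bump. If it helps to verify them after an upgrade, the same metadata that PKG-INFO carries can be read back with the standard library; this is an illustrative check, not part of the package.

```python
# Hypothetical post-install check: importlib.metadata reads the installed
# distribution's metadata, which mirrors the PKG-INFO shown in this diff.
from importlib.metadata import version, requires

print(version("abstract_webtools"))          # expected: 0.1.5.82
for req in requires("abstract_webtools"):    # abstract_utilities, PySimpleGUI, urllib3, requests
    print(req)
```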
abstract_webtools-0.1.5.82/src/abstract_webtools/abstract_webtools.py (new file)

@@ -0,0 +1,100 @@
+"""
+# `abstract_webtools.py` Documentation
+
+This script, `abstract_webtools.py`, is a component of the `abstract_webtools` module and is a part of the `abstract_essentials` package. It provides a set of tools and functions to interact with and parse web content.
+
+## Contents
+
+1. **Imports**
+   - Essential modules and classes for web requests, SSL configurations, and URL parsing are imported at the beginning.
+
+2. **Core Functions**
+
+   - `get_status(url: str) -> int or None`:
+     Fetches the HTTP status code for a given URL.
+
+   - `clean_url(url: str) -> list`:
+     Returns variations of the given URL with different protocols.
+
+   - `get_correct_url(url: str, session: requests.Session) -> str or None`:
+     Identifies the correct URL from possible variations using HTTP requests.
+
+   - `try_request(url: str, session: requests.Session) -> requests.Response or None`:
+     Attempts to make an HTTP request to a given URL.
+
+   - `is_valid(url: str) -> bool`:
+     Validates if a given URL is structurally correct.
+
+   - `desktop_user_agents() -> list`:
+     Returns a list of popular desktop user-agent strings.
+
+   - `get_user_agent(user_agent: str) -> dict`:
+     Returns a dictionary containing the user-agent header.
+
+3. **TLSAdapter Class**
+
+   A custom HTTPAdapter class that manages SSL options and ciphers for web requests.
+
+   - `TLSAdapter.__init__(self, ssl_options: int)`:
+     Initializes the adapter with specific SSL options.
+
+   - Several methods to handle cipher strings, creation of cipher strings, and initialization of the pool manager with custom SSL configurations.
+
+4. **Advanced Web Functions**
+
+   - `get_Source_code(url: str, user_agent: str) -> str or None`:
+     Retrieves the source code of a website with a custom user-agent.
+
+   - `parse_react_source(url: str) -> list`:
+     Extracts JavaScript and JSX source code from the specified URL.
+
+   - `get_all_website_links(url: str) -> list`:
+     Lists all the internal URLs found on a specific webpage.
+
+   - `parse_all(url: str) -> dict`:
+     Parses source code to extract details about elements, attributes, and class names.
+
+   - `extract_elements(url: str, element_type: str, attribute_name: str, class_name: str)`:
+     Extracts specific portions of source code based on provided filters. The function signature seems to be cut off, so the full details aren't available.
+
+## Usage
+
+The functions and classes provided in this module allow users to interact with websites, from simple actions like getting the status code of a URL to more advanced functionalities such as parsing ReactJS source codes or extracting specific HTML elements from a website.
+
+To utilize this module, simply import the required function or class and use it in your application. The functions have been designed to be intuitive and the provided docstrings give clear guidance on their usage.
+
+Author: putkoff
+Version: 1.0
+"""
+# -*- coding: UTF-8 -*-
+# Google Chrome Driver
+import os
+import ssl
+import re
+import yt_dlp
+import socket
+import shutil
+import logging
+import requests
+from bs4 import BeautifulSoup
+from selenium import webdriver
+import xml.etree.ElementTree as ET
+from typing import Optional,List,Union
+from requests.adapters import HTTPAdapter
+from urllib.parse import urlparse, urljoin
+from requests.packages.urllib3.util import ssl_
+from requests.packages.urllib3.poolmanager import PoolManager
+from urllib.parse import urlparse, parse_qs
+import time
+import requests
+from .managers import *
+from abstract_utilities import get_time_stamp,get_sleep,sleep_count_down,eatInner,eatAll,eatOuter,ThreadManager
+logging.basicConfig(level=logging.INFO)
+def try_request(request):
+    try:
+        respnse = requests.get(url)
+    except Exception as e:
+        print(f'request for url failed: {e}')
+        response = None
+    return response
+
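
The module docstring above documents a `TLSAdapter` class that customizes ciphers and SSL options when the pool manager is initialized, but the class body itself is not part of this hunk. As a rough sketch of that pattern only (the cipher string, the default `ssl_options`, and the method layout here are assumptions, not the package's actual code), such an adapter typically looks like this:

```python
# A minimal sketch of the TLSAdapter pattern described in the docstring:
# an HTTPAdapter subclass that builds its PoolManager from a custom SSL
# context carrying a restricted cipher list and extra SSL option flags.
# The cipher string and ssl_options default are illustrative assumptions.
import ssl
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import ssl_
from urllib3.poolmanager import PoolManager

CIPHERS = "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384"

class TLSAdapter(HTTPAdapter):
    def __init__(self, ssl_options: int = 0, **kwargs):
        self.ssl_options = ssl_options          # extra ssl.OP_* flags to OR in
        super().__init__(**kwargs)

    def init_poolmanager(self, connections, maxsize, block=False, **kwargs):
        # Build an SSL context with the custom cipher list, apply the flags,
        # and hand it to the urllib3 pool manager used by this adapter.
        ctx = ssl_.create_urllib3_context(ciphers=CIPHERS)
        ctx.options |= self.ssl_options
        self.poolmanager = PoolManager(
            num_pools=connections, maxsize=maxsize, block=block, ssl_context=ctx
        )

if __name__ == "__main__":
    session = requests.Session()
    session.mount("https://", TLSAdapter(ssl_options=ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1))
    print(session.get("https://www.example.com").status_code)
```

Mounting the adapter on a `requests.Session` is what lets the custom context apply to every HTTPS request that session makes.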
abstract_webtools-0.1.5.82/src/abstract_webtools/managers/__init__.py (new file)

@@ -0,0 +1,14 @@
+from .cipherManager import *
+from .crawlManager import *
+from .dynamicRateLimiter import *
+from .linkManager import *
+from .mySocketClient import *
+from .networkManager import *
+from .requestManager import *
+from .soupManager import *
+from .sslManager import *
+from .tlsAdapter import *
+from .urlManager import *
+from .userAgentManager import *
+from .videoDownloader import *
+from .seleniumManager import *
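
Because `managers/__init__.py` re-exports every module with star imports, the manager classes defined in the files below are importable straight from the subpackage. A minimal illustration, assuming 0.1.5.82 and its dependencies are installed:

```python
# The star imports above flatten the subpackage's namespace, so classes from
# the individual manager modules resolve directly under abstract_webtools.managers.
from abstract_webtools.managers import CipherManager, DynamicRateLimiter

print(CipherManager().ciphers_string)   # comma-separated default cipher list
```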
abstract_webtools-0.1.5.82/src/abstract_webtools/managers/cipherManager.py (new file)

@@ -0,0 +1,38 @@
+class CipherManager:
+    @staticmethod
+    def get_default_ciphers()-> list:
+        return [
+            "ECDHE-RSA-AES256-GCM-SHA384", "ECDHE-ECDSA-AES256-GCM-SHA384",
+            "ECDHE-RSA-AES256-SHA384", "ECDHE-ECDSA-AES256-SHA384",
+            "ECDHE-RSA-AES256-SHA", "ECDHE-ECDSA-AES256-SHA",
+            "ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-SHA256",
+            "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES128-SHA256",
+            "AES256-SHA", "AES128-SHA"
+        ]
+
+    def __init__(self,cipher_list=None):
+        if cipher_list == None:
+            cipher_list=self.get_default_ciphers()
+        self.cipher_list = cipher_list
+        self.create_list()
+        self.ciphers_string = self.add_string_list()
+    def add_string_list(self):
+        if len(self.cipher_list)==0:
+            return ''
+        return','.join(self.cipher_list)
+    def create_list(self):
+        if self.cipher_list == None:
+            self.cipher_list= []
+        elif isinstance(self.cipher_list, str):
+            self.cipher_list=self.cipher_list.split(',')
+        if isinstance(self.cipher_list, str):
+            self.cipher_list=[self.cipher_list]
+class CipherManagerSingleton:
+    _instance = None
+    @staticmethod
+    def get_instance(cipher_list=None):
+        if CipherManagerSingleton._instance is None:
+            CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
+        elif CipherManagerSingleton._instance.cipher_list != cipher_list:
+            CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
+        return CipherManagerSingleton._instance
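
For reference, here is how the `CipherManager` shown above behaves when used directly; the import path assumes an installed package, and the `ssl` conversion at the end only illustrates one place the comma-separated string might be consumed.

```python
import ssl
from abstract_webtools.managers import CipherManager, CipherManagerSingleton

mgr = CipherManager()                    # cipher_list=None -> get_default_ciphers()
print(mgr.ciphers_string)                # "ECDHE-RSA-AES256-GCM-SHA384,ECDHE-ECDSA-..."

# get_instance() rebuilds the cached instance whenever the cipher_list argument
# differs from the one already stored on the singleton.
cached = CipherManagerSingleton.get_instance(cipher_list=mgr.cipher_list)

# OpenSSL cipher lists are colon-separated, so the comma-joined string needs a
# swap before it can be handed to ssl.SSLContext.set_ciphers().
ctx = ssl.create_default_context()
ctx.set_ciphers(mgr.ciphers_string.replace(",", ":"))
```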
abstract_webtools-0.1.5.82/src/abstract_webtools/managers/crawlManager.py (new file)

@@ -0,0 +1,187 @@
+class CrawlManager:
+    def __init__(self,url=None,source_code=None,parse_type="html.parser"):
+        self.url=url
+        self.source_code=source_code
+        self.parse_type=parse_type
+        get_new_source_and_url(self,url)
+    def get_new_source_and_url(self,url=None):
+        if url == None:
+            url = self.url
+        self.response = self.request_mgr.response
+        self.source_code=self.request_mgr.source_code
+    def get_classes_and_meta_info():
+        class_name_1,class_name_2, class_value = 'meta','class','property','og:image'
+        attrs = 'href','src'
+        unique_classes, images=discover_classes_and_images(self,tag_name,class_name_1,class_name_2,class_value,attrs)
+        return unique_classes, images
+    def extract_links_from_url(self):
+        """
+        Extracts all href and src links from a given URL's source code.
+
+        Args:
+            base_url (str): The URL from which to extract links.
+
+        Returns:
+            dict: Dictionary containing image links and external links under the parent page.
+        """
+        agg_js = {'images':[],'external_links':[]}
+
+        if self.response != None:
+            attrs = 'href','src'
+            href_links,src_links='',''
+            links = [href_links,src_links]
+            for i,each in enumerate(attrs):
+                links[i]= [a[attr[i]] for a in get_find_all_with_attributes(self, attrs[i])]
+            # Convert all links to absolute links
+            absolute_links = [(url, link) for link in links[0] + links[1]]
+            # Separate images and external links
+            images = [link for link in absolute_links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
+            external_links = [link for link in absolute_links if urlparse(link).netloc != urlparse(url).netloc]
+            agg_js['images']=images
+            agg_js['external_links']=external_links
+
+        return agg_js
+
+
+    def correct_xml(xml_string):
+        # Parse the XML string
+        root = ET.fromstring(xml_string)
+
+        # Loop through each <image:loc> element and correct its text if needed
+        for image_loc in root.findall(".//image:loc", namespaces={'image': 'http://www.google.com/schemas/sitemap-image/1.1'}):
+            # Replace '&' with '&amp;' in the element's text
+            if '&' in image_loc.text:
+                image_loc.text = image_loc.text.replace('&', '&amp;')
+
+        # Convert the corrected XML back to string
+        corrected_xml = ET.tostring(root, encoding='utf-8').decode('utf-8')
+        return corrected_xml
+
+
+    def determine_values(self):
+        # This is just a mockup. In a real application, you'd analyze the URL or its content.
+
+        # Assuming a blog site
+        if 'blog' in self.url:
+            if '2023' in self.url: # Assuming it's a current year article
+                return ('weekly', '0.8')
+            else:
+                return ('monthly', '0.6')
+        elif 'contact' in self.url:
+            return ('yearly', '0.3')
+        else: # Homepage or main categories
+            return ('weekly', '1.0')
+    def crawl(url, max_depth=3, depth=1):
+
+        if depth > max_depth:
+            return []
+
+        if url in visited:
+            return []
+
+        visited.add(url)
+
+        try:
+
+            links = [a['href'] for a in self.soup.find_all('a', href=True)]
+            valid_links = []
+
+            for link in links:
+                parsed_link = urlparse(link)
+                base_url = "{}://{}".format(parsed_link.scheme, parsed_link.netloc)
+
+                if base_url == url: # Avoiding external URLs
+                    final_link = urljoin(url, parsed_link.path)
+                    if final_link not in valid_links:
+                        valid_links.append(final_link)
+
+            for link in valid_links:
+                crawl(link, max_depth, depth+1)
+
+            return valid_links
+
+        except Exception as e:
+            print(f"Error crawling {url}: {e}")
+            return []
+
+
+    # Define or import required functions here, like get_all_website_links, determine_values,
+    # discover_classes_and_meta_images, and extract_links_from_url.
+    def get_meta_info(self):
+
+        meta_info = {}
+        # Fetch the title if available
+        title_tag = parse_title()
+        if title_tag:
+            meta_info["title"] = title_tag
+        # Fetch meta tags
+        for meta_tag in soup.find_all('meta'):
+            name = meta_tag.get('name') or meta_tag.get('property')
+            if name:
+                content = meta_tag.get('content')
+                if content:
+                    meta_info[name] = content
+
+        return meta_info
+    def generate_sitemap(self,domain):
+
+        with open('sitemap.xml', 'w', encoding='utf-8') as f:
+            string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n'
+
+            for url in self.all_site_links:
+                string += f' <url>\n <loc>{url}</loc>\n'
+                preprocess=[]
+                self.get_new_source_and_url(url=url)
+                links = extract_links_from_url(url)
+
+                for img in links['images']:
+                    if str(img).lower() not in preprocess:
+                        try:
+                            escaped_img = img.replace('&', '&amp;')
+
+                            str_write = f' <image:image>\n <image:loc>{escaped_img}</image:loc>\n </image:image>\n'
+                            string += str_write
+                        except:
+                            pass
+                        preprocess.append(str(img).lower())
+                frequency, priority = determine_values(url)
+                string += f' <changefreq>{frequency}</changefreq>\n'
+                string += f' <priority>{priority}</priority>\n'
+                string += f' </url>\n'
+
+            string += '</urlset>\n'
+            f.write(string)
+        # Output summary
+        print(f'Sitemap saved to sitemap.xml with {len(urls)} URLs.')
+
+        # Output class and link details
+        for url in urls:
+            print(f"\nDetails for {url}:")
+            classes, meta_img_refs = discover_classes_and_meta_images(url)
+
+            print("\nClasses with href or src attributes:")
+            for class_name in classes:
+                print(f"\t{class_name}")
+
+            print("\nMeta Image References:")
+            for img_ref in meta_img_refs:
+                print(f"\t{img_ref}")
+
+            links = extract_links_from_url(url)
+
+            print("\nImages:")
+            for img in links['images']:
+                print(f"\t{img}")
+
+            print("\nExternal Links:")
+            for ext_link in links['external_links']:
+                print(f"\t{ext_link}")
+class CrawlManagerSingleton():
+    _instance = None
+    @staticmethod
+    def get_instance(url=None,source_code=None,parse_type="html.parser"):
+        if CrawlManagerSingleton._instance is None:
+            CrawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
+        elif parse_type != CrawlManagerSingleton._instance.parse_type or url != CrawlManagerSingleton._instance.url or source_code != CrawlManagerSingleton._instance.source_code:
+            CrawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
+        return CrawlManagerSingleton._instance
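
`CrawlManager.crawl` above leans on names that live elsewhere in the package (`visited`, `self.soup`, the request manager), so it is hard to read in isolation. The following is a self-contained sketch of the same same-domain crawl idea using `requests` and BeautifulSoup directly; it is illustrative only and not the class's actual code path.

```python
# A standalone sketch of the same-domain crawl pattern: fetch a page, keep only
# links whose netloc matches the start URL, and recurse up to max_depth.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def crawl(url, max_depth=3, depth=1, visited=None):
    visited = visited if visited is not None else set()
    if depth > max_depth or url in visited:
        return []
    visited.add(url)
    try:
        soup = BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")
    except requests.RequestException as e:
        print(f"Error crawling {url}: {e}")
        return []
    base_netloc = urlparse(url).netloc
    valid_links = []
    for a in soup.find_all("a", href=True):
        link = urljoin(url, a["href"])          # resolve relative hrefs
        if urlparse(link).netloc == base_netloc and link not in valid_links:
            valid_links.append(link)
    for link in valid_links:                    # descend into same-domain pages
        crawl(link, max_depth, depth + 1, visited)
    return valid_links

if __name__ == "__main__":
    print(crawl("https://example.com", max_depth=2))
```

The real class additionally feeds the collected links into `generate_sitemap`, which is why it also tracks images, change frequency, and priority per URL.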
abstract_webtools-0.1.5.82/src/abstract_webtools/managers/domainManager.py (new file)

@@ -0,0 +1,48 @@
+from ..abstract_webtools import *
+class domainManager(metaclass=SingletonMeta):
+    def __init__(self, url):
+        if not hasattr(self, 'initialized'): # Prevent reinitialization
+            self.initialized = True
+            parsed_url = urlparse(url)
+            self.domain = parsed_url.netloc
+            self.scheme = parsed_url.scheme
+    def get_url_to_path(self, url):
+        url = eatAll(str(url),['',' ','\n','\t','\\','/'])
+        parsed_url = urlparse(url)
+        if 'data:image' in url:
+            input(url)
+        if parsed_url.netloc == self.domain:
+            paths = parsed_url.path.split('/')
+            dir_path =self.site_dir
+            for path in paths[:-1]:
+                dir_path = os.path.join(dir_path, path)
+                os.makedirs(dir_path, exist_ok=True)
+            #if 'svg' in url:
+            #$ input(url)
+            # dir_path = get_image_name('contents',directory=dir_path,ext='png',url=item_url)
+
+
+            self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
+
+            dir_path = os.path.join(dir_path, paths[-1])
+            return dir_path
+
+    def saved_url_check(self, url):
+
+        path = self.get_url_to_path(url)
+        return path
+
+    def get_with_netloc(self, url):
+        parsed_url = urlparse(url)
+        if parsed_url.netloc == '':
+            url = f"{self.scheme}://{self.domain}/{url.strip()}"
+        return url
+
+    def get_driver(self, url):
+        if url and url not in self.drivers:
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            driver = webdriver.Chrome(options=chrome_options)
+            self.drivers[url] = driver
+            driver.get(url)
+        return self.drivers[url]
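
`domainManager.get_url_to_path` maps a same-domain URL onto a directory tree rooted at a local `site_dir`, creating intermediate folders as it goes. The helper below reproduces that mapping in isolation; `site_dir` and the filename fallback mirror the behaviour in the hunk, while everything else in the real class (`eatAll`, `SingletonMeta`, `page_type` tracking, the Selenium driver cache) is omitted.

```python
# Sketch of the URL-to-local-path mapping performed by get_url_to_path.
import os
from typing import Optional
from urllib.parse import urlparse

def url_to_local_path(url: str, domain: str, site_dir: str) -> Optional[str]:
    parsed = urlparse(url)
    if parsed.netloc != domain:
        return None                          # only same-domain URLs are saved
    parts = parsed.path.strip("/").split("/")
    dir_path = site_dir
    for part in parts[:-1]:                  # build intermediate directories
        dir_path = os.path.join(dir_path, part)
        os.makedirs(dir_path, exist_ok=True)
    filename = parts[-1] or "index.html"     # fall back when the path ends in '/'
    return os.path.join(dir_path, filename)

print(url_to_local_path("https://example.com/img/logo.png", "example.com", "site"))
```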
abstract_webtools-0.1.5.82/src/abstract_webtools/managers/dynamicRateLimiter.py (new file)

@@ -0,0 +1,138 @@
+class DynamicRateLimiterManager:
+    def __init__(self, service_name='ethereum'):
+        self.services = {}
+        self.service_name = service_name
+        self.add_service(service_name)
+
+    def add_service(self, service_name="default", low_limit=10, high_limit=30, limit_epoch=60, starting_tokens=10, epoch_cycle_adjustment=True):
+        if service_name in self.services:
+            print(f"Service {service_name} already exists!")
+            return
+        self.services[service_name] = DynamicRateLimiter(low_limit=low_limit, high_limit=high_limit, limit_epoch=limit_epoch, starting_tokens=starting_tokens, epoch_cycle_adjustment=epoch_cycle_adjustment)
+
+    def request(self, request_url, service_name=None):
+        service_name = service_name or self.service_name
+        if service_name not in self.services:
+            self.add_service(service_name)
+
+        limiter = self.services[service_name]
+
+        while True:
+            if limiter.request():
+                response = requests.get(request_url)  # Actual request
+                if response.status_code == 200:
+                    limiter.request_tracker(True)
+                    return response.json()
+                elif response.status_code == 429:
+                    limiter.request_tracker(False)
+                    print(f"Rate limited by {service_name}. Adjusting limit and retrying...")
+                    time.sleep(limiter.get_sleep()["current_sleep"])
+                else:
+                    print(f"Unexpected response: {response.status_code}. Message: {response.text}")
+                    return None
+            else:
+                print(f"Rate limit reached for {service_name}. Waiting for the next epoch...")
+                time.sleep(limiter.get_sleep()["current_sleep"])
+
+    def log_request(self, service_name, success):
+        print(f"[{service_name}] Request {'succeeded' if success else 'denied'}. Current tokens: {self.services[service_name].get_current_tokens()}")
+
+class DynamicRateLimiter:
+    def __init__(self, low_limit, high_limit, limit_epoch, starting_tokens=None,epoch_cycle_adjustment:int=None):
+        self.low_limit = low_limit
+        self.high_limit = high_limit
+        self.limit_epoch = limit_epoch  # in seconds
+        self.request_status_json = {"succesful":[],"unsuccesful":[],"last_requested":get_time_stamp(),"first_requested":get_time_stamp(),"epoch_left":self.limit_epoch,"last_fail":get_time_stamp(),"count_since_fail":0}
+        self.current_limit = starting_tokens or low_limit # Default to high_limit if starting_tokens isn't provided
+        self.epoch_cycle_adjustment = epoch_cycle_adjustment
+        # Additional attributes for tracking adjustment logic
+        self.last_adjusted_time = get_time_stamp()
+        self.successful_epochs_since_last_adjustment = 0
+        self.request_count_in_current_epoch = 0
+
+    def _refill_tokens(self):
+        time_since_last_request = get_time_stamp() - self.request_status_json["last_requested"]
+        new_tokens = (time_since_last_request / self.limit_epoch) * self.current_limit
+        self.tokens = min(self.current_limit, self.get_current_tokens())
+    def request_tracker(self,success):
+        if success:
+            self.request_status_json["succesful"].append(get_time_stamp())
+        else:
+            self.request_status_json["unsuccesful"].append(get_time_stamp())
+            self.request_status_json["last_fail"]=get_time_stamp()
+            self.request_status_json["count_since_fail"]=0
+            self.adjust_limit()
+        self.request_status_json["last_requested"]=get_time_stamp()
+    def calculate_tokens(self):
+        successful = []
+        for each in self.request_status_json["succesful"]:
+            if (get_time_stamp() - each)<self.limit_epoch:
+                successful.append(each)
+        self.request_status_json["succesful"]=successful
+        unsuccessful = []
+        for each in self.request_status_json["unsuccesful"]:
+            if (get_time_stamp() - each)<self.limit_epoch:
+                unsuccessful.append(each)
+        self.request_status_json["unsuccesful"]=unsuccessful
+        if len(successful)==0 and len(unsuccessful)==0:
+            pass
+        elif len(successful)!=0 and len(unsuccessful)==0:
+            self.request_status_json["first_requested"] = successful[0]
+        elif len(successful)==0 and len(unsuccessful)!=0:
+            self.request_status_json["first_requested"] = unsuccessful[0]
+        else:
+            self.request_status_json["first_requested"] = min(unsuccessful[0],successful[0])
+        self.request_status_json["epoch_left"]=self.limit_epoch-(self.request_status_json["last_requested"]-self.request_status_json["first_requested"])
+
+        return self.request_status_json
+    def get_current_tokens(self):
+        self.request_status_json = self.calculate_tokens()
+        total_requests = len(self.request_status_json["succesful"])+len(self.request_status_json["unsuccesful"])
+        return max(0,self.current_limit-total_requests)
+    def get_sleep(self):
+        self.request_status_json = self.calculate_tokens()
+        self.request_status_json["current_sleep"]=self.request_status_json["epoch_left"]/max(1,self.get_current_tokens())
+        return self.request_status_json
+    def request(self):
+        self._refill_tokens()
+        if self.tokens > 0:
+            return True  # The request can be made
+        else:
+            if self.tokens == 0:
+                self.request_status_json["count_since_fail"]+=1
+                if self.epoch_cycle_adjustment != None:
+                    if self.request_status_json["count_since_fail"] >=self.epoch_cycle_adjustment:
+                        self.current_limit=min(self.current_limit+1,self.high_limit)
+            return False  # The request cannot be made
+    def _adjust_limit(self):
+        current_time = get_time_stamp()
+        if current_time - self.last_adjusted_time >= self.limit_epoch:
+            if len(self.clear_epoch()["succesful"]) >= self.tokens:
+                # We hit the rate limit this epoch, decrease our limit
+                self.tokens = max(1, self.tokens - 1)
+            else:
+                self.successful_epochs_since_last_adjustment += 1
+                if self.successful_epochs_since_last_adjustment >= 5:
+                    # We've had 5 successful epochs, increase our limit
+                    self.current_limit = min(self.high_limit, self.tokens + 1)
+                    self.successful_epochs_since_last_adjustment = 0
+
+            # Reset our counters for the new epoch
+            self.last_adjusted_time = current_time
+            self.request_count_in_current_epoch = 0
+    def adjust_limit(self):
+        # Set the tokens to succesful requests_made - 1
+        self.tokens = len(self.calculate_tokens()["succesful"])
+
+        # Adjust the high_limit
+        self.current_limit = self.tokens
+
+        # Log the adjustment
+        print(f"Adjusted tokens to: {self.tokens} and high_limit to: {self.current_limit}")
+class DynamicRateLimiterManagerSingleton:
+    _instance = None
+    @staticmethod
+    def get_instance(service_name="default", low_limit=10, high_limit=30, limit_epoch=60,starting_tokens=10,epoch_cycle_adjustment=True):
+        if DynamicRateLimiterManagerSingleton._instance is None:
+            DynamicRateLimiterManagerSingleton._instance = DynamicRateLimiterManager(service_name=service_name, low_limit=low_limit, high_limit=limit_epoch, limit_epoch=60,starting_tokens=starting_tokens,epoch_cycle_adjustment=epoch_cycle_adjustment)
+        return DynamicRateLimiterManagerSingleton._instance
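
`DynamicRateLimiterManager.request` above already shows the intended loop: ask the limiter for a token, record the outcome with `request_tracker()`, and sleep for the limiter's suggested interval when no tokens are left. A stripped-down version of that loop against a single `DynamicRateLimiter` instance (the endpoint URL and the limit values here are placeholders, and the import assumes the package is installed):

```python
# Illustrative driver for the DynamicRateLimiter shown above.
import time
import requests
from abstract_webtools.managers import DynamicRateLimiter

limiter = DynamicRateLimiter(low_limit=5, high_limit=20, limit_epoch=60,
                             starting_tokens=5, epoch_cycle_adjustment=3)

def fetch(url):
    while True:
        if limiter.request():                       # token available this epoch?
            resp = requests.get(url, timeout=10)
            limiter.request_tracker(resp.status_code == 200)
            return resp
        time.sleep(limiter.get_sleep()["current_sleep"])  # wait out the epoch

print(fetch("https://httpbin.org/get").status_code)
```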