abstract-webtools 0.1.5.81__py3-none-any.whl → 0.1.5.82__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
+ from .cipherManager import *
+ from .crawlManager import *
+ from .dynamicRateLimiter import *
+ from .linkManager import *
+ from .mySocketClient import *
+ from .networkManager import *
+ from .requestManager import *
+ from .soupManager import *
+ from .sslManager import *
+ from .tlsAdapter import *
+ from .urlManager import *
+ from .userAgentManager import *
+ from .videoDownloader import *
+ from .seleniumManager import *
@@ -0,0 +1,38 @@
+ class CipherManager:
+     @staticmethod
+     def get_default_ciphers() -> list:
+         return [
+             "ECDHE-RSA-AES256-GCM-SHA384", "ECDHE-ECDSA-AES256-GCM-SHA384",
+             "ECDHE-RSA-AES256-SHA384", "ECDHE-ECDSA-AES256-SHA384",
+             "ECDHE-RSA-AES256-SHA", "ECDHE-ECDSA-AES256-SHA",
+             "ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-SHA256",
+             "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES128-SHA256",
+             "AES256-SHA", "AES128-SHA"
+         ]
+ 
+     def __init__(self, cipher_list=None):
+         if cipher_list is None:
+             cipher_list = self.get_default_ciphers()
+         self.cipher_list = cipher_list
+         self.create_list()
+         self.ciphers_string = self.add_string_list()
+ 
+     def add_string_list(self):
+         # Join the cipher names into the comma-separated form used elsewhere in the package.
+         if len(self.cipher_list) == 0:
+             return ''
+         return ','.join(self.cipher_list)
+ 
+     def create_list(self):
+         # Normalize cipher_list so it is always a list of cipher names.
+         if self.cipher_list is None:
+             self.cipher_list = []
+         elif isinstance(self.cipher_list, str):
+             self.cipher_list = self.cipher_list.split(',')
+         if isinstance(self.cipher_list, str):
+             self.cipher_list = [self.cipher_list]
+ 
+ class CipherManagerSingleton:
+     _instance = None
+     @staticmethod
+     def get_instance(cipher_list=None):
+         if CipherManagerSingleton._instance is None:
+             CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
+         elif CipherManagerSingleton._instance.cipher_list != cipher_list:
+             CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
+         return CipherManagerSingleton._instance
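
For context, a minimal usage sketch (not part of the diff) showing how the cipher string built above could feed an SSL context. The import is an assumption; the diff only shows that the managers' __init__ star-exports these names, not the full package path.

import ssl
from abstract_webtools import *  # assumed export path for CipherManagerSingleton

cipher_mgr = CipherManagerSingleton.get_instance()   # defaults to get_default_ciphers()
ctx = ssl.create_default_context()
ctx.set_ciphers(cipher_mgr.ciphers_string)           # OpenSSL accepts comma-separated cipher lists
print(cipher_mgr.ciphers_string.split(',')[:2])
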
@@ -0,0 +1,187 @@
+ from ..abstract_webtools import *  # shared imports (urlparse, urljoin, ET, requests, BeautifulSoup, ...); assumed here, as in the other manager modules
+ class CrawlManager:
+     def __init__(self, url=None, source_code=None, parse_type="html.parser"):
+         self.url = url
+         self.source_code = source_code
+         self.parse_type = parse_type
+         self.get_new_source_and_url(url)
+ 
+     def get_new_source_and_url(self, url=None):
+         if url is None:
+             url = self.url
+         self.url = url
+         # requestManager comes from .requestManager; the constructor signature is assumed here
+         self.request_mgr = requestManager(url=url)
+         self.response = self.request_mgr.response
+         self.source_code = self.request_mgr.source_code
+         self.soup = BeautifulSoup(self.source_code, self.parse_type) if self.source_code else None
+ 
+     def get_classes_and_meta_info(self):
+         tag_name, class_name_1, class_name_2, class_value = 'meta', 'class', 'property', 'og:image'
+         attrs = 'href', 'src'
+         # discover_classes_and_images is expected to be defined or imported elsewhere (see note below)
+         unique_classes, images = discover_classes_and_images(self, tag_name, class_name_1, class_name_2, class_value, attrs)
+         return unique_classes, images
+ 
+     def extract_links_from_url(self):
+         """
+         Extracts all href and src links from the current page's source code.
+ 
+         Returns:
+             dict: Dictionary containing image links and external links found under the parent page.
+         """
+         agg_js = {'images': [], 'external_links': []}
+ 
+         if self.response is not None:
+             attrs = ('href', 'src')
+             links = []
+             for attr in attrs:
+                 # get_find_all_with_attributes is expected to be defined or imported elsewhere (see note below)
+                 links.extend(a[attr] for a in get_find_all_with_attributes(self, attr))
+             # Convert all links to absolute links
+             absolute_links = [urljoin(self.url, link) for link in links]
+             # Separate images and external links
+             images = [link for link in absolute_links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
+             external_links = [link for link in absolute_links if urlparse(link).netloc != urlparse(self.url).netloc]
+             agg_js['images'] = images
+             agg_js['external_links'] = external_links
+ 
+         return agg_js
+ 
+     @staticmethod
+     def correct_xml(xml_string):
+         # Parse the XML string
+         root = ET.fromstring(xml_string)
+ 
+         # Loop through each <image:loc> element and correct its text if needed
+         for image_loc in root.findall(".//image:loc", namespaces={'image': 'http://www.google.com/schemas/sitemap-image/1.1'}):
+             # Replace '&' with '&amp;' in the element's text
+             if '&' in image_loc.text:
+                 image_loc.text = image_loc.text.replace('&', '&amp;')
+ 
+         # Convert the corrected XML back to a string
+         corrected_xml = ET.tostring(root, encoding='utf-8').decode('utf-8')
+         return corrected_xml
+ 
+     def determine_values(self):
+         # This is just a mockup. In a real application, you'd analyze the URL or its content.
+ 
+         # Assuming a blog site
+         if 'blog' in self.url:
+             if '2023' in self.url:  # Assuming it's a current-year article
+                 return ('weekly', '0.8')
+             else:
+                 return ('monthly', '0.6')
+         elif 'contact' in self.url:
+             return ('yearly', '0.3')
+         else:  # Homepage or main categories
+             return ('weekly', '1.0')
+ 
+     def crawl(self, url, max_depth=3, depth=1, visited=None):
+         # visited tracks URLs already crawled across recursive calls (the original relied on an undefined name)
+         visited = visited if visited is not None else set()
+ 
+         if depth > max_depth:
+             return []
+ 
+         if url in visited:
+             return []
+ 
+         visited.add(url)
+ 
+         try:
+             links = [a['href'] for a in self.soup.find_all('a', href=True)]
+             valid_links = []
+ 
+             for link in links:
+                 parsed_link = urlparse(link)
+                 base_url = "{}://{}".format(parsed_link.scheme, parsed_link.netloc)
+ 
+                 if base_url == url:  # Avoiding external URLs
+                     final_link = urljoin(url, parsed_link.path)
+                     if final_link not in valid_links:
+                         valid_links.append(final_link)
+ 
+             for link in valid_links:
+                 self.crawl(link, max_depth, depth + 1, visited=visited)
+ 
+             return valid_links
+ 
+         except Exception as e:
+             print(f"Error crawling {url}: {e}")
+             return []
+ 
+     # Define or import required functions here, like get_all_website_links, determine_values,
+     # discover_classes_and_meta_images, and extract_links_from_url.
+     def get_meta_info(self):
+         meta_info = {}
+         # Fetch the title if available
+         title_tag = self.soup.title.string if self.soup and self.soup.title else None
+         if title_tag:
+             meta_info["title"] = title_tag
+         # Fetch meta tags
+         for meta_tag in (self.soup.find_all('meta') if self.soup else []):
+             name = meta_tag.get('name') or meta_tag.get('property')
+             if name:
+                 content = meta_tag.get('content')
+                 if content:
+                     meta_info[name] = content
+ 
+         return meta_info
+ 
+     def generate_sitemap(self, domain):
+         # self.all_site_links is expected to be populated (e.g. by crawl()) before this is called
+         with open('sitemap.xml', 'w', encoding='utf-8') as f:
+             string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n'
+ 
+             for url in self.all_site_links:
+                 string += f'  <url>\n    <loc>{url}</loc>\n'
+                 preprocess = []
+                 self.get_new_source_and_url(url=url)
+                 links = self.extract_links_from_url()
+ 
+                 for img in links['images']:
+                     if str(img).lower() not in preprocess:
+                         try:
+                             escaped_img = img.replace('&', '&amp;')
+                             str_write = f'    <image:image>\n      <image:loc>{escaped_img}</image:loc>\n    </image:image>\n'
+                             string += str_write
+                         except Exception:
+                             pass
+                         preprocess.append(str(img).lower())
+                 frequency, priority = self.determine_values()
+                 string += f'    <changefreq>{frequency}</changefreq>\n'
+                 string += f'    <priority>{priority}</priority>\n'
+                 string += '  </url>\n'
+ 
+             string += '</urlset>\n'
+             f.write(string)
+         # Output summary
+         print(f'Sitemap saved to sitemap.xml with {len(self.all_site_links)} URLs.')
+ 
+         # Output class and link details
+         for url in self.all_site_links:
+             print(f"\nDetails for {url}:")
+             self.get_new_source_and_url(url=url)
+             classes, meta_img_refs = self.get_classes_and_meta_info()
+ 
+             print("\nClasses with href or src attributes:")
+             for class_name in classes:
+                 print(f"\t{class_name}")
+ 
+             print("\nMeta Image References:")
+             for img_ref in meta_img_refs:
+                 print(f"\t{img_ref}")
+ 
+             links = self.extract_links_from_url()
+ 
+             print("\nImages:")
+             for img in links['images']:
+                 print(f"\t{img}")
+ 
+             print("\nExternal Links:")
+             for ext_link in links['external_links']:
+                 print(f"\t{ext_link}")
+ 
+ class CrawlManagerSingleton:
+     _instance = None
+     @staticmethod
+     def get_instance(url=None, source_code=None, parse_type="html.parser"):
+         if CrawlManagerSingleton._instance is None:
+             CrawlManagerSingleton._instance = CrawlManager(url=url, parse_type=parse_type, source_code=source_code)
+         elif parse_type != CrawlManagerSingleton._instance.parse_type or url != CrawlManagerSingleton._instance.url or source_code != CrawlManagerSingleton._instance.source_code:
+             CrawlManagerSingleton._instance = CrawlManager(url=url, parse_type=parse_type, source_code=source_code)
+         return CrawlManagerSingleton._instance
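
Illustrative only (not part of the diff): a rough sketch of driving the crawl manager, assuming the classes above are importable; the exact package path is not shown in this diff.

from abstract_webtools import *  # assumed export path

crawler = CrawlManagerSingleton.get_instance(url="https://example.com")
meta = crawler.get_meta_info()              # title plus <meta name/property> -> content pairs
print(meta.get("title"))
links = crawler.extract_links_from_url()    # {'images': [...], 'external_links': [...]}
print(len(links["images"]), len(links["external_links"]))
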
@@ -0,0 +1,48 @@
+ from ..abstract_webtools import *
+ class domainManager(metaclass=SingletonMeta):
+     def __init__(self, url):
+         if not hasattr(self, 'initialized'):  # Prevent reinitialization
+             self.initialized = True
+             parsed_url = urlparse(url)
+             self.domain = parsed_url.netloc
+             self.scheme = parsed_url.scheme
+             self.drivers = {}
+             self.page_type = []
+             self.site_dir = os.path.join(os.getcwd(), self.domain)  # default save location; site_dir is assumed, not set anywhere in the diff
+ 
+     def get_url_to_path(self, url):
+         url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/'])
+         parsed_url = urlparse(url)
+         if 'data:image' in url:
+             input(url)
+         if parsed_url.netloc == self.domain:
+             paths = parsed_url.path.split('/')
+             dir_path = self.site_dir
+             for path in paths[:-1]:
+                 dir_path = os.path.join(dir_path, path)
+                 os.makedirs(dir_path, exist_ok=True)
+             # if 'svg' in url:
+             #     input(url)
+             #     dir_path = get_image_name('contents', directory=dir_path, ext='png', url=item_url)
+ 
+             ext = os.path.splitext(paths[-1])[-1]
+             self.page_type.append(ext or ('html' if len(self.page_type) == 0 else self.page_type[-1]))
+ 
+             dir_path = os.path.join(dir_path, paths[-1])
+             return dir_path
+ 
+     def saved_url_check(self, url):
+         path = self.get_url_to_path(url)
+         return path
+ 
+     def get_with_netloc(self, url):
+         parsed_url = urlparse(url)
+         if parsed_url.netloc == '':
+             url = f"{self.scheme}://{self.domain}/{url.strip()}"
+         return url
+ 
+     def get_driver(self, url):
+         if url and url not in self.drivers:
+             chrome_options = Options()
+             chrome_options.add_argument("--headless")
+             driver = webdriver.Chrome(options=chrome_options)
+             self.drivers[url] = driver
+             driver.get(url)
+         return self.drivers[url]
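
Illustrative only (not part of the diff): a hedged sketch of resolving relative links against the tracked domain, assuming domainManager is importable; the exact package path is not shown in this diff.

from abstract_webtools import *  # assumed export path

dm = domainManager("https://example.com/docs/index.html")
print(dm.domain)                               # example.com
print(dm.get_with_netloc("assets/logo.png"))   # scheme and domain are prefixed when the URL has no netloc
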
@@ -0,0 +1,138 @@
+ from ..abstract_webtools import *  # assumed to supply requests, time and get_time_stamp, as in the other manager modules
+ class DynamicRateLimiterManager:
+     def __init__(self, service_name='ethereum', low_limit=10, high_limit=30, limit_epoch=60, starting_tokens=10, epoch_cycle_adjustment=True):
+         # limiter settings are forwarded to add_service so the singleton below can configure them
+         self.services = {}
+         self.service_name = service_name
+         self.add_service(service_name, low_limit=low_limit, high_limit=high_limit, limit_epoch=limit_epoch, starting_tokens=starting_tokens, epoch_cycle_adjustment=epoch_cycle_adjustment)
+ 
+     def add_service(self, service_name="default", low_limit=10, high_limit=30, limit_epoch=60, starting_tokens=10, epoch_cycle_adjustment=True):
+         if service_name in self.services:
+             print(f"Service {service_name} already exists!")
+             return
+         self.services[service_name] = DynamicRateLimiter(low_limit=low_limit, high_limit=high_limit, limit_epoch=limit_epoch, starting_tokens=starting_tokens, epoch_cycle_adjustment=epoch_cycle_adjustment)
+ 
+     def request(self, request_url, service_name=None):
+         service_name = service_name or self.service_name
+         if service_name not in self.services:
+             self.add_service(service_name)
+ 
+         limiter = self.services[service_name]
+ 
+         while True:
+             if limiter.request():
+                 response = requests.get(request_url)  # Actual request
+                 if response.status_code == 200:
+                     limiter.request_tracker(True)
+                     return response.json()
+                 elif response.status_code == 429:
+                     limiter.request_tracker(False)
+                     print(f"Rate limited by {service_name}. Adjusting limit and retrying...")
+                     time.sleep(limiter.get_sleep()["current_sleep"])
+                 else:
+                     print(f"Unexpected response: {response.status_code}. Message: {response.text}")
+                     return None
+             else:
+                 print(f"Rate limit reached for {service_name}. Waiting for the next epoch...")
+                 time.sleep(limiter.get_sleep()["current_sleep"])
+ 
+     def log_request(self, service_name, success):
+         print(f"[{service_name}] Request {'succeeded' if success else 'denied'}. Current tokens: {self.services[service_name].get_current_tokens()}")
+ 
+ class DynamicRateLimiter:
+     def __init__(self, low_limit, high_limit, limit_epoch, starting_tokens=None, epoch_cycle_adjustment: int = None):
+         self.low_limit = low_limit
+         self.high_limit = high_limit
+         self.limit_epoch = limit_epoch  # in seconds
+         self.request_status_json = {"succesful": [], "unsuccesful": [], "last_requested": get_time_stamp(), "first_requested": get_time_stamp(), "epoch_left": self.limit_epoch, "last_fail": get_time_stamp(), "count_since_fail": 0}
+         self.current_limit = starting_tokens or low_limit  # Default to low_limit if starting_tokens isn't provided
+         self.epoch_cycle_adjustment = epoch_cycle_adjustment
+         # Additional attributes for tracking adjustment logic
+         self.last_adjusted_time = get_time_stamp()
+         self.successful_epochs_since_last_adjustment = 0
+         self.request_count_in_current_epoch = 0
+ 
+     def _refill_tokens(self):
+         time_since_last_request = get_time_stamp() - self.request_status_json["last_requested"]
+         new_tokens = (time_since_last_request / self.limit_epoch) * self.current_limit
+         self.tokens = min(self.current_limit, self.get_current_tokens())
+ 
+     def request_tracker(self, success):
+         if success:
+             self.request_status_json["succesful"].append(get_time_stamp())
+         else:
+             self.request_status_json["unsuccesful"].append(get_time_stamp())
+             self.request_status_json["last_fail"] = get_time_stamp()
+             self.request_status_json["count_since_fail"] = 0
+             self.adjust_limit()
+         self.request_status_json["last_requested"] = get_time_stamp()
+ 
+     def calculate_tokens(self):
+         # Drop requests that fall outside the current epoch window
+         successful = []
+         for each in self.request_status_json["succesful"]:
+             if (get_time_stamp() - each) < self.limit_epoch:
+                 successful.append(each)
+         self.request_status_json["succesful"] = successful
+         unsuccessful = []
+         for each in self.request_status_json["unsuccesful"]:
+             if (get_time_stamp() - each) < self.limit_epoch:
+                 unsuccessful.append(each)
+         self.request_status_json["unsuccesful"] = unsuccessful
+         if len(successful) == 0 and len(unsuccessful) == 0:
+             pass
+         elif len(successful) != 0 and len(unsuccessful) == 0:
+             self.request_status_json["first_requested"] = successful[0]
+         elif len(successful) == 0 and len(unsuccessful) != 0:
+             self.request_status_json["first_requested"] = unsuccessful[0]
+         else:
+             self.request_status_json["first_requested"] = min(unsuccessful[0], successful[0])
+         self.request_status_json["epoch_left"] = self.limit_epoch - (self.request_status_json["last_requested"] - self.request_status_json["first_requested"])
+ 
+         return self.request_status_json
+ 
+     def get_current_tokens(self):
+         self.request_status_json = self.calculate_tokens()
+         total_requests = len(self.request_status_json["succesful"]) + len(self.request_status_json["unsuccesful"])
+         return max(0, self.current_limit - total_requests)
+ 
+     def get_sleep(self):
+         self.request_status_json = self.calculate_tokens()
+         self.request_status_json["current_sleep"] = self.request_status_json["epoch_left"] / max(1, self.get_current_tokens())
+         return self.request_status_json
+ 
+     def request(self):
+         self._refill_tokens()
+         if self.tokens > 0:
+             return True  # The request can be made
+         else:
+             if self.tokens == 0:
+                 self.request_status_json["count_since_fail"] += 1
+                 if self.epoch_cycle_adjustment is not None:
+                     if self.request_status_json["count_since_fail"] >= self.epoch_cycle_adjustment:
+                         self.current_limit = min(self.current_limit + 1, self.high_limit)
+             return False  # The request cannot be made
+ 
+     def _adjust_limit(self):
+         current_time = get_time_stamp()
+         if current_time - self.last_adjusted_time >= self.limit_epoch:
+             # calculate_tokens() prunes entries outside the current epoch (used here in place of the undefined clear_epoch)
+             if len(self.calculate_tokens()["succesful"]) >= self.tokens:
+                 # We hit the rate limit this epoch, decrease our limit
+                 self.tokens = max(1, self.tokens - 1)
+             else:
+                 self.successful_epochs_since_last_adjustment += 1
+                 if self.successful_epochs_since_last_adjustment >= 5:
+                     # We've had 5 successful epochs, increase our limit
+                     self.current_limit = min(self.high_limit, self.tokens + 1)
+                     self.successful_epochs_since_last_adjustment = 0
+ 
+             # Reset our counters for the new epoch
+             self.last_adjusted_time = current_time
+             self.request_count_in_current_epoch = 0
+ 
+     def adjust_limit(self):
+         # Set the tokens to the successful requests made in the current epoch
+         self.tokens = len(self.calculate_tokens()["succesful"])
+ 
+         # Adjust the current limit to match
+         self.current_limit = self.tokens
+ 
+         # Log the adjustment
+         print(f"Adjusted tokens to: {self.tokens} and high_limit to: {self.current_limit}")
+ class DynamicRateLimiterManagerSingleton:
+     _instance = None
+     @staticmethod
+     def get_instance(service_name="default", low_limit=10, high_limit=30, limit_epoch=60, starting_tokens=10, epoch_cycle_adjustment=True):
+         if DynamicRateLimiterManagerSingleton._instance is None:
+             DynamicRateLimiterManagerSingleton._instance = DynamicRateLimiterManager(service_name=service_name, low_limit=low_limit, high_limit=high_limit, limit_epoch=limit_epoch, starting_tokens=starting_tokens, epoch_cycle_adjustment=epoch_cycle_adjustment)
+         return DynamicRateLimiterManagerSingleton._instance
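
Illustrative only (not part of the diff): a hedged sketch of exercising the limiter class directly, assuming the classes above are importable; the exact package path is not shown in this diff.

from abstract_webtools import *  # assumed export path

limiter = DynamicRateLimiter(low_limit=2, high_limit=5, limit_epoch=10, starting_tokens=2)
for i in range(4):
    if limiter.request():                  # True while tokens remain in the current epoch
        limiter.request_tracker(True)      # record the outcome so the epoch window stays accurate
        print(f"request {i}: sent")
    else:
        sleep_info = limiter.get_sleep()   # suggested wait before retrying
        print(f"request {i}: throttled, sleep ~{sleep_info['current_sleep']:.1f}s")
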
@@ -0,0 +1,189 @@
+ from ..abstract_webtools import *
+ from .urlManager import *
+ from .requestManager import *
+ from .soupManager import *
+ def get_url(url=None, url_mgr=None):
+     if not url and not url_mgr:
+         return None
+     if url:
+         url_mgr = urlManager(url)
+     return url_mgr.url
+ def get_url_mgr(url=None, url_mgr=None):
+     if not url and not url_mgr:
+         return None
+     if url:
+         url_mgr = urlManager(url)
+     return url_mgr
+ class linkManager:
+     """
+     linkManager is a class for managing and extracting links and image links from a web page.
+ 
+     Args:
+         url (str or None): The URL of the web page (default is None).
+         source_code (str or None): The source code of the web page (default is None).
+         url_mgr (urlManager or None): An instance of urlManager (default is None).
+         req_mgr (requestManager or None): An instance of requestManager (default is None).
+         soup_manager (soupManager or None): An instance of soupManager (default is None).
+         image_link_tags (str): HTML tags to identify image links (default is 'img').
+         img_link_attrs (str): HTML attributes to identify image link URLs (default is 'src').
+         link_tags (str): HTML tags to identify links (default is 'a').
+         link_attrs (str): HTML attributes to identify link URLs (default is 'href').
+         strict_order_tags (bool): Flag to indicate if tags and attributes should be matched strictly (default is False).
+         img_attr_value_desired (list or None): Desired attribute values for image links (default is None).
+         img_attr_value_undesired (list or None): Undesired attribute values for image links (default is None).
+         link_attr_value_desired (list or None): Desired attribute values for links (default is None).
+         link_attr_value_undesired (list or None): Undesired attribute values for links (default is None).
+         associated_data_attr (list): HTML attributes to associate with the extracted links (default is ["data-title", 'alt', 'title']).
+         get_img (list): HTML attributes used to identify associated images (default is ["data-title", 'alt', 'title']).
+ 
+     Methods:
+         re_initialize(): Reinitialize the linkManager with the current settings.
+         update_url_mgr(url_mgr): Update the URL manager with a new instance.
+         update_url(url): Update the URL and reinitialize the linkManager.
+         update_source_code(source_code): Update the source code and reinitialize the linkManager.
+         update_soup_manager(soup_manager): Update the soup manager and reinitialize the linkManager.
+         update_desired(...): Update the desired settings and reinitialize the linkManager.
+         find_all_desired(...): Find all desired links or image links based on the specified criteria.
+         find_all_domain(): Find all unique same-domain URLs in the extracted links.
+ 
+     Note:
+         - The linkManager class helps manage and extract links and image links from web pages.
+         - The class provides flexibility in specifying criteria for link extraction.
+     """
+     def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_manager=None,
+                  image_link_tags='img', img_link_attrs='src', link_tags='a', link_attrs='href',
+                  strict_order_tags=False, img_attr_value_desired=None, img_attr_value_undesired=None,
+                  link_attr_value_desired=None, link_attr_value_undesired=None,
+                  associated_data_attr=["data-title", 'alt', 'title'], get_img=["data-title", 'alt', 'title']):
+         self.url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
+         self.url = self.url_mgr.url
+         self.req_mgr = get_req_mgr(req_mgr=req_mgr, url=self.url, url_mgr=self.url_mgr, source_code=source_code)
+         self.source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
+         self.soup_mgr = get_soup_mgr(req_mgr=self.req_mgr, url_mgr=self.url_mgr, source_code=source_code)
+ 
+         self.strict_order_tags = strict_order_tags
+         self.image_link_tags = image_link_tags
+         self.img_link_attrs = img_link_attrs
+         self.link_tags = link_tags
+         self.link_attrs = link_attrs
+         self.img_attr_value_desired = img_attr_value_desired
+         self.img_attr_value_undesired = img_attr_value_undesired
+         self.link_attr_value_desired = link_attr_value_desired
+         self.link_attr_value_undesired = link_attr_value_undesired
+         self.associated_data_attr = associated_data_attr
+         self.get_img = get_img
+         self.all_desired_image_links = self.find_all_desired_links(tag=self.image_link_tags,
+                                                                    attr=self.img_link_attrs,
+                                                                    attr_value_desired=self.img_attr_value_desired,
+                                                                    attr_value_undesired=self.img_attr_value_undesired)
+         self.all_desired_links = self.find_all_desired_links(tag=self.link_tags,
+                                                              attr=self.link_attrs,
+                                                              attr_value_desired=self.link_attr_value_desired,
+                                                              attr_value_undesired=self.link_attr_value_undesired,
+                                                              associated_data_attr=self.associated_data_attr,
+                                                              get_img=get_img)
+ 
+     def re_initialize(self):
+         self.all_desired_image_links = self.find_all_desired_links(tag=self.image_link_tags, attr=self.img_link_attrs, strict_order_tags=self.strict_order_tags, attr_value_desired=self.img_attr_value_desired, attr_value_undesired=self.img_attr_value_undesired)
+         self.all_desired_links = self.find_all_desired_links(tag=self.link_tags, attr=self.link_attrs, strict_order_tags=self.strict_order_tags, attr_value_desired=self.link_attr_value_desired, attr_value_undesired=self.link_attr_value_undesired, associated_data_attr=self.associated_data_attr, get_img=self.get_img)
+ 
+     def update_url_mgr(self, url_mgr):
+         self.url_mgr = url_mgr
+         self.url = self.url_mgr.url
+         self.req_mgr.update_url_mgr(url_mgr=self.url_mgr)
+         self.soup_mgr.update_url_mgr(url_mgr=self.url_mgr)
+         self.source_code = self.soup_mgr.source_code
+         self.re_initialize()
+ 
+     def update_url(self, url):
+         self.url = url
+         self.url_mgr.update_url(url=self.url)
+         self.url = self.url_mgr.url
+         self.req_mgr.update_url(url=self.url)
+         self.soup_mgr.update_url(url=self.url)
+         self.source_code = self.soup_mgr.source_code
+         self.re_initialize()
+ 
+     def update_source_code(self, source_code):
+         self.source_code = source_code
+         if self.source_code != self.soup_mgr.source_code:
+             self.soup_mgr.update_source_code(source_code=self.source_code)
+         self.re_initialize()
+ 
+     def update_soup_manager(self, soup_manager):
+         self.soup_mgr = soup_manager
+         self.source_code = self.soup_mgr.source_code
+         self.re_initialize()
+ 
+     def update_desired(self, img_attr_value_desired=None, img_attr_value_undesired=None, link_attr_value_desired=None, link_attr_value_undesired=None, image_link_tags=None, img_link_attrs=None, link_tags=None, link_attrs=None, strict_order_tags=None, associated_data_attr=None, get_img=None):
+         self.strict_order_tags = strict_order_tags or self.strict_order_tags
+         self.img_attr_value_desired = img_attr_value_desired or self.img_attr_value_desired
+         self.img_attr_value_undesired = img_attr_value_undesired or self.img_attr_value_undesired
+         self.link_attr_value_desired = link_attr_value_desired or self.link_attr_value_desired
+         self.link_attr_value_undesired = link_attr_value_undesired or self.link_attr_value_undesired
+         self.image_link_tags = image_link_tags or self.image_link_tags
+         self.img_link_attrs = img_link_attrs or self.img_link_attrs
+         self.link_tags = link_tags or self.link_tags
+         self.link_attrs = link_attrs or self.link_attrs
+         self.associated_data_attr = associated_data_attr or self.associated_data_attr
+         self.get_img = get_img or self.get_img
+         self.re_initialize()
+ 
+     def find_all_desired(self, tag='img', attr='src', strict_order_tags=False, attr_value_desired=None, attr_value_undesired=None, associated_data_attr=None, get_img=None):
+         def make_list(obj):
+             if isinstance(obj, list) or obj is None:
+                 return obj
+             return [obj]
+         def get_desired_value(attr, attr_value_desired=None, attr_value_undesired=None):
+             if attr_value_desired:
+                 for value in attr_value_desired:
+                     if value not in attr:
+                         return False
+             if attr_value_undesired:
+                 for value in attr_value_undesired:
+                     if value in attr:
+                         return False
+             return True
+         attr_value_desired, attr_value_undesired, associated_data_attr, tags, attribs = make_list(attr_value_desired), make_list(attr_value_undesired), make_list(associated_data_attr), make_list(tag), make_list(attr)
+         desired_ls = []
+         associated_data = []
+         for i, tag in enumerate(tags):
+             attribs_list = attribs
+             if strict_order_tags:
+                 if len(attribs) <= i:
+                     attribs_list = [None]
+                 else:
+                     attribs_list = make_list(attribs[i])
+             for attr in attribs_list:
+                 for component in self.soup_mgr.soup.find_all(tag):
+                     if attr in component.attrs and get_desired_value(attr=component[attr], attr_value_desired=attr_value_desired, attr_value_undesired=attr_value_undesired):
+                         if component[attr] not in desired_ls:
+                             desired_ls.append(component[attr])
+                             associated_data.append({"value": component[attr]})
+                             if associated_data_attr:
+                                 for data in associated_data_attr:
+                                     if data in component.attrs:
+                                         associated_data[-1][data] = component.attrs[data]
+                                         if get_img and component.attrs[data]:
+                                             if data in get_img and len(component.attrs[data]) != 0:
+                                                 for each in self.soup_mgr.soup.find_all('img'):
+                                                     if 'alt' in each.attrs:
+                                                         if each.attrs['alt'] == component.attrs[data] and 'src' in each.attrs:
+                                                             associated_data[-1]['image'] = each.attrs['src']
+         # The final element carries the per-link associated-data records
+         desired_ls.append(associated_data)
+         return desired_ls
+ 
+     def find_all_domain(self):
+         domain = urlparse(self.url_mgr.url).netloc
+         domains_ls = [self.url_mgr.url]
+         for url in self.all_desired_links[:-1]:
+             if self.url_mgr.is_valid_url(url):
+                 parse = urlparse(url)
+                 comp_domain = parse.netloc
+                 if url not in domains_ls and comp_domain == domain:
+                     domains_ls.append(url)
+         return domains_ls
+ 
+     def find_all_desired_links(self, tag='img', attr='src', attr_value_desired=None, strict_order_tags=False, attr_value_undesired=None, associated_data_attr=None, all_desired=None, get_img=None):
+         all_desired = all_desired or self.find_all_desired(tag=tag, attr=attr, strict_order_tags=strict_order_tags, attr_value_desired=attr_value_desired, attr_value_undesired=attr_value_undesired, associated_data_attr=associated_data_attr, get_img=get_img)
+         associated_attrs = all_desired[-1]
+         valid_associated_attrs = []
+         desired_links = []
+         for i, attr in enumerate(all_desired[:-1]):
+             valid_attr = self.url_mgr.make_valid(attr, self.url_mgr.protocol + '://' + self.url_mgr.domain)
+             if valid_attr:
+                 desired_links.append(valid_attr)
+                 valid_associated_attrs.append(associated_attrs[i])
+                 valid_associated_attrs[-1]["link"] = valid_attr
+         desired_links.append(valid_associated_attrs)
+         return desired_links
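
Illustrative only (not part of the diff): a hedged sketch of pulling links with linkManager, assuming the classes above are importable; the exact package path is not shown in this diff.

from abstract_webtools import *  # assumed export path

link_mgr = linkManager("https://example.com")
page_links = link_mgr.all_desired_links[:-1]          # the final element holds the associated-attribute records
image_links = link_mgr.all_desired_image_links[:-1]
print(page_links[:5])
print(link_mgr.find_all_domain())                     # same-domain URLs found on the page
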
@@ -0,0 +1,46 @@
+ import json
+ import socket
+ class MySocketClient:
+     def __init__(self, ip_address=None, port=None, domain=None):
+         self.sock = None
+         self.ip_address = ip_address
+         self.port = port
+         self.domain = domain
+ 
+     def receive_data(self):
+         chunks = []
+         while True:
+             chunk = self.sock.recv(4096)
+             if chunk:
+                 chunks.append(chunk)
+             else:
+                 break
+         return b''.join(chunks).decode('utf-8')
+ 
+     def _parse_socket_response_as_json(self, data, *args, **kwargs):
+         return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
+ 
+     def process_data(self):
+         data = self.receive_data()
+         return self._parse_socket_response_as_json(data)
+ 
+     def _parse_json(self, json_string):
+         return json.loads(json_string)
+ 
+     def get_ip(self, domain=None):
+         try:
+             return socket.gethostbyname(domain if domain is not None else self.domain)
+         except socket.gaierror:
+             return None
+ 
+     def get_host_name(self, ip_address=None):
+         return socket.gethostbyaddr(ip_address if ip_address is not None else self.ip_address)
+ 
+     def toggle_sock(self):
+         # Close the socket if it is open; otherwise open it and connect to ip_address:port
+         if self.sock is not None:
+             self.sock.close()
+             self.sock = None
+         else:
+             self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+             if self.ip_address and self.port:
+                 self.sock.connect((self.ip_address, self.port))
+ 
+ class MySocketClientSingleton:
+     _instance = None
+     @staticmethod
+     def get_instance(ip_address='localhost', port=22, domain="example.com"):
+         if MySocketClientSingleton._instance is None:
+             MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address, port=port, domain=domain)
+         elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or MySocketClientSingleton._instance.domain != domain:
+             MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address, port=port, domain=domain)
+         return MySocketClientSingleton._instance
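
Illustrative only (not part of the diff): a hedged sketch of the socket helper after the cleanup above, assuming the class is importable; the exact package path is not shown in this diff.

from abstract_webtools import *  # assumed export path

client = MySocketClient(ip_address="127.0.0.1", port=80, domain="example.com")
print(client.get_ip())   # resolves the stored domain via socket.gethostbyname
client.toggle_sock()     # opens a TCP connection to ip_address:port
client.toggle_sock()     # second call closes it again
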