abstract-webtools 0.1.6.9__tar.gz → 0.1.6.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {abstract_webtools-0.1.6.9/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.11}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/setup.py +1 -1
  3. abstract_webtools-0.1.6.11/src/abstract_webtools/managers/crawlManager.py +172 -0
  4. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/requestManager.py +2 -1
  5. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
  6. abstract_webtools-0.1.6.9/src/abstract_webtools/managers/crawlManager.py +0 -250
  7. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/LICENSE +0 -0
  8. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/README.md +0 -0
  9. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/pyproject.toml +0 -0
  10. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/setup.cfg +0 -0
  11. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/__init__.py +0 -0
  12. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/abstract_webtools.py +0 -0
  13. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/big_user_agent_list.py +0 -0
  14. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/main.py +0 -0
  15. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/__init__.py +0 -0
  16. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/cipherManager.py +0 -0
  17. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/domainManager.py +0 -0
  18. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  19. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/linkManager.py +0 -0
  20. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  21. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/networkManager.py +0 -0
  22. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/seleniumManager.py +0 -0
  23. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/soupManager.py +0 -0
  24. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/sslManager.py +0 -0
  25. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  26. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/urlManager.py +0 -0
  27. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  28. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  29. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/soup_gui.py +0 -0
  30. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/url_grabber.py +0 -0
  31. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools/url_grabber_new.py +0 -0
  32. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
  33. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  34. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools.egg-info/requires.txt +0 -0
  35. {abstract_webtools-0.1.6.9 → abstract_webtools-0.1.6.11}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.6.9
+Version: 0.1.6.11
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.09',
+    version='0.1.6.11',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
@@ -0,0 +1,172 @@
+from abstract_webtools import * #.soupManager import *
+
+
+from urllib.parse import urlparse, urljoin
+import os
+import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
+
+class CrawlManager:
+    def __init__(self, url=None, req_mgr=None, url_mgr=None, source_code=None, parse_type="html.parser"):
+        self.url = url
+        self.source_code = source_code
+        self.parse_type = parse_type
+        self.url_mgr = url_mgr or urlManager(url=self.url)
+        self.req_mgr = req_mgr or requestManager(url_mgr=self.url_mgr)
+        self.get_new_source_and_url(url)
+
+    def get_new_source_and_url(self, url=None):
+        """Fetches new source code and response for a given URL."""
+        if url is None:
+            url = self.url
+        self.req_mgr.set_url(url)
+        self.source_code = self.req_mgr.source_code
+        self.response = self.req_mgr.response
+
+    def get_classes_and_meta_info(self):
+        """Returns unique classes and image links from meta tags."""
+        tag_name = 'meta'
+        class_name_1, class_name_2 = 'class', 'property'
+        class_value = 'og:image'
+        attrs = ['href', 'src']
+        unique_classes, images = discover_classes_and_images(self, tag_name, class_name_1, class_name_2, class_value, attrs)
+        return unique_classes, images
+
+    def extract_links_from_url(self, url=None):
+        """Extracts all href and src links from a given URL's source code."""
+        url = url or self.url_mgr.url
+        soup = BeautifulSoup(self.source_code, self.parse_type)
+        links = {'images': [], 'external_links': []}
+
+        if self.response:
+            for attr in ['href', 'src']:
+                for tag in soup.find_all(attrs={attr: True}):
+                    link = tag.get(attr)
+                    if link:
+                        absolute_link = urljoin(url, link)
+                        if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')):
+                            links['images'].append(absolute_link)
+                        elif urlparse(absolute_link).netloc != urlparse(url).netloc:
+                            links['external_links'].append(absolute_link)
+
+        return links
+
+    def get_all_website_links(self):
+        """Finds all internal links on the website that belong to the same domain."""
+        all_urls = [self.url_mgr.url]
+        domain = self.url_mgr.domain
+        all_attribs = get_all_attribute_values(self.url_mgr.url)
+
+        for href in all_attribs.get('href', []):
+            if not href or not self.url_mgr.is_valid_url(href):
+                continue
+            full_url = urljoin(self.url_mgr.url, href)
+            if domain in full_url and full_url not in all_urls:
+                all_urls.append(full_url)
+
+        return all_urls
+
+    def correct_xml(self, xml_string):
+        """Corrects XML by encoding special characters in <image:loc> tags."""
+        root = ET.fromstring(xml_string)
+        for image_loc in root.findall(".//image:loc", namespaces={'image': 'http://www.google.com/schemas/sitemap-image/1.1'}):
+            if '&' in image_loc.text:
+                image_loc.text = image_loc.text.replace('&', '&amp;')
+        return ET.tostring(root, encoding='utf-8').decode('utf-8')
+
+    def determine_values(self, url=None):
+        """Determines frequency and priority based on URL type."""
+        url = url or self.url
+        if 'blog' in url:
+            return ('weekly', '0.8') if '2023' in url else ('monthly', '0.6')
+        elif 'contact' in url:
+            return ('yearly', '0.3')
+        return ('weekly', '1.0')
+
+    def crawl(self, url, max_depth=3, depth=1, visited=None):
+        """Recursively crawls the site up to max_depth and returns valid internal links."""
+        visited = visited or set()
+        if depth > max_depth or url in visited:
+            return []
+
+        visited.add(url)
+        try:
+            soup = get_soup(url)
+            links = []
+            for tag in soup.find_all('a', href=True):
+                link = urljoin(url, tag['href'])
+                if urlparse(link).netloc == urlparse(url).netloc and link not in visited:
+                    links.append(link)
+                    self.crawl(link, max_depth, depth + 1, visited)
+            return links
+        except Exception as e:
+            print(f"Error crawling {url}: {e}")
+            return []
+
+    def get_meta_info(self, url=None):
+        """Fetches metadata, including title and meta tags, from the page."""
+        url = url or self.url
+        soup = BeautifulSoup(self.source_code, self.parse_type)
+        meta_info = {"title": None, "meta_tags": {}}
+
+        title_tag = soup.find("title")
+        if title_tag:
+            meta_info["title"] = title_tag.text
+
+        for meta in soup.find_all('meta'):
+            name = meta.get('name') or meta.get('property')
+            content = meta.get('content')
+            if name and content:
+                meta_info["meta_tags"][name] = content
+
+        return meta_info
+
+    def generate_sitemap(self):
+        """Generates a sitemap.xml file with URLs, images, change frequency, and priority."""
+        urls = self.get_all_website_links()
+        with open('sitemap.xml', 'w', encoding='utf-8') as f:
+            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+            f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" ')
+            f.write('xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n')
+
+            for url in urls:
+                f.write(f' <url>\n <loc>{url}</loc>\n')
+                frequency, priority = self.determine_values(url)
+                f.write(f' <changefreq>{frequency}</changefreq>\n')
+                f.write(f' <priority>{priority}</priority>\n')
+
+                images = [img for img in self.extract_links_from_url(url)['images']]
+                for img in images:
+                    escaped_img = img.replace('&', '&amp;')
+                    f.write(f' <image:image>\n <image:loc>{escaped_img}</image:loc>\n </image:image>\n')
+
+                f.write(' </url>\n')
+
+            f.write('</urlset>\n')
+
+        print(f'Sitemap saved to sitemap.xml with {len(urls)} URLs.')
+
+class crawlManagerSingleton():
+    _instance = None
+    @staticmethod
+    def get_instance(url=None,source_code=None,parse_type="html.parser"):
+        if crawlManagerSingleton._instance is None:
+            crawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
+        elif parse_type != crawlManagerSingleton._instance.parse_type or url != crawlManagerSingleton._instance.url or source_code != crawlManagerSingleton._instance.source_code:
+            crawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
+        return crawlManagerSingleton._instance
+def get_crawl_mgr(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
+
+    url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
+    url = get_url(url=url,url_mgr=url_mgr)
+    req_mgr=get_req_mgr(url=url,url_mgr=url_mgr,source_code=source_code)
+    source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr)
+    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,parse_type=parse_type)
+    crawl_mgr = crawlManager(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
+    return crawl_mgr
+def get_domain_crawl(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser",max_depth=3, depth=1):
+    crawl_mgr = get_crawl_mgr(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
+    url = get_url(url=url,url_mgr=url_mgr)
+    all_domain_links = crawl_mgr.crawl(url=url, max_depth=max_depth, depth=depth)
+    return all_domain_links
+get_domain_crawl(url='https://www.tradingview.com/symbols/SOLUSD/')
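For orientation, the new module can be driven either through the CrawlManager class or the get_domain_crawl helper defined at the bottom of the hunk. A minimal usage sketch follows; it assumes the names resolve as shown in the diff (note that, as written, importing the module also executes the module-level get_domain_crawl call on its last line), and the example URL is purely illustrative:

    from abstract_webtools.managers.crawlManager import CrawlManager, get_domain_crawl

    # Collect same-domain links up to two levels deep (example URL, not from the package).
    links = get_domain_crawl(url='https://example.com/', max_depth=2)

    # Or drive the manager directly and write sitemap.xml to the current directory.
    mgr = CrawlManager(url='https://example.com/')
    mgr.generate_sitemap()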
@@ -87,7 +87,8 @@ class requestManager:
                  timeout = None,
                  last_request_time=None,
                  max_retries=None,
-                 request_wait_limit=None):
+                 request_wait_limit=
+                 None):
         self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
         self.url=get_url(url=url,url_mgr=self.url_mgr)
         self._url_mgr = self.url_mgr
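The requestManager.py change is behavior-preserving: the default for request_wait_limit is simply re-wrapped onto a second line. Because the break sits inside the still-open parentheses of the def, Python joins it implicitly, so the signature is unchanged. A reduced illustration (simplified signature, not the package's real one):

    def example(request_wait_limit=
                None):
        return request_wait_limit

    assert example() is None  # identical to writing request_wait_limit=None on one line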
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.6.9
+Version: 0.1.6.11
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
@@ -1,250 +0,0 @@
-from .soupManager import *
-
-
-class crawlManager():
-    def __init__(self,url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
-        self.url=url
-        self.source_code=source_code
-        self.parse_type=parse_type
-        self.url_mgr = url_mgr or urlManager(url=self.url)
-        self.req_mgr = requestManager(url_mgr=self.url_mgr)
-        self.get_new_source_and_url(url)
-    def get_new_source_and_url(self,url=None):
-        if url == None:
-            url = self.url
-        self.response = self.req_mgr.response
-        self.source_code=self.req_mgr.source_code
-    def get_classes_and_meta_info():
-        class_name_1,class_name_2, class_value = 'meta','class','property','og:image'
-        attrs = 'href','src'
-        unique_classes, images=discover_classes_and_images(self,tag_name,class_name_1,class_name_2,class_value,attrs)
-        return unique_classes, images
-    def extract_links_from_url(self,url=None):
-        """
-        Extracts all href and src links from a given URL's source code.
-
-        Args:
-            base_url (str): The URL from which to extract links.
-
-        Returns:
-            dict: Dictionary containing image links and external links under the parent page.
-        """
-        url = url or self.url
-        agg_js = {'images':[],'external_links':[]}
-
-        if self.response != None:
-            attrs = 'href','src'
-            href_links,src_links='',''
-            links = [href_links,src_links]
-            for i,each in enumerate(attrs):
-                links[i]= [a[attr[i]] for a in get_find_all_with_attributes(self, attrs[i])]
-            # Convert all links to absolute links
-            absolute_links = [(url, link) for link in links[0] + links[1]]
-            # Separate images and external links
-            images = [link for link in absolute_links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
-            external_links = [link for link in absolute_links if urlparse(link).netloc != urlparse(url).netloc]
-            agg_js['images']=images
-            agg_js['external_links']=external_links
-
-        return agg_js
-    def get_all_website_links(self,tag="a",attr="href") -> list:
-        """
-        Returns all URLs that are found on the specified URL and belong to the same website.
-
-        Args:
-            url (str): The URL to search for links.
-
-        Returns:
-            list: A list of URLs that belong to the same website as the specified URL.
-        """
-        all_urls=[self.url_mgr.url]
-        domain = self.url_mgr.domain
-        all_attribs = get_all_attribute_values(self.url_mgr.url)
-        for href in all_attribs.get('href',[]):
-            if href == "" or href is None:
-                # href empty tag
-                continue
-            href=self.url_mgr.get_relative_href(self.url_mgr.url,href)
-            if not self.url_mgr.is_valid_url(href):
-                # not a valid URL
-                continue
-            if href in all_urls:
-                # already in the set
-                continue
-            if domain not in href:
-                # external link
-                continue
-            all_urls.append(href)
-
-        return all_urls
-
-    def correct_xml(xml_string):
-        # Parse the XML string
-        root = ET.fromstring(xml_string)
-
-        # Loop through each <image:loc> element and correct its text if needed
-        for image_loc in root.findall(".//image:loc", namespaces={'image': 'http://www.google.com/schemas/sitemap-image/1.1'}):
-            # Replace '&' with '&amp;' in the element's text
-            if '&' in image_loc.text:
-                image_loc.text = image_loc.text.replace('&', '&amp;')
-
-        # Convert the corrected XML back to string
-        corrected_xml = ET.tostring(root, encoding='utf-8').decode('utf-8')
-        return corrected_xml
-
-
-    def determine_values(self,url=None):
-        # This is just a mockup. In a real application, you'd analyze the URL or its content.
-        url = url or self.url
-        # Assuming a blog site
-        if 'blog' in url:
-            if '2023' in url: # Assuming it's a current year article
-                return ('weekly', '0.8')
-            else:
-                return ('monthly', '0.6')
-        elif 'contact' in url:
-            return ('yearly', '0.3')
-        else: # Homepage or main categories
-            return ('weekly', '1.0')
-    def crawl(self,url, max_depth=3, depth=1):
-        visited=set()
-        if depth > max_depth:
-            return []
-
-        if url in visited:
-            return []
-
-        visited.add(url)
-
-        try:
-            soup = get_soup(url)
-            links = [a['href'] for a in soup.find_all('a', href=True)]
-            valid_links = []
-
-            for link in links:
-                parsed_link = urlparse(link)
-                base_url = "{}://{}".format(parsed_link.scheme, parsed_link.netloc)
-
-                if base_url == url: # Avoiding external URLs
-                    final_link = urljoin(url, parsed_link.path)
-                    if final_link not in valid_links:
-                        valid_links.append(final_link)
-
-            for link in valid_links:
-                crawl(link, max_depth, depth+1)
-
-            return valid_links
-
-        except Exception as e:
-            print(f"Error crawling {url}: {e}")
-            return []
-
-
-    # Define or import required functions here, like get_all_website_links, determine_values,
-    # discover_classes_and_meta_images, and extract_links_from_url.
-    def get_meta_info(self,url=None):
-        url = url or self.url
-        soup_mgr = soupManager(url=url)
-        meta_info = {}
-        # Fetch the title if available
-        meta_tags = soup_mgr.find_all("meta")
-        url = eatAll(str(url),['',' ','\n','\t','\\','/'])
-        attribs = get_all_attribute_values(url)
-        soup = get_soup(url)
-
-        for meta_tag in meta_tags:
-            for attr, values in meta_tag.attrs.items():
-
-                if attr not in meta_tag:
-                    meta_tag[attr] = []
-                if values not in meta_tag[attr]:
-                    meta_tag[attr].append(values)
-        title_tag = soup.find_all("title")
-        if title_tag:
-            meta_info["title"] = title_tag
-        # Fetch meta tags
-        for meta_tag in soup.find_all('meta'):
-            name = meta_tag.get('name') or meta_tag.get('property')
-            if name:
-                content = meta_tag.get('content')
-                if content:
-                    meta_info[name] = content
-
-        return meta_info
-    def generate_sitemap(self,domain):
-
-        with open('sitemap.xml', 'w', encoding='utf-8') as f:
-            string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n'
-
-            for url in self.get_all_website_links():
-                string += f' <url>\n <loc>{url}</loc>\n'
-                preprocess=[]
-                self.get_new_source_and_url(url=url)
-                links = get_all_attribute_values(url)
-                images = [link for link in links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
-
-                for img in images:
-                    if str(img).lower() not in preprocess:
-                        try:
-                            escaped_img = img.replace('&', '&amp;')
-
-                            str_write = f' <image:image>\n <image:loc>{escaped_img}</image:loc>\n </image:image>\n'
-                            string += str_write
-                        except:
-                            pass
-                        preprocess.append(str(img).lower())
-                frequency, priority = self.determine_values(url)
-                string += f' <changefreq>{frequency}</changefreq>\n'
-                string += f' <priority>{priority}</priority>\n'
-                string += f' </url>\n'
-
-            string += '</urlset>\n'
-            f.write(string)
-        # Output summary
-        print(f'Sitemap saved to sitemap.xml with {len(urls)} URLs.')
-
-        # Output class and link details
-        for url in urls:
-            print(f"\nDetails for {url}:")
-            classes, meta_img_refs = discover_classes_and_meta_images(url)
-
-            print("\nClasses with href or src attributes:")
-            for class_name in classes:
-                print(f"\t{class_name}")
-
-            print("\nMeta Image References:")
-            for img_ref in meta_img_refs:
-                print(f"\t{img_ref}")
-
-            links = extract_links_from_url(url)
-
-            print("\nImages:")
-            for img in links['images']:
-                print(f"\t{img}")
-
-            print("\nExternal Links:")
-            for ext_link in links['external_links']:
-                print(f"\t{ext_link}")
-
-class crawlManagerSingleton():
-    _instance = None
-    @staticmethod
-    def get_instance(url=None,source_code=None,parse_type="html.parser"):
-        if crawlManagerSingleton._instance is None:
-            crawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
-        elif parse_type != crawlManagerSingleton._instance.parse_type or url != crawlManagerSingleton._instance.url or source_code != crawlManagerSingleton._instance.source_code:
-            crawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
-        return crawlManagerSingleton._instance
-def get_crawl_mgr(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
-    url = get_url(url=url,url_mgr=url_mgr)
-    url_mgr = get_url(url=url,url_mgr=url_mgr)
-    req_mgr=get_req_mgr(url=url,url_mgr=url_mgr,source_code=source_code)
-    source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr)
-    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,parse_type=parse_type)
-    crawl_mgr = crawlManager(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
-    return crawl_mgr
-def get_domain_crawl(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser",max_depth=3, depth=1):
-    crawl_mgr = get_crawl_mgr(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
-    url = get_url(url=url,url_mgr=url_mgr)
-    all_domain_links = crawl_mgr.crawl(url=url, max_depth=max_depth, depth=depth)
-    return all_domain_links
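A functional note on the replacement: the removed crawl() above re-created visited = set() inside every call and recursed through a bare crawl(...) name, so already-seen URLs were not tracked across recursion levels, whereas the new crawlManager.py threads a single visited set through self.crawl(...). A reduced, standalone sketch of that pattern (fetch_links is a hypothetical stand-in for the soup-based link extraction):

    def crawl(url, max_depth=3, depth=1, visited=None):
        visited = visited or set()            # one set shared across the whole recursion
        if depth > max_depth or url in visited:
            return []
        visited.add(url)
        links = []
        for link in fetch_links(url):         # hypothetical helper returning same-domain links
            if link not in visited:
                links.append(link)
                crawl(link, max_depth, depth + 1, visited)
        return links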