abstract-webtools 0.1.6.136__py3-none-any.whl → 0.1.6.138__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,169 +0,0 @@
1
- import requests
2
- from bs4 import BeautifulSoup
3
- from urllib.parse import urljoin, urlparse
4
- import os
5
- import shutil
6
- import time
7
- from abstract_webtools import *
8
-
9
-
10
- # Import your custom classes/functions
11
- # from your_module import linkManager, get_soup_mgr
12
-
13
- # Configuration
14
def normalize_url(url, base_url):
    """
    Resolve *url* against *base_url*, drop any fragment, and keep only
    URLs that stay inside the base domain.

    Returns the normalized absolute URL, or None when the resolved URL
    falls outside *base_url*.
    """
    # Strip a duplicated base prefix so the remainder re-resolves cleanly.
    candidate = url[len(base_url):] if url.startswith(base_url) else url

    # Discard the fragment, then resolve relative paths against the base.
    resolved = urljoin(base_url, candidate.split('#')[0])

    # Reject anything that escaped the base domain.
    return resolved if resolved.startswith(base_url) else None
30
-
31
-
32
def is_valid_url(url, base_domain):
    """Return True when *url* is an http(s) URL whose host equals *base_domain*."""
    parts = urlparse(url)
    if parts.netloc != base_domain:
        return False
    return parts.scheme in ('http', 'https')
38
def save_page(url, content, output_dir):
    """
    Write *content* to a local file under *output_dir* mirroring the URL path.

    Directory-style URLs (empty path or trailing '/') get an index.html;
    extensionless paths get a '.html' suffix appended.
    """
    rel_path = urlparse(url).path.lstrip('/')

    if not rel_path or rel_path.endswith('/'):
        rel_path = os.path.join(rel_path, 'index.html')
    elif not os.path.splitext(rel_path)[1]:
        rel_path += '.html'

    destination = os.path.join(output_dir, rel_path)
    os.makedirs(os.path.dirname(destination), exist_ok=True)

    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write(content)
    print(f"Saved page: {destination}")
56
# Cache of asset URLs already fetched, shared across save_asset() calls.
# Bug fix: this set was referenced but never defined anywhere (NameError
# on every call, previously masked by callers' broad except clauses).
downloaded_assets = set()


def save_asset(asset_url, base_url, output_dir, session):
    """
    Download an asset (image/CSS/JS) once and store it under *output_dir*.

    The URL is normalized against *base_url* first. Skips: assets rejected
    by normalize_url (None), assets already downloaded, and URLs with an
    empty path. Download failures are reported and swallowed so a single
    bad asset does not abort the crawl.
    """
    asset_url = normalize_url(asset_url, base_url)
    # Bug fix: normalize_url may return None for off-domain URLs; the old
    # code passed None straight into urlparse.
    if asset_url is None or asset_url in downloaded_assets:
        return
    downloaded_assets.add(asset_url)

    asset_path = urlparse(asset_url).path.lstrip('/')
    if not asset_path:
        return  # Nothing to write for a bare-domain URL.

    asset_full_path = os.path.join(output_dir, asset_path)
    os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)

    try:
        response = session.get(asset_url, stream=True)
        response.raise_for_status()
        # Stream straight from the socket to disk without buffering in memory.
        with open(asset_full_path, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
        print(f"Saved asset: {asset_full_path}")
    except Exception as e:
        # Best-effort download: report and continue crawling.
        print(f"Failed to save asset {asset_url}: {e}")
81
class usurpManager():
    """
    Crawl a website starting at *url* and mirror its pages and assets into a
    local directory, rewriting asset references to local paths.

    Args:
        url (str): Start URL of the site to mirror.
        output_dir (str or None): Destination directory (default 'download_site').
        max_depth (int or None): Maximum crawl depth (default 5).
        wait_between_requests (float or None): Seconds to sleep between page
            fetches (default 1).
        operating_system / browser / version / user_agent: Forwarded to
            UserAgentManager to build the User-Agent string.
        website_bot (str or None): Bot-info URL appended to the User-Agent.
    """

    def __init__(self, url, output_dir=None, max_depth=None, wait_between_requests=None,
                 operating_system=None, browser=None, version=None, user_agent=None,
                 website_bot=None):
        self.url = url
        website_bot = website_bot or 'http://yourwebsite.com/bot'
        self.user_agent_mgr = UserAgentManager(operating_system=operating_system,
                                               browser=browser, version=version,
                                               user_agent=user_agent)
        self.BASE_URL = urlManager(url=self.url).url  # Canonicalized site root.
        self.OUTPUT_DIR = output_dir or 'download_site'
        self.MAX_DEPTH = max_depth or 5  # Adjust as needed
        self.WAIT_BETWEEN_REQUESTS = wait_between_requests or 1  # Seconds between requests.
        USER_AGENT = self.user_agent_mgr.get_user_agent()
        # Bug fix: removed the stray ')' that used to terminate this string.
        self.USER_AGENT = f"{USER_AGENT};{website_bot}"  # Customize as needed

        # Track what has been processed to avoid loops and duplicate work.
        self.visited_pages = set()
        self.downloaded_assets = set()

        # Session with custom headers used for every page fetch.
        self.session = requests.Session()
        self.session.headers.update({
            # Use the composed UA (with bot info) — it was built but unused before.
            'User-Agent': self.USER_AGENT,
            'Accept-Language': 'en-US,en;q=0.5',
            "Access-Control-Allow-Origin": "*"})

    def process_page(self, url, depth, base_domain):
        """
        Process a single page: download its assets, save the rewritten HTML,
        and recursively crawl same-domain links up to MAX_DEPTH.
        """
        print(url)
        if url in self.visited_pages or depth > self.MAX_DEPTH:
            return
        self.visited_pages.add(url)

        try:
            # Fetch once through the configured session to confirm the page is
            # reachable; parsing is delegated to get_soup_mgr below.
            response = self.session.get(url)
            response.raise_for_status()

            soup_mgr = get_soup_mgr(url=url)
            soup = soup_mgr.soup
            all_attributes = soup_mgr.get_all_attribute_values()

            # Rewrite asset links (images, scripts, stylesheets) to local paths.
            for tag in soup.find_all(['img', 'script', 'link']):
                attr = 'src' if tag.name != 'link' else 'href'
                asset_url = tag.get(attr)
                if asset_url:
                    full_asset_url = normalize_url(asset_url, url)
                    parsed_asset_url = urlparse(full_asset_url)

                    if is_valid_url(full_asset_url, base_domain):
                        # Bug fix: save_asset() takes (url, base, output_dir,
                        # session); the output directory argument was missing,
                        # so every call raised TypeError (swallowed below).
                        save_asset(full_asset_url, self.url, self.OUTPUT_DIR, self.session)
                        # Point the tag at the mirrored local copy.
                        tag[attr] = '/' + parsed_asset_url.path.lstrip('/')

            # Save the modified page.
            save_page(url, str(soup), self.OUTPUT_DIR)

            # Crawl every same-domain link found on this page.
            link_mgr = linkManager(url=url)
            for link_url in link_mgr.find_all_domain():
                normalized_link = normalize_url(link_url, url)
                if is_valid_url(normalized_link, base_domain):
                    time.sleep(self.WAIT_BETWEEN_REQUESTS)
                    self.process_page(normalized_link, depth + 1, base_domain)

        except Exception as e:
            # Best-effort crawl: report the failure and keep going.
            print(f"Failed to process page {url}: {e}")

    def main(self):
        """Create the output directory and start crawling from BASE_URL."""
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        base_domain = urlparse(self.BASE_URL).netloc
        self.process_page(self.BASE_URL, 0, base_domain)
        print("Website copying completed.")
164
def test_download(url=None, directory=None):
    """
    Smoke-test the site mirror: crawl *url* into *directory*.

    Args:
        url (str or None): Start URL; defaults to a sample article.
        directory (str or None): Output directory; defaults to ./testit.
    """
    # Bug fix: the body treated both parameters as optional (`url or ...`)
    # but the signature required them — calling test_download() raised
    # TypeError. They now default to None.
    url = url or 'https://algassert.com/quantum/2016/01/07/Delayed-Choice-Quantum-Erasure.html'
    output_dir = directory or os.path.join(os.getcwd(), 'testit')
    os.makedirs(output_dir, exist_ok=True)
    site_mgr = usurpManager(url, output_dir)
    site_mgr.main()
@@ -1,189 +0,0 @@
1
- from ..abstract_webtools import *
2
- from .urlManager import *
3
- from .requestManager import *
4
- from .soupManager import *
5
def get_url(url=None, url_mgr=None):
    """
    Return a resolved URL string.

    Builds a fresh urlManager from *url* when given; otherwise falls back
    to an existing *url_mgr*. Returns None when neither is provided.
    """
    if url:
        return urlManager(url).url
    if url_mgr:
        return url_mgr.url
    return None
11
def get_url_mgr(url=None, url_mgr=None):
    """
    Return a urlManager instance.

    A given *url* always wins and produces a fresh manager; otherwise the
    existing *url_mgr* is passed through (None when neither is provided).
    """
    if url:
        return urlManager(url)
    return url_mgr
17
class linkManager:
    """
    LinkManager is a class for managing and extracting links and image links from a web page.

    Args:
        url (str): The URL of the web page (default is "https://example.com").
        source_code (str or None): The source code of the web page (default is None).
        url_mgr (UrlManager or None): An instance of UrlManager (default is None).
        request_manager (requestManager or None): An instance of requestManager (default is None).
        soup_manager (SoupManager or None): An instance of SoupManager (default is None).
        image_link_tags (str): HTML tags to identify image links (default is 'img').
        img_link_attrs (str): HTML attributes to identify image link URLs (default is 'src').
        link_tags (str): HTML tags to identify links (default is 'a').
        link_attrs (str): HTML attributes to identify link URLs (default is 'href').
        strict_order_tags (bool): Flag to indicate if tags and attributes should be matched strictly (default is False).
        img_attr_value_desired (list or None): Desired attribute values for image links (default is None).
        img_attr_value_undesired (list or None): Undesired attribute values for image links (default is None).
        link_attr_value_desired (list or None): Desired attribute values for links (default is None).
        link_attr_value_undesired (list or None): Undesired attribute values for links (default is None).
        associated_data_attr (list): HTML attributes to associate with the extracted links (default is ["data-title", 'alt', 'title']).
        get_img (list): HTML attributes used to identify associated images (default is ["data-title", 'alt', 'title']).

    Methods:
        re_initialize(): Reinitialize the LinkManager with the current settings.
        update_url_mgr(url_mgr): Update the URL manager with a new instance.
        update_url(url): Update the URL and reinitialize the LinkManager.
        update_source_code(source_code): Update the source code and reinitialize the LinkManager.
        update_soup_manager(soup_manager): Update the SoupManager and reinitialize the LinkManager.
        update_desired(...): Update the desired settings and reinitialize the LinkManager.
        find_all_desired(...): Find all desired links or image links based on the specified criteria.
        find_all_domain(): Find all unique domain names in the extracted links.

    Note:
        - The LinkManager class helps manage and extract links and image links from web pages.
        - The class provides flexibility in specifying criteria for link extraction.
        - Both all_desired_image_links and all_desired_links end with ONE extra
          trailing element: the list of associated-data dicts. Consumers here
          slice it off with [:-1] (see find_all_domain).
    """
    # NOTE(review): associated_data_attr and get_img use shared mutable list
    # defaults — safe only as long as no caller mutates them in place.
    def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None,soup_manager=None,image_link_tags='img',img_link_attrs='src',link_tags='a',link_attrs='href',strict_order_tags=False,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title']):
        # Build (or adopt) the url/request/soup managers this instance works through.
        self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
        self.url=self.url_mgr.url
        self.req_mgr = get_req_mgr(req_mgr=req_mgr,url=self.url,url_mgr=self.url_mgr,source_code=source_code)
        # Prefer caller-supplied source, then the fetched text, then raw bytes.
        self.source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
        self.soup_mgr = get_soup_mgr(req_mgr=self.req_mgr,url_mgr=self.url_mgr,source_code = source_code)

        # Extraction criteria; see the class docstring for each setting's meaning.
        self.strict_order_tags=strict_order_tags
        self.image_link_tags=image_link_tags
        self.img_link_attrs=img_link_attrs
        self.link_tags=link_tags
        self.link_attrs=link_attrs
        self.img_attr_value_desired=img_attr_value_desired
        self.img_attr_value_undesired=img_attr_value_undesired
        self.link_attr_value_desired=link_attr_value_desired
        self.link_attr_value_undesired=link_attr_value_undesired
        self.associated_data_attr=associated_data_attr
        self.get_img=get_img
        # Eagerly extract both result lists; each carries the trailing
        # associated-data element described in the class docstring.
        self.all_desired_image_links=self.find_all_desired_links(tag=self.image_link_tags,
                                                                 attr=self.img_link_attrs,
                                                                 attr_value_desired=self.img_attr_value_desired,
                                                                 attr_value_undesired=self.img_attr_value_undesired)
        self.all_desired_links=self.find_all_desired_links(tag=self.link_tags,
                                                           attr=self.link_attrs,
                                                           attr_value_desired=self.link_attr_value_desired,
                                                           attr_value_undesired=self.link_attr_value_undesired,
                                                           associated_data_attr=self.associated_data_attr,
                                                           get_img=get_img)
    def re_initialize(self):
        """Recompute both cached link lists from the current settings."""
        self.all_desired_image_links=self.find_all_desired_links(tag=self.image_link_tags,attr=self.img_link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.img_attr_value_desired,attr_value_undesired=self.img_attr_value_undesired)
        self.all_desired_links=self.find_all_desired_links(tag=self.link_tags,attr=self.link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.link_attr_value_desired,attr_value_undesired=self.link_attr_value_undesired,associated_data_attr=self.associated_data_attr,get_img=self.get_img)
    def update_url_mgr(self,url_mgr):
        """Swap in a new url manager, propagate it to the sub-managers, and re-extract."""
        self.url_mgr=url_mgr
        self.url=self.url_mgr.url
        self.req_mgr.update_url_mgr(url_mgr=self.url_mgr)
        self.soup_mgr.update_url_mgr(url_mgr=self.url_mgr)
        self.source_code=self.soup_mgr.source_code
        self.re_initialize()
    def update_url(self,url):
        """Point at a new URL, refresh all managers, and re-extract links."""
        self.url=url
        self.url_mgr.update_url(url=self.url)
        # Re-read the URL so any normalization done by url_mgr is picked up.
        self.url=self.url_mgr.url
        self.req_mgr.update_url(url=self.url)
        self.soup_mgr.update_url(url=self.url)
        self.source_code=self.soup_mgr.source_code
        self.re_initialize()
    def update_source_code(self,source_code):
        """Replace the cached source code (syncing the soup manager) and re-extract."""
        self.source_code=source_code
        if self.source_code != self.soup_mgr.source_code:
            self.soup_mgr.update_source_code(source_code=self.source_code)
        self.re_initialize()
    def update_soup_manager(self,soup_manager):
        """Adopt an external soup manager and re-extract links from its source."""
        self.soup_mgr=soup_manager
        self.source_code=self.soup_mgr.source_code
        self.re_initialize()
    def update_desired(self,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,image_link_tags=None,img_link_attrs=None,link_tags=None,link_attrs=None,strict_order_tags=None,associated_data_attr=None,get_img=None):
        """Overwrite any provided extraction settings and re-extract links.

        NOTE(review): the `x or self.x` pattern means falsy values (False,
        '', []) can never overwrite an existing truthy setting — e.g.
        passing strict_order_tags=False is silently ignored.
        """
        self.strict_order_tags = strict_order_tags or self.strict_order_tags
        self.img_attr_value_desired=img_attr_value_desired or self.img_attr_value_desired
        self.img_attr_value_undesired=img_attr_value_undesired or self.img_attr_value_undesired
        self.link_attr_value_desired=link_attr_value_desired or self.link_attr_value_desired
        self.link_attr_value_undesired=link_attr_value_undesired or self.link_attr_value_undesired
        self.image_link_tags=image_link_tags or self.image_link_tags
        self.img_link_attrs=img_link_attrs or self.img_link_attrs
        self.link_tags=link_tags or self.link_tags
        self.link_attrs=link_attrs or self.link_attrs
        self.associated_data_attr=associated_data_attr or self.associated_data_attr
        self.get_img=get_img or self.get_img
        self.re_initialize()
    def find_all_desired(self,tag='img',attr='src',strict_order_tags=False,attr_value_desired=None,attr_value_undesired=None,associated_data_attr=None,get_img=None):
        """Collect unique attribute values from matching tags in the soup.

        Returns a list of the matching attribute values with ONE extra
        trailing element: the list of associated-data dicts (one dict per
        collected value, in the same order). Callers must strip that
        trailing element before treating the result as plain links.
        """
        def make_list(obj):
            # Normalize scalars to single-item lists; lists and None pass through.
            if isinstance(obj,list) or obj==None:
                return obj
            return [obj]
        def get_desired_value(attr,attr_value_desired=None,attr_value_undesired=None):
            # Keep a value only if it contains every desired substring and
            # none of the undesired ones.
            if attr_value_desired:
                for value in attr_value_desired:
                    if value not in attr:
                        return False
            if attr_value_undesired:
                for value in attr_value_undesired:
                    if value in attr:
                        return False
            return True
        attr_value_desired,attr_value_undesired,associated_data_attr,tags,attribs=make_list(attr_value_desired),make_list(attr_value_undesired),make_list(associated_data_attr),make_list(tag),make_list(attr)
        desired_ls = []
        assiciated_data=[]  # [sic] one dict per collected value, appended last
        for i,tag in enumerate(tags):
            attribs_list=attribs
            if strict_order_tags:
                # Strict mode pairs tags[i] with attribs[i]; a missing
                # position falls back to [None] (matches nothing).
                if len(attribs)<=i:
                    attribs_list=[None]
                else:
                    attribs_list=make_list(attribs[i])
            for attr in attribs_list:
                for component in self.soup_mgr.soup.find_all(tag):
                    if attr in component.attrs and get_desired_value(attr=component[attr],attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired):
                        # De-duplicate on the attribute value itself.
                        if component[attr] not in desired_ls:
                            desired_ls.append(component[attr])
                            assiciated_data.append({"value":component[attr]})
                            if associated_data_attr:
                                for data in associated_data_attr:
                                    if data in component.attrs:
                                        assiciated_data[-1][data]=component.attrs[data]
                                        if get_img and component.attrs[data]:
                                            # If this attribute is one of the get_img
                                            # keys, look for an <img> whose alt text
                                            # matches it and record that img's src.
                                            if data in get_img and len(component.attrs[data])!=0:
                                                for each in self.soup_mgr.soup.find_all('img'):
                                                    if 'alt' in each.attrs:
                                                        if each.attrs['alt'] == component.attrs[data] and 'src' in each.attrs:
                                                            assiciated_data[-1]['image']=each.attrs['src']
        # Trailing element: the associated-data list (see docstring).
        desired_ls.append(assiciated_data)
        return desired_ls
    def find_all_domain(self):
        """Return the page URL plus every extracted link on the same domain."""
        domain = urlparse(self.url_mgr.url).netloc
        domains_ls=[self.url_mgr.url]
        # [:-1] skips the trailing associated-data element of all_desired_links.
        for url in self.all_desired_links[:-1]:
            if self.url_mgr.is_valid_url(url):
                parse = urlparse(url)
                comp_domain = parse.netloc
                if url not in domains_ls and comp_domain == domain:
                    domains_ls.append(url)
        return domains_ls
    def find_all_desired_links(self,tag='img', attr='src',attr_value_desired=None,strict_order_tags=False,attr_value_undesired=None,associated_data_attr=None,all_desired=None,get_img=None):
        """Filter find_all_desired() output to values resolvable as valid URLs.

        Each kept value gets its associated-data dict annotated with the
        resolved "link"; the filtered associated-data list is appended as
        the final element of the returned list (same trailing-element
        convention as find_all_desired()).
        """
        all_desired = all_desired or self.find_all_desired(tag=tag,attr=attr,strict_order_tags=strict_order_tags,attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired,associated_data_attr=associated_data_attr,get_img=get_img)
        assiciated_attrs = all_desired[-1]  # trailing associated-data element
        valid_assiciated_attrs = []
        desired_links=[]
        for i,attr in enumerate(all_desired[:-1]):
            # Resolve each raw attribute value into an absolute URL on this domain.
            valid_attr=self.url_mgr.make_valid(attr,self.url_mgr.protocol+'://'+self.url_mgr.domain)
            if valid_attr:
                desired_links.append(valid_attr)
                valid_assiciated_attrs.append(assiciated_attrs[i])
                valid_assiciated_attrs[-1]["link"]=valid_attr
        desired_links.append(valid_assiciated_attrs)
        return desired_links
187
-
188
-
189
-