abstract-webtools 0.1.6.51__py3-none-any.whl → 0.1.6.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
abstract_webtools/abstract_usurpit.py (new file)
@@ -0,0 +1,169 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ import os
+ import shutil
+ import time
+ from abstract_webtools import *
+ 
+ 
+ # Import your custom classes/functions
+ # from your_module import linkManager, get_soup_mgr
+ 
+ # Configuration
+ def normalize_url(url, base_url):
+     """
+     Normalize and resolve relative URLs, ensuring proper domain and format.
+     """
+     # If the URL already starts with the base URL, strip that prefix before re-resolving
+     if url.startswith(base_url):
+         url = url[len(base_url):]
+ 
+     # Resolve the URL against the base URL
+     normalized_url = urljoin(base_url, url.split('#')[0])
+ 
+     # Ensure only URLs belonging to the base domain are kept
+     if not normalized_url.startswith(base_url):
+         return None
+ 
+     return normalized_url
+ 
+ 
+ def is_valid_url(url, base_domain):
+     """
+     Check if the URL is valid and belongs to the same domain.
+     """
+     if not url:
+         return False  # normalize_url may return None for off-domain URLs
+     parsed = urlparse(url)
+     return parsed.scheme in ('http', 'https') and parsed.netloc == base_domain
+ def save_page(url, content, output_dir):
+     """
+     Save HTML page to local directory.
+     """
+     parsed_url = urlparse(url)
+     page_path = parsed_url.path.lstrip('/')
+ 
+     if not page_path or page_path.endswith('/'):
+         page_path = os.path.join(page_path, 'index.html')
+     elif not os.path.splitext(page_path)[1]:
+         page_path += '.html'
+ 
+     page_full_path = os.path.join(output_dir, page_path)
+     os.makedirs(os.path.dirname(page_full_path), exist_ok=True)
+ 
+     with open(page_full_path, 'w', encoding='utf-8') as f:
+         f.write(content)
+     print(f"Saved page: {page_full_path}")
+ downloaded_assets = set()  # module-level registry of assets already fetched; save_asset relies on this name
+ def save_asset(asset_url, base_url, output_dir, session):
+     """
+     Download and save assets like images, CSS, JS files.
+     """
+     asset_url = normalize_url(asset_url, base_url)
+     if asset_url is None or asset_url in downloaded_assets:
+         return
+     downloaded_assets.add(asset_url)
+ 
+     parsed_url = urlparse(asset_url)
+     asset_path = parsed_url.path.lstrip('/')
+     if not asset_path:
+         return  # Skip if asset path is empty
+ 
+     asset_full_path = os.path.join(output_dir, asset_path)
+     os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
+ 
+     try:
+         response = session.get(asset_url, stream=True)
+         response.raise_for_status()
+         with open(asset_full_path, 'wb') as f:
+             shutil.copyfileobj(response.raw, f)
+         print(f"Saved asset: {asset_full_path}")
+     except Exception as e:
+         print(f"Failed to save asset {asset_url}: {e}")
+ class usurpManager():
+     def __init__(self, url, output_dir=None, max_depth=None, wait_between_requests=None, operating_system=None, browser=None, version=None, user_agent=None, website_bot=None):
+         self.url = url
+         website_bot = website_bot or 'http://yourwebsite.com/bot'
+         self.user_agent_mgr = UserAgentManager(operating_system=operating_system, browser=browser, version=version, user_agent=user_agent)
+         self.BASE_URL = urlManager(url=self.url).url  # Replace with your website's URL
+         self.OUTPUT_DIR = output_dir or 'download_site'
+         self.MAX_DEPTH = max_depth or 5  # Adjust as needed
+         self.WAIT_BETWEEN_REQUESTS = wait_between_requests or 1  # Seconds to wait between requests
+         USER_AGENT = self.user_agent_mgr.get_user_agent()
+         self.USER_AGENT = f"{USER_AGENT};{website_bot}"  # Customize as needed
+         # Track pages visited and assets downloaded by this instance
+         self.visited_pages = set()
+         self.downloaded_assets = set()
+ 
+         # Session with custom headers
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': self.USER_AGENT,
+             'Accept-Language': 'en-US,en;q=0.5',
+             "Access-Control-Allow-Origin": "*"})
+ 
+     def process_page(self, url, depth, base_domain):
+         """
+         Process a single page: download assets, save HTML, and crawl links.
+         """
+         print(url)
+         if url in self.visited_pages or depth > self.MAX_DEPTH:
+             return
+         self.visited_pages.add(url)
+ 
+         try:
+             # Fetch the page content
+             response = self.session.get(url)
+             response.raise_for_status()
+             content = response.text
+ 
+             # Use your get_soup_mgr function to get the soup and attributes
+             soup_mgr = get_soup_mgr(url=url)
+             soup = soup_mgr.soup
+             all_attributes = soup_mgr.get_all_attribute_values()
+             # Now you can use all_attributes as needed
+ 
+             # Update asset links to local paths
+             for tag in soup.find_all(['img', 'script', 'link']):
+                 attr = 'src' if tag.name != 'link' else 'href'
+                 asset_url = tag.get(attr)
+                 if asset_url:
+                     full_asset_url = normalize_url(asset_url, url)
+                     if is_valid_url(full_asset_url, base_domain):
+                         parsed_asset_url = urlparse(full_asset_url)
+                         save_asset(full_asset_url, self.url, self.OUTPUT_DIR, self.session)
+                         # Update tag to point to the local asset
+                         local_asset_path = '/' + parsed_asset_url.path.lstrip('/')
+                         tag[attr] = local_asset_path
+ 
+             # Save the modified page
+             save_page(url, str(soup), self.OUTPUT_DIR)
+ 
+             # Use your linkManager to find all domain links
+             link_mgr = linkManager(url=url)
+             all_domains = link_mgr.find_all_domain()
+ 
+             # Process each domain link
+             for link_url in all_domains:
+                 normalized_link = normalize_url(link_url, url)
+                 if is_valid_url(normalized_link, base_domain):
+                     time.sleep(self.WAIT_BETWEEN_REQUESTS)
+                     self.process_page(normalized_link, depth + 1, base_domain)
+ 
+         except Exception as e:
+             print(f"Failed to process page {url}: {e}")
+ 
+     def main(self):
+         # Ensure output directory exists
+         os.makedirs(self.OUTPUT_DIR, exist_ok=True)
+ 
+         base_parsed = urlparse(self.BASE_URL)
+         base_domain = base_parsed.netloc
+ 
+         self.process_page(self.BASE_URL, 0, base_domain)
+         print("Website copying completed.")
+ def test_download(url=None, directory=None):
+     url = url or 'https://algassert.com/quantum/2016/01/07/Delayed-Choice-Quantum-Erasure.html'
+     output_dir = directory or os.path.join(os.getcwd(), 'testit')
+     os.makedirs(output_dir, exist_ok=True)
+     site_mgr = usurpManager(url, output_dir)
+     site_mgr.main()
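
The added module is a small site-mirroring utility: usurpManager crawls pages from BASE_URL up to MAX_DEPTH, rewrites img/script/link references to local paths, and writes pages and assets under OUTPUT_DIR. Below is a minimal usage sketch, assuming the file ships as abstract_webtools/abstract_usurpit.py (per the RECORD entry further down) and that UserAgentManager, urlManager, linkManager, and get_soup_mgr resolve from the package as in the code above; the URL and directory are placeholders.

    # Illustrative sketch only; module path and argument values are assumptions.
    from abstract_webtools.abstract_usurpit import usurpManager

    site_mgr = usurpManager(
        url='https://example.com',        # placeholder site to mirror
        output_dir='mirrored_site',       # local destination directory
        max_depth=2,                      # keep the crawl shallow while testing
        wait_between_requests=1,          # seconds to pause between page fetches
    )
    site_mgr.main()                       # crawl, rewrite asset links, save pages and assets
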
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: abstract_webtools
- Version: 0.1.6.51
+ Version: 0.1.6.52
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -1,5 +1,6 @@
  abstract_webtools/__init__.py,sha256=Bb8BHx0wYZjxAQc_RnNdw9-qTraQkWjTdwzlpRq6dY4,124
  abstract_webtools/abstract_userpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-P2w1u6ng,6462
+ abstract_webtools/abstract_usurpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-P2w1u6ng,6462
  abstract_webtools/abstract_webtools.py,sha256=3NzGmJlZvrdVtEcUi2K5iUgWr1822IBPhIN9us2e2t0,3859
  abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
  abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
@@ -36,7 +37,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
  abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
  abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
  abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
- abstract_webtools-0.1.6.51.dist-info/METADATA,sha256=TtzlsxJA5LOLDdFCR-afNeQ28MWM5Xma7wZPY-WhKmY,16029
- abstract_webtools-0.1.6.51.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
- abstract_webtools-0.1.6.51.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
- abstract_webtools-0.1.6.51.dist-info/RECORD,,
+ abstract_webtools-0.1.6.52.dist-info/METADATA,sha256=vwuIvZCia6x82Pw8HkBqWaNU1ScWTHcOg1L-tb7auPE,16029
+ abstract_webtools-0.1.6.52.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
+ abstract_webtools-0.1.6.52.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+ abstract_webtools-0.1.6.52.dist-info/RECORD,,