abstract-webtools 0.1.6.119__tar.gz → 0.1.6.120__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/setup.py +1 -1
  3. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/abstract_usurpit.py +70 -42
  4. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
  5. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/README.md +0 -0
  6. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/pyproject.toml +0 -0
  7. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/setup.cfg +0 -0
  8. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/__init__.py +0 -0
  9. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/abstract_webtools.py +0 -0
  10. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/big_user_agent_list.py +0 -0
  11. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/domain_identifier.py +0 -0
  12. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/extention_list.py +0 -0
  13. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/find_dirs.py +0 -0
  14. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/k2s_downloader.py +0 -0
  15. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/main.py +0 -0
  16. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/__init__.py +0 -0
  17. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/allss//.py" +0 -0
  18. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/cipherManager.py +0 -0
  19. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/crawlManager.py +0 -0
  20. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
  21. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/curlMgr.py +0 -0
  22. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/domainManager.py +0 -0
  23. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  24. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/get_test.py +0 -0
  25. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
  26. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
  27. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  28. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/networkManager.py +0 -0
  29. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
  30. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/requestManager/requestManager.py +0 -0
  31. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/seleniumManager.py +0 -0
  32. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
  33. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
  34. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
  35. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/sslManager.py +0 -0
  36. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  37. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
  38. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/urlManager/urlManager.py +0 -0
  39. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  40. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  41. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
  42. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/soup_gui.py +0 -0
  43. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/url_grabber.py +0 -0
  44. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools/url_grabber_new.py +0 -0
  45. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
  46. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  47. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools.egg-info/requires.txt +0 -0
  48. {abstract_webtools-0.1.6.119 → abstract_webtools-0.1.6.120}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.119
3
+ Version: 0.1.6.120
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
4
4
  long_description = fh.read()
5
5
  setuptools.setup(
6
6
  name='abstract_webtools',
7
- version='0.1.6.119',
7
+ version='0.1.6.120',
8
8
  author='putkoff',
9
9
  author_email='partners@abstractendeavors.com',
10
10
  description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
@@ -53,7 +53,7 @@ def is_valid_url(url, base_domain):
53
53
  """
54
54
  parsed = urlparse(url)
55
55
  return parsed.scheme in ('http', 'https') and parsed.netloc == base_domain
56
- def save_page(url, content,output_dir):
56
+ def get_save_page_path(url, output_dir):
57
57
  """
58
58
  Save HTML page to local directory.
59
59
  """
@@ -66,12 +66,22 @@ def save_page(url, content,output_dir):
66
66
  page_path += '.html'
67
67
 
68
68
  page_full_path = os.path.join(output_dir, page_path)
69
- os.makedirs(os.path.dirname(page_full_path), exist_ok=True)
70
-
71
- with open(page_full_path, 'w', encoding='utf-8') as f:
72
- f.write(content)
73
- print(f"Saved page: {page_full_path}")
74
- def save_asset(asset_url, base_url,output_dir,downloaded_assets=None,session=None):
69
+ return page_full_path
70
+ def save_page(url, content,output_dir):
71
+ page_full_path = get_save_page_path(url=url,
72
+ output_dir=output_dir)
73
+ if page_full_path:
74
+ dirname = os.path.dirname(page_full_path)
75
+ os.makedirs(dirname, exist_ok=True)
76
+
77
+ with open(page_full_path, 'w', encoding='utf-8') as f:
78
+ f.write(content)
79
+ print(f"Saved page: {page_full_path}")
80
+ def get_asset_path(asset_url,
81
+ base_url,
82
+ output_dir,
83
+ downloaded_assets=None,
84
+ session=None):
75
85
  """
76
86
  Download and save assets like images, CSS, JS files.
77
87
  """
@@ -88,17 +98,29 @@ def save_asset(asset_url, base_url,output_dir,downloaded_assets=None,session=Non
88
98
  return # Skip if asset path is empty
89
99
 
90
100
  asset_full_path = os.path.join(output_dir, asset_path)
91
- os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
92
-
93
- try:
94
- response = session.get(asset_url, stream=True)
95
- response.raise_for_status()
96
- with open(asset_full_path, 'wb') as f:
97
- shutil.copyfileobj(response.raw, f)
98
- print(f"Saved asset: {asset_full_path}")
99
- except Exception as e:
100
- print(f"Failed to save asset {asset_url}: {e}")
101
- return downloaded_assets
101
+ return asset_full_path
102
+ def save_asset(asset_url,
103
+ base_url,
104
+ output_dir,
105
+ downloaded_assets=None,
106
+ session=None):
107
+ asset_full_path = get_asset_path(asset_url=asset_url,
108
+ base_url=base_url,
109
+ output_dir=output_dir,
110
+ downloaded_assets=downloaded_assets,
111
+ session=session)
112
+ if asset_full_path:
113
+ os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
114
+
115
+ try:
116
+ response = session.get(asset_url, stream=True)
117
+ response.raise_for_status()
118
+ with open(asset_full_path, 'wb') as f:
119
+ shutil.copyfileobj(response.raw, f)
120
+ print(f"Saved asset: {asset_full_path}")
121
+ except Exception as e:
122
+ print(f"Failed to save asset {asset_url}: {e}")
123
+ return downloaded_assets
102
124
  class usurpManager():
103
125
  def __init__(self,url,output_dir=None,max_depth=None,wait_between_requests=None,operating_system=None, browser=None, version=None,user_agent=None,website_bot=None):
104
126
  self.url = url
@@ -135,34 +157,40 @@ class usurpManager():
135
157
  response = self.session.get(url)
136
158
  #response.raise_for_status()
137
159
  content = response.text
138
-
139
- # Use your get_soup_mgr function to get the soup and attributes
140
- soup_mgr = get_soup_mgr(url=url)
141
- soup = soup_mgr.soup
142
- all_attributes = soup_mgr.get_all_attribute_values()
143
- # Now you can use all_attributes as needed
144
-
145
- # Update asset links to local paths
146
- for tag in soup.find_all(['img', 'script', 'link']):
147
- attr = 'src' if tag.name != 'link' else 'href'
148
- asset_url = tag.get(attr)
149
- if asset_url:
150
- full_asset_url = normalize_url(asset_url, url)
151
- parsed_asset_url = urlparse(full_asset_url)
152
-
153
- if is_valid_url(full_asset_url, base_domain):
154
- self.downloaded_assets = save_asset(full_asset_url, self.url,self.OUTPUT_DIR,self.downloaded_assets,self.session)
155
- # Update tag to point to the local asset
156
- local_asset_path = '/' + parsed_asset_url.path.lstrip('/')
157
- tag[attr] = local_asset_path
158
-
159
- # Save the modified page
160
- save_page(url, str(soup),self.OUTPUT_DIR)
160
+ page_full_path = get_save_page_path(url=url,
161
+ output_dir=self.OUTPUT_DIR)
162
+ if not os.path.exists(page_full_path):
163
+ # Use your get_soup_mgr function to get the soup and attributes
164
+ soup_mgr = get_soup_mgr(url=url)
165
+ soup = soup_mgr.soup
166
+ all_attributes = soup_mgr.get_all_attribute_values()
167
+ # Now you can use all_attributes as needed
168
+ get_asset_path(asset_url=full_asset_url,
169
+ base_url=self.url,
170
+ output_dir=self.OUTPUT_DIR,
171
+ downloaded_assets=self.downloaded_assets,
172
+ session=self.session)
173
+ # Update asset links to local paths
174
+ for tag in soup.find_all(['img', 'script', 'link']):
175
+ attr = 'src' if tag.name != 'link' else 'href'
176
+ asset_url = tag.get(attr)
177
+ if asset_url:
178
+ full_asset_url = normalize_url(asset_url, url)
179
+ parsed_asset_url = urlparse(full_asset_url)
180
+
181
+ if is_valid_url(full_asset_url, base_domain):
182
+ self.downloaded_assets = save_asset(full_asset_url, self.url,self.OUTPUT_DIR,self.downloaded_assets,self.session)
183
+ # Update tag to point to the local asset
184
+ local_asset_path = '/' + parsed_asset_url.path.lstrip('/')
185
+ tag[attr] = local_asset_path
186
+
187
+ # Save the modified page
188
+ save_page(url, str(soup),self.OUTPUT_DIR)
161
189
 
162
190
  # Use your linkManager to find all domain links
163
191
  link_mgr = linkManager(url=url)
164
192
  all_domains = link_mgr.find_all_domain()
165
-
193
+
166
194
  # Process each domain link
167
195
  for link_url in make_list(all_domains):
168
196
  normalized_link = normalize_url(link_url, url)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.119
3
+ Version: 0.1.6.120
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff