abstract-webtools 0.1.6.124__tar.gz → 0.1.6.125__tar.gz

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (50)
  1. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/setup.py +1 -1
  3. abstract_webtools-0.1.6.125/src/abstract_webtools/managers/clownworld/__init__.py +1 -0
  4. abstract_webtools-0.1.6.125/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py +232 -0
  5. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
  6. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools.egg-info/SOURCES.txt +2 -0
  7. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/README.md +0 -0
  8. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/pyproject.toml +0 -0
  9. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/setup.cfg +0 -0
  10. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/__init__.py +0 -0
  11. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/abstract_usurpit.py +0 -0
  12. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/abstract_webtools.py +0 -0
  13. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/big_user_agent_list.py +0 -0
  14. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/domain_identifier.py +0 -0
  15. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/extention_list.py +0 -0
  16. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/find_dirs.py +0 -0
  17. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/k2s_downloader.py +0 -0
  18. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/main.py +0 -0
  19. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/__init__.py +0 -0
  20. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/allss//.py" +0 -0
  21. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/cipherManager.py +0 -0
  22. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/crawlManager.py +0 -0
  23. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
  24. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/curlMgr.py +0 -0
  25. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/domainManager.py +0 -0
  26. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  27. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/get_test.py +0 -0
  28. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
  29. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
  30. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  31. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/networkManager.py +0 -0
  32. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
  33. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/requestManager/requestManager.py +0 -0
  34. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/seleniumManager.py +0 -0
  35. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
  36. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
  37. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
  38. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/sslManager.py +0 -0
  39. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  40. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
  41. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/urlManager/urlManager.py +0 -0
  42. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  43. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  44. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
  45. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/soup_gui.py +0 -0
  46. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/url_grabber.py +0 -0
  47. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools/url_grabber_new.py +0 -0
  48. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  49. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools.egg-info/requires.txt +0 -0
  50. {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.125}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.124
+Version: 0.1.6.125
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.124',
+    version='0.1.6.125',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
@@ -0,0 +1 @@
+from .get_bolshevid_video import *
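The one-line `__init__.py` re-exports everything from `get_bolshevid_video`, so the entry point is importable from the subpackage path. A minimal sketch, assuming the install layout shown in the SOURCES.txt hunk at the end of this diff:

    from abstract_webtools.managers.clownworld import get_bolshevid_videos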
@@ -0,0 +1,232 @@
+from abstract_webtools import *
+from abstract_utilities import get_logFile
+import os
+import shutil
+import urllib
+import bs4
+from urllib.parse import urlparse, parse_qs, unquote
+from typing import List, Optional
+
+logger = get_logFile(__name__)
+
+def get_metas(url):
+    """Collect Twitter/OpenGraph <meta> properties from a page into a nested dict."""
+    type_vars = {}
+    try:
+        for soup in get_soup(url):
+            for sou in soup:
+                for meta in sou.find_all('meta'):
+                    meta_prop = meta.get('property')
+                    if meta_prop:
+                        for typ in ['twitter', 'og']:
+                            if meta_prop.startswith(typ):
+                                if typ not in type_vars:
+                                    type_vars[typ] = {}
+                                prop_typ = meta_prop.split(':')[-1]
+                                type_vars[typ][prop_typ] = meta.get('content')
+                                if prop_typ not in type_vars:
+                                    type_vars[prop_typ] = type_vars[typ][prop_typ]
+    except Exception as e:
+        logger.info(f"{e}")
+    return type_vars
+
+def get_dl_vid(url, download_directory=None, output_filename=None, get_info=None, download_video=None):
+    """Try a direct download of the URL and return its video info, if any."""
+    try:
+        video_mgr = dl_video(url, download_directory=download_directory, output_filename=output_filename, get_info=get_info, download_video=download_video)
+        video_info = get_video_info_from_mgr(video_mgr)
+        if video_info:
+            return video_info
+    except Exception as e:
+        logger.info(f"{e}")
+
+def for_dl_soup_vid(url, download_directory=None, output_filename=None, get_info=None, download_video=None):
+    """Fall back to scraping <video> tags and downloading the first usable src."""
+    videos = soupManager(url).soup.find_all('video')
+    for video in videos:
+        video_info = None
+        try:
+            src = video.get("src")  # bs4 Tag attribute lookup, not a dict key
+            if src:
+                video_mgr = dl_video(src, download_directory=download_directory, output_filename=output_filename, get_info=get_info, download_video=download_video)
+                video_info = get_video_info_from_mgr(video_mgr)
+        except Exception:
+            video_info = None
+        if video_info:
+            return video_info
+
+def for_dl_video(url, download_directory=None, output_filename=None, get_info=None, download_video=None):
+    """Download a video, merge page metadata into its info, and write info.json beside it."""
+    download_directory = bool_or_default(download_directory, default=os.getcwd())
+    get_info = bool_or_default(get_info, default=True)
+    download_video = bool_or_default(download_video, default=True)
+    meta_data = get_metas(url) or {}
+    logger.info(meta_data)
+    kwargs = {"download_directory": download_directory, "output_filename": output_filename, "get_info": get_info, "download_video": download_video}
+    for func in [for_dl_soup_vid, get_dl_vid]:
+        context = {}
+        try:
+            video_info = func(url, **kwargs)
+            for key in ['file_path', 'id']:
+                value = make_list(get_any_value(video_info, key) or None)[0]
+                if isinstance(value, dict):
+                    context.update(value)
+                else:
+                    context[key] = value
+            if video_info and isinstance(video_info, dict):
+                file_path = video_info.get('file_path')
+                ext = os.path.splitext(file_path)[-1]
+                file_id = video_info.get('id')
+                for key, value in meta_data.items():
+                    if isinstance(value, dict):
+                        context.update(value)
+                    else:
+                        context[key] = value
+                # Move the download into a per-video directory alongside its metadata.
+                new_dir = os.path.join(download_directory, str(file_id))
+                os.makedirs(new_dir, exist_ok=True)
+                video_path = os.path.join(new_dir, f"video{ext}")
+                shutil.move(file_path, video_path)
+                info_path = os.path.join(new_dir, 'info.json')
+                context['file_path'] = video_path
+                video_info['context'] = context
+                safe_dump_to_json(data=video_info, file_path=info_path)
+        except Exception as e:
+            logger.info(f"{e}")
+            video_info = None
+        if video_info:
+            logger.info(video_info)
+            return video_info
+
+def is_valid_url(url: str) -> bool:
+    """Check if a string is a valid URL."""
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])  # Must have a scheme (e.g., http) and a netloc (e.g., example.com)
+    except ValueError:
+        return False
+
+def process_urls(urls: List[str], extract: str = "base") -> List[str]:
+    """
+    Process a list of URLs to extract either the base URL or a specific query parameter.
+
+    Args:
+        urls: List of URLs or strings containing URLs.
+        extract: What to extract ("base" for the base URL, or a query parameter name like "v").
+
+    Returns:
+        List of unique, processed URL components.
+    """
+    result = []
+    for url in make_list(urls):
+        # Handle strings that may contain multiple URLs or fragments.
+        url = unquote(url.strip())  # Decode URL-encoded characters.
+        if not is_valid_url(url):
+            # Try to extract URLs from fragments containing 'http'.
+            if 'http' in url:
+                for part in url.split('http')[1:]:
+                    candidate = f"http{part}"
+                    if is_valid_url(candidate):
+                        parsed = urlparse(candidate)
+                        if extract == "base":
+                            base_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
+                            if base_url not in result:
+                                result.append(base_url)
+                        else:
+                            query_params = parse_qs(parsed.query)
+                            values = query_params.get(extract, [])
+                            result.extend([v for v in values if v and v not in result])
+            continue
+        # Valid URL.
+        parsed = urlparse(url)
+        if extract == "base":
+            base_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
+            if base_url not in result:
+                result.append(base_url)
+        else:
+            query_params = parse_qs(parsed.query)
+            values = query_params.get(extract, [])
+            result.extend([v for v in values if v and v not in result])
+    return result
+
+def get_urls(url, urls=None):
+    """Recursively flatten nested lists/dicts of links into a flat list of URL strings."""
+    urls = urls if urls is not None else []  # avoid the shared mutable-default pitfall
+    for item in make_list(url):
+        if isinstance(item, list):
+            for sub in make_list(item):
+                urls = get_urls(sub, urls=urls)
+        elif isinstance(item, dict):
+            urls.append(item.get('value'))
+        else:
+            urls.append(item)
+    return urls
+
+def get_url_list(urls=[]):
+    """Split strings that contain several concatenated URLs on the 'http' boundary."""
+    url_list = []
+    for url in make_list(urls):
+        string = ''
+        if url.startswith('http'):
+            string = 'http'
+        for piece in url.split('http'):
+            string += piece
+            url_list.append(string)
+            string = 'http'
+    return url_list
+
+def get_desired_links(url):
+    """Pull the crawler's preferred links for a page, falling back to the input URL."""
+    urls = make_list(url)
+    try:
+        urlMgr = linkManager(url)
+        urls = urlMgr.all_desired_links
+    except Exception as e:
+        logger.info(f"{e}")
+    return urls
+
+def deriveUrlList(url):
+    """Run the collected links through each URL-normalizing pass in turn."""
+    urls = get_desired_links(url)
+    url_functions = [get_urls, get_url_list, process_urls]
+    for url_function in url_functions:
+        try:
+            urls = url_function(urls)
+        except Exception as e:
+            logger.info(f"{e}")
+    return urls
+
+def validate_video_urls(urls,
+                        get_info_url=False,
+                        get_for_video=False,
+                        download_directory=None,
+                        output_filename=None,
+                        get_info=True,
+                        download_video=False):
+    """Keep only the URLs that yield video info; optionally replace each with that info."""
+    output_urls = []
+    for url in make_list(urls):
+        video_info = None
+        if url:
+            video_info = dl_video(
+                url,
+                download_directory=download_directory,
+                output_filename=output_filename,
+                get_info=get_info,
+                download_video=download_video
+            )
+        if video_info:
+            output_urls.append(url)
+            if get_info_url or get_for_video:
+                output_urls[-1] = video_info
+                if get_for_video:
+                    dl_info = for_dl_video(
+                        url,
+                        download_directory=download_directory,
+                        output_filename=output_filename,
+                        get_info=True,
+                        download_video=True
+                    )
+                    if dl_info:
+                        output_urls[-1] = dl_info
+                if get_info_url:
+                    if isinstance(output_urls[-1], dict):
+                        output_urls[-1]['initial_url'] = url
+    return output_urls
+
+def get_bolshevid_videos(url,
+                         get_info_url=True,
+                         get_for_video=True,
+                         download_directory=None,
+                         output_filename=None,
+                         get_info=True,
+                         download_video=False):
+    """Derive candidate URLs from a page and return the ones that resolve to videos."""
+    urls = deriveUrlList(url)
+    video_urls = validate_video_urls(urls,
+                                     get_info_url=get_info_url,
+                                     get_for_video=get_for_video,
+                                     download_directory=download_directory,
+                                     output_filename=output_filename,
+                                     get_info=get_info,
+                                     download_video=download_video)
+    return video_urls
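Taken together, the new module derives candidate links from a page (`deriveUrlList`), keeps only those that resolve to videos (`validate_video_urls`), and can download each into a per-ID folder with an `info.json` written next to it (`for_dl_video`). A minimal usage sketch, assuming the star-imported helpers (`dl_video`, `make_list`, `linkManager`, and friends) resolve from `abstract_webtools` as the module expects; the URL and download directory are hypothetical placeholders:

    from abstract_webtools.managers.clownworld import get_bolshevid_videos, process_urls

    # process_urls, illustratively:
    #   process_urls(["https://example.com/watch?v=abc123"], extract="v")   -> ["abc123"]
    #   process_urls(["junk https://example.com/clip?x=1"], extract="base") -> ["https://example.com/clip"]

    results = get_bolshevid_videos(
        "https://example.com/videos",         # hypothetical page to crawl
        download_directory="/tmp/bolshevid",  # per-video subfolders are created here
        get_for_video=False,                  # probe for info only; True also downloads each hit
        download_video=False,
    )
    for item in results:
        # Each entry is either the original URL or a video-info dict carrying 'initial_url'.
        print(item.get("initial_url") if isinstance(item, dict) else item)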
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.124
+Version: 0.1.6.125
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
@@ -36,6 +36,8 @@ src/abstract_webtools/managers/tlsAdapter.py
 src/abstract_webtools/managers/userAgentManager.py
 src/abstract_webtools/managers/videoDownloader.py
 src/abstract_webtools/managers/videoDownloader2.py
+src/abstract_webtools/managers/clownworld/__init__.py
+src/abstract_webtools/managers/clownworld/get_bolshevid_video.py
 src/abstract_webtools/managers/linkManager/__init__.py
 src/abstract_webtools/managers/linkManager/linkManager.py
 src/abstract_webtools/managers/requestManager/__init__.py