abstract-webtools 0.1.6.124__tar.gz → 0.1.6.126__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/setup.py +1 -1
- abstract_webtools-0.1.6.126/src/abstract_webtools/managers/clownworld/__init__.py +1 -0
- abstract_webtools-0.1.6.126/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py +232 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools.egg-info/SOURCES.txt +2 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/README.md +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/pyproject.toml +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/setup.cfg +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/__init__.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/abstract_usurpit.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/abstract_webtools.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/domain_identifier.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/extention_list.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/find_dirs.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/k2s_downloader.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/__init__.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/allss//.py" +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/cipherManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/crawlManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/curlMgr.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/domainManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/get_test.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/mySocketClient.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/networkManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/requestManager/requestManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/seleniumManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/sslManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/urlManager/urlManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/userAgentManager.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/videoDownloader.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/url_grabber.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools.egg-info/requires.txt +0 -0
- {abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.124
+Version: 0.1.6.126
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/setup.py
RENAMED
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.124',
+    version='0.1.6.126',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
abstract_webtools-0.1.6.126/src/abstract_webtools/managers/clownworld/__init__.py
ADDED
@@ -0,0 +1 @@
+import * from .get_bolshevid_videos
abstract_webtools-0.1.6.126/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py
ADDED (+232 -0). New file contents:

from abstract_webtools import *
from abstract_utilities import get_logFile
import urllib,bs4
from urllib.parse import urlparse, parse_qs, unquote
from typing import List, Optional
logger = get_logFile(__name__)
from abstract_webtools import *
def get_metas(url):
    type_vars={}
    try:
        for soup in get_soup(url):
            for sou in soup:
                for meta in sou.find_all('meta'):
                    meta_prop = meta.get('property')
                    if meta_prop:
                        for typ in ['twitter','og']:
                            if meta_prop.startswith(typ):
                                if typ not in type_vars:
                                    type_vars[typ] = {}
                                prop_typ = meta_prop.split(':')[-1]
                                if meta:
                                    type_vars[typ][prop_typ] = meta.get('content')
                                    if prop_typ not in type_vars:
                                        type_vars[prop_typ] = type_vars[typ][prop_typ]
    except Exception as e:
        logger.info(f"{e}")
    return type_vars
def get_dl_vid(url,download_directory=None,output_filename=None,get_info=None,download_video=None):
    try:
        video_mgr = dl_video(url,download_directory=download_directory,output_filename=output_filename,get_info=get_info,download_video=download_video)
        video_info = get_video_info_from_mgr(video_mgr)
        if video_info:
            return video_info
    except:
        pass

def for_dl_soup_vid(url,download_directory=None,output_filename=None,get_info=None,download_video=None):
    videos = soupManager(url).soup.find_all('video')
    for video in videos:
        video_info=None
        try:
            if video and isinstance(video,dict):
                video_mgr = dl_video(video.get("src"),download_directory=download_directory,output_filename=output_filename,get_info=get_info,download_video=download_video)
                video_info = get_video_info_from_mgr(video_mgr)
        except:
            video_info=None
        if video_info:
            return video_info

def for_dl_video(url,download_directory=None,output_filename=None,get_info=None,download_video=None):
    download_directory =bool_or_default(download_directory,default=os.getcwd())
    get_info = bool_or_default(get_info,default=True)
    download_video =bool_or_default(download_video,default=True)
    meta_data = get_metas(url) or {}
    logger.info(meta_data)
    kwargs = {"download_directory":download_directory,"output_filename":output_filename,"get_info":get_info,"download_video":download_video}
    for func in [for_dl_soup_vid,get_dl_vid]:
        context = {}
        try:
            video_info = func(url,**kwargs)
            for key in ['file_path','id']:
                value = make_list(get_any_value(video_info,key) or None)[0]
                if isinstance(value,dict):
                    context.update(value)
                else:
                    context[key] = value
            if video_info and isinstance(video_info,dict):
                file_path = video_info.get('file_path')
                ext = os.path.splitext(file_path)[-1]
                file_id = video_info.get('id')
                for key,value in meta_data.items():
                    if isinstance(value,dict):
                        context.update(value)
                    else:
                        context[key] = value

                new_dir = os.path.join(download_directory,str(file_id))
                os.makedirs(new_dir,exist_ok=True)
                video_path = os.path.join(new_dir,f"video{ext}")
                shutil.move(file_path,video_path)
                info_path = os.path.join(new_dir,'info.json')
                context['file_path']=video_path
                video_info['context']=context
                safe_dump_to_json(data=video_info,file_path=info_path)
        except Exception as e:
            logger.info(f"{e}")
            video_info=None
        if video_info:

            logger.info(video_info)
            return video_info

def is_valid_url(url: str) -> bool:
    """Check if a string is a valid URL."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])  # Must have scheme (e.g., http) and netloc (e.g., example.com)
    except ValueError:
        return False

def process_urls(urls: List[str], extract: str = "base") -> List[str]:
    """
    Process a list of URLs to extract either the base URL or a specific query parameter.
    Args:
        urls: List of URLs or strings containing URLs.
        extract: What to extract ("base" for base URL, or a query parameter like "v").
    Returns:
        List of unique, processed URL components.
    """
    result = []
    for url in make_list(urls):
        # Handle strings that may contain multiple URLs or fragments
        url = unquote(url.strip())  # Decode URL-encoded characters
        if not is_valid_url(url):
            # Try to extract URLs from fragments containing 'http'
            if 'http' in url:
                for part in url.split('http')[1:]:
                    candidate = f"http{part}"
                    if is_valid_url(candidate):
                        parsed = urlparse(candidate)
                        if extract == "base":
                            base_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                            if base_url not in result:
                                result.append(base_url)
                        else:
                            query_params = parse_qs(parsed.query)
                            values = query_params.get(extract, [])
                            result.extend([v for v in values if v and v not in result])
            continue
        # Valid URL
        parsed = urlparse(url)
        if extract == "base":
            base_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if base_url not in result:
                result.append(base_url)
        else:
            query_params = parse_qs(parsed.query)
            values = query_params.get(extract, [])
            result.extend([v for v in values if v and v not in result])
    return result
def get_urls(url,urls = []):
    for url in make_list(url):
        if isinstance(url,list):
            for url in make_list(url):
                urls = get_urls(url,urls = urls)
        elif isinstance(url,dict):
            urls.append(url.get('value'))
        else:
            urls.append(url)
    return urls
def get_url_list(urls = []):
    url_list = []
    for url in make_list(urls):
        string = ''
        if url.startswith('http'):
            string = 'http'
        for piece in url.split('http'):
            string += piece
            url_list.append(string)
            string = 'http'
    return url_list
def get_desired_links(url):
    urls = make_list(url)
    try:
        urlMgr = linkManager(url)
        urls = urlMgr.all_desired_links
    except Exception as e:
        logger.info(f"{e}")
    return urls
def deriveUrlList(url):
    urls = get_desired_links(url)
    url_functions = [get_urls,get_url_list,process_urls]
    for url_function in url_functions:
        try:
            urls = url_function(urls)
        except Exception as e:
            logger.info(f"{e}")
            input()
    return urls
def validate_video_urls(urls,
                        get_info_url=False,
                        get_for_video=False,
                        download_directory=None,
                        output_filename=None,
                        get_info=True,
                        download_video=False):
    output_urls = []
    for url in make_list(urls):
        video_info=None
        if url:
            video_info = dl_video(
                url,
                download_directory=download_directory,
                output_filename=output_filename,
                get_info=get_info,
                download_video=download_video
                )
        if video_info:
            output_urls.append(url)
            if get_info_url or get_for_video:
                output_urls[-1] = video_info
            if get_for_video:
                dl_info = for_dl_video(
                    url,
                    download_directory=download_directory,
                    output_filename=output_filename,
                    get_info=True,
                    download_video=True
                    )
                if dl_info:
                    output_urls[-1]=dl_info
            if get_info_url:
                if isinstance(output_urls[-1],dict):
                    output_urls[-1]['initial_url'] = url

    return output_urls
def get_bolshevid_videos(url,
                         get_info_url=True,
                         get_for_video=True,
                         download_directory=None,
                         output_filename=None,
                         get_info=True,
                         download_video=False):
    urls = deriveUrlList(url)
    video_urls = validate_video_urls(urls,
                                     get_info_url=get_info_url,
                                     get_for_video=get_for_video,
                                     download_directory=download_directory,
                                     output_filename=output_filename,
                                     get_info=get_info,
                                     download_video=download_video)
    return video_urls
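For orientation, here is a minimal usage sketch of the new entry point, based only on the signatures shown above. The import path follows the file layout in this release, and treating each returned item as a video-info dict with keys such as 'initial_url' and 'id' is an assumption inferred from validate_video_urls, not something documented by the package; the page URL and directory below are hypothetical.

# Hedged sketch, not from the package docs. Assumes get_bolshevid_videos is
# importable from the module added in this release and that it returns a list
# whose items are either URLs or per-video info dicts (see validate_video_urls).
from abstract_webtools.managers.clownworld.get_bolshevid_video import get_bolshevid_videos

results = get_bolshevid_videos(
    "https://example.com/some-page",    # hypothetical page containing video links
    download_directory="/tmp/videos",   # where downloaded files and info.json would land
    get_info=True,
    download_video=False,               # collect metadata only; skip the actual download
)
for item in results:
    if isinstance(item, dict):
        print(item.get("initial_url"), item.get("id"))
    else:
        print(item)
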
{abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.124
+Version: 0.1.6.126
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.124 → abstract_webtools-0.1.6.126}/src/abstract_webtools.egg-info/SOURCES.txt
RENAMED
@@ -36,6 +36,8 @@ src/abstract_webtools/managers/tlsAdapter.py
 src/abstract_webtools/managers/userAgentManager.py
 src/abstract_webtools/managers/videoDownloader.py
 src/abstract_webtools/managers/videoDownloader2.py
+src/abstract_webtools/managers/clownworld/__init__.py
+src/abstract_webtools/managers/clownworld/get_bolshevid_video.py
 src/abstract_webtools/managers/linkManager/__init__.py
 src/abstract_webtools/managers/linkManager/linkManager.py
 src/abstract_webtools/managers/requestManager/__init__.py
All remaining files listed above carried over from 0.1.6.124 to 0.1.6.126 without content changes; only the version in the sdist path differs.