abstract-webtools 0.1.6.24__py3-none-any.whl → 0.1.6.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/managers/crawlmgr2.py +63 -0
- abstract_webtools/managers/curlMgr.py +48 -0
- abstract_webtools/managers/videoDownloader.py +225 -195
- abstract_webtools/managers/videoDownloader2.py +291 -0
- abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4 +0 -0
- {abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/METADATA +15 -6
- {abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/RECORD +10 -6
- {abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/WHEEL +1 -1
- {abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/crawlmgr2.py
@@ -0,0 +1,63 @@
+import os
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+
+visited = set()
+
+def download_page(url, destination_dir):
+    """Download a single page to the destination directory."""
+
+    # Create directory if needed
+    os.makedirs(destination_dir, exist_ok=True)
+    os.chmod(destination_dir, 0o755)  # optional: set directory perms
+
+    # Download
+    response = requests.get(url)
+    response.raise_for_status()
+
+    # Build a safe file name for the HTML file
+    # E.g., for "https://example.com/about/", you might store "about.html"
+    parsed = urlparse(url)
+    filename = "index.html" if not parsed.path or parsed.path.endswith("/") else os.path.basename(parsed.path)
+    if not filename.endswith(".html"):
+        filename += ".html"
+
+    filepath = os.path.join(destination_dir, filename)
+    with open(filepath, "wb") as f:
+        f.write(response.content)
+
+    return response.text, filepath
+
+def crawl(url, destination_dir):
+    """Recursively download a site starting from `url`."""
+    if url in visited:
+        return
+    visited.add(url)
+
+    try:
+        html, _ = download_page(url, destination_dir)
+    except Exception as e:
+        print(f"Failed to download {url}: {e}")
+        return
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Find all <a> tags with an href
+    for link_tag in soup.find_all("a", href=True):
+        link = link_tag["href"]
+
+        # Convert a relative URL to an absolute one
+        absolute_link = urljoin(url, link)
+
+        # (Optional) Check domain if you only want to crawl the same site
+        # or skip mailto:, javascript:, etc.
+        if absolute_link.startswith("http"):
+            # Recurse
+            crawl(absolute_link, destination_dir)
+
+if __name__ == "__main__":
+    start_url = "https://svscomics.com/category/giantess/page/24"
+    destination = "/home/svc"
+
+    crawl(start_url, destination)
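
As written, crawlmgr2.py recurses on every absolute http(s) link it finds, so a run can wander off the starting site and, on link-dense pages, hit Python's recursion limit. For comparison, here is a sketch of the same loop with a same-host check and a depth cap; crawl_limited and max_depth are illustrative names, not part of the package:

import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

visited = set()

def crawl_limited(url, destination_dir, max_depth=3, _depth=0):
    """Variant of crawl(): stays on the starting host and caps recursion depth."""
    if _depth > max_depth or url in visited:
        return
    visited.add(url)

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return

    # Save the page under a name derived from the URL path
    os.makedirs(destination_dir, exist_ok=True)
    parsed = urlparse(url)
    name = os.path.basename(parsed.path.rstrip("/")) or "index"
    with open(os.path.join(destination_dir, f"{name}.html"), "wb") as f:
        f.write(response.content)

    soup = BeautifulSoup(response.text, "html.parser")
    for tag in soup.find_all("a", href=True):
        link = urljoin(url, tag["href"])
        target = urlparse(link)
        # Skip mailto:, javascript:, and off-site hosts
        if target.scheme in ("http", "https") and target.netloc == parsed.netloc:
            crawl_limited(link, destination_dir, max_depth, _depth + 1)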
abstract_webtools/managers/curlMgr.py
@@ -0,0 +1,48 @@
+import os
+import requests
+import os
+import subprocess
+import stat
+
+def get_site(website, destination_dir, filename):
+    # Ensure the directory exists
+    os.makedirs(destination_dir, exist_ok=True)
+
+    # Adjust directory permissions if needed (e.g. rwxr-xr-x -> 0o755)
+    os.chmod(destination_dir, 0o755)
+
+    # Construct the complete file path
+    destination_path = os.path.join(destination_dir, filename)
+
+    # Use curl to download the site
+    # The example user-agent is arbitrary; you can change it to your needs
+    os.system(
+        f'curl -L --output "{destination_path}" '
+        f'-H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+        f'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 '
+        f'Safari/537.36" -H "Accept: */*" "{website}"'
+    )
+
+def download_site(website, destination_dir, filename):
+    os.makedirs(destination_dir, exist_ok=True)
+    os.chmod(destination_dir, 0o755)  # set directory permissions if needed
+
+    destination_path = os.path.join(destination_dir, filename)
+
+    # GET the resource
+    response = requests.get(website, headers={
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                      "AppleWebKit/537.36 (KHTML, like Gecko) "
+                      "Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "*/*"
+    }, allow_redirects=True)
+
+    # Raise an exception if the download fails
+    response.raise_for_status()
+
+    # Write content to file
+    with open(destination_path, "wb") as f:
+        f.write(response.content)
+website = 'https://www.pornhub.com'
+destination = '/home/computron/Documents/doge'
+get_site(website,destination,'doge')
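
Worth flagging: curlMgr.py imports os twice, never uses subprocess or stat, runs get_site at import time against a hard-coded URL, and builds its curl command through os.system, which splices the URL and output path into a shell string unquoted. A sketch of the same fetch via subprocess.run, which avoids the shell entirely (get_site_safe is my name, not the package's):

import subprocess

def get_site_safe(website, destination_path,
                  user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/91.0.4472.124 Safari/537.36")):
    """Fetch a URL with curl using an argument list, avoiding shell quoting issues."""
    subprocess.run(
        ["curl", "-L", "--output", destination_path,
         "-H", f"User-Agent: {user_agent}",
         "-H", "Accept: */*",
         website],
        check=True,  # raise CalledProcessError if curl exits non-zero
    )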
abstract_webtools/managers/videoDownloader.py
@@ -1,206 +1,236 @@
-import
-
+from abstract_webtools import requestManager, urlManager, soupManager, requests, linkManager
+import threading,os,re,yt_dlp,urllib.request,m3u8_To_MP4,subprocess
+from abstract_utilities import get_logFile,safe_dump_to_file
+from m3u8 import M3U8 # Install: pip install m3u8
+from urllib.parse import urljoin
+from yt_dlp.postprocessor.ffmpeg import FFmpegFixupPostProcessor
+from abstract_math import divide_it,add_it,multiply_it,subtract_it
+logger = get_logFile('video_bp')
 class VideoDownloader:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    Note:
-    - The VideoDownloader class uses YouTube-DL to download videos.
-    - It allows downloading from multiple URLs.
-    - You need to have YouTube-DL installed to use this class.
-    """
-    def __init__(self, link,temp_directory=None,video_directory=None,remove_existing=True):
-        if video_directory==None:
-            video_directory=os.path.join(os.getcwd(),'videos')
-        if temp_directory == None:
-            temp_directory=os.path.join(video_directory,'temp_files')
-        self.thread_manager = ThreadManager()
-        self.pause_event = self.thread_manager.add_thread('pause_event')
-        self.link = link
-        self.temp_directory = temp_directory
-        self.video_directory = video_directory
-        self.remove_existing=remove_existing
-        self.video_urls=self.link if isinstance(self.link,list) else [self.link]
+    def __init__(self, url, title=None, download_directory=os.getcwd(), user_agent=None, video_extention='mp4',
+                 download_video=True, get_info=False, auto_file_gen=True, standalone_download=False, output_filename=None):
+        self.url = url
+        self.monitoring = True
+        self.pause_event = threading.Event()
+        self.get_download = download_video
+        self.get_info = get_info
+        self.user_agent = user_agent
+        self.title = title
+        self.auto_file_gen = auto_file_gen
+        self.standalone_download = standalone_download
+        self.video_extention = video_extention
+        self.download_directory = download_directory
+        self.output_filename = output_filename # New parameter for custom filename
+        self.header = {} # Placeholder for UserAgentManagerSingleton if needed
+        self.base_name = os.path.basename(self.url)
+        self.file_name, self.ext = os.path.splitext(self.base_name)
+        self.video_urls = [self.url]
+        self.info = {}
         self.starttime = None
         self.downloaded = 0
-        self.
-        self.
-
-
-        self.
-        self.
-
-
-        self.
-
-        self.video_url=None
-        self.last_checked = get_time_stamp()
-        self.num=0
-        self.start()
-    def count_outliers(self,speed,threshold):
-        if speed < threshold:
-            self.outlier_count+=1
+        self.video_urls = url if isinstance(url, list) else [url]
+        self.send_to_dl()
+
+    def get_request(self, url):
+        self.request_manager = requestManagerSingleton.get_instance(url=url)
+        return self.request_manager
+
+    def send_to_dl(self):
+        if self.standalone_download:
+            self.standalone_downloader()
         else:
-            self.
-
-
-
-
-
-        threshold = initial_avg * 0.75 # 25% under average
-        filtered_speeds = [speed for speed in speeds if speed >= threshold]
-
-        # Step 3: Compute the new average of the filtered list
-        if filtered_speeds: # Ensure the list is not empty
-            self.count_outliers(speeds[-1],threshold)
-            return filtered_speeds
+            self.start()
+
+    def get_headers(self, url):
+        response = requests.get(url)
+        if response.status_code == 200:
+            return response.headers
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            print(f"{self.file_name} already existed in {self.video_directory}; removing it from {self.temp_directory}")
-            self.remove_temps(self.file_name)
-            return True
-        return False
-    def yt_dlp_downloader(self,url,ydl_opts={},download=True):
-        try:
-            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                self.info_dict=ydl.extract_info(url=url, download=download)
-            return True
-        except:
-            return False
-    def progress_callback(self, d):
-        self.status_dict = d
-        keys = ['status',
-                'downloaded_bytes',
-                'fragment_index',
-                'fragment_count',
-                'filename',
-                'tmpfilename',
-                'max_progress',
-                'progress_idx',
-                'elapsed',
-                'total_bytes_estimate',
-                'speed',
-                'eta',
-                '_eta_str',
-                '_speed_str',
-                '_percent_str',
-                '_total_bytes_str',
-                '_total_bytes_estimate_str',
-                '_downloaded_bytes_str',
-                '_elapsed_str',
-                '_default_template']
-        if self.status_dict['status'] == 'finished':
-            print("Done downloading, moving video to final directory...")
-            self.move_video()
-            return
-        if get_time_stamp()-self.last_checked>5:
-            print(self.status_dict['_default_template'])
-            self.last_checked = get_time_stamp()
-        if (get_time_stamp()-self.start_time/5)>6:
-            self.speed_track.append(self.status_dict['speed'])
-            self.speed_track=self.filter_outliers(self.speed_track)
-
+            logger.error(f"Failed to retrieve headers for {url}. Status code: {response.status_code}")
+            return {}
+
+    @staticmethod
+    def get_directory_path(directory, name, video_extention):
+        file_path = os.path.join(directory, f"{name}.{video_extention}")
+        i = 0
+        while os.path.exists(file_path):
+            file_path = os.path.join(directory, f"{name}_{i}.{video_extention}")
+            i += 1
+        return file_path
+
+    def progress_callback(self, stream, chunk, bytes_remaining):
+        total_size = stream.filesize
+        self.downloaded = total_size - bytes_remaining
+
     def download(self):
-
-
-
-
-
-
-        self.
-        self.info_dict=None
-        result = self.yt_dlp_downloader(url=self.video_url,ydl_opts={'quiet': True, 'no_warnings': True},download=False)
-        if self.info_dict != None and result:
-            self.start_time = get_time_stamp()
-            self.downloaded = 0
-            self.video_title = self.info_dict.get('title', None)
-            self.video_ext = self.info_dict.get('ext', 'mp4')
-            self.file_name =f"{self.video_title}.{self.video_ext}"
-            self.temp_file_path = os.path.join(self.temp_directory, self.file_name)
-            self.complete_file_path = os.path.join(self.video_directory, self.file_name)
-            if not self.move_video():
-                self.dl_speed = []
-                self.percent=None
-                self.dl_eta=None
-                self.total_bytes_est=None
-                self.percent_speed=None
-                self.speed_track = []
-                self.outlier_count=0
-                ydl_opts = {
-                    'outtmpl': self.temp_file_path,
-                    'noprogress':True,
-                    'progress_hooks': [self.progress_callback]
-                }
-
-
-                print("Starting download...") # Check if this point in code is reached
-                result = self.yt_dlp_downloader(url=self.video_url,ydl_opts=ydl_opts,download=True)
-                if result:
-                    print("Download finished!") # Check if download completes
-                else:
-                    print(f'error downloding {self.video_url}')
-                self.move_video()
-            else:
-                print(f"The video from {self.video_url} already exists in the directory {self.video_directory}. Skipping download.")
-        else:
-            print(f"could not find video info from {self.video_url} Skipping download.")
-        if self.num==len(self.video_urls)-1:
-            self.monitoring=False
-            self.time_interval=0
+        for video_url in self.video_urls:
+            # Use custom filename if provided, otherwise generate a short temporary one
+            if self.output_filename:
+                outtmpl = os.path.join(self.download_directory, self.output_filename)
+            else:
+                temp_id = re.sub(r'[^\w\d.-]', '_', video_url)[-20:] # Short temp ID from URL
+                outtmpl = os.path.join(self.download_directory, f"temp_{temp_id}.%(ext)s")
 
+            ydl_opts = {
+                'external_downloader': 'ffmpeg',
+                'outtmpl': outtmpl,
+                'noprogress': True,
+                'quiet': True, # Reduce verbosity in logs
+            }
+            try:
+                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                    self.info = ydl.extract_info(video_url, download=self.get_download)
+                    self.downloading = False
+                    self.starttime = get_time_stamp() # Assuming get_time_stamp() exists
+                    if self.auto_file_gen:
+                        file_path = ydl.prepare_filename(self.info)
+                        if self.get_info:
+                            self.info['file_path'] = file_path # Fixed typo 'aath'
+                    if self.get_info:
+                        self.stop()
+                        return self.info
+            except Exception as e:
+                logger.error(f"Failed to download {video_url}: {str(e)}")
+        self.stop()
+        return self.info
 
     def monitor(self):
         while self.monitoring:
-
-
-
-
-
+            logger.info("Monitoring...")
+            self.pause_event.wait(60) # Check every minute
+            if self.starttime:
+                elapsed_time = subtract_it(get_time_stamp(),self.starttime)
+                if self.downloaded != 0 and elapsed_time != 0:
+                    cumulative_time = add_it(self.downloaded,elapsed_time)
+                    percent = divide_it(self.downloaded,cumulative_time)
+                else:
+                    percent = 0
+                if elapsed_time != 0:
+                    try:
+                        downloaded_minutes = divide_it(elapsed_time,60)
+                        estimated_download_minutes = divide_it(downloaded_minutes,percent)
+                        estimated_download_time = subtract_it(estimated_download_minutes,downloaded_minutes)
+                    except ZeroDivisionError:
+                        logger.warning("Caught a division by zero in monitor!")
+                        continue
+                    if downloaded_minutes != 0 and subtract_it(percent,downloaded_minutes) != 0:
+                        estimated_download_minutes = divide_it(downloaded_minutes,percent)
+                        estimated_download_time = subtract_it(estimated_download_minutes,downloaded_minutes)
+                    logger.info(f"Estimated download time: {estimated_download_time} minutes")
+                    if estimated_download_time >= 1.5:
+                        logger.info("Restarting download due to slow speed...")
+                        self.start() # Restart download
 
     def start(self):
-        download_thread =
-
-        self.
-        self.
-        self.
-        self.
-
-
-
-
-
-
-
-
+        self.download_thread = threading.Thread(target=self.download)
+        self.download_thread.daemon = True
+        self.monitor_thread = threading.Thread(target=self.monitor)
+        self.download_thread.start()
+        self.monitor_thread.start()
+        self.download_thread.join()
+        self.monitor_thread.join()
+
+    def stop(self):
+        self.monitoring = False
+        self.pause_event.set()
+def download_image(url, save_path=None):
+    """
+    Downloads an image from a URL and saves it to the specified path.
+
+    Args:
+        url (str): The URL of the image to download
+        save_path (str, optional): Path to save the image. If None, uses the filename from URL
+
+    Returns:
+        str: Path where the image was saved, or None if download failed
+    """
+    try:
+        # Send GET request to the URL
+        response = requests.get(url, stream=True)
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Set decode_content=True to automatically handle Content-Encoding
+            response.raw.decode_content = True
+
+            # If no save_path provided, extract filename from URL
+            if save_path is None:
+                # Get filename from URL
+                filename = url.split('/')[-1]
+                save_path = filename
+
+            # Ensure the directory exists
+            os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
+            # Write the image content to file
+            with open(save_path, 'wb') as f:
+                f.write(response.content)
+
+            print(f"Image successfully downloaded to {save_path}")
+            return save_path
+        else:
+            print(f"Failed to download image. Status code: {response.status_code}")
+            return None
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error downloading image: {str(e)}")
+        return None
+    except Exception as e:
+        print(f"An unexpected error occurred: {str(e)}")
+        return None
+def get_thumbnails(directory,info):
+    thumbnails_dir = os.path.join(directory,'thumbnails')
+    os.makedirs(thumbnails_dir, exist_ok=True)
+    thumbnails = info.get('thumbnails',[])
+    for i,thumbnail_info in enumerate(thumbnails):
+        thumbnail_url = thumbnail_info.get('url')
+        thumbnail_base_url = thumbnail_url.split('?')[0]
+        baseName = os.path.basename(thumbnail_base_url)
+        fileName,ext = os.path.splitext(baseName)
+        baseName = f"{fileName}{ext}"
+        resolution = info['thumbnails'][i].get('resolution')
+        if resolution:
+            baseName = f"{resolution}_{baseName}"
+        img_id = info['thumbnails'][i].get('id')
+        if img_id:
+            baseName = f"{img_id}_{baseName}"
+        thumbnail_path = os.path.join(thumbnails_dir,baseName)
+        info['thumbnails'][i]['path']=thumbnail_path
+        download_image(thumbnail_url, save_path=thumbnail_path)
+    return info
+def downloadvideo(url,directory=False,rename_display=True,thumbnails=True,audio=False):
+    directory = directory or os.getcwd()
+    temp_id = re.sub(r'[^\w\d.-]', '_', url)[-20:]
+    temp_filename = f"temp_{temp_id}.mp4"
+    video_mgr = VideoDownloader(
+        url=url,
+        download_directory=directory,
+        download_video=True,
+        get_info=True,
+        output_filename=temp_filename
+    )
+    info = video_mgr.info
+    display_id = info.get('display_id') or info.get('id')
+    directory = os.path.join(directory,display_id)
+    os.makedirs(directory, exist_ok=True)
+    if rename_display and info and 'file_path' in info:
+        # Rename using metadata
+        video_id = info.get('id', temp_id)
+        title = info.get('title', 'video')[:30] # Limit to 30 chars
+        safe_title = re.sub(r'[^\w\d.-]', '_', title)
+        final_filename = f"{safe_title}_{video_id}.mp4"
+        new_path = os.path.join(directory, final_filename)
+        if os.path.exists(info['file_path']):
+            os.rename(info['file_path'], new_path)
+            info['file_path'] = new_path
+    info_path = os.path.join(directory,'info.json')
+    if thumbnails:
+        info = get_thumbnails(directory,info)
+    if audio:
+        try:
+            info = download_audio(directory, info)
+        except:
+            info['audio_path'] = audio_path
+    safe_dump_to_file(info,info_path)
+    return info
+
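
Note that the rewritten videoDownloader.py still references names it never defines or imports: requestManagerSingleton in get_request, get_time_stamp in download() and monitor(), and the bare except in downloadvideo assigns an undefined audio_path, so those paths will raise NameError if reached. For orientation only, a minimal usage sketch of the new entry point as published (the URL is a placeholder; behavior such as the monitor thread restarting slow downloads is as shipped, not verified):

from abstract_webtools.managers.videoDownloader import downloadvideo

# Downloads the video, writes thumbnails and info.json under
# <directory>/<display_id>/, and returns the info dict.
info = downloadvideo(
    "https://example.com/watch?v=placeholder",  # hypothetical URL
    directory="/tmp/videos",
    rename_display=True,
    thumbnails=True,
    audio=False,
)
print(info.get("file_path"))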
abstract_webtools/managers/videoDownloader2.py
@@ -0,0 +1,291 @@
+from abstract_webtools import requestManager, urlManager, soupManager, requests, linkManager
+import threading,os,re,yt_dlp,urllib.request,m3u8_To_MP4,subprocess
+from abstract_utilities import get_logFile,safe_dump_to_file
+
+from m3u8 import M3U8 # Install: pip install m3u8
+from urllib.parse import urljoin
+from yt_dlp.postprocessor.ffmpeg import FFmpegFixupPostProcessor
+from abstract_math import divide_it,add_it,multiply_it,subtract_it
+from abstract_pandas import *
+class VideoDownloader:
+    def __init__(self, url, title=None, download_directory=os.getcwd(), user_agent=None, video_extention='mp4',
+                 download_video=True, get_info=False, auto_file_gen=True, standalone_download=False, output_filename=None):
+        self.url = url
+        self.monitoring = True
+        self.pause_event = threading.Event()
+        self.get_download = download_video
+        self.get_info = get_info
+        self.user_agent = user_agent
+        self.title = title
+        self.auto_file_gen = auto_file_gen
+        self.standalone_download = standalone_download
+        self.video_extention = video_extention
+        self.download_directory = download_directory
+        self.output_filename = output_filename # New parameter for custom filename
+        self.header = {} # Placeholder for UserAgentManagerSingleton if needed
+        self.base_name = os.path.basename(self.url)
+        self.file_name, self.ext = os.path.splitext(self.base_name)
+        self.video_urls = [self.url]
+        self.info = {}
+        self.starttime = None
+        self.downloaded = 0
+        self.video_urls = url if isinstance(url, list) else [url]
+        self.send_to_dl()
+
+    def get_request(self, url):
+        self.request_manager = requestManagerSingleton.get_instance(url=url)
+        return self.request_manager
+
+    def send_to_dl(self):
+        if self.standalone_download:
+            self.standalone_downloader()
+        else:
+            self.start()
+
+    def get_headers(self, url):
+        response = requests.get(url)
+        if response.status_code == 200:
+            return response.headers
+        else:
+            logger.error(f"Failed to retrieve headers for {url}. Status code: {response.status_code}")
+            return {}
+
+    @staticmethod
+    def get_directory_path(directory, name, video_extention):
+        file_path = os.path.join(directory, f"{name}.{video_extention}")
+        i = 0
+        while os.path.exists(file_path):
+            file_path = os.path.join(directory, f"{name}_{i}.{video_extention}")
+            i += 1
+        return file_path
+
+    def progress_callback(self, stream, chunk, bytes_remaining):
+        total_size = stream.filesize
+        self.downloaded = total_size - bytes_remaining
+
+    def download(self):
+        for video_url in self.video_urls:
+            # Use custom filename if provided, otherwise generate a short temporary one
+            if self.output_filename:
+                outtmpl = os.path.join(self.download_directory, self.output_filename)
+            else:
+                temp_id = re.sub(r'[^\w\d.-]', '_', video_url)[-20:] # Short temp ID from URL
+                outtmpl = os.path.join(self.download_directory, f"temp_{temp_id}.%(ext)s")
+
+            ydl_opts = {
+                'external_downloader': 'ffmpeg',
+                'outtmpl': outtmpl,
+                'noprogress': True,
+                'quiet': True, # Reduce verbosity in logs
+            }
+            try:
+                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                    self.info = ydl.extract_info(video_url, download=self.get_download)
+                    self.downloading = False
+                    self.starttime = get_time_stamp() # Assuming get_time_stamp() exists
+                    if self.auto_file_gen:
+                        file_path = ydl.prepare_filename(self.info)
+                        if self.get_info:
+                            self.info['file_path'] = file_path # Fixed typo 'aath'
+                    if self.get_info:
+                        self.stop()
+                        return self.info
+            except Exception as e:
+                logger.error(f"Failed to download {video_url}: {str(e)}")
+        self.stop()
+        return self.info
+
+    def monitor(self):
+        while self.monitoring:
+            logger.info("Monitoring...")
+            self.pause_event.wait(60) # Check every minute
+            if self.starttime:
+                elapsed_time = subtract_it(get_time_stamp(),self.starttime)
+                if self.downloaded != 0 and elapsed_time != 0:
+                    cumulative_time = add_it(self.downloaded,elapsed_time)
+                    percent = divide_it(self.downloaded,cumulative_time)
+                else:
+                    percent = 0
+                if elapsed_time != 0:
+                    try:
+                        downloaded_minutes = divide_it(elapsed_time,60)
+                        estimated_download_minutes = divide_it(downloaded_minutes,percent)
+                        estimated_download_time = subtract_it(estimated_download_minutes,downloaded_minutes)
+                    except ZeroDivisionError:
+                        logger.warning("Caught a division by zero in monitor!")
+                        continue
+                    if downloaded_minutes != 0 and subtract_it(percent,downloaded_minutes) != 0:
+                        estimated_download_minutes = divide_it(downloaded_minutes,percent)
+                        estimated_download_time = subtract_it(estimated_download_minutes,downloaded_minutes)
+                    logger.info(f"Estimated download time: {estimated_download_time} minutes")
+                    if estimated_download_time >= 1.5:
+                        logger.info("Restarting download due to slow speed...")
+                        self.start() # Restart download
+
+    def start(self):
+        self.download_thread = threading.Thread(target=self.download)
+        self.download_thread.daemon = True
+        self.monitor_thread = threading.Thread(target=self.monitor)
+        self.download_thread.start()
+        self.monitor_thread.start()
+        self.download_thread.join()
+        self.monitor_thread.join()
+
+    def stop(self):
+        self.monitoring = False
+        self.pause_event.set()
+
+def download_image(url, save_path=None):
+    """
+    Downloads an image from a URL and saves it to the specified path.
+
+    Args:
+        url (str): The URL of the image to download
+        save_path (str, optional): Path to save the image. If None, uses the filename from URL
+
+    Returns:
+        str: Path where the image was saved, or None if download failed
+    """
+    try:
+        # Send GET request to the URL
+        response = requests.get(url, stream=True)
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Set decode_content=True to automatically handle Content-Encoding
+            response.raw.decode_content = True
+
+            # If no save_path provided, extract filename from URL
+            if save_path is None:
+                # Get filename from URL
+                filename = url.split('/')[-1]
+                save_path = filename
+
+            # Ensure the directory exists
+            os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
+            # Write the image content to file
+            with open(save_path, 'wb') as f:
+                f.write(response.content)
+
+            print(f"Image successfully downloaded to {save_path}")
+            return save_path
+        else:
+            print(f"Failed to download image. Status code: {response.status_code}")
+            return None
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error downloading image: {str(e)}")
+        return None
+    except Exception as e:
+        print(f"An unexpected error occurred: {str(e)}")
+        return None
+def get_thumbnails(directory,info):
+    thumbnails_dir = os.path.join(directory,'thumbnails')
+    os.makedirs(thumbnails_dir, exist_ok=True)
+    thumbnails = info.get('thumbnails',[])
+    for i,thumbnail_info in enumerate(thumbnails):
+        thumbnail_url = thumbnail_info.get('url')
+        thumbnail_base_url = thumbnail_url.split('?')[0]
+        baseName = os.path.basename(thumbnail_base_url)
+        fileName,ext = os.path.splitext(baseName)
+        baseName = f"{fileName}{ext}"
+        resolution = info['thumbnails'][i].get('resolution')
+        if resolution:
+            baseName = f"{resolution}_{baseName}"
+        img_id = info['thumbnails'][i].get('id')
+        if img_id:
+            baseName = f"{img_id}_{baseName}"
+        thumbnail_path = os.path.join(thumbnails_dir,baseName)
+        info['thumbnails'][i]['path']=thumbnail_path
+        download_image(thumbnail_url, save_path=thumbnail_path)
+    return info
+def download_audio(directory, info):
+    """
+    Download the highest-quality audio (e.g., hls-audio-128000-Audio) from info.json and save it to a directory.
+
+    Args:
+        directory (str): Base directory for saving files (e.g., /var/www/clownworld/data/downloads/videos/videos/1897210679465328845/)
+        info (dict): Dictionary containing video metadata from info.json, including 'formats' and 'video_id'
+
+    Returns:
+        dict: Updated info with the audio file path
+    """
+    # Create an 'audio' subdirectory
+    audio_dir = os.path.join(directory, 'audio')
+    os.makedirs(audio_dir, exist_ok=True)
+
+    # Find the highest-quality audio format (e.g., hls-audio-128000-Audio with 128 kbps)
+    audio_formats = [f for f in info.get('formats', []) if f['format_id'].startswith('hls-audio')]
+    if not audio_formats:
+        logger.info("No audio formats found in info.json")
+        return info
+    # Sort by bitrate (tbr) to get the highest quality
+    audio_format = max(audio_formats, key=lambda x: x.get('tbr', 0))
+    audio_url = audio_format.get('url')
+    audio_ext = audio_format.get('ext', 'mp4') # Default to MP4 if not specified
+
+    # Extract video_id for filename
+    video_id = info.get('video_id', 'unknown_video')
+    title = info.get('title', 'audio').replace(' ', '_') # Clean title for filename
+    filename = f"{title}_{video_id}.{audio_ext}"
+    audio_path = os.path.join(audio_dir, filename)
+
+    # Download and process the M3U8/HLS audio stream
+    try:
+        # Fetch the M3U8 playlist
+        response = requests.get(audio_url, headers={
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.19 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-us,en;q=0.5',
+            'Sec-Fetch-Mode': 'navigate'
+        })
+        response.raise_for_status()
+
+        # Parse the M3U8 playlist
+        m3u8_obj = M3U8(response.text)
+        base_url = '/'.join(audio_url.split('/')[:-1]) + '/' # Base URL for relative segment paths
+
+        # Download all TS segments
+        segments = []
+        for segment in m3u8_obj.segments:
+            segment_url = urljoin(base_url, segment.uri)
+            segment_response = requests.get(segment_url, headers=response.request.headers)
+            segment_response.raise_for_status()
+            segments.append(segment_response.content)
+
+        # Save segments to temporary files for processing with ffmpeg
+        temp_dir = os.path.join(audio_dir, 'temp_segments')
+        os.makedirs(temp_dir, exist_ok=True)
+        segment_paths = []
+        for i, segment_data in enumerate(segments):
+            segment_path = os.path.join(temp_dir, f'segment_{i}.ts')
+            with open(segment_path, 'wb') as f:
+                f.write(segment_data)
+            segment_paths.append(segment_path)
+
+        # Use ffmpeg to concatenate TS segments into a single MP4 audio file
+        output_path = audio_path
+        try:
+            ffmpeg.input('concat:' + '|'.join(segment_paths), format='concat', safe=0).output(
+                output_path, c='copy', loglevel='quiet'
+            ).run()
+        except Exception as e:
+            logger.info(f"FFmpeg error: {e.stderr.decode()}")
+
+        # Clean up temporary segment files
+        for segment_path in segment_paths:
+            os.remove(segment_path)
+        os.rmdir(temp_dir)
+
+        # Update info with the audio path
+
+        info['audio_path'] = audio_path
+        info['audio_url'] = f"https://clownworld.biz/data/downloads/videos/videos/{video_id}/audio/{filename}"
+
+    except requests.RequestException as e:
+        logger.info(f"Failed to download audio: {str(e)}")
+    except Exception as e:
+        logger.info(f"Error processing audio: {str(e)}")
+
+    return info
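
videoDownloader2.py is nearly a verbatim copy of the rewritten videoDownloader.py plus download_audio; the m3u8_To_MP4 import is unused in both. One caveat in download_audio: the segment join calls ffmpeg.input(...) without importing any ffmpeg module (ffmpeg-python appears intended), and it mixes the 'concat:' protocol string with the concat-demuxer options format='concat', safe=0, so the step cannot run as written. A sketch of that step via the ffmpeg CLI instead (assumes ffmpeg is on PATH; concat_ts_segments is my name, not the package's):

import subprocess

def concat_ts_segments(segment_paths, output_path):
    """Join downloaded HLS .ts segments into one file via ffmpeg's concat protocol."""
    concat_input = "concat:" + "|".join(segment_paths)
    subprocess.run(
        ["ffmpeg", "-y", "-i", concat_input, "-c", "copy", output_path],
        check=True,           # raise on a non-zero ffmpeg exit code
        capture_output=True,  # suppress console output; stderr is kept for debugging
    )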
abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4
Binary file
{abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/METADATA
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: abstract_webtools
-Version: 0.1.6.
+Version: 0.1.6.26
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
@@ -13,10 +13,19 @@ Classifier: Programming Language :: Python :: 3.11
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
-Requires-Dist: PySimpleGUI
-Requires-Dist: urllib3
-Requires-Dist: requests
+Requires-Dist: abstract_utilities>=0.2.2.30
+Requires-Dist: PySimpleGUI>=4.60.5
+Requires-Dist: urllib3>=2.0.4
+Requires-Dist: requests>=2.31.0
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # Abstract WebTools
 Provides utilities for inspecting and parsing web content, including React components and URL utilities, with enhanced capabilities for managing HTTP requests and TLS configurations.
{abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/RECORD
@@ -8,6 +8,8 @@ abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f
 abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
 abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
 abstract_webtools/managers/crawlManager.py,sha256=62Ej6AQC6-qXX_EWOmcJ2szNvEjmebFGugMz65HF1qI,12983
+abstract_webtools/managers/crawlmgr2.py,sha256=PvHas-FSlp98osc-2so9zw-2c7amUMdwIj6tmc6Rl00,1910
+abstract_webtools/managers/curlMgr.py,sha256=ghi0QsSAxjZu3HALFST5Kv_262XhHSAPGlQLvmguxPY,1657
 abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
 abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
 abstract_webtools/managers/get_test.py,sha256=nISrhUGdyvRv18wTGoifGhizBFoHeK0N3FymMASloFw,825
@@ -21,9 +23,11 @@ abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQq
 abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
 abstract_webtools/managers/urlManager.py,sha256=Dvf-TiSo5j_YjZS2Eq6lFfbhveneD6NA_wEE0xUXy_E,8858
 abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
-abstract_webtools/managers/videoDownloader.py,sha256=
-abstract_webtools
-abstract_webtools
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools/managers/videoDownloader.py,sha256=6-QIcj_QOutqycuD71X5McRTbukcqPTzjYmrRRmiqWE,10274
+abstract_webtools/managers/videoDownloader2.py,sha256=v3H6akdhvVWGrB-r35m3cp_-aKkNWadpfCiMylOnv6w,12748
+abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
+abstract_webtools-0.1.6.26.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
+abstract_webtools-0.1.6.26.dist-info/METADATA,sha256=gK6NdWrMjM5uth72JnNDapM393Wsmfa_OFMcZyjKq60,16051
+abstract_webtools-0.1.6.26.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+abstract_webtools-0.1.6.26.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.26.dist-info/RECORD,,
{abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/LICENSE
File without changes
{abstract_webtools-0.1.6.24.dist-info → abstract_webtools-0.1.6.26.dist-info}/top_level.txt
File without changes