abstract-webtools 0.1.6.23__py3-none-any.whl → 0.1.6.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,63 @@
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from urllib.parse import urljoin, urlparse
5
+
6
+ visited = set()
7
+
8
def download_page(url, destination_dir):
    """Download a single page and save its HTML into *destination_dir*.

    Args:
        url: Absolute URL of the page to fetch.
        destination_dir: Directory the HTML file is written to (created
            if missing).

    Returns:
        Tuple of ``(page_text, saved_file_path)``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # Create directory if needed
    os.makedirs(destination_dir, exist_ok=True)
    os.chmod(destination_dir, 0o755)  # optional: set directory perms

    # Download.  BUG FIX: the original call had no timeout, so a stalled
    # server would hang the crawler forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Build a safe file name for the HTML file.
    # E.g., for "https://example.com/about/", you might store "about.html"
    parsed = urlparse(url)
    if not parsed.path or parsed.path.endswith("/"):
        filename = "index.html"
    else:
        filename = os.path.basename(parsed.path)
    if not filename.endswith(".html"):
        filename += ".html"

    filepath = os.path.join(destination_dir, filename)
    with open(filepath, "wb") as f:
        f.write(response.content)

    return response.text, filepath
31
+
32
def crawl(url, destination_dir):
    """Download a site starting from *url*, following every ``<a href>``.

    BUG FIX: the original implementation recursed once per link, so any
    realistically-sized site exhausted Python's recursion limit.  This
    version walks an explicit stack instead.  Already-seen URLs are
    tracked in the module-level ``visited`` set.
    """
    stack = [url]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)

        try:
            html, _ = download_page(current, destination_dir)
        except Exception as e:
            print(f"Failed to download {current}: {e}")
            continue

        soup = BeautifulSoup(html, "html.parser")

        # Find all <a> tags with an href and queue their targets.
        for link_tag in soup.find_all("a", href=True):
            # Convert a relative URL to an absolute one.
            absolute_link = urljoin(current, link_tag["href"])

            # Skips mailto:, javascript:, etc.
            # NOTE(review): there is no same-domain check, so this will
            # wander off to every externally linked site — confirm that
            # is intended before running it unattended.
            if absolute_link.startswith("http"):
                stack.append(absolute_link)
58
+
59
if __name__ == "__main__":
    # Script entry point: crawl the configured start page into the
    # destination directory.
    entry_url = "https://svscomics.com/category/giantess/page/24"
    target_dir = "/home/svc"
    crawl(entry_url, target_dir)
@@ -0,0 +1,48 @@
1
+ import os
2
+ import requests
3
+ import os
4
+ import subprocess
5
+ import stat
6
+
7
def get_site(website, destination_dir, filename):
    """Download *website* to ``destination_dir/filename`` using curl.

    Args:
        website: URL to fetch.
        destination_dir: Directory to save into (created if missing).
        filename: Name of the output file.
    """
    # Ensure the directory exists
    os.makedirs(destination_dir, exist_ok=True)

    # Adjust directory permissions if needed (e.g. rwxr-xr-x -> 0o755)
    os.chmod(destination_dir, 0o755)

    # Construct the complete file path
    destination_path = os.path.join(destination_dir, filename)

    # SECURITY FIX: the original built a shell command with an f-string
    # and ran it through os.system, so a crafted URL or path could inject
    # arbitrary shell commands.  Passing an argument list to
    # subprocess.run with no shell removes that vector.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 "
        "Safari/537.36"
    )
    subprocess.run(
        [
            "curl", "-L",
            "--output", destination_path,
            "-H", f"User-Agent: {user_agent}",
            "-H", "Accept: */*",
            website,
        ],
        check=False,  # match os.system: do not raise when curl fails
    )
25
+
26
def download_site(website, destination_dir, filename):
    """Download *website* with requests and save the body to a file.

    Args:
        website: URL to fetch.
        destination_dir: Directory to save into (created if missing).
        filename: Name of the output file.

    Raises:
        requests.HTTPError: If the response status indicates failure.
    """
    os.makedirs(destination_dir, exist_ok=True)
    os.chmod(destination_dir, 0o755)  # set directory permissions if needed

    destination_path = os.path.join(destination_dir, filename)

    # GET the resource.  BUG FIX: added a timeout so a dead server cannot
    # hang the call indefinitely.
    response = requests.get(website, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "*/*"
    }, allow_redirects=True, timeout=30)

    # Raise an exception if the download fails
    response.raise_for_status()

    # Write content to file
    with open(destination_path, "wb") as f:
        f.write(response.content)
46
# Module-level driver: fetch the configured site into the destination
# directory via curl.
website = 'https://www.pornhub.com'
destination = '/home/computron/Documents/doge'
get_site(website, destination, 'doge')
@@ -340,13 +340,13 @@ class SoupManagerSingleton():
340
340
  elif parse_type != SoupManagerSingleton._instance.parse_type or source_code != SoupManagerSingleton._instance.source_code:
341
341
  SoupManagerSingleton._instance = SoupManager(url_mgr,requestManager,parse_type=parse_type,source_code=source_code)
342
342
  return SoupManagerSingleton._instance
343
- def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None):
343
+ def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,parse_type="html.parser"):
344
344
  url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
345
345
  url = get_url(url=url,url_mgr=url_mgr)
346
346
  req_mgr = get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
347
347
  soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code)
348
348
  return soup_mgr
349
- def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None):
349
+ def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None,parse_type="html.parser"):
350
350
  soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
351
351
  return soup_mgr.get_all_attribute_values(tags_list=tags_list)
352
352
  def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,parse_type="html.parser"):
@@ -1,206 +1,218 @@
1
- import os
2
- from .soupManager import *
3
- class VideoDownloader:
4
- """
5
- VideoDownloader is a class for downloading videos from URLs using YouTube-DL.
6
-
7
- Args:
8
- link (str or list): The URL(s) of the video(s) to be downloaded.
9
- temp_directory (str or None): The directory to store temporary video files (default is None, uses video_directory/temp_files).
10
- video_directory (str or None): The directory to store downloaded videos (default is None, uses 'videos' in the current working directory).
11
- remove_existing (bool): Whether to remove existing video files with the same name (default is True).
12
-
13
- Methods:
14
- count_outliers(speed, threshold): Count speed outliers below the threshold.
15
- filter_outliers(speeds): Filter out speed outliers in the list of speeds.
16
- remove_temps(file_name): Remove temporary video files based on the file name.
17
- move_video(): Move the downloaded video to the final directory.
18
- yt_dlp_downloader(url, ydl_opts={}, download=True): Download video information using YouTube-DL.
19
- progress_callback(d): Callback function to monitor download progress.
20
- download(): Download video(s) based on the provided URL(s).
21
- monitor(): Monitor the download progress.
22
- start(): Start the download and monitoring threads.
1
+ from abstract_webtools import requestManager, urlManager, soupManager, requests, linkManager
2
+ import threading,os,re,yt_dlp,urllib.request,m3u8_To_MP4,subprocess
3
+ from abstract_utilities import get_logFile,safe_dump_to_file
23
4
 
24
- Note:
25
- - The VideoDownloader class uses YouTube-DL to download videos.
26
- - It allows downloading from multiple URLs.
27
- - You need to have YouTube-DL installed to use this class.
28
- """
29
- def __init__(self, link,temp_directory=None,video_directory=None,remove_existing=True):
30
- if video_directory==None:
31
- video_directory=os.path.join(os.getcwd(),'videos')
32
- if temp_directory == None:
33
- temp_directory=os.path.join(video_directory,'temp_files')
34
- self.thread_manager = ThreadManager()
35
- self.pause_event = self.thread_manager.add_thread('pause_event')
36
- self.link = link
37
- self.temp_directory = temp_directory
38
- self.video_directory = video_directory
39
- self.remove_existing=remove_existing
40
- self.video_urls=self.link if isinstance(self.link,list) else [self.link]
5
+ from m3u8 import M3U8 # Install: pip install m3u8
6
+ from urllib.parse import urljoin
7
+ from yt_dlp.postprocessor.ffmpeg import FFmpegFixupPostProcessor
8
+ from abstract_math import divide_it,add_it,multiply_it,subtract_it
9
+ from abstract_webtools import *
10
+ logger = get_logFile('video_bp')
11
+ class VideoDownloader:
12
+ def __init__(self, url, title=None, download_directory=os.getcwd(), user_agent=None, video_extention='mp4',
13
+ download_video=True, get_info=False, auto_file_gen=True, standalone_download=False, output_filename=None):
14
+ self.url = url
15
+ self.monitoring = True
16
+ self.pause_event = threading.Event()
17
+ self.get_download = download_video
18
+ self.get_info = get_info
19
+ self.user_agent = user_agent
20
+ self.title = title
21
+ self.auto_file_gen = auto_file_gen
22
+ self.standalone_download = standalone_download
23
+ self.video_extention = video_extention
24
+ self.download_directory = download_directory
25
+ self.output_filename = output_filename # New parameter for custom filename
26
+ self.header = {} # Placeholder for UserAgentManagerSingleton if needed
27
+ self.base_name = os.path.basename(self.url)
28
+ self.file_name, self.ext = os.path.splitext(self.base_name)
29
+ self.video_urls = [self.url]
30
+ self.info = {}
41
31
  self.starttime = None
42
32
  self.downloaded = 0
43
- self.time_interval=60
44
- self.monitoring=True
45
- self.temp_file_name = None
46
- self.file_name = None
47
- self.dl_speed = None
48
- self.dl_eta=None
49
- self.total_bytes_est=None
50
- self.percent_speed=None
51
- self.percent=None
52
- self.speed_track = []
53
- self.video_url=None
54
- self.last_checked = get_time_stamp()
55
- self.num=0
56
- self.start()
57
- def count_outliers(self,speed,threshold):
58
- if speed < threshold:
59
- self.outlier_count+=1
33
+ self.video_urls = url if isinstance(url, list) else [url]
34
+ self.send_to_dl()
35
+
36
+ def get_request(self, url):
37
+ self.request_manager = requestManagerSingleton.get_instance(url=url)
38
+ return self.request_manager
39
+
40
+ def send_to_dl(self):
41
+ if self.standalone_download:
42
+ self.standalone_downloader()
60
43
  else:
61
- self.outlier_count=0
62
- def filter_outliers(self,speeds):
63
- # Step 1: Compute initial average
64
- initial_avg = sum(speeds) / len(speeds)
65
-
66
- # Step 2: Remove speeds 25% under the average
67
- threshold = initial_avg * 0.75 # 25% under average
68
- filtered_speeds = [speed for speed in speeds if speed >= threshold]
69
-
70
- # Step 3: Compute the new average of the filtered list
71
- if filtered_speeds: # Ensure the list is not empty
72
- self.count_outliers(speeds[-1],threshold)
73
- return filtered_speeds
44
+ self.start()
45
+
46
+ def get_headers(self, url):
47
+ response = requests.get(url)
48
+ if response.status_code == 200:
49
+ return response.headers
74
50
  else:
75
- # This can happen if all values are outliers, it's up to you how to handle it
76
- self.outlier_count=0
77
- return speeds
78
- def remove_temps(self,file_name):
79
- for temp_vid in os.listdir(self.temp_directory):
80
- if len(file_name)<=len(temp_vid):
81
- if temp_vid[:len(file_name)] == file_name:
82
- os.remove(os.path.join(self.temp_directory,temp_vid))
83
- print(f"removing {temp_vid} from {self.temp_directory}")
84
- def move_video(self):
85
- if os.path.exists(self.temp_file_path):
86
- shutil.move(self.temp_file_path, self.video_directory)
87
- print(f"moving {self.file_name} from {self.temp_directory} to {self.video_directory}")
88
- self.remove_temps(self.file_name)
89
- return True
90
- if os.path.exists(self.complete_file_path):
91
- print(f"{self.file_name} already existed in {self.video_directory}; removing it from {self.temp_directory}")
92
- self.remove_temps(self.file_name)
93
- return True
94
- return False
95
- def yt_dlp_downloader(self,url,ydl_opts={},download=True):
96
- try:
97
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
98
- self.info_dict=ydl.extract_info(url=url, download=download)
99
- return True
100
- except:
101
- return False
102
- def progress_callback(self, d):
103
- self.status_dict = d
104
- keys = ['status',
105
- 'downloaded_bytes',
106
- 'fragment_index',
107
- 'fragment_count',
108
- 'filename',
109
- 'tmpfilename',
110
- 'max_progress',
111
- 'progress_idx',
112
- 'elapsed',
113
- 'total_bytes_estimate',
114
- 'speed',
115
- 'eta',
116
- '_eta_str',
117
- '_speed_str',
118
- '_percent_str',
119
- '_total_bytes_str',
120
- '_total_bytes_estimate_str',
121
- '_downloaded_bytes_str',
122
- '_elapsed_str',
123
- '_default_template']
124
- if self.status_dict['status'] == 'finished':
125
- print("Done downloading, moving video to final directory...")
126
- self.move_video()
127
- return
128
- if get_time_stamp()-self.last_checked>5:
129
- print(self.status_dict['_default_template'])
130
- self.last_checked = get_time_stamp()
131
- if (get_time_stamp()-self.start_time/5)>6:
132
- self.speed_track.append(self.status_dict['speed'])
133
- self.speed_track=self.filter_outliers(self.speed_track)
134
-
51
+ logger.error(f"Failed to retrieve headers for {url}. Status code: {response.status_code}")
52
+ return {}
53
+
54
+ @staticmethod
55
+ def get_directory_path(directory, name, video_extention):
56
+ file_path = os.path.join(directory, f"{name}.{video_extention}")
57
+ i = 0
58
+ while os.path.exists(file_path):
59
+ file_path = os.path.join(directory, f"{name}_{i}.{video_extention}")
60
+ i += 1
61
+ return file_path
62
+
63
+ def progress_callback(self, stream, chunk, bytes_remaining):
64
+ total_size = stream.filesize
65
+ self.downloaded = total_size - bytes_remaining
66
+
135
67
  def download(self):
136
- if not os.path.exists(self.video_directory):
137
- os.makedirs(self.video_directory,exist_ok=True)
138
- if not os.path.exists(self.temp_directory):
139
- os.makedirs(self.temp_directory,exist_ok=True)
140
- for self.num,video_url in enumerate(self.video_urls):
141
- if video_url != self.video_url or self.video_url == None:
142
- self.video_url=video_url
143
- self.info_dict=None
144
- result = self.yt_dlp_downloader(url=self.video_url,ydl_opts={'quiet': True, 'no_warnings': True},download=False)
145
- if self.info_dict != None and result:
146
- self.start_time = get_time_stamp()
147
- self.downloaded = 0
148
- self.video_title = self.info_dict.get('title', None)
149
- self.video_ext = self.info_dict.get('ext', 'mp4')
150
- self.file_name =f"{self.video_title}.{self.video_ext}"
151
- self.temp_file_path = os.path.join(self.temp_directory, self.file_name)
152
- self.complete_file_path = os.path.join(self.video_directory, self.file_name)
153
- if not self.move_video():
154
- self.dl_speed = []
155
- self.percent=None
156
- self.dl_eta=None
157
- self.total_bytes_est=None
158
- self.percent_speed=None
159
- self.speed_track = []
160
- self.outlier_count=0
161
- ydl_opts = {
162
- 'outtmpl': self.temp_file_path,
163
- 'noprogress':True,
164
- 'progress_hooks': [self.progress_callback]
165
- }
166
-
167
-
168
- print("Starting download...") # Check if this point in code is reached
169
- result = self.yt_dlp_downloader(url=self.video_url,ydl_opts=ydl_opts,download=True)
170
- if result:
171
- print("Download finished!") # Check if download completes
172
- else:
173
- print(f'error downloding {self.video_url}')
174
- self.move_video()
175
- else:
176
- print(f"The video from {self.video_url} already exists in the directory {self.video_directory}. Skipping download.")
177
- else:
178
- print(f"could not find video info from {self.video_url} Skipping download.")
179
- if self.num==len(self.video_urls)-1:
180
- self.monitoring=False
181
- self.time_interval=0
68
+ for video_url in self.video_urls:
69
+ # Use custom filename if provided, otherwise generate a short temporary one
70
+ if self.output_filename:
71
+ outtmpl = os.path.join(self.download_directory, self.output_filename)
72
+ else:
73
+ temp_id = re.sub(r'[^\w\d.-]', '_', video_url)[-20:] # Short temp ID from URL
74
+ outtmpl = os.path.join(self.download_directory, f"temp_{temp_id}.%(ext)s")
182
75
 
76
+ ydl_opts = {
77
+ 'external_downloader': 'ffmpeg',
78
+ 'outtmpl': outtmpl,
79
+ 'noprogress': True,
80
+ 'quiet': True, # Reduce verbosity in logs
81
+ }
82
+ try:
83
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
84
+ self.info = ydl.extract_info(video_url, download=self.get_download)
85
+ self.downloading = False
86
+ self.starttime = get_time_stamp() # Assuming get_time_stamp() exists
87
+ if self.auto_file_gen:
88
+ file_path = ydl.prepare_filename(self.info)
89
+ if self.get_info:
90
+ self.info['file_path'] = file_path # Fixed typo 'aath'
91
+ if self.get_info:
92
+ self.stop()
93
+ return self.info
94
+ except Exception as e:
95
+ logger.error(f"Failed to download {video_url}: {str(e)}")
96
+ self.stop()
97
+ return self.info
98
+
183
99
  def monitor(self):
184
100
  while self.monitoring:
185
- self.thread_manager.wait(name='pause_event',n=self.time_interval)# check every minute
186
- if self.monitoring:
187
- if 'eta' in self.status_dict:
188
- if self.outlier_count>=3 and (self.status_dict['eta']/60)>10:
189
- self.start()
101
+ logger.info("Monitoring...")
102
+ self.pause_event.wait(60) # Check every minute
103
+ if self.starttime:
104
+ elapsed_time = subtract_it(get_time_stamp(),self.starttime)
105
+ if self.downloaded != 0 and elapsed_time != 0:
106
+ cumulative_time = add_it(self.downloaded,elapsed_time)
107
+ percent = divide_it(self.downloaded,cumulative_time)
108
+ else:
109
+ percent = 0
110
+ if elapsed_time != 0:
111
+ try:
112
+ downloaded_minutes = divide_it(elapsed_time,60)
113
+ estimated_download_minutes = divide_it(downloaded_minutes,percent)
114
+ estimated_download_time = subtract_it(estimated_download_minutes,downloaded_minutes)
115
+ except ZeroDivisionError:
116
+ logger.warning("Caught a division by zero in monitor!")
117
+ continue
118
+ if downloaded_minutes != 0 and subtract_it(percent,downloaded_minutes) != 0:
119
+ estimated_download_minutes = divide_it(downloaded_minutes,percent)
120
+ estimated_download_time = subtract_it(estimated_download_minutes,downloaded_minutes)
121
+ logger.info(f"Estimated download time: {estimated_download_time} minutes")
122
+ if estimated_download_time >= 1.5:
123
+ logger.info("Restarting download due to slow speed...")
124
+ self.start() # Restart download
190
125
 
191
126
  def start(self):
192
- download_thread = self.thread_manager.add_thread(name='download_thread',target_function=self.download)
193
- monitor_thread = self.thread_manager.add_thread(name='monitor_thread',target_function=self.monitor)
194
- self.thread_manager.start(name='download_thread')
195
- self.thread_manager.start(name='monitor_thread')
196
- self.thread_manager.join(name='download_thread')
197
- self.thread_manager.join(name='monitor_thread')
198
- class VideoDownloaderSingleton():
199
- _instance = None
200
- @staticmethod
201
- def get_instance(url_manager,request_manager,title=None,video_extention='mp4',download_directory=os.getcwd(),user_agent=None,download=True,get_info=False):
202
- if VideoDownloaderSingleton._instance is None:
203
- VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info,user_agent=user_agent)
204
- elif VideoDownloaderSingleton._instance.title != title or video_extention != VideoDownloaderSingleton._instance.video_extention or url != VideoDownloaderSingleton._instance.url or download_directory != VideoDownloaderSingleton._instance.download_directory or user_agent != VideoDownloaderSingleton._instance.user_agent:
205
- VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info,user_agent=user_agent)
206
- return VideoDownloaderSingleton._instance
127
+ self.download_thread = threading.Thread(target=self.download)
128
+ self.download_thread.daemon = True
129
+ self.monitor_thread = threading.Thread(target=self.monitor)
130
+ self.download_thread.start()
131
+ self.monitor_thread.start()
132
+ self.download_thread.join()
133
+ self.monitor_thread.join()
134
+
135
+ def stop(self):
136
+ self.monitoring = False
137
+ self.pause_event.set()
138
def download_image(url, save_path=None):
    """
    Downloads an image from a URL and saves it to the specified path.

    Args:
        url (str): The URL of the image to download
        save_path (str, optional): Path to save the image. If None, uses the filename from URL

    Returns:
        str: Path where the image was saved, or None if download failed
    """
    try:
        # Send GET request to the URL
        response = requests.get(url, stream=True)

        # Check if the request was successful
        if response.status_code == 200:
            # Set decode_content=True to automatically handle Content-Encoding
            response.raw.decode_content = True

            # If no save_path provided, extract filename from URL
            if save_path is None:
                save_path = url.split('/')[-1]

            # Ensure the directory exists.  BUG FIX: os.path.dirname()
            # returns '' for a bare filename and os.makedirs('') raises,
            # so the default-save-path branch always failed; only create
            # the directory when there actually is one.
            parent = os.path.dirname(save_path)
            if parent:
                os.makedirs(parent, exist_ok=True)

            # Write the image content to file
            with open(save_path, 'wb') as f:
                f.write(response.content)

            print(f"Image successfully downloaded to {save_path}")
            return save_path
        else:
            print(f"Failed to download image. Status code: {response.status_code}")
            return None

    except requests.exceptions.RequestException as e:
        print(f"Error downloading image: {str(e)}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None
183
def get_thumbnails(directory, info):
    """Download every thumbnail listed in *info* into ``<directory>/thumbnails``.

    Each thumbnail entry gains a ``'path'`` key pointing at the saved
    file.  Returns the (mutated) *info* dict.
    """
    thumbnails_dir = os.path.join(directory, 'thumbnails')
    os.makedirs(thumbnails_dir, exist_ok=True)
    thumbnails = info.get('thumbnails', [])
    for i, thumbnail_info in enumerate(thumbnails):
        thumbnail_url = thumbnail_info.get('url')
        # BUG FIX: entries without a URL crashed the .split() below.
        if not thumbnail_url:
            continue
        # Drop the query string before deriving a file name.
        thumbnail_base_url = thumbnail_url.split('?')[0]
        baseName = os.path.basename(thumbnail_base_url)
        resolution = info['thumbnails'][i].get('resolution')
        if resolution:
            baseName = f"{resolution}_{baseName}"
        img_id = info['thumbnails'][i].get('id')
        if img_id:
            baseName = f"{img_id}_{baseName}"
        thumbnail_path = os.path.join(thumbnails_dir, baseName)
        info['thumbnails'][i]['path'] = thumbnail_path
        download_image(thumbnail_url, save_path=thumbnail_path)
    return info
203
def downloadvideo(url, directory=False, thumbnails=True):
    """Download *url* into *directory* (cwd by default) and return its info.

    When *thumbnails* is true, the thumbnail images are fetched as well.
    """
    target_dir = directory or os.getcwd()
    # Derive a short, filesystem-safe temp name from the tail of the URL.
    safe_tail = re.sub(r'[^\w\d.-]', '_', url)[-20:]
    downloader = VideoDownloader(
        url=url,
        download_directory=target_dir,
        download_video=True,
        get_info=True,
        output_filename=f"temp_{safe_tail}.mp4",
    )
    result = downloader.info
    if thumbnails:
        result = get_thumbnails(target_dir, result)
    return result
@@ -0,0 +1,291 @@
1
+ from abstract_webtools import requestManager, urlManager, soupManager, requests, linkManager
2
+ import threading,os,re,yt_dlp,urllib.request,m3u8_To_MP4,subprocess
3
+ from abstract_utilities import get_logFile,safe_dump_to_file
4
+
5
+ from m3u8 import M3U8 # Install: pip install m3u8
6
+ from urllib.parse import urljoin
7
+ from yt_dlp.postprocessor.ffmpeg import FFmpegFixupPostProcessor
8
+ from abstract_math import divide_it,add_it,multiply_it,subtract_it
9
+ from abstract_pandas import *
10
class VideoDownloader:
    """Threaded yt-dlp wrapper that downloads one or more video URLs.

    The constructor immediately starts the download (via ``send_to_dl``),
    so building an instance is a blocking operation.
    """
    def __init__(self, url, title=None, download_directory=os.getcwd(), user_agent=None, video_extention='mp4',
                 download_video=True, get_info=False, auto_file_gen=True, standalone_download=False, output_filename=None):
        self.url = url
        self.monitoring = True
        self.pause_event = threading.Event()
        self.get_download = download_video
        self.get_info = get_info
        self.user_agent = user_agent
        self.title = title
        self.auto_file_gen = auto_file_gen
        self.standalone_download = standalone_download
        self.video_extention = video_extention
        self.download_directory = download_directory
        self.output_filename = output_filename  # custom output filename, optional
        self.header = {}  # Placeholder for UserAgentManagerSingleton if needed
        self.base_name = os.path.basename(self.url)
        self.file_name, self.ext = os.path.splitext(self.base_name)
        self.info = {}
        self.starttime = None
        self.downloaded = 0
        # BUG FIX: self.downloading was only ever assigned inside
        # download(); initialize it so reads before a download are safe.
        self.downloading = False
        self.video_urls = url if isinstance(url, list) else [url]
        self.send_to_dl()

    def get_request(self, url):
        # Delegates to the project-wide singleton request manager.
        self.request_manager = requestManagerSingleton.get_instance(url=url)
        return self.request_manager

    def send_to_dl(self):
        """Dispatch to the standalone downloader or the threaded start()."""
        if self.standalone_download:
            # NOTE(review): standalone_downloader is not defined in this
            # file — calling with standalone_download=True raises
            # AttributeError unless it is provided elsewhere; confirm.
            self.standalone_downloader()
        else:
            self.start()

    def get_headers(self, url):
        """Return the response headers for *url*, or {} on failure."""
        response = requests.get(url)
        if response.status_code == 200:
            return response.headers
        else:
            logger.error(f"Failed to retrieve headers for {url}. Status code: {response.status_code}")
            return {}

    @staticmethod
    def get_directory_path(directory, name, video_extention):
        """Return a path under *directory* that does not yet exist,
        suffixing ``_0``, ``_1``, ... to *name* as needed."""
        file_path = os.path.join(directory, f"{name}.{video_extention}")
        i = 0
        while os.path.exists(file_path):
            file_path = os.path.join(directory, f"{name}_{i}.{video_extention}")
            i += 1
        return file_path

    def progress_callback(self, stream, chunk, bytes_remaining):
        # Track bytes downloaded so monitor() can estimate progress.
        total_size = stream.filesize
        self.downloaded = total_size - bytes_remaining

    def download(self):
        """Download every URL in self.video_urls with yt-dlp/ffmpeg.

        Returns the last yt-dlp info dict (or the one for the first URL
        when get_info is set, which returns early by design).
        """
        for video_url in self.video_urls:
            # Use custom filename if provided, otherwise generate a short temporary one
            if self.output_filename:
                outtmpl = os.path.join(self.download_directory, self.output_filename)
            else:
                temp_id = re.sub(r'[^\w\d.-]', '_', video_url)[-20:]  # Short temp ID from URL
                outtmpl = os.path.join(self.download_directory, f"temp_{temp_id}.%(ext)s")

            ydl_opts = {
                'external_downloader': 'ffmpeg',
                'outtmpl': outtmpl,
                'noprogress': True,
                'quiet': True,  # Reduce verbosity in logs
            }
            try:
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    self.info = ydl.extract_info(video_url, download=self.get_download)
                    self.downloading = False
                    # assumes get_time_stamp() comes from a star import — TODO confirm
                    self.starttime = get_time_stamp()
                    if self.auto_file_gen:
                        file_path = ydl.prepare_filename(self.info)
                        if self.get_info:
                            self.info['file_path'] = file_path
                    if self.get_info:
                        self.stop()
                        return self.info
            except Exception as e:
                logger.error(f"Failed to download {video_url}: {str(e)}")
                self.stop()
        # BUG FIX: the original returned inside the loop, so only the
        # first URL of a list was ever processed.
        return self.info

    def monitor(self):
        """Periodically estimate progress and restart slow downloads.

        Runs until stop() clears self.monitoring.  BUG FIX: the original
        could reference downloaded_minutes / estimated_download_time
        before assignment (NameError when elapsed_time was 0); the guard
        clauses below make every path well-defined.
        """
        while self.monitoring:
            logger.info("Monitoring...")
            self.pause_event.wait(60)  # Check every minute
            if not self.starttime:
                continue
            elapsed_time = subtract_it(get_time_stamp(), self.starttime)
            if elapsed_time == 0:
                continue
            if self.downloaded != 0:
                cumulative_time = add_it(self.downloaded, elapsed_time)
                percent = divide_it(self.downloaded, cumulative_time)
            else:
                percent = 0
            try:
                downloaded_minutes = divide_it(elapsed_time, 60)
                estimated_download_minutes = divide_it(downloaded_minutes, percent)
                estimated_download_time = subtract_it(estimated_download_minutes, downloaded_minutes)
            except ZeroDivisionError:
                logger.warning("Caught a division by zero in monitor!")
                continue
            logger.info(f"Estimated download time: {estimated_download_time} minutes")
            if estimated_download_time >= 1.5:
                logger.info("Restarting download due to slow speed...")
                self.start()  # Restart download

    def start(self):
        """Run download and monitor threads; blocks until both finish."""
        self.download_thread = threading.Thread(target=self.download)
        self.download_thread.daemon = True
        self.monitor_thread = threading.Thread(target=self.monitor)
        self.download_thread.start()
        self.monitor_thread.start()
        self.download_thread.join()
        self.monitor_thread.join()

    def stop(self):
        """Stop monitoring and wake the monitor thread immediately."""
        self.monitoring = False
        self.pause_event.set()
138
def download_image(url, save_path=None):
    """
    Downloads an image from a URL and saves it to the specified path.

    Args:
        url (str): The URL of the image to download
        save_path (str, optional): Path to save the image. If None, uses the filename from URL

    Returns:
        str: Path where the image was saved, or None if download failed
    """
    try:
        # Send GET request to the URL
        response = requests.get(url, stream=True)

        # Check if the request was successful
        if response.status_code == 200:
            # Set decode_content=True to automatically handle Content-Encoding
            response.raw.decode_content = True

            # If no save_path provided, extract filename from URL
            if save_path is None:
                save_path = url.split('/')[-1]

            # Ensure the directory exists.  BUG FIX: os.path.dirname()
            # returns '' for a bare filename and os.makedirs('') raises,
            # so the default-save-path branch always failed; only create
            # the directory when there actually is one.
            parent = os.path.dirname(save_path)
            if parent:
                os.makedirs(parent, exist_ok=True)

            # Write the image content to file
            with open(save_path, 'wb') as f:
                f.write(response.content)

            print(f"Image successfully downloaded to {save_path}")
            return save_path
        else:
            print(f"Failed to download image. Status code: {response.status_code}")
            return None

    except requests.exceptions.RequestException as e:
        print(f"Error downloading image: {str(e)}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None
183
def get_thumbnails(directory, info):
    """Download every thumbnail listed in *info* into ``<directory>/thumbnails``.

    Each thumbnail entry gains a ``'path'`` key pointing at the saved
    file.  Returns the (mutated) *info* dict.
    """
    thumbnails_dir = os.path.join(directory, 'thumbnails')
    os.makedirs(thumbnails_dir, exist_ok=True)
    thumbnails = info.get('thumbnails', [])
    for i, thumbnail_info in enumerate(thumbnails):
        thumbnail_url = thumbnail_info.get('url')
        # BUG FIX: entries without a URL crashed the .split() below.
        if not thumbnail_url:
            continue
        # Drop the query string before deriving a file name.
        thumbnail_base_url = thumbnail_url.split('?')[0]
        baseName = os.path.basename(thumbnail_base_url)
        resolution = info['thumbnails'][i].get('resolution')
        if resolution:
            baseName = f"{resolution}_{baseName}"
        img_id = info['thumbnails'][i].get('id')
        if img_id:
            baseName = f"{img_id}_{baseName}"
        thumbnail_path = os.path.join(thumbnails_dir, baseName)
        info['thumbnails'][i]['path'] = thumbnail_path
        download_image(thumbnail_url, save_path=thumbnail_path)
    return info
203
def download_audio(directory, info):
    """
    Download the highest-quality audio (e.g., hls-audio-128000-Audio) from info.json and save it to a directory.

    Args:
        directory (str): Base directory for saving files
        info (dict): Dictionary containing video metadata from info.json, including 'formats' and 'video_id'

    Returns:
        dict: Updated info with the audio file path
    """
    # Create an 'audio' subdirectory
    audio_dir = os.path.join(directory, 'audio')
    os.makedirs(audio_dir, exist_ok=True)

    # Find the highest-quality audio format (e.g., hls-audio-128000-Audio)
    audio_formats = [f for f in info.get('formats', []) if f['format_id'].startswith('hls-audio')]
    if not audio_formats:
        logger.info("No audio formats found in info.json")
        return info
    # Sort by bitrate (tbr) to get the highest quality
    audio_format = max(audio_formats, key=lambda x: x.get('tbr', 0))
    audio_url = audio_format.get('url')
    audio_ext = audio_format.get('ext', 'mp4')  # Default to MP4 if not specified

    # Extract video_id and title for the output filename
    video_id = info.get('video_id', 'unknown_video')
    title = info.get('title', 'audio').replace(' ', '_')  # Clean title for filename
    filename = f"{title}_{video_id}.{audio_ext}"
    audio_path = os.path.join(audio_dir, filename)

    # Download and process the M3U8/HLS audio stream
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.19 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Sec-Fetch-Mode': 'navigate'
        }
        # Fetch the M3U8 playlist
        response = requests.get(audio_url, headers=headers)
        response.raise_for_status()

        # Parse the M3U8 playlist
        m3u8_obj = M3U8(response.text)
        base_url = '/'.join(audio_url.split('/')[:-1]) + '/'  # Base URL for relative segment paths

        # Download every TS segment straight to temporary files
        temp_dir = os.path.join(audio_dir, 'temp_segments')
        os.makedirs(temp_dir, exist_ok=True)
        segment_paths = []
        for i, segment in enumerate(m3u8_obj.segments):
            segment_url = urljoin(base_url, segment.uri)
            segment_response = requests.get(segment_url, headers=headers)
            segment_response.raise_for_status()
            segment_path = os.path.join(temp_dir, f'segment_{i}.ts')
            with open(segment_path, 'wb') as f:
                f.write(segment_response.content)
            segment_paths.append(segment_path)

        # Concatenate the TS segments into a single audio file.
        # BUG FIX: the original called an undefined name `ffmpeg`
        # (ffmpeg-python is never imported) and then read e.stderr on a
        # generic Exception; run the ffmpeg CLI through the
        # already-imported subprocess module instead.
        concat_input = 'concat:' + '|'.join(segment_paths)
        try:
            subprocess.run(
                ['ffmpeg', '-y', '-loglevel', 'quiet',
                 '-i', concat_input, '-c', 'copy', audio_path],
                check=True,
            )
        except subprocess.CalledProcessError as e:
            logger.info(f"FFmpeg error: {e}")

        # Clean up temporary segment files
        for segment_path in segment_paths:
            os.remove(segment_path)
        os.rmdir(temp_dir)

        # Update info with the audio path
        info['audio_path'] = audio_path
        info['audio_url'] = f"https://clownworld.biz/data/downloads/videos/videos/{video_id}/audio/(unknown)"

    except requests.RequestException as e:
        logger.info(f"Failed to download audio: {str(e)}")
    except Exception as e:
        logger.info(f"Error processing audio: {str(e)}")

    return info
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.23
3
+ Version: 0.1.6.25
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -13,10 +13,19 @@ Classifier: Programming Language :: Python :: 3.11
13
13
  Requires-Python: >=3.6
14
14
  Description-Content-Type: text/markdown
15
15
  License-File: LICENSE
16
- Requires-Dist: abstract-utilities >=0.2.2.30
17
- Requires-Dist: PySimpleGUI >=4.60.5
18
- Requires-Dist: urllib3 >=2.0.4
19
- Requires-Dist: requests >=2.31.0
16
+ Requires-Dist: abstract_utilities>=0.2.2.30
17
+ Requires-Dist: PySimpleGUI>=4.60.5
18
+ Requires-Dist: urllib3>=2.0.4
19
+ Requires-Dist: requests>=2.31.0
20
+ Dynamic: author
21
+ Dynamic: author-email
22
+ Dynamic: classifier
23
+ Dynamic: description
24
+ Dynamic: description-content-type
25
+ Dynamic: home-page
26
+ Dynamic: requires-dist
27
+ Dynamic: requires-python
28
+ Dynamic: summary
20
29
 
21
30
  # Abstract WebTools
22
31
  Provides utilities for inspecting and parsing web content, including React components and URL utilities, with enhanced capabilities for managing HTTP requests and TLS configurations.
@@ -8,6 +8,8 @@ abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f
8
8
  abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
9
9
  abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
10
10
  abstract_webtools/managers/crawlManager.py,sha256=62Ej6AQC6-qXX_EWOmcJ2szNvEjmebFGugMz65HF1qI,12983
11
+ abstract_webtools/managers/crawlmgr2.py,sha256=PvHas-FSlp98osc-2so9zw-2c7amUMdwIj6tmc6Rl00,1910
12
+ abstract_webtools/managers/curlMgr.py,sha256=ghi0QsSAxjZu3HALFST5Kv_262XhHSAPGlQLvmguxPY,1657
11
13
  abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
12
14
  abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
13
15
  abstract_webtools/managers/get_test.py,sha256=nISrhUGdyvRv18wTGoifGhizBFoHeK0N3FymMASloFw,825
@@ -16,14 +18,16 @@ abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO
16
18
  abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
17
19
  abstract_webtools/managers/requestManager.py,sha256=zXD31WAYghV1OjnTQzRQnQGqZz6_J4mjHTdNLnBop_0,17343
18
20
  abstract_webtools/managers/seleniumManager.py,sha256=qSY8gH3N5YJIMwE_Alj9HNQRip_PziIo4_T9AZE_FQo,4273
19
- abstract_webtools/managers/soupManager.py,sha256=7nDB_QKneGjyTZUzchfbdHNvxxYiTyIn8AHon8ObTSY,17148
21
+ abstract_webtools/managers/soupManager.py,sha256=-_mRCWlyzfKlF64UU53WXBmCvJ98jQ4GyHh8S8Pw3xs,17198
20
22
  abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
21
23
  abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
22
24
  abstract_webtools/managers/urlManager.py,sha256=Dvf-TiSo5j_YjZS2Eq6lFfbhveneD6NA_wEE0xUXy_E,8858
23
25
  abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
24
- abstract_webtools/managers/videoDownloader.py,sha256=4sPV0D8f3_S8qNYSySfB_b-aBP_xAm4Ex7MJ1WIFhHE,10567
25
- abstract_webtools-0.1.6.23.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
26
- abstract_webtools-0.1.6.23.dist-info/METADATA,sha256=3SlDHjHws2FzMhvMfr8jN48y37Ad5R50UlnTaGiTaws,15858
27
- abstract_webtools-0.1.6.23.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
28
- abstract_webtools-0.1.6.23.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
29
- abstract_webtools-0.1.6.23.dist-info/RECORD,,
26
+ abstract_webtools/managers/videoDownloader.py,sha256=oFmRsN84_GACAhVpk21SzFJbHfJZMBLQHMUVLcKK9OI,9388
27
+ abstract_webtools/managers/videoDownloader2.py,sha256=v3H6akdhvVWGrB-r35m3cp_-aKkNWadpfCiMylOnv6w,12748
28
+ abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
29
+ abstract_webtools-0.1.6.25.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
30
+ abstract_webtools-0.1.6.25.dist-info/METADATA,sha256=_Jl7eCzHpI7lehgiyjXQlXWQhtDNqpFFqSojUhxx0JY,16051
31
+ abstract_webtools-0.1.6.25.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
32
+ abstract_webtools-0.1.6.25.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
33
+ abstract_webtools-0.1.6.25.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.5.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5