abstract-webtools 0.1.4.24__py3-none-any.whl → 0.1.4.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/abstract_webtools.py +73 -11
- {abstract_webtools-0.1.4.24.dist-info → abstract_webtools-0.1.4.25.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.4.24.dist-info → abstract_webtools-0.1.4.25.dist-info}/RECORD +6 -6
- {abstract_webtools-0.1.4.24.dist-info → abstract_webtools-0.1.4.25.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.4.24.dist-info → abstract_webtools-0.1.4.25.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.4.24.dist-info → abstract_webtools-0.1.4.25.dist-info}/top_level.txt +0 -0
@@ -85,6 +85,7 @@ from bs4 import BeautifulSoup
|
|
85
85
|
import xml.etree.ElementTree as ET
|
86
86
|
from abstract_utilities.time_utils import get_time_stamp,get_sleep,sleep_count_down
|
87
87
|
from abstract_utilities.string_clean import eatInner,eatAll
|
88
|
+
import socket
|
88
89
|
logging.basicConfig(level=logging.INFO)
|
89
90
|
class DynamicRateLimiterManager:
|
90
91
|
def __init__(self):
|
@@ -641,6 +642,50 @@ class SafeRequestSingleton:
|
|
641
642
|
## if source_code:
|
642
643
|
## print(source_code)
|
643
644
|
## ##
|
645
|
+
class MySocketClient:
|
646
|
+
def __init__(self, ip_address, port,domain_name):
|
647
|
+
self.sock
|
648
|
+
self.ip_address= ip_address
|
649
|
+
self.port = port
|
650
|
+
self.sock.connect((host, port))
|
651
|
+
self.domain_name =
|
652
|
+
def receive_data(self):
|
653
|
+
chunks = []
|
654
|
+
while True:
|
655
|
+
chunk = self.sock.recv(4096)
|
656
|
+
if chunk:
|
657
|
+
chunks.append(chunk)
|
658
|
+
else:
|
659
|
+
break
|
660
|
+
return b''.join(chunks).decode('utf-8')
|
661
|
+
def _parse_socket_response_as_json(self, data, *args, **kwargs):
|
662
|
+
return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
|
663
|
+
def process_data(self):
|
664
|
+
data = self.receive_data()
|
665
|
+
return self._parse_socket_response_as_json(data)
|
666
|
+
def _parse_json(self,json_string):
|
667
|
+
return json.loads(json_string)
|
668
|
+
def get_ip(self,domain=None):
|
669
|
+
try:
|
670
|
+
return self.sock.gethostbyname(domain_name if domain_name != None else self.domain_name)
|
671
|
+
except self.sock.gaierror:
|
672
|
+
return None
|
673
|
+
def grt_host_name(self,ip_address=None):
|
674
|
+
return self.sock.gethostbyaddr(ip_address if ip_address != None else self.ip_address)
|
675
|
+
def toggle_sock(self):
|
676
|
+
if self.sock != None:
|
677
|
+
self.sock.close()
|
678
|
+
else:
|
679
|
+
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
680
|
+
class MySocketClient():
|
681
|
+
_instance = None
|
682
|
+
@staticmethod
|
683
|
+
def get_instance(ip_address='local_host',port=22,domain_name="example.com"):
|
684
|
+
if MySocketClientSingleton._instance is None:
|
685
|
+
MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain_name=domain_name)
|
686
|
+
elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or URLManagerSingleton._instance.domain_name != domain_name:
|
687
|
+
MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain_name=domain_name)
|
688
|
+
return MySocketClient
|
644
689
|
class URLManager:
|
645
690
|
def __init__(self,url=None,session=requests):
|
646
691
|
self.url = url
|
@@ -648,8 +693,15 @@ class URLManager:
|
|
648
693
|
self.striped_url = None if url == None else self.strip_web()
|
649
694
|
self.clean_urls = None if url == None else self.clean_url(url=self.url)
|
650
695
|
self.correct_url = None if url == None else self.get_correct_url()
|
651
|
-
self.
|
696
|
+
if self.correct_url != None:
|
697
|
+
self.url = self.correct_url
|
698
|
+
self.protocol=None
|
699
|
+
self.domain_name= None if url == None else self.get_domain_name(self.correct_url)
|
700
|
+
self.path=None
|
701
|
+
self.query=None
|
702
|
+
self.strip_web()
|
652
703
|
self.all_urls=[]
|
704
|
+
|
653
705
|
def strip_web(self) -> str:
|
654
706
|
"""
|
655
707
|
Strip the 'http://' or 'https://' prefix from a URL, if present.
|
@@ -666,6 +718,13 @@ class URLManager:
|
|
666
718
|
elif self.url.startswith("https://"):
|
667
719
|
url = self.url.replace("https://", '', 1)
|
668
720
|
return url
|
721
|
+
def url_to_pieces(self):
|
722
|
+
match = re.match(r'^(https?):\/\/([^\/]+)(\/[^?]+)?(\?.+)?', self.correct_url)
|
723
|
+
if match:
|
724
|
+
self.protocol = match.group(1)
|
725
|
+
self.domain = match.group(2)
|
726
|
+
self.path = match.group(3) if match.group(3) else "" # Handle None
|
727
|
+
self.query = match.group(4) if match.group(4) else "" # Handle None
|
669
728
|
@staticmethod
|
670
729
|
def clean_url(url: str) -> list:
|
671
730
|
"""
|
@@ -780,17 +839,20 @@ class URLManagerSingleton:
|
|
780
839
|
return URLManagerSingleton._instance
|
781
840
|
|
782
841
|
class VideoDownloader:
|
783
|
-
|
784
842
|
def __init__(self, url,title=None,download_directory=os.getcwd(),user_agent=None,video_extention='mp4',download=True,get_info=False):
|
785
843
|
self.url = url
|
844
|
+
self.download = download
|
845
|
+
self.get_info = get_info
|
846
|
+
self.user_agent=user_agent
|
847
|
+
self.download_directory=download_directory
|
786
848
|
self.video_extention=video_extention
|
787
849
|
self.header = UserAgentManagerSingleton().get_instance(user_agent=user_agent).user_agent_header
|
788
850
|
self.base_name = os.path.basename(self.url)
|
789
851
|
self.file_name,self.ext = os.path.splitext(self.base_name)
|
790
|
-
self.download_directory=download_directory
|
791
852
|
self.title = url.split('/')[3] if title == None else title
|
792
|
-
self.video_urls = []
|
853
|
+
self.video_urls = [self.url]
|
793
854
|
self.fetch_video_urls()
|
855
|
+
self.info={}
|
794
856
|
self.download_videos()
|
795
857
|
def fetch_video_urls(self):
|
796
858
|
driver = webdriver.Chrome()
|
@@ -803,12 +865,12 @@ class VideoDownloader:
|
|
803
865
|
for video_url in self.video_urls:
|
804
866
|
ydl_opts = {}
|
805
867
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
806
|
-
info = ydl.extract_info(
|
807
|
-
if get_info == True:
|
808
|
-
|
809
|
-
self.base_name = os.path.basename(info['url'])
|
868
|
+
self.info = ydl.extract_info(url=video_url,download=self.download)
|
869
|
+
if self.get_info == True:
|
870
|
+
return self.info
|
871
|
+
self.base_name = os.path.basename(self.info['url'])
|
810
872
|
self.file_name,self.ext = os.path.splitext(self.base_name)
|
811
|
-
video_content =SafeRequestSingleton().get_instance(url=info['url']).response
|
873
|
+
video_content =SafeRequestSingleton().get_instance(url=self.info['url']).response
|
812
874
|
print("Start downloading")
|
813
875
|
content_length = int(video_content.headers['content-length'])
|
814
876
|
print(f'Size: {content_length / 1024 / 1024:.2f}MB')
|
@@ -824,9 +886,9 @@ class VideoDownloaderSingleton():
|
|
824
886
|
@staticmethod
|
825
887
|
def get_instance(url,title=None,video_extention='mp4',download_directory=os.getcwd(),user_agent=None,download=True,get_info=False):
|
826
888
|
if VideoDownloaderSingleton._instance is None:
|
827
|
-
VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info)
|
889
|
+
VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info,user_agent=user_agent)
|
828
890
|
elif VideoDownloaderSingleton._instance.title != title or video_extention != VideoDownloaderSingleton._instance.video_extention or url != VideoDownloaderSingleton._instance.url or download_directory != VideoDownloaderSingleton._instance.download_directory or user_agent != VideoDownloaderSingleton._instance.user_agent:
|
829
|
-
VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info)
|
891
|
+
VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info,user_agent=user_agent)
|
830
892
|
return VideoDownloaderSingleton._instance
|
831
893
|
class SoupManager:
|
832
894
|
def __init__(self, url=None, source_code=None, parse_type="html.parser"):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: abstract-webtools
|
3
|
-
Version: 0.1.4.24
|
3
|
+
Version: 0.1.4.25
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -1,13 +1,13 @@
|
|
1
1
|
abstract_webtools/__init__.py,sha256=2SWEfdPDHqqjUYsOQYlaOHF644ZYcO160nWKiAjga4w,34
|
2
2
|
abstract_webtools/abstract_crawler.py,sha256=e8jVVv1_EB8poqlrdQaJ19z9Z0t8un5uc-DKnj1Ud5s,8002
|
3
|
-
abstract_webtools/abstract_webtools.py,sha256=
|
3
|
+
abstract_webtools/abstract_webtools.py,sha256=7AbsLqzNq3molHO-lAefyBIanhIIUhYLTgUGFyIpUlo,58905
|
4
4
|
abstract_webtools/abstract_webtools2.py,sha256=dlhhgmUTaN_NgkT6GcJMVBLuXjmW38gAOeCrKxYqytk,30685
|
5
5
|
abstract_webtools/dfgdsf.py,sha256=T1pj-ne_qVfaAdu1MIdtW3q3UZqNP78Kt0OMhz4Musk,1355
|
6
6
|
abstract_webtools/grab_source_gui.py,sha256=Wz-FKLOuPQlBYz3kojXihpMbS4rqv4NWGr9ezF-Jt2g,16356
|
7
7
|
abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
|
8
8
|
abstract_webtools/sou.py,sha256=8HjmcpXJFi_kC2O-SVGebUIFY5I5B9bPP9L8BAiWhfk,4526
|
9
|
-
abstract_webtools-0.1.4.24.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
|
10
|
-
abstract_webtools-0.1.4.
|
11
|
-
abstract_webtools-0.1.4.24.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
12
|
-
abstract_webtools-0.1.4.24.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
13
|
-
abstract_webtools-0.1.4.24.dist-info/RECORD,,
|
9
|
+
abstract_webtools-0.1.4.25.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
|
10
|
+
abstract_webtools-0.1.4.25.dist-info/METADATA,sha256=6FJfWTP_KsJ5l_tx5ipCWXQrrzJG4lAmmn_bM9OwqQM,8963
|
11
|
+
abstract_webtools-0.1.4.25.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
12
|
+
abstract_webtools-0.1.4.25.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
13
|
+
abstract_webtools-0.1.4.25.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|