abstract-webtools 0.1.6.90__py3-none-any.whl → 0.1.6.92__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/domain_identifier.py +82 -0
- abstract_webtools/extention_list.py +11 -0
- abstract_webtools/find_dirs.py +81 -0
- {abstract_webtools-0.1.6.90.dist-info → abstract_webtools-0.1.6.92.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.90.dist-info → abstract_webtools-0.1.6.92.dist-info}/RECORD +7 -4
- {abstract_webtools-0.1.6.90.dist-info → abstract_webtools-0.1.6.92.dist-info}/WHEEL +1 -1
- {abstract_webtools-0.1.6.90.dist-info → abstract_webtools-0.1.6.92.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
from extention_list import get_extention,popular_extentions
|
2
|
+
from urllib.parse import urlparse, urljoin
|
3
|
+
from abstract_utilities import *
|
4
|
+
def try_request(url, timeout=None):
    """Fetch *url* with ``requests.get`` and return the Response, or None on failure.

    Timeout semantics:
        * ``None``  -> default of 5 seconds
        * ``0``     -> no timeout at all (wait forever)
        * anything else is passed through to ``requests.get``.

    Any ``requests.exceptions.RequestException`` (connection error, timeout,
    invalid URL, ...) is reported to stdout and swallowed, returning None so
    callers can probe many candidate URLs without try/except of their own.

    NOTE(review): ``requests`` is not imported by name in this module —
    presumably provided by the ``abstract_utilities`` star import; confirm.
    """
    if timeout is None:
        timeout = 5
    elif timeout == 0:
        timeout = None  # requests interprets timeout=None as "no timeout"
    try:
        return requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None
|
15
|
+
def is_result_200(result):
    """Return True iff *result* looks like a response with HTTP status 200.

    Accepts the output of ``try_request``: either a Response object or None.
    The original used a bare ``except`` to absorb the AttributeError raised by
    ``None.status_code``; this checks explicitly instead of masking all errors.
    """
    return result is not None and getattr(result, "status_code", None) == 200
|
22
|
+
def url_to_pieces(url):
    """
    Split a URL into protocol, domain, path, and query components.

    Uses urlparse for robustness. Returns a dict with the keys
    'parsed', 'scheme', 'netloc', 'subdomain', 'domain', 'extention',
    'path', 'params', 'query', 'fragment'. On any failure the
    partially-filled dict is returned; 'domain' defaults to the raw
    input so callers always get something usable.
    """
    parsed_url = {'parsed':'', 'scheme':'', 'netloc':'', 'subdomain':'', 'domain':url,'extention':'', 'path':'', 'params':'', 'query':'', 'fragment':''}
    try:
        parsed = urlparse(url)
        parsed_url['parsed']= parsed
        parsed_url['scheme'] = parsed.scheme if parsed.scheme else ""
        parsed_url['netloc'] = parsed.netloc if parsed.netloc else ""
        parsed_url['path'] = parsed.path or ""
        parsed_url['params'] = parsed.params or ""
        parsed_url['query'] = parsed.query or ""
        parsed_url['fragment'] = parsed.fragment or ""
        # Scheme-less input such as 'example.com/page' is parsed by urlparse
        # entirely into 'path'; recover the host from the first path segment.
        if parsed_url['netloc'] == '' and parsed_url['path']:
            parsed_url['netloc'] = parsed_url['path']
            if '/' in parsed_url['path']:
                parsed_url['netloc'] = parsed_url['path'].split('/')[0]
                parsed_url['path'] = '/'+'/'.join(parsed_url['path'].split('/')[1:])
            else:
                parsed_url['path']=''
        if parsed_url['netloc']:
            # Peel a leading 'www.' off the host into 'subdomain'.
            if parsed_url['netloc'].startswith('www.'):
                parsed_url['subdomain']= 'www.'
                parsed_url['domain'] = parsed_url['netloc'][len('www.'):]
            else:
                parsed_url['domain'] = parsed_url['netloc']
        # get_extention overwrites 'domain' (name only) and 'extention' (TLD).
        parsed_url.update(get_extention(parsed_url['domain']))
    except Exception as e:
        # NOTE(review): message is misleading — no network request happens
        # here; this only signals a parsing failure.
        print(f'The URL {url} was not reachable: {e}')
    return parsed_url
|
54
|
+
def correct_domains(url):
    """Build the list of candidate URL variants to probe for *url*.

    Combines each scheme ('https', 'http', and scheme-less) with each
    subdomain prefix ('' and 'www.') and each candidate extension. When the
    input domain carries no recognizable extension, every popular extension
    is tried instead. The scheme/subdomain pair the input already uses is
    skipped so the original URL (always first in the result) is not repeated.
    """
    urls = [url]
    protocols = {'https': ['', 'www.'], 'http': ['', 'www.'], '': ['', 'www.']}
    parsed_url = url_to_pieces(url)
    scheme = parsed_url['scheme']
    subdomain = parsed_url['subdomain']
    # No extension detected -> fan out over all popular extensions.
    extentions = make_list(parsed_url['extention'] or popular_extentions)
    # Original crashed with TypeError here for unknown schemes (e.g. 'ftp')
    # because dict.get returned None; fall back to the full prefix list.
    subdomains = protocols.get(scheme, ['', 'www.'])
    if subdomain in subdomains:
        subdomains.remove(subdomain)
        protocols[scheme] = subdomains
    for extention in extentions:
        link = f"{parsed_url['domain']}{extention}{parsed_url['path']}{parsed_url['params']}"
        for key, values in protocols.items():
            for value in values:
                new_link = f"{value}{link}"
                if key:
                    new_link = f"{key}://{new_link}"
                urls.append(new_link)
    return urls
|
72
|
+
def tryAllDomains(url):
    """Probe every candidate variant of *url* and return the first that
    answers HTTP 200, or None when none of them do.

    Candidates come from ``correct_domains`` (the raw input is tried first).
    """
    for candidate in correct_domains(url):
        if is_result_200(try_request(candidate)):
            return candidate
    return None  # explicit: no variant responded with 200
|
78
|
+
def tryDomain(url):
    """Fetch *url* through a requestManager and return its ``source_code``.

    NOTE(review): ``requestManager`` is not defined or imported by name in
    this module — presumably exported by the ``abstract_utilities`` star
    import or missing an import; confirm before relying on this function.
    """
    request_mgr = requestManager(url)
    return request_mgr.source_code
|
81
|
+
# Demo entry point. Guarded so that importing this module no longer fires
# network requests and a blocking input() call as an import-time side effect.
if __name__ == '__main__':
    url = 'thedailydialectics'
    input(tryAllDomains(url))
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Known domain extensions (TLDs and common second-level combos).
# Fixed data bugs from the original list: '.luxury ' had a trailing space
# (so str.endswith could never match it), '.institue' -> '.institute',
# '.managment' -> '.management'.
# NOTE(review): '.buz' (next to '.buzz') looks suspicious but is kept as-is.
extentions = ['.ac', '.academy', '.accountant', '.actor', '.agency', '.ai', '.airforce', '.am', '.apartments', '.archi', '.army', '.art', '.asia', '.associates', '.at', '.attorney', '.auction', '.audio', '.baby', '.band', '.bar', '.bargains', '.be', '.beer', '.berlin', '.best', '.bet', '.bid', '.bike', '.bingo', '.bio', '.biz', '.black', '.blackfriday', '.blog', '.blue', '.boston', '.boutique', '.br.com', '.build', '.builders', '.business', '.buzz', '.buz', '.ca', '.cab', '.cafe', '.camera', '.camp', '.capital', '.cards', '.care', '.careers', '.casa', '.cash', '.casino', '.catering', '.cc', '.center', '.ceo', '.ch', '.charity', '.chat', '.cheap', '.christmas', '.church', '.city', '.claims', '.cleaning', '.click', '.clinic', '.clothing', '.cloud', '.club', '.cn.com', '.co', '.co.com', '.co.in', '.co.nz', '.co.uk', '.coach', '.codes', '.coffee', '.college', '.com', '.com.co', '.com.mx', '.com.tw', '.community', '.company', '.computer', '.condos', '.construction', '.consulting', '.contact', '.contractors', '.cooking', '.cool', '.coupons', '.courses', '.credit', '.creditcard', '.cricket', '.cruises', '.cymru', '.cz', '.dance', '.date', '.dating', '.de', '.de.com', '.deals', '.degree', '.delivery', '.democrat', '.dental', '.dentist', '.desi', '.design', '.diamonds', '.diet', '.digital', '.direct', '.directory', '.discount', '.doctor', '.dog', '.domains', '.download', '.earth', '.eco', '.education', '.email', '.energy', '.engineer', '.engineering', '.enterprises', '.equipment', '.estate', '.eu', '.eu.com', '.events', '.exchange', '.expert', '.exposed', '.express', '.fail', '.faith', '.family', '.fans', '.farm', '.fashion', '.film', '.finance', '.financial', '.fish', '.fishing', '.fit', '.fitness', '.flights', '.florist', '.flowers', '.fm', '.football', '.forsale', '.foundation', '.fun', '.fund', '.furniture', '.futbol', '.fyi', '.gallery', '.games', '.garden', '.gay', '.gift', '.gifts', '.gives', '.glass', '.global', '.gmbh', '.gold', '.golf', '.graphics', '.gratis', '.green', '.gripe', '.group', '.gs', '.guide', '.guitars', '.guru', '.haus', '.healthcare', '.help', '.hiphop', '.hn', '.hockey', '.holdings', '.holiday', '.horse', '.host', '.hosting', '.house', '.how', '.immo', '.in', '.industries', '.info', '.ink', '.institute', '.insure', '.international', '.investments', '.io', '.irish', '.it', '.jetzt', '.jewelry', '.jp', '.jpn.com', '.juegos', '.kaufen', '.kim', '.kitchen', '.kiwi', '.la', '.land', '.lawyer', '.lease', '.legal', '.lgbt', '.li', '.life', '.lighting', '.limited', '.limo', '.link', '.live', '.llc', '.loan', '.loans', '.lol', '.london', '.love', '.ltd', '.luxury', '.maison', '.management', '.market', '.marketing', '.mba', '.me', '.me.uk', '.media', '.memorial', '.men', '.menu', '.miami', '.mobi', '.moda', '.moe', '.money', '.monster', '.mortgage', '.mx', '.nagoya', '.navy', '.net', '.net.co', '.network', '.news', '.ngo', '.ninja', '.nl', '.nyc', '.okinawa', '.one', '.ong', '.online', '.org', '.org.in', '.org.uk', '.partners', '.parts', '.party', '.pet', '.ph', '.photo', '.photography', '.photos', '.physio', '.pics', '.pictures', '.pink', '.pizza', '.pl', '.place', '.plumbing', '.plus', '.poker', '.press', '.pro', '.productions', '.promo', '.properties', '.property', '.pub', '.qpon', '.quebec', '.racing', '.realty', '.recipes', '.red', '.rehab', '.reisen', '.rent', '.rentals', '.repair', '.report', '.republican', '.rest', '.restaurant', '.review', '.reviews', '.rip', '.rocks', '.rodeo', '.run', '.sa.com', '.sale', '.sarl', '.sc', '.school', '.schule', '.science', '.se.net', '.services', '.sexy', '.sg', '.shiksha', '.shoes', '.shop', '.shopping', '.show', '.singles', '.site', '.ski', '.soccer', '.social', '.software', '.solar', '.solutions', '.soy', '.space', '.srl', '.store', '.stream', '.studio', '.study', '.style', '.supplies', '.supply', '.support', '.surf', '.surgery', '.systems', '.tattoo', '.tax', '.taxi', '.team', '.tech', '.technology', '.tel', '.tennis', '.theater', '.tienda', '.tips', '.today', '.tokyo', '.tools', '.tours', '.town', '.toys', '.trade', '.training', '.tv', '.tw', '.uk', '.uk.com', '.university', '.uno', '.us', '.us.com', '.vacations', '.vc', '.vegas', '.ventures', '.vet', '.viajes', '.video', '.villas', '.vip', '.vision', '.vodka', '.vote', '.voting', '.voyage', '.watch', '.webcam', '.website', '.wedding', '.wiki', '.win', '.wine', '.work', '.works', '.world', '.ws', '.wtf', '.xyz', '.yoga', '.za.com', '.zone']
# The handful of extensions tried first when guessing a bare domain name.
popular_extentions = ['.com', '.net', '.org', '.co', '.us']
# Reorder so the popular extensions lead and are not duplicated.
extentions = popular_extentions + [extention for extention in extentions if extention not in popular_extentions]
|
4
|
+
def get_extention(domain):
    """Split a known extension off the end of *domain*.

    Returns ``{"domain": <name without extension>, "extention": <ext or ''>}``.
    Prefers the longest matching extension so compound TLDs win: the original
    scanned in popularity order, which mapped 'example.com.co' to '.co'
    instead of '.com.co'. No match leaves *domain* untouched with an empty
    'extention'.
    """
    matches = [ext for ext in extentions if domain.endswith(ext)]
    if not matches:
        return {"domain": domain, "extention": ""}
    best = max(matches, key=len)  # longest suffix is the real extension
    return {"domain": domain[:-len(best)], "extention": best}
|
@@ -0,0 +1,81 @@
|
|
1
|
+
import os
|
2
|
+
def get_dir_size(path):
    """Return the total size, in bytes, of every file under *path*.

    Entries that cannot be stat'ed are silently skipped; a failure of the
    top-level walk itself is reported to stdout and the running total
    (normally 0) is returned.
    """
    total_size = 0
    try:
        for dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                target = os.path.join(dirpath, filename)
                try:
                    total_size += os.path.getsize(target)
                except (OSError, PermissionError):
                    continue  # unreadable file: skip, keep summing the rest
    except (OSError, PermissionError) as e:
        print(f"Error accessing {path}: {e}")
    return total_size
|
17
|
+
|
18
|
+
def compare_dirs(dir1_path, dir2_path):
    """Print the byte sizes of two directories and report which is larger."""
    dir1_size, dir2_size = get_dir_size(dir1_path), get_dir_size(dir2_path)

    print(f"Size of {dir1_path}: {dir1_size} bytes")
    print(f"Size of {dir2_path}: {dir2_size} bytes")

    # Equality first, then a single ordered comparison decides the rest.
    if dir1_size == dir2_size:
        print("Both directories are the same size")
    elif dir1_size > dir2_size:
        print(f"{dir1_path} is larger than {dir2_path}")
    else:
        print(f"{dir2_path} is larger than {dir1_path}")
|
32
|
+
# Hard-coded inventory of backup directories on the 24T drive
# (one absolute path per line; split into a list below).
twentyfourT = """/mnt/24T/evo_970
/mnt/24T/main_drive
/mnt/24T/nvmeHeatSync-new
/mnt/24T/PNY_1T
/mnt/24T/serverBack
/mnt/24T/solcatcher_backup
/mnt/24T/transferDrive
/mnt/24T/wd_black
/mnt/24T/wd_black_980_home
/mnt/24T/wdBlack_970_evo
/mnt/24T/wd_main_980
/mnt/24T/wd_nvm
/mnt/24T/.Trash-1000
/mnt/24T/testfile.txt"""


# Mirror copies under /mnt/16T/24T/24T/ to compare against.
# NOTE(review): 'testfile.txtt' (double 't') looks like a typo — if so,
# the basename lookup below will never match 'testfile.txt'; confirm
# against the actual filesystem.
sixteenT = """/mnt/16T/24T/24T/evo980-new
/mnt/16T/24T/24T/500Gb_pny
/mnt/16T/24T/24T/wdBlack_970_evo
/mnt/16T/24T/24T/wd_nvm
/mnt/16T/24T/24T/wd_main_980
/mnt/16T/24T/24T/nvmeHeatSync-new
/mnt/16T/24T/24T/PNY_1T
/mnt/16T/24T/24T/serverBack
/mnt/16T/24T/24T/transferDrive
/mnt/16T/24T/24T/.Trash-1000
/mnt/16T/24T/24T/solcatcher_backup
/mnt/16T/24T/24T/wd_black_980_home
/mnt/16T/24T/24T/abstract_images-0.0.0.5-py3-none-any
/mnt/16T/24T/24T/evo_970
/mnt/16T/24T/24T/main_drive
/mnt/16T/24T/24T/wd_black
/mnt/16T/24T/24T/testfile.txtt"""
# Convert both blobs into lists of individual paths.
sixteenT = sixteenT.split('\n')
twentyfourT = twentyfourT.split('\n')
|
67
|
+
def is_dirname_in_sixteenT(dirname):
    """Return the first path in the module-level ``sixteenT`` list whose
    basename equals *dirname*, or None when there is no match."""
    for candidate in sixteenT:
        if os.path.basename(candidate) == dirname:
            return candidate
    return None
|
71
|
+
# Interactively compare each 24T backup directory against its same-named
# copy on the 16T drive.
# Fixed: the original had stray closing parens after both get_dir_size(...)
# calls (a SyntaxError that made the module unimportable), and passed None
# into get_dir_size when no matching 16T directory existed.
for directory in twentyfourT:
    dirname = os.path.basename(directory)

    size1 = get_dir_size(directory)
    sixteenT_dir = is_dirname_in_sixteenT(dirname)
    if sixteenT_dir is None:
        # No same-named directory on the 16T side; report and move on
        # instead of crashing inside get_dir_size(None).
        print(f"{directory}: no matching directory on 16T")
        continue
    size2 = get_dir_size(sixteenT_dir)
    print(directory)
    print(f"size == {size1}")
    print(sixteenT_dir)
    input(f"size == {size2}")
    input(compare_dirs(directory, sixteenT_dir))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.6.
|
3
|
+
Version: 0.1.6.92
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -3,6 +3,9 @@ abstract_webtools/abstract_userpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-
|
|
3
3
|
abstract_webtools/abstract_usurpit.py,sha256=7PDUb5LNETjvU1rhfJaToKLIKmSXRkcJAmM4wOX7PsQ,7170
|
4
4
|
abstract_webtools/abstract_webtools.py,sha256=3NzGmJlZvrdVtEcUi2K5iUgWr1822IBPhIN9us2e2t0,3859
|
5
5
|
abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
|
6
|
+
abstract_webtools/domain_identifier.py,sha256=AvWlGD7C19rySa_J_Brxi3kz43LMWvGsshuuZNg7MvI,3320
|
7
|
+
abstract_webtools/extention_list.py,sha256=gRSO4nMbuuXDYzd-ss4s64sS80ZHmUoazMCpgoKG5vE,4884
|
8
|
+
abstract_webtools/find_dirs.py,sha256=BlE4ruzMABqmv03NcutZ1j5N3pCc-Q4uNEAMpNolZCQ,2609
|
6
9
|
abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
|
7
10
|
abstract_webtools/soup_gui.py,sha256=n95YAps1R6DpMwR4UbthSqQby0C5WHUa9tsW-f2qpLg,5184
|
8
11
|
abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE70B8,10441
|
@@ -38,7 +41,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
|
|
38
41
|
abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
|
39
42
|
abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
|
40
43
|
abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
|
41
|
-
abstract_webtools-0.1.6.
|
42
|
-
abstract_webtools-0.1.6.
|
43
|
-
abstract_webtools-0.1.6.
|
44
|
-
abstract_webtools-0.1.6.
|
44
|
+
abstract_webtools-0.1.6.92.dist-info/METADATA,sha256=4aTdbNUjectbYqf27r46QeI8_9Fr_azCOJQTIxUdMoM,16029
|
45
|
+
abstract_webtools-0.1.6.92.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
|
46
|
+
abstract_webtools-0.1.6.92.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
|
47
|
+
abstract_webtools-0.1.6.92.dist-info/RECORD,,
|
File without changes
|