abstract-webtools 0.1.6.90__py3-none-any.whl → 0.1.6.92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
+ import requests  # required by try_request below
+ from extention_list import get_extention, popular_extentions
+ from urllib.parse import urlparse, urljoin
+ from abstract_utilities import *
+ def try_request(url, timeout=None):
+     if timeout is None:
+         timeout = 5  # default timeout in seconds
+     elif timeout == 0:
+         timeout = None  # 0 means "no timeout"
+     try:
+         result = requests.get(url, timeout=timeout)
+     except requests.exceptions.RequestException as e:
+         print(f"Request failed for {url}: {e}")
+         result = None
+     return result
+ def is_result_200(result):
+     try:
+         if result.status_code == 200:
+             return True
+     except AttributeError:  # result is None when the request failed
+         return False
+     return False
+ def url_to_pieces(url):
+     """
+     Split a URL into protocol, domain, path, and query components.
+     Uses urlparse for robustness.
+     """
+     parsed_url = {'parsed': '', 'scheme': '', 'netloc': '', 'subdomain': '',
+                   'domain': url, 'extention': '', 'path': '', 'params': '',
+                   'query': '', 'fragment': ''}
+     try:
+         parsed = urlparse(url)
+         parsed_url['parsed'] = parsed
+         parsed_url['scheme'] = parsed.scheme or ""
+         parsed_url['netloc'] = parsed.netloc or ""
+         parsed_url['path'] = parsed.path or ""
+         parsed_url['params'] = parsed.params or ""
+         parsed_url['query'] = parsed.query or ""
+         parsed_url['fragment'] = parsed.fragment or ""
+         # Bare inputs like 'example.com/page' parse with an empty netloc;
+         # recover the host from the leading path segment.
+         if parsed_url['netloc'] == '' and parsed_url['path']:
+             parsed_url['netloc'] = parsed_url['path']
+             if '/' in parsed_url['path']:
+                 parsed_url['netloc'] = parsed_url['path'].split('/')[0]
+                 parsed_url['path'] = '/' + '/'.join(parsed_url['path'].split('/')[1:])
+             else:
+                 parsed_url['path'] = ''
+         if parsed_url['netloc']:
+             if parsed_url['netloc'].startswith('www.'):
+                 parsed_url['subdomain'] = 'www.'
+                 parsed_url['domain'] = parsed_url['netloc'][len('www.'):]
+             else:
+                 parsed_url['domain'] = parsed_url['netloc']
+             parsed_url.update(get_extention(parsed_url['domain']))
+     except Exception as e:
+         print(f'The URL {url} could not be parsed: {e}')
+     return parsed_url
+ def correct_domains(url):
+     urls = [url]
+     protocols = {'https': ['', 'www.'], 'http': ['', 'www.'], '': ['', 'www.']}
+     parsed_url = url_to_pieces(url)
+     scheme, subdomain = parsed_url['scheme'], parsed_url['subdomain']
+     extentions = make_list(parsed_url['extention'] or popular_extentions)
+     # Drop the subdomain the input already has so we don't re-emit it.
+     subdomains = protocols.get(scheme)
+     if subdomain in subdomains:
+         subdomains.remove(subdomain)
+         protocols[scheme] = subdomains
+     for extention in extentions:
+         link = f"{parsed_url['domain']}{extention}{parsed_url['path']}{parsed_url['params']}"
+         for key, values in protocols.items():
+             for value in values:
+                 new_link = f"{value}{link}"
+                 if key:
+                     new_link = f"{key}://{new_link}"
+                 urls.append(new_link)
+     return urls
+ def tryAllDomains(url):
+     urls = correct_domains(url)
+     for url in urls:
+         result = try_request(url)
+         if is_result_200(result):
+             return url
+ def tryDomain(url):
+     # requestManager is expected to be provided by abstract_webtools'
+     # manager modules; it is not imported in this file.
+     request_mgr = requestManager(url)
+     return request_mgr.source_code
+ url = 'thedailydialectics'
+ input(tryAllDomains(url))
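
A usage sketch (editorial note, not part of the published diff): correct_domains expands a bare name by crossing schemes, an optional 'www.' prefix, and the popular extensions, and tryAllDomains probes the candidates in order until one answers HTTP 200. The module name follows the RECORD entry domain_identifier.py; the output shown in comments is what the code above implies, not captured output.

    from domain_identifier import correct_domains, tryAllDomains

    candidates = correct_domains('thedailydialectics')
    # candidates[0] is the raw input, then:
    #   'https://thedailydialectics.com'
    #   'https://www.thedailydialectics.com'
    #   'http://thedailydialectics.com'
    #   'http://www.thedailydialectics.com'
    #   ... and so on through .net, .org, .co, .us
    print(tryAllDomains('thedailydialectics'))  # first candidate returning 200, else None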
@@ -0,0 +1,11 @@
+ extentions = ['.ac', '.academy', '.accountant', '.actor', '.agency', '.ai', '.airforce', '.am', '.apartments', '.archi', '.army', '.art', '.asia', '.associates', '.at', '.attorney', '.auction', '.audio', '.baby', '.band', '.bar', '.bargains', '.be', '.beer', '.berlin', '.best', '.bet', '.bid', '.bike', '.bingo', '.bio', '.biz', '.black', '.blackfriday', '.blog', '.blue', '.boston', '.boutique', '.br.com', '.build', '.builders', '.business', '.buzz', '.buz', '.ca', '.cab', '.cafe', '.camera', '.camp', '.capital', '.cards', '.care', '.careers', '.casa', '.cash', '.casino', '.catering', '.cc', '.center', '.ceo', '.ch', '.charity', '.chat', '.cheap', '.christmas', '.church', '.city', '.claims', '.cleaning', '.click', '.clinic', '.clothing', '.cloud', '.club', '.cn.com', '.co', '.co.com', '.co.in', '.co.nz', '.co.uk', '.coach', '.codes', '.coffee', '.college', '.com', '.com.co', '.com.mx', '.com.tw', '.community', '.company', '.computer', '.condos', '.construction', '.consulting', '.contact', '.contractors', '.cooking', '.cool', '.coupons', '.courses', '.credit', '.creditcard', '.cricket', '.cruises', '.cymru', '.cz', '.dance', '.date', '.dating', '.de', '.de.com', '.deals', '.degree', '.delivery', '.democrat', '.dental', '.dentist', '.desi', '.design', '.diamonds', '.diet', '.digital', '.direct', '.directory', '.discount', '.doctor', '.dog', '.domains', '.download', '.earth', '.eco', '.education', '.email', '.energy', '.engineer', '.engineering', '.enterprises', '.equipment', '.estate', '.eu', '.eu.com', '.events', '.exchange', '.expert', '.exposed', '.express', '.fail', '.faith', '.family', '.fans', '.farm', '.fashion', '.film', '.finance', '.financial', '.fish', '.fishing', '.fit', '.fitness', '.flights', '.florist', '.flowers', '.fm', '.football', '.forsale', '.foundation', '.fun', '.fund', '.furniture', '.futbol', '.fyi', '.gallery', '.games', '.garden', '.gay', '.gift', '.gifts', '.gives', '.glass', '.global', '.gmbh', '.gold', '.golf', '.graphics', '.gratis', '.green', '.gripe', '.group', '.gs', '.guide', '.guitars', '.guru', '.haus', '.healthcare', '.help', '.hiphop', '.hn', '.hockey', '.holdings', '.holiday', '.horse', '.host', '.hosting', '.house', '.how', '.immo', '.in', '.industries', '.info', '.ink', '.institute', '.insure', '.international', '.investments', '.io', '.irish', '.it', '.jetzt', '.jewelry', '.jp', '.jpn.com', '.juegos', '.kaufen', '.kim', '.kitchen', '.kiwi', '.la', '.land', '.lawyer', '.lease', '.legal', '.lgbt', '.li', '.life', '.lighting', '.limited', '.limo', '.link', '.live', '.llc', '.loan', '.loans', '.lol', '.london', '.love', '.ltd', '.luxury', '.maison', '.management', '.market', '.marketing', '.mba', '.me', '.me.uk', '.media', '.memorial', '.men', '.menu', '.miami', '.mobi', '.moda', '.moe', '.money', '.monster', '.mortgage', '.mx', '.nagoya', '.navy', '.net', '.net.co', '.network', '.news', '.ngo', '.ninja', '.nl', '.nyc', '.okinawa', '.one', '.ong', '.online', '.org', '.org.in', '.org.uk', '.partners', '.parts', '.party', '.pet', '.ph', '.photo', '.photography', '.photos', '.physio', '.pics', '.pictures', '.pink', '.pizza', '.pl', '.place', '.plumbing', '.plus', '.poker', '.press', '.pro', '.productions', '.promo', '.properties', '.property', '.pub', '.qpon', '.quebec', '.racing', '.realty', '.recipes', '.red', '.rehab', '.reisen', '.rent', '.rentals', '.repair', '.report', '.republican', '.rest', '.restaurant', '.review', '.reviews', '.rip', '.rocks', '.rodeo', '.run', '.sa.com', '.sale', '.sarl', '.sc', '.school', '.schule', '.science', '.se.net', '.services', '.sexy', '.sg', '.shiksha', '.shoes', '.shop', '.shopping', '.show', '.singles', '.site', '.ski', '.soccer', '.social', '.software', '.solar', '.solutions', '.soy', '.space', '.srl', '.store', '.stream', '.studio', '.study', '.style', '.supplies', '.supply', '.support', '.surf', '.surgery', '.systems', '.tattoo', '.tax', '.taxi', '.team', '.tech', '.technology', '.tel', '.tennis', '.theater', '.tienda', '.tips', '.today', '.tokyo', '.tools', '.tours', '.town', '.toys', '.trade', '.training', '.tv', '.tw', '.uk', '.uk.com', '.university', '.uno', '.us', '.us.com', '.vacations', '.vc', '.vegas', '.ventures', '.vet', '.viajes', '.video', '.villas', '.vip', '.vision', '.vodka', '.vote', '.voting', '.voyage', '.watch', '.webcam', '.website', '.wedding', '.wiki', '.win', '.wine', '.work', '.works', '.world', '.ws', '.wtf', '.xyz', '.yoga', '.za.com', '.zone']
+ popular_extentions = ['.com', '.net', '.org', '.co', '.us']
+ # Keep the popular extensions at the front so they are matched first.
+ extentions = popular_extentions + [extention for extention in extentions if extention not in popular_extentions]
+ def get_extention(domain):
+     domain_js = {"domain": domain, "extention": ''}
+     for extention in extentions:
+         if domain.endswith(extention):
+             domain_js["domain"] = domain[:-len(extention)]
+             domain_js["extention"] = extention
+             break
+     return domain_js
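
Note that the lookup is first-match by list order, not longest-suffix: the popular extensions are tried first, and multi-part extensions like '.co.uk' match before '.uk' only because they appear earlier in the alphabetized list. A small sketch of the expected behavior (assuming the module imports as extention_list, per the RECORD entry):

    from extention_list import get_extention

    get_extention('thedailydialectics.com')
    # -> {'domain': 'thedailydialectics', 'extention': '.com'}
    get_extention('localhost')
    # -> {'domain': 'localhost', 'extention': ''}  (no match leaves the domain intact)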
@@ -0,0 +1,81 @@
+ import os
+ def get_dir_size(path):
+     """Calculate the total size of a directory in bytes."""
+     total_size = 0
+     try:
+         for dirpath, dirnames, filenames in os.walk(path):
+             for filename in filenames:
+                 file_path = os.path.join(dirpath, filename)
+                 try:
+                     total_size += os.path.getsize(file_path)
+                 except (OSError, PermissionError):
+                     pass  # skip files we cannot stat
+     except (OSError, PermissionError) as e:
+         print(f"Error accessing {path}: {e}")
+     return total_size
+
+ def compare_dirs(dir1_path, dir2_path):
+     """Compare the sizes of two directories."""
+     dir1_size = get_dir_size(dir1_path)
+     dir2_size = get_dir_size(dir2_path)
+
+     print(f"Size of {dir1_path}: {dir1_size} bytes")
+     print(f"Size of {dir2_path}: {dir2_size} bytes")
+
+     if dir1_size > dir2_size:
+         print(f"{dir1_path} is larger than {dir2_path}")
+     elif dir2_size > dir1_size:
+         print(f"{dir2_path} is larger than {dir1_path}")
+     else:
+         print("Both directories are the same size")
+ twentyfourT = """/mnt/24T/evo_970
+ /mnt/24T/main_drive
+ /mnt/24T/nvmeHeatSync-new
+ /mnt/24T/PNY_1T
+ /mnt/24T/serverBack
+ /mnt/24T/solcatcher_backup
+ /mnt/24T/transferDrive
+ /mnt/24T/wd_black
+ /mnt/24T/wd_black_980_home
+ /mnt/24T/wdBlack_970_evo
+ /mnt/24T/wd_main_980
+ /mnt/24T/wd_nvm
+ /mnt/24T/.Trash-1000
+ /mnt/24T/testfile.txt"""
+
+
+ sixteenT = """/mnt/16T/24T/24T/evo980-new
+ /mnt/16T/24T/24T/500Gb_pny
+ /mnt/16T/24T/24T/wdBlack_970_evo
+ /mnt/16T/24T/24T/wd_nvm
+ /mnt/16T/24T/24T/wd_main_980
+ /mnt/16T/24T/24T/nvmeHeatSync-new
+ /mnt/16T/24T/24T/PNY_1T
+ /mnt/16T/24T/24T/serverBack
+ /mnt/16T/24T/24T/transferDrive
+ /mnt/16T/24T/24T/.Trash-1000
+ /mnt/16T/24T/24T/solcatcher_backup
+ /mnt/16T/24T/24T/wd_black_980_home
+ /mnt/16T/24T/24T/abstract_images-0.0.0.5-py3-none-any
+ /mnt/16T/24T/24T/evo_970
+ /mnt/16T/24T/24T/main_drive
+ /mnt/16T/24T/24T/wd_black
+ /mnt/16T/24T/24T/testfile.txtt"""
+ sixteenT = sixteenT.split('\n')
+ twentyfourT = twentyfourT.split('\n')
+ def is_dirname_in_sixteenT(dirname):
+     basenames = [directory for directory in sixteenT if os.path.basename(directory) == dirname]
+     if basenames:
+         return basenames[0]
+ for directory in twentyfourT:
+     dirname = os.path.basename(directory)
+
+     size1 = get_dir_size(directory)
+     sixteenT_dir = is_dirname_in_sixteenT(dirname)
+     # is_dirname_in_sixteenT returns None when no 16T path matches; guard the lookup.
+     size2 = get_dir_size(sixteenT_dir) if sixteenT_dir else 0
+     print(directory)
+     print(f"size == {size1}")
+     print(sixteenT_dir)
+     input(f"size == {size2}")  # input() pauses between directories
+     if sixteenT_dir:
+         input(compare_dirs(directory, sixteenT_dir))
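
A minimal way to exercise the two helpers (editorial sketch, not part of the diff): because this module runs its comparison loop and input() prompts at import time, in practice you would copy get_dir_size and compare_dirs into your own scope rather than import find_dirs. The paths below are hypothetical temporary directories.

    import os, tempfile

    # Assumes get_dir_size and compare_dirs as defined above are in scope.
    a = tempfile.mkdtemp()
    b = tempfile.mkdtemp()
    with open(os.path.join(a, 'payload.bin'), 'wb') as f:
        f.write(b'\0' * 1024)  # one 1 KiB file

    print(get_dir_size(a))  # 1024
    print(get_dir_size(b))  # 0
    compare_dirs(a, b)      # prints both sizes, then reports that a is larger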
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: abstract_webtools
- Version: 0.1.6.90
+ Version: 0.1.6.92
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -3,6 +3,9 @@ abstract_webtools/abstract_userpit.py,sha256=Rg_0Orx79rxqEePt6Sf-evGslPq5KLlTiL-
  abstract_webtools/abstract_usurpit.py,sha256=7PDUb5LNETjvU1rhfJaToKLIKmSXRkcJAmM4wOX7PsQ,7170
  abstract_webtools/abstract_webtools.py,sha256=3NzGmJlZvrdVtEcUi2K5iUgWr1822IBPhIN9us2e2t0,3859
  abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
+ abstract_webtools/domain_identifier.py,sha256=AvWlGD7C19rySa_J_Brxi3kz43LMWvGsshuuZNg7MvI,3320
+ abstract_webtools/extention_list.py,sha256=gRSO4nMbuuXDYzd-ss4s64sS80ZHmUoazMCpgoKG5vE,4884
+ abstract_webtools/find_dirs.py,sha256=BlE4ruzMABqmv03NcutZ1j5N3pCc-Q4uNEAMpNolZCQ,2609
  abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
  abstract_webtools/soup_gui.py,sha256=n95YAps1R6DpMwR4UbthSqQby0C5WHUa9tsW-f2qpLg,5184
  abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE70B8,10441
@@ -38,7 +41,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
  abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
  abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
  abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
- abstract_webtools-0.1.6.90.dist-info/METADATA,sha256=9KBoZzDcF1imzfdgBeOglz185TT5kGvkdtbDRNqlQrw,16029
- abstract_webtools-0.1.6.90.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- abstract_webtools-0.1.6.90.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
- abstract_webtools-0.1.6.90.dist-info/RECORD,,
+ abstract_webtools-0.1.6.92.dist-info/METADATA,sha256=4aTdbNUjectbYqf27r46QeI8_9Fr_azCOJQTIxUdMoM,16029
+ abstract_webtools-0.1.6.92.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
+ abstract_webtools-0.1.6.92.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+ abstract_webtools-0.1.6.92.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (78.1.0)
+ Generator: setuptools (80.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any