waymore 4.2__py3-none-any.whl → 4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -1
- waymore/waymore.py +95 -40
- {waymore-4.2.dist-info → waymore-4.4.dist-info}/METADATA +2 -3
- waymore-4.4.dist-info/RECORD +8 -0
- waymore-4.2.dist-info/RECORD +0 -8
- {waymore-4.2.dist-info → waymore-4.4.dist-info}/LICENSE +0 -0
- {waymore-4.2.dist-info → waymore-4.4.dist-info}/WHEEL +0 -0
- {waymore-4.2.dist-info → waymore-4.4.dist-info}/entry_points.txt +0 -0
- {waymore-4.2.dist-info → waymore-4.4.dist-info}/top_level.txt +0 -0
waymore/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__="4.
|
|
1
|
+
__version__="4.4"
|
waymore/waymore.py
CHANGED
|
@@ -83,7 +83,7 @@ argsInputHostname = ''
|
|
|
83
83
|
responseOutputDirectory = ''
|
|
84
84
|
|
|
85
85
|
# Source Provider URLs
|
|
86
|
-
WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}
|
|
86
|
+
WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
|
|
87
87
|
CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
|
|
88
88
|
ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
|
|
89
89
|
URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
|
|
@@ -706,23 +706,55 @@ def fixArchiveOrgUrl(url):
|
|
|
706
706
|
url = url[0:newline]
|
|
707
707
|
return url
|
|
708
708
|
|
|
709
|
+
# Add a link to the linksFound collection for archived responses (including the timestamp prefix)
|
|
710
|
+
def linksFoundResponseAdd(link):
|
|
711
|
+
global linksFound, argsInput, argsInputHostname
|
|
712
|
+
|
|
713
|
+
try:
|
|
714
|
+
if inputIsDomainANDPath:
|
|
715
|
+
checkInput = argsInput
|
|
716
|
+
else:
|
|
717
|
+
checkInput = argsInputHostname
|
|
718
|
+
|
|
719
|
+
# Remove the timestamp
|
|
720
|
+
linkWithoutTimestamp = link.split('/', 1)[-1]
|
|
721
|
+
|
|
722
|
+
# If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
|
|
723
|
+
parsed = urlparse(linkWithoutTimestamp.strip())
|
|
724
|
+
if parsed.port in (80, 443):
|
|
725
|
+
new_netloc = parsed.hostname
|
|
726
|
+
parsed_url = parsed._replace(netloc=new_netloc).geturl()
|
|
727
|
+
else:
|
|
728
|
+
parsed_url = linkWithoutTimestamp
|
|
729
|
+
|
|
730
|
+
# Don't write it if the link does not contain the requested domain (this can sometimes happen)
|
|
731
|
+
if parsed_url.find(checkInput) >= 0:
|
|
732
|
+
linksFound.add(link)
|
|
733
|
+
except Exception as e:
|
|
734
|
+
linksFound.add(link)
|
|
735
|
+
|
|
709
736
|
# Add a link to the linksFound collection
|
|
710
737
|
def linksFoundAdd(link):
|
|
711
738
|
global linksFound, argsInput, argsInputHostname
|
|
712
|
-
|
|
739
|
+
|
|
713
740
|
try:
|
|
714
741
|
if inputIsDomainANDPath:
|
|
715
742
|
checkInput = argsInput
|
|
716
743
|
else:
|
|
717
744
|
checkInput = argsInputHostname
|
|
745
|
+
|
|
746
|
+
# If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
|
|
747
|
+
parsed = urlparse(link.strip())
|
|
748
|
+
if parsed.port in (80, 443):
|
|
749
|
+
new_netloc = parsed.hostname
|
|
750
|
+
parsed_url = parsed._replace(netloc=new_netloc).geturl()
|
|
751
|
+
else:
|
|
752
|
+
parsed_url = link
|
|
753
|
+
|
|
718
754
|
# Don't write it if the link does not contain the requested domain (this can sometimes happen)
|
|
719
|
-
if
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
newNetloc = parsed.netloc.split(':')[0]
|
|
723
|
-
parsed = parsed._replace(netloc=newNetloc).geturl()
|
|
724
|
-
linksFound.add(parsed)
|
|
725
|
-
except:
|
|
755
|
+
if parsed_url.find(checkInput) >= 0:
|
|
756
|
+
linksFound.add(link)
|
|
757
|
+
except Exception as e:
|
|
726
758
|
linksFound.add(link)
|
|
727
759
|
|
|
728
760
|
def processArchiveUrl(url):
|
|
@@ -1851,11 +1883,15 @@ def getWaybackUrls():
|
|
|
1851
1883
|
session.mount('https://', HTTP_ADAPTER)
|
|
1852
1884
|
session.mount('http://', HTTP_ADAPTER)
|
|
1853
1885
|
resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
|
|
1854
|
-
totalPages =
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1886
|
+
# Try to get the total number of pages. If there is a problem, we'll set totalPages = -1 which means we'll get everything back in one request
|
|
1887
|
+
try:
|
|
1888
|
+
totalPages = int(resp.text.strip())
|
|
1889
|
+
|
|
1890
|
+
# If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
|
|
1891
|
+
if args.limit_requests != 0 and totalPages > args.limit_requests:
|
|
1892
|
+
totalPages = args.limit_requests
|
|
1893
|
+
except:
|
|
1894
|
+
totalPages = -1
|
|
1859
1895
|
except Exception as e:
|
|
1860
1896
|
try:
|
|
1861
1897
|
# If the rate limit was reached end now
|
|
@@ -1880,31 +1916,39 @@ def getWaybackUrls():
|
|
|
1880
1916
|
else:
|
|
1881
1917
|
writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
|
|
1882
1918
|
return
|
|
1883
|
-
|
|
1919
|
+
|
|
1884
1920
|
if args.check_only:
|
|
1885
|
-
|
|
1886
|
-
|
|
1921
|
+
if totalPages < 0:
|
|
1922
|
+
write(colored('Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.','cyan'))
|
|
1923
|
+
else:
|
|
1924
|
+
checkWayback = totalPages
|
|
1925
|
+
write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
|
|
1887
1926
|
else:
|
|
1888
1927
|
if verbose():
|
|
1889
1928
|
write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
|
|
1890
1929
|
|
|
1891
|
-
|
|
1892
|
-
|
|
1930
|
+
if totalPages < 0:
|
|
1931
|
+
write(colored('\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r','cyan'))
|
|
1893
1932
|
|
|
1894
|
-
|
|
1895
|
-
pages = []
|
|
1896
|
-
if totalPages == 1:
|
|
1897
|
-
pages.append(url)
|
|
1933
|
+
processWayBackPage(url)
|
|
1898
1934
|
else:
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1935
|
+
# if the page number was found then display it, but otherwise we will just try to increment until we have everything
|
|
1936
|
+
write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
|
|
1937
|
+
|
|
1938
|
+
# Get a list of all the page URLs we need to visit
|
|
1939
|
+
pages = []
|
|
1940
|
+
if totalPages == 1:
|
|
1941
|
+
pages.append(url)
|
|
1942
|
+
else:
|
|
1943
|
+
for page in range(0, totalPages):
|
|
1944
|
+
pages.append(url+str(page))
|
|
1945
|
+
|
|
1946
|
+
# Process the URLs from web archive
|
|
1947
|
+
if stopProgram is None:
|
|
1948
|
+
p = mp.Pool(args.processes)
|
|
1949
|
+
p.map(processWayBackPage, pages)
|
|
1950
|
+
p.close()
|
|
1951
|
+
p.join()
|
|
1908
1952
|
|
|
1909
1953
|
# Show the MIME types found (in case user wants to exclude more)
|
|
1910
1954
|
if verbose() and len(linkMimes) > 0 :
|
|
@@ -2335,7 +2379,7 @@ def getVirusTotalUrls():
|
|
|
2335
2379
|
|
|
2336
2380
|
except Exception as e:
|
|
2337
2381
|
writerr(colored('ERROR getVirusTotalUrls 1: ' + str(e), 'red'))
|
|
2338
|
-
|
|
2382
|
+
|
|
2339
2383
|
def processResponses():
|
|
2340
2384
|
"""
|
|
2341
2385
|
Get archived responses from Wayback Machine (archive.org)
|
|
@@ -2431,11 +2475,11 @@ def processResponses():
|
|
|
2431
2475
|
if args.capture_interval == 'none': # get all
|
|
2432
2476
|
collapse = ''
|
|
2433
2477
|
elif args.capture_interval == 'h': # get at most 1 capture per URL per hour
|
|
2434
|
-
collapse = 'timestamp:10
|
|
2478
|
+
collapse = '&collapse=timestamp:10'
|
|
2435
2479
|
elif args.capture_interval == 'd': # get at most 1 capture per URL per day
|
|
2436
|
-
collapse = 'timestamp:8
|
|
2480
|
+
collapse = '&collapse=timestamp:8'
|
|
2437
2481
|
elif args.capture_interval == 'm': # get at most 1 capture per URL per month
|
|
2438
|
-
collapse = 'timestamp:6
|
|
2482
|
+
collapse = '&collapse=timestamp:6'
|
|
2439
2483
|
|
|
2440
2484
|
url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
|
|
2441
2485
|
|
|
@@ -2501,13 +2545,14 @@ def processResponses():
|
|
|
2501
2545
|
except:
|
|
2502
2546
|
pass
|
|
2503
2547
|
|
|
2504
|
-
# Go through the response to save the links found
|
|
2548
|
+
# Go through the response to save the links found
|
|
2505
2549
|
for line in resp.iter_lines():
|
|
2506
2550
|
try:
|
|
2507
2551
|
results = line.decode("utf-8")
|
|
2508
|
-
|
|
2509
|
-
|
|
2510
|
-
|
|
2552
|
+
parts = results.split(' ', 2)
|
|
2553
|
+
timestamp = parts[0]
|
|
2554
|
+
originalUrl = parts[1]
|
|
2555
|
+
linksFoundResponseAdd(timestamp+'/'+originalUrl)
|
|
2511
2556
|
except Exception as e:
|
|
2512
2557
|
writerr(colored(getSPACER('ERROR processResponses 3: Cannot to get link from line: '+str(line)), 'red'))
|
|
2513
2558
|
|
|
@@ -2528,6 +2573,16 @@ def processResponses():
|
|
|
2528
2573
|
|
|
2529
2574
|
# Get the total number of responses we will try to get and set the current file count to the success count
|
|
2530
2575
|
totalResponses = len(linkRequests)
|
|
2576
|
+
|
|
2577
|
+
# If there are no responses to download, display an error and exit
|
|
2578
|
+
if totalResponses == 0:
|
|
2579
|
+
try:
|
|
2580
|
+
if originalUrl:
|
|
2581
|
+
writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'+originalUrl+'") but they didn\'t match the input you gave. Check input and try again.'), 'red'))
|
|
2582
|
+
except:
|
|
2583
|
+
writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - check input and try again.'), 'red'))
|
|
2584
|
+
return
|
|
2585
|
+
|
|
2531
2586
|
fileCount = successCount
|
|
2532
2587
|
|
|
2533
2588
|
if args.check_only:
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: waymore
|
|
3
|
-
Version: 4.
|
|
3
|
+
Version: 4.4
|
|
4
4
|
Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
|
|
5
5
|
Home-page: https://github.com/xnl-h4ck3r/waymore
|
|
6
6
|
Author: @xnl-h4ck3r
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
License-File: LICENSE
|
|
9
|
-
Requires-Dist: argparse
|
|
10
9
|
Requires-Dist: requests
|
|
11
10
|
Requires-Dist: pyyaml
|
|
12
11
|
Requires-Dist: termcolor
|
|
@@ -16,7 +15,7 @@ Requires-Dist: tldextract
|
|
|
16
15
|
|
|
17
16
|
<center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
|
|
18
17
|
|
|
19
|
-
## About - v4.
|
|
18
|
+
## About - v4.4
|
|
20
19
|
|
|
21
20
|
The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
|
|
22
21
|
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
waymore/__init__.py,sha256=bb3D2cWPj3M9gB4ePNX8nrpDuS8IImWiON1Cc_z3vGg,17
|
|
2
|
+
waymore/waymore.py,sha256=cnFkODCRHd4OxxBZVMWUwus5bTZ-ypTGAK_Aa9HPd-g,169799
|
|
3
|
+
waymore-4.4.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
|
|
4
|
+
waymore-4.4.dist-info/METADATA,sha256=gpUxWzvVUkCmZUB_Dd-gl_8w2P9UFh5tpfyob7wMe-o,47221
|
|
5
|
+
waymore-4.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
6
|
+
waymore-4.4.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
|
|
7
|
+
waymore-4.4.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
|
|
8
|
+
waymore-4.4.dist-info/RECORD,,
|
waymore-4.2.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
waymore/__init__.py,sha256=KS43n250T2U7gE8jspc0cFVyTllND-0M0RkTA_yyc88,17
|
|
2
|
-
waymore/waymore.py,sha256=8HdTaKE5-SzIIWWWF5kskaBVzl0FRG0DpBChR11JWjs,167332
|
|
3
|
-
waymore-4.2.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
|
|
4
|
-
waymore-4.2.dist-info/METADATA,sha256=uSCOfxcJDVsAO9z_in0-n6l8BOfD9jPNHiAsDuhuhz0,47245
|
|
5
|
-
waymore-4.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
6
|
-
waymore-4.2.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
|
|
7
|
-
waymore-4.2.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
|
|
8
|
-
waymore-4.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|