waymore-5.1-py3-none-any.whl → waymore-6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -1
- waymore/waymore.py +488 -66
- {waymore-5.1.dist-info → waymore-6.1.dist-info}/METADATA +6 -4
- waymore-6.1.dist-info/RECORD +8 -0
- waymore-5.1.dist-info/RECORD +0 -8
- {waymore-5.1.dist-info → waymore-6.1.dist-info}/LICENSE +0 -0
- {waymore-5.1.dist-info → waymore-6.1.dist-info}/WHEEL +0 -0
- {waymore-5.1.dist-info → waymore-6.1.dist-info}/entry_points.txt +0 -0
- {waymore-5.1.dist-info → waymore-6.1.dist-info}/top_level.txt +0 -0
waymore/__init__.py
CHANGED
@@ -1 +1 @@
-__version__="5.1"
+__version__="6.1"
waymore/waymore.py
CHANGED
@@ -58,10 +58,12 @@ stopSource = False
 successCount = 0
 failureCount = 0
 fileCount = 0
+totalFileCount = 0
 totalResponses = 0
 totalPages = 0
 indexFile = None
 continueRespFile = None
+continueRespFileURLScan = None
 inputIsDomainANDPath = False
 inputIsSubDomain = False
 subs = '*.'
@@ -82,12 +84,14 @@ checkVirusTotal = 0
 checkIntelx = 0
 argsInputHostname = ''
 responseOutputDirectory = ''
+urlscanRequestLinks = set()

 # Source Provider URLs
 WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
 CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
 ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
-URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
+URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}{DATERANGE}&size=10000'
+URLSCAN_DOM_URL = 'https://urlscan.io/dom/'
 VIRUSTOTAL_URL = 'https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}'
 INTELX_SEARCH_URL = 'https://2.intelx.io/phonebook/search'
 INTELX_RESULTS_URL = 'https://2.intelx.io/phonebook/search/result?id='
@@ -952,7 +956,7 @@ def processArchiveUrl(url):
                indexFile.write(hashValue+','+archiveUrl+' ,'+timestamp+'\n')
                indexFile.flush()
            except Exception as e:
-               writerr(colored(getSPACER('[ ERR ] Failed to write to
+               writerr(colored(getSPACER('[ ERR ] Failed to write to waymore_index.txt for "' + archiveUrl + '": '+ str(e)), 'red'))

        # FOR DEBUGGING PURPOSES
        try:
@@ -1188,27 +1192,6 @@ def processURLOutput():
        if verbose():
            writerr(colored("ERROR processURLOutput 1: " + str(e), "red"))

-def processResponsesOutput():
-    """
-    Show results of the archive responses saved
-    """
-    global successCount, failureCount, subs, fileCount, argsInput, DEFAULT_OUTPUT_DIR, responseOutputDirectory
-    try:
-
-        if failureCount > 0:
-            if verbose():
-                write(colored('\nResponses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' failed)\n','red'))
-            else:
-                write(colored('\nResponses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' failed)\n','red'))
-        else:
-            if verbose():
-                write(colored('\nResponses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
-            else:
-                write(colored('\nResponses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
-    except Exception as e:
-        if verbose():
-            writerr(colored("ERROR processResponsesOutput 1: " + str(e), "red"))
-
 def validateArgProcesses(x):
    """
    Validate the -p / --processes argument
@@ -1280,6 +1263,38 @@ def validateArgStatusCodes(x):
        raise argparse.ArgumentTypeError('Pass HTTP status codes separated by a comma')
    return x

+def validateArgDate(x):
+    """
+    Validate the -from and -to arguments
+    """
+    invalid = False
+
+    # Map string lengths to their corresponding datetime formats
+    formats = {
+        4: "%Y",
+        6: "%Y%m",
+        8: "%Y%m%d",
+        10: "%Y%m%d%H",
+        12: "%Y%m%d%H%M",
+        14: "%Y%m%d%H%M%S",
+    }
+
+    # Check if length matches any expected format
+    fmt = formats.get(len(x))
+    if not fmt:
+        invalid = True
+
+    # Try to parse with the matching format
+    try:
+        datetime.strptime(x, fmt)
+    except ValueError:
+        invalid = True
+
+    # If invalid then return an error
+    if invalid:
+        raise argparse.ArgumentTypeError('A valid date/time needs to be passed in YYYYMMDDhhmmss format (or part of, e.g. YYYYMM) .')
+    return x
+
 def validateArgMimeTypes(x):
    """
    Validate the -ft and -mt arguments
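An aside on the `validateArgDate` helper added above: the length-keyed format map is easy to sanity-check in isolation. Below is a standalone sketch (not part of the package) exercising the same logic:

```python
# Standalone sketch of the length-based date validation used by validateArgDate.
from datetime import datetime

formats = {4: "%Y", 6: "%Y%m", 8: "%Y%m%d",
           10: "%Y%m%d%H", 12: "%Y%m%d%H%M", 14: "%Y%m%d%H%M%S"}

for value in ("2016", "201805", "20180542", "123"):
    fmt = formats.get(len(value))  # None for unsupported lengths
    try:
        datetime.strptime(value, fmt)
        print(value, "-> valid, parsed with", fmt)
    except (TypeError, ValueError):
        # TypeError when fmt is None (bad length); ValueError when the digits
        # don't form a real date/time (e.g. day 42 in "20180542")
        print(value, "-> rejected")
```

One subtlety of the function as written: a wrong-length value leaves `fmt` as `None`, so `datetime.strptime(x, fmt)` raises `TypeError` rather than the caught `ValueError`. argparse still rejects the argument (it treats a `TypeError` from a `type` callable as invalid input), just with its generic "invalid value" message instead of the custom one.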
@@ -1534,17 +1549,22 @@ def getAlienVaultUrls():
    except Exception as e:
        writerr(colored('ERROR getAlienVaultUrls 1: ' + str(e), 'red'))

-def processURLScanUrl(url, httpCode, mimeType):
+def processURLScanUrl(url, httpCode, mimeType, urlscanID=''):
    """
    Process a specific URL from urlscan.io to determine whether to save the link
    """
-    global argsInput, argsInputHostname
+    global argsInput, argsInputHostname, urlscanRequestLinks

    addLink = True

-    try:
+    try:
+        # If the input has a / in it, then a URL was passed, so the link will only be added if the URL matches
+        if '/' in url:
+            if argsInput not in url:
+                addLink = False
+
        # If filters are required then test them
-        if not args.filter_responses_only:
+        if addLink and not args.filter_responses_only:

            # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
            if args.no_subs:
@@ -1593,7 +1613,8 @@ def processURLScanUrl(url, httpCode, mimeType):

            # Add MIME Types if --verbose option was selected
            if verbose():
-
+                if mimeType.strip() != '':
+                    linkMimes.add(mimeType)

            # Add link if it passed filters
            if addLink:
@@ -1608,11 +1629,183 @@ def processURLScanUrl(url, httpCode, mimeType):
                # Check the URL
                match = re.search(r'(^|\.)'+re.escape(argsInputHostname)+'$', domainOnly, flags=re.IGNORECASE)
                if match is not None:
-
-
+                    if args.mode in ('U','B'):
+                        linksFoundAdd(url)
+                    # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
+                    if urlscanID != '' and args.mode in ('R','B'):
+                        if args.limit == 0 or len(urlscanRequestLinks) < args.limit:
+                            urlscanRequestLinks.add((url, URLSCAN_DOM_URL+urlscanID))

    except Exception as e:
        writerr(colored('ERROR processURLScanUrl 1: ' + str(e), 'red'))
+
+def getURLScanDOM(originalUrl, domUrl):
+    """
+    Get the DOM for the passed URLScan link
+    """
+    global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, continueRespFileURLScan, REGEX_404
+    try:
+        if stopProgram is None:
+
+            hashValue = ''
+
+            # Get memory usage every 100 responses
+            if (successCount + failureCount) % 100 == 0:
+                try:
+                    getMemory()
+                except:
+                    pass
+
+            # Make a request to URLScan
+            try:
+                try:
+                    # Choose a random user agent string to use for any requests
+                    userAgent = "waymore v"+__version__+" by xnl-h4ck3r"
+                    session = requests.Session()
+                    session.mount('https://', HTTP_ADAPTER)
+                    session.mount('http://', HTTP_ADAPTER)
+                    resp = session.get(domUrl, headers={"User-Agent":userAgent}, allow_redirects = True)
+                    archiveHtml = str(resp.text)
+
+                    # If there is a specific URLScan error in the response, raise an exception
+                    if archiveHtml.lower().strip() == 'not found!':
+                        raise WayBackException
+
+                    # Only create a file if there is a response
+                    if len(archiveHtml) != 0:
+
+                        # Add the URL as a comment at the start of the response
+                        if args.url_filename:
+                            archiveHtml = '/* Original URL: ' + originalUrl + ' */\n' + archiveHtml
+
+                        # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
+                        if args.url_filename:
+                            fileName = originalUrl.replace('/','-').replace(':','')
+                            fileName = fileName[0:254]
+                        else:
+                            hashValue = filehash(archiveHtml)
+                            fileName = hashValue
+
+                        # Determine extension of file from the content-type using the mimetypes library
+                        extension = ''
+                        try:
+                            # Get path extension
+                            targetUrl = 'https://' + originalUrl.split("://")[1]
+                            parsed = urlparse(targetUrl.strip())
+                            path = parsed.path
+                            extension = path[path.rindex('.')+1:]
+                            # If there is a / in the extension then it's not the extension at all, so reset to blank
+                            if '/' in extension:
+                                extension = ''
+                        except:
+                            pass
+
+                        # If the extension is blank, numeric, longer than 4 characters or not alphanumeric - then set to html if the content ends with HTML tag, otherwise set to unknown
+                        if extension == '':
+                            if archiveHtml.lower().strip().endswith('</html>') or archiveHtml.lower().strip().endswith('</body>') or archiveHtml.lower().strip().startswith('<!doctype html') or archiveHtml.lower().strip().startswith('<html') or archiveHtml.lower().strip().startswith('<head'):
+                                extension = 'html'
+                            else:
+                                extension = 'unknown'
+
+                        fileName = fileName + '.' + extension
+
+                        # If -oR / --output-responses was passed then add the file to that directory,
+                        # else add to the default "results/{target.domain}" directory in the same path as the .py file
+                        if args.output_responses != '':
+                            filePath = args.output_responses + '/' + f'{fileName}'
+                        else:
+                            filePath = (DEFAULT_OUTPUT_DIR + '/results/' + str(argsInput).replace('/','-') + '/' + f'{fileName}')
+
+                        # Write the file
+                        try:
+                            responseFile = open(filePath, 'w', encoding='utf8')
+                            responseFile.write(archiveHtml)
+                            responseFile.close()
+                            fileCount = fileCount + 1
+                        except Exception as e:
+                            writerr(colored(getSPACER('[ ERR ] Failed to write file ' + filePath + ': '+ str(e)), 'red'))
+
+                        # Write the hash value and URL to the index file
+                        if not args.url_filename:
+                            try:
+                                timestamp = str(datetime.now())
+                                indexFile.write(hashValue+','+domUrl+'/'+originalUrl+' ,'+timestamp+'\n')
+                                indexFile.flush()
+                            except Exception as e:
+                                writerr(colored(getSPACER('[ ERR ] Failed to write to waymore_index.txt for "' + domUrl + '": '+ str(e)), 'red'))
+
+                    successCount = successCount + 1
+
+                except WayBackException as wbe:
+                    failureCount = failureCount + 1
+
+                except Exception as e:
+                    failureCount = failureCount + 1
+                    if verbose():
+                        try:
+                            writerr(colored(getSPACER('[ ' + str(resp.status_code) +' ] Failed to get response for "' + domUrl + '"'), 'red'))
+                        except:
+                            writerr(colored(getSPACER('[ ERR ] Failed to get response for "' + domUrl + '": '+ str(e)), 'red'))

+                # Show progress bar
+                fillTest = (successCount + failureCount) % 2
+                fillChar = "o"
+                if fillTest == 0:
+                    fillChar = "O"
+                suffix="Complete "
+                # Show memory usage if -v option chosen, and check memory every 25 responses (or if its the last)
+                if (successCount + failureCount) % 25 == 1 or (successCount + failureCount) == totalResponses:
+                    try:
+                        getMemory()
+                        if verbose():
+                            suffix = (
+                                "Complete (Mem Usage "
+                                + humanReadableSize(currentMemUsage)
+                                + ", Total Mem "
+                                + str(currentMemPercent)
+                                + "%) "
+                            )
+                    except:
+                        if verbose():
+                            suffix = 'Complete (To show mem use, run "pip install psutil")'
+                printProgressBar(
+                    successCount + failureCount,
+                    totalResponses,
+                    prefix="Downloading " + str(totalResponses) + " responses:",
+                    suffix=suffix,
+                    length=getProgressBarLength(),
+                    fill=fillChar
+                )
+
+                # Write the total count to the continueResp.URLScan.tmp file
+                try:
+                    continueRespFileURLScan.seek(0)
+                    continueRespFileURLScan.write(str(successCount + failureCount)+'\n')
+                except Exception as e:
+                    if verbose():
+                        writerr(colored(getSPACER('ERROR getURLScanDOM 2: ' + str(e)), 'red'))
+
+            except Exception as e:
+                if verbose():
+                    writerr(colored(getSPACER('Error for "'+domUrl+'": ' + str(e)), 'red'))
+
+    except Exception as e:
+        writerr(colored('ERROR getURLScanDOM 1: ' + str(e), 'red'))

+def format_date_for_urlscan(date_str):
+    # Handle different lengths of input
+    if len(date_str) == 4: # YYYY
+        date_str += "0101"
+    elif len(date_str) == 6: # YYYYMM
+        date_str += "01"
+
+    # Convert to YYYY-MM-DD format
+    try:
+        formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
+        return formatted_date
+    except:
+        return ''
+
 def getURLScanUrls():
    """
    Get URLs from the URLScan API, urlscan.io
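The extension inference buried in `getURLScanDOM` is easier to follow in isolation. Here is a standalone sketch of that logic with hypothetical inputs (the real function also prepends a URL comment, hashes the body, and writes the file):

```python
# Standalone sketch of the extension inference used in getURLScanDOM above.
from urllib.parse import urlparse

def guess_extension(originalUrl, archiveHtml):
    extension = ''
    try:
        # Take whatever follows the last '.' in the URL path
        targetUrl = 'https://' + originalUrl.split("://")[1]
        path = urlparse(targetUrl.strip()).path
        extension = path[path.rindex('.')+1:]
        if '/' in extension:  # the '.' was in a directory name, not a file name
            extension = ''
    except (IndexError, ValueError):
        pass
    if extension == '':
        body = archiveHtml.lower().strip()
        # Fall back to sniffing for HTML markers, as the real code does
        if body.endswith(('</html>', '</body>')) or body.startswith(('<!doctype html', '<html', '<head')):
            extension = 'html'
        else:
            extension = 'unknown'
    return extension

print(guess_extension('https://example.com/static/app.js', 'var a=1;'))  # js
print(guess_extension('https://example.com/page', '<html>...</html>'))   # html
print(guess_extension('https://example.com/data', '{"a":1}'))            # unknown
```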
@@ -1628,9 +1821,26 @@ def getURLScanUrls():

        # Set the URL to just the hostname
        url = URLSCAN_URL.replace('{DOMAIN}',quote(argsInputHostname))
+
+        # If the --from-date or --to-date parameters were passed then also add a date filter
+        if args.from_date or args.to_date:
+            if args.from_date:
+                fromDate = format_date_for_urlscan(str(args.from_date)[:8])
+            else:
+                fromDate = '2016-01-01' # The year URLScan started
+            if args.to_date:
+                toDate = format_date_for_urlscan(str(args.to_date)[:8])
+            else:
+                toDate = 'now'
+            url = url.replace('{DATERANGE}',f'%20date:[{fromDate}%20TO%20{toDate}]')
+        else:
+            url = url.replace('{DATERANGE}','')

        if verbose():
-
+            if args.mode == 'R':
+                write(colored('The URLScan URL requested to get links for responses: ','magenta')+colored(url+'\n','white'))
+            else:
+                write(colored('The URLScan URL requested to get links: ','magenta')+colored(url+'\n','white'))

        if not args.check_only:
            write(colored('\rGetting links from urlscan.io API (this can take a while for some domains)...\r','cyan'))
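Putting `format_date_for_urlscan` and the `{DATERANGE}` substitution together: a partial `-from` value such as `2018` is padded to the first day of the period and lands in the query URL-encoded. A quick illustration with a hypothetical domain:

```python
# Illustration (hypothetical domain) of the date filter added to the URLScan query.
from datetime import datetime

def format_date_for_urlscan(date_str):  # as defined in the diff above
    if len(date_str) == 4: # YYYY
        date_str += "0101"
    elif len(date_str) == 6: # YYYYMM
        date_str += "01"
    try:
        return datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    except:
        return ''

url = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}{DATERANGE}&size=10000'
url = url.replace('{DOMAIN}', 'example.com')
fromDate = format_date_for_urlscan('2018')  # -from 2018 -> '2018-01-01'
toDate = 'now'                              # default when -to is omitted
print(url.replace('{DATERANGE}', f'%20date:[{fromDate}%20TO%20{toDate}]'))
# https://urlscan.io/api/v1/search/?q=domain:example.com%20date:[2018-01-01%20TO%20now]&size=10000
```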
@@ -1641,7 +1851,7 @@ def getURLScanUrls():
            # For other sources we would use `random.choice(USER_AGENT)` to assign a random user-agent, but it seems
            # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
            # be successful all the time
-            userAgent = "waymore by xnl-h4ck3r"
+            userAgent = "waymore v"+__version__+" by xnl-h4ck3r"
            session = requests.Session()
            session.mount('https://', HTTP_ADAPTER)
            session.mount('http://', HTTP_ADAPTER)
@@ -1705,7 +1915,7 @@ def getURLScanUrls():
        totalUrls = 0

        # Carry on if something was found
-        if args.check_only:
+        if args.check_only and args.mode != 'R':
            try:
                hasMore = jsonResp['has_more']
                if hasMore:
@@ -1771,9 +1981,17 @@ def getURLScanUrls():
                    except:
                        mimeType = ''

+                    # If we are going to be downloading responses, then get the unique ID to retrieve the DOM later
+                    urlscanID = ''
+                    if args.mode in ('R','B'):
+                        try:
+                            urlscanID = urlSection['_id']
+                        except:
+                            pass
+
                    # If a URL was found then process it
                    if foundUrl != '':
-                        processURLScanUrl(foundUrl, httpCode, mimeType)
+                        processURLScanUrl(foundUrl, httpCode, mimeType, urlscanID)

                    # If a pointer was found then process it
                    if pointer != '':
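For reference, the `_id` being captured here is per-scan, taken from the same `results` array this function already iterates (alongside `has_more` for paging). A trimmed, hypothetical response illustrating the lookup:

```python
# Trimmed, hypothetical URLScan search response illustrating the _id lookup.
URLSCAN_DOM_URL = 'https://urlscan.io/dom/'

jsonResp = {'results': [{'_id': '0191b5bc-aaaa-bbbb-cccc-121212121212'}],
            'has_more': False}
for urlSection in jsonResp['results']:
    try:
        urlscanID = urlSection['_id']
    except KeyError:  # the real code just swallows any failure
        urlscanID = ''
    print(URLSCAN_DOM_URL + urlscanID)  # the DOM download URL tried later
```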
@@ -1831,19 +2049,20 @@ def getURLScanUrls():
            jsonResp = json.loads(resp.text.strip())

            # If there are no more results, or if the requests limit was specified and has been exceeded, then stop
-            if jsonResp['results'] is None or len(jsonResp['results']) == 0 or (args.limit_requests != 0 and requestsMade > args.limit_requests):
+            if jsonResp['results'] is None or len(jsonResp['results']) == 0 or (args.limit_requests != 0 and requestsMade > args.limit_requests) or (args.mode == 'R' and args.limit != 0 and requestsMade > args.limit):
                stopSource = True

        # Show the MIME types found (in case user wants to exclude more)
-        if verbose() and len(linkMimes) > 0:
+        if verbose() and len(linkMimes) > 0 and args.mode != 'R':
            linkMimes.discard('warc/revisit')
            write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')

        linkCount = len(linksFound) - originalLinkCount
-        if args.
-
-
-
+        if args.mode != 'R':
+            if args.xwm and args.xcc and args.xav:
+                write(getSPACER(colored('Links found on urlscan.io: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
+            else:
+                write(getSPACER(colored('Extra links found on urlscan.io: ', 'cyan')+colored(str(linkCount),'white'))+'\n')

    except Exception as e:
        writerr(colored('ERROR getURLScanUrls 1: ' + str(e), 'red'))
@@ -1941,7 +2160,9 @@ def processWayBackPage(url):
                # Only get MIME Types if --verbose option was selected
                if verbose():
                    try:
-
+                        mimeType = str(results).split(' ')[2]
+                        if mimeType != '':
+                            linkMimes.add(mimeType)
                    except Exception as e:
                        if verbose():
                            writerr(colored(getSPACER('ERROR processWayBackPage 2: Cannot get MIME type from line: ' + str(line)),'red'))
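The index `[2]` isn't arbitrary: the fields come back in the order requested by the `fl=` parameter of `WAYBACK_URL` (`timestamp,original,mimetype,statuscode,digest`), so the MIME type is the third space-separated field. A made-up example line:

```python
# A made-up CDX result line in the field order requested by WAYBACK_URL.
results = "20230105123456 https://example.com/app.js application/javascript 200 AAAABBBB"
mimeType = str(results).split(' ')[2]
print(mimeType)  # application/javascript
```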
@@ -2178,7 +2399,8 @@ def processCommonCrawlCollection(cdxApiUrl):
                    # Get MIME Types if --verbose option was selected
                    if verbose():
                        try:
-
+                            if data['mime'] != '':
+                                linkMimes.add(data['mime'])
                        except:
                            pass
                    linksFoundAdd(data['url'])
@@ -2715,12 +2937,200 @@ def getIntelxUrls():
    except Exception as e:
        writerr(colored('ERROR getIntelxUrls 1: ' + str(e), 'red'))

-def processResponses():
+def processResponses():
+    """
+    Get archived responses from all sources
+    """
+    global stopProgram, totalFileCount
+    try:
+
+        # Get responses from URLScan unless excluded
+        if stopProgram is None and not args.xus:
+            processResponsesURLScan()
+
+        # Get responses from wayback machine unless excluded
+        if stopProgram is None and not args.xwm:
+            processResponsesWayback()
+
+        # If requested, generate the combined inline JS files
+        if not args.check_only and stopProgram is None and totalFileCount > 0 and args.output_inline_js:
+            combineInlineJS()
+
+    except Exception as e:
+        writerr(colored(getSPACER('ERROR processResponses 1: ' + str(e)), 'red'))
+
+def processResponsesURLScan():
+    """
+    Get archived responses from URLScan (urlscan.io)
+    """
+    global subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFileURLScan, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, urlscanRequestLinks, failureCount, totalFileCount, checkURLScan
+    try:
+        fileCount = 0
+        failureCount = 0
+        if not args.check_only:
+            # Create 'results' and domain directory if needed
+            createDirs()
+
+        # Get the path of the files, depending on whether -oR / --output_responses was passed
+        try:
+            continuePath = responseOutputDirectory + 'continueRes.URLScan.tmp'
+            responsesPath = responseOutputDirectory + 'responses.URLScan.tmp'
+            indexPath = responseOutputDirectory + 'waymore_index.txt'
+        except Exception as e:
+            if verbose():
+                writerr(colored('ERROR processResponsesURLScan 4: ' + str(e), 'red'))
+
+        # Get URLs from URLScan.io if the DOM IDs haven't been retrieved yet
+        if args.mode == 'R' and stopProgram is None and not args.check_only:
+            write(colored('\rGetting list of response links (this can take a while for some domains)...\r','cyan'))
+            getURLScanUrls()
+
+        # Check if a continueResp.URLScan.tmp and responses.URLScan.tmp files exist
+        runPrevious = 'n'
+        if not args.check_only and os.path.exists(continuePath) and os.path.exists(responsesPath):
+
+            # Load the links into the set
+            with open(responsesPath,'rb') as fl:
+                linkRequests = pickle.load(fl)
+            totalPrevResponses = len(linkRequests)
+
+            # Get the previous end position to start again at this point
+            try:
+                with open(continuePath,'r') as fc:
+                    successCount = int(fc.readline().strip())
+            except Exception as e:
+                successCount = 0
+
+            # Ask the user if we should continue with previous run if the current starting position is greater than 0 and less than the total
+            if successCount > 0 and successCount < totalPrevResponses:
+                # If the program is not piped from or to another process, then ask whether to continue with previous run
+                if sys.stdout.isatty() and sys.stdin.isatty():
+                    write(colored('The previous run to get archived responses for ' + argsInput + ' was not completed.\nYou can start from response ' + str(successCount) + ' of ' + str(totalPrevResponses) + ' for the previous run, or you can start a new run with your specified arguments.', 'yellow'))
+                    runPrevious = input('Continue with previous run? y/n: ')
+                else:
+                    if CONTINUE_RESPONSES_IF_PIPED:
+                        runPrevious = 'y'
+                        writerr(colored('The previous run to get archived responses for ' + argsInput + ' was not completed. Starting from response ' + str(successCount) + ' of ' + str(totalPrevResponses) + '... ', 'yellow'))
+                    else:
+                        runPrevious = 'n'
+
+        # If we are going to run a new run
+        if runPrevious.lower() == 'n':
+
+            # Set start point
+            successCount = 0
+
+            # Get the URLScan DOM links
+            linkRequests = []
+            for originalUrl, domUrl in urlscanRequestLinks:
+                linkRequests.append((originalUrl,domUrl))
+
+            # Write the links to a temp file
+            if not args.check_only:
+                with open(responsesPath,'wb') as f:
+                    pickle.dump(linkRequests, f)
+
+        # Get the total number of responses we will try to get and set the current file count to the success count
+        totalResponses = len(linkRequests)
+        checkURLScan = checkURLScan + totalResponses
+
+        # If there are no responses to download, display an error and exit
+        if args.mode != 'R' and totalResponses == 0:
+            writerr(colored(getSPACER('Failed to get responses from URLScan (urlscan.io) - check input and try again.'), 'red'))
+            return
+
+        fileCount = successCount
+
+        if args.check_only:
+            writerr(colored('Downloading archived responses: ','cyan')+colored('UNKNOWN requests','cyan'))
+            writerr(colored('\n-> Downloading the responses can vary depending on the target and the rate limiting on URLScan','green'))
+            # if args.limit == 5000 and totalResponses == 5000:
+            #     writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests (the --limit argument defaults to '+str(DEFAULT_LIMIT)+')','cyan'))
+            # else:
+            #     writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests','white'))
+            # minutes = round(totalResponses*2.5 // 60)
+            # hours = minutes // 60
+            # days = hours // 24
+            # if minutes < 5:
+            #     write(colored('\n-> Downloading the responses (depending on their size) should be quite quick!','green'))
+            # elif hours < 2:
+            #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(minutes)+' minutes.','green'))
+            # elif hours < 6:
+            #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','green'))
+            # elif hours < 24:
+            #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','yellow'))
+            # elif days < 7:
+            #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days. Consider using arguments -ko, -l, -ci, -from and -to wisely! ','red'))
+            # else:
+            #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days!!! Consider using arguments -ko, -l, -ci, -from and -to wisely!','red'))
+            write('')
+        else:
+            # If the limit has been set over the default, give a warning that this could take a long time!
+            if totalResponses - successCount > DEFAULT_LIMIT:
+                if successCount > 0:
+                    writerr(colored(getSPACER('WARNING: Downloading remaining ' + str(totalResponses - successCount) + ' responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!'),'yellow'))
+                else:
+                    writerr(colored(getSPACER('WARNING: Downloading ' + str(totalResponses) + ' responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!'),'yellow'))
+
+        # Open the index file if hash value is going to be used (not URL)
+        if not args.url_filename:
+            indexFile = open(indexPath,'a')
+
+        # Open the continue.URLScan.tmp file to store what record we are up to
+        continueRespFileURLScan = open(continuePath,'w+')
+
+        # Process the URLs from URLScan
+        if stopProgram is None:
+            p = mp.Pool(args.processes)
+            p.starmap(getURLScanDOM, linkRequests[successCount:])
+            p.close()
+            p.join()
+
+        # Delete the tmp files now it has run successfully
+        if stopProgram is None:
+            try:
+                os.remove(continuePath)
+                os.remove(responsesPath)
+            except:
+                pass
+
+        # Close the index file if hash value is going to be used (not URL)
+        if not args.url_filename:
+            indexFile.close()
+
+        # Close the continueResp.URLScan.tmp file
+        continueRespFileURLScan.close()
+
+        if not args.check_only:
+            try:
+                if failureCount > 0:
+                    if verbose():
+                        write(colored('\nURLScan responses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' not found)\n','red'))
+                    else:
+                        write(colored('\nURLScan responses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' not found)\n','red'))
+                else:
+                    if verbose():
+                        write(colored('\nURLScan responses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
+                    else:
+                        write(colored('\nURLScan responses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
+            except Exception as e:
+                if verbose():
+                    writerr(colored("ERROR processResponsesURLScan 5: " + str(e), "red"))
+
+        totalFileCount = totalFileCount + fileCount
+    except Exception as e:
+        writerr(colored(getSPACER('ERROR processResponsesURLScan 1: ' + str(e)), 'red'))
+    finally:
+        linkRequests = None
+
+def processResponsesWayback():
    """
    Get archived responses from Wayback Machine (archive.org)
    """
-    global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory
+    global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount
    try:
+        fileCount = 0
+        failureCount = 0
        if not args.check_only:
            # Create 'results' and domain directory if needed
            createDirs()
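The `continueRes.URLScan.tmp`/`responses.URLScan.tmp` pair gives `processResponsesURLScan` (and its Wayback counterpart) a crude resume capability: the full list of pending downloads is pickled once up front, and a counter file records how far the run got. A minimal standalone sketch of that mechanism with hypothetical links:

```python
# Minimal sketch (hypothetical links) of the pickle-based resume mechanism.
import os
import pickle

linkRequests = [('https://example.com/a', 'https://urlscan.io/dom/id-a'),
                ('https://example.com/b', 'https://urlscan.io/dom/id-b')]

# First run: persist the work list before downloading anything
with open('responses.URLScan.tmp', 'wb') as f:
    pickle.dump(linkRequests, f)

# A later run: reload the list and skip the entries already processed
with open('responses.URLScan.tmp', 'rb') as fl:
    linkRequests = pickle.load(fl)
try:
    with open('continueRes.URLScan.tmp', 'r') as fc:
        successCount = int(fc.readline().strip())
except (OSError, ValueError):
    successCount = 0  # no progress file yet, start from the beginning

for originalUrl, domUrl in linkRequests[successCount:]:
    print('would fetch', domUrl)  # waymore calls getURLScanDOM here via a Pool

# Clean up once everything completes
os.remove('responses.URLScan.tmp')
```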
@@ -2729,11 +3139,11 @@ def processResponses():
        try:
            continuePath = responseOutputDirectory + 'continueResp.tmp'
            responsesPath = responseOutputDirectory + 'responses.tmp'
-            indexPath = responseOutputDirectory + '
+            indexPath = responseOutputDirectory + 'waymore_index.txt'
        except Exception as e:
            if verbose():
-                writerr(colored('ERROR
-
+                writerr(colored('ERROR processResponsesWayback 4: ' + str(e), 'red'))
+
        # Check if a continueResp.tmp and responses.tmp files exist
        runPrevious = 'n'
        if not args.check_only and os.path.exists(continuePath) and os.path.exists(responsesPath):
@@ -2822,7 +3232,7 @@ def processResponses():
        url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords

        if verbose():
-            write(colored('The
+            write(colored('The Wayback Machine URL requested to get responses: ','magenta')+colored(url+'\n','white'))

        if args.check_only:
            write(colored('\rChecking archived response requests...\r','cyan'))
@@ -2855,7 +3265,7 @@ def processResponses():
            if resp.text == '':
                writerr(colored(getSPACER('No archived responses were found on Wayback Machine (archive.org) for the given search parameters.'),'red'))
                success = False
-            # If a status other of 429, then stop processing
+            # If a status other of 429, then stop processing
            if resp.status_code == 429:
                writerr(colored(getSPACER('[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
                success = False
@@ -2892,7 +3302,7 @@ def processResponses():
                    originalUrl = parts[1]
                    linksFoundResponseAdd(timestamp+'/'+originalUrl)
                except Exception as e:
-                    writerr(colored(getSPACER('ERROR
+                    writerr(colored(getSPACER('ERROR processResponsesWayback 3: Cannot to get link from line: '+str(line)), 'red'))

        # Remove any links that have URL exclusions
        linkRequests = []
@@ -2956,7 +3366,7 @@ def processResponses():
        if not args.url_filename:
            indexFile = open(indexPath,'a')

-        # Open the
+        # Open the continueResp.tmp file to store what record we are up to
        continueRespFile = open(continuePath,'w+')

        # Process the URLs from web archive
@@ -2981,8 +3391,26 @@ def processResponses():
        # Close the continueResp.tmp file
        continueRespFile.close()

+        # Output results if not just checking
+        if not args.check_only:
+            try:
+                if failureCount > 0:
+                    if verbose():
+                        write(colored('\nWayback responses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' failed)\n','red'))
+                    else:
+                        write(colored('\nWayback responses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' failed)\n','red'))
+                else:
+                    if verbose():
+                        write(colored('\nWayback responses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
+                    else:
+                        write(colored('\nWayback responses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
+            except Exception as e:
+                if verbose():
+                    writerr(colored("ERROR processResponsesWayback 5: " + str(e), "red"))
+
+        totalFileCount = totalFileCount + fileCount
    except Exception as e:
-        writerr(colored(getSPACER('ERROR
+        writerr(colored(getSPACER('ERROR processResponsesWayback 1: ' + str(e)), 'red'))
    finally:
        linkRequests = None
@@ -3098,7 +3526,7 @@ def combineInlineJS():
    try:
        write(colored('Creating combined inline JS files...', 'cyan'))
        outputFileTemplate = "combinedInline{}.js"
-        excludedNames = ['
+        excludedNames = ['waymore_index.txt', 'continueResp.tmp', 'continueResp.URLScan.tmp', 'responses.tmp', 'responses.URLScan.tmp']
        fileList = [name for name in os.listdir(responseOutputDirectory)
                    if os.path.isfile(os.path.join(responseOutputDirectory, name))
                    and not any(name.lower().endswith(ext) for ext in INLINE_JS_EXCLUDE)
@@ -3300,15 +3728,15 @@ def main():
        '-from',
        '--from-date',
        action='store',
-        type=
+        type=validateArgDate,
        help='What date to get responses from. If not specified it will get from the earliest possible results. A partial value can be passed, e.g. 2016, 201805, etc.',
-        metavar='<yyyyMMddhhmmss>'
+        metavar='<yyyyMMddhhmmss>',
    )
    parser.add_argument(
        '-to',
        '--to-date',
        action='store',
-        type=
+        type=validateArgDate,
        help='What date to get responses to. If not specified it will get to the latest possible results. A partial value can be passed, e.g. 2016, 201805, etc.',
        metavar='<yyyyMMddhhmmss>'
    )
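With `type=validateArgDate`, argparse runs the validator on the raw string before storing it, so a malformed date is rejected at parse time. A self-contained sketch of that wiring (validator body abbreviated from the one in the diff):

```python
# Self-contained sketch of wiring a date validator into argparse, mirroring
# the -from/--from-date definition above (validator abbreviated).
import argparse
from datetime import datetime

def validateArgDate(x):
    formats = {4: "%Y", 6: "%Y%m", 8: "%Y%m%d",
               10: "%Y%m%d%H", 12: "%Y%m%d%H%M", 14: "%Y%m%d%H%M%S"}
    try:
        datetime.strptime(x, formats[len(x)])
    except (KeyError, ValueError):
        raise argparse.ArgumentTypeError('A valid date/time needs to be passed in YYYYMMDDhhmmss format (or part of, e.g. YYYYMM).')
    return x

parser = argparse.ArgumentParser()
parser.add_argument('-from', '--from-date', action='store',
                    type=validateArgDate, metavar='<yyyyMMddhhmmss>')
print(parser.parse_args(['-from', '201805']).from_date)  # 201805
# parser.parse_args(['-from', '2018-05'])  # would exit with the error above
```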
@@ -3476,7 +3904,7 @@ def main():
        "-co",
        "--check-only",
        action="store_true",
-        help="This will make a few minimal requests to show you how many requests, and roughly how long it could take, to get URLs from the sources and downloaded responses from Wayback Machine.",
+        help="This will make a few minimal requests to show you how many requests, and roughly how long it could take, to get URLs from the sources and downloaded responses from Wayback Machine (unfortunately it isn't possible to check how long it will take to download responses from URLScan).",
    )
    parser.add_argument(
        "-nd",
@@ -3589,6 +4017,7 @@ def main():
    indexFile = None
    path = ''
    stopSource = False
+    urlscanRequestLinks = set()

    # Get the config settings from the config.yml file
    getConfig()
@@ -3640,16 +4069,9 @@ def main():
            responseOutputDirectory = args.output_responses + '/'
        else:
            responseOutputDirectory = str(DEFAULT_OUTPUT_DIR) + '/results/' + str(argsInput).replace('/','-') + '/'
-
+
+        # Get the responses
        processResponses()
-
-        # Output details of the responses downloaded
-        if not args.check_only:
-            processResponsesOutput()
-
-        # If requested, generate the combined inline JS files
-        if stopProgram is None and fileCount > 0 and args.output_inline_js:
-            combineInlineJS()

    if args.check_only:
        write(colored('NOTE: The time frames are a very rough guide and doesn\'t take into account additonal time for rate limiting.','magenta'))
{waymore-5.1.dist-info → waymore-6.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 5.1
+Version: 6.1
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
@@ -15,7 +15,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v5.1
+## About - v6.1
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
@@ -112,7 +112,7 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
 | -c | --config | Path to the YML config file. If not passed, it looks for file `config.yml` in the default directory, typically `~/.config/waymore`. |
 | -wrlr | --wayback-rate-limit-retry | The number of minutes the user wants to wait for a rate limit pause on Wayback Machine (archive.org) instead of stopping with a `429` error (default: 3). |
 | -urlr | --urlscan-rate-limit-retry | The number of minutes the user wants to wait for a rate limit pause on URLScan.io instead of stopping with a `429` error (default: 1). |
-| -co | --check-only | This will make a few minimal requests to show you how many requests, and roughly how long it could take, to get URLs from the sources and downloaded responses from Wayback Machine. |
+| -co | --check-only | This will make a few minimal requests to show you how many requests, and roughly how long it could take, to get URLs from the sources and downloaded responses from Wayback Machine (unfortunately it isn't possible to check how long it will take to download responses from URLScan). |
 | -nd | --notify-discord | Whether to send a notification to Discord when waymore completes. It requires `WEBHOOK_DISCORD` to be provided in the `config.yml` file. |
 | -oijs | --output-inline-js | Whether to save combined inline javascript of all relevant files in the response directory when `-mode R` (or `-mode B`) has been used. The files are saved with the name `combinedInline{}.js` where `{}` is the number of the file, saving 1000 unique scripts per file. The file `combinedInlineSrc.txt` will also be created, containing the `src` value of all external scripts referenced in the files. |
 | -v | --verbose | Verbose output |
@@ -284,7 +284,9 @@ If you come across any problems at all, or have ideas for improvements, please f
 
 ## TODO
 
-- Add an `-oos` argument that accepts a file of Out Of Scope subdomains/URLs that will not be returned in the output, or have any responses downloaded
+- Add an `-oos` argument that accepts a file of Out Of Scope subdomains/URLs that will not be returned in the output, or have any responses downloaded.
+- The `waymore_index.txt` isn't de-duplicated if run multiple times for the same input with `-mode R` or `-mode B`.
+- Rewrite to get from sources in parallel. Currently they are run consecutively sorry!
 
 ## References
waymore-6.1.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+waymore/__init__.py,sha256=F72990tIrad7qU0P6Y58bxSon9K_Afq4VPg1O8Plees,17
+waymore/waymore.py,sha256=lZ3FMjdvMoB2okgnD1DwZfD53jMRsTGvo2Imlw0aqxc,211320
+waymore-6.1.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
+waymore-6.1.dist-info/METADATA,sha256=3iHse0hmlx7vH_I9Q4bQyc65cBeAGvG3PB0voC9_4AY,50878
+waymore-6.1.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+waymore-6.1.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
+waymore-6.1.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
+waymore-6.1.dist-info/RECORD,,
waymore-5.1.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-waymore/__init__.py,sha256=X4ON0rponPxoQ0b-Wv7zvwCPFlzC2oSmg_nHdJmpyis,17
-waymore/waymore.py,sha256=sG4cpeFN0cOfO06AgetlTbs20fVUbpxQr4g_RuHAlaw,188638
-waymore-5.1.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
-waymore-5.1.dist-info/METADATA,sha256=npHpoTL5ceG210zvaBtXHVSfWqE2Gh_ekR_beYLTRx0,50674
-waymore-5.1.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-waymore-5.1.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
-waymore-5.1.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
-waymore-5.1.dist-info/RECORD,,

{waymore-5.1.dist-info → waymore-6.1.dist-info}/LICENSE
File without changes

{waymore-5.1.dist-info → waymore-6.1.dist-info}/WHEEL
File without changes

{waymore-5.1.dist-info → waymore-6.1.dist-info}/entry_points.txt
File without changes

{waymore-5.1.dist-info → waymore-6.1.dist-info}/top_level.txt
File without changes