waymore 4.5__py3-none-any.whl → 4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -1
- waymore/waymore.py +197 -62
- {waymore-4.5.dist-info → waymore-4.6.dist-info}/METADATA +11 -7
- waymore-4.6.dist-info/RECORD +8 -0
- {waymore-4.5.dist-info → waymore-4.6.dist-info}/WHEEL +1 -1
- waymore-4.5.dist-info/RECORD +0 -8
- {waymore-4.5.dist-info → waymore-4.6.dist-info}/LICENSE +0 -0
- {waymore-4.5.dist-info → waymore-4.6.dist-info}/entry_points.txt +0 -0
- {waymore-4.5.dist-info → waymore-4.6.dist-info}/top_level.txt +0 -0
waymore/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__="4.5"
|
|
1
|
+
__version__="4.6"
|
waymore/waymore.py
CHANGED
|
@@ -136,6 +136,7 @@ DEFAULT_FILTER_KEYWORDS = 'admin,login,logon,signin,signup,register,registration
|
|
|
136
136
|
# Yaml config values
|
|
137
137
|
FILTER_URL = ''
|
|
138
138
|
FILTER_MIME = ''
|
|
139
|
+
MATCH_MIME = ''
|
|
139
140
|
FILTER_CODE = ''
|
|
140
141
|
MATCH_CODE = ''
|
|
141
142
|
FILTER_KEYWORDS = ''
|
|
@@ -313,8 +314,21 @@ def showOptions():
|
|
|
313
314
|
else:
|
|
314
315
|
write(colored('-n: ' +str(args.no_subs), 'magenta')+colored(' Sub domains are included in the search.','white'))
|
|
315
316
|
|
|
316
|
-
|
|
317
|
-
|
|
317
|
+
providers = ''
|
|
318
|
+
if not args.xwm:
|
|
319
|
+
providers = providers + 'Wayback, '
|
|
320
|
+
if not args.xcc:
|
|
321
|
+
providers = providers + 'CommonCrawl, '
|
|
322
|
+
if not args.xav:
|
|
323
|
+
providers = providers + 'Alien Vault OTX, '
|
|
324
|
+
if not args.xus:
|
|
325
|
+
providers = providers + 'URLScan, '
|
|
326
|
+
if not args.xvt:
|
|
327
|
+
providers = providers + 'VirusTotal, '
|
|
328
|
+
if providers == '':
|
|
329
|
+
providers = 'None'
|
|
330
|
+
write(colored('Providers: ' +str(providers.strip(', ')), 'magenta')+colored(' Which providers to check for URLs.','white'))
|
|
331
|
+
|
|
318
332
|
if not args.xcc:
|
|
319
333
|
if args.lcc ==0 and args.lcy == 0:
|
|
320
334
|
write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' Search ALL Common Crawl index collections.','white'))
|
|
@@ -325,13 +339,12 @@ def showOptions():
|
|
|
325
339
|
if args.lcc != 0:
|
|
326
340
|
write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' The number of latest Common Crawl index collections to be searched.','white'))
|
|
327
341
|
write(colored('-lcy: ' +str(args.lcy), 'magenta')+colored(' Search all Common Crawl index collections with data from year '+str(args.lcy)+' and after.','white'))
|
|
328
|
-
|
|
329
|
-
write(colored('-xus: ' +str(args.xus), 'magenta')+colored(' Whether to exclude checks for links from urlscan.io','white'))
|
|
342
|
+
|
|
330
343
|
if URLSCAN_API_KEY == '':
|
|
331
344
|
write(colored('URLScan API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://urlscan.io/user/signup which will let you get more back, and quicker.','white'))
|
|
332
345
|
else:
|
|
333
346
|
write(colored('URLScan API Key: ', 'magenta')+colored(URLSCAN_API_KEY))
|
|
334
|
-
|
|
347
|
+
|
|
335
348
|
if VIRUSTOTAL_API_KEY == '':
|
|
336
349
|
write(colored('VirusTotal API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://www.virustotal.com/gui/join-us which will let you get some extra URLs.','white'))
|
|
337
350
|
else:
|
|
@@ -382,11 +395,19 @@ def showOptions():
|
|
|
382
395
|
write(colored('-mc: ' +str(args.mc), 'magenta')+colored(' Only retrieve URLs and Responses that match these HTTP Status codes.','white'))
|
|
383
396
|
else:
|
|
384
397
|
if args.fc:
|
|
385
|
-
write(colored('-fc: ' +str(args.
|
|
386
|
-
write(colored('MIME Type exclusions: ', 'magenta')+colored(FILTER_MIME))
|
|
398
|
+
write(colored('-fc: ' +str(args.fc), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these HTTP Status codes.','white'))
|
|
387
399
|
if not args.mc and args.fc:
|
|
388
400
|
write(colored('Response Code exclusions: ', 'magenta')+colored(FILTER_CODE))
|
|
389
401
|
write(colored('Response URL exclusions: ', 'magenta')+colored(FILTER_URL))
|
|
402
|
+
|
|
403
|
+
if args.mt:
|
|
404
|
+
write(colored('-mt: ' +str(args.mt.lower()), 'magenta')+colored(' Only retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
|
|
405
|
+
else:
|
|
406
|
+
if args.ft:
|
|
407
|
+
write(colored('-ft: ' +str(args.ft.lower()), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
|
|
408
|
+
else:
|
|
409
|
+
write(colored('MIME Type exclusions: ', 'magenta')+colored(FILTER_MIME)+colored(' Don\'t retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
|
|
410
|
+
|
|
390
411
|
if args.keywords_only and args.keywords_only == '#CONFIG':
|
|
391
412
|
if FILTER_KEYWORDS == '':
|
|
392
413
|
write(colored('Keywords only: ', 'magenta')+colored('It looks like no keywords have been set in config.yml file.','red'))
|
|
@@ -423,7 +444,7 @@ def getConfig():
|
|
|
423
444
|
"""
|
|
424
445
|
Try to get the values from the config file, otherwise use the defaults
|
|
425
446
|
"""
|
|
426
|
-
global FILTER_CODE, FILTER_MIME, FILTER_URL, FILTER_KEYWORDS, URLSCAN_API_KEY, VIRUSTOTAL_API_KEY, CONTINUE_RESPONSES_IF_PIPED, subs, path, waymorePath, inputIsDomainANDPath, HTTP_ADAPTER, HTTP_ADAPTER_CC, argsInput, terminalWidth, MATCH_CODE, WEBHOOK_DISCORD, DEFAULT_OUTPUT_DIR
|
|
447
|
+
global FILTER_CODE, FILTER_MIME, FILTER_URL, FILTER_KEYWORDS, URLSCAN_API_KEY, VIRUSTOTAL_API_KEY, CONTINUE_RESPONSES_IF_PIPED, subs, path, waymorePath, inputIsDomainANDPath, HTTP_ADAPTER, HTTP_ADAPTER_CC, argsInput, terminalWidth, MATCH_CODE, WEBHOOK_DISCORD, DEFAULT_OUTPUT_DIR, MATCH_MIME
|
|
427
448
|
try:
|
|
428
449
|
|
|
429
450
|
# Set terminal width
|
|
@@ -467,7 +488,7 @@ def getConfig():
|
|
|
467
488
|
# Set up an HTTPAdaptor for retry strategy for Common Crawl when making requests
|
|
468
489
|
try:
|
|
469
490
|
retry= Retry(
|
|
470
|
-
total=args.retries+
|
|
491
|
+
total=args.retries+3,
|
|
471
492
|
backoff_factor=1.1,
|
|
472
493
|
status_forcelist=[503],
|
|
473
494
|
raise_on_status=False,
|
|
@@ -505,14 +526,22 @@ def getConfig():
|
|
|
505
526
|
writerr(colored('Unable to read "FILTER_URL" from config.yml - default set', 'red'))
|
|
506
527
|
FILTER_URL = DEFAULT_FILTER_URL
|
|
507
528
|
|
|
508
|
-
try
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
529
|
+
# If the argument -ft was passed, don't try to get from the config
|
|
530
|
+
if args.ft:
|
|
531
|
+
FILTER_MIME = args.ft.lower()
|
|
532
|
+
else:
|
|
533
|
+
try:
|
|
534
|
+
FILTER_MIME = config.get('FILTER_MIME')
|
|
535
|
+
if str(FILTER_MIME) == 'None':
|
|
536
|
+
writerr(colored('No value for "FILTER_MIME" in config.yml - default set', 'yellow'))
|
|
537
|
+
FILTER_MIME = ''
|
|
538
|
+
except Exception as e:
|
|
539
|
+
writerr(colored('Unable to read "FILTER_MIME" from config.yml - default set', 'red'))
|
|
540
|
+
FILTER_MIME = DEFAULT_FILTER_MIME
|
|
541
|
+
|
|
542
|
+
# Set the match MIME types if they were passed
|
|
543
|
+
if args.mt:
|
|
544
|
+
MATCH_MIME = args.mt.lower()
|
|
516
545
|
|
|
517
546
|
# If the argument -fc was passed, don't try to get from the config
|
|
518
547
|
if args.fc:
|
|
@@ -530,7 +559,7 @@ def getConfig():
|
|
|
530
559
|
# Set the match codes if they were passed
|
|
531
560
|
if args.mc:
|
|
532
561
|
MATCH_CODE = args.mc
|
|
533
|
-
|
|
562
|
+
|
|
534
563
|
try:
|
|
535
564
|
URLSCAN_API_KEY = config.get('URLSCAN_API_KEY')
|
|
536
565
|
if str(URLSCAN_API_KEY) == 'None':
|
|
@@ -618,7 +647,9 @@ def getConfig():
|
|
|
618
647
|
# Use defaults if required
|
|
619
648
|
if useDefaults:
|
|
620
649
|
FILTER_URL = DEFAULT_FILTER_URL
|
|
650
|
+
MATCH_MIME = ''
|
|
621
651
|
FILTER_MIME = DEFAULT_FILTER_MIME
|
|
652
|
+
MATCH_CODE = ''
|
|
622
653
|
FILTER_CODE = DEFAULT_FILTER_CODE
|
|
623
654
|
URLSCAN_API_KEY = ''
|
|
624
655
|
VIRUSTOTAL_API_KEY = ''
|
|
@@ -1224,6 +1255,44 @@ def validateArgStatusCodes(x):
|
|
|
1224
1255
|
raise argparse.ArgumentTypeError('Pass HTTP status codes separated by a comma')
|
|
1225
1256
|
return x
|
|
1226
1257
|
|
|
1258
|
+
def validateArgMimeTypes(x):
|
|
1259
|
+
"""
|
|
1260
|
+
Validate the -ft and -mt arguments
|
|
1261
|
+
The passed values will be changed to lower case.
|
|
1262
|
+
Only values matching the regex '[a-z]+\/[a-z0-9\-\+]+' separated by a comma
|
|
1263
|
+
"""
|
|
1264
|
+
invalid = False
|
|
1265
|
+
x = x.lower()
|
|
1266
|
+
mimeTypes = x.split(',')
|
|
1267
|
+
for mimeType in mimeTypes:
|
|
1268
|
+
if not re.fullmatch(r'[a-z]+/[a-z0-9\-\+]+', mimeType):
|
|
1269
|
+
invalid = True
|
|
1270
|
+
break
|
|
1271
|
+
if invalid:
|
|
1272
|
+
raise argparse.ArgumentTypeError('Pass MIME Types separated by a comma, e.g. text/html,text/xml')
|
|
1273
|
+
return x
|
|
1274
|
+
|
|
1275
|
+
def validateArgProviders(x):
|
|
1276
|
+
"""
|
|
1277
|
+
Validate the --providers argument
|
|
1278
|
+
Only the following values in a comma separated list are accepted:
|
|
1279
|
+
- wayback
|
|
1280
|
+
- commoncrawl
|
|
1281
|
+
- otx
|
|
1282
|
+
- urlscan
|
|
1283
|
+
- virustotal
|
|
1284
|
+
"""
|
|
1285
|
+
invalid = False
|
|
1286
|
+
x = x.lower()
|
|
1287
|
+
providers = x.split(',')
|
|
1288
|
+
for provider in providers:
|
|
1289
|
+
if not re.fullmatch(r'(wayback|commoncrawl|otx|urlscan|virustotal)', provider):
|
|
1290
|
+
invalid = True
|
|
1291
|
+
break
|
|
1292
|
+
if invalid:
|
|
1293
|
+
raise argparse.ArgumentTypeError('Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal')
|
|
1294
|
+
return x
|
|
1295
|
+
|
|
1227
1296
|
def processAlienVaultPage(url):
|
|
1228
1297
|
"""
|
|
1229
1298
|
Get URLs from a specific page of otx.alienvault.org API for the input domain
|
|
@@ -1458,7 +1527,6 @@ def processURLScanUrl(url, httpCode, mimeType):
|
|
|
1458
1527
|
addLink = False
|
|
1459
1528
|
|
|
1460
1529
|
# If the user didn't requested -f / --filter-responses-only then check http code
|
|
1461
|
-
# Note we can't check MIME filter because it is not returned by URLScan API
|
|
1462
1530
|
if addLink and not args.filter_responses_only:
|
|
1463
1531
|
|
|
1464
1532
|
# Compare the HTTP code against the Code exclusions and matches
|
|
@@ -1488,13 +1556,18 @@ def processURLScanUrl(url, httpCode, mimeType):
|
|
|
1488
1556
|
|
|
1489
1557
|
# Check the MIME exclusions
|
|
1490
1558
|
if mimeType != '':
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1559
|
+
if MATCH_MIME != '':
|
|
1560
|
+
match = re.search(r'('+re.escape(MATCH_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
|
|
1561
|
+
if match is None:
|
|
1562
|
+
addLink = False
|
|
1494
1563
|
else:
|
|
1495
|
-
|
|
1496
|
-
if
|
|
1497
|
-
|
|
1564
|
+
match = re.search(r'('+re.escape(FILTER_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
|
|
1565
|
+
if match is not None:
|
|
1566
|
+
addLink = False
|
|
1567
|
+
|
|
1568
|
+
# Add MIME Types if --verbose option was selected
|
|
1569
|
+
if verbose():
|
|
1570
|
+
linkMimes.add(mimeType)
|
|
1498
1571
|
|
|
1499
1572
|
# Add link if it passed filters
|
|
1500
1573
|
if addLink:
|
|
@@ -1869,8 +1942,14 @@ def getWaybackUrls():
|
|
|
1869
1942
|
# Write the file of URL's for the passed domain/URL
|
|
1870
1943
|
try:
|
|
1871
1944
|
stopSource = False
|
|
1872
|
-
|
|
1873
|
-
|
|
1945
|
+
|
|
1946
|
+
if MATCH_MIME != '':
|
|
1947
|
+
filterMIME = '&filter=mimetype:' + re.escape(MATCH_MIME).replace(',','|')
|
|
1948
|
+
else:
|
|
1949
|
+
filterMIME = '&filter=!mimetype:warc/revisit|' + re.escape(FILTER_MIME).replace(',','|')
|
|
1950
|
+
# If there are any \+ in the MIME types, e.g. image/svg\+xml (the backslash is because it was previously escaped), then replace the \+ with a . otherwise the wayback API does not recognise it
|
|
1951
|
+
filterMIME = filterMIME.replace('\+','.')
|
|
1952
|
+
|
|
1874
1953
|
if MATCH_CODE != '':
|
|
1875
1954
|
filterCode = '&filter=statuscode:' + re.escape(MATCH_CODE).replace(',','|')
|
|
1876
1955
|
else:
|
|
@@ -1992,9 +2071,13 @@ def processCommonCrawlCollection(cdxApiUrl):
|
|
|
1992
2071
|
|
|
1993
2072
|
if not stopSource:
|
|
1994
2073
|
# Set mime content type filter
|
|
1995
|
-
|
|
1996
|
-
|
|
1997
|
-
filterMIME = filterMIME + re.escape(
|
|
2074
|
+
if MATCH_MIME.strip() != '':
|
|
2075
|
+
filterMIME = '&filter=~mime:('
|
|
2076
|
+
filterMIME = filterMIME + re.escape(MATCH_MIME).replace(',','|')
|
|
2077
|
+
else:
|
|
2078
|
+
filterMIME = '&filter=!~mime:(warc/revisit|'
|
|
2079
|
+
if FILTER_MIME.strip() != '':
|
|
2080
|
+
filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
|
|
1998
2081
|
filterMIME = filterMIME + ')'
|
|
1999
2082
|
|
|
2000
2083
|
# Set status code filter
|
|
@@ -2186,9 +2269,13 @@ def getCommonCrawlUrls():
|
|
|
2186
2269
|
originalLinkCount = len(linksFound)
|
|
2187
2270
|
|
|
2188
2271
|
# Set mime content type filter
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
filterMIME = filterMIME + re.escape(
|
|
2272
|
+
if MATCH_MIME.strip() != '':
|
|
2273
|
+
filterMIME = '&filter=~mime:('
|
|
2274
|
+
filterMIME = filterMIME + re.escape(MATCH_MIME).replace(',','|')
|
|
2275
|
+
else:
|
|
2276
|
+
filterMIME = '&filter=!~mime:(warc/revisit|'
|
|
2277
|
+
if FILTER_MIME.strip() != '':
|
|
2278
|
+
filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
|
|
2192
2279
|
filterMIME = filterMIME + ')'
|
|
2193
2280
|
|
|
2194
2281
|
# Set status code filter
|
|
@@ -2211,32 +2298,34 @@ def getCommonCrawlUrls():
|
|
|
2211
2298
|
# Get the Common Crawl index collections
|
|
2212
2299
|
cdxApiUrls = getCommonCrawlIndexes()
|
|
2213
2300
|
|
|
2214
|
-
|
|
2215
|
-
|
|
2216
|
-
|
|
2301
|
+
# If there were URLs returned then continue
|
|
2302
|
+
if cdxApiUrls:
|
|
2303
|
+
if args.check_only:
|
|
2304
|
+
if args.lcc < len(cdxApiUrls):
|
|
2305
|
+
checkCommonCrawl = args.lcc+1
|
|
2306
|
+
else:
|
|
2307
|
+
checkCommonCrawl = len(cdxApiUrls)+1
|
|
2308
|
+
write(colored('Get URLs from Common Crawl: ','cyan')+colored(str(checkCommonCrawl)+' requests','white'))
|
|
2217
2309
|
else:
|
|
2218
|
-
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
|
|
2310
|
+
write(colored('\rGetting links from the latest ' + str(len(cdxApiUrls)) + ' commoncrawl.org index collections (this can take a while for some domains)...\r','cyan'))
|
|
2311
|
+
|
|
2312
|
+
# Process the URLs from common crawl
|
|
2313
|
+
if stopProgram is None:
|
|
2314
|
+
p = mp.Pool(args.processes)
|
|
2315
|
+
p.map(processCommonCrawlCollection, cdxApiUrls)
|
|
2316
|
+
p.close()
|
|
2317
|
+
p.join()
|
|
2318
|
+
|
|
2319
|
+
# Show the MIME types found (in case user wants to exclude more)
|
|
2320
|
+
if verbose() and len(linkMimes) > 0:
|
|
2321
|
+
linkMimes.discard('warc/revisit')
|
|
2322
|
+
write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
|
|
2222
2323
|
|
|
2223
|
-
|
|
2224
|
-
|
|
2225
|
-
|
|
2226
|
-
|
|
2227
|
-
|
|
2228
|
-
p.join()
|
|
2229
|
-
|
|
2230
|
-
# Show the MIME types found (in case user wants to exclude more)
|
|
2231
|
-
if verbose() and len(linkMimes) > 0:
|
|
2232
|
-
linkMimes.discard('warc/revisit')
|
|
2233
|
-
write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
|
|
2234
|
-
|
|
2235
|
-
linkCount = len(linksFound) - originalLinkCount
|
|
2236
|
-
if args.xwm:
|
|
2237
|
-
write(getSPACER(colored('Links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
2238
|
-
else:
|
|
2239
|
-
write(getSPACER(colored('Extra links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
2324
|
+
linkCount = len(linksFound) - originalLinkCount
|
|
2325
|
+
if args.xwm:
|
|
2326
|
+
write(getSPACER(colored('Links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
2327
|
+
else:
|
|
2328
|
+
write(getSPACER(colored('Extra links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
2240
2329
|
|
|
2241
2330
|
except Exception as e:
|
|
2242
2331
|
writerr(colored('ERROR getCommonCrawlUrls 1: ' + str(e), 'red'))
|
|
@@ -2478,8 +2567,11 @@ def processResponses():
|
|
|
2478
2567
|
linksFound = set()
|
|
2479
2568
|
|
|
2480
2569
|
# Set mime content type filter
|
|
2481
|
-
filterMIME = '
|
|
2482
|
-
if
|
|
2570
|
+
filterMIME = ''
|
|
2571
|
+
if MATCH_MIME.strip() != '':
|
|
2572
|
+
filterMIME = '&filter=mimetype:' + re.escape(MATCH_MIME).replace(',','|')
|
|
2573
|
+
else:
|
|
2574
|
+
filterMIME = '&filter=!mimetype:warc/revisit'
|
|
2483
2575
|
filterMIME = filterMIME + '|' + re.escape(FILTER_MIME).replace(',','|')
|
|
2484
2576
|
|
|
2485
2577
|
# Set status code filter
|
|
@@ -2949,12 +3041,24 @@ def main():
|
|
|
2949
3041
|
help='Filter HTTP status codes for retrieved URLs and responses. Comma separated list of codes (default: the FILTER_CODE values from config.yml). Passing this argument will override the value from config.yml',
|
|
2950
3042
|
type=validateArgStatusCodes,
|
|
2951
3043
|
)
|
|
3044
|
+
parser.add_argument(
|
|
3045
|
+
'-ft',
|
|
3046
|
+
action='store',
|
|
3047
|
+
help='Filter MIME Types for retrieved URLs and responses. Comma separated list of MIME Types (default: the FILTER_MIME values from config.yml). Passing this argument will override the value from config.yml. NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.',
|
|
3048
|
+
type=validateArgMimeTypes,
|
|
3049
|
+
)
|
|
2952
3050
|
parser.add_argument(
|
|
2953
3051
|
'-mc',
|
|
2954
3052
|
action='store',
|
|
2955
3053
|
help='Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config FILTER_CODE and -fc.',
|
|
2956
3054
|
type=validateArgStatusCodes,
|
|
2957
3055
|
)
|
|
3056
|
+
parser.add_argument(
|
|
3057
|
+
'-mt',
|
|
3058
|
+
action='store',
|
|
3059
|
+
help='Only MIME Types for retrieved URLs and responses. Comma separated list of MIME types. Passing this argument overrides the config FILTER_MIME and -ft. NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.',
|
|
3060
|
+
type=validateArgMimeTypes,
|
|
3061
|
+
)
|
|
2958
3062
|
parser.add_argument(
|
|
2959
3063
|
'-l',
|
|
2960
3064
|
'--limit',
|
|
@@ -3030,11 +3134,19 @@ def main():
|
|
|
3030
3134
|
help='Exclude checks for links from virustotal.com',
|
|
3031
3135
|
default=False
|
|
3032
3136
|
)
|
|
3137
|
+
parser.add_argument(
|
|
3138
|
+
'--providers',
|
|
3139
|
+
action='store',
|
|
3140
|
+
help='A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan and virustotal. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.',
|
|
3141
|
+
default=[],
|
|
3142
|
+
type=validateArgProviders,
|
|
3143
|
+
metavar='{wayback,commoncrawl,otx,urlscan,virustotal}'
|
|
3144
|
+
)
|
|
3033
3145
|
parser.add_argument(
|
|
3034
3146
|
'-lcc',
|
|
3035
3147
|
action='store',
|
|
3036
3148
|
type=int,
|
|
3037
|
-
help='Limit the number of Common Crawl index collections searched, e.g. \'-lcc 10\' will just search the latest 10 collections (default:
|
|
3149
|
+
help='Limit the number of Common Crawl index collections searched, e.g. \'-lcc 10\' will just search the latest 10 collections (default: 1). As of November 2024 there are currently 106 collections. Setting to 0 (default) will search ALL collections. If you don\'t want to search Common Crawl at all, use the -xcc option.'
|
|
3038
3150
|
)
|
|
3039
3151
|
parser.add_argument(
|
|
3040
3152
|
'-lcy',
|
|
@@ -3153,13 +3265,36 @@ def main():
|
|
|
3153
3265
|
write(colored('Waymore - v' + __version__,'cyan'))
|
|
3154
3266
|
sys.exit()
|
|
3155
3267
|
|
|
3156
|
-
# If -lcc wasn't passed then set to the default of
|
|
3268
|
+
# If -lcc wasn't passed then set to the default of 1 if -lcy is 0. This will make them work together
|
|
3157
3269
|
if args.lcc is None:
|
|
3158
3270
|
if args.lcy == 0:
|
|
3159
|
-
args.lcc =
|
|
3271
|
+
args.lcc = 1
|
|
3160
3272
|
else:
|
|
3161
3273
|
args.lcc = 0
|
|
3162
3274
|
|
|
3275
|
+
# If --providers was passed, then manually set the exclude arguments;
|
|
3276
|
+
if args.providers:
|
|
3277
|
+
if 'wayback' not in args.providers:
|
|
3278
|
+
args.xwm = True
|
|
3279
|
+
else:
|
|
3280
|
+
args.xwm = False
|
|
3281
|
+
if 'commoncrawl' not in args.providers:
|
|
3282
|
+
args.xcc = True
|
|
3283
|
+
else:
|
|
3284
|
+
args.xcc = False
|
|
3285
|
+
if 'otx' not in args.providers:
|
|
3286
|
+
args.xav = True
|
|
3287
|
+
else:
|
|
3288
|
+
args.xav = False
|
|
3289
|
+
if 'urlscan' not in args.providers:
|
|
3290
|
+
args.xus = True
|
|
3291
|
+
else:
|
|
3292
|
+
args.xus = False
|
|
3293
|
+
if 'virustotal' not in args.providers:
|
|
3294
|
+
args.xvt = True
|
|
3295
|
+
else:
|
|
3296
|
+
args.xvt = False
|
|
3297
|
+
|
|
3163
3298
|
# If no input was given, raise an error
|
|
3164
3299
|
if sys.stdin.isatty():
|
|
3165
3300
|
if args.input is None:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: waymore
|
|
3
|
-
Version: 4.5
|
|
3
|
+
Version: 4.6
|
|
4
4
|
Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
|
|
5
5
|
Home-page: https://github.com/xnl-h4ck3r/waymore
|
|
6
6
|
Author: @xnl-h4ck3r
|
|
@@ -15,7 +15,7 @@ Requires-Dist: tldextract
|
|
|
15
15
|
|
|
16
16
|
<center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
|
|
17
17
|
|
|
18
|
-
## About - v4.5
|
|
18
|
+
## About - v4.6
|
|
19
19
|
|
|
20
20
|
The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
|
|
21
21
|
|
|
@@ -83,7 +83,9 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
|
|
|
83
83
|
| -n | --no-subs | Don't include subdomains of the target domain (only used if input is not a domain with a specific path). |
|
|
84
84
|
| -f | --filter-responses-only | The initial links from sources will not be filtered, only the responses that are downloaded, e.g. it maybe useful to still see all available paths from the links, even if you don't want to check the content. |
|
|
85
85
|
| -fc | | Filter HTTP status codes for retrieved URLs and responses. Comma separated list of codes (default: the `FILTER_CODE` values from `config.yml`). Passing this argument will override the value from `config.yml` |
|
|
86
|
+
| -ft | | Filter MIME Types for retrieved URLs and responses. Comma separated list of MIME Types (default: the `FILTER_MIME` values from `config.yml`). Passing this argument will override the value from `config.yml`. **NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.** |
|
|
86
87
|
| -mc | | Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config `FILTER_CODE` and `-fc`. |
|
|
88
|
+
| -mt | | Only Match MIME Types for retrieved URLs and responses. Comma separated list of MIME types. Passing this argument overrides the config `FILTER_MIME` and `-ft`. **NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.** |
|
|
87
89
|
| -l | --limit | How many responses will be saved (if `-mode R` or `-mode B` is passed). A positive value will get the **first N** results, a negative value will get the **last N** results. A value of 0 will get **ALL** responses (default: 5000) |
|
|
88
90
|
| -from | --from-date | What date to get responses from. If not specified it will get from the earliest possible results. A partial value can be passed, e.g. `2016`, `201805`, etc. |
|
|
89
91
|
| -to | --to-date | What date to get responses to. If not specified it will get to the latest possible results. A partial value can be passed, e.g. `2021`, `202112`, etc. |
|
|
@@ -95,7 +97,7 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
|
|
|
95
97
|
| -xav | | Exclude checks for links from alienvault.com |
|
|
96
98
|
| -xus | | Exclude checks for links from urlscan.io |
|
|
97
99
|
| -xvt | | Exclude checks for links from virustotal.com |
|
|
98
|
-
| -lcc | | Limit the number of Common Crawl index collections searched, e.g. `-lcc 10` will just search the latest `10` collections (default:
|
|
100
|
+
| -lcc | | Limit the number of Common Crawl index collections searched, e.g. `-lcc 10` will just search the latest `10` collections (default: 1). As of November 2024 there are currently 106 collections. Setting to `0` will search **ALL** collections. If you don't want to search Common Crawl at all, use the `-xcc` option. |
|
|
99
101
|
| -lcy | | Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections of any year (but in conjunction with `-lcc`). For example, if you are only interested in data from 2015 and after, pass `-lcy 2015`. This will override the value of `-lcc` if passed. If you don't want to search Common Crawl at all, use the `-xcc` option. |
|
|
100
102
|
| -t | --timeout | This is for archived responses only! How many seconds to wait for the server to send data before giving up (default: 30) |
|
|
101
103
|
| -p | --processes | Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1) |
|
|
@@ -154,8 +156,8 @@ If the input is just a domain, e.g. `redbull.com` then the `-mode` defaults to `
|
|
|
154
156
|
|
|
155
157
|
The `config.yml` file (typically in `~/.config/waymore/`) has values that can be updated to suit your needs. Filters are all provided as comma separated lists:
|
|
156
158
|
|
|
157
|
-
- `FILTER_CODE` - Exclusions used to exclude responses we will try to get from web.archive.org, and also for file names when `-i` is a directory, e.g. `301,302`. This can be overridden with the `-fc` argument. Passing the `-mc` (to match status codes instead of filter) will override any value in `FILTER_CODE` or `-fc
|
|
158
|
-
- `FILTER_MIME` - MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API, e.g. `'text/css,image/jpeg`
|
|
159
|
+
- `FILTER_CODE` - Exclusions used to exclude responses we will try to get from web.archive.org, and also for file names when `-i` is a directory, e.g. `301,302`. This can be overridden with the `-fc` argument. Passing the `-mc` (to match status codes instead of filter) will override any value in `FILTER_CODE` or `-fc`.
|
|
160
|
+
- `FILTER_MIME` - MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API, e.g. `text/css,image/jpeg`. This can be overridden with the `-ft` argument. Passing the `-mt` (to match MIME types instead of filter) will override any value in `FILTER_MIME` or `-ft`.
|
|
159
161
|
- `FILTER_URL` - Response code exclusions we will use to filter links and responses from web.archive.org through their API, e.g. `.css,.jpg`
|
|
160
162
|
- `FILTER_KEYWORDS` - Only links and responses will be returned that contain the specified keywords if the `-ko`/`--keywords-only` argument is passed (without providing an explicit value on the command line), e.g. `admin,portal`
|
|
161
163
|
- `URLSCAN_API_KEY` - You can sign up to [urlscan.io](https://urlscan.io/user/signup) to get a **FREE** API key (there are also paid subscriptions available). It is recommended you get a key and put it into the config file so that you can get more back (and quicker) from their API. NOTE: You will get rate limited unless you have a full paid subscription.
|
|
@@ -163,7 +165,7 @@ The `config.yml` file (typically in `~/.config/waymore/`) have values that can b
|
|
|
163
165
|
- `WEBHOOK_DISCORD` - If the `--notify-discord` argument is passed, `knoxnl` will send a notification to this Discord webhook when a successful XSS is found.
|
|
164
166
|
- `DEFAULT_OUTPUT_DIR` - This is the default location of any output files written if the `-oU` and `-oR` arguments are not used. If the value of this key is blank, then it will default to the location of the `config.yml` file.
|
|
165
167
|
|
|
166
|
-
**NOTE: The MIME types cannot be filtered for Alien Vault
|
|
168
|
+
**NOTE: The MIME types cannot be filtered for Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined for a URL. In these cases, URLs will be included regardless of filter or match. Bear this in mind and consider excluding certain providers if this is important.**
|
|
167
169
|
|
|
168
170
|
## Output
|
|
169
171
|
|
|
@@ -201,6 +203,8 @@ The archive.org Wayback Machine CDX API can sometimes can sometimes require a hu
|
|
|
201
203
|
|
|
202
204
|
There is also a problem with the Wayback Machine CDX API where the number of pages returned is not correct when filters are applied and can cause issues (see https://github.com/internetarchive/wayback/issues/243). Until that issue is resolved, setting the `-lr` argument to a sensible value can help with that problem in the short term.
|
|
203
205
|
|
|
206
|
+
The Common Crawl API has had a lot of issues for a long time. Including this source could make waymore take a lot longer to run and may not yield any extra results. You can check if there is an issue by visiting http://index.commoncrawl.org/collinfo.json and seeing if this is successful. Consider excluding Common Crawl altogether using the `--providers` argument and not including `commoncrawl`, or using the `-xcc` argument.
|
|
207
|
+
|
|
204
208
|
**The provider API servers aren't designed to cope with huge volumes, so be sensible and considerate about what you hit them with!**
|
|
205
209
|
|
|
206
210
|
When downloading archived responses, this can take a long time and can sometimes be killed by the machine for some reason, or manually killed by the user.
|
|
@@ -218,7 +222,7 @@ The URLs are saved in the same path as `config.yml` (typically `~/.config/waymor
|
|
|
218
222
|
|
|
219
223
|
### Example 2
|
|
220
224
|
|
|
221
|
-
Get ALL the URLs from Wayback for `redbull.com` (no filters are applied in `mode U` with `-f`, and no URLs are retrieved from Commone Crawl, Alien Vault, URLScan and Virus Total, because `-xcc`, `-xav`, `-xus`, `-xvt` are passed respectively).
|
|
225
|
+
Get ALL the URLs from Wayback for `redbull.com` (no filters are applied in `mode U` with `-f`, and no URLs are retrieved from Common Crawl, Alien Vault, URLScan and Virus Total, because `-xcc`, `-xav`, `-xus`, `-xvt` are passed respectively. This can also be achieved by passing `--providers wayback` instead of the exclude arguments).
|
|
222
226
|
Save the FIRST 200 responses that are found starting from 2022 (`-l 200 -from 2022`):
|
|
223
227
|
|
|
224
228
|
<center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/example2.png"></center>
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
waymore/__init__.py,sha256=nBVFOoDYjRXcFurm_7co-GNVg6LUhzeZpudhmYNojHw,17
|
|
2
|
+
waymore/waymore.py,sha256=FhSRlLoK9DBGojEX89rMQdZ-bEacPSxJg2BJwQfUJGA,177093
|
|
3
|
+
waymore-4.6.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
|
|
4
|
+
waymore-4.6.dist-info/METADATA,sha256=oQMMrr_MbK_QPmShWY-TrHf-bxOg9dtjdUK76QE29H8,49511
|
|
5
|
+
waymore-4.6.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
6
|
+
waymore-4.6.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
|
|
7
|
+
waymore-4.6.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
|
|
8
|
+
waymore-4.6.dist-info/RECORD,,
|
waymore-4.5.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
waymore/__init__.py,sha256=HpBSj4W3_snlRrPgOuCuVP107OOZenaFQECvPnsC9V4,17
|
|
2
|
-
waymore/waymore.py,sha256=kfGA3T_cDADuhZ_78Ta22fxnqlGVUm56yvIncEfnZDs,170779
|
|
3
|
-
waymore-4.5.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
|
|
4
|
-
waymore-4.5.dist-info/METADATA,sha256=v57_NUSUTSGqA1fZQb9UQgFUoOIDcc9AQeDZbRyL7kk,47221
|
|
5
|
-
waymore-4.5.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
|
|
6
|
-
waymore-4.5.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
|
|
7
|
-
waymore-4.5.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
|
|
8
|
-
waymore-4.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|