waymore 4.4__py3-none-any.whl → 4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__="4.4"
1
+ __version__="4.6"
waymore/waymore.py CHANGED
@@ -136,6 +136,7 @@ DEFAULT_FILTER_KEYWORDS = 'admin,login,logon,signin,signup,register,registration
136
136
  # Yaml config values
137
137
  FILTER_URL = ''
138
138
  FILTER_MIME = ''
139
+ MATCH_MIME = ''
139
140
  FILTER_CODE = ''
140
141
  MATCH_CODE = ''
141
142
  FILTER_KEYWORDS = ''
@@ -313,8 +314,21 @@ def showOptions():
313
314
  else:
314
315
  write(colored('-n: ' +str(args.no_subs), 'magenta')+colored(' Sub domains are included in the search.','white'))
315
316
 
316
- write(colored('-xwm: ' +str(args.xwm), 'magenta')+colored(' Whether to exclude checks for links from Wayback Machine (archive.org)','white'))
317
- write(colored('-xcc: ' +str(args.xcc), 'magenta')+colored(' Whether to exclude checks for links from commoncrawl.org','white'))
317
+ providers = ''
318
+ if not args.xwm:
319
+ providers = providers + 'Wayback, '
320
+ if not args.xcc:
321
+ providers = providers + 'CommonCrawl, '
322
+ if not args.xav:
323
+ providers = providers + 'Alien Vault OTX, '
324
+ if not args.xus:
325
+ providers = providers + 'URLScan, '
326
+ if not args.xvt:
327
+ providers = providers + 'VirusTotal, '
328
+ if providers == '':
329
+ providers = 'None'
330
+ write(colored('Providers: ' +str(providers.strip(', ')), 'magenta')+colored(' Which providers to check for URLs.','white'))
331
+
318
332
  if not args.xcc:
319
333
  if args.lcc ==0 and args.lcy == 0:
320
334
  write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' Search ALL Common Crawl index collections.','white'))
@@ -325,13 +339,12 @@ def showOptions():
325
339
  if args.lcc != 0:
326
340
  write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' The number of latest Common Crawl index collections to be searched.','white'))
327
341
  write(colored('-lcy: ' +str(args.lcy), 'magenta')+colored(' Search all Common Crawl index collections with data from year '+str(args.lcy)+' and after.','white'))
328
- write(colored('-xav: ' +str(args.xav), 'magenta')+colored(' Whether to exclude checks for links from alienvault.com','white'))
329
- write(colored('-xus: ' +str(args.xus), 'magenta')+colored(' Whether to exclude checks for links from urlscan.io','white'))
342
+
330
343
  if URLSCAN_API_KEY == '':
331
344
  write(colored('URLScan API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://urlscan.io/user/signup which will let you get more back, and quicker.','white'))
332
345
  else:
333
346
  write(colored('URLScan API Key: ', 'magenta')+colored(URLSCAN_API_KEY))
334
- write(colored('-xvt: ' +str(args.xvt), 'magenta')+colored(' Whether to exclude checks for links from virustotal.com','white'))
347
+
335
348
  if VIRUSTOTAL_API_KEY == '':
336
349
  write(colored('VirusTotal API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://www.virustotal.com/gui/join-us which will let you get some extra URLs.','white'))
337
350
  else:
@@ -382,11 +395,19 @@ def showOptions():
382
395
  write(colored('-mc: ' +str(args.mc), 'magenta')+colored(' Only retrieve URLs and Responses that match these HTTP Status codes.','white'))
383
396
  else:
384
397
  if args.fc:
385
- write(colored('-fc: ' +str(args.mc), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these HTTP Status codes.','white'))
386
- write(colored('MIME Type exclusions: ', 'magenta')+colored(FILTER_MIME))
398
+ write(colored('-fc: ' +str(args.fc), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these HTTP Status codes.','white'))
387
399
  if not args.mc and args.fc:
388
400
  write(colored('Response Code exclusions: ', 'magenta')+colored(FILTER_CODE))
389
401
  write(colored('Response URL exclusions: ', 'magenta')+colored(FILTER_URL))
402
+
403
+ if args.mt:
404
+ write(colored('-mt: ' +str(args.mt.lower()), 'magenta')+colored(' Only retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
405
+ else:
406
+ if args.ft:
407
+ write(colored('-ft: ' +str(args.ft.lower()), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
408
+ else:
409
+ write(colored('MIME Type exclusions: ', 'magenta')+colored(FILTER_MIME)+colored(' Don\'t retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
410
+
390
411
  if args.keywords_only and args.keywords_only == '#CONFIG':
391
412
  if FILTER_KEYWORDS == '':
392
413
  write(colored('Keywords only: ', 'magenta')+colored('It looks like no keywords have been set in config.yml file.','red'))
@@ -423,7 +444,7 @@ def getConfig():
423
444
  """
424
445
  Try to get the values from the config file, otherwise use the defaults
425
446
  """
426
- global FILTER_CODE, FILTER_MIME, FILTER_URL, FILTER_KEYWORDS, URLSCAN_API_KEY, VIRUSTOTAL_API_KEY, CONTINUE_RESPONSES_IF_PIPED, subs, path, waymorePath, inputIsDomainANDPath, HTTP_ADAPTER, HTTP_ADAPTER_CC, argsInput, terminalWidth, MATCH_CODE, WEBHOOK_DISCORD, DEFAULT_OUTPUT_DIR
447
+ global FILTER_CODE, FILTER_MIME, FILTER_URL, FILTER_KEYWORDS, URLSCAN_API_KEY, VIRUSTOTAL_API_KEY, CONTINUE_RESPONSES_IF_PIPED, subs, path, waymorePath, inputIsDomainANDPath, HTTP_ADAPTER, HTTP_ADAPTER_CC, argsInput, terminalWidth, MATCH_CODE, WEBHOOK_DISCORD, DEFAULT_OUTPUT_DIR, MATCH_MIME
427
448
  try:
428
449
 
429
450
  # Set terminal width
@@ -467,7 +488,7 @@ def getConfig():
467
488
  # Set up an HTTPAdaptor for retry strategy for Common Crawl when making requests
468
489
  try:
469
490
  retry= Retry(
470
- total=args.retries+20,
491
+ total=args.retries+3,
471
492
  backoff_factor=1.1,
472
493
  status_forcelist=[503],
473
494
  raise_on_status=False,
@@ -505,14 +526,22 @@ def getConfig():
505
526
  writerr(colored('Unable to read "FILTER_URL" from config.yml - default set', 'red'))
506
527
  FILTER_URL = DEFAULT_FILTER_URL
507
528
 
508
- try:
509
- FILTER_MIME = config.get('FILTER_MIME')
510
- if str(FILTER_MIME) == 'None':
511
- writerr(colored('No value for "FILTER_MIME" in config.yml - default set', 'yellow'))
512
- FILTER_MIME = ''
513
- except Exception as e:
514
- writerr(colored('Unable to read "FILTER_MIME" from config.yml - default set', 'red'))
515
- FILTER_MIME = DEFAULT_FILTER_MIME
529
+ # If the argument -ft was passed, don't try to get from the config
530
+ if args.ft:
531
+ FILTER_MIME = args.ft.lower()
532
+ else:
533
+ try:
534
+ FILTER_MIME = config.get('FILTER_MIME')
535
+ if str(FILTER_MIME) == 'None':
536
+ writerr(colored('No value for "FILTER_MIME" in config.yml - default set', 'yellow'))
537
+ FILTER_MIME = ''
538
+ except Exception as e:
539
+ writerr(colored('Unable to read "FILTER_MIME" from config.yml - default set', 'red'))
540
+ FILTER_MIME = DEFAULT_FILTER_MIME
541
+
542
+ # Set the match codes if they were passed
543
+ if args.mt:
544
+ MATCH_MIME = args.mt.lower()
516
545
 
517
546
  # If the argument -fc was passed, don't try to get from the config
518
547
  if args.fc:
@@ -530,7 +559,7 @@ def getConfig():
530
559
  # Set the match codes if they were passed
531
560
  if args.mc:
532
561
  MATCH_CODE = args.mc
533
-
562
+
534
563
  try:
535
564
  URLSCAN_API_KEY = config.get('URLSCAN_API_KEY')
536
565
  if str(URLSCAN_API_KEY) == 'None':
@@ -618,7 +647,9 @@ def getConfig():
618
647
  # Use defaults if required
619
648
  if useDefaults:
620
649
  FILTER_URL = DEFAULT_FILTER_URL
650
+ MATCH_MIME = ''
621
651
  FILTER_MIME = DEFAULT_FILTER_MIME
652
+ MATCH_CODE = ''
622
653
  FILTER_CODE = DEFAULT_FILTER_CODE
623
654
  URLSCAN_API_KEY = ''
624
655
  VIRUSTOTAL_API_KEY = ''
@@ -1224,6 +1255,44 @@ def validateArgStatusCodes(x):
1224
1255
  raise argparse.ArgumentTypeError('Pass HTTP status codes separated by a comma')
1225
1256
  return x
1226
1257
 
1258
+ def validateArgMimeTypes(x):
1259
+ """
1260
+ Validate the -ft and -mt arguments
1261
+ The passed values will be changed to lower case.
1262
+ Only values matching the regex '[a-z]+\/[a-z0-9\-\+]+' separated by a comma
1263
+ """
1264
+ invalid = False
1265
+ x = x.lower()
1266
+ mimeTypes = x.split(',')
1267
+ for mimeType in mimeTypes:
1268
+ if not re.fullmatch(r'[a-z]+/[a-z0-9\-\+]+', mimeType):
1269
+ invalid = True
1270
+ break
1271
+ if invalid:
1272
+ raise argparse.ArgumentTypeError('Pass MIME Types separated by a comma, e.g. text/html,text/xml')
1273
+ return x
1274
+
1275
+ def validateArgProviders(x):
1276
+ """
1277
+ Validate the --providers argument
1278
+ Only the following values in a comma separated list are accepted:
1279
+ - wayback
1280
+ - commoncrawl
1281
+ - otx
1282
+ - urlscan
1283
+ - virustotal
1284
+ """
1285
+ invalid = False
1286
+ x = x.lower()
1287
+ providers = x.split(',')
1288
+ for provider in providers:
1289
+ if not re.fullmatch(r'(wayback|commoncrawl|otx|urlscan|virustotal)', provider):
1290
+ invalid = True
1291
+ break
1292
+ if invalid:
1293
+ raise argparse.ArgumentTypeError('Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal')
1294
+ return x
1295
+
1227
1296
  def processAlienVaultPage(url):
1228
1297
  """
1229
1298
  Get URLs from a specific page of otx.alienvault.org API for the input domain
@@ -1384,11 +1453,15 @@ def getAlienVaultUrls():
1384
1453
  # Carry on if something was found
1385
1454
  if resp.text.lower().find('"error": "') < 0:
1386
1455
 
1387
- # Get the JSON response
1388
- jsonResp = json.loads(resp.text.strip())
1389
-
1390
- # Try to get the number of results
1391
- totalUrls = jsonResp['full_size']
1456
+ try:
1457
+ # Get the JSON response
1458
+ jsonResp = json.loads(resp.text.strip())
1459
+
1460
+ # Try to get the number of results
1461
+ totalUrls = int(jsonResp['full_size'])
1462
+ except:
1463
+ writerr(colored(getSPACER('[ ERR ] There was an unexpected response from the Alien Vault API'),'red'))
1464
+ totalUrls = 0
1392
1465
 
1393
1466
  # If there are results, carry on
1394
1467
  if totalUrls > 0 or args.check_only:
@@ -1454,7 +1527,6 @@ def processURLScanUrl(url, httpCode, mimeType):
1454
1527
  addLink = False
1455
1528
 
1456
1529
  # If the user didn't request -f / --filter-responses-only then check http code
1457
- # Note we can't check MIME filter because it is not returned by URLScan API
1458
1530
  if addLink and not args.filter_responses_only:
1459
1531
 
1460
1532
  # Compare the HTTP code against the Code exclusions and matches
@@ -1484,13 +1556,18 @@ def processURLScanUrl(url, httpCode, mimeType):
1484
1556
 
1485
1557
  # Check the MIME exclusions
1486
1558
  if mimeType != '':
1487
- match = re.search(r'('+re.escape(FILTER_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
1488
- if match is not None:
1489
- addLink = False
1559
+ if MATCH_MIME != '':
1560
+ match = re.search(r'('+re.escape(MATCH_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
1561
+ if match is None:
1562
+ addLink = False
1490
1563
  else:
1491
- # Add MIME Types if --verbose option was selected
1492
- if verbose():
1493
- linkMimes.add(mimeType)
1564
+ match = re.search(r'('+re.escape(FILTER_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
1565
+ if match is not None:
1566
+ addLink = False
1567
+
1568
+ # Add MIME Types if --verbose option was selected
1569
+ if verbose():
1570
+ linkMimes.add(mimeType)
1494
1571
 
1495
1572
  # Add link if it passed filters
1496
1573
  if addLink:
@@ -1588,19 +1665,28 @@ def getURLScanUrls():
1588
1665
  writerr(colored(getSPACER('[ ' + str(resp.status_code) + ' ] Unable to get links from urlscan.io'),'red'))
1589
1666
  return
1590
1667
 
1591
- # Get the JSON response
1592
- jsonResp = json.loads(resp.text.strip())
1668
+ try:
1669
+ # Get the JSON response
1670
+ jsonResp = json.loads(resp.text.strip())
1593
1671
 
1594
- # Get the number of results
1595
- totalUrls = jsonResp['total']
1672
+ # Get the number of results
1673
+ totalUrls = int(jsonResp['total'])
1674
+ except:
1675
+ writerr(colored(getSPACER('[ ERR ] There was an unexpected response from the URLScan API'),'red'))
1676
+ totalUrls = 0
1596
1677
 
1678
+ # Carry on if something was found
1597
1679
  if args.check_only:
1598
- hasMore = jsonResp['has_more']
1599
- if hasMore:
1600
- write(colored('Get URLs from URLScan: ','cyan')+colored('UNKNOWN requests','white'))
1601
- else:
1602
- write(colored('Get URLs from URLScan: ','cyan')+colored('1 request','white'))
1680
+ try:
1681
+ hasMore = jsonResp['has_more']
1682
+ if hasMore:
1683
+ write(colored('Get URLs from URLScan: ','cyan')+colored('UNKNOWN requests','white'))
1684
+ else:
1685
+ write(colored('Get URLs from URLScan: ','cyan')+colored('1 request','white'))
1686
+ except:
1687
+ pass
1603
1688
  checkURLScan = 1
1689
+
1604
1690
  else:
1605
1691
  # Carry on if something was found
1606
1692
  if int(totalUrls) > 0:
@@ -1746,6 +1832,7 @@ def processWayBackPage(url):
1746
1832
  if not stopSource:
1747
1833
  try:
1748
1834
  # Choose a random user agent string to use for any requests
1835
+ resp = None
1749
1836
  userAgent = random.choice(USER_AGENT)
1750
1837
  page = url.split('page=')[1]
1751
1838
  session = requests.Session()
@@ -1817,8 +1904,11 @@ def processWayBackPage(url):
1817
1904
  results = line.decode("utf-8")
1818
1905
  foundUrl = fixArchiveOrgUrl(str(results).split(' ')[1])
1819
1906
 
1820
- # Check the URL exclusions
1821
- match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
1907
+ # If --filter-responses-only wasn't used, then check the URL exclusions
1908
+ if args.filter_responses_only:
1909
+ match = None
1910
+ else:
1911
+ match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
1822
1912
  if match is None:
1823
1913
  # Only get MIME Types if --verbose option was selected
1824
1914
  if verbose():
@@ -1852,8 +1942,14 @@ def getWaybackUrls():
1852
1942
  # Write the file of URL's for the passed domain/URL
1853
1943
  try:
1854
1944
  stopSource = False
1855
- # If there any + in the MIME types, e.g. image/svg+xml, then replace the + with a . otherwise the wayback API does not recognise it
1856
- filterMIME = '&filter=!mimetype:warc/revisit|' + re.escape(FILTER_MIME).replace(',','|').replace('+','.')
1945
+
1946
+ if MATCH_MIME != '':
1947
+ filterMIME = '&filter=mimetype:' + re.escape(MATCH_MIME).replace(',','|')
1948
+ else:
1949
+ filterMIME = '&filter=!mimetype:warc/revisit|' + re.escape(FILTER_MIME).replace(',','|')
1950
+ # If there are any \+ in the MIME types, e.g. image/svg\+xml (the backslash is because it was previously escaped), then replace the \+ with a . otherwise the wayback API does not recognise it
1951
+ filterMIME = filterMIME.replace('\+','.')
1952
+
1857
1953
  if MATCH_CODE != '':
1858
1954
  filterCode = '&filter=statuscode:' + re.escape(MATCH_CODE).replace(',','|')
1859
1955
  else:
@@ -1975,9 +2071,13 @@ def processCommonCrawlCollection(cdxApiUrl):
1975
2071
 
1976
2072
  if not stopSource:
1977
2073
  # Set mime content type filter
1978
- filterMIME = '&filter=!~mime:(warc/revisit|'
1979
- if FILTER_MIME.strip() != '':
1980
- filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
2074
+ if MATCH_MIME.strip() != '':
2075
+ filterMIME = '&filter=~mime:('
2076
+ filterMIME = filterMIME + re.escape(MATCH_MIME).replace(',','|')
2077
+ else:
2078
+ filterMIME = '&filter=!~mime:(warc/revisit|'
2079
+ if FILTER_MIME.strip() != '':
2080
+ filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
1981
2081
  filterMIME = filterMIME + ')'
1982
2082
 
1983
2083
  # Set status code filter
@@ -2169,9 +2269,13 @@ def getCommonCrawlUrls():
2169
2269
  originalLinkCount = len(linksFound)
2170
2270
 
2171
2271
  # Set mime content type filter
2172
- filterMIME = '&filter=!~mime:(warc/revisit|'
2173
- if FILTER_MIME.strip() != '':
2174
- filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
2272
+ if MATCH_MIME.strip() != '':
2273
+ filterMIME = '&filter=~mime:('
2274
+ filterMIME = filterMIME + re.escape(MATCH_MIME).replace(',','|')
2275
+ else:
2276
+ filterMIME = '&filter=!~mime:(warc/revisit|'
2277
+ if FILTER_MIME.strip() != '':
2278
+ filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
2175
2279
  filterMIME = filterMIME + ')'
2176
2280
 
2177
2281
  # Set status code filter
@@ -2194,32 +2298,34 @@ def getCommonCrawlUrls():
2194
2298
  # Get the Common Crawl index collections
2195
2299
  cdxApiUrls = getCommonCrawlIndexes()
2196
2300
 
2197
- if args.check_only:
2198
- if args.lcc < len(cdxApiUrls):
2199
- checkCommonCrawl = args.lcc+1
2301
+ # If there were URLs returned then continue
2302
+ if cdxApiUrls:
2303
+ if args.check_only:
2304
+ if args.lcc < len(cdxApiUrls):
2305
+ checkCommonCrawl = args.lcc+1
2306
+ else:
2307
+ checkCommonCrawl = len(cdxApiUrls)+1
2308
+ write(colored('Get URLs from Common Crawl: ','cyan')+colored(str(checkCommonCrawl)+' requests','white'))
2200
2309
  else:
2201
- checkCommonCrawl = len(cdxApiUrls)+1
2202
- write(colored('Get URLs from Common Crawl: ','cyan')+colored(str(checkCommonCrawl)+' requests','white'))
2203
- else:
2204
- write(colored('\rGetting links from the latest ' + str(len(cdxApiUrls)) + ' commoncrawl.org index collections (this can take a while for some domains)...\r','cyan'))
2310
+ write(colored('\rGetting links from the latest ' + str(len(cdxApiUrls)) + ' commoncrawl.org index collections (this can take a while for some domains)...\r','cyan'))
2311
+
2312
+ # Process the URLs from common crawl
2313
+ if stopProgram is None:
2314
+ p = mp.Pool(args.processes)
2315
+ p.map(processCommonCrawlCollection, cdxApiUrls)
2316
+ p.close()
2317
+ p.join()
2318
+
2319
+ # Show the MIME types found (in case user wants to exclude more)
2320
+ if verbose() and len(linkMimes) > 0:
2321
+ linkMimes.discard('warc/revisit')
2322
+ write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
2205
2323
 
2206
- # Process the URLs from common crawl
2207
- if stopProgram is None:
2208
- p = mp.Pool(args.processes)
2209
- p.map(processCommonCrawlCollection, cdxApiUrls)
2210
- p.close()
2211
- p.join()
2212
-
2213
- # Show the MIME types found (in case user wants to exclude more)
2214
- if verbose() and len(linkMimes) > 0:
2215
- linkMimes.discard('warc/revisit')
2216
- write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
2217
-
2218
- linkCount = len(linksFound) - originalLinkCount
2219
- if args.xwm:
2220
- write(getSPACER(colored('Links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2221
- else:
2222
- write(getSPACER(colored('Extra links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2324
+ linkCount = len(linksFound) - originalLinkCount
2325
+ if args.xwm:
2326
+ write(getSPACER(colored('Links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2327
+ else:
2328
+ write(getSPACER(colored('Extra links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2223
2329
 
2224
2330
  except Exception as e:
2225
2331
  writerr(colored('ERROR getCommonCrawlUrls 1: ' + str(e), 'red'))
@@ -2332,29 +2438,33 @@ def getVirusTotalUrls():
2332
2438
  return
2333
2439
 
2334
2440
  # Get the JSON response
2335
- jsonResp = json.loads(resp.text.strip())
2441
+ try:
2442
+ jsonResp = json.loads(resp.text.strip())
2336
2443
 
2337
- # Get the different URLs
2338
- if args.no_subs:
2339
- subDomains = []
2340
- else:
2444
+ # Get the different URLs
2445
+ if args.no_subs:
2446
+ subDomains = []
2447
+ else:
2448
+ try:
2449
+ subDomains = jsonResp['subdomains']
2450
+ except Exception as e:
2451
+ subDomains = []
2452
+ try:
2453
+ detectedUrls = [entry['url'] for entry in jsonResp.get('detected_urls', [])]
2454
+ except Exception as e:
2455
+ detectedUrls = []
2341
2456
  try:
2342
- subDomains = jsonResp['subdomains']
2457
+ undetectedUrls = [entry[0] for entry in jsonResp.get('undetected_urls', [])]
2343
2458
  except Exception as e:
2344
- subDomains = []
2345
- try:
2346
- detectedUrls = [entry['url'] for entry in jsonResp.get('detected_urls', [])]
2347
- except Exception as e:
2348
- detectedUrls = []
2349
- try:
2350
- undetectedUrls = [entry[0] for entry in jsonResp.get('undetected_urls', [])]
2351
- except Exception as e:
2352
- undetectedUrls = []
2353
- try:
2354
- totalUrls = set(subDomains + detectedUrls + undetectedUrls)
2355
- except Exception as e:
2459
+ undetectedUrls = []
2460
+ try:
2461
+ totalUrls = set(subDomains + detectedUrls + undetectedUrls)
2462
+ except Exception as e:
2463
+ totalUrls = []
2464
+ except:
2465
+ writerr(colored(getSPACER('[ ERR ] There was an unexpected response from the VirusTotal API'),'red'))
2356
2466
  totalUrls = []
2357
-
2467
+
2358
2468
  if args.check_only:
2359
2469
  write(colored('Get URLs from VirusTotal: ','cyan')+colored('1 request','white'))
2360
2470
  checkVirusTotal = 1
@@ -2457,8 +2567,11 @@ def processResponses():
2457
2567
  linksFound = set()
2458
2568
 
2459
2569
  # Set mime content type filter
2460
- filterMIME = '&filter=!mimetype:warc/revisit'
2461
- if FILTER_MIME.strip() != '':
2570
+ filterMIME = ''
2571
+ if MATCH_MIME.strip() != '':
2572
+ filterMIME = '&filter=mimetype:' + re.escape(MATCH_MIME).replace(',','|')
2573
+ else:
2574
+ filterMIME = '&filter=!mimetype:warc/revisit'
2462
2575
  filterMIME = filterMIME + '|' + re.escape(FILTER_MIME).replace(',','|')
2463
2576
 
2464
2577
  # Set status code filter
@@ -2928,12 +3041,24 @@ def main():
2928
3041
  help='Filter HTTP status codes for retrieved URLs and responses. Comma separated list of codes (default: the FILTER_CODE values from config.yml). Passing this argument will override the value from config.yml',
2929
3042
  type=validateArgStatusCodes,
2930
3043
  )
3044
+ parser.add_argument(
3045
+ '-ft',
3046
+ action='store',
3047
+ help='Filter MIME Types for retrieved URLs and responses. Comma separated list of MIME Types (default: the FILTER_MIME values from config.yml). Passing this argument will override the value from config.yml. NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.',
3048
+ type=validateArgMimeTypes,
3049
+ )
2931
3050
  parser.add_argument(
2932
3051
  '-mc',
2933
3052
  action='store',
2934
3053
  help='Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config FILTER_CODE and -fc.',
2935
3054
  type=validateArgStatusCodes,
2936
3055
  )
3056
+ parser.add_argument(
3057
+ '-mt',
3058
+ action='store',
3059
+ help='Only MIME Types for retrieved URLs and responses. Comma separated list of MIME types. Passing this argument overrides the config FILTER_MIME and -ft. NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.',
3060
+ type=validateArgMimeTypes,
3061
+ )
2937
3062
  parser.add_argument(
2938
3063
  '-l',
2939
3064
  '--limit',
@@ -3009,11 +3134,19 @@ def main():
3009
3134
  help='Exclude checks for links from virustotal.com',
3010
3135
  default=False
3011
3136
  )
3137
+ parser.add_argument(
3138
+ '--providers',
3139
+ action='store',
3140
+ help='A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan and virustotal. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.',
3141
+ default=[],
3142
+ type=validateArgProviders,
3143
+ metavar='{wayback,commoncrawl,otx,urlscan,virustotal}'
3144
+ )
3012
3145
  parser.add_argument(
3013
3146
  '-lcc',
3014
3147
  action='store',
3015
3148
  type=int,
3016
- help='Limit the number of Common Crawl index collections searched, e.g. \'-lcc 10\' will just search the latest 10 collections (default: 3). As of July 2023 there are currently 95 collections. Setting to 0 (default) will search ALL collections. If you don\'t want to search Common Crawl at all, use the -xcc option.'
3149
+ help='Limit the number of Common Crawl index collections searched, e.g. \'-lcc 10\' will just search the latest 10 collections (default: 1). As of November 2024 there are currently 106 collections. Setting to 0 (default) will search ALL collections. If you don\'t want to search Common Crawl at all, use the -xcc option.'
3017
3150
  )
3018
3151
  parser.add_argument(
3019
3152
  '-lcy',
@@ -3132,13 +3265,36 @@ def main():
3132
3265
  write(colored('Waymore - v' + __version__,'cyan'))
3133
3266
  sys.exit()
3134
3267
 
3135
- # If -lcc wasn't passed then set to the default of 3 if -lcy is 0. This will make them work together
3268
+ # If -lcc wasn't passed then set to the default of 1 if -lcy is 0. This will make them work together
3136
3269
  if args.lcc is None:
3137
3270
  if args.lcy == 0:
3138
- args.lcc = 3
3271
+ args.lcc = 1
3139
3272
  else:
3140
3273
  args.lcc = 0
3141
3274
 
3275
+ # If --providers was passed, then manually set the exclude arguments;
3276
+ if args.providers:
3277
+ if 'wayback' not in args.providers:
3278
+ args.xwm = True
3279
+ else:
3280
+ args.xwm = False
3281
+ if 'commoncrawl' not in args.providers:
3282
+ args.xcc = True
3283
+ else:
3284
+ args.xcc = False
3285
+ if 'otx' not in args.providers:
3286
+ args.xav = True
3287
+ else:
3288
+ args.xav = False
3289
+ if 'urlscan' not in args.providers:
3290
+ args.xus = True
3291
+ else:
3292
+ args.xus = False
3293
+ if 'virustotal' not in args.providers:
3294
+ args.xvt = True
3295
+ else:
3296
+ args.xvt = False
3297
+
3142
3298
  # If no input was given, raise an error
3143
3299
  if sys.stdin.isatty():
3144
3300
  if args.input is None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: waymore
3
- Version: 4.4
3
+ Version: 4.6
4
4
  Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
5
5
  Home-page: https://github.com/xnl-h4ck3r/waymore
6
6
  Author: @xnl-h4ck3r
@@ -15,7 +15,7 @@ Requires-Dist: tldextract
15
15
 
16
16
  <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
17
17
 
18
- ## About - v4.4
18
+ ## About - v4.6
19
19
 
20
20
  The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
21
21
 
@@ -83,7 +83,9 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
83
83
  | -n | --no-subs | Don't include subdomains of the target domain (only used if input is not a domain with a specific path). |
84
84
  | -f | --filter-responses-only | The initial links from sources will not be filtered, only the responses that are downloaded, e.g. it maybe useful to still see all available paths from the links, even if you don't want to check the content. |
85
85
  | -fc | | Filter HTTP status codes for retrieved URLs and responses. Comma separated list of codes (default: the `FILTER_CODE` values from `config.yml`). Passing this argument will override the value from `config.yml` |
86
+ | -ft | | Filter MIME Types for retrieved URLs and responses. Comma separated list of MIME Types (default: the `FILTER_MIME` values from `config.yml`). Passing this argument will override the value from `config.yml`. **NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.**. |
86
87
  | -mc | | Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config `FILTER_CODE` and `-fc`. |
88
+ | -mt | | Only MIME Types for retrieved URLs and responses. Comma separated list of MIME types. Passing this argument overrides the config `FILTER_MIME` and `-ft`. **NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.**. |
87
89
  | -l | --limit | How many responses will be saved (if `-mode R` or `-mode B` is passed). A positive value will get the **first N** results, a negative value will get the **last N** results. A value of 0 will get **ALL** responses (default: 5000) |
88
90
  | -from | --from-date | What date to get responses from. If not specified it will get from the earliest possible results. A partial value can be passed, e.g. `2016`, `201805`, etc. |
89
91
  | -to | --to-date | What date to get responses to. If not specified it will get to the latest possible results. A partial value can be passed, e.g. `2021`, `202112`, etc. |
@@ -95,7 +97,7 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
95
97
  | -xav | | Exclude checks for links from alienvault.com |
96
98
  | -xus | | Exclude checks for links from urlscan.io |
97
99
  | -xvt | | Exclude checks for links from virustotal.com |
98
- | -lcc | | Limit the number of Common Crawl index collections searched, e.g. `-lcc 10` will just search the latest `10` collections (default: 3). As of July 2023 there are currently 95 collections. Setting to `0` (default) will search **ALL** collections. If you don't want to search Common Crawl at all, use the `-xcc` option. |
100
+ | -lcc | | Limit the number of Common Crawl index collections searched, e.g. `-lcc 10` will just search the latest `10` collections (default: 1). As of November 2024 there are currently 106 collections. Setting to `0` will search **ALL** collections. If you don't want to search Common Crawl at all, use the `-xcc` option. |
99
101
  | -lcy | | Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections of any year (but in conjunction with `-lcc`). For example, if you are only interested in data from 2015 and after, pass `-lcy 2015`. This will override the value of `-lcc` if passed. If you don't want to search Common Crawl at all, use the `-xcc` option. |
100
102
  | -t | --timeout | This is for archived responses only! How many seconds to wait for the server to send data before giving up (default: 30) |
101
103
  | -p | --processes | Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1) |
@@ -154,8 +156,8 @@ If the input is just a domain, e.g. `redbull.com` then the `-mode` defaults to `
154
156
 
155
157
  The `config.yml` file (typically in `~/.config/waymore/`) has values that can be updated to suit your needs. Filters are all provided as comma separated lists:
156
158
 
157
- - `FILTER_CODE` - Exclusions used to exclude responses we will try to get from web.archive.org, and also for file names when `-i` is a directory, e.g. `301,302`. This can be overridden with the `-fc` argument. Passing the `-mc` (to match status codes instead of filter) will override any value in `FILTER_CODE` or `-fc`
158
- - `FILTER_MIME` - MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API, e.g. `'text/css,image/jpeg`
159
+ - `FILTER_CODE` - Exclusions used to exclude responses we will try to get from web.archive.org, and also for file names when `-i` is a directory, e.g. `301,302`. This can be overridden with the `-fc` argument. Passing the `-mc` (to match status codes instead of filter) will override any value in `FILTER_CODE` or `-fc`.
160
+ - `FILTER_MIME` - MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API, e.g. `text/css,image/jpeg`. This can be overridden with the `-ft` argument. Passing the `-mt` (to match MIME types instead of filter) will override any value in `FILTER_MIME` or `-ft`.
159
161
  - `FILTER_URL` - URL exclusions we will use to filter links and responses from web.archive.org through their API, e.g. `.css,.jpg`
160
162
  - `FILTER_KEYWORDS` - Only links and responses will be returned that contain the specified keywords if the `-ko`/`--keywords-only` argument is passed (without providing an explicit value on the command line), e.g. `admin,portal`
161
163
  - `URLSCAN_API_KEY` - You can sign up to [urlscan.io](https://urlscan.io/user/signup) to get a **FREE** API key (there are also paid subscriptions available). It is recommended you get a key and put it into the config file so that you can get more back (and quicker) from their API. NOTE: You will get rate limited unless you have a full paid subscription.
@@ -163,7 +165,7 @@ The `config.yml` file (typically in `~/.config/waymore/`) have values that can b
163
165
  - `WEBHOOK_DISCORD` - If the `--notify-discord` argument is passed, `waymore` will send a notification to this Discord webhook when it completes.
164
166
  - `DEFAULT_OUTPUT_DIR` - This is the default location of any output files written if the `-oU` and `-oR` arguments are not used. If the value of this key is blank, then it will default to the location of the `config.yml` file.
165
167
 
166
- **NOTE: The MIME types cannot be filtered for Alien Vault results because they do not return that in the API response.**
168
+ **NOTE: The MIME types cannot be filtered for Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined for a URL. In these cases, URLs will be included regardless of filter or match. Bear this in mind and consider excluding certain providers if this is important.**
167
169
 
168
170
  ## Output
169
171
 
@@ -201,6 +203,8 @@ The archive.org Wayback Machine CDX API can sometimes can sometimes require a hu
201
203
 
202
204
  There is also a problem with the Wayback Machine CDX API where the number of pages returned is not correct when filters are applied and can cause issues (see https://github.com/internetarchive/wayback/issues/243). Until that issue is resolved, setting the `-lr` argument to a sensible value can help with that problem in the short term.
203
205
 
206
+ The Common Crawl API has had a lot of issues for a long time. Including this source could make waymore take a lot longer to run and may not yield any extra results. You can check if there is an issue by visiting http://index.commoncrawl.org/collinfo.json and seeing if this is successful. Consider excluding Common Crawl altogether using the `--providers` argument and not including `commoncrawl`, or using the `-xcc` argument.
207
+
204
208
  **The provider API servers aren't designed to cope with huge volumes, so be sensible and considerate about what you hit them with!**
205
209
 
206
210
  When downloading archived responses, this can take a long time and can sometimes be killed by the machine for some reason, or manually killed by the user.
@@ -218,7 +222,7 @@ The URLs are saved in the same path as `config.yml` (typically `~/.config/waymor
218
222
 
219
223
  ### Example 2
220
224
 
221
- Get ALL the URLs from Wayback for `redbull.com` (no filters are applied in `mode U` with `-f`, and no URLs are retrieved from Commone Crawl, Alien Vault, URLScan and Virus Total, because `-xcc`, `-xav`, `-xus`, `-xvt` are passed respectively).
225
+ Get ALL the URLs from Wayback for `redbull.com` (no filters are applied in `mode U` with `-f`, and no URLs are retrieved from Common Crawl, Alien Vault, URLScan and Virus Total, because `-xcc`, `-xav`, `-xus`, `-xvt` are passed respectively. This can also be achieved by passing `--providers wayback` instead of the exclude arguments).
222
226
  Save the FIRST 200 responses that are found starting from 2022 (`-l 200 -from 2022`):
223
227
 
224
228
  <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/example2.png"></center>
@@ -0,0 +1,8 @@
1
+ waymore/__init__.py,sha256=nBVFOoDYjRXcFurm_7co-GNVg6LUhzeZpudhmYNojHw,17
2
+ waymore/waymore.py,sha256=FhSRlLoK9DBGojEX89rMQdZ-bEacPSxJg2BJwQfUJGA,177093
3
+ waymore-4.6.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
4
+ waymore-4.6.dist-info/METADATA,sha256=oQMMrr_MbK_QPmShWY-TrHf-bxOg9dtjdUK76QE29H8,49511
5
+ waymore-4.6.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
6
+ waymore-4.6.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
7
+ waymore-4.6.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
8
+ waymore-4.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- waymore/__init__.py,sha256=bb3D2cWPj3M9gB4ePNX8nrpDuS8IImWiON1Cc_z3vGg,17
2
- waymore/waymore.py,sha256=cnFkODCRHd4OxxBZVMWUwus5bTZ-ypTGAK_Aa9HPd-g,169799
3
- waymore-4.4.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
4
- waymore-4.4.dist-info/METADATA,sha256=gpUxWzvVUkCmZUB_Dd-gl_8w2P9UFh5tpfyob7wMe-o,47221
5
- waymore-4.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
- waymore-4.4.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
7
- waymore-4.4.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
8
- waymore-4.4.dist-info/RECORD,,
File without changes