waymore 6.6 (py3-none-any.whl) → 7.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/waymore.py CHANGED
@@ -5,6 +5,7 @@
  # Good luck and good hunting! If you really love the tool (or any others), or they helped you find an awesome bounty, consider BUYING ME A COFFEE! (https://ko-fi.com/xnlh4ck3r) ☕ (I could use the caffeine!)
 
  import argparse
+ import asyncio
  import enum
  import json
  import math
@@ -14,7 +15,7 @@ import pickle
  import random
  import re
  import sys
- import time
+ import threading
  from datetime import datetime, timedelta
  from pathlib import Path
  from signal import SIGINT, signal
@@ -60,6 +61,12 @@ argsInput = ""
  isInputFile = False
  stopProgramCount = 0
  stopSource = False
+ stopSourceWayback = False
+ stopSourceCommonCrawl = False
+ stopSourceAlienVault = False
+ stopSourceURLScan = False
+ stopSourceVirusTotal = False
+ stopSourceIntelx = False
  successCount = 0
  failureCount = 0
  fileCount = 0
@@ -80,6 +87,10 @@ currentMemUsage = 0
  maxMemoryPercent = 0
  currentMemPercent = 0
  process = None
+ current_response = None
+ current_session = None
+ # Event used to interrupt long sleeps (e.g., rate-limit waits) when SIGINT is received
+ interrupt_event = threading.Event()
  HTTP_ADAPTER = None
  HTTP_ADAPTER_CC = None
  checkWayback = 0
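The new `interrupt_event` is what lets long waits end early: instead of `time.sleep()`, code blocks on `Event.wait(timeout)`, which returns as soon as the SIGINT handler calls `set()`. A minimal sketch of the pattern (same names as the diff; the helper function is illustrative, not part of the package):

    import threading

    interrupt_event = threading.Event()

    def interruptible_sleep(seconds):
        # Returns True if woken early by interrupt_event.set() (e.g. from the
        # SIGINT handler), False if the full timeout elapsed normally.
        interrupt_event.clear()
        return interrupt_event.wait(seconds)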
@@ -91,6 +102,20 @@ checkIntelx = 0
  argsInputHostname = ""
  responseOutputDirectory = ""
  urlscanRequestLinks = set()
+ intelxAPIIssue = False
+ linkCountWayback = 0
+ linkCountCommonCrawl = 0
+ linkCountAlienVault = 0
+ linkCountURLScan = 0
+ linkCountVirusTotal = 0
+ linkCountIntelx = 0
+
+ # Thread lock for protecting shared state during concurrent operations
+ links_lock = threading.Lock()
+
+ # Shared state for link collection across all sources
+ linksFound = set()
+ linkMimes = set()
 
  # Source Provider URLs
  WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
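Because each source now runs concurrently and writes into shared sets, additions are serialised through `links_lock`. A small sketch of the pattern used throughout the diff (names match the diff; the wrapper function is illustrative):

    import threading

    links_lock = threading.Lock()
    linksFound = set()

    def add_link(link, source_set=None):
        # Hold the lock while mutating shared sets so concurrent source
        # threads don't interleave with reads and merges elsewhere.
        with links_lock:
            if source_set is not None:
                source_set.add(link)  # per-source set, merged into linksFound later
            else:
                linksFound.add(link)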
@@ -133,7 +158,7 @@ DEFAULT_LIMIT = 5000
  DEFAULT_TIMEOUT = 30
 
  # Exclusions used to exclude responses we will try to get from web.archive.org
- DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap"
+ DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource"
 
  # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
  DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff"
@@ -329,7 +354,7 @@ def handler(signal_received, frame):
      This function is called if Ctrl-C is called by the user
      An attempt will be made to try and clean up properly
      """
-     global stopSource, stopProgram, stopProgramCount
+     global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
 
      if stopProgram is not None:
          stopProgramCount = stopProgramCount + 1
@@ -358,6 +383,34 @@ def handler(signal_received, frame):
      else:
          stopProgram = StopProgram.SIGINT
          stopSource = True
+         stopSourceWayback = True
+         stopSourceCommonCrawl = True
+         stopSourceAlienVault = True
+         stopSourceURLScan = True
+         stopSourceVirusTotal = True
+         stopSourceIntelx = True
+         # Try to close any active response or session to interrupt blocking network I/O
+         try:
+             if current_response is not None:
+                 try:
+                     current_response.close()
+                 except Exception:
+                     pass
+         except Exception:
+             pass
+         try:
+             if current_session is not None:
+                 try:
+                     current_session.close()
+                 except Exception:
+                     pass
+         except Exception:
+             pass
+         # Signal any waits to stop early
+         try:
+             interrupt_event.set()
+         except Exception:
+             pass
      writerr(
          colored(
              getSPACER('>>> "Oh my God, they killed Kenny... and waymore!" - Kyle'),
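Beyond setting the per-source stop flags, the handler closes any in-flight response/session so a blocking read raises and the worker thread unwinds, then sets `interrupt_event` to wake any interruptible waits. A condensed sketch of that shutdown path (a hypothetical helper mirroring the handler's logic, using the globals from the diff):

    def abort_network_io():
        # Closing the live response/session makes a blocked get()/iter_lines()
        # raise, so worker threads fall into their except handlers and return.
        for obj in (current_response, current_session):
            try:
                if obj is not None:
                    obj.close()
            except Exception:
                pass
        interrupt_event.set()  # wake threads sleeping in interrupt_event.wait()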
@@ -760,7 +813,7 @@ def showOptions():
      if args.mode in ["R", "B"] or (args.mode == "U" and not args.xcc):
          write(
              colored("-p: " + str(args.processes), "magenta")
-             + colored(" The number of parallel requests made.", "white")
+             + colored(" The number of parallel requests made per source.", "white")
          )
          write(
              colored("-r: " + str(args.retries), "magenta")
@@ -1251,7 +1304,7 @@ def fixArchiveOrgUrl(url):
 
  # Add a link to the linksFound collection for archived responses (includes timestamp prefix)
  def linksFoundResponseAdd(link):
-     global linksFound, argsInput, argsInputHostname
+     global linksFound, argsInput, argsInputHostname, links_lock
 
      try:
          if inputIsDomainANDPath:
@@ -1272,20 +1325,22 @@ def linksFoundResponseAdd(link):
 
          # Don't write it if the link does not contain the requested domain (this can sometimes happen)
          if parsed_url.lower().find(checkInput.lower()) >= 0:
-             linksFound.add(link)
+             with links_lock:
+                 linksFound.add(link)
              # If streaming is enabled and mode is 'U', print the link to stdout
              if args.stream and args.mode == "U":
                  write(link, pipe=True)
      except Exception:
-         linksFound.add(link)
+         with links_lock:
+             linksFound.add(link)
          # If streaming is enabled and mode is 'U', print the link to stdout
          if args.stream and args.mode == "U":
              write(link, pipe=True)
 
 
  # Add a link to the linksFound collection
- def linksFoundAdd(link):
-     global linksFound, argsInput, argsInputHostname
+ def linksFoundAdd(link, source_set=None):
+     global linksFound, argsInput, argsInputHostname, links_lock
 
      try:
          if inputIsDomainANDPath:
@@ -1303,12 +1358,20 @@ def linksFoundAdd(link):
 
          # Don't write it if the link does not contain the requested domain (this can sometimes happen)
          if parsed_url.find(checkInput) >= 0:
-             linksFound.add(link)
+             with links_lock:
+                 if source_set is not None:
+                     source_set.add(link)
+                 else:
+                     linksFound.add(link)
              # If streaming is enabled and mode is 'U', print the link to stdout
              if args.stream and args.mode == "U":
                  write(link, pipe=True)
      except Exception:
-         linksFound.add(link)
+         with links_lock:
+             if source_set is not None:
+                 source_set.add(link)
+             else:
+                 linksFound.add(link)
          # If streaming is enabled and mode is 'U', print the link to stdout
          if args.stream and args.mode == "U":
              write(link, pipe=True)
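With the optional `source_set` parameter, each source collects into its own set, and the calling `get…Urls()` function counts it and folds it into the global `linksFound` when the source finishes. A sketch of that flow (names follow the diff; note the diff performs the final merge without re-taking `links_lock`):

    linksFoundWayback = set()

    # during collection, workers route links to the source's own set:
    #   linksFoundAdd(url, linksFoundWayback)

    # when the source completes:
    linkCountWayback = len(linksFoundWayback)   # per-source count for reporting
    linksFound.update(linksFoundWayback)        # merge into the global set
    linksFoundWayback.clear()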
@@ -1567,12 +1630,10 @@ def processArchiveUrl(url):
              except Exception as e:
                  writerr(
                      colored(
-                         getSPACER(
-                             "[ ERR ] Failed to write file "
-                             + filePath
-                             + ": "
-                             + str(e)
-                         ),
+                         "Wayback - [ ERR ] Failed to write file "
+                         + filePath
+                         + ": "
+                         + str(e),
                          "red",
                      )
                  )
@@ -1588,12 +1649,10 @@ def processArchiveUrl(url):
              except Exception as e:
                  writerr(
                      colored(
-                         getSPACER(
-                             '[ ERR ] Failed to write to waymore_index.txt for "'
-                             + archiveUrl
-                             + '": '
-                             + str(e)
-                         ),
+                         'Wayback - [ ERR ] Failed to write to waymore_index.txt for "'
+                         + archiveUrl
+                         + '": '
+                         + str(e),
                          "red",
                      )
                  )
@@ -1631,11 +1690,7 @@ def processArchiveUrl(url):
              if verbose():
                  writerr(
                      colored(
-                         getSPACER(
-                             '[ ERR ] Wayback Machine (archive.org) returned a problem for "'
-                             + archiveUrl
-                             + '"'
-                         ),
+                         'Wayback - [ ERR ] returned a problem for "' + archiveUrl + '"',
                          "red",
                      )
                  )
@@ -1644,11 +1699,7 @@ def processArchiveUrl(url):
              if verbose():
                  writerr(
                      colored(
-                         getSPACER(
-                             '[ ERR ] Wayback Machine (archive.org) connection error for "'
-                             + archiveUrl
-                             + '"'
-                         ),
+                         'Wayback - [ ERR ] connection error for "' + archiveUrl + '"',
                          "red",
                      )
                  )
@@ -1658,25 +1709,21 @@ def processArchiveUrl(url):
              try:
                  writerr(
                      colored(
-                         getSPACER(
-                             "[ "
-                             + str(resp.status_code)
-                             + ' ] Failed to get response for "'
-                             + archiveUrl
-                             + '"'
-                         ),
+                         "Wayback - [ "
+                         + str(resp.status_code)
+                         + ' ] Failed to get response for "'
+                         + archiveUrl
+                         + '"',
                          "red",
                      )
                  )
              except Exception:
                  writerr(
                      colored(
-                         getSPACER(
-                             '[ ERR ] Failed to get response for "'
-                             + archiveUrl
-                             + '": '
-                             + str(e)
-                         ),
+                         'Wayback - [ ERR ] Failed to get response for "'
+                         + archiveUrl
+                         + '": '
+                         + str(e),
                          "red",
                      )
                  )
@@ -1728,7 +1775,7 @@ def processArchiveUrl(url):
 
              except Exception as e:
                  if verbose():
-                     writerr(colored(getSPACER('Error for "' + url + '": ' + str(e)), "red"))
+                     writerr(colored('Wayback - [ ERR ] Error for "' + url + '": ' + str(e), "red"))
 
      except Exception as e:
          writerr(colored("ERROR processArchiveUrl 1: " + str(e), "red"))
@@ -1813,7 +1860,7 @@ def processURLOutput():
      linkCount = len(linksFound)
      write(
          getSPACER(
-             colored("Links found for " + subs + argsInput + ": ", "cyan")
+             colored("\nTotal unique links found for " + subs + argsInput + ": ", "cyan")
              + colored(str(linkCount) + " 🤘", "white")
          )
          + "\n"
@@ -2139,12 +2186,12 @@ def processAlienVaultPage(url):
      """
      Get URLs from a specific page of otx.alienvault.org API for the input domain
      """
-     global totalPages, linkMimes, linksFound, stopSource, argsInput
+     global totalPages, linkMimes, linksFound, stopSourceAlienVault, argsInput, linkCountAlienVault
      try:
          # Get memory in case it exceeds threshold
          getMemory()
 
-         if not stopSource:
+         if not stopSourceAlienVault:
              try:
                  # Choose a random user agent string to use for any requests
                  userAgent = random.choice(USER_AGENT)
@@ -2156,7 +2203,7 @@ def processAlienVaultPage(url):
              except ConnectionError:
                  writerr(
                      colored(
-                         getSPACER("[ ERR ] alienvault.org connection error for page " + page),
+                         getSPACER("AlienVault - [ ERR ] Connection error for page " + page),
                          "red",
                      )
                  )
@@ -2165,9 +2212,10 @@ def processAlienVaultPage(url):
              except Exception as e:
                  writerr(
                      colored(
-                         getSPACER(
-                             "[ ERR ] Error getting response for page " + page + " - " + str(e)
-                         ),
+                         "AlienVault - [ ERR ] Error getting response for page "
+                         + page
+                         + " - "
+                         + str(e),
                          "red",
                      )
                  )
@@ -2178,22 +2226,21 @@ def processAlienVaultPage(url):
              if resp is not None:
                  # If a status of 429, then stop processing Alien Vault
                  if resp.status_code == 429:
-                     writerr(
-                         colored(
-                             getSPACER(
-                                 "[ 429 ] Alien Vault rate limit reached, so stopping. Links that have already been retrieved will be saved."
-                             ),
-                             "red",
+                     if not stopSourceAlienVault:  # Only print message once
+                         writerr(
+                             colored(
+                                 "AlienVault - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
+                                 "red",
+                             )
                          )
-                     )
-                     stopSource = True
+                     stopSourceAlienVault = True
                      return
                  # If the response from alienvault.com is empty then skip
                  if resp.text == "" and totalPages == 0:
                      if verbose():
                          writerr(
                              colored(
-                                 getSPACER("[ ERR ] " + url + " gave an empty response."),
+                                 "AlienVault - [ ERR ] " + url + " gave an empty response.",
                                  "red",
                              )
                          )
@@ -2203,9 +2250,10 @@ def processAlienVaultPage(url):
                  if verbose():
                      writerr(
                          colored(
-                             getSPACER(
-                                 "[ " + str(resp.status_code) + " ] Error for " + url
-                             ),
+                             "AlienVault - [ "
+                             + str(resp.status_code)
+                             + " ] Error for "
+                             + url,
                              "red",
                          )
                      )
@@ -2228,6 +2276,7 @@
                      if foundUrl != "":
                          # If filters are not required and subs are wanted then just add the URL to the list
                          if args.filter_responses_only and not args.no_subs:
+                             linkCountAlienVault = linkCountAlienVault + 1
                              linksFoundAdd(foundUrl)
                          else:
                              addLink = True
@@ -2328,7 +2377,7 @@
 
                          # Add link if it passed filters
                          if addLink:
-                             linksFoundAdd(foundUrl)
+                             linksFoundAdd(foundUrl, linksFoundAlienVault)
          else:
              pass
      except Exception as e:
@@ -2340,12 +2389,12 @@ def getAlienVaultUrls():
      """
      Get URLs from the Alien Vault OTX, otx.alienvault.com
      """
-     global linksFound, waymorePath, subs, path, stopProgram, totalPages, stopSource, argsInput, checkAlienVault, inputIsSubDomain, argsInputHostname
+     global linksFound, waymorePath, subs, path, stopProgram, totalPages, stopSourceAlienVault, argsInput, checkAlienVault, inputIsSubDomain, argsInputHostname, linkCountAlienVault, linksFoundAlienVault
 
      # Write the file of URL's for the passed domain/URL
      try:
-         stopSource = False
-         originalLinkCount = len(linksFound)
+         stopSourceAlienVault = False
+         linksFoundAlienVault = set()
 
          # Set the Alien Vault API indicator types of domain or hostname (has subdomain)
          if inputIsSubDomain:
@@ -2362,11 +2411,12 @@
 
          # Get the number of pages (i.e. separate requests) that are going to be made to alienvault.com
          totalPages = 0
+         resp = None
          try:
              if not args.check_only:
                  write(
                      colored(
-                         "\rGetting the number of alienvault.com pages to search...\r",
+                         "AlienVault - [ INFO ] Getting the number of alienvault.com pages to search...",
                          "cyan",
                      )
                  )
@@ -2379,33 +2429,35 @@
          except Exception as e:
              writerr(
                  colored(
-                     getSPACER("[ ERR ] Unable to get links from alienvault.com: " + str(e)),
+                     "AlienVault - [ ERR ] Unable to get links from alienvault.com: " + str(e),
                      "red",
                  )
              )
-             return
+             # Don't return - continue to show link count at the end
 
          # If the rate limit was reached end now
-         if resp.status_code == 429:
+         if resp is not None and resp.status_code == 429:
              writerr(
                  colored(
-                     getSPACER("[ 429 ] Alien Vault rate limit reached so unable to get links."),
+                     "AlienVault - [ 429 ] Rate limit reached so unable to get links.",
                      "red",
                  )
              )
-             return
+             # Don't return - continue to show link count at the end
 
-         if verbose():
+         if resp is not None and verbose():
              write(
-                 getSPACER(
-                     colored("The Alien Vault URL requested to get links: ", "magenta")
-                     + colored(url, "white")
-                 )
+                 colored("AlienVault - [ INFO ] The URL requested to get links: ", "magenta")
+                 + colored(url, "white")
                  + "\n"
              )
 
          # Carry on if something was found
-         if resp.text.lower().find('"error": "') < 0:
+         if (
+             resp is not None
+             and resp.status_code != 429
+             and resp.text.lower().find('"error": "') < 0
+         ):
 
              try:
                  # Get the JSON response
@@ -2416,9 +2468,7 @@
              except Exception:
                  writerr(
                      colored(
-                         getSPACER(
-                             "[ ERR ] There was an unexpected response from the Alien Vault API"
-                         ),
+                         "AlienVault - [ ERR ] There was an unexpected response from the API",
                          "red",
                      )
                  )
@@ -2440,16 +2490,16 @@
                  else:
                      checkAlienVault = totalPages
                      write(
-                         colored("Get URLs from Alien Vault: ", "cyan")
+                         colored("AlienVault - [ INFO ] Getting URLs from Alien Vault: ", "cyan")
                          + colored(str(checkAlienVault) + " requests", "white")
                      )
              else:
                  # if the page number was found then display it, but otherwise we will just try to increment until we have everything
                  write(
                      colored(
-                         "\rGetting links from "
+                         "AlienVault - [ INFO ] Getting links from "
                          + str(totalPages)
-                         + " alienvault.com API requests (this can take a while for some domains)...\r",
+                         + " alienvault.com API requests (this can take a while for some domains)...",
                          "cyan",
                      )
                  )
@@ -2469,30 +2519,19 @@
              if verbose():
                  writerr(
                      colored(
-                         getSPACER("[ ERR ] An error was returned in the alienvault.com response.")
-                         + "\n",
+                         "AlienVault - [ ERR ] An error was returned in the response." + "\n",
                          "red",
                      )
                  )
 
          if not args.check_only:
-             linkCount = len(linksFound) - originalLinkCount
-             if args.xwm and args.xcc:
-                 write(
-                     getSPACER(
-                         colored("Links found on alienvault.com: ", "cyan")
-                         + colored(str(linkCount), "white")
-                     )
-                     + "\n"
-                 )
-             else:
-                 write(
-                     getSPACER(
-                         colored("Extra links found on alienvault.com: ", "cyan")
-                         + colored(str(linkCount), "white")
-                     )
-                     + "\n"
-                 )
+             linkCountAlienVault = len(linksFoundAlienVault)
+             write(
+                 colored("AlienVault - [ INFO ] Links found on alienvault.com: ", "cyan")
+                 + colored(str(linkCountAlienVault), "white")
+             )
+             linksFound.update(linksFoundAlienVault)
+             linksFoundAlienVault.clear()
 
      except Exception as e:
          writerr(colored("ERROR getAlienVaultUrls 1: " + str(e), "red"))
@@ -2502,7 +2541,7 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
      """
      Process a specific URL from urlscan.io to determine whether to save the link
      """
-     global argsInput, argsInputHostname, urlscanRequestLinks
+     global argsInput, argsInputHostname, urlscanRequestLinks, links_lock, linkCountURLScan, linksFoundURLScan
 
      addLink = True
 
@@ -2591,7 +2630,8 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
          # Add MIME Types if --verbose option was selected
          if verbose():
              if mimeType.strip() != "":
-                 linkMimes.add(mimeType)
+                 with links_lock:
+                     linkMimes.add(mimeType)
 
          # Add link if it passed filters
          if addLink:
@@ -2611,11 +2651,12 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
              )
              if match is not None:
                  if args.mode in ("U", "B"):
-                     linksFoundAdd(url)
+                     linksFoundAdd(url, linksFoundURLScan)
                  # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
                  if urlscanID != "" and args.mode in ("R", "B"):
                      if args.limit == 0 or len(urlscanRequestLinks) < args.limit:
-                         urlscanRequestLinks.add((url, URLSCAN_DOM_URL + urlscanID))
+                         with links_lock:
+                             urlscanRequestLinks.add((url, URLSCAN_DOM_URL + urlscanID))
 
      except Exception as e:
          writerr(colored("ERROR processURLScanUrl 1: " + str(e), "red"))
@@ -2721,9 +2762,10 @@ def getURLScanDOM(originalUrl, domUrl):
                      except Exception as e:
                          writerr(
                              colored(
-                                 getSPACER(
-                                     "[ ERR ] Failed to write file " + filePath + ": " + str(e)
-                                 ),
+                                 "URLScan - [ ERR ] Failed to write file "
+                                 + filePath
+                                 + ": "
+                                 + str(e),
                                  "red",
                              )
                          )
@@ -2746,12 +2788,10 @@ def getURLScanDOM(originalUrl, domUrl):
                      except Exception as e:
                          writerr(
                              colored(
-                                 getSPACER(
-                                     '[ ERR ] Failed to write to waymore_index.txt for "'
-                                     + domUrl
-                                     + '": '
-                                     + str(e)
-                                 ),
+                                 'URLScan - [ ERR ] Failed to write to waymore_index.txt for "'
+                                 + domUrl
+                                 + '": '
+                                 + str(e),
                                  "red",
                              )
                          )
@@ -2767,25 +2807,21 @@ def getURLScanDOM(originalUrl, domUrl):
              try:
                  writerr(
                      colored(
-                         getSPACER(
-                             "[ "
-                             + str(resp.status_code)
-                             + ' ] Failed to get response for "'
-                             + domUrl
-                             + '"'
-                         ),
+                         "URLScan - [ "
+                         + str(resp.status_code)
+                         + ' ] Failed to get response for "'
+                         + domUrl
+                         + '"',
                          "red",
                      )
                  )
              except Exception:
                  writerr(
                      colored(
-                         getSPACER(
-                             '[ ERR ] Failed to get response for "'
-                             + domUrl
-                             + '": '
-                             + str(e)
-                         ),
+                         'URLScan - [ ERR ] Failed to get response for "'
+                         + domUrl
+                         + '": '
+                         + str(e),
                          "red",
                      )
                  )
@@ -2832,7 +2868,9 @@ def getURLScanDOM(originalUrl, domUrl):
 
              except Exception as e:
                  if verbose():
-                     writerr(colored(getSPACER('Error for "' + domUrl + '": ' + str(e)), "red"))
+                     writerr(
+                         colored('URLScan - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red")
+                     )
 
      except Exception as e:
          writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))
@@ -2857,14 +2895,15 @@ def getURLScanUrls():
      """
      Get URLs from the URLSCan API, urlscan.io
      """
-     global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkURLScan, argsInputHostname
+     global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
 
      # Write the file of URL's for the passed domain/URL
      try:
          requestsMade = 0
-         stopSource = False
-         linkMimes = set()
-         originalLinkCount = len(linksFound)
+         stopSourceURLScan = False
+         linksFoundURLScan = set()
+         totalUrls = 0
+         checkResponse = True
 
          # Set the URL to just the hostname
          url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
@@ -2887,21 +2926,23 @@ def getURLScanUrls():
              if args.mode == "R":
                  write(
                      colored(
-                         "The URLScan URL requested to get links for responses: ",
+                         "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
                          "magenta",
                      )
                      + colored(url + "\n", "white")
                  )
              else:
                  write(
-                     colored("The URLScan URL requested to get links: ", "magenta")
+                     colored(
+                         "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
+                     )
                      + colored(url + "\n", "white")
                  )
 
-         if not args.check_only:
+         if args.mode in ("U", "B") and not args.check_only:
              write(
                  colored(
-                     "\rGetting links from urlscan.io API (this can take a while for some domains)...\r",
+                     "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
                      "cyan",
                  )
              )
@@ -2922,7 +2963,7 @@ def getURLScanUrls():
          except Exception as e:
              write(
                  colored(
-                     getSPACER("[ ERR ] Unable to get links from urlscan.io: " + str(e)),
+                     "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
                      "red",
                  )
              )
@@ -2937,15 +2978,17 @@ def getURLScanUrls():
                  if seconds <= args.urlscan_rate_limit_retry * 60:
                      writerr(
                          colored(
-                             getSPACER(
-                                 "[ 429 ] URLScan rate limit reached, so waiting for another "
-                                 + str(seconds)
-                                 + " seconds before continuing..."
-                             ),
+                             "URLScan - [ 429 ] Rate limit reached, so waiting for another "
+                             + str(seconds)
+                             + " seconds before continuing...",
                              "yellow",
                          )
                      )
-                     time.sleep(seconds + 1)
+                     # Wait can be interrupted by SIGINT via interrupt_event
+                     interrupt_event.clear()
+                     if interrupt_event.wait(seconds + 1):
+                         # Interrupted by SIGINT
+                         return
                      try:
                          resp = session.get(
                              url,
@@ -2958,7 +3001,7 @@ def getURLScanUrls():
          except Exception as e:
              write(
                  colored(
-                     getSPACER("[ ERR ] Unable to get links from urlscan.io: " + str(e)),
+                     "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
                      "red",
                  )
              )
@@ -2971,18 +3014,14 @@ def getURLScanUrls():
              if resp.status_code == 429:
                  writerr(
                      colored(
-                         getSPACER(
-                             "[ 429 ] URLScan rate limit reached so trying without API Key..."
-                         ),
+                         "URLScan - [ 429 ] Rate limit reached so trying without API Key...",
                          "red",
                      )
                  )
              else:
                  writerr(
                      colored(
-                         getSPACER(
-                             "The URLScan API Key is invalid so trying without API Key..."
-                         ),
+                         "URLScan - [ INFO ] The API Key is invalid so trying without API Key...",
                          "red",
                      )
                  )
@@ -2992,56 +3031,54 @@ def getURLScanUrls():
                  except Exception as e:
                      writerr(
                          colored(
-                             getSPACER("[ ERR ] Unable to get links from urlscan.io: " + str(e)),
+                             "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
                              "red",
                          )
                      )
-                     return
+                     checkResponse = False
 
                  # If the rate limit was reached end now
                  if resp.status_code == 429:
                      writerr(
                          colored(
-                             getSPACER(
-                                 "[ 429 ] URLScan rate limit reached without API Key so unable to get links."
-                             ),
+                             "URLScan - [ 429 ] Rate limit reached without API Key so unable to get links.",
                              "red",
                          )
                      )
-                     return
+                     checkResponse = False
              else:
                  writerr(
                      colored(
-                         getSPACER("[ 429 ] URLScan rate limit reached so unable to get links."),
+                         "URLScan - [ 429 ] Rate limit reached so unable to get links.",
                          "red",
                      )
                  )
-                 return
+                 checkResponse = False
          elif resp.status_code != 200:
              writerr(
                  colored(
-                     getSPACER(
-                         "[ " + str(resp.status_code) + " ] Unable to get links from urlscan.io"
-                     ),
+                     "URLScan - [ "
+                     + str(resp.status_code)
+                     + " ] Unable to get links from urlscan.io",
                      "red",
                  )
              )
-             return
+             checkResponse = False
 
          try:
-             # Get the JSON response
-             jsonResp = json.loads(resp.text.strip())
+             if checkResponse:
+                 # Get the JSON response
+                 jsonResp = json.loads(resp.text.strip())
 
-             # Get the number of results
-             totalUrls = int(jsonResp["total"])
+                 # Get the number of results
+                 totalUrls = int(jsonResp["total"])
          except Exception:
              writerr(
                  colored(
-                     getSPACER("[ ERR ] There was an unexpected response from the URLScan API"),
+                     "URLScan - [ ERR ] There was an unexpected response from the API",
                      "red",
                  )
              )
-             totalUrls = 0
 
          # Carry on if something was found
          if args.check_only and args.mode != "R":
@@ -3049,12 +3086,13 @@ def getURLScanUrls():
              try:
                  hasMore = jsonResp["has_more"]
                  if hasMore:
                      write(
-                         colored("Get URLs from URLScan: ", "cyan")
+                         colored("URLScan - [ INFO ] Get URLs from URLScan: ", "cyan")
                          + colored("UNKNOWN requests", "white")
                      )
                  else:
                      write(
-                         colored("Get URLs from URLScan: ", "cyan") + colored("1 request", "white")
+                         colored("URLScan - [ INFO ] Get URLs from URLScan: ", "cyan")
+                         + colored("1 request", "white")
                      )
              except Exception:
                  pass
@@ -3064,7 +3102,7 @@ def getURLScanUrls():
          # Carry on if something was found
          if int(totalUrls) > 0:
 
-             while not stopSource:
+             while not stopSourceURLScan:
 
                  searchAfter = ""
@@ -3139,7 +3177,7 @@ def getURLScanUrls():
                  if searchAfter != "":
 
                      keepTrying = True
-                     while not stopSource and keepTrying:
+                     while not stopSourceURLScan and keepTrying:
                          keepTrying = False
                          # Get the next page from urlscan.io
                          try:
@@ -3159,9 +3197,8 @@ def getURLScanUrls():
                          except Exception as e:
                              writerr(
                                  colored(
-                                     getSPACER(
-                                         "[ ERR ] Unable to get links from urlscan.io: " + str(e)
-                                     ),
+                                     "URLScan - [ ERR ] Unable to get links from urlscan.io: "
+                                     + str(e),
                                      "red",
                                  )
                              )
@@ -3180,56 +3217,53 @@ def getURLScanUrls():
                              if seconds <= args.urlscan_rate_limit_retry * 60:
                                  writerr(
                                      colored(
-                                         getSPACER(
-                                             "[ 429 ] URLScan rate limit reached, so waiting for another "
-                                             + str(seconds)
-                                             + " seconds before continuing..."
-                                         ),
+                                         "URLScan - [ 429 ] Rate limit reached, so waiting for another "
+                                         + str(seconds)
+                                         + " seconds before continuing...",
                                          "yellow",
                                      )
                                  )
-                                 time.sleep(seconds + 1)
+                                 # Wait can be interrupted by SIGINT via interrupt_event
+                                 interrupt_event.clear()
+                                 if interrupt_event.wait(seconds + 1):
+                                     # Interrupted by SIGINT
+                                     keepTrying = False
+                                     break
                                  keepTrying = True
                                  continue
                              else:
                                  writerr(
                                      colored(
-                                         getSPACER(
-                                             "[ 429 ] URLScan rate limit reached (waiting time of "
-                                             + str(seconds)
-                                             + "), so stopping. Links that have already been retrieved will be saved."
-                                         ),
+                                         "URLScan - [ 429 ] Rate limit reached (waiting time of "
+                                         + str(seconds)
+                                         + "), so stopping. Links that have already been retrieved will be saved.",
                                          "red",
                                      )
                                  )
-                                 stopSource = True
+                                 stopSourceURLScan = True
                                  pass
                          else:
                              writerr(
                                  colored(
-                                     getSPACER(
-                                         "[ 429 ] URLScan rate limit reached, so stopping. Links that have already been retrieved will be saved."
-                                     ),
+                                     "URLScan - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
                                      "red",
                                  )
                              )
-                             stopSource = True
+                             stopSourceURLScan = True
                              pass
                      elif resp.status_code != 200:
                          writerr(
                              colored(
-                                 getSPACER(
-                                     "[ "
-                                     + str(resp.status_code)
-                                     + " ] Unable to get links from urlscan.io"
-                                 ),
+                                 "URLScan - [ "
+                                 + str(resp.status_code)
+                                 + " ] Unable to get links from urlscan.io",
                                  "red",
                              )
                          )
-                         stopSource = True
+                         stopSourceURLScan = True
                          pass
 
-                     if not stopSource:
+                     if not stopSourceURLScan:
                          # Get the JSON response
                          jsonResp = json.loads(resp.text.strip())
 
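The 429 handling above swaps `time.sleep()` for an interruptible wait inside the retry loop, so Ctrl-C takes effect mid-backoff. Reduced to its control flow (with `fetch_page` and `retry_seconds` as hypothetical stand-ins for the real request and computed wait):

    keepTrying = True
    while keepTrying:
        keepTrying = False
        resp = fetch_page()                    # hypothetical request helper
        if resp.status_code == 429:
            interrupt_event.clear()
            if interrupt_event.wait(retry_seconds):
                break                          # SIGINT set the event: stop retrying
            keepTrying = True                  # timeout elapsed normally: retry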
@@ -3244,36 +3278,25 @@ def getURLScanUrls():
                              and requestsMade > args.limit
                          )
                      ):
-                         stopSource = True
+                         stopSourceURLScan = True
 
          # Show the MIME types found (in case user wants to exclude more)
          if verbose() and len(linkMimes) > 0 and args.mode != "R":
              linkMimes.discard("warc/revisit")
              write(
-                 getSPACER(
-                     colored("MIME types found: ", "magenta") + colored(str(linkMimes), "white")
-                 )
+                 colored("URLScan - [ INFO ] MIME types found: ", "magenta")
+                 + colored(str(linkMimes), "white")
                  + "\n"
              )
 
-         linkCount = len(linksFound) - originalLinkCount
          if args.mode != "R":
-             if args.xwm and args.xcc and args.xav:
-                 write(
-                     getSPACER(
-                         colored("Links found on urlscan.io: ", "cyan")
-                         + colored(str(linkCount), "white")
-                     )
-                     + "\n"
-                 )
-             else:
-                 write(
-                     getSPACER(
-                         colored("Extra links found on urlscan.io: ", "cyan")
-                         + colored(str(linkCount), "white")
-                     )
-                     + "\n"
-                 )
+             linkCountURLScan = len(linksFoundURLScan)
+             write(
+                 colored("URLScan - [ INFO ] Links found on urlscan.io: ", "cyan")
+                 + colored(str(linkCountURLScan), "white")
+             )
+             linksFound.update(linksFoundURLScan)
+             linksFoundURLScan.clear()
 
      except Exception as e:
          writerr(colored("ERROR getURLScanUrls 1: " + str(e), "red"))
@@ -3283,12 +3306,11 @@ def processWayBackPage(url):
      """
      Get URLs from a specific page of archive.org CDX API for the input domain
      """
-     global totalPages, linkMimes, linksFound, stopSource
+     global totalPages, linkMimes, linksFound, stopSourceWayback, linkCountWayback, linksFoundWayback, current_response, current_session
      try:
          # Get memory in case it exceeds threshold
          getMemory()
-
-         if not stopSource:
+         if not stopSourceWayback:
              try:
                  # Choose a random user agent string to use for any requests
                  resp = None
@@ -3297,210 +3319,231 @@ def processWayBackPage(url):
                  session = requests.Session()
                  session.mount("https://", HTTP_ADAPTER)
                  session.mount("http://", HTTP_ADAPTER)
-                 resp = session.get(url, headers={"User-Agent": userAgent})
+                 # expose session so SIGINT handler can close it to interrupt blocking network I/O
+                 try:
+                     current_session = session
+                 except Exception:
+                     pass
+
+                 resp = session.get(
+                     url, headers={"User-Agent": userAgent}, stream=True, timeout=args.timeout
+                 )
+                 # expose live response so SIGINT handler can close it to interrupt blocking I/O
+                 try:
+                     current_response = resp
+                 except Exception:
+                     pass
-             except ConnectionError:
-                 writerr(
-                     colored(
-                         getSPACER(
-                             "[ ERR ] Wayback Machine (archive.org) connection error for page "
-                             + page
-                         ),
-                         "red",
-                     )
-                 )
-                 resp = None
-                 return
-             except Exception as e:
-                 writerr(
-                     colored(
-                         getSPACER(
-                             "[ ERR ] Error getting response for page " + page + " - " + str(e)
-                         ),
-                         "red",
-                     )
-                 )
-                 resp = None
-                 return
-             finally:
-                 try:
-                     if resp is not None:
-                         # If a status of 429, then stop processing Wayback Machine
-                         if resp.status_code == 429:
-                             if args.wayback_rate_limit_retry > 0:
-                                 seconds = args.wayback_rate_limit_retry * 60
-                                 if args.processes == 1:
-                                     writerr(
-                                         colored(
-                                             "\r[ 429 ] Wayback Machine (archive.org) rate limit reached on page "
-                                             + str(page)
-                                             + " of "
-                                             + str(totalPages)
-                                             + ", so waiting for "
-                                             + str(seconds)
-                                             + " seconds before continuing...\r",
-                                             "yellow",
-                                         )
-                                     )
-                                 else:
-                                     writerr(
-                                         colored(
-                                             "\r[ 429 ] Wayback Machine (archive.org) rate limit reached, so waiting for "
-                                             + str(seconds)
-                                             + " seconds before continuing...\r",
-                                             "yellow",
-                                         )
-                                     )
-                                 time.sleep(seconds)
-                                 try:
-                                     resp = session.get(url, headers={"User-Agent": userAgent})
-                                 except ConnectionError:
-                                     writerr(
-                                         colored(
-                                             getSPACER(
-                                                 "[ ERR ] Wayback Machine (archive.org) connection error for page "
-                                                 + page
-                                             ),
-                                             "red",
-                                         )
-                                     )
-                                     resp = None
-                                     return
-                                 except Exception as e:
-                                     writerr(
-                                         colored(
-                                             getSPACER(
-                                                 "[ ERR ] Error getting response for page "
-                                                 + page
-                                                 + " - "
-                                                 + str(e)
-                                             ),
-                                             "red",
-                                         )
-                                     )
-                                     resp = None
-                                     return
-
-                         if resp.status_code == 429:
-                             writerr(
-                                 colored(
-                                     getSPACER(
-                                         "[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
-                                     ),
-                                     "red",
-                                 )
-                             )
-                             stopSource = True
-                             return
-                         # If a status of 503, then the site is unavailable
-                         if resp.status_code == 503:
-                             writerr(
-                                 colored(
-                                     getSPACER(
-                                         "[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
-                                     ),
-                                     "red",
-                                 )
-                             )
-                             stopSource = True
-                             return
-                         # If the response from archive.org is empty then skip
-                         if resp.text == "" and totalPages == 0:
-                             if verbose():
-                                 writerr(
-                                     colored(
-                                         getSPACER("[ ERR ] " + url + " gave an empty response."),
-                                         "red",
-                                     )
-                                 )
-                             return
-                         # If a status other than 200, then stop
-                         if resp.status_code != 200:
-                             if verbose():
-                                 writerr(
-                                     colored(
-                                         getSPACER(
-                                             "[ " + str(resp.status_code) + " ] Error for " + url
-                                         ),
-                                         "red",
-                                     )
-                                 )
-                             return
-                 except ConnectionError:
-                     writerr(
-                         colored(
-                             getSPACER(
-                                 "[ ERR ] Wayback Machine (archive.org) connection error for page "
-                                 + page
-                             ),
-                             "red",
-                         )
-                     )
-                     resp = None
-                     return
-                 except Exception as e:
-                     writerr(
-                         colored(
-                             getSPACER(
-                                 "[ ERR ] Error getting response for page " + page + " - " + str(e)
-                             ),
-                             "red",
-                         )
-                     )
-                     resp = None
-                     return
+                 # Check response status in the finally block
+                 if resp is not None:
+                     # If a status of 429, then stop processing Wayback Machine
+                     if resp.status_code == 429:
+                         if args.wayback_rate_limit_retry > 0:
+                             seconds = args.wayback_rate_limit_retry * 60
+                             if args.processes == 1:
+                                 writerr(
+                                     colored(
+                                         "Wayback - [ 429 ] Rate limit reached on page "
+                                         + str(page)
+                                         + " of "
+                                         + str(totalPages)
+                                         + ", so waiting for "
+                                         + str(seconds)
+                                         + " seconds before continuing...",
+                                         "yellow",
+                                     )
+                                 )
+                             else:
+                                 writerr(
+                                     colored(
+                                         "Wayback - [ 429 ] Rate limit reached, so waiting for "
+                                         + str(seconds)
+                                         + " seconds before continuing...",
+                                         "yellow",
+                                     )
+                                 )
+                             # Wait can be interrupted by SIGINT via interrupt_event
+                             interrupt_event.clear()
+                             if interrupt_event.wait(seconds):
+                                 return
+                             try:
+                                 resp = session.get(
+                                     url,
+                                     headers={"User-Agent": userAgent},
+                                     stream=True,
+                                     timeout=args.timeout,
+                                 )
+                                 try:
+                                     current_response = resp
+                                 except Exception:
+                                     pass
+                             except ConnectionError:
+                                 writerr(
+                                     colored(
+                                         "Wayback - [ ERR ] Connection error for page " + page,
+                                         "red",
+                                     )
+                                 )
+                                 resp = None
+                                 return
+                             except Exception as e:
+                                 writerr(
+                                     colored(
+                                         "Wayback - [ ERR ] Error getting response for page "
+                                         + page
+                                         + " - "
+                                         + str(e),
+                                         "red",
+                                     )
+                                 )
+                                 resp = None
+                                 return
+
+                     if resp.status_code == 429:
+                         writerr(
+                             colored(
+                                 "Wayback - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
+                                 "red",
+                             )
+                         )
+                         stopSourceWayback = True
+                         return
+                     # If a status of 503, then the site is unavailable
+                     if resp.status_code == 503:
+                         writerr(
+                             colored(
+                                 "Wayback - [ 503 ] The Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.",
+                                 "red",
+                             )
+                         )
+                         stopSourceWayback = True
+                         return
+                     # If a status other than 200, then stop
+                     if resp.status_code != 200:
+                         if verbose():
+                             writerr(
+                                 colored(
+                                     "Wayback - [ " + str(resp.status_code) + " ] Error for " + url,
+                                     "red",
+                                 )
+                             )
+                         try:
+                             current_response = None
+                         except Exception:
+                             pass
+                         try:
+                             current_session = None
+                         except Exception:
+                             pass
+                         return
 
-                 # Get the URLs and MIME types. Each line is a separate JSON string
-                 try:
+                 # Get the URLs and MIME types. Each line is a separate JSON string
+                 # Process lines as they arrive - if connection drops, we keep what we've already processed
                  for line in resp.iter_lines():
-                     results = line.decode("utf-8")
-                     foundUrl = fixArchiveOrgUrl(str(results).split(" ")[1])
+                     try:
+                         results = line.decode("utf-8")
+                         foundUrl = fixArchiveOrgUrl(str(results).split(" ")[1])
 
-                     # If --filter-responses-only wasn't used, then check the URL exclusions
-                     if args.filter_responses_only:
-                         match = None
-                     else:
-                         match = re.search(
-                             r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
-                             foundUrl,
-                             flags=re.IGNORECASE,
-                         )
-                     if match is None:
-                         # Only get MIME Types if --verbose option was selected
-                         if verbose():
+                         # If --filter-responses-only wasn't used, then check the URL exclusions
+                         if args.filter_responses_only:
+                             match = None
+                         else:
+                             match = re.search(
+                                 r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
+                                 foundUrl,
+                                 flags=re.IGNORECASE,
+                             )
+                         if match is None:
+                             # Only get MIME Types if --verbose option was selected
+                             if verbose():
+                                 try:
+                                     mimeType = str(results).split(" ")[2]
+                                     if mimeType != "":
+                                         linkMimes.add(mimeType)
+                                 except Exception:
+                                     if verbose():
+                                         writerr(
+                                             colored(
+                                                 getSPACER(
+                                                     "ERROR processWayBackPage 2: Cannot get MIME type from line: "
+                                                     + str(line)
+                                                 ),
+                                                 "red",
+                                             )
+                                         )
                              try:
-                                 mimeType = str(results).split(" ")[2]
-                                 if mimeType != "":
-                                     linkMimes.add(mimeType)
+                                 linksFoundAdd(foundUrl, linksFoundWayback)
+
                              except Exception:
                                  if verbose():
                                      writerr(
                                          colored(
                                              getSPACER(
-                                                 "ERROR processWayBackPage 2: Cannot get MIME type from line: "
+                                                 "ERROR processWayBackPage 3: Cannot get link from line: "
                                                  + str(line)
                                              ),
                                              "red",
                                          )
                                      )
-                                 write(resp.text)
-                         try:
-                             linksFoundAdd(foundUrl)
-                         except Exception:
-                             if verbose():
-                                 writerr(
-                                     colored(
-                                         getSPACER(
-                                             "ERROR processWayBackPage 3: Cannot get link from line: "
-                                             + str(line)
-                                         ),
-                                         "red",
-                                     )
-                                 )
-                                 write(resp.text)
-                 except Exception:
-                     if verbose():
-                         writerr(colored(getSPACER("ERROR processWayBackPage 4: " + str(line)), "red"))
+                     except Exception:
+                         if verbose():
+                             writerr(
+                                 colored(
+                                     getSPACER("ERROR processWayBackPage 4: " + str(line)), "red"
+                                 )
+                             )
+
+             except ConnectionError:
+                 writerr(
+                     colored(
+                         "Wayback - [ ERR ] Connection error for page "
+                         + page
+                         + (
+                             f" (saved {len(linksFoundWayback)} URLs before error)"
+                             if len(linksFoundWayback) > 0
+                             else ""
+                         ),
+                         "red",
+                     )
+                 )
+                 try:
+                     current_response = None
+                 except Exception:
+                     pass
+                 try:
+                     current_session = None
+                 except Exception:
+                     pass
+                 return
+             except Exception as e:
+                 # Even if connection drops, we've already saved the URLs processed so far
+                 if len(linksFoundWayback) > 0:
+                     writerr(
+                         colored(
+                             f"Wayback - [ WARN ] Error getting response for page {page} - {str(e)} (saved {len(linksFoundWayback)} URLs before error)",
+                             "yellow",
+                         )
+                     )
+                 else:
+                     writerr(
+                         colored(
+                             "Wayback - [ ERR ] Error getting response for page "
+                             + page
+                             + " - "
+                             + str(e),
+                             "red",
+                         )
+                     )
+                 try:
+                     current_response = None
+                 except Exception:
+                     pass
+                 try:
+                     current_session = None
+                 except Exception:
+                     pass
+                 return
          else:
+             print("DEBUG: HERE END!")  # DEBUG
              pass
      except Exception as e:
          if verbose():
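The rewritten page fetch streams the CDX response (`stream=True`) and adds links as each line arrives, so a dropped connection only loses the unread remainder rather than the whole page. The idea in isolation (a minimal sketch; the real code feeds each line to `linksFoundAdd` instead of a list):

    import requests

    def stream_lines(url, timeout=30):
        collected = []
        try:
            resp = requests.get(url, stream=True, timeout=timeout)
            for line in resp.iter_lines():       # lines are processed incrementally
                collected.append(line.decode("utf-8"))
        except requests.RequestException:
            pass                                 # keep whatever was already read
        return collected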
@@ -3511,11 +3554,12 @@ def getWaybackUrls():
      """
      Get URLs from the Wayback Machine, archive.org
      """
-     global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages, stopSource, argsInput, checkWayback
+     global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages, stopSourceWayback, argsInput, checkWayback, linkCountWayback, linksFoundWayback
 
      # Write the file of URL's for the passed domain/URL
      try:
-         stopSource = False
+         stopSourceWayback = False
+         linksFoundWayback = set()
 
          if MATCH_MIME != "":
              filterMIME = "&filter=mimetype:" + re.escape(MATCH_MIME).replace(",", "|")
@@ -3577,7 +3621,7 @@ def getWaybackUrls():
          if not args.check_only:
              write(
                  colored(
-                     "\rGetting the number of Wayback Machine (archive.org) pages to search...\r",
+                     "Wayback - [ INFO ] Getting the number of pages to search...",
                      "cyan",
                  )
              )
@@ -3602,9 +3646,7 @@ def getWaybackUrls():
          if resp.status_code == 429:
              writerr(
                  colored(
-                     getSPACER(
-                         "[ 429 ] Wayback Machine (Archive.org) rate limit reached so unable to get links."
-                     ),
+                     "Wayback - [ 429 ] Rate limit reached so unable to get links.",
                      "red",
                  )
              )
@@ -3614,9 +3656,7 @@ def getWaybackUrls():
          if resp.status_code == 503:
              writerr(
                  colored(
-                     getSPACER(
-                         "[ 503 ] Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
-                     ),
+                     "Wayback - [ 503 ] The Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.",
                      "red",
                  )
              )
@@ -3625,19 +3665,15 @@ def getWaybackUrls():
          if resp.text.lower().find("blocked site error") > 0:
              writerr(
                  colored(
-                     getSPACER(
-                         "[ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)"
-                     ),
+                     "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)",
                      "red",
                  )
              )
          else:
              writerr(
                  colored(
-                     getSPACER(
-                         "[ ERR ] Unable to get links from Wayback Machine (archive.org): "
-                         + str(resp.text.strip())
-                     ),
+                     "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): "
+                     + str(resp.text.strip()),
                      "red",
                  )
              )
@@ -3645,28 +3681,22 @@ def getWaybackUrls():
          if str(e).lower().find("alert access denied"):
              writerr(
                  colored(
-                     getSPACER(
-                         "[ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don't know, but it has happened!)"
-                     ),
+                     "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don't know, but it has happened!)",
                      "red",
                  )
              )
          elif str(e).lower().find("connection refused"):
              writerr(
                  colored(
-                     getSPACER(
-                         "[ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)"
-                     ),
+                     "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)",
                      "red",
                  )
              )
          else:
              writerr(
                  colored(
-                     getSPACER(
-                         "[ ERR ] Unable to get links from Wayback Machine (archive.org): "
-                         + str(e)
-                     ),
+                     "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): "
+                     + str(e),
                      "red",
                  )
              )
@@ -3676,27 +3706,29 @@ def getWaybackUrls():
          if totalPages < 0:
              write(
                  colored(
-                     "Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.",
+                     "Wayback - [ INFO ] Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.",
                      "cyan",
                  )
              )
          else:
              checkWayback = totalPages
              write(
-                 colored("Get URLs from Wayback Machine: ", "cyan")
+                 colored("Wayback - [ INFO ] Get URLs from Wayback Machine: ", "cyan")
                  + colored(str(checkWayback) + " requests", "white")
              )
      else:
          if verbose():
              write(
-                 colored("The archive URL requested to get links: ", "magenta")
+                 colored(
+                     "Wayback - [ INFO ] The archive URL requested to get links: ", "magenta"
+                 )
                  + colored(url + "\n", "white")
              )
 
          if totalPages < 0:
              write(
                  colored(
-                     "\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r",
+                     "Wayback - [ INFO ] Getting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...",
                      "cyan",
                  )
              )
@@ -3706,9 +3738,9 @@ def getWaybackUrls():
              # if the page number was found then display it, but otherwise we will just try to increment until we have everything
              write(
                  colored(
-                     "\rGetting links from "
+                     "Wayback - [ INFO ] Getting links from "
                      + str(totalPages)
-                     + " Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r",
+                     + " Wayback Machine (archive.org) API requests (this can take a while for some domains)...",
                      "cyan",
                  )
              )
@@ -3732,22 +3764,22 @@ def getWaybackUrls():
          if verbose() and len(linkMimes) > 0:
              linkMimes.discard("warc/revisit")
              write(
-                 getSPACER(
-                     colored("MIME types found: ", "magenta") + colored(str(linkMimes), "white")
-                 )
+                 colored("Wayback - [ INFO ] MIME types found: ", "magenta")
+                 + colored(str(linkMimes), "white")
                  + "\n"
              )
              linkMimes = None
 
          if not args.xwm:
-             linkCount = len(linksFound)
+             linkCountWayback = len(linksFoundWayback)
              write(
-                 getSPACER(
-                     colored("Links found on Wayback Machine (archive.org): ", "cyan")
-                     + colored(str(linkCount), "white")
+                 colored(
+                     "Wayback - [ INFO ] Links found on Wayback Machine (archive.org): ", "cyan"
                  )
-                 + "\n"
+                 + colored(str(linkCountWayback), "white")
              )
+             linksFound.update(linksFoundWayback)
+             linksFoundWayback.clear()
 
      except Exception as e:
          writerr(colored("ERROR getWaybackUrls 1: " + str(e), "red"))
@@ -3757,13 +3789,13 @@ def processCommonCrawlCollection(cdxApiUrl):
      """
      Get URLs from a given Common Crawl index collection
      """
-     global subs, path, linksFound, linkMimes, stopSource, argsInput
+     global subs, path, linksFound, linkMimes, stopSourceCommonCrawl, argsInput, linkCountCommonCrawl, linksFoundCommonCrawl, current_response, current_session
 
      try:
          # Get memory in case it exceeds threshold
          getMemory()
 
-         if not stopSource:
+         if not stopSourceCommonCrawl:
              # Set mime content type filter
              if MATCH_MIME.strip() != "":
                  filterMIME = "&filter=~mime:("
@@ -3812,18 +3844,26 @@ def processCommonCrawlCollection(cdxApiUrl):
3812
3844
  session = requests.Session()
3813
3845
  session.mount("https://", HTTP_ADAPTER_CC)
3814
3846
  session.mount("http://", HTTP_ADAPTER_CC)
3847
+ try:
3848
+ current_session = session
3849
+ except Exception:
3850
+ pass
3815
3851
  resp = session.get(url, stream=True, headers={"User-Agent": userAgent})
3852
+ try:
3853
+ current_response = resp
3854
+ except Exception:
3855
+ pass
3816
3856
  except ConnectionError:
3817
3857
  writerr(
3818
3858
  colored(
3819
- getSPACER("[ ERR ] Common Crawl connection error for index " + cdxApiUrl),
3859
+ "CommonCrawl - [ ERR ] Connection error for index " + cdxApiUrl,
3820
3860
  "red",
3821
3861
  )
3822
3862
  )
3823
3863
  resp = None
3824
3864
  return
3825
3865
  except Exception as e:
3826
- writerr(colored(getSPACER("[ ERR ] Error getting response - " + str(e)), "red"))
3866
+ writerr(colored("CommonCrawl - [ ERR ] Error getting response - " + str(e), "red"))
3827
3867
  resp = None
3828
3868
  return
3829
3869
  finally:
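
Storing the live session and streaming response in the current_session / current_response globals gives a signal handler something it can actually close: closing a streaming requests response makes a blocked iter_lines() raise, so a Ctrl+C can cut a long download short instead of waiting for it to finish. A sketch of that idea, assuming a handler registered with signal(SIGINT, handler) as elsewhere in the script (the handler body shown is illustrative, not necessarily waymore's exact one):

    from signal import SIGINT, signal

    current_response = None    # set by workers, as in the hunk above
    current_session = None

    def handler(signum, frame):
        # Closing the response/session aborts a blocked iter_lines(),
        # letting the worker fall through to its cleanup code.
        global current_response, current_session
        if current_response is not None:
            current_response.close()
        if current_session is not None:
            current_session.close()

    signal(SIGINT, handler)
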
@@ -3833,13 +3873,11 @@ def processCommonCrawlCollection(cdxApiUrl):
3833
3873
  if resp.status_code == 429:
3834
3874
  writerr(
3835
3875
  colored(
3836
- getSPACER(
3837
- "[ 429 ] Common Crawl rate limit reached, so stopping. Links that have already been retrieved will be saved."
3838
- ),
3876
+ "CommonCrawl - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
3839
3877
  "red",
3840
3878
  )
3841
3879
  )
3842
- stopSource = True
3880
+ stopSourceCommonCrawl = True
3843
3881
  return
3844
3882
  # If the response from commoncrawl.org says nothing was found...
3845
3883
  if resp.text.lower().find("no captures found") > 0:
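
Replacing the single stopSource flag with one flag per provider (stopSourceCommonCrawl here, stopSourceWayback and friends elsewhere) means a 429 from one API only halts that provider's own workers; the other sources carry on. The shape of the per-source guard, reduced to essentials (the processing step is illustrative):

    stopSourceCommonCrawl = False           # one flag per provider

    def processCollection(resp):
        global stopSourceCommonCrawl
        if stopSourceCommonCrawl:           # another worker already hit the limit
            return
        if resp.status_code == 429:         # rate limited: stop this source only
            stopSourceCommonCrawl = True
            return
        # ... parse resp; Wayback/AlienVault/etc. are unaffected either way
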
@@ -3850,7 +3888,7 @@ def processCommonCrawlCollection(cdxApiUrl):
3850
3888
  if verbose():
3851
3889
  writerr(
3852
3890
  colored(
3853
- getSPACER("[ ERR ] " + url + " gave an empty response."),
3891
+ "CommonCrawl - [ ERR ] " + url + " gave an empty response.",
3854
3892
  "red",
3855
3893
  )
3856
3894
  )
@@ -3860,12 +3898,10 @@ def processCommonCrawlCollection(cdxApiUrl):
3860
3898
  if verbose():
3861
3899
  writerr(
3862
3900
  colored(
3863
- getSPACER(
3864
- "[ "
3865
- + str(resp.status_code)
3866
- + " ] Error for "
3867
- + cdxApiUrl
3868
- ),
3901
+ "CommonCrawl - [ "
3902
+ + str(resp.status_code)
3903
+ + " ] Error for "
3904
+ + cdxApiUrl,
3869
3905
  "red",
3870
3906
  )
3871
3907
  )
@@ -3874,63 +3910,71 @@ def processCommonCrawlCollection(cdxApiUrl):
3874
3910
  pass
3875
3911
 
3876
3912
  # Get the URLs and MIME types
3877
- for line in resp.iter_lines():
3878
- results = line.decode("utf-8")
3879
- try:
3880
- data = json.loads(results)
3881
- # Get MIME Types if --verbose option was seletced
3882
- if verbose():
3883
- try:
3884
- if data["mime"] != "":
3885
- linkMimes.add(data["mime"])
3886
- except Exception:
3887
- pass
3888
- # If -from or -to were passed, check the timestamp of the URL.
3889
- # Only continue if the URL falls within the date range specified
3890
- if args.from_date is not None or args.to_date is not None:
3891
- try:
3892
- ts = data["timestamp"]
3893
-
3894
- # Normalize helper: pad/truncate date string to 14 digits (YYYYMMDDhhmmss)
3895
- def normalize_date(d, is_from):
3896
- if d is None:
3897
- return None
3898
- d = d.strip()
3899
- # Pad to 14 digits: from_date pads with 0s, to_date with 9s
3900
- if is_from:
3901
- return (d + "0" * (14 - len(d)))[:14]
3902
- else:
3903
- return (d + "9" * (14 - len(d)))[:14]
3913
+ try:
3914
+ for line in resp.iter_lines():
3915
+ results = line.decode("utf-8")
3916
+ try:
3917
+ data = json.loads(results)
3918
+ # Get MIME Types if --verbose option was selected
3919
+ if verbose():
3920
+ try:
3921
+ if data["mime"] != "":
3922
+ linkMimes.add(data["mime"])
3923
+ except Exception:
3924
+ pass
3925
+ # If -from or -to were passed, check the timestamp of the URL.
3926
+ # Only continue if the URL falls within the date range specified
3927
+ if args.from_date is not None or args.to_date is not None:
3928
+ try:
3929
+ ts = data["timestamp"]
3930
+
3931
+ # Normalize helper: pad/truncate date string to 14 digits (YYYYMMDDhhmmss)
3932
+ def normalize_date(d, is_from):
3933
+ if d is None:
3934
+ return None
3935
+ d = d.strip()
3936
+ # Pad to 14 digits: from_date pads with 0s, to_date with 9s
3937
+ if is_from:
3938
+ return (d + "0" * (14 - len(d)))[:14]
3939
+ else:
3940
+ return (d + "9" * (14 - len(d)))[:14]
3904
3941
 
3905
- from_ts = normalize_date(args.from_date, True)
3906
- to_ts = normalize_date(args.to_date, False)
3942
+ from_ts = normalize_date(args.from_date, True)
3943
+ to_ts = normalize_date(args.to_date, False)
3907
3944
 
3908
- # Compare numerically
3909
- if from_ts and ts < from_ts:
3910
- continue
3911
- if to_ts and ts > to_ts:
3912
- continue
3945
+ # Compare numerically
3946
+ if from_ts and ts < from_ts:
3947
+ continue
3948
+ if to_ts and ts > to_ts:
3949
+ continue
3913
3950
 
3914
- except Exception as e:
3951
+ except Exception as e:
3952
+ writerr(
3953
+ colored(
3954
+ "ERROR processCommonCrawlCollection 3: Cannot get timestamp from line {line}: {str(e)}",
3955
+ "red",
3956
+ )
3957
+ )
3958
+
3959
+ linksFoundAdd(data["url"], linksFoundCommonCrawl)
3960
+ except Exception:
3961
+ if verbose():
3915
3962
  writerr(
3916
3963
  colored(
3917
- getSPACER(
3918
- f"ERROR processCommonCrawlCollection 3: Cannot get timestamp from line {line}: {str(e)}"
3919
- ),
3964
+ "ERROR processCommonCrawlCollection 2: Cannot get URL and MIME type from line: "
3965
+ + str(line),
3920
3966
  "red",
3921
3967
  )
3922
3968
  )
3923
-
3924
- linksFoundAdd(data["url"])
3969
+ finally:
3970
+ try:
3971
+ current_response = None
3925
3972
  except Exception:
3926
- if verbose():
3927
- writerr(
3928
- colored(
3929
- "ERROR processCommonCrawlCollection 2: Cannot get URL and MIME type from line: "
3930
- + str(line),
3931
- "red",
3932
- )
3933
- )
3973
+ pass
3974
+ try:
3975
+ current_session = None
3976
+ except Exception:
3977
+ pass
3934
3978
  else:
3935
3979
  pass
3936
3980
  except Exception as e:
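
The normalize_date helper in the hunk above makes -from/-to values comparable against 14-digit CDX timestamps (YYYYMMDDhhmmss) by padding from-dates with zeros and to-dates with nines, so a bare year or month still bounds the range correctly, and plain string comparison between equal-length digit strings behaves like numeric comparison. A standalone version with worked values:

    def normalize_date(d, is_from):
        """Pad/truncate a date string to 14 digits (YYYYMMDDhhmmss)."""
        if d is None:
            return None
        d = d.strip()
        pad = "0" if is_from else "9"
        return (d + pad * (14 - len(d)))[:14]

    assert normalize_date("2023", True) == "20230000000000"    # start of 2023
    assert normalize_date("2023", False) == "20239999999999"   # end of 2023
    # A capture stamped 20230615093000 then passes the range check:
    ts = "20230615093000"
    assert normalize_date("2023", True) <= ts <= normalize_date("2023", False)
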
@@ -3957,10 +4001,8 @@ def getCommonCrawlIndexes():
3957
4001
  except Exception as e:
3958
4002
  writerr(
3959
4003
  colored(
3960
- getSPACER(
3961
- "[ ERR ] Couldn't delete local version of Common Crawl index file: "
3962
- + str(e)
3963
- ),
4004
+ "CommonCrawl - [ ERR ] Couldn't delete local version of Common Crawl index file: "
4005
+ + str(e),
3964
4006
  "red",
3965
4007
  )
3966
4008
  )
@@ -3978,10 +4020,8 @@ def getCommonCrawlIndexes():
3978
4020
  createFile = True
3979
4021
  writerr(
3980
4022
  colored(
3981
- getSPACER(
3982
- "[ ERR ] Couldn't read local version of Common Crawl index file: "
3983
- + str(e)
3984
- ),
4023
+ "CommonCrawl - [ ERR ] Couldn't read local version of Common Crawl index file: "
4024
+ + str(e),
3985
4025
  "red",
3986
4026
  )
3987
4027
  )
@@ -3998,7 +4038,7 @@ def getCommonCrawlIndexes():
3998
4038
  except ConnectionError:
3999
4039
  writerr(
4000
4040
  colored(
4001
- getSPACER("[ ERR ] Common Crawl connection error getting Index file"),
4041
+ "CommonCrawl - [ ERR ] Connection error getting Index file",
4002
4042
  "red",
4003
4043
  )
4004
4044
  )
@@ -4006,9 +4046,8 @@ def getCommonCrawlIndexes():
4006
4046
  except Exception as e:
4007
4047
  writerr(
4008
4048
  colored(
4009
- getSPACER(
4010
- "[ ERR ] Error getting Common Crawl index collection - " + str(e)
4011
- ),
4049
+ "CommonCrawl - [ ERR ] Error getting Common Crawl index collection - "
4050
+ + str(e),
4012
4051
  "red",
4013
4052
  )
4014
4053
  )
@@ -4018,9 +4057,7 @@ def getCommonCrawlIndexes():
4018
4057
  if indexes.status_code == 429:
4019
4058
  writerr(
4020
4059
  colored(
4021
- getSPACER(
4022
- "[ 429 ] Common Crawl rate limit reached so unable to get links."
4023
- ),
4060
+ "CommonCrawl - [ 429 ] Rate limit reached so unable to get links.",
4024
4061
  "red",
4025
4062
  )
4026
4063
  )
@@ -4029,7 +4066,7 @@ def getCommonCrawlIndexes():
4029
4066
  elif indexes.status_code == 503:
4030
4067
  writerr(
4031
4068
  colored(
4032
- getSPACER("[ 503 ] Common Crawl seems to be unavailable."),
4069
+ "CommonCrawl - [ 503 ] Common Crawl seems to be unavailable.",
4033
4070
  "red",
4034
4071
  )
4035
4072
  )
@@ -4037,11 +4074,9 @@ def getCommonCrawlIndexes():
4037
4074
  elif indexes.status_code != 200:
4038
4075
  writerr(
4039
4076
  colored(
4040
- getSPACER(
4041
- "[ "
4042
- + str(indexes.status_code)
4043
- + " ] Common Crawl did not retrun the indexes file."
4044
- ),
4077
+ "CommonCrawl - [ "
4078
+ + str(indexes.status_code)
4079
+ + " ] Common Crawl did not retrun the indexes file.",
4045
4080
  "red",
4046
4081
  )
4047
4082
  )
@@ -4058,10 +4093,8 @@ def getCommonCrawlIndexes():
4058
4093
  except Exception as e:
4059
4094
  writerr(
4060
4095
  colored(
4061
- getSPACER(
4062
- "[ ERR ] Couldn't create local version of Common Crawl index file: "
4063
- + str(e)
4064
- ),
4096
+ "CommonCrawl - [ ERR ] Couldn't create local version of Common Crawl index file: "
4097
+ + str(e),
4065
4098
  "red",
4066
4099
  )
4067
4100
  )
@@ -4094,12 +4127,10 @@ def getCommonCrawlIndexes():
4094
4127
  except Exception as e:
4095
4128
  writerr(
4096
4129
  colored(
4097
- getSPACER(
4098
- "[ ERR ] Failed to get the year from index name "
4099
- + values[key]
4100
- + " - "
4101
- + str(e)
4102
- ),
4130
+ "CommonCrawl - [ ERR ] Failed to get the year from index name "
4131
+ + values[key]
4132
+ + " - "
4133
+ + str(e),
4103
4134
  "red",
4104
4135
  )
4105
4136
  )
@@ -4121,12 +4152,11 @@ def getCommonCrawlUrls():
4121
4152
  """
4122
4153
  Get all Common Crawl index collections to get all URLs from each one
4123
4154
  """
4124
- global linksFound, linkMimes, waymorePath, subs, path, stopSource, argsInput, checkCommonCrawl
4155
+ global linksFound, linkMimes, waymorePath, subs, path, stopSourceCommonCrawl, argsInput, checkCommonCrawl, linkCountCommonCrawl, linksFoundCommonCrawl
4125
4156
 
4126
4157
  try:
4127
- stopSource = False
4128
- linkMimes = set()
4129
- originalLinkCount = len(linksFound)
4158
+ stopSourceCommonCrawl = False
4159
+ linksFoundCommonCrawl = set()
4130
4160
 
4131
4161
  # Set mime content type filter
4132
4162
  if MATCH_MIME.strip() != "":
@@ -4164,7 +4194,7 @@ def getCommonCrawlUrls():
4164
4194
  )
4165
4195
  write(
4166
4196
  colored(
4167
- "The commoncrawl index URL requested to get links (where {CDX-API-URL} is from "
4197
+ "CommonCrawl - [ INFO ] The index URL requested to get links (where {CDX-API-URL} is from "
4168
4198
  + CCRAWL_INDEX_URL
4169
4199
  + "): ",
4170
4200
  "magenta",
@@ -4173,7 +4203,7 @@ def getCommonCrawlUrls():
4173
4203
  )
4174
4204
 
4175
4205
  if not args.check_only:
4176
- write(colored("\rGetting commoncrawl.org index collections list...\r", "cyan"))
4206
+ write(colored("CommonCrawl - [ INFO ] Getting index collections list...", "cyan"))
4177
4207
 
4178
4208
  # Get the Common Crawl index collections
4179
4209
  cdxApiUrls = getCommonCrawlIndexes()
@@ -4186,15 +4216,15 @@ def getCommonCrawlUrls():
4186
4216
  else:
4187
4217
  checkCommonCrawl = len(cdxApiUrls) + 1
4188
4218
  write(
4189
- colored("Get URLs from Common Crawl: ", "cyan")
4219
+ colored("CommonCrawl - [ INFO ] Get URLs from Common Crawl: ", "cyan")
4190
4220
  + colored(str(checkCommonCrawl) + " requests", "white")
4191
4221
  )
4192
4222
  else:
4193
4223
  write(
4194
4224
  colored(
4195
- "\rGetting links from the latest "
4225
+ "CommonCrawl - [ INFO ] Getting links from the latest "
4196
4226
  + str(len(cdxApiUrls))
4197
- + " commoncrawl.org index collections (this can take a while for some domains)...\r",
4227
+ + " commoncrawl.org index collections (this can take a while for some domains)...",
4198
4228
  "cyan",
4199
4229
  )
4200
4230
  )
@@ -4210,30 +4240,18 @@ def getCommonCrawlUrls():
4210
4240
  if verbose() and len(linkMimes) > 0:
4211
4241
  linkMimes.discard("warc/revisit")
4212
4242
  write(
4213
- getSPACER(
4214
- colored("MIME types found: ", "magenta")
4215
- + colored(str(linkMimes), "white")
4216
- )
4243
+ colored("CommonCrawl - [ INFO ] MIME types found: ", "magenta")
4244
+ + colored(str(linkMimes), "white")
4217
4245
  + "\n"
4218
4246
  )
4219
4247
 
4220
- linkCount = len(linksFound) - originalLinkCount
4221
- if args.xwm:
4222
- write(
4223
- getSPACER(
4224
- colored("Links found on commoncrawl.org: ", "cyan")
4225
- + colored(str(linkCount), "white")
4226
- )
4227
- + "\n"
4228
- )
4229
- else:
4230
- write(
4231
- getSPACER(
4232
- colored("Extra links found on commoncrawl.org: ", "cyan")
4233
- + colored(str(linkCount), "white")
4234
- )
4235
- + "\n"
4236
- )
4248
+ linkCountCommonCrawl = len(linksFoundCommonCrawl)
4249
+ write(
4250
+ colored("CommonCrawl - [ INFO ] Links found on commoncrawl.org: ", "cyan")
4251
+ + colored(str(linkCountCommonCrawl), "white")
4252
+ )
4253
+ linksFound.update(linksFoundCommonCrawl)
4254
+ linksFoundCommonCrawl.clear()
4237
4255
 
4238
4256
  except Exception as e:
4239
4257
  writerr(colored("ERROR getCommonCrawlUrls 1: " + str(e), "red"))
@@ -4243,7 +4261,7 @@ def processVirusTotalUrl(url):
4243
4261
  """
4244
4262
  Process a specific URL from virustotal.com to determine whether to save the link
4245
4263
  """
4246
- global argsInput, argsInputHostname
4264
+ global argsInput, argsInputHostname, linkCountVirusTotal, linksFoundVirusTotal
4247
4265
 
4248
4266
  addLink = True
4249
4267
 
@@ -4310,7 +4328,7 @@ def processVirusTotalUrl(url):
4310
4328
  flags=re.IGNORECASE,
4311
4329
  )
4312
4330
  if match is not None:
4313
- linksFoundAdd(url)
4331
+ linksFoundAdd(url, linksFoundVirusTotal)
4314
4332
 
4315
4333
  except Exception as e:
4316
4334
  writerr(colored("ERROR processVirusTotalUrl 1: " + str(e), "red"))
@@ -4321,12 +4339,11 @@ def getVirusTotalUrls():
4321
4339
  Get URLs from the VirusTotal API v2 and process them.
4322
4340
  Each URL is normalized as a (url, scan_date) tuple. Dates are filtered according to args.from_date / args.to_date.
4323
4341
  """
4324
- global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkVirusTotal, argsInputHostname
4342
+ global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceVirusTotal, argsInput, checkVirusTotal, argsInputHostname, linkCountVirusTotal, linksFoundVirusTotal
4325
4343
 
4326
4344
  try:
4327
- stopSource = False
4328
- linkMimes = set()
4329
- originalLinkCount = len(linksFound)
4345
+ stopSourceVirusTotal = False
4346
+ linksFoundVirusTotal = set()
4330
4347
 
4331
4348
  # Build the VirusTotal API URL
4332
4349
  url = VIRUSTOTAL_URL.replace("{DOMAIN}", quote(argsInputHostname)).replace(
@@ -4335,12 +4352,12 @@ def getVirusTotalUrls():
4335
4352
 
4336
4353
  if verbose():
4337
4354
  write(
4338
- colored("The VirusTotal URL requested to get links: ", "magenta")
4355
+ colored("VirusTotal - [ INFO ] The URL requested to get links: ", "magenta")
4339
4356
  + colored(url + "\n", "white")
4340
4357
  )
4341
4358
 
4342
4359
  if not args.check_only:
4343
- write(colored("\rGetting links from virustotal.com API...\r", "cyan"))
4360
+ write(colored("VirusTotal - [ INFO ] Getting links from virustotal.com API...", "cyan"))
4344
4361
 
4345
4362
  # Make request
4346
4363
  try:
@@ -4352,7 +4369,7 @@ def getVirusTotalUrls():
4352
4369
  except Exception as e:
4353
4370
  writerr(
4354
4371
  colored(
4355
- getSPACER(f"[ ERR ] Unable to get links from virustotal.com: {e}"),
4372
+ "VirusTotal - [ ERR ] Unable to get links from virustotal.com: " + str(e),
4356
4373
  "red",
4357
4374
  )
4358
4375
  )
@@ -4362,7 +4379,7 @@ def getVirusTotalUrls():
4362
4379
  if resp.status_code == 429:
4363
4380
  writerr(
4364
4381
  colored(
4365
- getSPACER("[ 429 ] VirusTotal rate limit reached so unable to get links."),
4382
+ "VirusTotal - [ 429 ] Rate limit reached so unable to get links.",
4366
4383
  "red",
4367
4384
  )
4368
4385
  )
@@ -4370,9 +4387,7 @@ def getVirusTotalUrls():
4370
4387
  elif resp.status_code == 403:
4371
4388
  writerr(
4372
4389
  colored(
4373
- getSPACER(
4374
- "[ 403 ] VirusTotal: Permission denied. Check your API key is correct."
4375
- ),
4390
+ "VirusTotal - [ 403 ] Permission denied. Check your API key is correct.",
4376
4391
  "red",
4377
4392
  )
4378
4393
  )
@@ -4380,7 +4395,9 @@ def getVirusTotalUrls():
4380
4395
  elif resp.status_code != 200:
4381
4396
  writerr(
4382
4397
  colored(
4383
- getSPACER(f"[ {resp.status_code} ] Unable to get links from virustotal.com"),
4398
+ "VirusTotal - [ ERR ] [ "
4399
+ + str(resp.status_code)
4400
+ + " ] Unable to get links from virustotal.com",
4384
4401
  "red",
4385
4402
  )
4386
4403
  )
@@ -4411,7 +4428,7 @@ def getVirusTotalUrls():
4411
4428
  except Exception as e:
4412
4429
  writerr(
4413
4430
  colored(
4414
- getSPACER("[ ERR ] Unexpected response from the VirusTotal API: " + str(e)),
4431
+ "VirusTotal - [ ERR ] Unexpected response from the VirusTotal API: " + str(e),
4415
4432
  "red",
4416
4433
  )
4417
4434
  )
@@ -4419,12 +4436,15 @@ def getVirusTotalUrls():
4419
4436
 
4420
4437
  # Check only mode
4421
4438
  if args.check_only:
4422
- write(colored("Get URLs from VirusTotal: ", "cyan") + colored("1 request", "white"))
4439
+ write(
4440
+ colored("VirusTotal - [ INFO ] Get URLs from VirusTotal: ", "cyan")
4441
+ + colored("1 request", "white")
4442
+ )
4423
4443
  checkVirusTotal = 1
4424
4444
  else:
4425
4445
  # Process each URL tuple
4426
4446
  for url, scan_date in all_urls:
4427
- if stopSource:
4447
+ if stopSourceVirusTotal:
4428
4448
  break
4429
4449
  getMemory()
4430
4450
 
@@ -4445,24 +4465,14 @@ def getVirusTotalUrls():
4445
4465
  # Process URL
4446
4466
  processVirusTotalUrl(url)
4447
4467
 
4448
- # Count links found
4449
- linkCount = len(linksFound) - originalLinkCount
4450
- if args.xwm and args.xcc and args.xav and args.xus:
4451
- write(
4452
- getSPACER(
4453
- colored("Links found on virustotal.com: ", "cyan")
4454
- + colored(str(linkCount), "white")
4455
- )
4456
- + "\n"
4457
- )
4458
- else:
4459
- write(
4460
- getSPACER(
4461
- colored("Extra links found on virustotal.com: ", "cyan")
4462
- + colored(str(linkCount), "white")
4463
- )
4464
- + "\n"
4465
- )
4468
+ # Show links found
4469
+ linkCountVirusTotal = len(linksFoundVirusTotal)
4470
+ write(
4471
+ colored("VirusTotal - [ INFO ] Links found on virustotal.com: ", "cyan")
4472
+ + colored(str(linkCountVirusTotal), "white")
4473
+ )
4474
+ linksFound.update(linksFoundVirusTotal)
4475
+ linksFoundVirusTotal.clear()
4466
4476
 
4467
4477
  except Exception as e:
4468
4478
  writerr(colored(f"ERROR getVirusTotalUrls: {e}", "red"))
@@ -4472,7 +4482,7 @@ def processIntelxUrl(url):
4472
4482
  """
4473
4483
  Process a specific URL from intelx.io to determine whether to save the link
4474
4484
  """
4475
- global argsInput, argsInputHostname
4485
+ global argsInput, argsInputHostname, linkCountIntelx, linksFoundIntelx
4476
4486
 
4477
4487
  addLink = True
4478
4488
 
@@ -4524,7 +4534,7 @@ def processIntelxUrl(url):
4524
4534
 
4525
4535
  # Add link if it passed filters
4526
4536
  if addLink:
4527
- linksFoundAdd(url)
4537
+ linksFoundAdd(url, linksFoundIntelx)
4528
4538
 
4529
4539
  except Exception as e:
4530
4540
  writerr(colored("ERROR processIntelxUrl 1: " + str(e), "red"))
@@ -4535,6 +4545,7 @@ def processIntelxType(target, credits):
4535
4545
  target: 1 - Domains
4536
4546
  target: 3 - URLs
4537
4547
  """
4548
+ global intelxAPIIssue
4538
4549
  try:
4539
4550
  try:
4540
4551
  requestsMade = 0
@@ -4554,7 +4565,7 @@ def processIntelxType(target, credits):
4554
4565
  except Exception as e:
4555
4566
  write(
4556
4567
  colored(
4557
- getSPACER("[ ERR ] Unable to get links from intelx.io: " + str(e)),
4568
+ "IntelX - [ ERR ] Unable to get links from intelx.io: " + str(e),
4558
4569
  "red",
4559
4570
  )
4560
4571
  )
@@ -4562,49 +4573,47 @@ def processIntelxType(target, credits):
4562
4573
 
4563
4574
  # Deal with any errors
4564
4575
  if resp.status_code == 429:
4576
+ intelxAPIIssue = True
4565
4577
  writerr(
4566
4578
  colored(
4567
- getSPACER("[ 429 ] IntelX rate limit reached so unable to get links."),
4579
+ "IntelX - [ 429 ] Rate limit reached so unable to get links.",
4568
4580
  "red",
4569
4581
  )
4570
4582
  )
4571
4583
  return
4572
4584
  elif resp.status_code == 401:
4585
+ intelxAPIIssue = True
4573
4586
  writerr(
4574
4587
  colored(
4575
- getSPACER(
4576
- "[ 401 ] IntelX: Not authorized. The source requires a paid API key. Check your API key is correct."
4577
- ),
4588
+ "IntelX - [ 401 ] Not authorized. The source requires a paid API key. Check your API key is correct.",
4578
4589
  "red",
4579
4590
  )
4580
4591
  )
4581
4592
  return
4582
4593
  elif resp.status_code == 402:
4594
+ intelxAPIIssue = True
4583
4595
  if credits.startswith("0/"):
4584
4596
  writerr(
4585
4597
  colored(
4586
- getSPACER(
4587
- "[ 402 ] IntelX: You have run out of daily credits on Intelx ("
4588
- + credits
4589
- + ")."
4590
- ),
4598
+ "IntelX - [ 402 ] You have run out of daily credits on Intelx ("
4599
+ + credits
4600
+ + ").",
4591
4601
  "red",
4592
4602
  )
4593
4603
  )
4594
4604
  else:
4595
4605
  writerr(
4596
4606
  colored(
4597
- getSPACER(
4598
- "[ 402 ] IntelX: It appears you have run out of daily credits on Intelx."
4599
- ),
4607
+ "IntelX - [ 402 ] It appears you have run out of daily credits on Intelx.",
4600
4608
  "red",
4601
4609
  )
4602
4610
  )
4603
4611
  return
4604
4612
  elif resp.status_code == 403:
4613
+ intelxAPIIssue = True
4605
4614
  writerr(
4606
4615
  colored(
4607
- getSPACER("[ 403 ] IntelX: Permission denied. Check your API key is correct."),
4616
+ "IntelX - [ 403 ] Permission denied. Check your API key is correct.",
4608
4617
  "red",
4609
4618
  )
4610
4619
  )
@@ -4612,9 +4621,7 @@ def processIntelxType(target, credits):
4612
4621
  elif resp.status_code != 200:
4613
4622
  writerr(
4614
4623
  colored(
4615
- getSPACER(
4616
- "[ " + str(resp.status_code) + " ] Unable to get links from intelx.io"
4617
- ),
4624
+ "IntelX - [ " + str(resp.status_code) + " ] Unable to get links from intelx.io",
4618
4625
  "red",
4619
4626
  )
4620
4627
  )
@@ -4627,7 +4634,7 @@ def processIntelxType(target, credits):
4627
4634
  except Exception:
4628
4635
  writerr(
4629
4636
  colored(
4630
- getSPACER("[ ERR ] There was an unexpected response from the Intelligence API"),
4637
+ "IntelX - [ ERR ] There was an unexpected response from the Intelligence API",
4631
4638
  "red",
4632
4639
  )
4633
4640
  )
@@ -4637,7 +4644,7 @@ def processIntelxType(target, credits):
4637
4644
  moreResults = True
4638
4645
  status = 0
4639
4646
  while moreResults:
4640
- if stopSource:
4647
+ if stopSourceIntelx:
4641
4648
  break
4642
4649
  try:
4643
4650
  resp = session.get(
@@ -4648,7 +4655,7 @@ def processIntelxType(target, credits):
4648
4655
  except Exception as e:
4649
4656
  write(
4650
4657
  colored(
4651
- getSPACER("[ ERR ] Unable to get links from intelx.io: " + str(e)),
4658
+ "IntelX - [ ERR ] Unable to get links from intelx.io: " + str(e),
4652
4659
  "red",
4653
4660
  )
4654
4661
  )
@@ -4661,9 +4668,7 @@ def processIntelxType(target, credits):
4661
4668
  except Exception:
4662
4669
  writerr(
4663
4670
  colored(
4664
- getSPACER(
4665
- "[ ERR ] There was an unexpected response from the Intelligence API"
4666
- ),
4671
+ "IntelX - [ ERR ] There was an unexpected response from the Intelligence API",
4667
4672
  "red",
4668
4673
  )
4669
4674
  )
@@ -4685,7 +4690,7 @@ def processIntelxType(target, credits):
4685
4690
  # Work out whether to include each url
4686
4691
  unique_values = list(set(selector_values + selector_valuesh))
4687
4692
  for ixurl in unique_values:
4688
- if stopSource:
4693
+ if stopSourceIntelx:
4689
4694
  break
4690
4695
  processIntelxUrl(ixurl)
4691
4696
 
@@ -4727,56 +4732,51 @@ def getIntelxUrls():
4727
4732
  """
4728
4733
  Get URLs from the Intelligence X Phonebook search
4729
4734
  """
4730
- global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSource, argsInput, checkIntelx, argsInputHostname
4735
+ global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx
4731
4736
 
4732
4737
  # Write the file of URL's for the passed domain/URL
4733
4738
  try:
4734
4739
  if args.check_only:
4735
4740
  write(
4736
- colored("Get URLs from Intelligence X: ", "cyan")
4741
+ colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
4737
4742
  + colored("minimum 4 requests", "white")
4738
4743
  )
4739
4744
  checkIntelx = 4
4740
4745
  return
4741
4746
 
4742
- stopSource = False
4743
- originalLinkCount = len(linksFound)
4747
+ stopSourceIntelx = False
4748
+ linksFoundIntelx = set()
4749
+
4744
4750
  credits = getIntelxAccountInfo()
4745
4751
  if verbose():
4746
4752
  write(
4747
4753
  colored(
4748
- "The Intelligence X URL requested to get links (Credits: " + credits + "): ",
4754
+ "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
4755
+ + credits
4756
+ + "): ",
4749
4757
  "magenta",
4750
4758
  )
4751
4759
  + colored(INTELX_SEARCH_URL + "\n", "white")
4752
4760
  )
4753
4761
 
4754
4762
  if not args.check_only:
4755
- write(colored("\rGetting links from intelx.io API...\r", "cyan"))
4763
+ write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))
4756
4764
 
4757
4765
  # Get the domains from Intelligence X if the --no-subs wasn't passed
4758
4766
  if not args.no_subs:
4759
4767
  processIntelxType(1, credits)
4760
4768
 
4761
4769
  # Get the URLs from Intelligence X
4762
- processIntelxType(3, credits)
4770
+ if not intelxAPIIssue:
4771
+ processIntelxType(3, credits)
4763
4772
 
4764
- linkCount = len(linksFound) - originalLinkCount
4765
- if args.xwm and args.xcc and args.xav and args.xus and args.xvt:
4766
- write(
4767
- getSPACER(
4768
- colored("Links found on intelx.io: ", "cyan") + colored(str(linkCount), "white")
4769
- )
4770
- + "\n"
4771
- )
4772
- else:
4773
- write(
4774
- getSPACER(
4775
- colored("Extra links found on intelx.io: ", "cyan")
4776
- + colored(str(linkCount), "white")
4777
- )
4778
- + "\n"
4779
- )
4773
+ linkCountIntelx = len(linksFoundIntelx)
4774
+ write(
4775
+ colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
4776
+ + colored(str(linkCountIntelx), "white")
4777
+ )
4778
+ linksFound.update(linksFoundIntelx)
4779
+ linksFoundIntelx.clear()
4780
4780
 
4781
4781
  except Exception as e:
4782
4782
  writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
@@ -4832,14 +4832,16 @@ def processResponsesURLScan():
4832
4832
  writerr(colored("ERROR processResponsesURLScan 4: " + str(e), "red"))
4833
4833
 
4834
4834
  # Get URLs from URLScan.io if the DOM ID's haven't been retrieved yet
4835
- if args.mode == "R" and stopProgram is None and not args.check_only:
4836
- write(
4837
- colored(
4838
- "\rGetting list of response links (this can take a while for some domains)...\r",
4839
- "cyan",
4835
+ if stopProgram is None and not args.check_only:
4836
+ if args.mode in ("R", "B"):
4837
+ write(
4838
+ colored(
4839
+ "URLScan - [ INFO ] Getting list of response links (this can take a while for some domains)...",
4840
+ "cyan",
4841
+ )
4840
4842
  )
4841
- )
4842
- getURLScanUrls()
4843
+ if args.mode == "R":
4844
+ getURLScanUrls()
4843
4845
 
4844
4846
  # Check if a continueResp.URLScan.tmp and responses.URLScan.tmp files exists
4845
4847
  runPrevious = "n"
@@ -4937,25 +4939,6 @@ def processResponsesURLScan():
4937
4939
  "green",
4938
4940
  )
4939
4941
  )
4940
- # if args.limit == 5000 and totalResponses == 5000:
4941
- # writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests (the --limit argument defaults to '+str(DEFAULT_LIMIT)+')','cyan'))
4942
- # else:
4943
- # writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests','white'))
4944
- # minutes = round(totalResponses*2.5 // 60)
4945
- # hours = minutes // 60
4946
- # days = hours // 24
4947
- # if minutes < 5:
4948
- # write(colored('\n-> Downloading the responses (depending on their size) should be quite quick!','green'))
4949
- # elif hours < 2:
4950
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(minutes)+' minutes.','green'))
4951
- # elif hours < 6:
4952
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','green'))
4953
- # elif hours < 24:
4954
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','yellow'))
4955
- # elif days < 7:
4956
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days. Consider using arguments -ko, -l, -ci, -from and -to wisely! ','red'))
4957
- # else:
4958
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days!!! Consider using arguments -ko, -l, -ci, -from and -to wisely!','red'))
4959
4942
  write("")
4960
4943
  else:
4961
4944
  # If the limit has been set over the default, give a warning that this could take a long time!
@@ -5017,7 +5000,7 @@ def processResponsesURLScan():
5017
5000
  if failureCount > 0:
5018
5001
  if verbose():
5019
5002
  write(
5020
- colored("\nURLScan responses saved to ", "cyan")
5003
+ colored("URLScan - [ INFO ] Responses saved to ", "cyan")
5021
5004
  + colored(responseOutputDirectory, "white")
5022
5005
  + colored(" for " + subs + argsInput + ": ", "cyan")
5023
5006
  + colored(
@@ -5032,7 +5015,7 @@ def processResponsesURLScan():
5032
5015
  else:
5033
5016
  write(
5034
5017
  colored(
5035
- "\nURLScan responses saved for " + subs + argsInput + ": ",
5018
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5036
5019
  "cyan",
5037
5020
  )
5038
5021
  + colored(
@@ -5047,7 +5030,10 @@ def processResponsesURLScan():
5047
5030
  else:
5048
5031
  if verbose():
5049
5032
  write(
5050
- colored("\nURLScan responses saved to ", "cyan")
5033
+ colored(
5034
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5035
+ "cyan",
5036
+ )
5051
5037
  + colored(responseOutputDirectory, "white")
5052
5038
  + colored(" for " + subs + argsInput + ": ", "cyan")
5053
5039
  + colored(
@@ -5061,7 +5047,7 @@ def processResponsesURLScan():
5061
5047
  else:
5062
5048
  write(
5063
5049
  colored(
5064
- "\nURLScan responses saved for " + subs + argsInput + ": ",
5050
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5065
5051
  "cyan",
5066
5052
  )
5067
5053
  + colored(
@@ -5087,7 +5073,7 @@ def processResponsesWayback():
5087
5073
  """
5088
5074
  Get archived responses from Wayback Machine (archive.org)
5089
5075
  """
5090
- global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount
5076
+ global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount, current_response, current_session
5091
5077
  try:
5092
5078
  fileCount = 0
5093
5079
  failureCount = 0
@@ -5230,18 +5216,18 @@ def processResponsesWayback():
5230
5216
  if verbose():
5231
5217
  write(
5232
5218
  colored(
5233
- "The Wayback Machine URL requested to get responses: ",
5219
+ "Wayback - [ INFO ] The URL requested to get responses: ",
5234
5220
  "magenta",
5235
5221
  )
5236
5222
  + colored(url + "\n", "white")
5237
5223
  )
5238
5224
 
5239
5225
  if args.check_only:
5240
- write(colored("\rChecking archived response requests...\r", "cyan"))
5226
+ write(colored("Wayback - [ INFO ] Checking archived response requests...", "cyan"))
5241
5227
  else:
5242
5228
  write(
5243
5229
  colored(
5244
- "\rGetting list of response links (this can take a while for some domains)...\r",
5230
+ "Wayback - [ INFO ] Getting list of response links (this can take a while for some domains)...",
5245
5231
  "cyan",
5246
5232
  )
5247
5233
  )
@@ -5254,16 +5240,24 @@ def processResponsesWayback():
5254
5240
  session = requests.Session()
5255
5241
  session.mount("https://", HTTP_ADAPTER)
5256
5242
  session.mount("http://", HTTP_ADAPTER)
5243
+ try:
5244
+ current_session = session
5245
+ except Exception:
5246
+ pass
5257
5247
  resp = session.get(
5258
5248
  url,
5259
5249
  stream=True,
5260
5250
  headers={"User-Agent": userAgent},
5261
5251
  timeout=args.timeout,
5262
5252
  )
5253
+ try:
5254
+ current_response = resp
5255
+ except Exception:
5256
+ pass
5263
5257
  except ConnectionError:
5264
5258
  writerr(
5265
5259
  colored(
5266
- getSPACER("[ ERR ] Wayback Machine (archive.org) connection error"),
5260
+ getSPACER("Wayback - [ ERR ] Connection error"),
5267
5261
  "red",
5268
5262
  )
5269
5263
  )
@@ -5273,7 +5267,7 @@ def processResponsesWayback():
5273
5267
  except Exception as e:
5274
5268
  writerr(
5275
5269
  colored(
5276
- getSPACER("[ ERR ] Couldn't get list of responses: " + str(e)),
5270
+ getSPACER("Wayback - [ ERR ] Couldn't get list of responses: " + str(e)),
5277
5271
  "red",
5278
5272
  )
5279
5273
  )
@@ -5288,7 +5282,7 @@ def processResponsesWayback():
5288
5282
  writerr(
5289
5283
  colored(
5290
5284
  getSPACER(
5291
- "No archived responses were found on Wayback Machine (archive.org) for the given search parameters."
5285
+ "Wayback - [ ERR ] No archived responses were found on Wayback Machine (archive.org) for the given search parameters."
5292
5286
  ),
5293
5287
  "red",
5294
5288
  )
@@ -5299,7 +5293,7 @@ def processResponsesWayback():
5299
5293
  writerr(
5300
5294
  colored(
5301
5295
  getSPACER(
5302
- "[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
5296
+ "Wayback - [ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
5303
5297
  ),
5304
5298
  "red",
5305
5299
  )
@@ -5310,7 +5304,7 @@ def processResponsesWayback():
5310
5304
  writerr(
5311
5305
  colored(
5312
5306
  getSPACER(
5313
- "[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
5307
+ "Wayback - [ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
5314
5308
  ),
5315
5309
  "red",
5316
5310
  )
@@ -5322,7 +5316,10 @@ def processResponsesWayback():
5322
5316
  writerr(
5323
5317
  colored(
5324
5318
  getSPACER(
5325
- "[ " + str(resp.status_code) + " ] Error for " + url
5319
+ "Wayback - [ "
5320
+ + str(resp.status_code)
5321
+ + " ] Error for "
5322
+ + url
5326
5323
  ),
5327
5324
  "red",
5328
5325
  )
@@ -5334,7 +5331,7 @@ def processResponsesWayback():
5334
5331
  writerr(
5335
5332
  colored(
5336
5333
  getSPACER(
5337
- "Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml"
5334
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml"
5338
5335
  ),
5339
5336
  "red",
5340
5337
  )
@@ -5343,7 +5340,7 @@ def processResponsesWayback():
5343
5340
  writerr(
5344
5341
  colored(
5345
5342
  getSPACER(
5346
- "Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed"
5343
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed"
5347
5344
  ),
5348
5345
  "red",
5349
5346
  )
@@ -5353,7 +5350,7 @@ def processResponsesWayback():
5353
5350
  writerr(
5354
5351
  colored(
5355
5352
  getSPACER(
5356
- "Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)"
5353
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)"
5357
5354
  ),
5358
5355
  "red",
5359
5356
  )
@@ -5362,7 +5359,7 @@ def processResponsesWayback():
5362
5359
  writerr(
5363
5360
  colored(
5364
5361
  getSPACER(
5365
- "Failed to get links from Wayback Machine (archive.org) - check input domain and try again."
5362
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - check input domain and try again."
5366
5363
  ),
5367
5364
  "red",
5368
5365
  )
@@ -5372,23 +5369,43 @@ def processResponsesWayback():
5372
5369
  pass
5373
5370
 
5374
5371
  # Go through the response to save the links found
5375
- for line in resp.iter_lines():
5372
+ try:
5373
+ for line in resp.iter_lines():
5374
+ try:
5375
+ results = line.decode("utf-8")
5376
+ parts = results.split(" ", 2)
5377
+ timestamp = parts[0]
5378
+ originalUrl = parts[1]
5379
+ linksFoundResponseAdd(timestamp + "/" + originalUrl)
5380
+ except Exception:
5381
+ writerr(
5382
+ colored(
5383
+ getSPACER(
5384
+ "ERROR processResponsesWayback 3: Cannot to get link from line: "
5385
+ + str(line)
5386
+ ),
5387
+ "red",
5388
+ )
5389
+ )
5390
+ finally:
5376
5391
  try:
5377
- results = line.decode("utf-8")
5378
- parts = results.split(" ", 2)
5379
- timestamp = parts[0]
5380
- originalUrl = parts[1]
5381
- linksFoundResponseAdd(timestamp + "/" + originalUrl)
5392
+ current_response = None
5382
5393
  except Exception:
5383
- writerr(
5384
- colored(
5385
- getSPACER(
5386
- "ERROR processResponsesWayback 3: Cannot to get link from line: "
5387
- + str(line)
5388
- ),
5389
- "red",
5390
- )
5391
- )
5394
+ pass
5395
+ try:
5396
+ current_session = None
5397
+ except Exception:
5398
+ pass
5399
+
5400
+ # Cleanup shared response/session references now the response has been processed
5401
+ try:
5402
+ current_response = None
5403
+ except Exception:
5404
+ pass
5405
+ try:
5406
+ current_session = None
5407
+ except Exception:
5408
+ pass
5392
5409
 
5393
5410
  # Remove any links that have URL exclusions
5394
5411
  linkRequests = []
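
Each line of the Wayback CDX response parsed above is space-separated, starting with the capture timestamp and the original URL; the loop re-joins them as timestamp/URL keys, which map directly onto web.archive.org/web/<timestamp>/<url> response fetches. A worked example with a made-up CDX line:

    line = b"20240101120000 https://example.com/page?x=1 text/html 200 ABC123"
    results = line.decode("utf-8")
    parts = results.split(" ", 2)                # timestamp, URL, rest
    timestamp, originalUrl = parts[0], parts[1]
    key = timestamp + "/" + originalUrl
    assert key == "20240101120000/https://example.com/page?x=1"
    # fetch target: https://web.archive.org/web/20240101120000/https://example.com/page?x=1
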
@@ -5421,7 +5438,7 @@ def processResponsesWayback():
5421
5438
  writerr(
5422
5439
  colored(
5423
5440
  getSPACER(
5424
- 'Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'
5441
+ 'Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'
5425
5442
  + originalUrl
5426
5443
  + "\") but they didn't match the input you gave. Check input and try again."
5427
5444
  ),
@@ -5432,7 +5449,7 @@ def processResponsesWayback():
5432
5449
  writerr(
5433
5450
  colored(
5434
5451
  getSPACER(
5435
- "Failed to get links from Wayback Machine (archive.org) - check input and try again."
5452
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - check input and try again."
5436
5453
  ),
5437
5454
  "red",
5438
5455
  )
@@ -5575,7 +5592,7 @@ def processResponsesWayback():
5575
5592
  if failureCount > 0:
5576
5593
  if verbose():
5577
5594
  write(
5578
- colored("\nWayback responses saved to ", "cyan")
5595
+ colored("Wayback - [ INFO ] Responses saved to ", "cyan")
5579
5596
  + colored(responseOutputDirectory, "white")
5580
5597
  + colored(" for " + subs + argsInput + ": ", "cyan")
5581
5598
  + colored(
@@ -5590,7 +5607,7 @@ def processResponsesWayback():
5590
5607
  else:
5591
5608
  write(
5592
5609
  colored(
5593
- "\nWayback responses saved for " + subs + argsInput + ": ",
5610
+ "Wayback - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5594
5611
  "cyan",
5595
5612
  )
5596
5613
  + colored(
@@ -5605,7 +5622,7 @@ def processResponsesWayback():
5605
5622
  else:
5606
5623
  if verbose():
5607
5624
  write(
5608
- colored("\nWayback responses saved to ", "cyan")
5625
+ colored("Wayback - [ INFO ] Responses saved to ", "cyan")
5609
5626
  + colored(responseOutputDirectory, "white")
5610
5627
  + colored(" for " + subs + argsInput + ": ", "cyan")
5611
5628
  + colored(
@@ -5619,7 +5636,7 @@ def processResponsesWayback():
5619
5636
  else:
5620
5637
  write(
5621
5638
  colored(
5622
- "\nWayback responses saved for " + subs + argsInput + ": ",
5639
+ "Wayback - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5623
5640
  "cyan",
5624
5641
  )
5625
5642
  + colored(
@@ -5933,9 +5950,91 @@ def combineInlineJS():
5933
5950
  writerr(colored("ERROR combineInlineJS 1: " + str(e), "red"))
5934
5951
 
5935
5952
 
5953
+ # Async wrapper functions for concurrent source fetching
5954
+ async def fetch_wayback_async():
5955
+ """Async wrapper for getWaybackUrls - runs in thread pool"""
5956
+ loop = asyncio.get_event_loop()
5957
+ await loop.run_in_executor(None, getWaybackUrls)
5958
+
5959
+
5960
+ async def fetch_commoncrawl_async():
5961
+ """Async wrapper for getCommonCrawlUrls - runs in thread pool"""
5962
+ loop = asyncio.get_event_loop()
5963
+ await loop.run_in_executor(None, getCommonCrawlUrls)
5964
+
5965
+
5966
+ async def fetch_alienvault_async():
5967
+ """Async wrapper for getAlienVaultUrls - runs in thread pool"""
5968
+ loop = asyncio.get_event_loop()
5969
+ await loop.run_in_executor(None, getAlienVaultUrls)
5970
+
5971
+
5972
+ async def fetch_urlscan_async():
5973
+ """Async wrapper for getURLScanUrls - runs in thread pool"""
5974
+ loop = asyncio.get_event_loop()
5975
+ await loop.run_in_executor(None, getURLScanUrls)
5976
+
5977
+
5978
+ async def fetch_virustotal_async():
5979
+ """Async wrapper for getVirusTotalUrls - runs in thread pool"""
5980
+ loop = asyncio.get_event_loop()
5981
+ await loop.run_in_executor(None, getVirusTotalUrls)
5982
+
5983
+
5984
+ async def fetch_intelx_async():
5985
+ """Async wrapper for getIntelxUrls - runs in thread pool"""
5986
+ loop = asyncio.get_event_loop()
5987
+ await loop.run_in_executor(None, getIntelxUrls)
5988
+
5989
+
5990
+ async def fetch_all_sources_async():
5991
+ """
5992
+ Orchestrator function to fetch from all enabled sources concurrently.
5993
+ Each source runs in its own thread pool executor while orchestration happens async.
5994
+ """
5995
+ global args, stopProgram, VIRUSTOTAL_API_KEY, INTELX_API_KEY, argsInput
5996
+
5997
+ tasks = []
5998
+
5999
+ # Build list of tasks for enabled sources
6000
+ if not args.xwm and stopProgram is None:
6001
+ tasks.append(("Wayback Machine", fetch_wayback_async()))
6002
+ if not args.xcc and stopProgram is None:
6003
+ tasks.append(("Common Crawl", fetch_commoncrawl_async()))
6004
+ if not args.xav and stopProgram is None and not argsInput.startswith("."):
6005
+ tasks.append(("AlienVault OTX", fetch_alienvault_async()))
6006
+ if not args.xus and stopProgram is None:
6007
+ tasks.append(("URLScan", fetch_urlscan_async()))
6008
+ if not args.xvt and VIRUSTOTAL_API_KEY != "" and stopProgram is None:
6009
+ tasks.append(("VirusTotal", fetch_virustotal_async()))
6010
+ if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
6011
+ tasks.append(("Intelligence X", fetch_intelx_async()))
6012
+
6013
+ if not tasks:
6014
+ return
6015
+
6016
+ # Extract just the coroutines for gather
6017
+ task_coros = [task[1] for task in tasks]
6018
+
6019
+ # Fetch all concurrently, capturing exceptions so one failure doesn't stop others
6020
+ results = await asyncio.gather(*task_coros, return_exceptions=True)
6021
+
6022
+ # Check for any exceptions that occurred
6023
+ for i, result in enumerate(results):
6024
+ if isinstance(result, Exception):
6025
+ source_name = tasks[i][0]
6026
+ if verbose():
6027
+ writerr(
6028
+ colored(
6029
+ getSPACER(f"ERROR in {source_name} during concurrent fetch: {str(result)}"),
6030
+ "red",
6031
+ )
6032
+ )
6033
+
6034
+
5936
6035
  # Run waymore
5937
6036
  def main():
5938
- global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY
6037
+ global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
5939
6038
 
5940
6039
  # Tell Python to run the handler() function when SIGINT is received
5941
6040
  signal(SIGINT, handler)
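
The wrappers above hand each blocking fetcher to the event loop's default ThreadPoolExecutor, and asyncio.gather(..., return_exceptions=True) runs them side by side while turning any exception into a returned value, so one failing source cannot cancel the rest. The pattern in isolation (time.sleep stands in for a blocking HTTP fetch, and get_running_loop is the modern spelling of the loop lookup used in the wrappers):

    import asyncio
    import time

    def slow_fetch(name, fail=False):
        time.sleep(0.1)                          # stands in for blocking I/O
        if fail:
            raise RuntimeError(name + " hit a rate limit")
        return name

    async def fetch_all():
        loop = asyncio.get_running_loop()
        tasks = [
            loop.run_in_executor(None, slow_fetch, "wayback"),
            loop.run_in_executor(None, slow_fetch, "commoncrawl", True),
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for r in results:
            if isinstance(r, Exception):
                print("source failed:", r)       # the other source still finished
            else:
                print("source done:", r)

    asyncio.run(fetch_all())
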
@@ -6104,6 +6203,7 @@ def main():
6104
6203
  action="store",
6105
6204
  type=int,
6106
6205
  help="Limit the number of Common Crawl index collections searched, e.g. '-lcc 10' will just search the latest 10 collections (default: 1). As of November 2024 there are currently 106 collections. Setting to 0 (default) will search ALL collections. If you don't want to search Common Crawl at all, use the -xcc option.",
6206
+ default=1,
6107
6207
  )
6108
6208
  parser.add_argument(
6109
6209
  "-t",
@@ -6118,10 +6218,10 @@ def main():
6118
6218
  parser.add_argument(
6119
6219
  "-p",
6120
6220
  "--processes",
6121
- help="Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1)",
6221
+ help="Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 2)",
6122
6222
  action="store",
6123
6223
  type=validateArgProcesses,
6124
- default=1,
6224
+ default=2,
6125
6225
  metavar="<integer>",
6126
6226
  )
6127
6227
  parser.add_argument(
@@ -6326,6 +6426,12 @@ def main():
6326
6426
  indexFile = None
6327
6427
  path = ""
6328
6428
  stopSource = False
6429
+ stopSourceWayback = False
6430
+ stopSourceCommonCrawl = False
6431
+ stopSourceAlienVault = False
6432
+ stopSourceURLScan = False
6433
+ stopSourceVirusTotal = False
6434
+ stopSourceIntelx = False
6329
6435
 
6330
6436
  # Get the config settings from the config.yml file
6331
6437
  getConfig()
@@ -6343,29 +6449,17 @@ def main():
6343
6449
  # If the mode is U (URLs retrieved) or B (URLs retrieved AND Responses downloaded)
6344
6450
  if args.mode in ["U", "B"]:
6345
6451
 
6346
- # If not requested to exclude, get URLs from the Wayback Machine (archive.org)
6347
- if not args.xwm and stopProgram is None:
6348
- getWaybackUrls()
6349
-
6350
- # If not requested to exclude, get URLs from commoncrawl.org
6351
- if not args.xcc and stopProgram is None:
6352
- getCommonCrawlUrls()
6353
-
6354
- # If not requested to exclude and a TLD wasn't passed, get URLs from alienvault.com
6355
- if not args.xav and stopProgram is None and not inpt.startswith("."):
6356
- getAlienVaultUrls()
6357
-
6358
- # If not requested to exclude, get URLs from urlscan.io
6359
- if not args.xus and stopProgram is None:
6360
- getURLScanUrls()
6361
-
6362
- # If not requested to exclude, get URLs from virustotal.com if we have an API key
6363
- if not args.xvt and VIRUSTOTAL_API_KEY != "" and stopProgram is None:
6364
- getVirusTotalUrls()
6365
-
6366
- # If not requested to exclude, get URLs from intelx.io if we have an API key
6367
- if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
6368
- getIntelxUrls()
6452
+ # Fetch from all sources concurrently using async/await
6453
+ try:
6454
+ asyncio.run(fetch_all_sources_async())
6455
+ except Exception as e:
6456
+ if verbose():
6457
+ writerr(
6458
+ colored(
6459
+ getSPACER(f"ERROR during concurrent source fetching: {str(e)}"),
6460
+ "red",
6461
+ )
6462
+ )
6369
6463
 
6370
6464
  # Output results of all searches
6371
6465
  processURLOutput()