waymore 6.5-py3-none-any.whl → 7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/waymore.py CHANGED
@@ -4,28 +4,30 @@
4
4
  # Full help here: https://github.com/xnl-h4ck3r/waymore/blob/main/README.md
5
5
  # Good luck and good hunting! If you really love the tool (or any others), or they helped you find an awesome bounty, consider BUYING ME A COFFEE! (https://ko-fi.com/xnlh4ck3r) ☕ (I could use the caffeine!)
6
6
 
7
- from urllib.parse import urlparse
8
- import requests
9
- from requests.exceptions import ConnectionError
10
- from requests.utils import quote
11
- from requests.adapters import HTTPAdapter, Retry
12
7
  import argparse
13
- from signal import SIGINT, signal
8
+ import asyncio
9
+ import enum
10
+ import json
11
+ import math
14
12
  import multiprocessing.dummy as mp
15
- from termcolor import colored
16
- from datetime import datetime, timedelta
17
- from pathlib import Path
18
- import yaml
19
13
  import os
20
- import json
21
- import re
14
+ import pickle
22
15
  import random
16
+ import re
23
17
  import sys
24
- import math
25
- import enum
26
- import pickle
27
- import time
18
+ import threading
19
+ from datetime import datetime, timedelta
20
+ from pathlib import Path
21
+ from signal import SIGINT, signal
22
+ from urllib.parse import urlparse
23
+
24
+ import requests
28
25
  import tldextract
26
+ import yaml
27
+ from requests.adapters import HTTPAdapter, Retry
28
+ from requests.exceptions import ConnectionError
29
+ from requests.utils import quote
30
+ from termcolor import colored
29
31
 
30
32
  try:
31
33
  from . import __version__
@@ -59,6 +61,12 @@ argsInput = ""
59
61
  isInputFile = False
60
62
  stopProgramCount = 0
61
63
  stopSource = False
64
+ stopSourceWayback = False
65
+ stopSourceCommonCrawl = False
66
+ stopSourceAlienVault = False
67
+ stopSourceURLScan = False
68
+ stopSourceVirusTotal = False
69
+ stopSourceIntelx = False
62
70
  successCount = 0
63
71
  failureCount = 0
64
72
  fileCount = 0
@@ -79,6 +87,10 @@ currentMemUsage = 0
79
87
  maxMemoryPercent = 0
80
88
  currentMemPercent = 0
81
89
  process = None
90
+ current_response = None
91
+ current_session = None
92
+ # Event used to interrupt long sleeps (e.g., rate-limit waits) when SIGINT is received
93
+ interrupt_event = threading.Event()
82
94
  HTTP_ADAPTER = None
83
95
  HTTP_ADAPTER_CC = None
84
96
  checkWayback = 0
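Editor's note: the `interrupt_event` added in the hunk above is what lets a Ctrl-C cut a long rate-limit wait short, because `threading.Event.wait()` (unlike a plain `time.sleep()`) returns as soon as the event is set. A minimal standalone sketch of that pattern, with illustrative names rather than the tool's exact code:

```python
import threading

# Shared event; the SIGINT handler calls interrupt_event.set()
interrupt_event = threading.Event()

def wait_or_abort(seconds):
    """Wait up to `seconds`; return True immediately if the event was set (e.g. by Ctrl-C)."""
    interrupt_event.clear()
    # Event.wait() returns True if set() happened before the timeout expired, False otherwise
    return interrupt_event.wait(seconds)

# Worker usage: stop this source early if the wait was interrupted
# if wait_or_abort(60):
#     return
```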
@@ -90,20 +102,28 @@ checkIntelx = 0
90
102
  argsInputHostname = ""
91
103
  responseOutputDirectory = ""
92
104
  urlscanRequestLinks = set()
105
+ intelxAPIIssue = False
106
+ linkCountWayback = 0
107
+ linkCountCommonCrawl = 0
108
+ linkCountAlienVault = 0
109
+ linkCountURLScan = 0
110
+ linkCountVirusTotal = 0
111
+ linkCountIntelx = 0
112
+
113
+ # Thread lock for protecting shared state during concurrent operations
114
+ links_lock = threading.Lock()
115
+
116
+ # Shared state for link collection across all sources
117
+ linksFound = set()
118
+ linkMimes = set()
93
119
 
94
120
  # Source Provider URLs
95
121
  WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
96
122
  CCRAWL_INDEX_URL = "https://index.commoncrawl.org/collinfo.json"
97
- ALIENVAULT_URL = (
98
- "https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500"
99
- )
100
- URLSCAN_URL = (
101
- "https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}{DATERANGE}&size=10000"
102
- )
123
+ ALIENVAULT_URL = "https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500"
124
+ URLSCAN_URL = "https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}{DATERANGE}&size=10000"
103
125
  URLSCAN_DOM_URL = "https://urlscan.io/dom/"
104
- VIRUSTOTAL_URL = (
105
- "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
106
- )
126
+ VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
107
127
  INTELX_SEARCH_URL = "https://2.intelx.io/phonebook/search"
108
128
  INTELX_RESULTS_URL = "https://2.intelx.io/phonebook/search/result?id="
109
129
  INTELX_ACCOUNT_URL = "https://2.intelx.io/authenticate/info"
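Editor's note: `links_lock` and the shared `linksFound`/`linkMimes` sets declared above are the coordination point for the new per-source threads. A minimal sketch of the locking pattern, assuming hypothetical worker results:

```python
import threading

links_lock = threading.Lock()
linksFound = set()

def merge_results(worker_links):
    """Fold one source's links into the shared set without racing other threads."""
    with links_lock:
        linksFound.update(worker_links)

# Each provider thread (Wayback, CommonCrawl, URLScan, ...) calls merge_results(...)
# with its own set of links once they have passed the filters.
```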
@@ -237,8 +257,7 @@ def write(text="", pipe=False):
237
257
  # or if the tool has been piped and the pipe parameter is True
238
258
  # AND if --stream is NOT active OR if it is active but we are explicitly piping (e.g. for URLs)
239
259
  if (sys.stdout.isatty() or (not sys.stdout.isatty() and pipe)) and (
240
- not (args.stream and args.mode == "U")
241
- or (args.stream and args.mode == "U" and pipe)
260
+ not (args.stream and args.mode == "U") or (args.stream and args.mode == "U" and pipe)
242
261
  ):
243
262
  # If it has carriage return in the string, don't add a newline
244
263
  if text.find("\r") > 0:
@@ -274,26 +293,14 @@ def showVersion():
274
293
  timeout=3,
275
294
  )
276
295
  except Exception:
277
- write(
278
- "Current waymore version "
279
- + __version__
280
- + " (unable to check if latest)\n"
281
- )
296
+ write("Current waymore version " + __version__ + " (unable to check if latest)\n")
282
297
  if __version__ == resp.text.split("=")[1].replace('"', "").strip():
283
298
  write(
284
- "Current waymore version "
285
- + __version__
286
- + " ("
287
- + colored("latest", "green")
288
- + ")\n"
299
+ "Current waymore version " + __version__ + " (" + colored("latest", "green") + ")\n"
289
300
  )
290
301
  else:
291
302
  write(
292
- "Current waymore version "
293
- + __version__
294
- + " ("
295
- + colored("outdated", "red")
296
- + ")\n"
303
+ "Current waymore version " + __version__ + " (" + colored("outdated", "red") + ")\n"
297
304
  )
298
305
  except Exception:
299
306
  pass
@@ -307,9 +314,7 @@ def showBanner():
307
314
  write(colored("| | | / ___ | |_| ", "red") + "| | | | |_| | | | |_| |")
308
315
  write(colored(r" \___/\_____|\__ ", "red") + r"|_|_|_|\___/| | | ____/")
309
316
  write(
310
- colored(" (____/ ", "red")
311
- + colored(" by Xnl-h4ck3r ", "magenta")
312
- + r" \_____)"
317
+ colored(" (____/ ", "red") + colored(" by Xnl-h4ck3r ", "magenta") + r" \_____)"
313
318
  )
314
319
  try:
315
320
  currentDate = datetime.now().date()
@@ -322,11 +327,7 @@ def showBanner():
322
327
  )
323
328
  )
324
329
  elif currentDate.month == 10 and currentDate.day == 31:
325
- write(
326
- colored(
327
- " *** 🎃 HAPPY HALLOWEEN! 🎃 ***", "red", attrs=["blink"]
328
- )
329
- )
330
+ write(colored(" *** 🎃 HAPPY HALLOWEEN! 🎃 ***", "red", attrs=["blink"]))
330
331
  elif currentDate.month == 1 and currentDate.day in (1, 2, 3, 4, 5):
331
332
  write(
332
333
  colored(
@@ -353,16 +354,14 @@ def handler(signal_received, frame):
353
354
  This function is called if Ctrl-C is called by the user
354
355
  An attempt will be made to try and clean up properly
355
356
  """
356
- global stopSource, stopProgram, stopProgramCount
357
+ global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
357
358
 
358
359
  if stopProgram is not None:
359
360
  stopProgramCount = stopProgramCount + 1
360
361
  if stopProgramCount == 1:
361
362
  writerr(
362
363
  colored(
363
- getSPACER(
364
- ">>> Please be patient... Trying to save data and end gracefully!"
365
- ),
364
+ getSPACER(">>> Please be patient... Trying to save data and end gracefully!"),
366
365
  "red",
367
366
  )
368
367
  )
@@ -384,17 +383,41 @@ def handler(signal_received, frame):
384
383
  else:
385
384
  stopProgram = StopProgram.SIGINT
386
385
  stopSource = True
386
+ stopSourceWayback = True
387
+ stopSourceCommonCrawl = True
388
+ stopSourceAlienVault = True
389
+ stopSourceURLScan = True
390
+ stopSourceVirusTotal = True
391
+ stopSourceIntelx = True
392
+ # Try to close any active response or session to interrupt blocking network I/O
393
+ try:
394
+ if current_response is not None:
395
+ try:
396
+ current_response.close()
397
+ except Exception:
398
+ pass
399
+ except Exception:
400
+ pass
401
+ try:
402
+ if current_session is not None:
403
+ try:
404
+ current_session.close()
405
+ except Exception:
406
+ pass
407
+ except Exception:
408
+ pass
409
+ # Signal any waits to stop early
410
+ try:
411
+ interrupt_event.set()
412
+ except Exception:
413
+ pass
387
414
  writerr(
388
415
  colored(
389
416
  getSPACER('>>> "Oh my God, they killed Kenny... and waymore!" - Kyle'),
390
417
  "red",
391
418
  )
392
419
  )
393
- writerr(
394
- colored(
395
- getSPACER(">>> Attempting to rescue any data gathered so far..."), "red"
396
- )
397
- )
420
+ writerr(colored(getSPACER(">>> Attempting to rescue any data gathered so far..."), "red"))
398
421
 
399
422
 
400
423
  def showOptions():
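Editor's note: the reworked handler in the hunk above interrupts blocking network I/O by closing whatever response/session is currently live and then waking any waits via `interrupt_event.set()`. A condensed, standalone sketch of wiring up such a handler (hypothetical names, not the full waymore logic):

```python
import threading
from signal import SIGINT, signal

interrupt_event = threading.Event()
current_session = None  # workers assign their requests.Session here before calling .get()

def on_sigint(signum, frame):
    """Ctrl-C: close the live session to break out of a blocking request, then wake sleepers."""
    if current_session is not None:
        try:
            current_session.close()
        except Exception:
            pass
    interrupt_event.set()

# Register the handler for SIGINT
signal(SIGINT, on_sigint)
```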
@@ -479,13 +502,13 @@ def showOptions():
479
502
  )
480
503
 
481
504
  if not args.xcc:
482
- if args.lcc == 0 and args.lcy == 0:
505
+ if args.lcc == 0 and args.from_date is None and args.to_date is None:
483
506
  write(
484
507
  colored("-lcc: " + str(args.lcc), "magenta")
485
508
  + colored(" Search ALL Common Crawl index collections.", "white")
486
509
  )
487
510
  else:
488
- if args.lcy == 0:
511
+ if args.from_date is None and args.to_date is None:
489
512
  write(
490
513
  colored("-lcc: " + str(args.lcc), "magenta")
491
514
  + colored(
@@ -498,19 +521,10 @@ def showOptions():
498
521
  write(
499
522
  colored("-lcc: " + str(args.lcc), "magenta")
500
523
  + colored(
501
- " The number of latest Common Crawl index collections to be searched.",
524
+ " The number of latest Common Crawl index collections to be searched within the specified date range (-to and -from).",
502
525
  "white",
503
526
  )
504
527
  )
505
- write(
506
- colored("-lcy: " + str(args.lcy), "magenta")
507
- + colored(
508
- " Search all Common Crawl index collections with data from year "
509
- + str(args.lcy)
510
- + " and after.",
511
- "white",
512
- )
513
- )
514
528
 
515
529
  if URLSCAN_API_KEY == "":
516
530
  write(
@@ -532,9 +546,7 @@ def showOptions():
532
546
  )
533
547
  )
534
548
  else:
535
- write(
536
- colored("VirusTotal API Key: ", "magenta") + colored(VIRUSTOTAL_API_KEY)
537
- )
549
+ write(colored("VirusTotal API Key: ", "magenta") + colored(VIRUSTOTAL_API_KEY))
538
550
 
539
551
  if INTELX_API_KEY == "":
540
552
  write(
@@ -545,9 +557,7 @@ def showOptions():
545
557
  )
546
558
  )
547
559
  else:
548
- write(
549
- colored("Intelligence X API Key: ", "magenta") + colored(INTELX_API_KEY)
550
- )
560
+ write(colored("Intelligence X API Key: ", "magenta") + colored(INTELX_API_KEY))
551
561
 
552
562
  if args.mode in ["U", "B"]:
553
563
  if args.output_urls != "":
@@ -589,9 +599,7 @@ def showOptions():
589
599
  write(
590
600
  colored("-l: " + str(args.limit), "magenta")
591
601
  + colored(
592
- " Only save the FIRST "
593
- + str(args.limit)
594
- + " responses found.",
602
+ " Only save the FIRST " + str(args.limit) + " responses found.",
595
603
  "white",
596
604
  )
597
605
  )
@@ -599,24 +607,11 @@ def showOptions():
599
607
  write(
600
608
  colored("-l: " + str(args.limit), "magenta")
601
609
  + colored(
602
- " Only save the LAST "
603
- + str(abs(args.limit))
604
- + " responses found.",
610
+ " Only save the LAST " + str(abs(args.limit)) + " responses found.",
605
611
  "white",
606
612
  )
607
613
  )
608
614
 
609
- if args.from_date is not None:
610
- write(
611
- colored("-from: " + str(args.from_date), "magenta")
612
- + colored(" The date/time to get responses from.", "white")
613
- )
614
- if args.to_date is not None:
615
- write(
616
- colored("-to: " + str(args.to_date), "magenta")
617
- + colored(" The date/time to get responses up to.", "white")
618
- )
619
-
620
615
  if args.capture_interval == "h":
621
616
  write(
622
617
  colored("-ci: " + args.capture_interval, "magenta")
@@ -667,6 +662,32 @@ def showOptions():
667
662
  )
668
663
  )
669
664
 
665
+ if args.from_date is not None:
666
+ write(
667
+ colored("-from: " + str(args.from_date), "magenta")
668
+ + colored(
669
+ " The date/time to get data from.",
670
+ "white",
671
+ )
672
+ + colored(
673
+ " NOTE: All results will still be returned from Intelligence X, and all sub domains from Virus Total, because these cannot be filtered by date.",
674
+ "yellow",
675
+ )
676
+ )
677
+
678
+ if args.to_date is not None:
679
+ write(
680
+ colored("-to: " + str(args.to_date), "magenta")
681
+ + colored(
682
+ " The date/time to get data up to.",
683
+ "white",
684
+ )
685
+ + colored(
686
+ " NOTE: All results will still be returned from Intelligence X, and all sub domains from Virus Total, because these cannot be filtered by date.",
687
+ "yellow",
688
+ )
689
+ )
690
+
670
691
  write(
671
692
  colored("-f: " + str(args.filter_responses_only), "magenta")
672
693
  + colored(
@@ -705,9 +726,7 @@ def showOptions():
705
726
  )
706
727
  )
707
728
  if not args.mc and args.fc:
708
- write(
709
- colored("Response Code exclusions: ", "magenta") + colored(FILTER_CODE)
710
- )
729
+ write(colored("Response Code exclusions: ", "magenta") + colored(FILTER_CODE))
711
730
  write(colored("Response URL exclusions: ", "magenta") + colored(FILTER_URL))
712
731
 
713
732
  if args.mt:
@@ -771,14 +790,9 @@ def showOptions():
771
790
  )
772
791
  )
773
792
  else:
774
- write(
775
- colored("Discord Webhook: ", "magenta") + colored(WEBHOOK_DISCORD)
776
- )
793
+ write(colored("Discord Webhook: ", "magenta") + colored(WEBHOOK_DISCORD))
777
794
 
778
- write(
779
- colored("Default Output Directory: ", "magenta")
780
- + colored(str(DEFAULT_OUTPUT_DIR))
781
- )
795
+ write(colored("Default Output Directory: ", "magenta") + colored(str(DEFAULT_OUTPUT_DIR)))
782
796
 
783
797
  if args.regex_after is not None:
784
798
  write(
@@ -799,7 +813,7 @@ def showOptions():
799
813
  if args.mode in ["R", "B"] or (args.mode == "U" and not args.xcc):
800
814
  write(
801
815
  colored("-p: " + str(args.processes), "magenta")
802
- + colored(" The number of parallel requests made.", "white")
816
+ + colored(" The number of parallel requests made per source.", "white")
803
817
  )
804
818
  write(
805
819
  colored("-r: " + str(args.retries), "magenta")
@@ -1084,10 +1098,7 @@ def getConfig():
1084
1098
  if args.notify_discord:
1085
1099
  try:
1086
1100
  WEBHOOK_DISCORD = config.get("WEBHOOK_DISCORD")
1087
- if (
1088
- str(WEBHOOK_DISCORD) == "None"
1089
- or str(WEBHOOK_DISCORD) == "YOUR_WEBHOOK"
1090
- ):
1101
+ if str(WEBHOOK_DISCORD) == "None" or str(WEBHOOK_DISCORD) == "YOUR_WEBHOOK":
1091
1102
  writerr(
1092
1103
  colored(
1093
1104
  'No value for "WEBHOOK_DISCORD" in config.yml - default set',
@@ -1164,9 +1175,7 @@ def getConfig():
1164
1175
  else:
1165
1176
  writerr(
1166
1177
  colored(
1167
- 'WARNING: Cannot find file "'
1168
- + args.config
1169
- + '", so using default values',
1178
+ 'WARNING: Cannot find file "' + args.config + '", so using default values',
1170
1179
  "yellow",
1171
1180
  )
1172
1181
  )
@@ -1238,9 +1247,7 @@ def printProgressBar(
1238
1247
  if not (args.stream and args.mode == "U"):
1239
1248
  try:
1240
1249
  percent = (
1241
- ("{0:." + str(decimals) + "f}")
1242
- .format(100 * (iteration / float(total)))
1243
- .rjust(5)
1250
+ ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total))).rjust(5)
1244
1251
  )
1245
1252
  filledLength = int(length * iteration // total)
1246
1253
  bar = fill * filledLength + "-" * (length - filledLength)
@@ -1297,7 +1304,7 @@ def fixArchiveOrgUrl(url):
1297
1304
 
1298
1305
  # Add a link to the linksFound collection for archived responses (includes timestamp prefix)
1299
1306
  def linksFoundResponseAdd(link):
1300
- global linksFound, argsInput, argsInputHostname
1307
+ global linksFound, argsInput, argsInputHostname, links_lock
1301
1308
 
1302
1309
  try:
1303
1310
  if inputIsDomainANDPath:
@@ -1318,20 +1325,22 @@ def linksFoundResponseAdd(link):
1318
1325
 
1319
1326
  # Don't write it if the link does not contain the requested domain (this can sometimes happen)
1320
1327
  if parsed_url.lower().find(checkInput.lower()) >= 0:
1321
- linksFound.add(link)
1328
+ with links_lock:
1329
+ linksFound.add(link)
1322
1330
  # If streaming is enabled and mode is 'U', print the link to stdout
1323
1331
  if args.stream and args.mode == "U":
1324
1332
  write(link, pipe=True)
1325
1333
  except Exception:
1326
- linksFound.add(link)
1334
+ with links_lock:
1335
+ linksFound.add(link)
1327
1336
  # If streaming is enabled and mode is 'U', print the link to stdout
1328
1337
  if args.stream and args.mode == "U":
1329
1338
  write(link, pipe=True)
1330
1339
 
1331
1340
 
1332
1341
  # Add a link to the linksFound collection
1333
- def linksFoundAdd(link):
1334
- global linksFound, argsInput, argsInputHostname
1342
+ def linksFoundAdd(link, source_set=None):
1343
+ global linksFound, argsInput, argsInputHostname, links_lock
1335
1344
 
1336
1345
  try:
1337
1346
  if inputIsDomainANDPath:
@@ -1349,12 +1358,20 @@ def linksFoundAdd(link):
1349
1358
 
1350
1359
  # Don't write it if the link does not contain the requested domain (this can sometimes happen)
1351
1360
  if parsed_url.find(checkInput) >= 0:
1352
- linksFound.add(link)
1361
+ with links_lock:
1362
+ if source_set is not None:
1363
+ source_set.add(link)
1364
+ else:
1365
+ linksFound.add(link)
1353
1366
  # If streaming is enabled and mode is 'U', print the link to stdout
1354
1367
  if args.stream and args.mode == "U":
1355
1368
  write(link, pipe=True)
1356
1369
  except Exception:
1357
- linksFound.add(link)
1370
+ with links_lock:
1371
+ if source_set is not None:
1372
+ source_set.add(link)
1373
+ else:
1374
+ linksFound.add(link)
1358
1375
  # If streaming is enabled and mode is 'U', print the link to stdout
1359
1376
  if args.stream and args.mode == "U":
1360
1377
  write(link, pipe=True)
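Editor's note: the new optional `source_set` parameter on `linksFoundAdd()` lets each provider collect into its own set (for per-source counts) while the default path still feeds the global collection. A small sketch of that default-argument pattern and the final merge, using stand-in names:

```python
allLinks = set()  # stand-in for the global linksFound

def add_link(link, source_set=None):
    """Add to the per-source set when one is given, otherwise to the global set."""
    target = source_set if source_set is not None else allLinks
    target.add(link)

waybackLinks = set()
add_link("https://example.com/a", waybackLinks)  # counted per source
add_link("https://example.com/b")                # straight to the global set
allLinks.update(waybackLinks)                    # merged once the source finishes
```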
@@ -1394,9 +1411,7 @@ def processArchiveUrl(url):
1394
1411
  )
1395
1412
  archiveHtml = str(resp.text)
1396
1413
  try:
1397
- contentType = (
1398
- resp.headers.get("Content-Type").split(";")[0].lower()
1399
- )
1414
+ contentType = resp.headers.get("Content-Type").split(";")[0].lower()
1400
1415
  except Exception:
1401
1416
  contentType = ""
1402
1417
 
@@ -1407,18 +1422,13 @@ def processArchiveUrl(url):
1407
1422
  # If the FILTER_CODE includes 404, and it doesn't seem to be a custom 404 page
1408
1423
  if "404" not in FILTER_CODE or (
1409
1424
  "404" in FILTER_CODE
1410
- and not re.findall(
1411
- REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE
1412
- )
1425
+ and not re.findall(REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE)
1413
1426
  ):
1414
1427
 
1415
1428
  # Add the URL as a comment at the start of the response
1416
1429
  if args.url_filename:
1417
1430
  archiveHtml = (
1418
- "/* Original URL: "
1419
- + archiveUrl
1420
- + " */\n"
1421
- + archiveHtml
1431
+ "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
1422
1432
  )
1423
1433
 
1424
1434
  # Remove all web archive references in the response
@@ -1565,9 +1575,7 @@ def processArchiveUrl(url):
1565
1575
  # Determine the extension from the content type
1566
1576
  try:
1567
1577
  if contentType != "":
1568
- extension = contentType.split("/")[
1569
- 1
1570
- ].replace("x-", "")
1578
+ extension = contentType.split("/")[1].replace("x-", "")
1571
1579
  if extension == "":
1572
1580
  extension = contentType.lower()
1573
1581
  except Exception:
@@ -1588,15 +1596,11 @@ def processArchiveUrl(url):
1588
1596
  # If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
1589
1597
  if extension == "":
1590
1598
  if (
1591
- archiveHtml.lower()
1592
- .strip()
1593
- .endswith("</html>")
1599
+ archiveHtml.lower().strip().endswith("</html>")
1594
1600
  or archiveHtml.lower()
1595
1601
  .strip()
1596
1602
  .startswith("<!doctype html")
1597
- or archiveHtml.lower()
1598
- .strip()
1599
- .startswith("<html")
1603
+ or archiveHtml.lower().strip().startswith("<html")
1600
1604
  ):
1601
1605
  extension = "html"
1602
1606
  else:
@@ -1626,12 +1630,10 @@ def processArchiveUrl(url):
1626
1630
  except Exception as e:
1627
1631
  writerr(
1628
1632
  colored(
1629
- getSPACER(
1630
- "[ ERR ] Failed to write file "
1631
- + filePath
1632
- + ": "
1633
- + str(e)
1634
- ),
1633
+ "Wayback - [ ERR ] Failed to write file "
1634
+ + filePath
1635
+ + ": "
1636
+ + str(e),
1635
1637
  "red",
1636
1638
  )
1637
1639
  )
@@ -1641,23 +1643,16 @@ def processArchiveUrl(url):
1641
1643
  try:
1642
1644
  timestamp = str(datetime.now())
1643
1645
  indexFile.write(
1644
- hashValue
1645
- + ","
1646
- + archiveUrl
1647
- + " ,"
1648
- + timestamp
1649
- + "\n"
1646
+ hashValue + "," + archiveUrl + " ," + timestamp + "\n"
1650
1647
  )
1651
1648
  indexFile.flush()
1652
1649
  except Exception as e:
1653
1650
  writerr(
1654
1651
  colored(
1655
- getSPACER(
1656
- '[ ERR ] Failed to write to waymore_index.txt for "'
1657
- + archiveUrl
1658
- + '": '
1659
- + str(e)
1660
- ),
1652
+ 'Wayback - [ ERR ] Failed to write to waymore_index.txt for "'
1653
+ + archiveUrl
1654
+ + '": '
1655
+ + str(e),
1661
1656
  "red",
1662
1657
  )
1663
1658
  )
@@ -1668,9 +1663,7 @@ def processArchiveUrl(url):
1668
1663
  debugText = ""
1669
1664
  if archiveHtml.lower().find("archive.org") > 0:
1670
1665
  debugText = "ARCHIVE.ORG"
1671
- elif (
1672
- archiveHtml.lower().find("internet archive") > 0
1673
- ):
1666
+ elif archiveHtml.lower().find("internet archive") > 0:
1674
1667
  debugText = "INTERNET ARCHIVE"
1675
1668
  elif archiveHtml.lower().find("wombat") > 0:
1676
1669
  debugText = "WOMBAT (JS)"
@@ -1697,11 +1690,7 @@ def processArchiveUrl(url):
1697
1690
  if verbose():
1698
1691
  writerr(
1699
1692
  colored(
1700
- getSPACER(
1701
- '[ ERR ] Wayback Machine (archive.org) returned a problem for "'
1702
- + archiveUrl
1703
- + '"'
1704
- ),
1693
+ 'Wayback - [ ERR ] returned a problem for "' + archiveUrl + '"',
1705
1694
  "red",
1706
1695
  )
1707
1696
  )
@@ -1710,11 +1699,7 @@ def processArchiveUrl(url):
1710
1699
  if verbose():
1711
1700
  writerr(
1712
1701
  colored(
1713
- getSPACER(
1714
- '[ ERR ] Wayback Machine (archive.org) connection error for "'
1715
- + archiveUrl
1716
- + '"'
1717
- ),
1702
+ 'Wayback - [ ERR ] connection error for "' + archiveUrl + '"',
1718
1703
  "red",
1719
1704
  )
1720
1705
  )
@@ -1724,25 +1709,21 @@ def processArchiveUrl(url):
1724
1709
  try:
1725
1710
  writerr(
1726
1711
  colored(
1727
- getSPACER(
1728
- "[ "
1729
- + str(resp.status_code)
1730
- + ' ] Failed to get response for "'
1731
- + archiveUrl
1732
- + '"'
1733
- ),
1712
+ "Wayback - [ "
1713
+ + str(resp.status_code)
1714
+ + ' ] Failed to get response for "'
1715
+ + archiveUrl
1716
+ + '"',
1734
1717
  "red",
1735
1718
  )
1736
1719
  )
1737
1720
  except Exception:
1738
1721
  writerr(
1739
1722
  colored(
1740
- getSPACER(
1741
- '[ ERR ] Failed to get response for "'
1742
- + archiveUrl
1743
- + '": '
1744
- + str(e)
1745
- ),
1723
+ 'Wayback - [ ERR ] Failed to get response for "'
1724
+ + archiveUrl
1725
+ + '": '
1726
+ + str(e),
1746
1727
  "red",
1747
1728
  )
1748
1729
  )
@@ -1769,9 +1750,7 @@ def processArchiveUrl(url):
1769
1750
  )
1770
1751
  except Exception:
1771
1752
  if verbose():
1772
- suffix = (
1773
- 'Complete (To show mem use, run "pip install psutil")'
1774
- )
1753
+ suffix = 'Complete (To show mem use, run "pip install psutil")'
1775
1754
  printProgressBar(
1776
1755
  successCount + failureCount,
1777
1756
  totalResponses,
@@ -1796,9 +1775,7 @@ def processArchiveUrl(url):
1796
1775
 
1797
1776
  except Exception as e:
1798
1777
  if verbose():
1799
- writerr(
1800
- colored(getSPACER('Error for "' + url + '": ' + str(e)), "red")
1801
- )
1778
+ writerr(colored('Wayback - [ ERR ] Error for "' + url + '": ' + str(e), "red"))
1802
1779
 
1803
1780
  except Exception as e:
1804
1781
  writerr(colored("ERROR processArchiveUrl 1: " + str(e), "red"))
@@ -1883,7 +1860,7 @@ def processURLOutput():
1883
1860
  linkCount = len(linksFound)
1884
1861
  write(
1885
1862
  getSPACER(
1886
- colored("Links found for " + subs + argsInput + ": ", "cyan")
1863
+ colored("\nTotal unique links found for " + subs + argsInput + ": ", "cyan")
1887
1864
  + colored(str(linkCount) + " 🤘", "white")
1888
1865
  )
1889
1866
  + "\n"
@@ -1926,7 +1903,7 @@ def processURLOutput():
1926
1903
  appendedUrls = False
1927
1904
  if not args.output_overwrite:
1928
1905
  try:
1929
- with open(filename, "r") as existingLinks:
1906
+ with open(filename) as existingLinks:
1930
1907
  for link in existingLinks.readlines():
1931
1908
  linksFound.add(link.strip())
1932
1909
  appendedUrls = True
@@ -1968,16 +1945,10 @@ def processURLOutput():
1968
1945
  writerr(colored("ERROR processURLOutput 3: " + str(e), "red"))
1969
1946
 
1970
1947
  # If there are less links output because of filters, show the new total
1971
- if (
1972
- args.regex_after is not None
1973
- and linkCount > 0
1974
- and outputCount < linkCount
1975
- ):
1948
+ if args.regex_after is not None and linkCount > 0 and outputCount < linkCount:
1976
1949
  write(
1977
1950
  colored(
1978
- 'Links found after applying filter "'
1979
- + args.regex_after
1980
- + '": ',
1951
+ 'Links found after applying filter "' + args.regex_after + '": ',
1981
1952
  "cyan",
1982
1953
  )
1983
1954
  + colored(str(outputCount) + " 🤘\n", "white")
@@ -1992,11 +1963,7 @@ def processURLOutput():
1992
1963
 
1993
1964
  if verbose():
1994
1965
  if outputCount == 0:
1995
- write(
1996
- colored(
1997
- "No links were found so nothing written to file.", "cyan"
1998
- )
1999
- )
1966
+ write(colored("No links were found so nothing written to file.", "cyan"))
2000
1967
  else:
2001
1968
  if appendedUrls:
2002
1969
  write(
@@ -2018,11 +1985,11 @@ def processURLOutput():
2018
1985
  if os.path.exists(filenameOld) and os.path.exists(filename):
2019
1986
 
2020
1987
  # Get all the old links
2021
- with open(filenameOld, "r") as oldFile:
1988
+ with open(filenameOld) as oldFile:
2022
1989
  oldLinks = set(oldFile.readlines())
2023
1990
 
2024
1991
  # Get all the new links
2025
- with open(filename, "r") as newFile:
1992
+ with open(filename) as newFile:
2026
1993
  newLinks = set(newFile.readlines())
2027
1994
 
2028
1995
  # Create a file with most recent new links
@@ -2061,7 +2028,7 @@ def stripUnwanted(url):
2061
2028
  """
2062
2029
  parsed = urlparse(url)
2063
2030
  # Strip scheme
2064
- scheme = "%s://" % parsed.scheme
2031
+ scheme = f"{parsed.scheme}://"
2065
2032
  strippedUrl = parsed.geturl().replace(scheme, "", 1)
2066
2033
  # Strip query string and fragment
2067
2034
  strippedUrl = strippedUrl.split("#")[0].split("?")[0]
@@ -2092,7 +2059,7 @@ def validateArgInput(x):
2092
2059
  if os.path.isfile(x):
2093
2060
  isInputFile = True
2094
2061
  # Open file and put all values in input list
2095
- with open(x, "r") as inputFile:
2062
+ with open(x) as inputFile:
2096
2063
  lines = inputFile.readlines()
2097
2064
  # Check if any lines start with a *. and replace without the *.
2098
2065
  for line in lines:
@@ -2189,9 +2156,7 @@ def validateArgProviders(x):
2189
2156
  x = x.lower()
2190
2157
  providers = x.split(",")
2191
2158
  for provider in providers:
2192
- if not re.fullmatch(
2193
- r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider
2194
- ):
2159
+ if not re.fullmatch(r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider):
2195
2160
  invalid = True
2196
2161
  break
2197
2162
  if invalid:
@@ -2201,16 +2166,32 @@ def validateArgProviders(x):
2201
2166
  return x
2202
2167
 
2203
2168
 
2169
+ def parseDateArg(dateArg):
2170
+ """
2171
+ Parse a date argument from the command line into a datetime object
2172
+ """
2173
+ formats = {
2174
+ 4: "%Y",
2175
+ 6: "%Y%m",
2176
+ 8: "%Y%m%d",
2177
+ 10: "%Y%m%d%H",
2178
+ 12: "%Y%m%d%H%M",
2179
+ 14: "%Y%m%d%H%M%S",
2180
+ }
2181
+ fmt = formats.get(len(dateArg))
2182
+ return datetime.strptime(dateArg, fmt)
2183
+
2184
+
2204
2185
  def processAlienVaultPage(url):
2205
2186
  """
2206
2187
  Get URLs from a specific page of otx.alienvault.org API for the input domain
2207
2188
  """
2208
- global totalPages, linkMimes, linksFound, stopSource, argsInput
2189
+ global totalPages, linkMimes, linksFound, stopSourceAlienVault, argsInput, linkCountAlienVault
2209
2190
  try:
2210
2191
  # Get memory in case it exceeds threshold
2211
2192
  getMemory()
2212
2193
 
2213
- if not stopSource:
2194
+ if not stopSourceAlienVault:
2214
2195
  try:
2215
2196
  # Choose a random user agent string to use for any requests
2216
2197
  userAgent = random.choice(USER_AGENT)
@@ -2222,9 +2203,7 @@ def processAlienVaultPage(url):
2222
2203
  except ConnectionError:
2223
2204
  writerr(
2224
2205
  colored(
2225
- getSPACER(
2226
- "[ ERR ] alienvault.org connection error for page " + page
2227
- ),
2206
+ getSPACER("AlienVault - [ ERR ] Connection error for page " + page),
2228
2207
  "red",
2229
2208
  )
2230
2209
  )
@@ -2233,12 +2212,10 @@ def processAlienVaultPage(url):
2233
2212
  except Exception as e:
2234
2213
  writerr(
2235
2214
  colored(
2236
- getSPACER(
2237
- "[ ERR ] Error getting response for page "
2238
- + page
2239
- + " - "
2240
- + str(e)
2241
- ),
2215
+ "AlienVault -[ ERR ] Error getting response for page "
2216
+ + page
2217
+ + " - "
2218
+ + str(e),
2242
2219
  "red",
2243
2220
  )
2244
2221
  )
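Editor's note: the `parseDateArg()` helper introduced in this hunk keys the `strptime` format off the argument's length, so `-from`/`-to` accept anything from a bare year down to a full timestamp (in the helper itself, an unsupported length makes `formats.get()` return `None`, which `strptime` then rejects). A usage sketch of the same idea:

```python
from datetime import datetime

formats = {
    4: "%Y", 6: "%Y%m", 8: "%Y%m%d",
    10: "%Y%m%d%H", 12: "%Y%m%d%H%M", 14: "%Y%m%d%H%M%S",
}

def parse_date_arg(date_arg):
    """Pick the strptime format from the argument length."""
    return datetime.strptime(date_arg, formats[len(date_arg)])

print(parse_date_arg("2024"))          # 2024-01-01 00:00:00
print(parse_date_arg("20240630"))      # 2024-06-30 00:00:00
print(parse_date_arg("202406301530"))  # 2024-06-30 15:30:00
```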
@@ -2249,26 +2226,21 @@ def processAlienVaultPage(url):
2249
2226
  if resp is not None:
2250
2227
  # If a status of 429 is returned, stop processing Alien Vault
2251
2228
  if resp.status_code == 429:
2252
- writerr(
2253
- colored(
2254
- getSPACER(
2255
- "[ 429 ] Alien Vault rate limit reached, so stopping. Links that have already been retrieved will be saved."
2256
- ),
2257
- "red",
2229
+ if not stopSourceAlienVault: # Only print message once
2230
+ writerr(
2231
+ colored(
2232
+ "AlienVault - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
2233
+ "red",
2234
+ )
2258
2235
  )
2259
- )
2260
- stopSource = True
2236
+ stopSourceAlienVault = True
2261
2237
  return
2262
2238
  # If the response from alienvault.com is empty then skip
2263
2239
  if resp.text == "" and totalPages == 0:
2264
2240
  if verbose():
2265
2241
  writerr(
2266
2242
  colored(
2267
- getSPACER(
2268
- "[ ERR ] "
2269
- + url
2270
- + " gave an empty response."
2271
- ),
2243
+ "AlienVault - [ ERR ] " + url + " gave an empty response.",
2272
2244
  "red",
2273
2245
  )
2274
2246
  )
@@ -2278,12 +2250,10 @@ def processAlienVaultPage(url):
2278
2250
  if verbose():
2279
2251
  writerr(
2280
2252
  colored(
2281
- getSPACER(
2282
- "[ "
2283
- + str(resp.status_code)
2284
- + " ] Error for "
2285
- + url
2286
- ),
2253
+ "AlienVauilt - [ "
2254
+ + str(resp.status_code)
2255
+ + " ] Error for "
2256
+ + url,
2287
2257
  "red",
2288
2258
  )
2289
2259
  )
@@ -2306,6 +2276,7 @@ def processAlienVaultPage(url):
2306
2276
  if foundUrl != "":
2307
2277
  # If filters are not required and subs are wanted then just add the URL to the list
2308
2278
  if args.filter_responses_only and not args.no_subs:
2279
+ linkCountAlienVault = linkCountAlienVault + 1
2309
2280
  linksFoundAdd(foundUrl)
2310
2281
  else:
2311
2282
  addLink = True
@@ -2332,9 +2303,7 @@ def processAlienVaultPage(url):
2332
2303
  # Compare the HTTP code gainst the Code exclusions and matches
2333
2304
  if MATCH_CODE != "":
2334
2305
  match = re.search(
2335
- r"("
2336
- + re.escape(MATCH_CODE).replace(",", "|")
2337
- + ")",
2306
+ r"(" + re.escape(MATCH_CODE).replace(",", "|") + ")",
2338
2307
  httpCode,
2339
2308
  flags=re.IGNORECASE,
2340
2309
  )
@@ -2342,9 +2311,7 @@ def processAlienVaultPage(url):
2342
2311
  addLink = False
2343
2312
  else:
2344
2313
  match = re.search(
2345
- r"("
2346
- + re.escape(FILTER_CODE).replace(",", "|")
2347
- + ")",
2314
+ r"(" + re.escape(FILTER_CODE).replace(",", "|") + ")",
2348
2315
  httpCode,
2349
2316
  flags=re.IGNORECASE,
2350
2317
  )
@@ -2354,9 +2321,7 @@ def processAlienVaultPage(url):
2354
2321
  # Check the URL exclusions
2355
2322
  if addLink:
2356
2323
  match = re.search(
2357
- r"("
2358
- + re.escape(FILTER_URL).replace(",", "|")
2359
- + ")",
2324
+ r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
2360
2325
  foundUrl,
2361
2326
  flags=re.IGNORECASE,
2362
2327
  )
@@ -2367,9 +2332,7 @@ def processAlienVaultPage(url):
2367
2332
  if addLink and args.keywords_only:
2368
2333
  if args.keywords_only == "#CONFIG":
2369
2334
  match = re.search(
2370
- r"("
2371
- + re.escape(FILTER_KEYWORDS).replace(",", "|")
2372
- + ")",
2335
+ r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
2373
2336
  foundUrl,
2374
2337
  flags=re.IGNORECASE,
2375
2338
  )
@@ -2382,9 +2345,39 @@ def processAlienVaultPage(url):
2382
2345
  if match is None:
2383
2346
  addLink = False
2384
2347
 
2348
+ # Check date is in range if required
2349
+ if args.from_date is not None or args.to_date is not None:
2350
+ try:
2351
+ urlDateStr = urlSection["date"]
2352
+
2353
+ # Remove fractional seconds if present
2354
+ urlDateStr = urlDateStr.split(".")[0]
2355
+
2356
+ urlDate = datetime.strptime(urlDateStr, "%Y-%m-%dT%H:%M:%S")
2357
+
2358
+ # If from date passed, check
2359
+ if args.from_date is not None:
2360
+ fromDate = parseDateArg(args.from_date)
2361
+ if urlDate < fromDate:
2362
+ addLink = False
2363
+ # If to date passed, check
2364
+ if args.to_date is not None:
2365
+ toDate = parseDateArg(args.to_date)
2366
+ if urlDate >= toDate:
2367
+ addLink = False
2368
+ except Exception as e:
2369
+ if verbose():
2370
+ writerr(
2371
+ colored(
2372
+ "ERROR processLAlienVaultPage date check: "
2373
+ + str(e),
2374
+ "red",
2375
+ )
2376
+ )
2377
+
2385
2378
  # Add link if it passed filters
2386
2379
  if addLink:
2387
- linksFoundAdd(foundUrl)
2380
+ linksFoundAdd(foundUrl, linksFoundAlienVault)
2388
2381
  else:
2389
2382
  pass
2390
2383
  except Exception as e:
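Editor's note: the date-range check added above parses AlienVault's ISO-8601 `date` field (dropping any fractional seconds) and skips links outside the `-from`/`-to` window. A minimal standalone version of that comparison:

```python
from datetime import datetime

def in_range(url_date_str, from_dt=None, to_dt=None):
    """True if an AlienVault-style timestamp falls inside the requested window."""
    url_date = datetime.strptime(url_date_str.split(".")[0], "%Y-%m-%dT%H:%M:%S")
    if from_dt is not None and url_date < from_dt:
        return False
    if to_dt is not None and url_date >= to_dt:
        return False
    return True

print(in_range("2024-03-01T10:15:00.123456", from_dt=datetime(2024, 1, 1)))  # True
print(in_range("2023-12-31T23:59:59", from_dt=datetime(2024, 1, 1)))         # False
```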
@@ -2396,12 +2389,12 @@ def getAlienVaultUrls():
2396
2389
  """
2397
2390
  Get URLs from the Alien Vault OTX, otx.alienvault.com
2398
2391
  """
2399
- global linksFound, waymorePath, subs, path, stopProgram, totalPages, stopSource, argsInput, checkAlienVault, inputIsSubDomain, argsInputHostname
2392
+ global linksFound, waymorePath, subs, path, stopProgram, totalPages, stopSourceAlienVault, argsInput, checkAlienVault, inputIsSubDomain, argsInputHostname, linkCountAlienVault, linksFoundAlienVault
2400
2393
 
2401
2394
  # Write the file of URL's for the passed domain/URL
2402
2395
  try:
2403
- stopSource = False
2404
- originalLinkCount = len(linksFound)
2396
+ stopSourceAlienVault = False
2397
+ linksFoundAlienVault = set()
2405
2398
 
2406
2399
  # Set the Alien Vault API indicator types of domain or hostname (has subdomain)
2407
2400
  if inputIsSubDomain:
@@ -2418,11 +2411,12 @@ def getAlienVaultUrls():
2418
2411
 
2419
2412
  # Get the number of pages (i.e. separate requests) that are going to be made to alienvault.com
2420
2413
  totalPages = 0
2414
+ resp = None
2421
2415
  try:
2422
2416
  if not args.check_only:
2423
2417
  write(
2424
2418
  colored(
2425
- "\rGetting the number of alienvault.com pages to search...\r",
2419
+ "AlienVault - [ INFO ] Getting the number of alienvault.com pages to search...",
2426
2420
  "cyan",
2427
2421
  )
2428
2422
  )
@@ -2431,43 +2425,39 @@ def getAlienVaultUrls():
2431
2425
  session = requests.Session()
2432
2426
  session.mount("https://", HTTP_ADAPTER)
2433
2427
  session.mount("http://", HTTP_ADAPTER)
2434
- resp = session.get(
2435
- url + "&showNumPages=True", headers={"User-Agent": userAgent}
2436
- )
2428
+ resp = session.get(url + "&showNumPages=True", headers={"User-Agent": userAgent})
2437
2429
  except Exception as e:
2438
2430
  writerr(
2439
2431
  colored(
2440
- getSPACER(
2441
- "[ ERR ] Unable to get links from alienvault.com: " + str(e)
2442
- ),
2432
+ "AlienVault - [ ERR ] Unable to get links from alienvault.com: " + str(e),
2443
2433
  "red",
2444
2434
  )
2445
2435
  )
2446
- return
2436
+ # Don't return - continue to show link count at the end
2447
2437
 
2448
2438
  # If the rate limit was reached end now
2449
- if resp.status_code == 429:
2439
+ if resp is not None and resp.status_code == 429:
2450
2440
  writerr(
2451
2441
  colored(
2452
- getSPACER(
2453
- "[ 429 ] Alien Vault rate limit reached so unable to get links."
2454
- ),
2442
+ "AlienVault - [ 429 ] Rate limit reached so unable to get links.",
2455
2443
  "red",
2456
2444
  )
2457
2445
  )
2458
- return
2446
+ # Don't return - continue to show link count at the end
2459
2447
 
2460
- if verbose():
2448
+ if resp is not None and verbose():
2461
2449
  write(
2462
- getSPACER(
2463
- colored("The Alien Vault URL requested to get links: ", "magenta")
2464
- + colored(url, "white")
2465
- )
2450
+ colored("AlienVault - [ INFO ] The URL requested to get links: ", "magenta")
2451
+ + colored(url, "white")
2466
2452
  + "\n"
2467
2453
  )
2468
2454
 
2469
2455
  # Carry on if something was found
2470
- if resp.text.lower().find('"error": "') < 0:
2456
+ if (
2457
+ resp is not None
2458
+ and resp.status_code != 429
2459
+ and resp.text.lower().find('"error": "') < 0
2460
+ ):
2471
2461
 
2472
2462
  try:
2473
2463
  # Get the JSON response
@@ -2478,9 +2468,7 @@ def getAlienVaultUrls():
2478
2468
  except Exception:
2479
2469
  writerr(
2480
2470
  colored(
2481
- getSPACER(
2482
- "[ ERR ] There was an unexpected response from the Alien Vault API"
2483
- ),
2471
+ "AlienVault - [ ERR ] There was an unexpected response from the API",
2484
2472
  "red",
2485
2473
  )
2486
2474
  )
@@ -2502,16 +2490,16 @@ def getAlienVaultUrls():
2502
2490
  else:
2503
2491
  checkAlienVault = totalPages
2504
2492
  write(
2505
- colored("Get URLs from Alien Vault: ", "cyan")
2493
+ colored("AlienVault - [ INFO ] Getting URLs from Alien Vault: ", "cyan")
2506
2494
  + colored(str(checkAlienVault) + " requests", "white")
2507
2495
  )
2508
2496
  else:
2509
2497
  # if the page number was found then display it, but otherwise we will just try to increment until we have everything
2510
2498
  write(
2511
2499
  colored(
2512
- "\rGetting links from "
2500
+ "AlienVault - [ INFO ] Getting links from "
2513
2501
  + str(totalPages)
2514
- + " alienvault.com API requests (this can take a while for some domains)...\r",
2502
+ + " alienvault.com API requests (this can take a while for some domains)...",
2515
2503
  "cyan",
2516
2504
  )
2517
2505
  )
@@ -2531,32 +2519,19 @@ def getAlienVaultUrls():
2531
2519
  if verbose():
2532
2520
  writerr(
2533
2521
  colored(
2534
- getSPACER(
2535
- "[ ERR ] An error was returned in the alienvault.com response."
2536
- )
2537
- + "\n",
2522
+ "AlienVault - [ ERR ] An error was returned in the response." + "\n",
2538
2523
  "red",
2539
2524
  )
2540
2525
  )
2541
2526
 
2542
2527
  if not args.check_only:
2543
- linkCount = len(linksFound) - originalLinkCount
2544
- if args.xwm and args.xcc:
2545
- write(
2546
- getSPACER(
2547
- colored("Links found on alienvault.com: ", "cyan")
2548
- + colored(str(linkCount), "white")
2549
- )
2550
- + "\n"
2551
- )
2552
- else:
2553
- write(
2554
- getSPACER(
2555
- colored("Extra links found on alienvault.com: ", "cyan")
2556
- + colored(str(linkCount), "white")
2557
- )
2558
- + "\n"
2559
- )
2528
+ linkCountAlienVault = len(linksFoundAlienVault)
2529
+ write(
2530
+ colored("AlienVault - [ INFO ] Links found on alienvault.com: ", "cyan")
2531
+ + colored(str(linkCountAlienVault), "white")
2532
+ )
2533
+ linksFound.update(linksFoundAlienVault)
2534
+ linksFoundAlienVault.clear()
2560
2535
 
2561
2536
  except Exception as e:
2562
2537
  writerr(colored("ERROR getAlienVaultUrls 1: " + str(e), "red"))
@@ -2566,7 +2541,7 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
2566
2541
  """
2567
2542
  Process a specific URL from urlscan.io to determine whether to save the link
2568
2543
  """
2569
- global argsInput, argsInputHostname, urlscanRequestLinks
2544
+ global argsInput, argsInputHostname, urlscanRequestLinks, links_lock, linkCountURLScan, linksFoundURLScan
2570
2545
 
2571
2546
  addLink = True
2572
2547
 
@@ -2629,9 +2604,7 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
2629
2604
  flags=re.IGNORECASE,
2630
2605
  )
2631
2606
  else:
2632
- match = re.search(
2633
- r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE
2634
- )
2607
+ match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
2635
2608
  if match is None:
2636
2609
  addLink = False
2637
2610
 
@@ -2657,7 +2630,8 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
2657
2630
  # Add MIME Types if --verbose option was selected
2658
2631
  if verbose():
2659
2632
  if mimeType.strip() != "":
2660
- linkMimes.add(mimeType)
2633
+ with links_lock:
2634
+ linkMimes.add(mimeType)
2661
2635
 
2662
2636
  # Add link if it passed filters
2663
2637
  if addLink:
@@ -2677,11 +2651,12 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
2677
2651
  )
2678
2652
  if match is not None:
2679
2653
  if args.mode in ("U", "B"):
2680
- linksFoundAdd(url)
2654
+ linksFoundAdd(url, linksFoundURLScan)
2681
2655
  # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
2682
2656
  if urlscanID != "" and args.mode in ("R", "B"):
2683
2657
  if args.limit == 0 or len(urlscanRequestLinks) < args.limit:
2684
- urlscanRequestLinks.add((url, URLSCAN_DOM_URL + urlscanID))
2658
+ with links_lock:
2659
+ urlscanRequestLinks.add((url, URLSCAN_DOM_URL + urlscanID))
2685
2660
 
2686
2661
  except Exception as e:
2687
2662
  writerr(colored("ERROR processURLScanUrl 1: " + str(e), "red"))
@@ -2726,12 +2701,7 @@ def getURLScanDOM(originalUrl, domUrl):
2726
2701
 
2727
2702
  # Add the URL as a comment at the start of the response
2728
2703
  if args.url_filename:
2729
- archiveHtml = (
2730
- "/* Original URL: "
2731
- + originalUrl
2732
- + " */\n"
2733
- + archiveHtml
2734
- )
2704
+ archiveHtml = "/* Original URL: " + originalUrl + " */\n" + archiveHtml
2735
2705
 
2736
2706
  # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
2737
2707
  if args.url_filename:
@@ -2760,9 +2730,7 @@ def getURLScanDOM(originalUrl, domUrl):
2760
2730
  if (
2761
2731
  archiveHtml.lower().strip().endswith("</html>")
2762
2732
  or archiveHtml.lower().strip().endswith("</body>")
2763
- or archiveHtml.lower()
2764
- .strip()
2765
- .startswith("<!doctype html")
2733
+ or archiveHtml.lower().strip().startswith("<!doctype html")
2766
2734
  or archiveHtml.lower().strip().startswith("<html")
2767
2735
  or archiveHtml.lower().strip().startswith("<head")
2768
2736
  ):
@@ -2794,12 +2762,10 @@ def getURLScanDOM(originalUrl, domUrl):
2794
2762
  except Exception as e:
2795
2763
  writerr(
2796
2764
  colored(
2797
- getSPACER(
2798
- "[ ERR ] Failed to write file "
2799
- + filePath
2800
- + ": "
2801
- + str(e)
2802
- ),
2765
+ "URLScan - [ ERR ] Failed to write file "
2766
+ + filePath
2767
+ + ": "
2768
+ + str(e),
2803
2769
  "red",
2804
2770
  )
2805
2771
  )
@@ -2822,12 +2788,10 @@ def getURLScanDOM(originalUrl, domUrl):
2822
2788
  except Exception as e:
2823
2789
  writerr(
2824
2790
  colored(
2825
- getSPACER(
2826
- '[ ERR ] Failed to write to waymore_index.txt for "'
2827
- + domUrl
2828
- + '": '
2829
- + str(e)
2830
- ),
2791
+ 'URLScan - [ ERR ] Failed to write to waymore_index.txt for "'
2792
+ + domUrl
2793
+ + '": '
2794
+ + str(e),
2831
2795
  "red",
2832
2796
  )
2833
2797
  )
@@ -2843,25 +2807,21 @@ def getURLScanDOM(originalUrl, domUrl):
2843
2807
  try:
2844
2808
  writerr(
2845
2809
  colored(
2846
- getSPACER(
2847
- "[ "
2848
- + str(resp.status_code)
2849
- + ' ] Failed to get response for "'
2850
- + domUrl
2851
- + '"'
2852
- ),
2810
+ "URLScan - [ "
2811
+ + str(resp.status_code)
2812
+ + ' ] Failed to get response for "'
2813
+ + domUrl
2814
+ + '"',
2853
2815
  "red",
2854
2816
  )
2855
2817
  )
2856
2818
  except Exception:
2857
2819
  writerr(
2858
2820
  colored(
2859
- getSPACER(
2860
- '[ ERR ] Failed to get response for "'
2861
- + domUrl
2862
- + '": '
2863
- + str(e)
2864
- ),
2821
+ 'URLScan - [ ERR ] Failed to get response for "'
2822
+ + domUrl
2823
+ + '": '
2824
+ + str(e),
2865
2825
  "red",
2866
2826
  )
2867
2827
  )
@@ -2888,9 +2848,7 @@ def getURLScanDOM(originalUrl, domUrl):
2888
2848
  )
2889
2849
  except Exception:
2890
2850
  if verbose():
2891
- suffix = (
2892
- 'Complete (To show mem use, run "pip install psutil")'
2893
- )
2851
+ suffix = 'Complete (To show mem use, run "pip install psutil")'
2894
2852
  printProgressBar(
2895
2853
  successCount + failureCount,
2896
2854
  totalResponses,
@@ -2903,23 +2861,15 @@ def getURLScanDOM(originalUrl, domUrl):
2903
2861
  # Write the total count to the continueResp.URLScan.tmp file
2904
2862
  try:
2905
2863
  continueRespFileURLScan.seek(0)
2906
- continueRespFileURLScan.write(
2907
- str(successCount + failureCount) + "\n"
2908
- )
2864
+ continueRespFileURLScan.write(str(successCount + failureCount) + "\n")
2909
2865
  except Exception as e:
2910
2866
  if verbose():
2911
- writerr(
2912
- colored(
2913
- getSPACER("ERROR getURLScanDOM 2: " + str(e)), "red"
2914
- )
2915
- )
2867
+ writerr(colored(getSPACER("ERROR getURLScanDOM 2: " + str(e)), "red"))
2916
2868
 
2917
2869
  except Exception as e:
2918
2870
  if verbose():
2919
2871
  writerr(
2920
- colored(
2921
- getSPACER('Error for "' + domUrl + '": ' + str(e)), "red"
2922
- )
2872
+ colored('URLScan - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red")
2923
2873
  )
2924
2874
 
2925
2875
  except Exception as e:
@@ -2945,14 +2895,15 @@ def getURLScanUrls():
2945
2895
  """
2946
2896
  Get URLs from the URLScan API, urlscan.io
2947
2897
  """
2948
- global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkURLScan, argsInputHostname
2898
+ global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
2949
2899
 
2950
2900
  # Write the file of URL's for the passed domain/URL
2951
2901
  try:
2952
2902
  requestsMade = 0
2953
- stopSource = False
2954
- linkMimes = set()
2955
- originalLinkCount = len(linksFound)
2903
+ stopSourceURLScan = False
2904
+ linksFoundURLScan = set()
2905
+ totalUrls = 0
2906
+ checkResponse = True
2956
2907
 
2957
2908
  # Set the URL to just the hostname
2958
2909
  url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
@@ -2975,21 +2926,23 @@ def getURLScanUrls():
2975
2926
  if args.mode == "R":
2976
2927
  write(
2977
2928
  colored(
2978
- "The URLScan URL requested to get links for responses: ",
2929
+ "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
2979
2930
  "magenta",
2980
2931
  )
2981
2932
  + colored(url + "\n", "white")
2982
2933
  )
2983
2934
  else:
2984
2935
  write(
2985
- colored("The URLScan URL requested to get links: ", "magenta")
2936
+ colored(
2937
+ "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
2938
+ )
2986
2939
  + colored(url + "\n", "white")
2987
2940
  )
2988
2941
 
2989
- if not args.check_only:
2942
+ if args.mode in ("U", "B") and not args.check_only:
2990
2943
  write(
2991
2944
  colored(
2992
- "\rGetting links from urlscan.io API (this can take a while for some domains)...\r",
2945
+ "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
2993
2946
  "cyan",
2994
2947
  )
2995
2948
  )
@@ -3005,14 +2958,12 @@ def getURLScanUrls():
3005
2958
  session.mount("https://", HTTP_ADAPTER)
3006
2959
  session.mount("http://", HTTP_ADAPTER)
3007
2960
  # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
3008
- resp = session.get(
3009
- url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY}
3010
- )
2961
+ resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
3011
2962
  requestsMade = requestsMade + 1
3012
2963
  except Exception as e:
3013
2964
  write(
3014
2965
  colored(
3015
- getSPACER("[ ERR ] Unable to get links from urlscan.io: " + str(e)),
2966
+ "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
3016
2967
  "red",
3017
2968
  )
3018
2969
  )
@@ -3027,15 +2978,17 @@ def getURLScanUrls():
3027
2978
  if seconds <= args.urlscan_rate_limit_retry * 60:
3028
2979
  writerr(
3029
2980
  colored(
3030
- getSPACER(
3031
- "[ 429 ] URLScan rate limit reached, so waiting for another "
3032
- + str(seconds)
3033
- + " seconds before continuing..."
3034
- ),
2981
+ "URLScan - [ 429 ] Rate limit reached, so waiting for another "
2982
+ + str(seconds)
2983
+ + " seconds before continuing...",
3035
2984
  "yellow",
3036
2985
  )
3037
2986
  )
3038
- time.sleep(seconds + 1)
2987
+ # Wait can be interrupted by SIGINT via interrupt_event
2988
+ interrupt_event.clear()
2989
+ if interrupt_event.wait(seconds + 1):
2990
+ # Interrupted by SIGINT
2991
+ return
3039
2992
  try:
3040
2993
  resp = session.get(
3041
2994
  url,
@@ -3048,10 +3001,7 @@ def getURLScanUrls():
3048
3001
  except Exception as e:
3049
3002
  write(
3050
3003
  colored(
3051
- getSPACER(
3052
- "[ ERR ] Unable to get links from urlscan.io: "
3053
- + str(e)
3054
- ),
3004
+ "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
3055
3005
  "red",
3056
3006
  )
3057
3007
  )
@@ -3064,18 +3014,14 @@ def getURLScanUrls():
3064
3014
  if resp.status_code == 429:
3065
3015
  writerr(
3066
3016
  colored(
3067
- getSPACER(
3068
- "[ 429 ] URLScan rate limit reached so trying without API Key..."
3069
- ),
3017
+ "URLScan - [ 429 ] Rate limit reached so trying without API Key...",
3070
3018
  "red",
3071
3019
  )
3072
3020
  )
3073
3021
  else:
3074
3022
  writerr(
3075
3023
  colored(
3076
- getSPACER(
3077
- "The URLScan API Key is invalid so trying without API Key..."
3078
- ),
3024
+ "URLScan - [ INF ] The API Key is invalid so trying without API Key...",
3079
3025
  "red",
3080
3026
  )
3081
3027
  )
@@ -3085,64 +3031,54 @@ def getURLScanUrls():
3085
3031
  except Exception as e:
3086
3032
  writerr(
3087
3033
  colored(
3088
- getSPACER(
3089
- "[ ERR ] Unable to get links from urlscan.io: " + str(e)
3090
- ),
3034
+ "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
3091
3035
  "red",
3092
3036
  )
3093
3037
  )
3094
- return
3038
+ checkResponse = False
3095
3039
 
3096
3040
  # If the rate limit was reached end now
3097
3041
  if resp.status_code == 429:
3098
3042
  writerr(
3099
3043
  colored(
3100
- getSPACER(
3101
- "[ 429 ] URLScan rate limit reached without API Key so unable to get links."
3102
- ),
3044
+ "URLScan - [ 429 ] Rate limit reached without API Key so unable to get links.",
3103
3045
  "red",
3104
3046
  )
3105
3047
  )
3106
- return
3048
+ checkResponse = False
3107
3049
  else:
3108
3050
  writerr(
3109
3051
  colored(
3110
- getSPACER(
3111
- "[ 429 ] URLScan rate limit reached so unable to get links."
3112
- ),
3052
+ "URLScan - [ 429 ] Rate limit reached so unable to get links.",
3113
3053
  "red",
3114
3054
  )
3115
3055
  )
3116
- return
3056
+ checkResponse = False
3117
3057
  elif resp.status_code != 200:
3118
3058
  writerr(
3119
3059
  colored(
3120
- getSPACER(
3121
- "[ "
3122
- + str(resp.status_code)
3123
- + " ] Unable to get links from urlscan.io"
3124
- ),
3060
+ "URLScan - [ "
3061
+ + str(resp.status_code)
3062
+ + " ] Unable to get links from urlscan.io",
3125
3063
  "red",
3126
3064
  )
3127
3065
  )
3128
- return
3066
+ checkResponse = False
3129
3067
 
3130
3068
  try:
3131
- # Get the JSON response
3132
- jsonResp = json.loads(resp.text.strip())
3069
+ if checkResponse:
3070
+ # Get the JSON response
3071
+ jsonResp = json.loads(resp.text.strip())
3133
3072
 
3134
- # Get the number of results
3135
- totalUrls = int(jsonResp["total"])
3073
+ # Get the number of results
3074
+ totalUrls = int(jsonResp["total"])
3136
3075
  except Exception:
3137
3076
  writerr(
3138
3077
  colored(
3139
- getSPACER(
3140
- "[ ERR ] There was an unexpected response from the URLScan API"
3141
- ),
3078
+ "URLScan - [ ERR ] There was an unexpected response from the API",
3142
3079
  "red",
3143
3080
  )
3144
3081
  )
3145
- totalUrls = 0
3146
3082
 
3147
3083
  # Carry on if something was found
3148
3084
  if args.check_only and args.mode != "R":
@@ -3150,12 +3086,12 @@ def getURLScanUrls():
3150
3086
  hasMore = jsonResp["has_more"]
3151
3087
  if hasMore:
3152
3088
  write(
3153
- colored("Get URLs from URLScan: ", "cyan")
3089
+ colored("URLScan - [ INFO ] Get URLs from URLScan: ", "cyan")
3154
3090
  + colored("UNKNOWN requests", "white")
3155
3091
  )
3156
3092
  else:
3157
3093
  write(
3158
- colored("Get URLs from URLScan: ", "cyan")
3094
+ colored("URLScan - [ INFO ] Get URLs from URLScan: ", "cyan")
3159
3095
  + colored("1 request", "white")
3160
3096
  )
3161
3097
  except Exception:
@@ -3166,7 +3102,7 @@ def getURLScanUrls():
3166
3102
  # Carry on if something was found
3167
3103
  if int(totalUrls) > 0:
3168
3104
 
3169
- while not stopSource:
3105
+ while not stopSourceURLScan:
3170
3106
 
3171
3107
  searchAfter = ""
3172
3108
 
@@ -3203,9 +3139,7 @@ def getURLScanUrls():
3203
3139
  sort = urlSection["sort"]
3204
3140
  except Exception:
3205
3141
  sort = ""
3206
- searchAfter = (
3207
- "&search_after=" + str(sort[0]) + "," + str(sort[1])
3208
- )
3142
+ searchAfter = "&search_after=" + str(sort[0]) + "," + str(sort[1])
3209
3143
 
3210
3144
  # Get the HTTP code
3211
3145
  try:
@@ -3243,7 +3177,7 @@ def getURLScanUrls():
3243
3177
  if searchAfter != "":
3244
3178
 
3245
3179
  keepTrying = True
3246
- while not stopSource and keepTrying:
3180
+ while not stopSourceURLScan and keepTrying:
3247
3181
  keepTrying = False
3248
3182
  # Get the next page from urlscan.io
3249
3183
  try:
@@ -3263,10 +3197,8 @@ def getURLScanUrls():
3263
3197
  except Exception as e:
3264
3198
  writerr(
3265
3199
  colored(
3266
- getSPACER(
3267
- "[ ERR ] Unable to get links from urlscan.io: "
3268
- + str(e)
3269
- ),
3200
+ "URLScan - [ ERR ] Unable to get links from urlscan.io: "
3201
+ + str(e),
3270
3202
  "red",
3271
3203
  )
3272
3204
  )
@@ -3285,56 +3217,53 @@ def getURLScanUrls():
3285
3217
  if seconds <= args.urlscan_rate_limit_retry * 60:
3286
3218
  writerr(
3287
3219
  colored(
3288
- getSPACER(
3289
- "[ 429 ] URLScan rate limit reached, so waiting for another "
3290
- + str(seconds)
3291
- + " seconds before continuing..."
3292
- ),
3220
+ "URLScan - [ 429 ] Rate limit reached, so waiting for another "
3221
+ + str(seconds)
3222
+ + " seconds before continuing...",
3293
3223
  "yellow",
3294
3224
  )
3295
3225
  )
3296
- time.sleep(seconds + 1)
3226
+ # Wait can be interrupted by SIGINT via interrupt_event
3227
+ interrupt_event.clear()
3228
+ if interrupt_event.wait(seconds + 1):
3229
+ # Interrupted by SIGINT
3230
+ keepTrying = False
3231
+ break
3297
3232
  keepTrying = True
3298
3233
  continue
3299
3234
  else:
3300
3235
  writerr(
3301
3236
  colored(
3302
- getSPACER(
3303
- "[ 429 ] URLScan rate limit reached (waiting time of "
3304
- + str(seconds)
3305
- + "), so stopping. Links that have already been retrieved will be saved."
3306
- ),
3237
+ "URLScan - [ 429 ] Rate limit reached (waiting time of "
3238
+ + str(seconds)
3239
+ + "), so stopping. Links that have already been retrieved will be saved.",
3307
3240
  "red",
3308
3241
  )
3309
3242
  )
3310
- stopSource = True
3243
+ stopSourceURLScan = True
3311
3244
  pass
3312
3245
  else:
3313
3246
  writerr(
3314
3247
  colored(
3315
- getSPACER(
3316
- "[ 429 ] URLScan rate limit reached, so stopping. Links that have already been retrieved will be saved."
3317
- ),
3248
+ "URLScan - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
3318
3249
  "red",
3319
3250
  )
3320
3251
  )
3321
- stopSource = True
3252
+ stopSourceURLScan = True
3322
3253
  pass
3323
3254
  elif resp.status_code != 200:
3324
3255
  writerr(
3325
3256
  colored(
3326
- getSPACER(
3327
- "[ "
3328
- + str(resp.status_code)
3329
- + " ] Unable to get links from urlscan.io"
3330
- ),
3257
+ "URLScan - [ "
3258
+ + str(resp.status_code)
3259
+ + " ] Unable to get links from urlscan.io",
3331
3260
  "red",
3332
3261
  )
3333
3262
  )
3334
- stopSource = True
3263
+ stopSourceURLScan = True
3335
3264
  pass
3336
3265
 
3337
- if not stopSource:
3266
+ if not stopSourceURLScan:
3338
3267
  # Get the JSON response
3339
3268
  jsonResp = json.loads(resp.text.strip())
3340
3269
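
Note on the interruptible wait above: instead of time.sleep(), the retry now blocks on the module-level interrupt_event (a threading.Event that the SIGINT handler sets), so Ctrl+C no longer has to wait out the whole rate-limit period. A minimal standalone sketch of the pattern, not waymore's actual handler:

import signal
import threading

interrupt_event = threading.Event()

def on_sigint(signum, frame):
    # Wake any thread currently blocked in interrupt_event.wait()
    interrupt_event.set()

signal.signal(signal.SIGINT, on_sigint)

def wait_or_interrupt(seconds):
    interrupt_event.clear()
    # Event.wait() returns True if set() was called before the timeout
    # (i.e. the wait was interrupted), False if the timeout simply elapsed.
    return interrupt_event.wait(seconds)
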
 
@@ -3342,47 +3271,32 @@ def getURLScanUrls():
3342
3271
  if (
3343
3272
  jsonResp["results"] is None
3344
3273
  or len(jsonResp["results"]) == 0
3345
- or (
3346
- args.limit_requests != 0
3347
- and requestsMade > args.limit_requests
3348
- )
3274
+ or (args.limit_requests != 0 and requestsMade > args.limit_requests)
3349
3275
  or (
3350
3276
  args.mode == "R"
3351
3277
  and args.limit != 0
3352
3278
  and requestsMade > args.limit
3353
3279
  )
3354
3280
  ):
3355
- stopSource = True
3281
+ stopSourceURLScan = True
3356
3282
 
3357
3283
  # Show the MIME types found (in case user wants to exclude more)
3358
3284
  if verbose() and len(linkMimes) > 0 and args.mode != "R":
3359
3285
  linkMimes.discard("warc/revisit")
3360
3286
  write(
3361
- getSPACER(
3362
- colored("MIME types found: ", "magenta")
3363
- + colored(str(linkMimes), "white")
3364
- )
3287
+ colored("URLScan - [ INFO ] MIME types found: ", "magenta")
3288
+ + colored(str(linkMimes), "white")
3365
3289
  + "\n"
3366
3290
  )
3367
3291
 
3368
- linkCount = len(linksFound) - originalLinkCount
3369
3292
  if args.mode != "R":
3370
- if args.xwm and args.xcc and args.xav:
3371
- write(
3372
- getSPACER(
3373
- colored("Links found on urlscan.io: ", "cyan")
3374
- + colored(str(linkCount), "white")
3375
- )
3376
- + "\n"
3377
- )
3378
- else:
3379
- write(
3380
- getSPACER(
3381
- colored("Extra links found on urlscan.io: ", "cyan")
3382
- + colored(str(linkCount), "white")
3383
- )
3384
- + "\n"
3385
- )
3293
+ linkCountURLScan = len(linksFoundURLScan)
3294
+ write(
3295
+ colored("URLScan - [ INFO ] Links found on urlscan.io: ", "cyan")
3296
+ + colored(str(linkCountURLScan), "white")
3297
+ )
3298
+ linksFound.update(linksFoundURLScan)
3299
+ linksFoundURLScan.clear()
3386
3300
 
3387
3301
  except Exception as e:
3388
3302
  writerr(colored("ERROR getURLScanUrls 1: " + str(e), "red"))
@@ -3392,12 +3306,11 @@ def processWayBackPage(url):
3392
3306
  """
3393
3307
  Get URLs from a specific page of archive.org CDX API for the input domain
3394
3308
  """
3395
- global totalPages, linkMimes, linksFound, stopSource
3309
+ global totalPages, linkMimes, linksFound, stopSourceWayback, linkCountWayback, linksFoundWayback, current_response, current_session
3396
3310
  try:
3397
3311
  # Get memory in case it exceeds threshold
3398
3312
  getMemory()
3399
-
3400
- if not stopSource:
3313
+ if not stopSourceWayback:
3401
3314
  try:
3402
3315
  # Choose a random user agent string to use for any requests
3403
3316
  resp = None
@@ -3406,229 +3319,231 @@ def processWayBackPage(url):
3406
3319
  session = requests.Session()
3407
3320
  session.mount("https://", HTTP_ADAPTER)
3408
3321
  session.mount("http://", HTTP_ADAPTER)
3409
- resp = session.get(url, headers={"User-Agent": userAgent})
3410
- except ConnectionError:
3411
- writerr(
3412
- colored(
3413
- getSPACER(
3414
- "[ ERR ] Wayback Machine (archive.org) connection error for page "
3415
- + page
3416
- ),
3417
- "red",
3418
- )
3419
- )
3420
- resp = None
3421
- return
3422
- except Exception as e:
3423
- writerr(
3424
- colored(
3425
- getSPACER(
3426
- "[ ERR ] Error getting response for page "
3427
- + page
3428
- + " - "
3429
- + str(e)
3430
- ),
3431
- "red",
3432
- )
3322
+ # expose session so SIGINT handler can close it to interrupt blocking network I/O
3323
+ try:
3324
+ current_session = session
3325
+ except Exception:
3326
+ pass
3327
+
3328
+ resp = session.get(
3329
+ url, headers={"User-Agent": userAgent}, stream=True, timeout=args.timeout
3433
3330
  )
3434
- resp = None
3435
- return
3436
- finally:
3331
+ # expose live response so SIGINT handler can close it to interrupt blocking I/O
3437
3332
  try:
3438
- if resp is not None:
3439
- # If a status other of 429, then stop processing Wayback Machine
3440
- if resp.status_code == 429:
3441
- if args.wayback_rate_limit_retry > 0:
3442
- seconds = args.wayback_rate_limit_retry * 60
3443
- if args.processes == 1:
3444
- writerr(
3445
- colored(
3446
- "\r[ 429 ] Wayback Machine (archive.org) rate limit reached on page "
3447
- + str(page)
3448
- + " of "
3449
- + str(totalPages)
3450
- + ", so waiting for "
3451
- + str(seconds)
3452
- + " seconds before continuing...\r",
3453
- "yellow",
3454
- )
3455
- )
3456
- else:
3457
- writerr(
3458
- colored(
3459
- "\r[ 429 ] Wayback Machine (archive.org) rate limit reached, so waiting for "
3460
- + str(seconds)
3461
- + " seconds before continuing...\r",
3462
- "yellow",
3463
- )
3464
- )
3465
- time.sleep(seconds)
3466
- try:
3467
- resp = session.get(
3468
- url, headers={"User-Agent": userAgent}
3469
- )
3470
- except ConnectionError:
3471
- writerr(
3472
- colored(
3473
- getSPACER(
3474
- "[ ERR ] Wayback Machine (archive.org) connection error for page "
3475
- + page
3476
- ),
3477
- "red",
3478
- )
3333
+ current_response = resp
3334
+ except Exception:
3335
+ pass
3336
+ # Check the response status
3337
+ if resp is not None:
3338
+ # If a status other of 429, then stop processing Wayback Machine
3339
+ if resp.status_code == 429:
3340
+ if args.wayback_rate_limit_retry > 0:
3341
+ seconds = args.wayback_rate_limit_retry * 60
3342
+ if args.processes == 1:
3343
+ writerr(
3344
+ colored(
3345
+ "Wayback - [ 429 ] Rate limit reached on page "
3346
+ + str(page)
3347
+ + " of "
3348
+ + str(totalPages)
3349
+ + ", so waiting for "
3350
+ + str(seconds)
3351
+ + " seconds before continuing...",
3352
+ "yellow",
3479
3353
  )
3480
- resp = None
3481
- return
3482
- except Exception as e:
3483
- writerr(
3484
- colored(
3485
- getSPACER(
3486
- "[ ERR ] Error getting response for page "
3487
- + page
3488
- + " - "
3489
- + str(e)
3490
- ),
3491
- "red",
3492
- )
3354
+ )
3355
+ else:
3356
+ writerr(
3357
+ colored(
3358
+ "Wayback - [ 429 ] Rate limit reached, so waiting for "
3359
+ + str(seconds)
3360
+ + " seconds before continuing...",
3361
+ "yellow",
3493
3362
  )
3494
- resp = None
3495
- return
3496
-
3497
- if resp.status_code == 429:
3498
- writerr(
3499
- colored(
3500
- getSPACER(
3501
- "[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
3502
- ),
3503
- "red",
3504
3363
  )
3505
- )
3506
- stopSource = True
3507
- return
3508
- # If a status other of 503, then the site is unavailable
3509
- if resp.status_code == 503:
3510
- writerr(
3511
- colored(
3512
- getSPACER(
3513
- "[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
3514
- ),
3515
- "red",
3364
+ # Wait can be interrupted by SIGINT via interrupt_event
3365
+ interrupt_event.clear()
3366
+ if interrupt_event.wait(seconds):
3367
+ return
3368
+ try:
3369
+ resp = session.get(
3370
+ url,
3371
+ headers={"User-Agent": userAgent},
3372
+ stream=True,
3373
+ timeout=args.timeout,
3516
3374
  )
3517
- )
3518
- stopSource = True
3519
- return
3520
- # If the response from archive.org is empty then skip
3521
- if resp.text == "" and totalPages == 0:
3522
- if verbose():
3375
+ try:
3376
+ current_response = resp
3377
+ except Exception:
3378
+ pass
3379
+ except ConnectionError:
3523
3380
  writerr(
3524
3381
  colored(
3525
- getSPACER(
3526
- "[ ERR ] "
3527
- + url
3528
- + " gave an empty response."
3529
- ),
3382
+ "Wayback - [ ERR ] Connection error for page " + page,
3530
3383
  "red",
3531
3384
  )
3532
3385
  )
3533
- return
3534
- # If a status other than 200, then stop
3535
- if resp.status_code != 200:
3536
- if verbose():
3386
+ resp = None
3387
+ return
3388
+ except Exception as e:
3537
3389
  writerr(
3538
3390
  colored(
3539
- getSPACER(
3540
- "[ "
3541
- + str(resp.status_code)
3542
- + " ] Error for "
3543
- + url
3544
- ),
3391
+ "Wayback - [ ERR ] Error getting response for page "
3392
+ + page
3393
+ + " - "
3394
+ + str(e),
3545
3395
  "red",
3546
3396
  )
3547
3397
  )
3548
- return
3549
- except ConnectionError:
3550
- writerr(
3551
- colored(
3552
- getSPACER(
3553
- "[ ERR ] Wayback Machine (archive.org) connection error for page "
3554
- + page
3555
- ),
3556
- "red",
3398
+ resp = None
3399
+ return
3400
+
3401
+ if resp.status_code == 429:
3402
+ writerr(
3403
+ colored(
3404
+ "Wayback - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
3405
+ "red",
3406
+ )
3557
3407
  )
3558
- )
3559
- resp = None
3560
- return
3561
- except Exception as e:
3562
- writerr(
3563
- colored(
3564
- getSPACER(
3565
- "[ ERR ] Error getting response for page "
3566
- + page
3567
- + " - "
3568
- + str(e)
3569
- ),
3570
- "red",
3408
+ stopSourceWayback = True
3409
+ return
3410
+ # If a status code of 503, then the site is unavailable
3411
+ if resp.status_code == 503:
3412
+ writerr(
3413
+ colored(
3414
+ "Wayback - [ 503 ] The Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.",
3415
+ "red",
3416
+ )
3571
3417
  )
3572
- )
3573
- resp = None
3574
- return
3418
+ stopSourceWayback = True
3419
+ return
3420
+ # If a status other than 200, then stop
3421
+ if resp.status_code != 200:
3422
+ if verbose():
3423
+ writerr(
3424
+ colored(
3425
+ "Wayback - [ " + str(resp.status_code) + " ] Error for " + url,
3426
+ "red",
3427
+ )
3428
+ )
3429
+ try:
3430
+ current_response = None
3431
+ except Exception:
3432
+ pass
3433
+ try:
3434
+ current_session = None
3435
+ except Exception:
3436
+ pass
3437
+ return
3575
3438
 
3576
- # Get the URLs and MIME types. Each line is a separate JSON string
3577
- try:
3439
+ # Get the URLs and MIME types. Each line is a separate JSON string
3440
+ # Process lines as they arrive - if connection drops, we keep what we've already processed
3578
3441
  for line in resp.iter_lines():
3579
- results = line.decode("utf-8")
3580
- foundUrl = fixArchiveOrgUrl(str(results).split(" ")[1])
3442
+ try:
3443
+ results = line.decode("utf-8")
3444
+ foundUrl = fixArchiveOrgUrl(str(results).split(" ")[1])
3581
3445
 
3582
- # If --filter-responses-only wasn't used, then check the URL exclusions
3583
- if args.filter_responses_only:
3584
- match = None
3585
- else:
3586
- match = re.search(
3587
- r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
3588
- foundUrl,
3589
- flags=re.IGNORECASE,
3590
- )
3591
- if match is None:
3592
- # Only get MIME Types if --verbose option was selected
3593
- if verbose():
3446
+ # If --filter-responses-only wasn't used, then check the URL exclusions
3447
+ if args.filter_responses_only:
3448
+ match = None
3449
+ else:
3450
+ match = re.search(
3451
+ r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
3452
+ foundUrl,
3453
+ flags=re.IGNORECASE,
3454
+ )
3455
+ if match is None:
3456
+ # Only get MIME Types if --verbose option was selected
3457
+ if verbose():
3458
+ try:
3459
+ mimeType = str(results).split(" ")[2]
3460
+ if mimeType != "":
3461
+ linkMimes.add(mimeType)
3462
+ except Exception:
3463
+ if verbose():
3464
+ writerr(
3465
+ colored(
3466
+ getSPACER(
3467
+ "ERROR processWayBackPage 2: Cannot get MIME type from line: "
3468
+ + str(line)
3469
+ ),
3470
+ "red",
3471
+ )
3472
+ )
3594
3473
  try:
3595
- mimeType = str(results).split(" ")[2]
3596
- if mimeType != "":
3597
- linkMimes.add(mimeType)
3474
+ linksFoundAdd(foundUrl, linksFoundWayback)
3475
+
3598
3476
  except Exception:
3599
3477
  if verbose():
3600
3478
  writerr(
3601
3479
  colored(
3602
3480
  getSPACER(
3603
- "ERROR processWayBackPage 2: Cannot get MIME type from line: "
3481
+ "ERROR processWayBackPage 3: Cannot get link from line: "
3604
3482
  + str(line)
3605
3483
  ),
3606
3484
  "red",
3607
3485
  )
3608
3486
  )
3609
- write(resp.text)
3610
- try:
3611
- linksFoundAdd(foundUrl)
3612
- except Exception:
3613
- if verbose():
3614
- writerr(
3615
- colored(
3616
- getSPACER(
3617
- "ERROR processWayBackPage 3: Cannot get link from line: "
3618
- + str(line)
3619
- ),
3620
- "red",
3621
- )
3487
+ except Exception:
3488
+ if verbose():
3489
+ writerr(
3490
+ colored(
3491
+ getSPACER("ERROR processWayBackPage 4: " + str(line)), "red"
3622
3492
  )
3623
- write(resp.text)
3624
- except Exception:
3625
- if verbose():
3493
+ )
3494
+
3495
+ except ConnectionError:
3496
+ writerr(
3497
+ colored(
3498
+ "Wayback - [ ERR ] Connection error for page "
3499
+ + page
3500
+ + (
3501
+ f" (saved {len(linksFoundWayback)} URLs before error)"
3502
+ if len(linksFoundWayback) > 0
3503
+ else ""
3504
+ ),
3505
+ "red",
3506
+ )
3507
+ )
3508
+ try:
3509
+ current_response = None
3510
+ except Exception:
3511
+ pass
3512
+ try:
3513
+ current_session = None
3514
+ except Exception:
3515
+ pass
3516
+ return
3517
+ except Exception as e:
3518
+ # Even if connection drops, we've already saved the URLs processed so far
3519
+ if len(linksFoundWayback) > 0:
3520
+ writerr(
3521
+ colored(
3522
+ f"Wayback - [ WARN ] Error getting response for page {page} - {str(e)} (saved {len(linksFoundWayback)} URLs before error)",
3523
+ "yellow",
3524
+ )
3525
+ )
3526
+ else:
3626
3527
  writerr(
3627
3528
  colored(
3628
- getSPACER("ERROR processWayBackPage 4: " + str(line)), "red"
3529
+ "Wayback - [ ERR ] Error getting response for page "
3530
+ + page
3531
+ + " - "
3532
+ + str(e),
3533
+ "red",
3629
3534
  )
3630
3535
  )
3536
+ try:
3537
+ current_response = None
3538
+ except Exception:
3539
+ pass
3540
+ try:
3541
+ current_session = None
3542
+ except Exception:
3543
+ pass
3544
+ return
3631
3545
  else:
3546
3632
3547
  pass
3633
3548
  except Exception as e:
3634
3549
  if verbose():
@@ -3639,40 +3554,47 @@ def getWaybackUrls():
3639
3554
  """
3640
3555
  Get URLs from the Wayback Machine, archive.org
3641
3556
  """
3642
- global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages, stopSource, argsInput, checkWayback
3557
+ global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages, stopSourceWayback, argsInput, checkWayback, linkCountWayback, linksFoundWayback
3643
3558
 
3644
3559
  # Write the file of URL's for the passed domain/URL
3645
3560
  try:
3646
- stopSource = False
3561
+ stopSourceWayback = False
3562
+ linksFoundWayback = set()
3647
3563
 
3648
3564
  if MATCH_MIME != "":
3649
3565
  filterMIME = "&filter=mimetype:" + re.escape(MATCH_MIME).replace(",", "|")
3650
3566
  else:
3651
- filterMIME = "&filter=!mimetype:warc/revisit|" + re.escape(
3652
- FILTER_MIME
3653
- ).replace(",", "|")
3567
+ filterMIME = "&filter=!mimetype:warc/revisit|" + re.escape(FILTER_MIME).replace(
3568
+ ",", "|"
3569
+ )
3654
3570
  # If there are any \+ in the MIME types, e.g. image/svg\+xml (the backslash is because it was previously escaped), then replace the \+ with a . otherwise the wayback API does not recognise it
3655
3571
  filterMIME = filterMIME.replace("\+", ".")
3656
3572
 
3657
3573
  if MATCH_CODE != "":
3658
3574
  filterCode = "&filter=statuscode:" + re.escape(MATCH_CODE).replace(",", "|")
3659
3575
  else:
3660
- filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(
3661
- ",", "|"
3662
- )
3576
+ filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(",", "|")
3663
3577
 
3664
3578
  # Set keywords filter if -ko argument passed
3665
3579
  filterKeywords = ""
3666
3580
  if args.keywords_only:
3667
3581
  if args.keywords_only == "#CONFIG":
3668
3582
  filterKeywords = (
3669
- "&filter=original:.*("
3670
- + re.escape(FILTER_KEYWORDS).replace(",", "|")
3671
- + ").*"
3583
+ "&filter=original:.*(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ").*"
3672
3584
  )
3673
3585
  else:
3674
3586
  filterKeywords = "&filter=original:.*(" + args.keywords_only + ").*"
3675
3587
 
3588
+ # Add the date filters if they were passed
3589
+ if args.from_date is None:
3590
+ filterFrom = ""
3591
+ else:
3592
+ filterFrom = "&from=" + str(args.from_date)
3593
+ if args.to_date is None:
3594
+ filterTo = ""
3595
+ else:
3596
+ filterTo = "&to=" + str(args.to_date)
3597
+
3676
3598
  if args.filter_responses_only:
3677
3599
  url = (
3678
3600
  WAYBACK_URL.replace("{DOMAIN}", subs + quote(argsInput) + path).replace(
@@ -3688,6 +3610,8 @@ def getWaybackUrls():
3688
3610
  + filterMIME
3689
3611
  + filterCode
3690
3612
  + filterKeywords
3613
+ + filterFrom
3614
+ + filterTo
3691
3615
  + "&page="
3692
3616
  )
3693
3617
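
For reference, the -from/-to arguments handled earlier in this function are passed straight through to the CDX API as its from= and to= query parameters, appended to the query above before "&page=". A minimal sketch of that filter string (the example values are hypothetical):

def build_date_filters(from_date=None, to_date=None):
    # Mirrors the filterFrom / filterTo logic added in getWaybackUrls
    filterFrom = "" if from_date is None else "&from=" + str(from_date)
    filterTo = "" if to_date is None else "&to=" + str(to_date)
    return filterFrom + filterTo

# e.g. restrict results to 2023 through June 2024
print(build_date_filters("2023", "202406"))  # -> "&from=2023&to=202406"
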
 
@@ -3697,7 +3621,7 @@ def getWaybackUrls():
3697
3621
  if not args.check_only:
3698
3622
  write(
3699
3623
  colored(
3700
- "\rGetting the number of Wayback Machine (archive.org) pages to search...\r",
3624
+ "Wayback - [ INFO ] Getting the number of pages to search...",
3701
3625
  "cyan",
3702
3626
  )
3703
3627
  )
@@ -3706,9 +3630,7 @@ def getWaybackUrls():
3706
3630
  session = requests.Session()
3707
3631
  session.mount("https://", HTTP_ADAPTER)
3708
3632
  session.mount("http://", HTTP_ADAPTER)
3709
- resp = session.get(
3710
- url + "&showNumPages=True", headers={"User-Agent": userAgent}
3711
- )
3633
+ resp = session.get(url + "&showNumPages=True", headers={"User-Agent": userAgent})
3712
3634
  # Try to get the total number of pages. If there is a problem, we'll return totalPages = 0 which means we'll get everything back in one request
3713
3635
  try:
3714
3636
  totalPages = int(resp.text.strip())
@@ -3724,9 +3646,7 @@ def getWaybackUrls():
3724
3646
  if resp.status_code == 429:
3725
3647
  writerr(
3726
3648
  colored(
3727
- getSPACER(
3728
- "[ 429 ] Wayback Machine (Archive.org) rate limit reached so unable to get links."
3729
- ),
3649
+ "Wayback - [ 429 ] Rate limit reached so unable to get links.",
3730
3650
  "red",
3731
3651
  )
3732
3652
  )
@@ -3736,9 +3656,7 @@ def getWaybackUrls():
3736
3656
  if resp.status_code == 503:
3737
3657
  writerr(
3738
3658
  colored(
3739
- getSPACER(
3740
- "[ 503 ] Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
3741
- ),
3659
+ "Wayback - [ 503 ] The Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.",
3742
3660
  "red",
3743
3661
  )
3744
3662
  )
@@ -3747,19 +3665,15 @@ def getWaybackUrls():
3747
3665
  if resp.text.lower().find("blocked site error") > 0:
3748
3666
  writerr(
3749
3667
  colored(
3750
- getSPACER(
3751
- "[ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)"
3752
- ),
3668
+ "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)",
3753
3669
  "red",
3754
3670
  )
3755
3671
  )
3756
3672
  else:
3757
3673
  writerr(
3758
3674
  colored(
3759
- getSPACER(
3760
- "[ ERR ] Unable to get links from Wayback Machine (archive.org): "
3761
- + str(resp.text.strip())
3762
- ),
3675
+ "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): "
3676
+ + str(resp.text.strip()),
3763
3677
  "red",
3764
3678
  )
3765
3679
  )
@@ -3767,28 +3681,22 @@ def getWaybackUrls():
3767
3681
  if str(e).lower().find("alert access denied"):
3768
3682
  writerr(
3769
3683
  colored(
3770
- getSPACER(
3771
- "[ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don't know, but it has happened!)"
3772
- ),
3684
+ "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don't know, but it has happened!)",
3773
3685
  "red",
3774
3686
  )
3775
3687
  )
3776
3688
  elif str(e).lower().find("connection refused"):
3777
3689
  writerr(
3778
3690
  colored(
3779
- getSPACER(
3780
- "[ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)"
3781
- ),
3691
+ "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)",
3782
3692
  "red",
3783
3693
  )
3784
3694
  )
3785
3695
  else:
3786
3696
  writerr(
3787
3697
  colored(
3788
- getSPACER(
3789
- "[ ERR ] Unable to get links from Wayback Machine (archive.org): "
3790
- + str(e)
3791
- ),
3698
+ "Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): "
3699
+ + str(e),
3792
3700
  "red",
3793
3701
  )
3794
3702
  )
@@ -3798,27 +3706,29 @@ def getWaybackUrls():
3798
3706
  if totalPages < 0:
3799
3707
  write(
3800
3708
  colored(
3801
- "Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.",
3709
+ "Wayback - [ INFO ] Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.",
3802
3710
  "cyan",
3803
3711
  )
3804
3712
  )
3805
3713
  else:
3806
3714
  checkWayback = totalPages
3807
3715
  write(
3808
- colored("Get URLs from Wayback Machine: ", "cyan")
3716
+ colored("Wayback - [ INFO ] Get URLs from Wayback Machine: ", "cyan")
3809
3717
  + colored(str(checkWayback) + " requests", "white")
3810
3718
  )
3811
3719
  else:
3812
3720
  if verbose():
3813
3721
  write(
3814
- colored("The archive URL requested to get links: ", "magenta")
3722
+ colored(
3723
+ "Wayback - [ INFO ] The archive URL requested to get links: ", "magenta"
3724
+ )
3815
3725
  + colored(url + "\n", "white")
3816
3726
  )
3817
3727
 
3818
3728
  if totalPages < 0:
3819
3729
  write(
3820
3730
  colored(
3821
- "\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r",
3731
+ "Wayback - [ INFO ] Getting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...",
3822
3732
  "cyan",
3823
3733
  )
3824
3734
  )
@@ -3828,9 +3738,9 @@ def getWaybackUrls():
3828
3738
  # if the page number was found then display it, but otherwise we will just try to increment until we have everything
3829
3739
  write(
3830
3740
  colored(
3831
- "\rGetting links from "
3741
+ "Wayback - [ INFO ] Getting links from "
3832
3742
  + str(totalPages)
3833
- + " Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r",
3743
+ + " Wayback Machine (archive.org) API requests (this can take a while for some domains)...",
3834
3744
  "cyan",
3835
3745
  )
3836
3746
  )
@@ -3854,25 +3764,22 @@ def getWaybackUrls():
3854
3764
  if verbose() and len(linkMimes) > 0:
3855
3765
  linkMimes.discard("warc/revisit")
3856
3766
  write(
3857
- getSPACER(
3858
- colored("MIME types found: ", "magenta")
3859
- + colored(str(linkMimes), "white")
3860
- )
3767
+ colored("Wayback - [ INFO ] MIME types found: ", "magenta")
3768
+ + colored(str(linkMimes), "white")
3861
3769
  + "\n"
3862
3770
  )
3863
3771
  linkMimes = None
3864
3772
 
3865
3773
  if not args.xwm:
3866
- linkCount = len(linksFound)
3774
+ linkCountWayback = len(linksFoundWayback)
3867
3775
  write(
3868
- getSPACER(
3869
- colored(
3870
- "Links found on Wayback Machine (archive.org): ", "cyan"
3871
- )
3872
- + colored(str(linkCount), "white")
3776
+ colored(
3777
+ "Wayback - [ INFO ] Links found on Wayback Machine (archive.org): ", "cyan"
3873
3778
  )
3874
- + "\n"
3779
+ + colored(str(linkCountWayback), "white")
3875
3780
  )
3781
+ linksFound.update(linksFoundWayback)
3782
+ linksFoundWayback.clear()
3876
3783
 
3877
3784
  except Exception as e:
3878
3785
  writerr(colored("ERROR getWaybackUrls 1: " + str(e), "red"))
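
The counting change above reflects a pattern applied to every provider in this version: each source gathers links into its own set (linksFoundWayback here, linksFoundCommonCrawl, linksFoundURLScan, and so on), reports its own total, and only then merges into the shared linksFound set. A simplified sketch of that flow; fetch_one_source and the sample URLs are made-up placeholders:

linksFound = set()  # shared result set, as in waymore

def fetch_one_source(urls):
    # Stand-in for getWaybackUrls / getCommonCrawlUrls / getURLScanUrls ...
    linksFoundSource = set()
    for u in urls:
        linksFoundSource.add(u)
    print(f"Source - [ INFO ] Links found: {len(linksFoundSource)}")
    linksFound.update(linksFoundSource)  # merge once, at the end
    linksFoundSource.clear()

fetch_one_source(["https://example.com/a", "https://example.com/b"])
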
@@ -3882,13 +3789,13 @@ def processCommonCrawlCollection(cdxApiUrl):
3882
3789
  """
3883
3790
  Get URLs from a given Common Crawl index collection
3884
3791
  """
3885
- global subs, path, linksFound, linkMimes, stopSource, argsInput
3792
+ global subs, path, linksFound, linkMimes, stopSourceCommonCrawl, argsInput, linkCountCommonCrawl, linksFoundCommonCrawl, current_response, current_session
3886
3793
 
3887
3794
  try:
3888
3795
  # Get memory in case it exceeds threshold
3889
3796
  getMemory()
3890
3797
 
3891
- if not stopSource:
3798
+ if not stopSourceCommonCrawl:
3892
3799
  # Set mime content type filter
3893
3800
  if MATCH_MIME.strip() != "":
3894
3801
  filterMIME = "&filter=~mime:("
@@ -3902,31 +3809,21 @@ def processCommonCrawlCollection(cdxApiUrl):
3902
3809
  # Set status code filter
3903
3810
  filterCode = ""
3904
3811
  if MATCH_CODE.strip() != "":
3905
- filterCode = (
3906
- "&filter=~status:(" + re.escape(MATCH_CODE).replace(",", "|") + ")"
3907
- )
3812
+ filterCode = "&filter=~status:(" + re.escape(MATCH_CODE).replace(",", "|") + ")"
3908
3813
  else:
3909
- filterCode = (
3910
- "&filter=!~status:("
3911
- + re.escape(FILTER_CODE).replace(",", "|")
3912
- + ")"
3913
- )
3814
+ filterCode = "&filter=!~status:(" + re.escape(FILTER_CODE).replace(",", "|") + ")"
3914
3815
 
3915
3816
  # Set keywords filter if -ko argument passed
3916
3817
  filterKeywords = ""
3917
3818
  if args.keywords_only:
3918
3819
  if args.keywords_only == "#CONFIG":
3919
3820
  filterKeywords = (
3920
- "&filter=~url:.*("
3921
- + re.escape(FILTER_KEYWORDS).replace(",", "|")
3922
- + ").*"
3821
+ "&filter=~url:.*(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ").*"
3923
3822
  )
3924
3823
  else:
3925
3824
  filterKeywords = "&filter=~url:.*(" + args.keywords_only + ").*"
3926
3825
 
3927
- commonCrawlUrl = (
3928
- cdxApiUrl + "?output=json&fl=timestamp,url,mime,status,digest&url="
3929
- )
3826
+ commonCrawlUrl = cdxApiUrl + "?output=json&fl=timestamp,url,mime,status,digest&url="
3930
3827
 
3931
3828
  if args.filter_responses_only:
3932
3829
  url = commonCrawlUrl + subs + quote(argsInput) + path
@@ -3947,25 +3844,26 @@ def processCommonCrawlCollection(cdxApiUrl):
3947
3844
  session = requests.Session()
3948
3845
  session.mount("https://", HTTP_ADAPTER_CC)
3949
3846
  session.mount("http://", HTTP_ADAPTER_CC)
3847
+ try:
3848
+ current_session = session
3849
+ except Exception:
3850
+ pass
3950
3851
  resp = session.get(url, stream=True, headers={"User-Agent": userAgent})
3852
+ try:
3853
+ current_response = resp
3854
+ except Exception:
3855
+ pass
3951
3856
  except ConnectionError:
3952
3857
  writerr(
3953
3858
  colored(
3954
- getSPACER(
3955
- "[ ERR ] Common Crawl connection error for index "
3956
- + cdxApiUrl
3957
- ),
3859
+ "CommonCrawl - [ ERR ] Connection error for index " + cdxApiUrl,
3958
3860
  "red",
3959
3861
  )
3960
3862
  )
3961
3863
  resp = None
3962
3864
  return
3963
3865
  except Exception as e:
3964
- writerr(
3965
- colored(
3966
- getSPACER("[ ERR ] Error getting response - " + str(e)), "red"
3967
- )
3968
- )
3866
+ writerr(colored("CommonCrawl - [ ERR ] Error getting response - " + str(e), "red"))
3969
3867
  resp = None
3970
3868
  return
3971
3869
  finally:
@@ -3975,13 +3873,11 @@ def processCommonCrawlCollection(cdxApiUrl):
3975
3873
  if resp.status_code == 429:
3976
3874
  writerr(
3977
3875
  colored(
3978
- getSPACER(
3979
- "[ 429 ] Common Crawl rate limit reached, so stopping. Links that have already been retrieved will be saved."
3980
- ),
3876
+ "CommonCrawl - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
3981
3877
  "red",
3982
3878
  )
3983
3879
  )
3984
- stopSource = True
3880
+ stopSourceCommonCrawl = True
3985
3881
  return
3986
3882
  # If the response from commoncrawl.org says nothing was found...
3987
3883
  if resp.text.lower().find("no captures found") > 0:
@@ -3992,11 +3888,7 @@ def processCommonCrawlCollection(cdxApiUrl):
3992
3888
  if verbose():
3993
3889
  writerr(
3994
3890
  colored(
3995
- getSPACER(
3996
- "[ ERR ] "
3997
- + url
3998
- + " gave an empty response."
3999
- ),
3891
+ "CommonCrawl - [ ERR ] " + url + " gave an empty response.",
4000
3892
  "red",
4001
3893
  )
4002
3894
  )
@@ -4006,12 +3898,10 @@ def processCommonCrawlCollection(cdxApiUrl):
4006
3898
  if verbose():
4007
3899
  writerr(
4008
3900
  colored(
4009
- getSPACER(
4010
- "[ "
4011
- + str(resp.status_code)
4012
- + " ] Error for "
4013
- + cdxApiUrl
4014
- ),
3901
+ "CommonCrawl - [ "
3902
+ + str(resp.status_code)
3903
+ + " ] Error for "
3904
+ + cdxApiUrl,
4015
3905
  "red",
4016
3906
  )
4017
3907
  )
@@ -4020,27 +3910,71 @@ def processCommonCrawlCollection(cdxApiUrl):
4020
3910
  pass
4021
3911
 
4022
3912
  # Get the URLs and MIME types
4023
- for line in resp.iter_lines():
4024
- results = line.decode("utf-8")
4025
- try:
4026
- data = json.loads(results)
4027
- # Get MIME Types if --verbose option was seletced
4028
- if verbose():
4029
- try:
4030
- if data["mime"] != "":
4031
- linkMimes.add(data["mime"])
4032
- except Exception:
4033
- pass
4034
- linksFoundAdd(data["url"])
4035
- except Exception:
4036
- if verbose():
4037
- writerr(
4038
- colored(
4039
- "ERROR processCommonCrawlCollection 2: Cannot get URL and MIME type from line: "
4040
- + str(line),
4041
- "red",
3913
+ try:
3914
+ for line in resp.iter_lines():
3915
+ results = line.decode("utf-8")
3916
+ try:
3917
+ data = json.loads(results)
3918
+ # Get MIME Types if --verbose option was selected
3919
+ if verbose():
3920
+ try:
3921
+ if data["mime"] != "":
3922
+ linkMimes.add(data["mime"])
3923
+ except Exception:
3924
+ pass
3925
+ # If -from or -to were passed, check the timestamp of the URL.
3926
+ # Only continue if the URL falls within the date range specified
3927
+ if args.from_date is not None or args.to_date is not None:
3928
+ try:
3929
+ ts = data["timestamp"]
3930
+
3931
+ # Normalize helper: pad/truncate date string to 14 digits (YYYYMMDDhhmmss)
3932
+ def normalize_date(d, is_from):
3933
+ if d is None:
3934
+ return None
3935
+ d = d.strip()
3936
+ # Pad to 14 digits: from_date pads with 0s, to_date with 9s
3937
+ if is_from:
3938
+ return (d + "0" * (14 - len(d)))[:14]
3939
+ else:
3940
+ return (d + "9" * (14 - len(d)))[:14]
3941
+
3942
+ from_ts = normalize_date(args.from_date, True)
3943
+ to_ts = normalize_date(args.to_date, False)
3944
+
3945
+ # Compare numerically
3946
+ if from_ts and ts < from_ts:
3947
+ continue
3948
+ if to_ts and ts > to_ts:
3949
+ continue
3950
+
3951
+ except Exception as e:
3952
+ writerr(
3953
+ colored(
3954
+ f"ERROR processCommonCrawlCollection 3: Cannot get timestamp from line {line}: {str(e)}",
3955
+ "red",
3956
+ )
3957
+ )
3958
+
3959
+ linksFoundAdd(data["url"], linksFoundCommonCrawl)
3960
+ except Exception:
3961
+ if verbose():
3962
+ writerr(
3963
+ colored(
3964
+ "ERROR processCommonCrawlCollection 2: Cannot get URL and MIME type from line: "
3965
+ + str(line),
3966
+ "red",
3967
+ )
4042
3968
  )
4043
- )
3969
+ finally:
3970
+ try:
3971
+ current_response = None
3972
+ except Exception:
3973
+ pass
3974
+ try:
3975
+ current_session = None
3976
+ except Exception:
3977
+ pass
4044
3978
  else:
4045
3979
  pass
4046
3980
  except Exception as e:
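
A note on the timestamp filtering added above: Common Crawl CDX timestamps are 14-digit YYYYMMDDhhmmss strings, so a partial -from/-to value is padded out (from-dates with zeros, to-dates with nines) and the range check becomes a plain string comparison. A self-contained restatement of the helper used in the loop:

def normalize_date(d, is_from):
    # Pad (or truncate) a partial date such as "2023" or "202306" to 14 digits.
    # From-dates pad with "0" and to-dates with "9", so "2023" spans the whole year.
    d = d.strip()
    pad = "0" if is_from else "9"
    return (d + pad * (14 - len(d)))[:14]

assert normalize_date("2023", True) == "20230000000000"
assert normalize_date("2023", False) == "20239999999999"
assert normalize_date("20230615", False) == "20230615999999"
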
@@ -4067,10 +4001,8 @@ def getCommonCrawlIndexes():
4067
4001
  except Exception as e:
4068
4002
  writerr(
4069
4003
  colored(
4070
- getSPACER(
4071
- "[ ERR ] Couldn't delete local version of Common Crawl index file: "
4072
- + str(e)
4073
- ),
4004
+ "CommonCrawl - [ ERR ] Couldn't delete local version of Common Crawl index file: "
4005
+ + str(e),
4074
4006
  "red",
4075
4007
  )
4076
4008
  )
@@ -4081,17 +4013,15 @@ def getCommonCrawlIndexes():
4081
4013
  if not createFile:
4082
4014
  # Read the indexes from the local file
4083
4015
  try:
4084
- with open(collinfoPath, "r") as file:
4016
+ with open(collinfoPath) as file:
4085
4017
  jsonResp = file.read()
4086
4018
  file.close()
4087
4019
  except Exception as e:
4088
4020
  createFile = True
4089
4021
  writerr(
4090
4022
  colored(
4091
- getSPACER(
4092
- "[ ERR ] Couldn't read local version of Common Crawl index file: "
4093
- + str(e)
4094
- ),
4023
+ "CommonCrawl - [ ERR ] Couldn't read local version of Common Crawl index file: "
4024
+ + str(e),
4095
4025
  "red",
4096
4026
  )
4097
4027
  )
@@ -4104,15 +4034,11 @@ def getCommonCrawlIndexes():
4104
4034
  session = requests.Session()
4105
4035
  session.mount("https://", HTTP_ADAPTER_CC)
4106
4036
  session.mount("http://", HTTP_ADAPTER_CC)
4107
- indexes = session.get(
4108
- CCRAWL_INDEX_URL, headers={"User-Agent": userAgent}
4109
- )
4037
+ indexes = session.get(CCRAWL_INDEX_URL, headers={"User-Agent": userAgent})
4110
4038
  except ConnectionError:
4111
4039
  writerr(
4112
4040
  colored(
4113
- getSPACER(
4114
- "[ ERR ] Common Crawl connection error getting Index file"
4115
- ),
4041
+ "CommonCrawl - [ ERR ] Connection error getting Index file",
4116
4042
  "red",
4117
4043
  )
4118
4044
  )
@@ -4120,10 +4046,8 @@ def getCommonCrawlIndexes():
4120
4046
  except Exception as e:
4121
4047
  writerr(
4122
4048
  colored(
4123
- getSPACER(
4124
- "[ ERR ] Error getting Common Crawl index collection - "
4125
- + str(e)
4126
- ),
4049
+ "CommonCrawl - [ ERR ] Error getting Common Crawl index collection - "
4050
+ + str(e),
4127
4051
  "red",
4128
4052
  )
4129
4053
  )
@@ -4133,9 +4057,7 @@ def getCommonCrawlIndexes():
4133
4057
  if indexes.status_code == 429:
4134
4058
  writerr(
4135
4059
  colored(
4136
- getSPACER(
4137
- "[ 429 ] Common Crawl rate limit reached so unable to get links."
4138
- ),
4060
+ "CommonCrawl - [ 429 ] Rate limit reached so unable to get links.",
4139
4061
  "red",
4140
4062
  )
4141
4063
  )
@@ -4144,7 +4066,7 @@ def getCommonCrawlIndexes():
4144
4066
  elif indexes.status_code == 503:
4145
4067
  writerr(
4146
4068
  colored(
4147
- getSPACER("[ 503 ] Common Crawl seems to be unavailable."),
4069
+ "CommonCrawl - [ 503 ] Common Crawl seems to be unavailable.",
4148
4070
  "red",
4149
4071
  )
4150
4072
  )
@@ -4152,11 +4074,9 @@ def getCommonCrawlIndexes():
4152
4074
  elif indexes.status_code != 200:
4153
4075
  writerr(
4154
4076
  colored(
4155
- getSPACER(
4156
- "[ "
4157
- + str(indexes.status_code)
4158
- + " ] Common Crawl did not retrun the indexes file."
4159
- ),
4077
+ "CommonCrawl - [ "
4078
+ + str(indexes.status_code)
4079
+ + " ] Common Crawl did not return the indexes file.",
4160
4080
  "red",
4161
4081
  )
4162
4082
  )
@@ -4173,10 +4093,8 @@ def getCommonCrawlIndexes():
4173
4093
  except Exception as e:
4174
4094
  writerr(
4175
4095
  colored(
4176
- getSPACER(
4177
- "[ ERR ] Couldn't create local version of Common Crawl index file: "
4178
- + str(e)
4179
- ),
4096
+ "CommonCrawl - [ ERR ] Couldn't create local version of Common Crawl index file: "
4097
+ + str(e),
4180
4098
  "red",
4181
4099
  )
4182
4100
  )
@@ -4187,26 +4105,40 @@ def getCommonCrawlIndexes():
4187
4105
  for values in json.loads(jsonResp):
4188
4106
  for key in values:
4189
4107
  if key == "cdx-api":
4190
- if args.lcy != 0:
4108
+ if args.from_date is not None or args.to_date is not None:
4191
4109
  try:
4192
4110
  indexYear = values[key].split("CC-MAIN-")[1][:4]
4193
- if int(indexYear) >= args.lcy:
4194
- cdxApiUrls.add(values[key])
4111
+
4112
+ # Only get the indexes that fall within the date range specified
4113
+ if args.from_date is not None:
4114
+ fromYear = int(args.from_date[:4])
4115
+ # There are a few exceptions with the filename format at the start of Common Crawl indexes where it contains 2 years, so deal with those (e.g. CC-MAIN-2009-2010-index and CC-MAIN-2008-2009-index)
4116
+ if fromYear in (2009, 2010):
4117
+ fromYear = fromYear - 1
4118
+ if int(indexYear) < fromYear:
4119
+ continue
4120
+ if args.to_date is not None:
4121
+ toYear = int(args.to_date[:4])
4122
+ if int(indexYear) > toYear:
4123
+ continue
4124
+ # If it passed the date range checks then add the index URL
4125
+ cdxApiUrls.add(values[key])
4126
+ collection = collection + 1
4195
4127
  except Exception as e:
4196
4128
  writerr(
4197
4129
  colored(
4198
- getSPACER(
4199
- "[ ERR ] Failed to get the year from index name "
4200
- + values[key]
4201
- + " - "
4202
- + str(e)
4203
- ),
4130
+ "CommonCrawl - [ ERR ] Failed to get the year from index name "
4131
+ + values[key]
4132
+ + " - "
4133
+ + str(e),
4204
4134
  "red",
4205
4135
  )
4206
4136
  )
4207
4137
  else:
4208
4138
  cdxApiUrls.add(values[key])
4209
- collection = collection + 1
4139
+ collection = collection + 1
4140
+
4141
+ # Only get the most recent number of indexes specified by -lcc argument
4210
4142
  if collection == args.lcc:
4211
4143
  break
4212
4144
 
@@ -4220,12 +4152,11 @@ def getCommonCrawlUrls():
4220
4152
  """
4221
4153
  Get all Common Crawl index collections to get all URLs from each one
4222
4154
  """
4223
- global linksFound, linkMimes, waymorePath, subs, path, stopSource, argsInput, checkCommonCrawl
4155
+ global linksFound, linkMimes, waymorePath, subs, path, stopSourceCommonCrawl, argsInput, checkCommonCrawl, linkCountCommonCrawl, linksFoundCommonCrawl
4224
4156
 
4225
4157
  try:
4226
- stopSource = False
4227
- linkMimes = set()
4228
- originalLinkCount = len(linksFound)
4158
+ stopSourceCommonCrawl = False
4159
+ linksFoundCommonCrawl = set()
4229
4160
 
4230
4161
  # Set mime content type filter
4231
4162
  if MATCH_MIME.strip() != "":
@@ -4240,13 +4171,9 @@ def getCommonCrawlUrls():
4240
4171
  # Set status code filter
4241
4172
  filterCode = ""
4242
4173
  if MATCH_CODE.strip() != "":
4243
- filterCode = (
4244
- "&filter=~status:(" + re.escape(MATCH_CODE).replace(",", "|") + ")"
4245
- )
4174
+ filterCode = "&filter=~status:(" + re.escape(MATCH_CODE).replace(",", "|") + ")"
4246
4175
  else:
4247
- filterCode = (
4248
- "&filter=!~status:(" + re.escape(FILTER_CODE).replace(",", "|") + ")"
4249
- )
4176
+ filterCode = "&filter=!~status:(" + re.escape(FILTER_CODE).replace(",", "|") + ")"
4250
4177
 
4251
4178
  if verbose():
4252
4179
  if args.filter_responses_only:
@@ -4267,7 +4194,7 @@ def getCommonCrawlUrls():
4267
4194
  )
4268
4195
  write(
4269
4196
  colored(
4270
- "The commoncrawl index URL requested to get links (where {CDX-API-URL} is from "
4197
+ "CommonCrawl - [ INFO ] The index URL requested to get links (where {CDX-API-URL} is from "
4271
4198
  + CCRAWL_INDEX_URL
4272
4199
  + "): ",
4273
4200
  "magenta",
@@ -4276,9 +4203,7 @@ def getCommonCrawlUrls():
4276
4203
  )
4277
4204
 
4278
4205
  if not args.check_only:
4279
- write(
4280
- colored("\rGetting commoncrawl.org index collections list...\r", "cyan")
4281
- )
4206
+ write(colored("CommonCrawl - [ INFO ] Getting index collections list...", "cyan"))
4282
4207
 
4283
4208
  # Get the Common Crawl index collections
4284
4209
  cdxApiUrls = getCommonCrawlIndexes()
@@ -4291,15 +4216,15 @@ def getCommonCrawlUrls():
4291
4216
  else:
4292
4217
  checkCommonCrawl = len(cdxApiUrls) + 1
4293
4218
  write(
4294
- colored("Get URLs from Common Crawl: ", "cyan")
4219
+ colored("CommonCrawl - [ INFO ] Get URLs from Common Crawl: ", "cyan")
4295
4220
  + colored(str(checkCommonCrawl) + " requests", "white")
4296
4221
  )
4297
4222
  else:
4298
4223
  write(
4299
4224
  colored(
4300
- "\rGetting links from the latest "
4225
+ "CommonCrawl - [ INFO ] Getting links from the latest "
4301
4226
  + str(len(cdxApiUrls))
4302
- + " commoncrawl.org index collections (this can take a while for some domains)...\r",
4227
+ + " commoncrawl.org index collections (this can take a while for some domains)...",
4303
4228
  "cyan",
4304
4229
  )
4305
4230
  )
@@ -4315,30 +4240,18 @@ def getCommonCrawlUrls():
4315
4240
  if verbose() and len(linkMimes) > 0:
4316
4241
  linkMimes.discard("warc/revisit")
4317
4242
  write(
4318
- getSPACER(
4319
- colored("MIME types found: ", "magenta")
4320
- + colored(str(linkMimes), "white")
4321
- )
4243
+ colored("CommonCrawl - [ INFO ] MIME types found: ", "magenta")
4244
+ + colored(str(linkMimes), "white")
4322
4245
  + "\n"
4323
4246
  )
4324
4247
 
4325
- linkCount = len(linksFound) - originalLinkCount
4326
- if args.xwm:
4327
- write(
4328
- getSPACER(
4329
- colored("Links found on commoncrawl.org: ", "cyan")
4330
- + colored(str(linkCount), "white")
4331
- )
4332
- + "\n"
4333
- )
4334
- else:
4335
- write(
4336
- getSPACER(
4337
- colored("Extra links found on commoncrawl.org: ", "cyan")
4338
- + colored(str(linkCount), "white")
4339
- )
4340
- + "\n"
4341
- )
4248
+ linkCountCommonCrawl = len(linksFoundCommonCrawl)
4249
+ write(
4250
+ colored("CommonCrawl - [ INFO ] Links found on commoncrawl.org: ", "cyan")
4251
+ + colored(str(linkCountCommonCrawl), "white")
4252
+ )
4253
+ linksFound.update(linksFoundCommonCrawl)
4254
+ linksFoundCommonCrawl.clear()
4342
4255
 
4343
4256
  except Exception as e:
4344
4257
  writerr(colored("ERROR getCommonCrawlUrls 1: " + str(e), "red"))
@@ -4348,7 +4261,7 @@ def processVirusTotalUrl(url):
4348
4261
  """
4349
4262
  Process a specific URL from virustotal.com to determine whether to save the link
4350
4263
  """
4351
- global argsInput, argsInputHostname
4264
+ global argsInput, argsInputHostname, linkCountVirusTotal, linksFoundVirusTotal
4352
4265
 
4353
4266
  addLink = True
4354
4267
 
@@ -4394,9 +4307,7 @@ def processVirusTotalUrl(url):
4394
4307
  flags=re.IGNORECASE,
4395
4308
  )
4396
4309
  else:
4397
- match = re.search(
4398
- r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE
4399
- )
4310
+ match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
4400
4311
  if match is None:
4401
4312
  addLink = False
4402
4313
 
@@ -4417,7 +4328,7 @@ def processVirusTotalUrl(url):
4417
4328
  flags=re.IGNORECASE,
4418
4329
  )
4419
4330
  if match is not None:
4420
- linksFoundAdd(url)
4331
+ linksFoundAdd(url, linksFoundVirusTotal)
4421
4332
 
4422
4333
  except Exception as e:
4423
4334
  writerr(colored("ERROR processVirusTotalUrl 1: " + str(e), "red"))
@@ -4425,58 +4336,50 @@ def processVirusTotalUrl(url):
4425
4336
 
4426
4337
  def getVirusTotalUrls():
4427
4338
  """
4428
- Get URLs from the VirusTotal API v2
4339
+ Get URLs from the VirusTotal API v2 and process them.
4340
+ Each URL is normalized as a (url, scan_date) tuple. Dates are filtered according to args.from_date / args.to_date.
4429
4341
  """
4430
- global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkVirusTotal, argsInputHostname
4342
+ global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceVirusTotal, argsInput, checkVirusTotal, argsInputHostname, linkCountVirusTotal, linksFoundVirusTotal
4431
4343
 
4432
- # Write the file of URL's for the passed domain/URL
4433
4344
  try:
4434
- requestsMade = 0
4435
- stopSource = False
4436
- linkMimes = set()
4437
- originalLinkCount = len(linksFound)
4345
+ stopSourceVirusTotal = False
4346
+ linksFoundVirusTotal = set()
4438
4347
 
4439
- # Just pass the hostname in the URL
4348
+ # Build the VirusTotal API URL
4440
4349
  url = VIRUSTOTAL_URL.replace("{DOMAIN}", quote(argsInputHostname)).replace(
4441
4350
  "{APIKEY}", VIRUSTOTAL_API_KEY
4442
4351
  )
4443
4352
 
4444
4353
  if verbose():
4445
4354
  write(
4446
- colored("The VirusTotal URL requested to get links: ", "magenta")
4355
+ colored("VirusTotal - [ INFO ] The URL requested to get links: ", "magenta")
4447
4356
  + colored(url + "\n", "white")
4448
4357
  )
4449
4358
 
4450
4359
  if not args.check_only:
4451
- write(colored("\rGetting links from virustotal.com API...\r", "cyan"))
4360
+ write(colored("VirusTotal - [ INFO ] Getting links from virustotal.com API...", "cyan"))
4452
4361
 
4453
- # Get the domain report from virustotal
4362
+ # Make request
4454
4363
  try:
4455
- # Choose a random user agent string to use for any requests
4456
4364
  userAgent = random.choice(USER_AGENT)
4457
4365
  session = requests.Session()
4458
4366
  session.mount("https://", HTTP_ADAPTER)
4459
4367
  session.mount("http://", HTTP_ADAPTER)
4460
4368
  resp = session.get(url, headers={"User-Agent": userAgent})
4461
- requestsMade = requestsMade + 1
4462
4369
  except Exception as e:
4463
- write(
4370
+ writerr(
4464
4371
  colored(
4465
- getSPACER(
4466
- "[ ERR ] Unable to get links from virustotal.com: " + str(e)
4467
- ),
4372
+ "VirusTotal - [ ERR ] Unable to get links from virustotal.com: " + str(e),
4468
4373
  "red",
4469
4374
  )
4470
4375
  )
4471
4376
  return
4472
4377
 
4473
- # Deal with any errors
4378
+ # Handle HTTP errors
4474
4379
  if resp.status_code == 429:
4475
4380
  writerr(
4476
4381
  colored(
4477
- getSPACER(
4478
- "[ 429 ] VirusTotal rate limit reached so unable to get links."
4479
- ),
4382
+ "VirusTotal - [ 429 ] Rate limit reached so unable to get links.",
4480
4383
  "red",
4481
4384
  )
4482
4385
  )
@@ -4484,9 +4387,7 @@ def getVirusTotalUrls():
4484
4387
  elif resp.status_code == 403:
4485
4388
  writerr(
4486
4389
  colored(
4487
- getSPACER(
4488
- "[ 403 ] VirusTotal: Permission denied. Check your API key is correct."
4489
- ),
4390
+ "VirusTotal - [ 403 ] Permission denied. Check your API key is correct.",
4490
4391
  "red",
4491
4392
  )
4492
4393
  )
@@ -4494,101 +4395,94 @@ def getVirusTotalUrls():
4494
4395
  elif resp.status_code != 200:
4495
4396
  writerr(
4496
4397
  colored(
4497
- getSPACER(
4498
- "[ "
4499
- + str(resp.status_code)
4500
- + " ] Unable to get links from virustotal.com"
4501
- ),
4398
+ "VirusTotal - [ ERR ] [ "
4399
+ + str(resp.status_code)
4400
+ + " ] Unable to get links from virustotal.com",
4502
4401
  "red",
4503
4402
  )
4504
4403
  )
4505
4404
  return
4506
4405
 
4507
- # Get the JSON response
4406
+ # Parse JSON
4508
4407
  try:
4509
4408
  jsonResp = json.loads(resp.text.strip())
4510
4409
 
4511
- # Get the different URLs
4410
+ # Normalize arrays as (url, scan_date) tuples
4512
4411
  if args.no_subs:
4513
- subDomains = []
4412
+ subdomains = []
4514
4413
  else:
4515
- try:
4516
- subDomains = jsonResp["subdomains"]
4517
- except Exception:
4518
- subDomains = []
4519
- try:
4520
- detectedUrls = [
4521
- entry["url"] for entry in jsonResp.get("detected_urls", [])
4522
- ]
4523
- except Exception:
4524
- detectedUrls = []
4525
- try:
4526
- undetectedUrls = [
4527
- entry[0] for entry in jsonResp.get("undetected_urls", [])
4528
- ]
4529
- except Exception:
4530
- undetectedUrls = []
4531
- try:
4532
- totalUrls = set(subDomains + detectedUrls + undetectedUrls)
4533
- except Exception:
4534
- totalUrls = []
4535
- except Exception:
4414
+ subdomains = [(sd, None) for sd in jsonResp.get("subdomains", [])]
4415
+
4416
+ detected_urls = [
4417
+ (entry.get("url"), entry.get("scan_date"))
4418
+ for entry in jsonResp.get("detected_urls", [])
4419
+ ]
4420
+
4421
+ undetected_urls = [
4422
+ (entry[0], entry[4]) for entry in jsonResp.get("undetected_urls", [])
4423
+ ]
4424
+
4425
+ # Combine all
4426
+ all_urls = subdomains + detected_urls + undetected_urls
4427
+
4428
+ except Exception as e:
4536
4429
  writerr(
4537
4430
  colored(
4538
- getSPACER(
4539
- "[ ERR ] There was an unexpected response from the VirusTotal API"
4540
- ),
4431
+ "VirusTotal - [ ERR ] Unexpected response from the VirusTotal API: " + str(e),
4541
4432
  "red",
4542
4433
  )
4543
4434
  )
4544
- totalUrls = []
4435
+ all_urls = []
4545
4436
 
4437
+ # Check only mode
4546
4438
  if args.check_only:
4547
4439
  write(
4548
- colored("Get URLs from VirusTotal: ", "cyan")
4440
+ colored("VirusTotal - [ INFO ] Get URLs from VirusTotal: ", "cyan")
4549
4441
  + colored("1 request", "white")
4550
4442
  )
4551
4443
  checkVirusTotal = 1
4552
4444
  else:
4553
- # Carry on if something was found
4554
- for vturl in totalUrls:
4555
-
4556
- if stopSource:
4445
+ # Process each URL tuple
4446
+ for url, scan_date in all_urls:
4447
+ if stopSourceVirusTotal:
4557
4448
  break
4558
-
4559
- # Get memory in case it exceeds threshold
4560
4449
  getMemory()
4561
4450
 
4562
- # Work out whether to include it
4563
- processVirusTotalUrl(vturl)
4564
-
4565
- linkCount = len(linksFound) - originalLinkCount
4566
- if args.xwm and args.xcc and args.xav and args.xus:
4567
- write(
4568
- getSPACER(
4569
- colored("Links found on virustotal.com: ", "cyan")
4570
- + colored(str(linkCount), "white")
4571
- )
4572
- + "\n"
4573
- )
4574
- else:
4575
- write(
4576
- getSPACER(
4577
- colored("Extra links found on virustotal.com: ", "cyan")
4578
- + colored(str(linkCount), "white")
4579
- )
4580
- + "\n"
4581
- )
4451
+ # Filter by date if -from or -to was passed and we have a date for the url
4452
+ if scan_date and (args.from_date is not None or args.to_date is not None):
4453
+ urlDate = datetime.strptime(scan_date, "%Y-%m-%d %H:%M:%S")
4454
+ # If from date passed, check
4455
+ if args.from_date is not None:
4456
+ fromDate = parseDateArg(args.from_date)
4457
+ if urlDate < fromDate:
4458
+ continue
4459
+ # If to date passed, check
4460
+ if args.to_date is not None:
4461
+ toDate = parseDateArg(args.to_date)
4462
+ if urlDate >= toDate:
4463
+ continue
4464
+
4465
+ # Process URL
4466
+ processVirusTotalUrl(url)
4467
+
4468
+ # Show links found
4469
+ linkCountVirusTotal = len(linksFoundVirusTotal)
4470
+ write(
4471
+ colored("VirusTotal - [ INFO ] Links found on virustotal.com: ", "cyan")
4472
+ + colored(str(linkCountVirusTotal), "white")
4473
+ )
4474
+ linksFound.update(linksFoundVirusTotal)
4475
+ linksFoundVirusTotal.clear()
4582
4476
 
4583
4477
  except Exception as e:
4584
- writerr(colored("ERROR getVirusTotalUrls 1: " + str(e), "red"))
4478
+ writerr(colored(f"ERROR getVirusTotalUrls: {e}", "red"))
4585
4479
 
4586
4480
 
4587
4481
  def processIntelxUrl(url):
4588
4482
  """
4589
4483
  Process a specific URL from intelx.io to determine whether to save the link
4590
4484
  """
4591
- global argsInput, argsInputHostname
4485
+ global argsInput, argsInputHostname, linkCountIntelx, linksFoundIntelx
4592
4486
 
4593
4487
  addLink = True
4594
4488
 
@@ -4634,15 +4528,13 @@ def processIntelxUrl(url):
4634
4528
  flags=re.IGNORECASE,
4635
4529
  )
4636
4530
  else:
4637
- match = re.search(
4638
- r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE
4639
- )
4531
+ match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
4640
4532
  if match is None:
4641
4533
  addLink = False
4642
4534
 
4643
4535
  # Add link if it passed filters
4644
4536
  if addLink:
4645
- linksFoundAdd(url)
4537
+ linksFoundAdd(url, linksFoundIntelx)
4646
4538
 
4647
4539
  except Exception as e:
4648
4540
  writerr(colored("ERROR processIntelxUrl 1: " + str(e), "red"))
@@ -4653,6 +4545,7 @@ def processIntelxType(target, credits):
4653
4545
  target: 1 - Domains
4654
4546
  target: 3 - URLs
4655
4547
  """
4548
+ global intelxAPIIssue
4656
4549
  try:
4657
4550
  try:
4658
4551
  requestsMade = 0
@@ -4665,18 +4558,14 @@ def processIntelxType(target, credits):
4665
4558
  # Pass the API key in the X-Key header too.
4666
4559
  resp = session.post(
4667
4560
  INTELX_SEARCH_URL,
4668
- data='{"term":"'
4669
- + quote(argsInputHostname)
4670
- + '","target":'
4671
- + str(target)
4672
- + "}",
4561
+ data='{"term":"' + quote(argsInputHostname) + '","target":' + str(target) + "}",
4673
4562
  headers={"User-Agent": userAgent, "X-Key": INTELX_API_KEY},
4674
4563
  )
4675
4564
  requestsMade = requestsMade + 1
4676
4565
  except Exception as e:
4677
4566
  write(
4678
4567
  colored(
4679
- getSPACER("[ ERR ] Unable to get links from intelx.io: " + str(e)),
4568
+ "IntelX - [ ERR ] Unable to get links from intelx.io: " + str(e),
4680
4569
  "red",
4681
4570
  )
4682
4571
  )
@@ -4684,53 +4573,47 @@ def processIntelxType(target, credits):
4684
4573
 
4685
4574
  # Deal with any errors
4686
4575
  if resp.status_code == 429:
4576
+ intelxAPIIssue = True
4687
4577
  writerr(
4688
4578
  colored(
4689
- getSPACER(
4690
- "[ 429 ] IntelX rate limit reached so unable to get links."
4691
- ),
4579
+ "IntelX - [ 429 ] Rate limit reached so unable to get links.",
4692
4580
  "red",
4693
4581
  )
4694
4582
  )
4695
4583
  return
4696
4584
  elif resp.status_code == 401:
4585
+ intelxAPIIssue = True
4697
4586
  writerr(
4698
4587
  colored(
4699
- getSPACER(
4700
- "[ 401 ] IntelX: Not authorized. The source requires a paid API key. Check your API key is correct."
4701
- ),
4588
+ "IntelX - [ 401 ] Not authorized. The source requires a paid API key. Check your API key is correct.",
4702
4589
  "red",
4703
4590
  )
4704
4591
  )
4705
4592
  return
4706
4593
  elif resp.status_code == 402:
4594
+ intelxAPIIssue = True
4707
4595
  if credits.startswith("0/"):
4708
4596
  writerr(
4709
4597
  colored(
4710
- getSPACER(
4711
- "[ 402 ] IntelX: You have run out of daily credits on Intelx ("
4712
- + credits
4713
- + ")."
4714
- ),
4598
+ "IntelX - [ 402 ] You have run out of daily credits on Intelx ("
4599
+ + credits
4600
+ + ").",
4715
4601
  "red",
4716
4602
  )
4717
4603
  )
4718
4604
  else:
4719
4605
  writerr(
4720
4606
  colored(
4721
- getSPACER(
4722
- "[ 402 ] IntelX: It appears you have run out of daily credits on Intelx."
4723
- ),
4607
+ "IntelX - [ 402 ] It appears you have run out of daily credits on Intelx.",
4724
4608
  "red",
4725
4609
  )
4726
4610
  )
4727
4611
  return
4728
4612
  elif resp.status_code == 403:
4613
+ intelxAPIIssue = True
4729
4614
  writerr(
4730
4615
  colored(
4731
- getSPACER(
4732
- "[ 403 ] IntelX: Permission denied. Check your API key is correct."
4733
- ),
4616
+ "IntelX - [ 403 ] Permission denied. Check your API key is correct.",
4734
4617
  "red",
4735
4618
  )
4736
4619
  )
@@ -4738,11 +4621,7 @@ def processIntelxType(target, credits):
4738
4621
  elif resp.status_code != 200:
4739
4622
  writerr(
4740
4623
  colored(
4741
- getSPACER(
4742
- "[ "
4743
- + str(resp.status_code)
4744
- + " ] Unable to get links from intelx.io"
4745
- ),
4624
+ "IntelX - [ " + str(resp.status_code) + " ] Unable to get links from intelx.io",
4746
4625
  "red",
4747
4626
  )
4748
4627
  )
@@ -4755,9 +4634,7 @@ def processIntelxType(target, credits):
4755
4634
  except Exception:
4756
4635
  writerr(
4757
4636
  colored(
4758
- getSPACER(
4759
- "[ ERR ] There was an unexpected response from the Intelligence API"
4760
- ),
4637
+ "IntelX - [ ERR ] There was an unexpected response from the Intelligence API",
4761
4638
  "red",
4762
4639
  )
4763
4640
  )
@@ -4767,7 +4644,7 @@ def processIntelxType(target, credits):
4767
4644
  moreResults = True
4768
4645
  status = 0
4769
4646
  while moreResults:
4770
- if stopSource:
4647
+ if stopSourceIntelx:
4771
4648
  break
4772
4649
  try:
4773
4650
  resp = session.get(
@@ -4778,9 +4655,7 @@ def processIntelxType(target, credits):
4778
4655
  except Exception as e:
4779
4656
  write(
4780
4657
  colored(
4781
- getSPACER(
4782
- "[ ERR ] Unable to get links from intelx.io: " + str(e)
4783
- ),
4658
+ "IntelX - [ ERR ] Unable to get links from intelx.io: " + str(e),
4784
4659
  "red",
4785
4660
  )
4786
4661
  )
@@ -4793,9 +4668,7 @@ def processIntelxType(target, credits):
4793
4668
  except Exception:
4794
4669
  writerr(
4795
4670
  colored(
4796
- getSPACER(
4797
- "[ ERR ] There was an unexpected response from the Intelligence API"
4798
- ),
4671
+ "IntelX - [ ERR ] There was an unexpected response from the Intelligence API",
4799
4672
  "red",
4800
4673
  )
4801
4674
  )
@@ -4817,7 +4690,7 @@ def processIntelxType(target, credits):
4817
4690
  # Work out whether to include each url
4818
4691
  unique_values = list(set(selector_values + selector_valuesh))
4819
4692
  for ixurl in unique_values:
4820
- if stopSource:
4693
+ if stopSourceIntelx:
4821
4694
  break
4822
4695
  processIntelxUrl(ixurl)
4823
4696
 
@@ -4845,14 +4718,10 @@ def getIntelxAccountInfo() -> str:
4845
4718
  )
4846
4719
  jsonResp = json.loads(resp.text.strip())
4847
4720
  credits = str(
4848
- jsonResp.get("paths", {})
4849
- .get("/phonebook/search", {})
4850
- .get("Credit", "Unknown")
4721
+ jsonResp.get("paths", {}).get("/phonebook/search", {}).get("Credit", "Unknown")
4851
4722
  )
4852
4723
  credits_max = str(
4853
- jsonResp.get("paths", {})
4854
- .get("/phonebook/search", {})
4855
- .get("CreditMax", "Unknown")
4724
+ jsonResp.get("paths", {}).get("/phonebook/search", {}).get("CreditMax", "Unknown")
4856
4725
  )
4857
4726
  return credits + "/" + credits_max
4858
4727
  except Exception:
@@ -4863,25 +4732,26 @@ def getIntelxUrls():
4863
4732
  """
4864
4733
  Get URLs from the Intelligence X Phonebook search
4865
4734
  """
4866
- global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSource, argsInput, checkIntelx, argsInputHostname
4735
+ global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx
4867
4736
 
4868
4737
  # Write the file of URL's for the passed domain/URL
4869
4738
  try:
4870
4739
  if args.check_only:
4871
4740
  write(
4872
- colored("Get URLs from Intelligence X: ", "cyan")
4741
+ colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
4873
4742
  + colored("minimum 4 requests", "white")
4874
4743
  )
4875
4744
  checkIntelx = 4
4876
4745
  return
4877
4746
 
4878
- stopSource = False
4879
- originalLinkCount = len(linksFound)
4747
+ stopSourceIntelx = False
4748
+ linksFoundIntelx = set()
4749
+
4880
4750
  credits = getIntelxAccountInfo()
4881
4751
  if verbose():
4882
4752
  write(
4883
4753
  colored(
4884
- "The Intelligence X URL requested to get links (Credits: "
4754
+ "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
4885
4755
  + credits
4886
4756
  + "): ",
4887
4757
  "magenta",
@@ -4890,32 +4760,23 @@ def getIntelxUrls():
4890
4760
  )
4891
4761
 
4892
4762
  if not args.check_only:
4893
- write(colored("\rGetting links from intelx.io API...\r", "cyan"))
4763
+ write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))
4894
4764
 
4895
4765
  # Get the domains from Intelligence X if the --no-subs wasn't passed
4896
4766
  if not args.no_subs:
4897
4767
  processIntelxType(1, credits)
4898
4768
 
4899
4769
  # Get the URLs from Intelligence X
4900
- processIntelxType(3, credits)
4770
+ if not intelxAPIIssue:
4771
+ processIntelxType(3, credits)
4901
4772
 
4902
- linkCount = len(linksFound) - originalLinkCount
4903
- if args.xwm and args.xcc and args.xav and args.xus and args.xvt:
4904
- write(
4905
- getSPACER(
4906
- colored("Links found on intelx.io: ", "cyan")
4907
- + colored(str(linkCount), "white")
4908
- )
4909
- + "\n"
4910
- )
4911
- else:
4912
- write(
4913
- getSPACER(
4914
- colored("Extra links found on intelx.io: ", "cyan")
4915
- + colored(str(linkCount), "white")
4916
- )
4917
- + "\n"
4918
- )
4773
+ linkCountIntelx = len(linksFoundIntelx)
4774
+ write(
4775
+ colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
4776
+ + colored(str(linkCountIntelx), "white")
4777
+ )
4778
+ linksFound.update(linksFoundIntelx)
4779
+ linksFoundIntelx.clear()
4919
4780
 
4920
4781
  except Exception as e:
4921
4782
  writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
@@ -4968,27 +4829,23 @@ def processResponsesURLScan():
4968
4829
  indexPath = responseOutputDirectory + "waymore_index.txt"
4969
4830
  except Exception as e:
4970
4831
  if verbose():
4971
- writerr(
4972
- colored("ERROR processResponsesURLScan 4: " + str(e), "red")
4973
- )
4832
+ writerr(colored("ERROR processResponsesURLScan 4: " + str(e), "red"))
4974
4833
 
4975
4834
  # Get URLs from URLScan.io if the DOM ID's haven't been retrieved yet
4976
- if args.mode == "R" and stopProgram is None and not args.check_only:
4977
- write(
4978
- colored(
4979
- "\rGetting list of response links (this can take a while for some domains)...\r",
4980
- "cyan",
4835
+ if stopProgram is None and not args.check_only:
4836
+ if args.mode in ("R", "B"):
4837
+ write(
4838
+ colored(
4839
+ "URLScan - [ INFO ] Getting list of response links (this can take a while for some domains)...",
4840
+ "cyan",
4841
+ )
4981
4842
  )
4982
- )
4983
- getURLScanUrls()
4843
+ if args.mode == "R":
4844
+ getURLScanUrls()
4984
4845
 
4985
4846
  # Check if a continueResp.URLScan.tmp and responses.URLScan.tmp files exists
4986
4847
  runPrevious = "n"
4987
- if (
4988
- not args.check_only
4989
- and os.path.exists(continuePath)
4990
- and os.path.exists(responsesPath)
4991
- ):
4848
+ if not args.check_only and os.path.exists(continuePath) and os.path.exists(responsesPath):
4992
4849
 
4993
4850
  # Load the links into the set
4994
4851
  with open(responsesPath, "rb") as fl:
@@ -4997,7 +4854,7 @@ def processResponsesURLScan():
4997
4854
 
4998
4855
  # Get the previous end position to start again at this point
4999
4856
  try:
5000
- with open(continuePath, "r") as fc:
4857
+ with open(continuePath) as fc:
5001
4858
  successCount = int(fc.readline().strip())
5002
4859
  except Exception:
5003
4860
  successCount = 0
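For context, the continueResp.URLScan.tmp and responses.URLScan.tmp files handled above act as a simple resume mechanism: a serialized set of response links (assumed to be pickled, as it is opened in binary mode) plus a one-line counter of how many responses were already downloaded. A rough sketch of that save/restore pattern, with hypothetical file paths and helper names:

    import pickle

    def save_progress(link_requests, success_count, responses_path, continue_path):
        # Persist the full link set once, and the current position after each batch
        with open(responses_path, "wb") as fl:
            pickle.dump(link_requests, fl)
        with open(continue_path, "w") as fc:
            fc.write(str(success_count) + "\n")

    def load_progress(responses_path, continue_path):
        with open(responses_path, "rb") as fl:
            link_requests = pickle.load(fl)
        try:
            with open(continue_path) as fc:
                success_count = int(fc.readline().strip())
        except Exception:
            success_count = 0          # no usable counter: start from the beginning
        return link_requests, success_count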
@@ -5082,25 +4939,6 @@ def processResponsesURLScan():
5082
4939
  "green",
5083
4940
  )
5084
4941
  )
5085
- # if args.limit == 5000 and totalResponses == 5000:
5086
- # writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests (the --limit argument defaults to '+str(DEFAULT_LIMIT)+')','cyan'))
5087
- # else:
5088
- # writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests','white'))
5089
- # minutes = round(totalResponses*2.5 // 60)
5090
- # hours = minutes // 60
5091
- # days = hours // 24
5092
- # if minutes < 5:
5093
- # write(colored('\n-> Downloading the responses (depending on their size) should be quite quick!','green'))
5094
- # elif hours < 2:
5095
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(minutes)+' minutes.','green'))
5096
- # elif hours < 6:
5097
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','green'))
5098
- # elif hours < 24:
5099
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','yellow'))
5100
- # elif days < 7:
5101
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days. Consider using arguments -ko, -l, -ci, -from and -to wisely! ','red'))
5102
- # else:
5103
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days!!! Consider using arguments -ko, -l, -ci, -from and -to wisely!','red'))
5104
4942
  write("")
5105
4943
  else:
5106
4944
  # If the limit has been set over the default, give a warning that this could take a long time!
@@ -5162,7 +5000,7 @@ def processResponsesURLScan():
5162
5000
  if failureCount > 0:
5163
5001
  if verbose():
5164
5002
  write(
5165
- colored("\nURLScan responses saved to ", "cyan")
5003
+ colored("URLScan - [ INFO ] Responses saved to ", "cyan")
5166
5004
  + colored(responseOutputDirectory, "white")
5167
5005
  + colored(" for " + subs + argsInput + ": ", "cyan")
5168
5006
  + colored(
@@ -5177,10 +5015,7 @@ def processResponsesURLScan():
5177
5015
  else:
5178
5016
  write(
5179
5017
  colored(
5180
- "\nURLScan responses saved for "
5181
- + subs
5182
- + argsInput
5183
- + ": ",
5018
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5184
5019
  "cyan",
5185
5020
  )
5186
5021
  + colored(
@@ -5195,7 +5030,10 @@ def processResponsesURLScan():
5195
5030
  else:
5196
5031
  if verbose():
5197
5032
  write(
5198
- colored("\nURLScan responses saved to ", "cyan")
5033
+ colored(
5034
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5035
+ "cyan",
5036
+ )
5199
5037
  + colored(responseOutputDirectory, "white")
5200
5038
  + colored(" for " + subs + argsInput + ": ", "cyan")
5201
5039
  + colored(
@@ -5209,10 +5047,7 @@ def processResponsesURLScan():
5209
5047
  else:
5210
5048
  write(
5211
5049
  colored(
5212
- "\nURLScan responses saved for "
5213
- + subs
5214
- + argsInput
5215
- + ": ",
5050
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5216
5051
  "cyan",
5217
5052
  )
5218
5053
  + colored(
@@ -5225,9 +5060,7 @@ def processResponsesURLScan():
5225
5060
  )
5226
5061
  except Exception as e:
5227
5062
  if verbose():
5228
- writerr(
5229
- colored("ERROR processResponsesURLScan 5: " + str(e), "red")
5230
- )
5063
+ writerr(colored("ERROR processResponsesURLScan 5: " + str(e), "red"))
5231
5064
 
5232
5065
  totalFileCount = totalFileCount + fileCount
5233
5066
  except Exception as e:
@@ -5240,7 +5073,7 @@ def processResponsesWayback():
5240
5073
  """
5241
5074
  Get archived responses from Wayback Machine (archive.org)
5242
5075
  """
5243
- global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount
5076
+ global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount, current_response, current_session
5244
5077
  try:
5245
5078
  fileCount = 0
5246
5079
  failureCount = 0
@@ -5255,17 +5088,11 @@ def processResponsesWayback():
5255
5088
  indexPath = responseOutputDirectory + "waymore_index.txt"
5256
5089
  except Exception as e:
5257
5090
  if verbose():
5258
- writerr(
5259
- colored("ERROR processResponsesWayback 4: " + str(e), "red")
5260
- )
5091
+ writerr(colored("ERROR processResponsesWayback 4: " + str(e), "red"))
5261
5092
 
5262
5093
  # Check if a continueResp.tmp and responses.tmp files exists
5263
5094
  runPrevious = "n"
5264
- if (
5265
- not args.check_only
5266
- and os.path.exists(continuePath)
5267
- and os.path.exists(responsesPath)
5268
- ):
5095
+ if not args.check_only and os.path.exists(continuePath) and os.path.exists(responsesPath):
5269
5096
 
5270
5097
  # Load the links into the set
5271
5098
  with open(responsesPath, "rb") as fl:
@@ -5274,7 +5101,7 @@ def processResponsesWayback():
5274
5101
 
5275
5102
  # Get the previous end position to start again at this point
5276
5103
  try:
5277
- with open(continuePath, "r") as fc:
5104
+ with open(continuePath) as fc:
5278
5105
  successCount = int(fc.readline().strip())
5279
5106
  except Exception:
5280
5107
  successCount = 0
@@ -5349,9 +5176,7 @@ def processResponsesWayback():
5349
5176
  # Set mime content type filter
5350
5177
  filterMIME = ""
5351
5178
  if MATCH_MIME.strip() != "":
5352
- filterMIME = "&filter=mimetype:" + re.escape(MATCH_MIME).replace(
5353
- ",", "|"
5354
- )
5179
+ filterMIME = "&filter=mimetype:" + re.escape(MATCH_MIME).replace(",", "|")
5355
5180
  else:
5356
5181
  filterMIME = "&filter=!mimetype:warc/revisit"
5357
5182
  filterMIME = filterMIME + "|" + re.escape(FILTER_MIME).replace(",", "|")
@@ -5359,13 +5184,9 @@ def processResponsesWayback():
5359
5184
  # Set status code filter
5360
5185
  filterCode = ""
5361
5186
  if MATCH_CODE.strip() != "":
5362
- filterCode = "&filter=statuscode:" + re.escape(MATCH_CODE).replace(
5363
- ",", "|"
5364
- )
5187
+ filterCode = "&filter=statuscode:" + re.escape(MATCH_CODE).replace(",", "|")
5365
5188
  else:
5366
- filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(
5367
- ",", "|"
5368
- )
5189
+ filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(",", "|")
5369
5190
 
5370
5191
  # Set the collapse parameter value in the archive.org URL. From the Wayback API docs:
5371
5192
  # "A new form of filtering is the option to 'collapse' results based on a field, or a substring of a field.
@@ -5377,9 +5198,7 @@ def processResponsesWayback():
5377
5198
  collapse = "&collapse=timestamp:10"
5378
5199
  elif args.capture_interval == "d": # get at most 1 capture per URL per day
5379
5200
  collapse = "&collapse=timestamp:8"
5380
- elif (
5381
- args.capture_interval == "m"
5382
- ): # get at most 1 capture per URL per month
5201
+ elif args.capture_interval == "m": # get at most 1 capture per URL per month
5383
5202
  collapse = "&collapse=timestamp:6"
5384
5203
 
5385
5204
  url = (
@@ -5397,18 +5216,18 @@ def processResponsesWayback():
5397
5216
  if verbose():
5398
5217
  write(
5399
5218
  colored(
5400
- "The Wayback Machine URL requested to get responses: ",
5219
+ "Wayback - [ INFO ] The URL requested to get responses: ",
5401
5220
  "magenta",
5402
5221
  )
5403
5222
  + colored(url + "\n", "white")
5404
5223
  )
5405
5224
 
5406
5225
  if args.check_only:
5407
- write(colored("\rChecking archived response requests...\r", "cyan"))
5226
+ write(colored("Wayback - [ INFO ] Checking archived response requests...", "cyan"))
5408
5227
  else:
5409
5228
  write(
5410
5229
  colored(
5411
- "\rGetting list of response links (this can take a while for some domains)...\r",
5230
+ "Wayback - [ INFO ] Getting list of response links (this can take a while for some domains)...",
5412
5231
  "cyan",
5413
5232
  )
5414
5233
  )
@@ -5421,18 +5240,24 @@ def processResponsesWayback():
5421
5240
  session = requests.Session()
5422
5241
  session.mount("https://", HTTP_ADAPTER)
5423
5242
  session.mount("http://", HTTP_ADAPTER)
5243
+ try:
5244
+ current_session = session
5245
+ except Exception:
5246
+ pass
5424
5247
  resp = session.get(
5425
5248
  url,
5426
5249
  stream=True,
5427
5250
  headers={"User-Agent": userAgent},
5428
5251
  timeout=args.timeout,
5429
5252
  )
5253
+ try:
5254
+ current_response = resp
5255
+ except Exception:
5256
+ pass
5430
5257
  except ConnectionError:
5431
5258
  writerr(
5432
5259
  colored(
5433
- getSPACER(
5434
- "[ ERR ] Wayback Machine (archive.org) connection error"
5435
- ),
5260
+ getSPACER("Wayback - [ ERR ] Connection error"),
5436
5261
  "red",
5437
5262
  )
5438
5263
  )
@@ -5442,7 +5267,7 @@ def processResponsesWayback():
5442
5267
  except Exception as e:
5443
5268
  writerr(
5444
5269
  colored(
5445
- getSPACER("[ ERR ] Couldn't get list of responses: " + str(e)),
5270
+ getSPACER("Wayback - [ ERR ] Couldn't get list of responses: " + str(e)),
5446
5271
  "red",
5447
5272
  )
5448
5273
  )
@@ -5457,7 +5282,7 @@ def processResponsesWayback():
5457
5282
  writerr(
5458
5283
  colored(
5459
5284
  getSPACER(
5460
- "No archived responses were found on Wayback Machine (archive.org) for the given search parameters."
5285
+ "Wayback - [ ERR ] No archived responses were found on Wayback Machine (archive.org) for the given search parameters."
5461
5286
  ),
5462
5287
  "red",
5463
5288
  )
@@ -5468,7 +5293,7 @@ def processResponsesWayback():
5468
5293
  writerr(
5469
5294
  colored(
5470
5295
  getSPACER(
5471
- "[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
5296
+ "Wayback - [ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
5472
5297
  ),
5473
5298
  "red",
5474
5299
  )
@@ -5479,7 +5304,7 @@ def processResponsesWayback():
5479
5304
  writerr(
5480
5305
  colored(
5481
5306
  getSPACER(
5482
- "[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
5307
+ "Wayback - [ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
5483
5308
  ),
5484
5309
  "red",
5485
5310
  )
@@ -5491,7 +5316,7 @@ def processResponsesWayback():
5491
5316
  writerr(
5492
5317
  colored(
5493
5318
  getSPACER(
5494
- "[ "
5319
+ "Wayback - [ "
5495
5320
  + str(resp.status_code)
5496
5321
  + " ] Error for "
5497
5322
  + url
@@ -5506,7 +5331,7 @@ def processResponsesWayback():
5506
5331
  writerr(
5507
5332
  colored(
5508
5333
  getSPACER(
5509
- "Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml"
5334
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml"
5510
5335
  ),
5511
5336
  "red",
5512
5337
  )
@@ -5515,7 +5340,7 @@ def processResponsesWayback():
5515
5340
  writerr(
5516
5341
  colored(
5517
5342
  getSPACER(
5518
- "Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed"
5343
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed"
5519
5344
  ),
5520
5345
  "red",
5521
5346
  )
@@ -5525,7 +5350,7 @@ def processResponsesWayback():
5525
5350
  writerr(
5526
5351
  colored(
5527
5352
  getSPACER(
5528
- "Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)"
5353
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)"
5529
5354
  ),
5530
5355
  "red",
5531
5356
  )
@@ -5534,7 +5359,7 @@ def processResponsesWayback():
5534
5359
  writerr(
5535
5360
  colored(
5536
5361
  getSPACER(
5537
- "Failed to get links from Wayback Machine (archive.org) - check input domain and try again."
5362
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - check input domain and try again."
5538
5363
  ),
5539
5364
  "red",
5540
5365
  )
@@ -5544,23 +5369,43 @@ def processResponsesWayback():
5544
5369
  pass
5545
5370
 
5546
5371
  # Go through the response to save the links found
5547
- for line in resp.iter_lines():
5372
+ try:
5373
+ for line in resp.iter_lines():
5374
+ try:
5375
+ results = line.decode("utf-8")
5376
+ parts = results.split(" ", 2)
5377
+ timestamp = parts[0]
5378
+ originalUrl = parts[1]
5379
+ linksFoundResponseAdd(timestamp + "/" + originalUrl)
5380
+ except Exception:
5381
+ writerr(
5382
+ colored(
5383
+ getSPACER(
5384
+ "ERROR processResponsesWayback 3: Cannot to get link from line: "
5385
+ + str(line)
5386
+ ),
5387
+ "red",
5388
+ )
5389
+ )
5390
+ finally:
5548
5391
  try:
5549
- results = line.decode("utf-8")
5550
- parts = results.split(" ", 2)
5551
- timestamp = parts[0]
5552
- originalUrl = parts[1]
5553
- linksFoundResponseAdd(timestamp + "/" + originalUrl)
5392
+ current_response = None
5554
5393
  except Exception:
5555
- writerr(
5556
- colored(
5557
- getSPACER(
5558
- "ERROR processResponsesWayback 3: Cannot to get link from line: "
5559
- + str(line)
5560
- ),
5561
- "red",
5562
- )
5563
- )
5394
+ pass
5395
+ try:
5396
+ current_session = None
5397
+ except Exception:
5398
+ pass
5399
+
5400
 + # Cleanup shared response/session references now that the response has been processed
5401
+ try:
5402
+ current_response = None
5403
+ except Exception:
5404
+ pass
5405
+ try:
5406
+ current_session = None
5407
+ except Exception:
5408
+ pass
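The try/finally and the follow-up cleanup above keep the module-level current_response / current_session references populated only while the streamed CDX response is actually being consumed; presumably this lets the SIGINT handler registered in main() close an in-flight request, though that purpose is an inference here. A minimal, self-contained sketch of the idea, with a hypothetical handler body:

    import signal

    import requests

    current_response = None            # module-level handle to the in-flight response

    def handler(signum, frame):
        # Assumption: closing the streamed response makes iter_lines() stop early
        if current_response is not None:
            try:
                current_response.close()
            except Exception:
                pass

    signal.signal(signal.SIGINT, handler)

    def stream_lines(url):
        global current_response
        resp = requests.get(url, stream=True, timeout=30)
        current_response = resp        # expose the handle while the body is streamed
        try:
            for line in resp.iter_lines():
                yield line
        finally:
            current_response = None    # clear the handle once processing finishes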
5564
5409
 
5565
5410
  # Remove any links that have URL exclusions
5566
5411
  linkRequests = []
@@ -5574,8 +5419,7 @@ def processResponsesWayback():
5574
5419
  # b) it does not match the URL exclusions
5575
5420
  if (
5576
5421
  args.regex_after is None
5577
- or re.search(args.regex_after, link, flags=re.IGNORECASE)
5578
- is not None
5422
+ or re.search(args.regex_after, link, flags=re.IGNORECASE) is not None
5579
5423
  ) and exclusionRegex.search(link) is None:
5580
5424
  linkRequests.append(link)
5581
5425
 
@@ -5594,7 +5438,7 @@ def processResponsesWayback():
5594
5438
  writerr(
5595
5439
  colored(
5596
5440
  getSPACER(
5597
- 'Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'
5441
+ 'Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'
5598
5442
  + originalUrl
5599
5443
  + "\") but they didn't match the input you gave. Check input and try again."
5600
5444
  ),
@@ -5605,7 +5449,7 @@ def processResponsesWayback():
5605
5449
  writerr(
5606
5450
  colored(
5607
5451
  getSPACER(
5608
- "Failed to get links from Wayback Machine (archive.org) - check input and try again."
5452
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - check input and try again."
5609
5453
  ),
5610
5454
  "red",
5611
5455
  )
@@ -5748,7 +5592,7 @@ def processResponsesWayback():
5748
5592
  if failureCount > 0:
5749
5593
  if verbose():
5750
5594
  write(
5751
- colored("\nWayback responses saved to ", "cyan")
5595
+ colored("Wayback - [ INFO ] Responses saved to ", "cyan")
5752
5596
  + colored(responseOutputDirectory, "white")
5753
5597
  + colored(" for " + subs + argsInput + ": ", "cyan")
5754
5598
  + colored(
@@ -5763,10 +5607,7 @@ def processResponsesWayback():
5763
5607
  else:
5764
5608
  write(
5765
5609
  colored(
5766
- "\nWayback responses saved for "
5767
- + subs
5768
- + argsInput
5769
- + ": ",
5610
+ "Wayback - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5770
5611
  "cyan",
5771
5612
  )
5772
5613
  + colored(
@@ -5781,7 +5622,7 @@ def processResponsesWayback():
5781
5622
  else:
5782
5623
  if verbose():
5783
5624
  write(
5784
- colored("\nWayback responses saved to ", "cyan")
5625
+ colored("Wayback - [ INFO ] Responses saved to ", "cyan")
5785
5626
  + colored(responseOutputDirectory, "white")
5786
5627
  + colored(" for " + subs + argsInput + ": ", "cyan")
5787
5628
  + colored(
@@ -5795,10 +5636,7 @@ def processResponsesWayback():
5795
5636
  else:
5796
5637
  write(
5797
5638
  colored(
5798
- "\nWayback responses saved for "
5799
- + subs
5800
- + argsInput
5801
- + ": ",
5639
+ "Wayback - [ INFO ] Responses saved for " + subs + argsInput + ": ",
5802
5640
  "cyan",
5803
5641
  )
5804
5642
  + colored(
@@ -5811,9 +5649,7 @@ def processResponsesWayback():
5811
5649
  )
5812
5650
  except Exception as e:
5813
5651
  if verbose():
5814
- writerr(
5815
- colored("ERROR processResponsesWayback 5: " + str(e), "red")
5816
- )
5652
+ writerr(colored("ERROR processResponsesWayback 5: " + str(e), "red"))
5817
5653
 
5818
5654
  totalFileCount = totalFileCount + fileCount
5819
5655
  except Exception as e:
@@ -5911,8 +5747,7 @@ def notifyDiscord():
5911
5747
  writerr(
5912
5748
  colored(
5913
5749
  getSPACER(
5914
- "WARNING: Failed to send notification to Discord - "
5915
- + result.json()
5750
+ "WARNING: Failed to send notification to Discord - " + result.json()
5916
5751
  ),
5917
5752
  "yellow",
5918
5753
  )
@@ -5920,9 +5755,7 @@ def notifyDiscord():
5920
5755
  except Exception as e:
5921
5756
  writerr(
5922
5757
  colored(
5923
- getSPACER(
5924
- "WARNING: Failed to send notification to Discord - " + str(e)
5925
- ),
5758
+ getSPACER("WARNING: Failed to send notification to Discord - " + str(e)),
5926
5759
  "yellow",
5927
5760
  )
5928
5761
  )
@@ -6037,9 +5870,7 @@ def combineInlineJS():
6037
5870
 
6038
5871
  totalSections = len(uniqueScripts)
6039
5872
  sectionCounter = 0 # Counter for inline JS sections
6040
- currentOutputFile = os.path.join(
6041
- responseOutputDirectory, outputFileTemplate.format(1)
6042
- )
5873
+ currentOutputFile = os.path.join(responseOutputDirectory, outputFileTemplate.format(1))
6043
5874
  currentSectionsWritten = 0 # Counter for sections written in current file
6044
5875
 
6045
5876
  if totalSections > 0:
@@ -6075,9 +5906,7 @@ def combineInlineJS():
6075
5906
  currentSectionsWritten = 1
6076
5907
 
6077
5908
  # Insert comment line for the beginning of the section
6078
- inlineJSFile.write(
6079
- f"//****** INLINE JS SECTION {sectionCounter} ******//\n\n"
6080
- )
5909
+ inlineJSFile.write(f"//****** INLINE JS SECTION {sectionCounter} ******//\n\n")
6081
5910
 
6082
5911
  # Write comments indicating the files the script was found in
6083
5912
  files = ""
@@ -6111,10 +5940,7 @@ def combineInlineJS():
6111
5940
  write(
6112
5941
  colored("Created files ", "cyan")
6113
5942
  + colored(
6114
- responseOutputDirectory
6115
- + "combinedInline{1-"
6116
- + str(fileNumber)
6117
- + "}.js",
5943
+ responseOutputDirectory + "combinedInline{1-" + str(fileNumber) + "}.js",
6118
5944
  "white",
6119
5945
  )
6120
5946
  + colored(" (contents of inline JS)\n", "cyan")
@@ -6124,9 +5950,91 @@ def combineInlineJS():
6124
5950
  writerr(colored("ERROR combineInlineJS 1: " + str(e), "red"))
6125
5951
 
6126
5952
 
5953
+ # Async wrapper functions for concurrent source fetching
5954
+ async def fetch_wayback_async():
5955
+ """Async wrapper for getWaybackUrls - runs in thread pool"""
5956
+ loop = asyncio.get_event_loop()
5957
+ await loop.run_in_executor(None, getWaybackUrls)
5958
+
5959
+
5960
+ async def fetch_commoncrawl_async():
5961
+ """Async wrapper for getCommonCrawlUrls - runs in thread pool"""
5962
+ loop = asyncio.get_event_loop()
5963
+ await loop.run_in_executor(None, getCommonCrawlUrls)
5964
+
5965
+
5966
+ async def fetch_alienvault_async():
5967
+ """Async wrapper for getAlienVaultUrls - runs in thread pool"""
5968
+ loop = asyncio.get_event_loop()
5969
+ await loop.run_in_executor(None, getAlienVaultUrls)
5970
+
5971
+
5972
+ async def fetch_urlscan_async():
5973
+ """Async wrapper for getURLScanUrls - runs in thread pool"""
5974
+ loop = asyncio.get_event_loop()
5975
+ await loop.run_in_executor(None, getURLScanUrls)
5976
+
5977
+
5978
+ async def fetch_virustotal_async():
5979
+ """Async wrapper for getVirusTotalUrls - runs in thread pool"""
5980
+ loop = asyncio.get_event_loop()
5981
+ await loop.run_in_executor(None, getVirusTotalUrls)
5982
+
5983
+
5984
+ async def fetch_intelx_async():
5985
+ """Async wrapper for getIntelxUrls - runs in thread pool"""
5986
+ loop = asyncio.get_event_loop()
5987
+ await loop.run_in_executor(None, getIntelxUrls)
5988
+
5989
+
5990
+ async def fetch_all_sources_async():
5991
+ """
5992
+ Orchestrator function to fetch from all enabled sources concurrently.
5993
 + Each source runs in the default thread pool executor while orchestration happens asynchronously.
5994
+ """
5995
+ global args, stopProgram, VIRUSTOTAL_API_KEY, INTELX_API_KEY, argsInput
5996
+
5997
+ tasks = []
5998
+
5999
+ # Build list of tasks for enabled sources
6000
+ if not args.xwm and stopProgram is None:
6001
+ tasks.append(("Wayback Machine", fetch_wayback_async()))
6002
+ if not args.xcc and stopProgram is None:
6003
+ tasks.append(("Common Crawl", fetch_commoncrawl_async()))
6004
+ if not args.xav and stopProgram is None and not argsInput.startswith("."):
6005
+ tasks.append(("AlienVault OTX", fetch_alienvault_async()))
6006
+ if not args.xus and stopProgram is None:
6007
+ tasks.append(("URLScan", fetch_urlscan_async()))
6008
+ if not args.xvt and VIRUSTOTAL_API_KEY != "" and stopProgram is None:
6009
+ tasks.append(("VirusTotal", fetch_virustotal_async()))
6010
+ if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
6011
+ tasks.append(("Intelligence X", fetch_intelx_async()))
6012
+
6013
+ if not tasks:
6014
+ return
6015
+
6016
+ # Extract just the coroutines for gather
6017
+ task_coros = [task[1] for task in tasks]
6018
+
6019
+ # Fetch all concurrently, capturing exceptions so one failure doesn't stop others
6020
+ results = await asyncio.gather(*task_coros, return_exceptions=True)
6021
+
6022
+ # Check for any exceptions that occurred
6023
+ for i, result in enumerate(results):
6024
+ if isinstance(result, Exception):
6025
+ source_name = tasks[i][0]
6026
+ if verbose():
6027
+ writerr(
6028
+ colored(
6029
+ getSPACER(f"ERROR in {source_name} during concurrent fetch: {str(result)}"),
6030
+ "red",
6031
+ )
6032
+ )
6033
+
6034
+
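The orchestration above hands each blocking, requests-based fetcher to the event loop's default thread pool and awaits them together, so one failing provider does not cancel the rest. A self-contained sketch of the same pattern with stand-in fetchers (none of these names are waymore functions):

    import asyncio
    import time

    def fetch_slow():
        # Stand-in for a blocking provider query
        time.sleep(1)
        return "slow provider done"

    def fetch_broken():
        # Stand-in for a provider that errors out
        raise RuntimeError("provider unavailable")

    async def run_all():
        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(None, fetch_slow),    # None = default ThreadPoolExecutor
            loop.run_in_executor(None, fetch_broken),
        ]
        # return_exceptions=True: failures come back as Exception objects instead of raising
        results = await asyncio.gather(*futures, return_exceptions=True)
        for result in results:
            if isinstance(result, Exception):
                print("error:", result)
            else:
                print("ok:", result)

    asyncio.run(run_all())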
6127
6035
  # Run waymore
6128
6036
  def main():
6129
- global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY
6037
+ global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
6130
6038
 
6131
6039
  # Tell Python to run the handler() function when SIGINT is received
6132
6040
  signal(SIGINT, handler)
@@ -6295,13 +6203,7 @@ def main():
6295
6203
  action="store",
6296
6204
  type=int,
6297
6205
  help="Limit the number of Common Crawl index collections searched, e.g. '-lcc 10' will just search the latest 10 collections (default: 1). As of November 2024 there are currently 106 collections. Setting to 0 (default) will search ALL collections. If you don't want to search Common Crawl at all, use the -xcc option.",
6298
- )
6299
- parser.add_argument(
6300
- "-lcy",
6301
- action="store",
6302
- type=int,
6303
- help="Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections or any year (but in conjuction with -lcc). For example, if you are only interested in data from 2015 and after, pass -lcy 2015. If you don't want to search Common Crawl at all, use the -xcc option.",
6304
- default=0,
6206
+ default=1,
6305
6207
  )
6306
6208
  parser.add_argument(
6307
6209
  "-t",
@@ -6316,10 +6218,10 @@ def main():
6316
6218
  parser.add_argument(
6317
6219
  "-p",
6318
6220
  "--processes",
6319
- help="Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1)",
6221
+ help="Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 2)",
6320
6222
  action="store",
6321
6223
  type=validateArgProcesses,
6322
- default=1,
6224
+ default=2,
6323
6225
  metavar="<integer>",
6324
6226
  )
6325
6227
  parser.add_argument(
@@ -6420,13 +6322,6 @@ def main():
6420
6322
  showVersion()
6421
6323
  sys.exit()
6422
6324
 
6423
- # If -lcc wasn't passed then set to the default of 1 if -lcy is 0. This will make them work together
6424
- if args.lcc is None:
6425
- if args.lcy == 0:
6426
- args.lcc = 1
6427
- else:
6428
- args.lcc = 0
6429
-
6430
6325
  # If --providers was passed, then manually set the exclude arguments;
6431
6326
  if args.providers:
6432
6327
  if "wayback" not in args.providers:
@@ -6531,6 +6426,12 @@ def main():
6531
6426
  indexFile = None
6532
6427
  path = ""
6533
6428
  stopSource = False
6429
+ stopSourceWayback = False
6430
+ stopSourceCommonCrawl = False
6431
+ stopSourceAlienVault = False
6432
+ stopSourceURLScan = False
6433
+ stopSourceVirusTotal = False
6434
+ stopSourceIntelx = False
6534
6435
 
6535
6436
  # Get the config settings from the config.yml file
6536
6437
  getConfig()
@@ -6548,29 +6449,17 @@ def main():
6548
6449
  # If the mode is U (URLs retrieved) or B (URLs retrieved AND Responses downloaded)
6549
6450
  if args.mode in ["U", "B"]:
6550
6451
 
6551
- # If not requested to exclude, get URLs from the Wayback Machine (archive.org)
6552
- if not args.xwm and stopProgram is None:
6553
- getWaybackUrls()
6554
-
6555
- # If not requested to exclude, get URLs from commoncrawl.org
6556
- if not args.xcc and stopProgram is None:
6557
- getCommonCrawlUrls()
6558
-
6559
- # If not requested to exclude and a TLD wasn't passed, get URLs from alienvault.com
6560
- if not args.xav and stopProgram is None and not inpt.startswith("."):
6561
- getAlienVaultUrls()
6562
-
6563
- # If not requested to exclude, get URLs from urlscan.io
6564
- if not args.xus and stopProgram is None:
6565
- getURLScanUrls()
6566
-
6567
- # If not requested to exclude, get URLs from virustotal.com if we have an API key
6568
- if not args.xvt and VIRUSTOTAL_API_KEY != "" and stopProgram is None:
6569
- getVirusTotalUrls()
6570
-
6571
- # If not requested to exclude, get URLs from intelx.io if we have an API key
6572
- if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
6573
- getIntelxUrls()
6452
+ # Fetch from all sources concurrently using async/await
6453
+ try:
6454
+ asyncio.run(fetch_all_sources_async())
6455
+ except Exception as e:
6456
+ if verbose():
6457
+ writerr(
6458
+ colored(
6459
+ getSPACER(f"ERROR during concurrent source fetching: {str(e)}"),
6460
+ "red",
6461
+ )
6462
+ )
6574
6463
 
6575
6464
  # Output results of all searches
6576
6465
  processURLOutput()