waymore-6.5-py3-none-any.whl → waymore-7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -1
- waymore/waymore.py +1082 -1193
- {waymore-6.5.dist-info → waymore-7.0.dist-info}/METADATA +10 -8
- waymore-7.0.dist-info/RECORD +8 -0
- {waymore-6.5.dist-info → waymore-7.0.dist-info}/WHEEL +1 -1
- waymore-6.5.dist-info/RECORD +0 -8
- {waymore-6.5.dist-info → waymore-7.0.dist-info}/entry_points.txt +0 -0
- {waymore-6.5.dist-info → waymore-7.0.dist-info/licenses}/LICENSE +0 -0
- {waymore-6.5.dist-info → waymore-7.0.dist-info}/top_level.txt +0 -0
waymore/waymore.py
CHANGED
@@ -4,28 +4,30 @@
 # Full help here: https://github.com/xnl-h4ck3r/waymore/blob/main/README.md
 # Good luck and good hunting! If you really love the tool (or any others), or they helped you find an awesome bounty, consider BUYING ME A COFFEE! (https://ko-fi.com/xnlh4ck3r) ☕ (I could use the caffeine!)

-from urllib.parse import urlparse
-import requests
-from requests.exceptions import ConnectionError
-from requests.utils import quote
-from requests.adapters import HTTPAdapter, Retry
 import argparse
-
+import asyncio
+import enum
+import json
+import math
 import multiprocessing.dummy as mp
-from termcolor import colored
-from datetime import datetime, timedelta
-from pathlib import Path
-import yaml
 import os
-import
-import re
+import pickle
 import random
+import re
 import sys
-import
-import
-import
-import
+import threading
+from datetime import datetime, timedelta
+from pathlib import Path
+from signal import SIGINT, signal
+from urllib.parse import urlparse
+
+import requests
 import tldextract
+import yaml
+from requests.adapters import HTTPAdapter, Retry
+from requests.exceptions import ConnectionError
+from requests.utils import quote
+from termcolor import colored

 try:
     from . import __version__
@@ -59,6 +61,12 @@ argsInput = ""
|
|
|
59
61
|
isInputFile = False
|
|
60
62
|
stopProgramCount = 0
|
|
61
63
|
stopSource = False
|
|
64
|
+
stopSourceWayback = False
|
|
65
|
+
stopSourceCommonCrawl = False
|
|
66
|
+
stopSourceAlienVault = False
|
|
67
|
+
stopSourceURLScan = False
|
|
68
|
+
stopSourceVirusTotal = False
|
|
69
|
+
stopSourceIntelx = False
|
|
62
70
|
successCount = 0
|
|
63
71
|
failureCount = 0
|
|
64
72
|
fileCount = 0
|
|
@@ -79,6 +87,10 @@ currentMemUsage = 0
|
|
|
79
87
|
maxMemoryPercent = 0
|
|
80
88
|
currentMemPercent = 0
|
|
81
89
|
process = None
|
|
90
|
+
current_response = None
|
|
91
|
+
current_session = None
|
|
92
|
+
# Event used to interrupt long sleeps (e.g., rate-limit waits) when SIGINT is received
|
|
93
|
+
interrupt_event = threading.Event()
|
|
82
94
|
HTTP_ADAPTER = None
|
|
83
95
|
HTTP_ADAPTER_CC = None
|
|
84
96
|
checkWayback = 0
|
|
@@ -90,20 +102,28 @@ checkIntelx = 0
|
|
|
90
102
|
argsInputHostname = ""
|
|
91
103
|
responseOutputDirectory = ""
|
|
92
104
|
urlscanRequestLinks = set()
|
|
105
|
+
intelxAPIIssue = False
|
|
106
|
+
linkCountWayback = 0
|
|
107
|
+
linkCountCommonCrawl = 0
|
|
108
|
+
linkCountAlienVault = 0
|
|
109
|
+
linkCountURLScan = 0
|
|
110
|
+
linkCountVirusTotal = 0
|
|
111
|
+
linkCountIntelx = 0
|
|
112
|
+
|
|
113
|
+
# Thread lock for protecting shared state during concurrent operations
|
|
114
|
+
links_lock = threading.Lock()
|
|
115
|
+
|
|
116
|
+
# Shared state for link collection across all sources
|
|
117
|
+
linksFound = set()
|
|
118
|
+
linkMimes = set()
|
|
93
119
|
|
|
94
120
|
# Source Provider URLs
|
|
95
121
|
WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
|
|
96
122
|
CCRAWL_INDEX_URL = "https://index.commoncrawl.org/collinfo.json"
|
|
97
|
-
ALIENVAULT_URL =
|
|
98
|
-
|
|
99
|
-
)
|
|
100
|
-
URLSCAN_URL = (
|
|
101
|
-
"https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}{DATERANGE}&size=10000"
|
|
102
|
-
)
|
|
123
|
+
ALIENVAULT_URL = "https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500"
|
|
124
|
+
URLSCAN_URL = "https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}{DATERANGE}&size=10000"
|
|
103
125
|
URLSCAN_DOM_URL = "https://urlscan.io/dom/"
|
|
104
|
-
VIRUSTOTAL_URL =
|
|
105
|
-
"https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
|
|
106
|
-
)
|
|
126
|
+
VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
|
|
107
127
|
INTELX_SEARCH_URL = "https://2.intelx.io/phonebook/search"
|
|
108
128
|
INTELX_RESULTS_URL = "https://2.intelx.io/phonebook/search/result?id="
|
|
109
129
|
INTELX_ACCOUNT_URL = "https://2.intelx.io/authenticate/info"
|
|
@@ -237,8 +257,7 @@ def write(text="", pipe=False):
|
|
|
237
257
|
# or if the tool has been piped and the pipe parameter is True
|
|
238
258
|
# AND if --stream is NOT active OR if it is active but we are explicitly piping (e.g. for URLs)
|
|
239
259
|
if (sys.stdout.isatty() or (not sys.stdout.isatty() and pipe)) and (
|
|
240
|
-
not (args.stream and args.mode == "U")
|
|
241
|
-
or (args.stream and args.mode == "U" and pipe)
|
|
260
|
+
not (args.stream and args.mode == "U") or (args.stream and args.mode == "U" and pipe)
|
|
242
261
|
):
|
|
243
262
|
# If it has carriage return in the string, don't add a newline
|
|
244
263
|
if text.find("\r") > 0:
|
|
@@ -274,26 +293,14 @@ def showVersion():
|
|
|
274
293
|
timeout=3,
|
|
275
294
|
)
|
|
276
295
|
except Exception:
|
|
277
|
-
write(
|
|
278
|
-
"Current waymore version "
|
|
279
|
-
+ __version__
|
|
280
|
-
+ " (unable to check if latest)\n"
|
|
281
|
-
)
|
|
296
|
+
write("Current waymore version " + __version__ + " (unable to check if latest)\n")
|
|
282
297
|
if __version__ == resp.text.split("=")[1].replace('"', "").strip():
|
|
283
298
|
write(
|
|
284
|
-
"Current waymore version "
|
|
285
|
-
+ __version__
|
|
286
|
-
+ " ("
|
|
287
|
-
+ colored("latest", "green")
|
|
288
|
-
+ ")\n"
|
|
299
|
+
"Current waymore version " + __version__ + " (" + colored("latest", "green") + ")\n"
|
|
289
300
|
)
|
|
290
301
|
else:
|
|
291
302
|
write(
|
|
292
|
-
"Current waymore version "
|
|
293
|
-
+ __version__
|
|
294
|
-
+ " ("
|
|
295
|
-
+ colored("outdated", "red")
|
|
296
|
-
+ ")\n"
|
|
303
|
+
"Current waymore version " + __version__ + " (" + colored("outdated", "red") + ")\n"
|
|
297
304
|
)
|
|
298
305
|
except Exception:
|
|
299
306
|
pass
|
|
@@ -307,9 +314,7 @@ def showBanner():
|
|
|
307
314
|
write(colored("| | | / ___ | |_| ", "red") + "| | | | |_| | | | |_| |")
|
|
308
315
|
write(colored(r" \___/\_____|\__ ", "red") + r"|_|_|_|\___/| | | ____/")
|
|
309
316
|
write(
|
|
310
|
-
colored(" (____/ ", "red")
|
|
311
|
-
+ colored(" by Xnl-h4ck3r ", "magenta")
|
|
312
|
-
+ r" \_____)"
|
|
317
|
+
colored(" (____/ ", "red") + colored(" by Xnl-h4ck3r ", "magenta") + r" \_____)"
|
|
313
318
|
)
|
|
314
319
|
try:
|
|
315
320
|
currentDate = datetime.now().date()
|
|
@@ -322,11 +327,7 @@ def showBanner():
|
|
|
322
327
|
)
|
|
323
328
|
)
|
|
324
329
|
elif currentDate.month == 10 and currentDate.day == 31:
|
|
325
|
-
write(
|
|
326
|
-
colored(
|
|
327
|
-
" *** 🎃 HAPPY HALLOWEEN! 🎃 ***", "red", attrs=["blink"]
|
|
328
|
-
)
|
|
329
|
-
)
|
|
330
|
+
write(colored(" *** 🎃 HAPPY HALLOWEEN! 🎃 ***", "red", attrs=["blink"]))
|
|
330
331
|
elif currentDate.month == 1 and currentDate.day in (1, 2, 3, 4, 5):
|
|
331
332
|
write(
|
|
332
333
|
colored(
|
|
@@ -353,16 +354,14 @@ def handler(signal_received, frame):
|
|
|
353
354
|
This function is called if Ctrl-C is called by the user
|
|
354
355
|
An attempt will be made to try and clean up properly
|
|
355
356
|
"""
|
|
356
|
-
global stopSource, stopProgram, stopProgramCount
|
|
357
|
+
global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
|
|
357
358
|
|
|
358
359
|
if stopProgram is not None:
|
|
359
360
|
stopProgramCount = stopProgramCount + 1
|
|
360
361
|
if stopProgramCount == 1:
|
|
361
362
|
writerr(
|
|
362
363
|
colored(
|
|
363
|
-
getSPACER(
|
|
364
|
-
">>> Please be patient... Trying to save data and end gracefully!"
|
|
365
|
-
),
|
|
364
|
+
getSPACER(">>> Please be patient... Trying to save data and end gracefully!"),
|
|
366
365
|
"red",
|
|
367
366
|
)
|
|
368
367
|
)
|
|
@@ -384,17 +383,41 @@ def handler(signal_received, frame):
|
|
|
384
383
|
else:
|
|
385
384
|
stopProgram = StopProgram.SIGINT
|
|
386
385
|
stopSource = True
|
|
386
|
+
stopSourceWayback = True
|
|
387
|
+
stopSourceCommonCrawl = True
|
|
388
|
+
stopSourceAlienVault = True
|
|
389
|
+
stopSourceURLScan = True
|
|
390
|
+
stopSourceVirusTotal = True
|
|
391
|
+
stopSourceIntelx = True
|
|
392
|
+
# Try to close any active response or session to interrupt blocking network I/O
|
|
393
|
+
try:
|
|
394
|
+
if current_response is not None:
|
|
395
|
+
try:
|
|
396
|
+
current_response.close()
|
|
397
|
+
except Exception:
|
|
398
|
+
pass
|
|
399
|
+
except Exception:
|
|
400
|
+
pass
|
|
401
|
+
try:
|
|
402
|
+
if current_session is not None:
|
|
403
|
+
try:
|
|
404
|
+
current_session.close()
|
|
405
|
+
except Exception:
|
|
406
|
+
pass
|
|
407
|
+
except Exception:
|
|
408
|
+
pass
|
|
409
|
+
# Signal any waits to stop early
|
|
410
|
+
try:
|
|
411
|
+
interrupt_event.set()
|
|
412
|
+
except Exception:
|
|
413
|
+
pass
|
|
387
414
|
writerr(
|
|
388
415
|
colored(
|
|
389
416
|
getSPACER('>>> "Oh my God, they killed Kenny... and waymore!" - Kyle'),
|
|
390
417
|
"red",
|
|
391
418
|
)
|
|
392
419
|
)
|
|
393
|
-
writerr(
|
|
394
|
-
colored(
|
|
395
|
-
getSPACER(">>> Attempting to rescue any data gathered so far..."), "red"
|
|
396
|
-
)
|
|
397
|
-
)
|
|
420
|
+
writerr(colored(getSPACER(">>> Attempting to rescue any data gathered so far..."), "red"))
|
|
398
421
|
|
|
399
422
|
|
|
400
423
|
def showOptions():
|
|
@@ -479,13 +502,13 @@ def showOptions():
|
|
|
479
502
|
)
|
|
480
503
|
|
|
481
504
|
if not args.xcc:
|
|
482
|
-
if args.lcc == 0 and args.
|
|
505
|
+
if args.lcc == 0 and args.from_date is None and args.to_date is None:
|
|
483
506
|
write(
|
|
484
507
|
colored("-lcc: " + str(args.lcc), "magenta")
|
|
485
508
|
+ colored(" Search ALL Common Crawl index collections.", "white")
|
|
486
509
|
)
|
|
487
510
|
else:
|
|
488
|
-
if args.
|
|
511
|
+
if args.from_date is None and args.to_date is None:
|
|
489
512
|
write(
|
|
490
513
|
colored("-lcc: " + str(args.lcc), "magenta")
|
|
491
514
|
+ colored(
|
|
@@ -498,19 +521,10 @@ def showOptions():
|
|
|
498
521
|
write(
|
|
499
522
|
colored("-lcc: " + str(args.lcc), "magenta")
|
|
500
523
|
+ colored(
|
|
501
|
-
" The number of latest Common Crawl index collections to be searched.",
|
|
524
|
+
" The number of latest Common Crawl index collections to be searched within the specified date range (-to and -from).",
|
|
502
525
|
"white",
|
|
503
526
|
)
|
|
504
527
|
)
|
|
505
|
-
write(
|
|
506
|
-
colored("-lcy: " + str(args.lcy), "magenta")
|
|
507
|
-
+ colored(
|
|
508
|
-
" Search all Common Crawl index collections with data from year "
|
|
509
|
-
+ str(args.lcy)
|
|
510
|
-
+ " and after.",
|
|
511
|
-
"white",
|
|
512
|
-
)
|
|
513
|
-
)
|
|
514
528
|
|
|
515
529
|
if URLSCAN_API_KEY == "":
|
|
516
530
|
write(
|
|
@@ -532,9 +546,7 @@ def showOptions():
|
|
|
532
546
|
)
|
|
533
547
|
)
|
|
534
548
|
else:
|
|
535
|
-
write(
|
|
536
|
-
colored("VirusTotal API Key: ", "magenta") + colored(VIRUSTOTAL_API_KEY)
|
|
537
|
-
)
|
|
549
|
+
write(colored("VirusTotal API Key: ", "magenta") + colored(VIRUSTOTAL_API_KEY))
|
|
538
550
|
|
|
539
551
|
if INTELX_API_KEY == "":
|
|
540
552
|
write(
|
|
@@ -545,9 +557,7 @@ def showOptions():
|
|
|
545
557
|
)
|
|
546
558
|
)
|
|
547
559
|
else:
|
|
548
|
-
write(
|
|
549
|
-
colored("Intelligence X API Key: ", "magenta") + colored(INTELX_API_KEY)
|
|
550
|
-
)
|
|
560
|
+
write(colored("Intelligence X API Key: ", "magenta") + colored(INTELX_API_KEY))
|
|
551
561
|
|
|
552
562
|
if args.mode in ["U", "B"]:
|
|
553
563
|
if args.output_urls != "":
|
|
@@ -589,9 +599,7 @@ def showOptions():
|
|
|
589
599
|
write(
|
|
590
600
|
colored("-l: " + str(args.limit), "magenta")
|
|
591
601
|
+ colored(
|
|
592
|
-
" Only save the FIRST "
|
|
593
|
-
+ str(args.limit)
|
|
594
|
-
+ " responses found.",
|
|
602
|
+
" Only save the FIRST " + str(args.limit) + " responses found.",
|
|
595
603
|
"white",
|
|
596
604
|
)
|
|
597
605
|
)
|
|
@@ -599,24 +607,11 @@ def showOptions():
|
|
|
599
607
|
write(
|
|
600
608
|
colored("-l: " + str(args.limit), "magenta")
|
|
601
609
|
+ colored(
|
|
602
|
-
" Only save the LAST "
|
|
603
|
-
+ str(abs(args.limit))
|
|
604
|
-
+ " responses found.",
|
|
610
|
+
" Only save the LAST " + str(abs(args.limit)) + " responses found.",
|
|
605
611
|
"white",
|
|
606
612
|
)
|
|
607
613
|
)
|
|
608
614
|
|
|
609
|
-
if args.from_date is not None:
|
|
610
|
-
write(
|
|
611
|
-
colored("-from: " + str(args.from_date), "magenta")
|
|
612
|
-
+ colored(" The date/time to get responses from.", "white")
|
|
613
|
-
)
|
|
614
|
-
if args.to_date is not None:
|
|
615
|
-
write(
|
|
616
|
-
colored("-to: " + str(args.to_date), "magenta")
|
|
617
|
-
+ colored(" The date/time to get responses up to.", "white")
|
|
618
|
-
)
|
|
619
|
-
|
|
620
615
|
if args.capture_interval == "h":
|
|
621
616
|
write(
|
|
622
617
|
colored("-ci: " + args.capture_interval, "magenta")
|
|
@@ -667,6 +662,32 @@ def showOptions():
|
|
|
667
662
|
)
|
|
668
663
|
)
|
|
669
664
|
|
|
665
|
+
if args.from_date is not None:
|
|
666
|
+
write(
|
|
667
|
+
colored("-from: " + str(args.from_date), "magenta")
|
|
668
|
+
+ colored(
|
|
669
|
+
" The date/time to get data from.",
|
|
670
|
+
"white",
|
|
671
|
+
)
|
|
672
|
+
+ colored(
|
|
673
|
+
" NOTE: All results will still be returned from Intelligence X, and all sub domains from Virus Total, because these cannot be filtered by date.",
|
|
674
|
+
"yellow",
|
|
675
|
+
)
|
|
676
|
+
)
|
|
677
|
+
|
|
678
|
+
if args.to_date is not None:
|
|
679
|
+
write(
|
|
680
|
+
colored("-to: " + str(args.to_date), "magenta")
|
|
681
|
+
+ colored(
|
|
682
|
+
" The date/time to get data up to.",
|
|
683
|
+
"white",
|
|
684
|
+
)
|
|
685
|
+
+ colored(
|
|
686
|
+
" NOTE: All results will still be returned from Intelligence X, and all sub domains from Virus Total, because these cannot be filtered by date.",
|
|
687
|
+
"yellow",
|
|
688
|
+
)
|
|
689
|
+
)
|
|
690
|
+
|
|
670
691
|
write(
|
|
671
692
|
colored("-f: " + str(args.filter_responses_only), "magenta")
|
|
672
693
|
+ colored(
|
|
@@ -705,9 +726,7 @@ def showOptions():
|
|
|
705
726
|
)
|
|
706
727
|
)
|
|
707
728
|
if not args.mc and args.fc:
|
|
708
|
-
write(
|
|
709
|
-
colored("Response Code exclusions: ", "magenta") + colored(FILTER_CODE)
|
|
710
|
-
)
|
|
729
|
+
write(colored("Response Code exclusions: ", "magenta") + colored(FILTER_CODE))
|
|
711
730
|
write(colored("Response URL exclusions: ", "magenta") + colored(FILTER_URL))
|
|
712
731
|
|
|
713
732
|
if args.mt:
|
|
@@ -771,14 +790,9 @@ def showOptions():
|
|
|
771
790
|
)
|
|
772
791
|
)
|
|
773
792
|
else:
|
|
774
|
-
write(
|
|
775
|
-
colored("Discord Webhook: ", "magenta") + colored(WEBHOOK_DISCORD)
|
|
776
|
-
)
|
|
793
|
+
write(colored("Discord Webhook: ", "magenta") + colored(WEBHOOK_DISCORD))
|
|
777
794
|
|
|
778
|
-
write(
|
|
779
|
-
colored("Default Output Directory: ", "magenta")
|
|
780
|
-
+ colored(str(DEFAULT_OUTPUT_DIR))
|
|
781
|
-
)
|
|
795
|
+
write(colored("Default Output Directory: ", "magenta") + colored(str(DEFAULT_OUTPUT_DIR)))
|
|
782
796
|
|
|
783
797
|
if args.regex_after is not None:
|
|
784
798
|
write(
|
|
@@ -799,7 +813,7 @@ def showOptions():
|
|
|
799
813
|
if args.mode in ["R", "B"] or (args.mode == "U" and not args.xcc):
|
|
800
814
|
write(
|
|
801
815
|
colored("-p: " + str(args.processes), "magenta")
|
|
802
|
-
+ colored(" The number of parallel requests made.", "white")
|
|
816
|
+
+ colored(" The number of parallel requests made per source.", "white")
|
|
803
817
|
)
|
|
804
818
|
write(
|
|
805
819
|
colored("-r: " + str(args.retries), "magenta")
|
|
@@ -1084,10 +1098,7 @@ def getConfig():
|
|
|
1084
1098
|
if args.notify_discord:
|
|
1085
1099
|
try:
|
|
1086
1100
|
WEBHOOK_DISCORD = config.get("WEBHOOK_DISCORD")
|
|
1087
|
-
if (
|
|
1088
|
-
str(WEBHOOK_DISCORD) == "None"
|
|
1089
|
-
or str(WEBHOOK_DISCORD) == "YOUR_WEBHOOK"
|
|
1090
|
-
):
|
|
1101
|
+
if str(WEBHOOK_DISCORD) == "None" or str(WEBHOOK_DISCORD) == "YOUR_WEBHOOK":
|
|
1091
1102
|
writerr(
|
|
1092
1103
|
colored(
|
|
1093
1104
|
'No value for "WEBHOOK_DISCORD" in config.yml - default set',
|
|
@@ -1164,9 +1175,7 @@ def getConfig():
|
|
|
1164
1175
|
else:
|
|
1165
1176
|
writerr(
|
|
1166
1177
|
colored(
|
|
1167
|
-
'WARNING: Cannot find file "'
|
|
1168
|
-
+ args.config
|
|
1169
|
-
+ '", so using default values',
|
|
1178
|
+
'WARNING: Cannot find file "' + args.config + '", so using default values',
|
|
1170
1179
|
"yellow",
|
|
1171
1180
|
)
|
|
1172
1181
|
)
|
|
@@ -1238,9 +1247,7 @@ def printProgressBar(
|
|
|
1238
1247
|
if not (args.stream and args.mode == "U"):
|
|
1239
1248
|
try:
|
|
1240
1249
|
percent = (
|
|
1241
|
-
("{0:." + str(decimals) + "f}")
|
|
1242
|
-
.format(100 * (iteration / float(total)))
|
|
1243
|
-
.rjust(5)
|
|
1250
|
+
("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total))).rjust(5)
|
|
1244
1251
|
)
|
|
1245
1252
|
filledLength = int(length * iteration // total)
|
|
1246
1253
|
bar = fill * filledLength + "-" * (length - filledLength)
|
|
@@ -1297,7 +1304,7 @@ def fixArchiveOrgUrl(url):
|
|
|
1297
1304
|
|
|
1298
1305
|
# Add a link to the linksFound collection for archived responses (included timestamp preifx)
|
|
1299
1306
|
def linksFoundResponseAdd(link):
|
|
1300
|
-
global linksFound, argsInput, argsInputHostname
|
|
1307
|
+
global linksFound, argsInput, argsInputHostname, links_lock
|
|
1301
1308
|
|
|
1302
1309
|
try:
|
|
1303
1310
|
if inputIsDomainANDPath:
|
|
@@ -1318,20 +1325,22 @@ def linksFoundResponseAdd(link):
|
|
|
1318
1325
|
|
|
1319
1326
|
# Don't write it if the link does not contain the requested domain (this can sometimes happen)
|
|
1320
1327
|
if parsed_url.lower().find(checkInput.lower()) >= 0:
|
|
1321
|
-
|
|
1328
|
+
with links_lock:
|
|
1329
|
+
linksFound.add(link)
|
|
1322
1330
|
# If streaming is enabled and mode is 'U', print the link to stdout
|
|
1323
1331
|
if args.stream and args.mode == "U":
|
|
1324
1332
|
write(link, pipe=True)
|
|
1325
1333
|
except Exception:
|
|
1326
|
-
|
|
1334
|
+
with links_lock:
|
|
1335
|
+
linksFound.add(link)
|
|
1327
1336
|
# If streaming is enabled and mode is 'U', print the link to stdout
|
|
1328
1337
|
if args.stream and args.mode == "U":
|
|
1329
1338
|
write(link, pipe=True)
|
|
1330
1339
|
|
|
1331
1340
|
|
|
1332
1341
|
# Add a link to the linksFound collection
|
|
1333
|
-
def linksFoundAdd(link):
|
|
1334
|
-
global linksFound, argsInput, argsInputHostname
|
|
1342
|
+
def linksFoundAdd(link, source_set=None):
|
|
1343
|
+
global linksFound, argsInput, argsInputHostname, links_lock
|
|
1335
1344
|
|
|
1336
1345
|
try:
|
|
1337
1346
|
if inputIsDomainANDPath:
|
|
@@ -1349,12 +1358,20 @@ def linksFoundAdd(link):
|
|
|
1349
1358
|
|
|
1350
1359
|
# Don't write it if the link does not contain the requested domain (this can sometimes happen)
|
|
1351
1360
|
if parsed_url.find(checkInput) >= 0:
|
|
1352
|
-
|
|
1361
|
+
with links_lock:
|
|
1362
|
+
if source_set is not None:
|
|
1363
|
+
source_set.add(link)
|
|
1364
|
+
else:
|
|
1365
|
+
linksFound.add(link)
|
|
1353
1366
|
# If streaming is enabled and mode is 'U', print the link to stdout
|
|
1354
1367
|
if args.stream and args.mode == "U":
|
|
1355
1368
|
write(link, pipe=True)
|
|
1356
1369
|
except Exception:
|
|
1357
|
-
|
|
1370
|
+
with links_lock:
|
|
1371
|
+
if source_set is not None:
|
|
1372
|
+
source_set.add(link)
|
|
1373
|
+
else:
|
|
1374
|
+
linksFound.add(link)
|
|
1358
1375
|
# If streaming is enabled and mode is 'U', print the link to stdout
|
|
1359
1376
|
if args.stream and args.mode == "U":
|
|
1360
1377
|
write(link, pipe=True)
|
|
@@ -1394,9 +1411,7 @@ def processArchiveUrl(url):
|
|
|
1394
1411
|
)
|
|
1395
1412
|
archiveHtml = str(resp.text)
|
|
1396
1413
|
try:
|
|
1397
|
-
contentType = (
|
|
1398
|
-
resp.headers.get("Content-Type").split(";")[0].lower()
|
|
1399
|
-
)
|
|
1414
|
+
contentType = resp.headers.get("Content-Type").split(";")[0].lower()
|
|
1400
1415
|
except Exception:
|
|
1401
1416
|
contentType = ""
|
|
1402
1417
|
|
|
@@ -1407,18 +1422,13 @@ def processArchiveUrl(url):
|
|
|
1407
1422
|
# If the FILTER_CODE includes 404, and it doesn't seem to be a custom 404 page
|
|
1408
1423
|
if "404" not in FILTER_CODE or (
|
|
1409
1424
|
"404" in FILTER_CODE
|
|
1410
|
-
and not re.findall(
|
|
1411
|
-
REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE
|
|
1412
|
-
)
|
|
1425
|
+
and not re.findall(REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE)
|
|
1413
1426
|
):
|
|
1414
1427
|
|
|
1415
1428
|
# Add the URL as a comment at the start of the response
|
|
1416
1429
|
if args.url_filename:
|
|
1417
1430
|
archiveHtml = (
|
|
1418
|
-
"/* Original URL: "
|
|
1419
|
-
+ archiveUrl
|
|
1420
|
-
+ " */\n"
|
|
1421
|
-
+ archiveHtml
|
|
1431
|
+
"/* Original URL: " + archiveUrl + " */\n" + archiveHtml
|
|
1422
1432
|
)
|
|
1423
1433
|
|
|
1424
1434
|
# Remove all web archive references in the response
|
|
@@ -1565,9 +1575,7 @@ def processArchiveUrl(url):
|
|
|
1565
1575
|
# Determine the extension from the content type
|
|
1566
1576
|
try:
|
|
1567
1577
|
if contentType != "":
|
|
1568
|
-
extension = contentType.split("/")[
|
|
1569
|
-
1
|
|
1570
|
-
].replace("x-", "")
|
|
1578
|
+
extension = contentType.split("/")[1].replace("x-", "")
|
|
1571
1579
|
if extension == "":
|
|
1572
1580
|
extension = contentType.lower()
|
|
1573
1581
|
except Exception:
|
|
@@ -1588,15 +1596,11 @@ def processArchiveUrl(url):
|
|
|
1588
1596
|
# If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
|
|
1589
1597
|
if extension == "":
|
|
1590
1598
|
if (
|
|
1591
|
-
archiveHtml.lower()
|
|
1592
|
-
.strip()
|
|
1593
|
-
.endswith("</html>")
|
|
1599
|
+
archiveHtml.lower().strip().endswith("</html>")
|
|
1594
1600
|
or archiveHtml.lower()
|
|
1595
1601
|
.strip()
|
|
1596
1602
|
.startswith("<!doctype html")
|
|
1597
|
-
or archiveHtml.lower()
|
|
1598
|
-
.strip()
|
|
1599
|
-
.startswith("<html")
|
|
1603
|
+
or archiveHtml.lower().strip().startswith("<html")
|
|
1600
1604
|
):
|
|
1601
1605
|
extension = "html"
|
|
1602
1606
|
else:
|
|
@@ -1626,12 +1630,10 @@ def processArchiveUrl(url):
|
|
|
1626
1630
|
except Exception as e:
|
|
1627
1631
|
writerr(
|
|
1628
1632
|
colored(
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
+ str(e)
|
|
1634
|
-
),
|
|
1633
|
+
"Wayback - [ ERR ] Failed to write file "
|
|
1634
|
+
+ filePath
|
|
1635
|
+
+ ": "
|
|
1636
|
+
+ str(e),
|
|
1635
1637
|
"red",
|
|
1636
1638
|
)
|
|
1637
1639
|
)
|
|
@@ -1641,23 +1643,16 @@ def processArchiveUrl(url):
|
|
|
1641
1643
|
try:
|
|
1642
1644
|
timestamp = str(datetime.now())
|
|
1643
1645
|
indexFile.write(
|
|
1644
|
-
hashValue
|
|
1645
|
-
+ ","
|
|
1646
|
-
+ archiveUrl
|
|
1647
|
-
+ " ,"
|
|
1648
|
-
+ timestamp
|
|
1649
|
-
+ "\n"
|
|
1646
|
+
hashValue + "," + archiveUrl + " ," + timestamp + "\n"
|
|
1650
1647
|
)
|
|
1651
1648
|
indexFile.flush()
|
|
1652
1649
|
except Exception as e:
|
|
1653
1650
|
writerr(
|
|
1654
1651
|
colored(
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
+ str(e)
|
|
1660
|
-
),
|
|
1652
|
+
'Wayback - [ ERR ] Failed to write to waymore_index.txt for "'
|
|
1653
|
+
+ archiveUrl
|
|
1654
|
+
+ '": '
|
|
1655
|
+
+ str(e),
|
|
1661
1656
|
"red",
|
|
1662
1657
|
)
|
|
1663
1658
|
)
|
|
@@ -1668,9 +1663,7 @@ def processArchiveUrl(url):
|
|
|
1668
1663
|
debugText = ""
|
|
1669
1664
|
if archiveHtml.lower().find("archive.org") > 0:
|
|
1670
1665
|
debugText = "ARCHIVE.ORG"
|
|
1671
|
-
elif (
|
|
1672
|
-
archiveHtml.lower().find("internet archive") > 0
|
|
1673
|
-
):
|
|
1666
|
+
elif archiveHtml.lower().find("internet archive") > 0:
|
|
1674
1667
|
debugText = "INTERNET ARCHIVE"
|
|
1675
1668
|
elif archiveHtml.lower().find("wombat") > 0:
|
|
1676
1669
|
debugText = "WOMBAT (JS)"
|
|
@@ -1697,11 +1690,7 @@ def processArchiveUrl(url):
|
|
|
1697
1690
|
if verbose():
|
|
1698
1691
|
writerr(
|
|
1699
1692
|
colored(
|
|
1700
|
-
|
|
1701
|
-
'[ ERR ] Wayback Machine (archive.org) returned a problem for "'
|
|
1702
|
-
+ archiveUrl
|
|
1703
|
-
+ '"'
|
|
1704
|
-
),
|
|
1693
|
+
'Wayback - [ ERR ] returned a problem for "' + archiveUrl + '"',
|
|
1705
1694
|
"red",
|
|
1706
1695
|
)
|
|
1707
1696
|
)
|
|
@@ -1710,11 +1699,7 @@ def processArchiveUrl(url):
|
|
|
1710
1699
|
if verbose():
|
|
1711
1700
|
writerr(
|
|
1712
1701
|
colored(
|
|
1713
|
-
|
|
1714
|
-
'[ ERR ] Wayback Machine (archive.org) connection error for "'
|
|
1715
|
-
+ archiveUrl
|
|
1716
|
-
+ '"'
|
|
1717
|
-
),
|
|
1702
|
+
'Wayback - [ ERR ] connection error for "' + archiveUrl + '"',
|
|
1718
1703
|
"red",
|
|
1719
1704
|
)
|
|
1720
1705
|
)
|
|
@@ -1724,25 +1709,21 @@ def processArchiveUrl(url):
|
|
|
1724
1709
|
try:
|
|
1725
1710
|
writerr(
|
|
1726
1711
|
colored(
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
+ '"'
|
|
1733
|
-
),
|
|
1712
|
+
"Wayback - [ "
|
|
1713
|
+
+ str(resp.status_code)
|
|
1714
|
+
+ ' ] Failed to get response for "'
|
|
1715
|
+
+ archiveUrl
|
|
1716
|
+
+ '"',
|
|
1734
1717
|
"red",
|
|
1735
1718
|
)
|
|
1736
1719
|
)
|
|
1737
1720
|
except Exception:
|
|
1738
1721
|
writerr(
|
|
1739
1722
|
colored(
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
+ str(e)
|
|
1745
|
-
),
|
|
1723
|
+
'Wayback - [ ERR ] Failed to get response for "'
|
|
1724
|
+
+ archiveUrl
|
|
1725
|
+
+ '": '
|
|
1726
|
+
+ str(e),
|
|
1746
1727
|
"red",
|
|
1747
1728
|
)
|
|
1748
1729
|
)
|
|
@@ -1769,9 +1750,7 @@ def processArchiveUrl(url):
|
|
|
1769
1750
|
)
|
|
1770
1751
|
except Exception:
|
|
1771
1752
|
if verbose():
|
|
1772
|
-
suffix = (
|
|
1773
|
-
'Complete (To show mem use, run "pip install psutil")'
|
|
1774
|
-
)
|
|
1753
|
+
suffix = 'Complete (To show mem use, run "pip install psutil")'
|
|
1775
1754
|
printProgressBar(
|
|
1776
1755
|
successCount + failureCount,
|
|
1777
1756
|
totalResponses,
|
|
@@ -1796,9 +1775,7 @@ def processArchiveUrl(url):
|
|
|
1796
1775
|
|
|
1797
1776
|
except Exception as e:
|
|
1798
1777
|
if verbose():
|
|
1799
|
-
writerr(
|
|
1800
|
-
colored(getSPACER('Error for "' + url + '": ' + str(e)), "red")
|
|
1801
|
-
)
|
|
1778
|
+
writerr(colored('Wayback - [ ERR ] Error for "' + url + '": ' + str(e), "red"))
|
|
1802
1779
|
|
|
1803
1780
|
except Exception as e:
|
|
1804
1781
|
writerr(colored("ERROR processArchiveUrl 1: " + str(e), "red"))
|
|
@@ -1883,7 +1860,7 @@ def processURLOutput():
|
|
|
1883
1860
|
linkCount = len(linksFound)
|
|
1884
1861
|
write(
|
|
1885
1862
|
getSPACER(
|
|
1886
|
-
colored("
|
|
1863
|
+
colored("\nTotal unique links found for " + subs + argsInput + ": ", "cyan")
|
|
1887
1864
|
+ colored(str(linkCount) + " 🤘", "white")
|
|
1888
1865
|
)
|
|
1889
1866
|
+ "\n"
|
|
@@ -1926,7 +1903,7 @@ def processURLOutput():
|
|
|
1926
1903
|
appendedUrls = False
|
|
1927
1904
|
if not args.output_overwrite:
|
|
1928
1905
|
try:
|
|
1929
|
-
with open(filename
|
|
1906
|
+
with open(filename) as existingLinks:
|
|
1930
1907
|
for link in existingLinks.readlines():
|
|
1931
1908
|
linksFound.add(link.strip())
|
|
1932
1909
|
appendedUrls = True
|
|
@@ -1968,16 +1945,10 @@ def processURLOutput():
|
|
|
1968
1945
|
writerr(colored("ERROR processURLOutput 3: " + str(e), "red"))
|
|
1969
1946
|
|
|
1970
1947
|
# If there are less links output because of filters, show the new total
|
|
1971
|
-
if
|
|
1972
|
-
args.regex_after is not None
|
|
1973
|
-
and linkCount > 0
|
|
1974
|
-
and outputCount < linkCount
|
|
1975
|
-
):
|
|
1948
|
+
if args.regex_after is not None and linkCount > 0 and outputCount < linkCount:
|
|
1976
1949
|
write(
|
|
1977
1950
|
colored(
|
|
1978
|
-
'Links found after applying filter "'
|
|
1979
|
-
+ args.regex_after
|
|
1980
|
-
+ '": ',
|
|
1951
|
+
'Links found after applying filter "' + args.regex_after + '": ',
|
|
1981
1952
|
"cyan",
|
|
1982
1953
|
)
|
|
1983
1954
|
+ colored(str(outputCount) + " 🤘\n", "white")
|
|
@@ -1992,11 +1963,7 @@ def processURLOutput():
|
|
|
1992
1963
|
|
|
1993
1964
|
if verbose():
|
|
1994
1965
|
if outputCount == 0:
|
|
1995
|
-
write(
|
|
1996
|
-
colored(
|
|
1997
|
-
"No links were found so nothing written to file.", "cyan"
|
|
1998
|
-
)
|
|
1999
|
-
)
|
|
1966
|
+
write(colored("No links were found so nothing written to file.", "cyan"))
|
|
2000
1967
|
else:
|
|
2001
1968
|
if appendedUrls:
|
|
2002
1969
|
write(
|
|
@@ -2018,11 +1985,11 @@ def processURLOutput():
|
|
|
2018
1985
|
if os.path.exists(filenameOld) and os.path.exists(filename):
|
|
2019
1986
|
|
|
2020
1987
|
# Get all the old links
|
|
2021
|
-
with open(filenameOld
|
|
1988
|
+
with open(filenameOld) as oldFile:
|
|
2022
1989
|
oldLinks = set(oldFile.readlines())
|
|
2023
1990
|
|
|
2024
1991
|
# Get all the new links
|
|
2025
|
-
with open(filename
|
|
1992
|
+
with open(filename) as newFile:
|
|
2026
1993
|
newLinks = set(newFile.readlines())
|
|
2027
1994
|
|
|
2028
1995
|
# Create a file with most recent new links
|
|
@@ -2061,7 +2028,7 @@ def stripUnwanted(url):
|
|
|
2061
2028
|
"""
|
|
2062
2029
|
parsed = urlparse(url)
|
|
2063
2030
|
# Strip scheme
|
|
2064
|
-
scheme = "
|
|
2031
|
+
scheme = f"{parsed.scheme}://"
|
|
2065
2032
|
strippedUrl = parsed.geturl().replace(scheme, "", 1)
|
|
2066
2033
|
# Strip query string and fragment
|
|
2067
2034
|
strippedUrl = strippedUrl.split("#")[0].split("?")[0]
|
|
@@ -2092,7 +2059,7 @@ def validateArgInput(x):
|
|
|
2092
2059
|
if os.path.isfile(x):
|
|
2093
2060
|
isInputFile = True
|
|
2094
2061
|
# Open file and put all values in input list
|
|
2095
|
-
with open(x
|
|
2062
|
+
with open(x) as inputFile:
|
|
2096
2063
|
lines = inputFile.readlines()
|
|
2097
2064
|
# Check if any lines start with a *. and replace without the *.
|
|
2098
2065
|
for line in lines:
|
|
@@ -2189,9 +2156,7 @@ def validateArgProviders(x):
|
|
|
2189
2156
|
x = x.lower()
|
|
2190
2157
|
providers = x.split(",")
|
|
2191
2158
|
for provider in providers:
|
|
2192
|
-
if not re.fullmatch(
|
|
2193
|
-
r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider
|
|
2194
|
-
):
|
|
2159
|
+
if not re.fullmatch(r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider):
|
|
2195
2160
|
invalid = True
|
|
2196
2161
|
break
|
|
2197
2162
|
if invalid:
|
|
@@ -2201,16 +2166,32 @@ def validateArgProviders(x):
|
|
|
2201
2166
|
return x
|
|
2202
2167
|
|
|
2203
2168
|
|
|
2169
|
+
def parseDateArg(dateArg):
|
|
2170
|
+
"""
|
|
2171
|
+
Parse a date argument from the command line into a datetime object
|
|
2172
|
+
"""
|
|
2173
|
+
formats = {
|
|
2174
|
+
4: "%Y",
|
|
2175
|
+
6: "%Y%m",
|
|
2176
|
+
8: "%Y%m%d",
|
|
2177
|
+
10: "%Y%m%d%H",
|
|
2178
|
+
12: "%Y%m%d%H%M",
|
|
2179
|
+
14: "%Y%m%d%H%M%S",
|
|
2180
|
+
}
|
|
2181
|
+
fmt = formats.get(len(dateArg))
|
|
2182
|
+
return datetime.strptime(dateArg, fmt)
|
|
2183
|
+
|
|
2184
|
+
|
|
2204
2185
|
def processAlienVaultPage(url):
|
|
2205
2186
|
"""
|
|
2206
2187
|
Get URLs from a specific page of otx.alienvault.org API for the input domain
|
|
2207
2188
|
"""
|
|
2208
|
-
global totalPages, linkMimes, linksFound,
|
|
2189
|
+
global totalPages, linkMimes, linksFound, stopSourceAlienVault, argsInput, linkCountAlienVault
|
|
2209
2190
|
try:
|
|
2210
2191
|
# Get memory in case it exceeds threshold
|
|
2211
2192
|
getMemory()
|
|
2212
2193
|
|
|
2213
|
-
if not
|
|
2194
|
+
if not stopSourceAlienVault:
|
|
2214
2195
|
try:
|
|
2215
2196
|
# Choose a random user agent string to use for any requests
|
|
2216
2197
|
userAgent = random.choice(USER_AGENT)
|
|
@@ -2222,9 +2203,7 @@ def processAlienVaultPage(url):
|
|
|
2222
2203
|
except ConnectionError:
|
|
2223
2204
|
writerr(
|
|
2224
2205
|
colored(
|
|
2225
|
-
getSPACER(
|
|
2226
|
-
"[ ERR ] alienvault.org connection error for page " + page
|
|
2227
|
-
),
|
|
2206
|
+
getSPACER("AlienVault - [ ERR ] Connection error for page " + page),
|
|
2228
2207
|
"red",
|
|
2229
2208
|
)
|
|
2230
2209
|
)
|
|
@@ -2233,12 +2212,10 @@ def processAlienVaultPage(url):
|
|
|
2233
2212
|
except Exception as e:
|
|
2234
2213
|
writerr(
|
|
2235
2214
|
colored(
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
+ str(e)
|
|
2241
|
-
),
|
|
2215
|
+
"AlienVault -[ ERR ] Error getting response for page "
|
|
2216
|
+
+ page
|
|
2217
|
+
+ " - "
|
|
2218
|
+
+ str(e),
|
|
2242
2219
|
"red",
|
|
2243
2220
|
)
|
|
2244
2221
|
)
|
|
@@ -2249,26 +2226,21 @@ def processAlienVaultPage(url):
|
|
|
2249
2226
|
if resp is not None:
|
|
2250
2227
|
# If a status other of 429, then stop processing Alien Vault
|
|
2251
2228
|
if resp.status_code == 429:
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2255
|
-
"[ 429 ]
|
|
2256
|
-
|
|
2257
|
-
|
|
2229
|
+
if not stopSourceAlienVault: # Only print message once
|
|
2230
|
+
writerr(
|
|
2231
|
+
colored(
|
|
2232
|
+
"AlienVault - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
|
|
2233
|
+
"red",
|
|
2234
|
+
)
|
|
2258
2235
|
)
|
|
2259
|
-
|
|
2260
|
-
stopSource = True
|
|
2236
|
+
stopSourceAlienVault = True
|
|
2261
2237
|
return
|
|
2262
2238
|
# If the response from alienvault.com is empty then skip
|
|
2263
2239
|
if resp.text == "" and totalPages == 0:
|
|
2264
2240
|
if verbose():
|
|
2265
2241
|
writerr(
|
|
2266
2242
|
colored(
|
|
2267
|
-
|
|
2268
|
-
"[ ERR ] "
|
|
2269
|
-
+ url
|
|
2270
|
-
+ " gave an empty response."
|
|
2271
|
-
),
|
|
2243
|
+
"AlienVault - [ ERR ] " + url + " gave an empty response.",
|
|
2272
2244
|
"red",
|
|
2273
2245
|
)
|
|
2274
2246
|
)
|
|
@@ -2278,12 +2250,10 @@ def processAlienVaultPage(url):
|
|
|
2278
2250
|
if verbose():
|
|
2279
2251
|
writerr(
|
|
2280
2252
|
colored(
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
+ url
|
|
2286
|
-
),
|
|
2253
|
+
"AlienVauilt - [ "
|
|
2254
|
+
+ str(resp.status_code)
|
|
2255
|
+
+ " ] Error for "
|
|
2256
|
+
+ url,
|
|
2287
2257
|
"red",
|
|
2288
2258
|
)
|
|
2289
2259
|
)
|
|
@@ -2306,6 +2276,7 @@ def processAlienVaultPage(url):
|
|
|
2306
2276
|
if foundUrl != "":
|
|
2307
2277
|
# If filters are not required and subs are wanted then just add the URL to the list
|
|
2308
2278
|
if args.filter_responses_only and not args.no_subs:
|
|
2279
|
+
linkCountAlienVault = linkCountAlienVault + 1
|
|
2309
2280
|
linksFoundAdd(foundUrl)
|
|
2310
2281
|
else:
|
|
2311
2282
|
addLink = True
|
|
@@ -2332,9 +2303,7 @@ def processAlienVaultPage(url):
|
|
|
2332
2303
|
# Compare the HTTP code gainst the Code exclusions and matches
|
|
2333
2304
|
if MATCH_CODE != "":
|
|
2334
2305
|
match = re.search(
|
|
2335
|
-
r"("
|
|
2336
|
-
+ re.escape(MATCH_CODE).replace(",", "|")
|
|
2337
|
-
+ ")",
|
|
2306
|
+
r"(" + re.escape(MATCH_CODE).replace(",", "|") + ")",
|
|
2338
2307
|
httpCode,
|
|
2339
2308
|
flags=re.IGNORECASE,
|
|
2340
2309
|
)
|
|
@@ -2342,9 +2311,7 @@ def processAlienVaultPage(url):
|
|
|
2342
2311
|
addLink = False
|
|
2343
2312
|
else:
|
|
2344
2313
|
match = re.search(
|
|
2345
|
-
r"("
|
|
2346
|
-
+ re.escape(FILTER_CODE).replace(",", "|")
|
|
2347
|
-
+ ")",
|
|
2314
|
+
r"(" + re.escape(FILTER_CODE).replace(",", "|") + ")",
|
|
2348
2315
|
httpCode,
|
|
2349
2316
|
flags=re.IGNORECASE,
|
|
2350
2317
|
)
|
|
@@ -2354,9 +2321,7 @@ def processAlienVaultPage(url):
|
|
|
2354
2321
|
# Check the URL exclusions
|
|
2355
2322
|
if addLink:
|
|
2356
2323
|
match = re.search(
|
|
2357
|
-
r"("
|
|
2358
|
-
+ re.escape(FILTER_URL).replace(",", "|")
|
|
2359
|
-
+ ")",
|
|
2324
|
+
r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
|
|
2360
2325
|
foundUrl,
|
|
2361
2326
|
flags=re.IGNORECASE,
|
|
2362
2327
|
)
|
|
@@ -2367,9 +2332,7 @@ def processAlienVaultPage(url):
|
|
|
2367
2332
|
if addLink and args.keywords_only:
|
|
2368
2333
|
if args.keywords_only == "#CONFIG":
|
|
2369
2334
|
match = re.search(
|
|
2370
|
-
r"("
|
|
2371
|
-
+ re.escape(FILTER_KEYWORDS).replace(",", "|")
|
|
2372
|
-
+ ")",
|
|
2335
|
+
r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
|
|
2373
2336
|
foundUrl,
|
|
2374
2337
|
flags=re.IGNORECASE,
|
|
2375
2338
|
)
|
|
@@ -2382,9 +2345,39 @@ def processAlienVaultPage(url):
|
|
|
2382
2345
|
if match is None:
|
|
2383
2346
|
addLink = False
|
|
2384
2347
|
|
|
2348
|
+
# Check date is in range if required
|
|
2349
|
+
if args.from_date is not None or args.to_date is not None:
|
|
2350
|
+
try:
|
|
2351
|
+
urlDateStr = urlSection["date"]
|
|
2352
|
+
|
|
2353
|
+
# Remove fractional seconds if present
|
|
2354
|
+
urlDateStr = urlDateStr.split(".")[0]
|
|
2355
|
+
|
|
2356
|
+
urlDate = datetime.strptime(urlDateStr, "%Y-%m-%dT%H:%M:%S")
|
|
2357
|
+
|
|
2358
|
+
# If from date passed, check
|
|
2359
|
+
if args.from_date is not None:
|
|
2360
|
+
fromDate = parseDateArg(args.from_date)
|
|
2361
|
+
if urlDate < fromDate:
|
|
2362
|
+
addLink = False
|
|
2363
|
+
# If to date passed, check
|
|
2364
|
+
if args.to_date is not None:
|
|
2365
|
+
toDate = parseDateArg(args.to_date)
|
|
2366
|
+
if urlDate >= toDate:
|
|
2367
|
+
addLink = False
|
|
2368
|
+
except Exception as e:
|
|
2369
|
+
if verbose():
|
|
2370
|
+
writerr(
|
|
2371
|
+
colored(
|
|
2372
|
+
"ERROR processLAlienVaultPage date check: "
|
|
2373
|
+
+ str(e),
|
|
2374
|
+
"red",
|
|
2375
|
+
)
|
|
2376
|
+
)
|
|
2377
|
+
|
|
2385
2378
|
# Add link if it passed filters
|
|
2386
2379
|
if addLink:
|
|
2387
|
-
linksFoundAdd(foundUrl)
|
|
2380
|
+
linksFoundAdd(foundUrl, linksFoundAlienVault)
|
|
2388
2381
|
else:
|
|
2389
2382
|
pass
|
|
2390
2383
|
except Exception as e:
|
|
@@ -2396,12 +2389,12 @@ def getAlienVaultUrls():
|
|
|
2396
2389
|
"""
|
|
2397
2390
|
Get URLs from the Alien Vault OTX, otx.alienvault.com
|
|
2398
2391
|
"""
|
|
2399
|
-
global linksFound, waymorePath, subs, path, stopProgram, totalPages,
|
|
2392
|
+
global linksFound, waymorePath, subs, path, stopProgram, totalPages, stopSourceAlienVault, argsInput, checkAlienVault, inputIsSubDomain, argsInputHostname, linkCountAlienVault, linksFoundAlienVault
|
|
2400
2393
|
|
|
2401
2394
|
# Write the file of URL's for the passed domain/URL
|
|
2402
2395
|
try:
|
|
2403
|
-
|
|
2404
|
-
|
|
2396
|
+
stopSourceAlienVault = False
|
|
2397
|
+
linksFoundAlienVault = set()
|
|
2405
2398
|
|
|
2406
2399
|
# Set the Alien Vault API indicator types of domain or hostname (has subdomain)
|
|
2407
2400
|
if inputIsSubDomain:
|
|
@@ -2418,11 +2411,12 @@ def getAlienVaultUrls():
|
|
|
2418
2411
|
|
|
2419
2412
|
# Get the number of pages (i.e. separate requests) that are going to be made to alienvault.com
|
|
2420
2413
|
totalPages = 0
|
|
2414
|
+
resp = None
|
|
2421
2415
|
try:
|
|
2422
2416
|
if not args.check_only:
|
|
2423
2417
|
write(
|
|
2424
2418
|
colored(
|
|
2425
|
-
"
|
|
2419
|
+
"AlienVault - [ INFO ] Getting the number of alienvault.com pages to search...",
|
|
2426
2420
|
"cyan",
|
|
2427
2421
|
)
|
|
2428
2422
|
)
|
|
@@ -2431,43 +2425,39 @@ def getAlienVaultUrls():
|
|
|
2431
2425
|
session = requests.Session()
|
|
2432
2426
|
session.mount("https://", HTTP_ADAPTER)
|
|
2433
2427
|
session.mount("http://", HTTP_ADAPTER)
|
|
2434
|
-
resp = session.get(
|
|
2435
|
-
url + "&showNumPages=True", headers={"User-Agent": userAgent}
|
|
2436
|
-
)
|
|
2428
|
+
resp = session.get(url + "&showNumPages=True", headers={"User-Agent": userAgent})
|
|
2437
2429
|
except Exception as e:
|
|
2438
2430
|
writerr(
|
|
2439
2431
|
colored(
|
|
2440
|
-
|
|
2441
|
-
"[ ERR ] Unable to get links from alienvault.com: " + str(e)
|
|
2442
|
-
),
|
|
2432
|
+
"AlienVault - [ ERR ] Unable to get links from alienvault.com: " + str(e),
|
|
2443
2433
|
"red",
|
|
2444
2434
|
)
|
|
2445
2435
|
)
|
|
2446
|
-
return
|
|
2436
|
+
# Don't return - continue to show link count at the end
|
|
2447
2437
|
|
|
2448
2438
|
# If the rate limit was reached end now
|
|
2449
|
-
if resp.status_code == 429:
|
|
2439
|
+
if resp is not None and resp.status_code == 429:
|
|
2450
2440
|
writerr(
|
|
2451
2441
|
colored(
|
|
2452
|
-
|
|
2453
|
-
"[ 429 ] Alien Vault rate limit reached so unable to get links."
|
|
2454
|
-
),
|
|
2442
|
+
"AlienVault - [ 429 ] Rate limit reached so unable to get links.",
|
|
2455
2443
|
"red",
|
|
2456
2444
|
)
|
|
2457
2445
|
)
|
|
2458
|
-
return
|
|
2446
|
+
# Don't return - continue to show link count at the end
|
|
2459
2447
|
|
|
2460
|
-
if verbose():
|
|
2448
|
+
if resp is not None and verbose():
|
|
2461
2449
|
write(
|
|
2462
|
-
|
|
2463
|
-
|
|
2464
|
-
+ colored(url, "white")
|
|
2465
|
-
)
|
|
2450
|
+
colored("AlienVault - [ INFO ] The URL requested to get links: ", "magenta")
|
|
2451
|
+
+ colored(url, "white")
|
|
2466
2452
|
+ "\n"
|
|
2467
2453
|
)
|
|
2468
2454
|
|
|
2469
2455
|
# Carry on if something was found
|
|
2470
|
-
if
|
|
2456
|
+
if (
|
|
2457
|
+
resp is not None
|
|
2458
|
+
and resp.status_code != 429
|
|
2459
|
+
and resp.text.lower().find('"error": "') < 0
|
|
2460
|
+
):
|
|
2471
2461
|
|
|
2472
2462
|
try:
|
|
2473
2463
|
# Get the JSON response
|
|
@@ -2478,9 +2468,7 @@ def getAlienVaultUrls():
|
|
|
2478
2468
|
except Exception:
|
|
2479
2469
|
writerr(
|
|
2480
2470
|
colored(
|
|
2481
|
-
|
|
2482
|
-
"[ ERR ] There was an unexpected response from the Alien Vault API"
|
|
2483
|
-
),
|
|
2471
|
+
"AlienVault - [ ERR ] There was an unexpected response from the API",
|
|
2484
2472
|
"red",
|
|
2485
2473
|
)
|
|
2486
2474
|
)
|
|
@@ -2502,16 +2490,16 @@ def getAlienVaultUrls():
|
|
|
2502
2490
|
else:
|
|
2503
2491
|
checkAlienVault = totalPages
|
|
2504
2492
|
write(
|
|
2505
|
-
colored("
|
|
2493
|
+
colored("AlienVault - [ INFO ] Getting URLs from Alien Vault: ", "cyan")
|
|
2506
2494
|
+ colored(str(checkAlienVault) + " requests", "white")
|
|
2507
2495
|
)
|
|
2508
2496
|
else:
|
|
2509
2497
|
# if the page number was found then display it, but otherwise we will just try to increment until we have everything
|
|
2510
2498
|
write(
|
|
2511
2499
|
colored(
|
|
2512
|
-
"
|
|
2500
|
+
"AlienVault - [ INFO ] Getting links from "
|
|
2513
2501
|
+ str(totalPages)
|
|
2514
|
-
+ " alienvault.com API requests (this can take a while for some domains)
|
|
2502
|
+
+ " alienvault.com API requests (this can take a while for some domains)...",
|
|
2515
2503
|
"cyan",
|
|
2516
2504
|
)
|
|
2517
2505
|
)
|
|
@@ -2531,32 +2519,19 @@ def getAlienVaultUrls():
|
|
|
2531
2519
|
if verbose():
|
|
2532
2520
|
writerr(
|
|
2533
2521
|
colored(
|
|
2534
|
-
|
|
2535
|
-
"[ ERR ] An error was returned in the alienvault.com response."
|
|
2536
|
-
)
|
|
2537
|
-
+ "\n",
|
|
2522
|
+
"AlienVault - [ ERR ] An error was returned in the response." + "\n",
|
|
2538
2523
|
"red",
|
|
2539
2524
|
)
|
|
2540
2525
|
)
|
|
2541
2526
|
|
|
2542
2527
|
if not args.check_only:
|
|
2543
|
-
|
|
2544
|
-
|
|
2545
|
-
|
|
2546
|
-
|
|
2547
|
-
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
+ "\n"
|
|
2551
|
-
)
|
|
2552
|
-
else:
|
|
2553
|
-
write(
|
|
2554
|
-
getSPACER(
|
|
2555
|
-
colored("Extra links found on alienvault.com: ", "cyan")
|
|
2556
|
-
+ colored(str(linkCount), "white")
|
|
2557
|
-
)
|
|
2558
|
-
+ "\n"
|
|
2559
|
-
)
|
|
2528
|
+
linkCountAlienVault = len(linksFoundAlienVault)
|
|
2529
|
+
write(
|
|
2530
|
+
colored("AlienVault - [ INFO ] Links found on alienvault.com: ", "cyan")
|
|
2531
|
+
+ colored(str(linkCountAlienVault), "white")
|
|
2532
|
+
)
|
|
2533
|
+
linksFound.update(linksFoundAlienVault)
|
|
2534
|
+
linksFoundAlienVault.clear()
|
|
2560
2535
|
|
|
2561
2536
|
except Exception as e:
|
|
2562
2537
|
writerr(colored("ERROR getAlienVaultUrls 1: " + str(e), "red"))
|
|
@@ -2566,7 +2541,7 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
|
|
|
2566
2541
|
"""
|
|
2567
2542
|
Process a specific URL from urlscan.io to determine whether to save the link
|
|
2568
2543
|
"""
|
|
2569
|
-
global argsInput, argsInputHostname, urlscanRequestLinks
|
|
2544
|
+
global argsInput, argsInputHostname, urlscanRequestLinks, links_lock, linkCountURLScan, linksFoundURLScan
|
|
2570
2545
|
|
|
2571
2546
|
addLink = True
|
|
2572
2547
|
|
|
@@ -2629,9 +2604,7 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
|
|
|
2629
2604
|
flags=re.IGNORECASE,
|
|
2630
2605
|
)
|
|
2631
2606
|
else:
|
|
2632
|
-
match = re.search(
|
|
2633
|
-
r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE
|
|
2634
|
-
)
|
|
2607
|
+
match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
|
|
2635
2608
|
if match is None:
|
|
2636
2609
|
addLink = False
|
|
2637
2610
|
|
|
@@ -2657,7 +2630,8 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
|
|
|
2657
2630
|
# Add MIME Types if --verbose option was selected
|
|
2658
2631
|
if verbose():
|
|
2659
2632
|
if mimeType.strip() != "":
|
|
2660
|
-
|
|
2633
|
+
with links_lock:
|
|
2634
|
+
linkMimes.add(mimeType)
|
|
2661
2635
|
|
|
2662
2636
|
# Add link if it passed filters
|
|
2663
2637
|
if addLink:
|
|
@@ -2677,11 +2651,12 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
|
|
|
2677
2651
|
)
|
|
2678
2652
|
if match is not None:
|
|
2679
2653
|
if args.mode in ("U", "B"):
|
|
2680
|
-
linksFoundAdd(url)
|
|
2654
|
+
linksFoundAdd(url, linksFoundURLScan)
|
|
2681
2655
|
# If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
|
|
2682
2656
|
if urlscanID != "" and args.mode in ("R", "B"):
|
|
2683
2657
|
if args.limit == 0 or len(urlscanRequestLinks) < args.limit:
|
|
2684
|
-
|
|
2658
|
+
with links_lock:
|
|
2659
|
+
urlscanRequestLinks.add((url, URLSCAN_DOM_URL + urlscanID))
|
|
2685
2660
|
|
|
2686
2661
|
except Exception as e:
|
|
2687
2662
|
writerr(colored("ERROR processURLScanUrl 1: " + str(e), "red"))
|
|
@@ -2726,12 +2701,7 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2726
2701
|
|
|
2727
2702
|
# Add the URL as a comment at the start of the response
|
|
2728
2703
|
if args.url_filename:
|
|
2729
|
-
archiveHtml =
|
|
2730
|
-
"/* Original URL: "
|
|
2731
|
-
+ originalUrl
|
|
2732
|
-
+ " */\n"
|
|
2733
|
-
+ archiveHtml
|
|
2734
|
-
)
|
|
2704
|
+
archiveHtml = "/* Original URL: " + originalUrl + " */\n" + archiveHtml
|
|
2735
2705
|
|
|
2736
2706
|
# Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
|
|
2737
2707
|
if args.url_filename:
|
|
@@ -2760,9 +2730,7 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2760
2730
|
if (
|
|
2761
2731
|
archiveHtml.lower().strip().endswith("</html>")
|
|
2762
2732
|
or archiveHtml.lower().strip().endswith("</body>")
|
|
2763
|
-
or archiveHtml.lower()
|
|
2764
|
-
.strip()
|
|
2765
|
-
.startswith("<!doctype html")
|
|
2733
|
+
or archiveHtml.lower().strip().startswith("<!doctype html")
|
|
2766
2734
|
or archiveHtml.lower().strip().startswith("<html")
|
|
2767
2735
|
or archiveHtml.lower().strip().startswith("<head")
|
|
2768
2736
|
):
|
|
@@ -2794,12 +2762,10 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2794
2762
|
except Exception as e:
|
|
2795
2763
|
writerr(
|
|
2796
2764
|
colored(
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
+ str(e)
|
|
2802
|
-
),
|
|
2765
|
+
"URLScan - [ ERR ] Failed to write file "
|
|
2766
|
+
+ filePath
|
|
2767
|
+
+ ": "
|
|
2768
|
+
+ str(e),
|
|
2803
2769
|
"red",
|
|
2804
2770
|
)
|
|
2805
2771
|
)
|
|
@@ -2822,12 +2788,10 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2822
2788
|
except Exception as e:
|
|
2823
2789
|
writerr(
|
|
2824
2790
|
colored(
|
|
2825
|
-
|
|
2826
|
-
|
|
2827
|
-
|
|
2828
|
-
|
|
2829
|
-
+ str(e)
|
|
2830
|
-
),
|
|
2791
|
+
'URLScan - [ ERR ] Failed to write to waymore_index.txt for "'
|
|
2792
|
+
+ domUrl
|
|
2793
|
+
+ '": '
|
|
2794
|
+
+ str(e),
|
|
2831
2795
|
"red",
|
|
2832
2796
|
)
|
|
2833
2797
|
)
|
|
@@ -2843,25 +2807,21 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2843
2807
|
try:
|
|
2844
2808
|
writerr(
|
|
2845
2809
|
colored(
|
|
2846
|
-
|
|
2847
|
-
|
|
2848
|
-
|
|
2849
|
-
|
|
2850
|
-
|
|
2851
|
-
+ '"'
|
|
2852
|
-
),
|
|
2810
|
+
"URLScan - [ "
|
|
2811
|
+
+ str(resp.status_code)
|
|
2812
|
+
+ ' ] Failed to get response for "'
|
|
2813
|
+
+ domUrl
|
|
2814
|
+
+ '"',
|
|
2853
2815
|
"red",
|
|
2854
2816
|
)
|
|
2855
2817
|
)
|
|
2856
2818
|
except Exception:
|
|
2857
2819
|
writerr(
|
|
2858
2820
|
colored(
|
|
2859
|
-
|
|
2860
|
-
|
|
2861
|
-
|
|
2862
|
-
|
|
2863
|
-
+ str(e)
|
|
2864
|
-
),
|
|
2821
|
+
'URLScan - [ ERR ] Failed to get response for "'
|
|
2822
|
+
+ domUrl
|
|
2823
|
+
+ '": '
|
|
2824
|
+
+ str(e),
|
|
2865
2825
|
"red",
|
|
2866
2826
|
)
|
|
2867
2827
|
)
|
|
@@ -2888,9 +2848,7 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2888
2848
|
)
|
|
2889
2849
|
except Exception:
|
|
2890
2850
|
if verbose():
|
|
2891
|
-
suffix = (
|
|
2892
|
-
'Complete (To show mem use, run "pip install psutil")'
|
|
2893
|
-
)
|
|
2851
|
+
suffix = 'Complete (To show mem use, run "pip install psutil")'
|
|
2894
2852
|
printProgressBar(
|
|
2895
2853
|
successCount + failureCount,
|
|
2896
2854
|
totalResponses,
|
|
@@ -2903,23 +2861,15 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2903
2861
|
# Write the total count to the continueResp.URLScan.tmp file
|
|
2904
2862
|
try:
|
|
2905
2863
|
continueRespFileURLScan.seek(0)
|
|
2906
|
-
continueRespFileURLScan.write(
|
|
2907
|
-
str(successCount + failureCount) + "\n"
|
|
2908
|
-
)
|
|
2864
|
+
continueRespFileURLScan.write(str(successCount + failureCount) + "\n")
|
|
2909
2865
|
except Exception as e:
|
|
2910
2866
|
if verbose():
|
|
2911
|
-
writerr(
|
|
2912
|
-
colored(
|
|
2913
|
-
getSPACER("ERROR getURLScanDOM 2: " + str(e)), "red"
|
|
2914
|
-
)
|
|
2915
|
-
)
|
|
2867
|
+
writerr(colored(getSPACER("ERROR getURLScanDOM 2: " + str(e)), "red"))
|
|
2916
2868
|
|
|
2917
2869
|
except Exception as e:
|
|
2918
2870
|
if verbose():
|
|
2919
2871
|
writerr(
|
|
2920
|
-
colored(
|
|
2921
|
-
getSPACER('Error for "' + domUrl + '": ' + str(e)), "red"
|
|
2922
|
-
)
|
|
2872
|
+
colored('URLScan - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red")
|
|
2923
2873
|
)
|
|
2924
2874
|
|
|
2925
2875
|
except Exception as e:
|
|
@@ -2945,14 +2895,15 @@ def getURLScanUrls():
|
|
|
2945
2895
|
"""
|
|
2946
2896
|
Get URLs from the URLSCan API, urlscan.io
|
|
2947
2897
|
"""
|
|
2948
|
-
global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram,
|
|
2898
|
+
global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
|
|
2949
2899
|
|
|
2950
2900
|
# Write the file of URL's for the passed domain/URL
|
|
2951
2901
|
try:
|
|
2952
2902
|
requestsMade = 0
|
|
2953
|
-
|
|
2954
|
-
|
|
2955
|
-
|
|
2903
|
+
stopSourceURLScan = False
|
|
2904
|
+
linksFoundURLScan = set()
|
|
2905
|
+
totalUrls = 0
|
|
2906
|
+
checkResponse = True
|
|
2956
2907
|
|
|
2957
2908
|
# Set the URL to just the hostname
|
|
2958
2909
|
url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
|
|
@@ -2975,21 +2926,23 @@ def getURLScanUrls():
|
|
|
2975
2926
|
if args.mode == "R":
|
|
2976
2927
|
write(
|
|
2977
2928
|
colored(
|
|
2978
|
-
"The URLScan URL requested to get links for responses: ",
|
|
2929
|
+
"URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
|
|
2979
2930
|
"magenta",
|
|
2980
2931
|
)
|
|
2981
2932
|
+ colored(url + "\n", "white")
|
|
2982
2933
|
)
|
|
2983
2934
|
else:
|
|
2984
2935
|
write(
|
|
2985
|
-
colored(
|
|
2936
|
+
colored(
|
|
2937
|
+
"URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
|
|
2938
|
+
)
|
|
2986
2939
|
+ colored(url + "\n", "white")
|
|
2987
2940
|
)
|
|
2988
2941
|
|
|
2989
|
-
if not args.check_only:
|
|
2942
|
+
if args.mode in ("U", "B") and not args.check_only:
|
|
2990
2943
|
write(
|
|
2991
2944
|
colored(
|
|
2992
|
-
"
|
|
2945
|
+
"URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
|
|
2993
2946
|
"cyan",
|
|
2994
2947
|
)
|
|
2995
2948
|
)
|
|
@@ -3005,14 +2958,12 @@ def getURLScanUrls():
|
|
|
3005
2958
|
session.mount("https://", HTTP_ADAPTER)
|
|
3006
2959
|
session.mount("http://", HTTP_ADAPTER)
|
|
3007
2960
|
# Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
|
|
3008
|
-
resp = session.get(
|
|
3009
|
-
url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY}
|
|
3010
|
-
)
|
|
2961
|
+
resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
|
|
3011
2962
|
requestsMade = requestsMade + 1
|
|
3012
2963
|
except Exception as e:
|
|
3013
2964
|
write(
|
|
3014
2965
|
colored(
|
|
3015
|
-
|
|
2966
|
+
"URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
|
|
3016
2967
|
"red",
|
|
3017
2968
|
)
|
|
3018
2969
|
)
|
|
@@ -3027,15 +2978,17 @@ def getURLScanUrls():
|
|
|
3027
2978
|
if seconds <= args.urlscan_rate_limit_retry * 60:
|
|
3028
2979
|
writerr(
|
|
3029
2980
|
colored(
|
|
3030
|
-
|
|
3031
|
-
|
|
3032
|
-
|
|
3033
|
-
+ " seconds before continuing..."
|
|
3034
|
-
),
|
|
2981
|
+
"URLScan - [ 429 ] Rate limit reached, so waiting for another "
|
|
2982
|
+
+ str(seconds)
|
|
2983
|
+
+ " seconds before continuing...",
|
|
3035
2984
|
"yellow",
|
|
3036
2985
|
)
|
|
3037
2986
|
)
|
|
3038
|
-
|
|
2987
|
+
# Wait can be interrupted by SIGINT via interrupt_event
|
|
2988
|
+
interrupt_event.clear()
|
|
2989
|
+
if interrupt_event.wait(seconds + 1):
|
|
2990
|
+
# Interrupted by SIGINT
|
|
2991
|
+
return
|
|
3039
2992
|
try:
|
|
3040
2993
|
resp = session.get(
|
|
3041
2994
|
url,
|
|
@@ -3048,10 +3001,7 @@ def getURLScanUrls():
|
|
|
3048
3001
|
except Exception as e:
|
|
3049
3002
|
write(
|
|
3050
3003
|
colored(
|
|
3051
|
-
|
|
3052
|
-
"[ ERR ] Unable to get links from urlscan.io: "
|
|
3053
|
-
+ str(e)
|
|
3054
|
-
),
|
|
3004
|
+
"URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
|
|
3055
3005
|
"red",
|
|
3056
3006
|
)
|
|
3057
3007
|
)
|
|
@@ -3064,18 +3014,14 @@ def getURLScanUrls():
|
|
|
3064
3014
|
if resp.status_code == 429:
|
|
3065
3015
|
writerr(
|
|
3066
3016
|
colored(
|
|
3067
|
-
|
|
3068
|
-
"[ 429 ] URLScan rate limit reached so trying without API Key..."
|
|
3069
|
-
),
|
|
3017
|
+
"URLScan - [ 429 ] Rate limit reached so trying without API Key...",
|
|
3070
3018
|
"red",
|
|
3071
3019
|
)
|
|
3072
3020
|
)
|
|
3073
3021
|
else:
|
|
3074
3022
|
writerr(
|
|
3075
3023
|
colored(
|
|
3076
|
-
|
|
3077
|
-
"The URLScan API Key is invalid so trying without API Key..."
|
|
3078
|
-
),
|
|
3024
|
+
"URLScan - [ INF ] The API Key is invalid so trying without API Key...",
|
|
3079
3025
|
"red",
|
|
3080
3026
|
)
|
|
3081
3027
|
)
|
|
@@ -3085,64 +3031,54 @@ def getURLScanUrls():
|
|
|
3085
3031
|
except Exception as e:
|
|
3086
3032
|
writerr(
|
|
3087
3033
|
colored(
|
|
3088
|
-
|
|
3089
|
-
"[ ERR ] Unable to get links from urlscan.io: " + str(e)
|
|
3090
|
-
),
|
|
3034
|
+
"URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
|
|
3091
3035
|
"red",
|
|
3092
3036
|
)
|
|
3093
3037
|
)
|
|
3094
|
-
|
|
3038
|
+
checkResponse = False
|
|
3095
3039
|
|
|
3096
3040
|
# If the rate limit was reached end now
|
|
3097
3041
|
if resp.status_code == 429:
|
|
3098
3042
|
writerr(
|
|
3099
3043
|
colored(
|
|
3100
|
-
|
|
3101
|
-
"[ 429 ] URLScan rate limit reached without API Key so unable to get links."
|
|
3102
|
-
),
|
|
3044
|
+
"URLScan - [ 429 ] Rate limit reached without API Key so unable to get links.",
|
|
3103
3045
|
"red",
|
|
3104
3046
|
)
|
|
3105
3047
|
)
|
|
3106
|
-
|
|
3048
|
+
checkResponse = False
|
|
3107
3049
|
else:
|
|
3108
3050
|
writerr(
|
|
3109
3051
|
colored(
|
|
3110
|
-
|
|
3111
|
-
"[ 429 ] URLScan rate limit reached so unable to get links."
|
|
3112
|
-
),
|
|
3052
|
+
"URLScan - [ 429 ] Rate limit reached so unable to get links.",
|
|
3113
3053
|
"red",
|
|
3114
3054
|
)
|
|
3115
3055
|
)
|
|
3116
|
-
|
|
3056
|
+
checkResponse = False
|
|
3117
3057
|
elif resp.status_code != 200:
|
|
3118
3058
|
writerr(
|
|
3119
3059
|
colored(
|
|
3120
|
-
|
|
3121
|
-
|
|
3122
|
-
|
|
3123
|
-
+ " ] Unable to get links from urlscan.io"
|
|
3124
|
-
),
|
|
3060
|
+
"URLScan - [ "
|
|
3061
|
+
+ str(resp.status_code)
|
|
3062
|
+
+ " ] Unable to get links from urlscan.io",
|
|
3125
3063
|
"red",
|
|
3126
3064
|
)
|
|
3127
3065
|
)
|
|
3128
|
-
|
|
3066
|
+
checkResponse = False
|
|
3129
3067
|
|
|
3130
3068
|
try:
|
|
3131
|
-
|
|
3132
|
-
|
|
3069
|
+
if checkResponse:
|
|
3070
|
+
# Get the JSON response
|
|
3071
|
+
jsonResp = json.loads(resp.text.strip())
|
|
3133
3072
|
|
|
3134
|
-
|
|
3135
|
-
|
|
3073
|
+
# Get the number of results
|
|
3074
|
+
totalUrls = int(jsonResp["total"])
|
|
3136
3075
|
except Exception:
|
|
3137
3076
|
writerr(
|
|
3138
3077
|
colored(
|
|
3139
|
-
|
|
3140
|
-
"[ ERR ] There was an unexpected response from the URLScan API"
|
|
3141
|
-
),
|
|
3078
|
+
"URLScan - [ ERR ] There was an unexpected response from the API",
|
|
3142
3079
|
"red",
|
|
3143
3080
|
)
|
|
3144
3081
|
)
|
|
3145
|
-
totalUrls = 0
|
|
3146
3082
|
|
|
3147
3083
|
# Carry on if something was found
|
|
3148
3084
|
if args.check_only and args.mode != "R":
|
|
@@ -3150,12 +3086,12 @@ def getURLScanUrls():
|
|
|
3150
3086
|
hasMore = jsonResp["has_more"]
|
|
3151
3087
|
if hasMore:
|
|
3152
3088
|
write(
|
|
3153
|
-
colored("Get URLs from URLScan: ", "cyan")
|
|
3089
|
+
colored("URLScan - [ INFO ] Get URLs from URLScan: ", "cyan")
|
|
3154
3090
|
+ colored("UNKNOWN requests", "white")
|
|
3155
3091
|
)
|
|
3156
3092
|
else:
|
|
3157
3093
|
write(
|
|
3158
|
-
colored("Get URLs from URLScan: ", "cyan")
|
|
3094
|
+
colored("URLScan - [ INFO ] Get URLs from URLScan: ", "cyan")
|
|
3159
3095
|
+ colored("1 request", "white")
|
|
3160
3096
|
)
|
|
3161
3097
|
except Exception:
|
|
@@ -3166,7 +3102,7 @@ def getURLScanUrls():
|
|
|
3166
3102
|
# Carry on if something was found
|
|
3167
3103
|
if int(totalUrls) > 0:
|
|
3168
3104
|
|
|
3169
|
-
while not stopSource:
|
|
3105
|
+
while not stopSourceURLScan:
|
|
3170
3106
|
|
|
3171
3107
|
searchAfter = ""
|
|
3172
3108
|
|
|
@@ -3203,9 +3139,7 @@ def getURLScanUrls():
|
|
|
3203
3139
|
sort = urlSection["sort"]
|
|
3204
3140
|
except Exception:
|
|
3205
3141
|
sort = ""
|
|
3206
|
-
searchAfter = (
|
|
3207
|
-
"&search_after=" + str(sort[0]) + "," + str(sort[1])
|
|
3208
|
-
)
|
|
3142
|
+
searchAfter = "&search_after=" + str(sort[0]) + "," + str(sort[1])
|
|
3209
3143
|
|
|
3210
3144
|
# Get the HTTP code
|
|
3211
3145
|
try:
|
|
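The searchAfter handling above is cursor pagination: each urlscan.io page exposes a "sort" pair on its last result, which is echoed back as a search_after parameter to fetch the next page until has_more is false. A rough standalone sketch of that loop against the public urlscan.io search API (error handling and rate-limit retries trimmed):

import requests

def iter_urlscan_results(query, api_key=None, page_size=100):
    """Yield result dicts from urlscan.io, following search_after cursors."""
    headers = {"User-Agent": "example-agent"}
    if api_key:
        headers["API-Key"] = api_key
    search_after = ""
    while True:
        url = (
            "https://urlscan.io/api/v1/search/?q=" + query
            + "&size=" + str(page_size) + search_after
        )
        data = requests.get(url, headers=headers, timeout=30).json()
        results = data.get("results", [])
        if not results:
            break
        yield from results
        if not data.get("has_more"):
            break
        # The last result's "sort" pair becomes the cursor for the next page
        sort = results[-1]["sort"]
        search_after = "&search_after=" + str(sort[0]) + "," + str(sort[1])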
@@ -3243,7 +3177,7 @@ def getURLScanUrls():
|
|
|
3243
3177
|
if searchAfter != "":
|
|
3244
3178
|
|
|
3245
3179
|
keepTrying = True
|
|
3246
|
-
while not stopSource and keepTrying:
|
|
3180
|
+
while not stopSourceURLScan and keepTrying:
|
|
3247
3181
|
keepTrying = False
|
|
3248
3182
|
# Get the next page from urlscan.io
|
|
3249
3183
|
try:
|
|
@@ -3263,10 +3197,8 @@ def getURLScanUrls():
|
|
|
3263
3197
|
except Exception as e:
|
|
3264
3198
|
writerr(
|
|
3265
3199
|
colored(
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
+ str(e)
|
|
3269
|
-
),
|
|
3200
|
+
"URLScan - [ ERR ] Unable to get links from urlscan.io: "
|
|
3201
|
+
+ str(e),
|
|
3270
3202
|
"red",
|
|
3271
3203
|
)
|
|
3272
3204
|
)
|
|
@@ -3285,56 +3217,53 @@ def getURLScanUrls():
|
|
|
3285
3217
|
if seconds <= args.urlscan_rate_limit_retry * 60:
|
|
3286
3218
|
writerr(
|
|
3287
3219
|
colored(
|
|
3288
|
-
|
|
3289
|
-
|
|
3290
|
-
|
|
3291
|
-
+ " seconds before continuing..."
|
|
3292
|
-
),
|
|
3220
|
+
"URLScan - [ 429 ] Rate limit reached, so waiting for another "
|
|
3221
|
+
+ str(seconds)
|
|
3222
|
+
+ " seconds before continuing...",
|
|
3293
3223
|
"yellow",
|
|
3294
3224
|
)
|
|
3295
3225
|
)
|
|
3296
|
-
|
|
3226
|
+
# Wait can be interrupted by SIGINT via interrupt_event
|
|
3227
|
+
interrupt_event.clear()
|
|
3228
|
+
if interrupt_event.wait(seconds + 1):
|
|
3229
|
+
# Interrupted by SIGINT
|
|
3230
|
+
keepTrying = False
|
|
3231
|
+
break
|
|
3297
3232
|
keepTrying = True
|
|
3298
3233
|
continue
|
|
3299
3234
|
else:
|
|
3300
3235
|
writerr(
|
|
3301
3236
|
colored(
|
|
3302
|
-
|
|
3303
|
-
|
|
3304
|
-
|
|
3305
|
-
+ "), so stopping. Links that have already been retrieved will be saved."
|
|
3306
|
-
),
|
|
3237
|
+
"URLScan - [ 429 ] Rate limit reached (waiting time of "
|
|
3238
|
+
+ str(seconds)
|
|
3239
|
+
+ "), so stopping. Links that have already been retrieved will be saved.",
|
|
3307
3240
|
"red",
|
|
3308
3241
|
)
|
|
3309
3242
|
)
|
|
3310
|
-
|
|
3243
|
+
stopSourceURLScan = True
|
|
3311
3244
|
pass
|
|
3312
3245
|
else:
|
|
3313
3246
|
writerr(
|
|
3314
3247
|
colored(
|
|
3315
|
-
|
|
3316
|
-
"[ 429 ] URLScan rate limit reached, so stopping. Links that have already been retrieved will be saved."
|
|
3317
|
-
),
|
|
3248
|
+
"URLScan - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
|
|
3318
3249
|
"red",
|
|
3319
3250
|
)
|
|
3320
3251
|
)
|
|
3321
|
-
|
|
3252
|
+
stopSourceURLScan = True
|
|
3322
3253
|
pass
|
|
3323
3254
|
elif resp.status_code != 200:
|
|
3324
3255
|
writerr(
|
|
3325
3256
|
colored(
|
|
3326
|
-
|
|
3327
|
-
|
|
3328
|
-
|
|
3329
|
-
+ " ] Unable to get links from urlscan.io"
|
|
3330
|
-
),
|
|
3257
|
+
"URLScan - [ "
|
|
3258
|
+
+ str(resp.status_code)
|
|
3259
|
+
+ " ] Unable to get links from urlscan.io",
|
|
3331
3260
|
"red",
|
|
3332
3261
|
)
|
|
3333
3262
|
)
|
|
3334
|
-
|
|
3263
|
+
stopSourceURLScan = True
|
|
3335
3264
|
pass
|
|
3336
3265
|
|
|
3337
|
-
if not stopSource:
|
|
3266
|
+
if not stopSourceURLScan:
|
|
3338
3267
|
# Get the JSON response
|
|
3339
3268
|
jsonResp = json.loads(resp.text.strip())
|
|
3340
3269
|
|
|
@@ -3342,47 +3271,32 @@ def getURLScanUrls():
|
|
|
3342
3271
|
if (
|
|
3343
3272
|
jsonResp["results"] is None
|
|
3344
3273
|
or len(jsonResp["results"]) == 0
|
|
3345
|
-
or (
|
|
3346
|
-
args.limit_requests != 0
|
|
3347
|
-
and requestsMade > args.limit_requests
|
|
3348
|
-
)
|
|
3274
|
+
or (args.limit_requests != 0 and requestsMade > args.limit_requests)
|
|
3349
3275
|
or (
|
|
3350
3276
|
args.mode == "R"
|
|
3351
3277
|
and args.limit != 0
|
|
3352
3278
|
and requestsMade > args.limit
|
|
3353
3279
|
)
|
|
3354
3280
|
):
|
|
3355
|
-
|
|
3281
|
+
stopSourceURLScan = True
|
|
3356
3282
|
|
|
3357
3283
|
# Show the MIME types found (in case user wants to exclude more)
|
|
3358
3284
|
if verbose() and len(linkMimes) > 0 and args.mode != "R":
|
|
3359
3285
|
linkMimes.discard("warc/revisit")
|
|
3360
3286
|
write(
|
|
3361
|
-
|
|
3362
|
-
|
|
3363
|
-
+ colored(str(linkMimes), "white")
|
|
3364
|
-
)
|
|
3287
|
+
colored("URLScan - [ INFO ] MIME types found: ", "magenta")
|
|
3288
|
+
+ colored(str(linkMimes), "white")
|
|
3365
3289
|
+ "\n"
|
|
3366
3290
|
)
|
|
3367
3291
|
|
|
3368
|
-
linkCount = len(linksFound) - originalLinkCount
|
|
3369
3292
|
if args.mode != "R":
|
|
3370
|
-
|
|
3371
|
-
|
|
3372
|
-
|
|
3373
|
-
|
|
3374
|
-
|
|
3375
|
-
|
|
3376
|
-
|
|
3377
|
-
)
|
|
3378
|
-
else:
|
|
3379
|
-
write(
|
|
3380
|
-
getSPACER(
|
|
3381
|
-
colored("Extra links found on urlscan.io: ", "cyan")
|
|
3382
|
-
+ colored(str(linkCount), "white")
|
|
3383
|
-
)
|
|
3384
|
-
+ "\n"
|
|
3385
|
-
)
|
|
3293
|
+
linkCountURLScan = len(linksFoundURLScan)
|
|
3294
|
+
write(
|
|
3295
|
+
colored("URLScan - [ INFO ] Links found on urlscan.io: ", "cyan")
|
|
3296
|
+
+ colored(str(linkCountURLScan), "white")
|
|
3297
|
+
)
|
|
3298
|
+
linksFound.update(linksFoundURLScan)
|
|
3299
|
+
linksFoundURLScan.clear()
|
|
3386
3300
|
|
|
3387
3301
|
except Exception as e:
|
|
3388
3302
|
writerr(colored("ERROR getURLScanUrls 1: " + str(e), "red"))
|
|
@@ -3392,12 +3306,11 @@ def processWayBackPage(url):
|
|
|
3392
3306
|
"""
|
|
3393
3307
|
Get URLs from a specific page of archive.org CDX API for the input domain
|
|
3394
3308
|
"""
|
|
3395
|
-
global totalPages, linkMimes, linksFound,
|
|
3309
|
+
global totalPages, linkMimes, linksFound, stopSourceWayback, linkCountWayback, linksFoundWayback, current_response, current_session
|
|
3396
3310
|
try:
|
|
3397
3311
|
# Get memory in case it exceeds threshold
|
|
3398
3312
|
getMemory()
|
|
3399
|
-
|
|
3400
|
-
if not stopSource:
|
|
3313
|
+
if not stopSourceWayback:
|
|
3401
3314
|
try:
|
|
3402
3315
|
# Choose a random user agent string to use for any requests
|
|
3403
3316
|
resp = None
|
|
@@ -3406,229 +3319,231 @@ def processWayBackPage(url):
|
|
|
3406
3319
|
session = requests.Session()
|
|
3407
3320
|
session.mount("https://", HTTP_ADAPTER)
|
|
3408
3321
|
session.mount("http://", HTTP_ADAPTER)
|
|
3409
|
-
|
|
3410
|
-
|
|
3411
|
-
|
|
3412
|
-
|
|
3413
|
-
|
|
3414
|
-
|
|
3415
|
-
|
|
3416
|
-
|
|
3417
|
-
"red",
|
|
3418
|
-
)
|
|
3419
|
-
)
|
|
3420
|
-
resp = None
|
|
3421
|
-
return
|
|
3422
|
-
except Exception as e:
|
|
3423
|
-
writerr(
|
|
3424
|
-
colored(
|
|
3425
|
-
getSPACER(
|
|
3426
|
-
"[ ERR ] Error getting response for page "
|
|
3427
|
-
+ page
|
|
3428
|
-
+ " - "
|
|
3429
|
-
+ str(e)
|
|
3430
|
-
),
|
|
3431
|
-
"red",
|
|
3432
|
-
)
|
|
3322
|
+
# expose session so SIGINT handler can close it to interrupt blocking network I/O
|
|
3323
|
+
try:
|
|
3324
|
+
current_session = session
|
|
3325
|
+
except Exception:
|
|
3326
|
+
pass
|
|
3327
|
+
|
|
3328
|
+
resp = session.get(
|
|
3329
|
+
url, headers={"User-Agent": userAgent}, stream=True, timeout=args.timeout
|
|
3433
3330
|
)
|
|
3434
|
-
|
|
3435
|
-
return
|
|
3436
|
-
finally:
|
|
3331
|
+
# expose live response so SIGINT handler can close it to interrupt blocking I/O
|
|
3437
3332
|
try:
|
|
3438
|
-
|
|
3439
|
-
|
|
3440
|
-
|
|
3441
|
-
|
|
3442
|
-
|
|
3443
|
-
|
|
3444
|
-
|
|
3445
|
-
|
|
3446
|
-
|
|
3447
|
-
|
|
3448
|
-
|
|
3449
|
-
|
|
3450
|
-
|
|
3451
|
-
|
|
3452
|
-
|
|
3453
|
-
|
|
3454
|
-
|
|
3455
|
-
|
|
3456
|
-
|
|
3457
|
-
|
|
3458
|
-
colored(
|
|
3459
|
-
"\r[ 429 ] Wayback Machine (archive.org) rate limit reached, so waiting for "
|
|
3460
|
-
+ str(seconds)
|
|
3461
|
-
+ " seconds before continuing...\r",
|
|
3462
|
-
"yellow",
|
|
3463
|
-
)
|
|
3464
|
-
)
|
|
3465
|
-
time.sleep(seconds)
|
|
3466
|
-
try:
|
|
3467
|
-
resp = session.get(
|
|
3468
|
-
url, headers={"User-Agent": userAgent}
|
|
3469
|
-
)
|
|
3470
|
-
except ConnectionError:
|
|
3471
|
-
writerr(
|
|
3472
|
-
colored(
|
|
3473
|
-
getSPACER(
|
|
3474
|
-
"[ ERR ] Wayback Machine (archive.org) connection error for page "
|
|
3475
|
-
+ page
|
|
3476
|
-
),
|
|
3477
|
-
"red",
|
|
3478
|
-
)
|
|
3333
|
+
current_response = resp
|
|
3334
|
+
except Exception:
|
|
3335
|
+
pass
|
|
3336
|
+
# Check response status in the finally block
|
|
3337
|
+
if resp is not None:
|
|
3338
|
+
# If the status is 429 (rate limited), wait and retry if configured, otherwise stop processing Wayback Machine
|
|
3339
|
+
if resp.status_code == 429:
|
|
3340
|
+
if args.wayback_rate_limit_retry > 0:
|
|
3341
|
+
seconds = args.wayback_rate_limit_retry * 60
|
|
3342
|
+
if args.processes == 1:
|
|
3343
|
+
writerr(
|
|
3344
|
+
colored(
|
|
3345
|
+
"Wayback - [ 429 ] Rate limit reached on page "
|
|
3346
|
+
+ str(page)
|
|
3347
|
+
+ " of "
|
|
3348
|
+
+ str(totalPages)
|
|
3349
|
+
+ ", so waiting for "
|
|
3350
|
+
+ str(seconds)
|
|
3351
|
+
+ " seconds before continuing...",
|
|
3352
|
+
"yellow",
|
|
3479
3353
|
)
|
|
3480
|
-
|
|
3481
|
-
|
|
3482
|
-
|
|
3483
|
-
|
|
3484
|
-
|
|
3485
|
-
|
|
3486
|
-
|
|
3487
|
-
|
|
3488
|
-
+ " - "
|
|
3489
|
-
+ str(e)
|
|
3490
|
-
),
|
|
3491
|
-
"red",
|
|
3492
|
-
)
|
|
3354
|
+
)
|
|
3355
|
+
else:
|
|
3356
|
+
writerr(
|
|
3357
|
+
colored(
|
|
3358
|
+
"Wayback - [ 429 ] Rate limit reached, so waiting for "
|
|
3359
|
+
+ str(seconds)
|
|
3360
|
+
+ " seconds before continuing...",
|
|
3361
|
+
"yellow",
|
|
3493
3362
|
)
|
|
3494
|
-
resp = None
|
|
3495
|
-
return
|
|
3496
|
-
|
|
3497
|
-
if resp.status_code == 429:
|
|
3498
|
-
writerr(
|
|
3499
|
-
colored(
|
|
3500
|
-
getSPACER(
|
|
3501
|
-
"[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
|
|
3502
|
-
),
|
|
3503
|
-
"red",
|
|
3504
3363
|
)
|
|
3505
|
-
|
|
3506
|
-
|
|
3507
|
-
|
|
3508
|
-
|
|
3509
|
-
|
|
3510
|
-
|
|
3511
|
-
|
|
3512
|
-
|
|
3513
|
-
|
|
3514
|
-
|
|
3515
|
-
"red",
|
|
3364
|
+
# Wait can be interrupted by SIGINT via interrupt_event
|
|
3365
|
+
interrupt_event.clear()
|
|
3366
|
+
if interrupt_event.wait(seconds):
|
|
3367
|
+
return
|
|
3368
|
+
try:
|
|
3369
|
+
resp = session.get(
|
|
3370
|
+
url,
|
|
3371
|
+
headers={"User-Agent": userAgent},
|
|
3372
|
+
stream=True,
|
|
3373
|
+
timeout=args.timeout,
|
|
3516
3374
|
)
|
|
3517
|
-
|
|
3518
|
-
|
|
3519
|
-
|
|
3520
|
-
|
|
3521
|
-
|
|
3522
|
-
if verbose():
|
|
3375
|
+
try:
|
|
3376
|
+
current_response = resp
|
|
3377
|
+
except Exception:
|
|
3378
|
+
pass
|
|
3379
|
+
except ConnectionError:
|
|
3523
3380
|
writerr(
|
|
3524
3381
|
colored(
|
|
3525
|
-
|
|
3526
|
-
"[ ERR ] "
|
|
3527
|
-
+ url
|
|
3528
|
-
+ " gave an empty response."
|
|
3529
|
-
),
|
|
3382
|
+
"Wayback - [ ERR ] Connection error for page " + page,
|
|
3530
3383
|
"red",
|
|
3531
3384
|
)
|
|
3532
3385
|
)
|
|
3533
|
-
|
|
3534
|
-
|
|
3535
|
-
|
|
3536
|
-
if verbose():
|
|
3386
|
+
resp = None
|
|
3387
|
+
return
|
|
3388
|
+
except Exception as e:
|
|
3537
3389
|
writerr(
|
|
3538
3390
|
colored(
|
|
3539
|
-
|
|
3540
|
-
|
|
3541
|
-
|
|
3542
|
-
|
|
3543
|
-
+ url
|
|
3544
|
-
),
|
|
3391
|
+
"Wayback - [ ERR ] Error getting response for page "
|
|
3392
|
+
+ page
|
|
3393
|
+
+ " - "
|
|
3394
|
+
+ str(e),
|
|
3545
3395
|
"red",
|
|
3546
3396
|
)
|
|
3547
3397
|
)
|
|
3548
|
-
|
|
3549
|
-
|
|
3550
|
-
|
|
3551
|
-
|
|
3552
|
-
|
|
3553
|
-
|
|
3554
|
-
|
|
3555
|
-
|
|
3556
|
-
|
|
3398
|
+
resp = None
|
|
3399
|
+
return
|
|
3400
|
+
|
|
3401
|
+
if resp.status_code == 429:
|
|
3402
|
+
writerr(
|
|
3403
|
+
colored(
|
|
3404
|
+
"Wayback - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
|
|
3405
|
+
"red",
|
|
3406
|
+
)
|
|
3557
3407
|
)
|
|
3558
|
-
|
|
3559
|
-
|
|
3560
|
-
|
|
3561
|
-
|
|
3562
|
-
|
|
3563
|
-
|
|
3564
|
-
|
|
3565
|
-
"
|
|
3566
|
-
|
|
3567
|
-
+ " - "
|
|
3568
|
-
+ str(e)
|
|
3569
|
-
),
|
|
3570
|
-
"red",
|
|
3408
|
+
stopSourceWayback = True
|
|
3409
|
+
return
|
|
3410
|
+
# If the status is 503, then the site is unavailable
|
|
3411
|
+
if resp.status_code == 503:
|
|
3412
|
+
writerr(
|
|
3413
|
+
colored(
|
|
3414
|
+
"Wayback - [ 503 ] The Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.",
|
|
3415
|
+
"red",
|
|
3416
|
+
)
|
|
3571
3417
|
)
|
|
3572
|
-
|
|
3573
|
-
|
|
3574
|
-
|
|
3418
|
+
stopSourceWayback = True
|
|
3419
|
+
return
|
|
3420
|
+
# If a status other than 200, then stop
|
|
3421
|
+
if resp.status_code != 200:
|
|
3422
|
+
if verbose():
|
|
3423
|
+
writerr(
|
|
3424
|
+
colored(
|
|
3425
|
+
"Wayback - [ " + str(resp.status_code) + " ] Error for " + url,
|
|
3426
|
+
"red",
|
|
3427
|
+
)
|
|
3428
|
+
)
|
|
3429
|
+
try:
|
|
3430
|
+
current_response = None
|
|
3431
|
+
except Exception:
|
|
3432
|
+
pass
|
|
3433
|
+
try:
|
|
3434
|
+
current_session = None
|
|
3435
|
+
except Exception:
|
|
3436
|
+
pass
|
|
3437
|
+
return
|
|
3575
3438
|
|
|
3576
|
-
|
|
3577
|
-
|
|
3439
|
+
# Get the URLs and MIME types. Each line is a separate JSON string
|
|
3440
|
+
# Process lines as they arrive - if connection drops, we keep what we've already processed
|
|
3578
3441
|
for line in resp.iter_lines():
|
|
3579
|
-
|
|
3580
|
-
|
|
3442
|
+
try:
|
|
3443
|
+
results = line.decode("utf-8")
|
|
3444
|
+
foundUrl = fixArchiveOrgUrl(str(results).split(" ")[1])
|
|
3581
3445
|
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
|
|
3586
|
-
|
|
3587
|
-
|
|
3588
|
-
|
|
3589
|
-
|
|
3590
|
-
|
|
3591
|
-
|
|
3592
|
-
|
|
3593
|
-
|
|
3446
|
+
# If --filter-responses-only wasn't used, then check the URL exclusions
|
|
3447
|
+
if args.filter_responses_only:
|
|
3448
|
+
match = None
|
|
3449
|
+
else:
|
|
3450
|
+
match = re.search(
|
|
3451
|
+
r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
|
|
3452
|
+
foundUrl,
|
|
3453
|
+
flags=re.IGNORECASE,
|
|
3454
|
+
)
|
|
3455
|
+
if match is None:
|
|
3456
|
+
# Only get MIME Types if --verbose option was selected
|
|
3457
|
+
if verbose():
|
|
3458
|
+
try:
|
|
3459
|
+
mimeType = str(results).split(" ")[2]
|
|
3460
|
+
if mimeType != "":
|
|
3461
|
+
linkMimes.add(mimeType)
|
|
3462
|
+
except Exception:
|
|
3463
|
+
if verbose():
|
|
3464
|
+
writerr(
|
|
3465
|
+
colored(
|
|
3466
|
+
getSPACER(
|
|
3467
|
+
"ERROR processWayBackPage 2: Cannot get MIME type from line: "
|
|
3468
|
+
+ str(line)
|
|
3469
|
+
),
|
|
3470
|
+
"red",
|
|
3471
|
+
)
|
|
3472
|
+
)
|
|
3594
3473
|
try:
|
|
3595
|
-
|
|
3596
|
-
|
|
3597
|
-
linkMimes.add(mimeType)
|
|
3474
|
+
linksFoundAdd(foundUrl, linksFoundWayback)
|
|
3475
|
+
|
|
3598
3476
|
except Exception:
|
|
3599
3477
|
if verbose():
|
|
3600
3478
|
writerr(
|
|
3601
3479
|
colored(
|
|
3602
3480
|
getSPACER(
|
|
3603
|
-
"ERROR processWayBackPage
|
|
3481
|
+
"ERROR processWayBackPage 3: Cannot get link from line: "
|
|
3604
3482
|
+ str(line)
|
|
3605
3483
|
),
|
|
3606
3484
|
"red",
|
|
3607
3485
|
)
|
|
3608
3486
|
)
|
|
3609
|
-
|
|
3610
|
-
|
|
3611
|
-
|
|
3612
|
-
|
|
3613
|
-
|
|
3614
|
-
writerr(
|
|
3615
|
-
colored(
|
|
3616
|
-
getSPACER(
|
|
3617
|
-
"ERROR processWayBackPage 3: Cannot get link from line: "
|
|
3618
|
-
+ str(line)
|
|
3619
|
-
),
|
|
3620
|
-
"red",
|
|
3621
|
-
)
|
|
3487
|
+
except Exception:
|
|
3488
|
+
if verbose():
|
|
3489
|
+
writerr(
|
|
3490
|
+
colored(
|
|
3491
|
+
getSPACER("ERROR processWayBackPage 4: " + str(line)), "red"
|
|
3622
3492
|
)
|
|
3623
|
-
|
|
3624
|
-
|
|
3625
|
-
|
|
3493
|
+
)
|
|
3494
|
+
|
|
3495
|
+
except ConnectionError:
|
|
3496
|
+
writerr(
|
|
3497
|
+
colored(
|
|
3498
|
+
"Wayback - [ ERR ] Connection error for page "
|
|
3499
|
+
+ page
|
|
3500
|
+
+ (
|
|
3501
|
+
f" (saved {len(linksFoundWayback)} URLs before error)"
|
|
3502
|
+
if len(linksFoundWayback) > 0
|
|
3503
|
+
else ""
|
|
3504
|
+
),
|
|
3505
|
+
"red",
|
|
3506
|
+
)
|
|
3507
|
+
)
|
|
3508
|
+
try:
|
|
3509
|
+
current_response = None
|
|
3510
|
+
except Exception:
|
|
3511
|
+
pass
|
|
3512
|
+
try:
|
|
3513
|
+
current_session = None
|
|
3514
|
+
except Exception:
|
|
3515
|
+
pass
|
|
3516
|
+
return
|
|
3517
|
+
except Exception as e:
|
|
3518
|
+
# Even if connection drops, we've already saved the URLs processed so far
|
|
3519
|
+
if len(linksFoundWayback) > 0:
|
|
3520
|
+
writerr(
|
|
3521
|
+
colored(
|
|
3522
|
+
f"Wayback - [ WARN ] Error getting response for page {page} - {str(e)} (saved {len(linksFoundWayback)} URLs before error)",
|
|
3523
|
+
"yellow",
|
|
3524
|
+
)
|
|
3525
|
+
)
|
|
3526
|
+
else:
|
|
3626
3527
|
writerr(
|
|
3627
3528
|
colored(
|
|
3628
|
-
|
|
3529
|
+
"Wayback - [ ERR ] Error getting response for page "
|
|
3530
|
+
+ page
|
|
3531
|
+
+ " - "
|
|
3532
|
+
+ str(e),
|
|
3533
|
+
"red",
|
|
3629
3534
|
)
|
|
3630
3535
|
)
|
|
3536
|
+
try:
|
|
3537
|
+
current_response = None
|
|
3538
|
+
except Exception:
|
|
3539
|
+
pass
|
|
3540
|
+
try:
|
|
3541
|
+
current_session = None
|
|
3542
|
+
except Exception:
|
|
3543
|
+
pass
|
|
3544
|
+
return
|
|
3631
3545
|
else:
|
|
3546
|
+
print("DEBUG: HERE END!") # DEBUG
|
|
3632
3547
|
pass
|
|
3633
3548
|
except Exception as e:
|
|
3634
3549
|
if verbose():
|
|
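processWayBackPage now parks the live Session and streaming Response in the current_session / current_response globals so a SIGINT handler can close them and unblock a thread stuck in network I/O, and it processes CDX lines as they arrive so anything already parsed survives a dropped connection. The handler itself sits outside this hunk; a minimal sketch of the idea, with illustrative names:

import signal
import requests

current_session = None
current_response = None

def _handle_sigint(signum, frame):
    # Closing the in-flight response/session makes a blocking iter_lines()
    # raise, so the worker can stop instead of hanging on network I/O.
    for obj in (current_response, current_session):
        try:
            if obj is not None:
                obj.close()
        except Exception:
            pass

signal.signal(signal.SIGINT, _handle_sigint)

def stream_page(url):
    global current_session, current_response
    current_session = requests.Session()
    current_response = current_session.get(url, stream=True, timeout=30)
    try:
        # Yield lines as they arrive - anything consumed before an error is kept
        for line in current_response.iter_lines():
            yield line.decode("utf-8")
    finally:
        current_response = None
        current_session = None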
@@ -3639,40 +3554,47 @@ def getWaybackUrls():
|
|
|
3639
3554
|
"""
|
|
3640
3555
|
Get URLs from the Wayback Machine, archive.org
|
|
3641
3556
|
"""
|
|
3642
|
-
global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages,
|
|
3557
|
+
global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages, stopSourceWayback, argsInput, checkWayback, linkCountWayback, linksFoundWayback
|
|
3643
3558
|
|
|
3644
3559
|
# Write the file of URL's for the passed domain/URL
|
|
3645
3560
|
try:
|
|
3646
|
-
|
|
3561
|
+
stopSourceWayback = False
|
|
3562
|
+
linksFoundWayback = set()
|
|
3647
3563
|
|
|
3648
3564
|
if MATCH_MIME != "":
|
|
3649
3565
|
filterMIME = "&filter=mimetype:" + re.escape(MATCH_MIME).replace(",", "|")
|
|
3650
3566
|
else:
|
|
3651
|
-
filterMIME = "&filter=!mimetype:warc/revisit|" + re.escape(
|
|
3652
|
-
|
|
3653
|
-
)
|
|
3567
|
+
filterMIME = "&filter=!mimetype:warc/revisit|" + re.escape(FILTER_MIME).replace(
|
|
3568
|
+
",", "|"
|
|
3569
|
+
)
|
|
3654
3570
|
# If there are any \+ in the MIME types, e.g. image/svg\+xml (the backslash is because it was previously escaped), then replace the \+ with a . otherwise the wayback API does not recognise it
|
|
3655
3571
|
filterMIME = filterMIME.replace("\+", ".")
|
|
3656
3572
|
|
|
3657
3573
|
if MATCH_CODE != "":
|
|
3658
3574
|
filterCode = "&filter=statuscode:" + re.escape(MATCH_CODE).replace(",", "|")
|
|
3659
3575
|
else:
|
|
3660
|
-
filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(
|
|
3661
|
-
",", "|"
|
|
3662
|
-
)
|
|
3576
|
+
filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(",", "|")
|
|
3663
3577
|
|
|
3664
3578
|
# Set keywords filter if -ko argument passed
|
|
3665
3579
|
filterKeywords = ""
|
|
3666
3580
|
if args.keywords_only:
|
|
3667
3581
|
if args.keywords_only == "#CONFIG":
|
|
3668
3582
|
filterKeywords = (
|
|
3669
|
-
"&filter=original:.*("
|
|
3670
|
-
+ re.escape(FILTER_KEYWORDS).replace(",", "|")
|
|
3671
|
-
+ ").*"
|
|
3583
|
+
"&filter=original:.*(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ").*"
|
|
3672
3584
|
)
|
|
3673
3585
|
else:
|
|
3674
3586
|
filterKeywords = "&filter=original:.*(" + args.keywords_only + ").*"
|
|
3675
3587
|
|
|
3588
|
+
# Add the date filters if they were passed
|
|
3589
|
+
if args.from_date is None:
|
|
3590
|
+
filterFrom = ""
|
|
3591
|
+
else:
|
|
3592
|
+
filterFrom = "&from=" + str(args.from_date)
|
|
3593
|
+
if args.to_date is None:
|
|
3594
|
+
filterTo = ""
|
|
3595
|
+
else:
|
|
3596
|
+
filterTo = "&to=" + str(args.to_date)
|
|
3597
|
+
|
|
3676
3598
|
if args.filter_responses_only:
|
|
3677
3599
|
url = (
|
|
3678
3600
|
WAYBACK_URL.replace("{DOMAIN}", subs + quote(argsInput) + path).replace(
|
|
@@ -3688,6 +3610,8 @@ def getWaybackUrls():
|
|
|
3688
3610
|
+ filterMIME
|
|
3689
3611
|
+ filterCode
|
|
3690
3612
|
+ filterKeywords
|
|
3613
|
+
+ filterFrom
|
|
3614
|
+
+ filterTo
|
|
3691
3615
|
+ "&page="
|
|
3692
3616
|
)
|
|
3693
3617
|
|
|
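The new filterFrom / filterTo fragments ride on the CDX API's from= and to= query parameters, appended alongside the existing mimetype/statuscode filters. A hedged sketch of assembling such a query URL - parameter names follow the archive.org CDX server, but the exact base template the package uses lives in its WAYBACK_URL constant:

from urllib.parse import quote

def build_cdx_url(domain, from_date=None, to_date=None,
                  exclude_codes="301,302,404", page=0):
    """Build an archive.org CDX query; dates are e.g. '2023' or '20230101'."""
    url = (
        "https://web.archive.org/cdx/search/cdx?url=*." + quote(domain) + "/*"
        "&output=text&fl=timestamp,original,mimetype,statuscode,digest"
        "&collapse=urlkey"
        "&filter=!statuscode:" + exclude_codes.replace(",", "|")
    )
    if from_date:
        url += "&from=" + str(from_date)
    if to_date:
        url += "&to=" + str(to_date)
    url += "&page=" + str(page)
    return url

print(build_cdx_url("example.com", from_date="2022", to_date="2023"))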
@@ -3697,7 +3621,7 @@ def getWaybackUrls():
|
|
|
3697
3621
|
if not args.check_only:
|
|
3698
3622
|
write(
|
|
3699
3623
|
colored(
|
|
3700
|
-
"
|
|
3624
|
+
"Wayback - [ INFO ] Getting the number of pages to search...",
|
|
3701
3625
|
"cyan",
|
|
3702
3626
|
)
|
|
3703
3627
|
)
|
|
@@ -3706,9 +3630,7 @@ def getWaybackUrls():
|
|
|
3706
3630
|
session = requests.Session()
|
|
3707
3631
|
session.mount("https://", HTTP_ADAPTER)
|
|
3708
3632
|
session.mount("http://", HTTP_ADAPTER)
|
|
3709
|
-
resp = session.get(
|
|
3710
|
-
url + "&showNumPages=True", headers={"User-Agent": userAgent}
|
|
3711
|
-
)
|
|
3633
|
+
resp = session.get(url + "&showNumPages=True", headers={"User-Agent": userAgent})
|
|
3712
3634
|
# Try to get the total number of pages. If there is a problem, we'll return totalPages = 0 which means we'll get everything back in one request
|
|
3713
3635
|
try:
|
|
3714
3636
|
totalPages = int(resp.text.strip())
|
|
@@ -3724,9 +3646,7 @@ def getWaybackUrls():
|
|
|
3724
3646
|
if resp.status_code == 429:
|
|
3725
3647
|
writerr(
|
|
3726
3648
|
colored(
|
|
3727
|
-
|
|
3728
|
-
"[ 429 ] Wayback Machine (Archive.org) rate limit reached so unable to get links."
|
|
3729
|
-
),
|
|
3649
|
+
"Wayback - [ 429 ] Rate limit reached so unable to get links.",
|
|
3730
3650
|
"red",
|
|
3731
3651
|
)
|
|
3732
3652
|
)
|
|
@@ -3736,9 +3656,7 @@ def getWaybackUrls():
|
|
|
3736
3656
|
if resp.status_code == 503:
|
|
3737
3657
|
writerr(
|
|
3738
3658
|
colored(
|
|
3739
|
-
|
|
3740
|
-
"[ 503 ] Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
|
|
3741
|
-
),
|
|
3659
|
+
"Wayback - [ 503 ] The Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.",
|
|
3742
3660
|
"red",
|
|
3743
3661
|
)
|
|
3744
3662
|
)
|
|
@@ -3747,19 +3665,15 @@ def getWaybackUrls():
|
|
|
3747
3665
|
if resp.text.lower().find("blocked site error") > 0:
|
|
3748
3666
|
writerr(
|
|
3749
3667
|
colored(
|
|
3750
|
-
|
|
3751
|
-
"[ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)"
|
|
3752
|
-
),
|
|
3668
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)",
|
|
3753
3669
|
"red",
|
|
3754
3670
|
)
|
|
3755
3671
|
)
|
|
3756
3672
|
else:
|
|
3757
3673
|
writerr(
|
|
3758
3674
|
colored(
|
|
3759
|
-
|
|
3760
|
-
|
|
3761
|
-
+ str(resp.text.strip())
|
|
3762
|
-
),
|
|
3675
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): "
|
|
3676
|
+
+ str(resp.text.strip()),
|
|
3763
3677
|
"red",
|
|
3764
3678
|
)
|
|
3765
3679
|
)
|
|
@@ -3767,28 +3681,22 @@ def getWaybackUrls():
|
|
|
3767
3681
|
if str(e).lower().find("alert access denied"):
|
|
3768
3682
|
writerr(
|
|
3769
3683
|
colored(
|
|
3770
|
-
|
|
3771
|
-
"[ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don't know, but it has happened!)"
|
|
3772
|
-
),
|
|
3684
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don't know, but it has happened!)",
|
|
3773
3685
|
"red",
|
|
3774
3686
|
)
|
|
3775
3687
|
)
|
|
3776
3688
|
elif str(e).lower().find("connection refused"):
|
|
3777
3689
|
writerr(
|
|
3778
3690
|
colored(
|
|
3779
|
-
|
|
3780
|
-
"[ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)"
|
|
3781
|
-
),
|
|
3691
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)",
|
|
3782
3692
|
"red",
|
|
3783
3693
|
)
|
|
3784
3694
|
)
|
|
3785
3695
|
else:
|
|
3786
3696
|
writerr(
|
|
3787
3697
|
colored(
|
|
3788
|
-
|
|
3789
|
-
|
|
3790
|
-
+ str(e)
|
|
3791
|
-
),
|
|
3698
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): "
|
|
3699
|
+
+ str(e),
|
|
3792
3700
|
"red",
|
|
3793
3701
|
)
|
|
3794
3702
|
)
|
|
@@ -3798,27 +3706,29 @@ def getWaybackUrls():
|
|
|
3798
3706
|
if totalPages < 0:
|
|
3799
3707
|
write(
|
|
3800
3708
|
colored(
|
|
3801
|
-
"Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.",
|
|
3709
|
+
"Wayback - [ INFO ] Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.",
|
|
3802
3710
|
"cyan",
|
|
3803
3711
|
)
|
|
3804
3712
|
)
|
|
3805
3713
|
else:
|
|
3806
3714
|
checkWayback = totalPages
|
|
3807
3715
|
write(
|
|
3808
|
-
colored("Get URLs from Wayback Machine: ", "cyan")
|
|
3716
|
+
colored("Wayback - [ INFO ] Get URLs from Wayback Machine: ", "cyan")
|
|
3809
3717
|
+ colored(str(checkWayback) + " requests", "white")
|
|
3810
3718
|
)
|
|
3811
3719
|
else:
|
|
3812
3720
|
if verbose():
|
|
3813
3721
|
write(
|
|
3814
|
-
colored(
|
|
3722
|
+
colored(
|
|
3723
|
+
"Wayback - [ INFO ] The archive URL requested to get links: ", "magenta"
|
|
3724
|
+
)
|
|
3815
3725
|
+ colored(url + "\n", "white")
|
|
3816
3726
|
)
|
|
3817
3727
|
|
|
3818
3728
|
if totalPages < 0:
|
|
3819
3729
|
write(
|
|
3820
3730
|
colored(
|
|
3821
|
-
"
|
|
3731
|
+
"Wayback - [ INFO ] Getting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...",
|
|
3822
3732
|
"cyan",
|
|
3823
3733
|
)
|
|
3824
3734
|
)
|
|
@@ -3828,9 +3738,9 @@ def getWaybackUrls():
|
|
|
3828
3738
|
# if the page number was found then display it, but otherwise we will just try to increment until we have everything
|
|
3829
3739
|
write(
|
|
3830
3740
|
colored(
|
|
3831
|
-
"
|
|
3741
|
+
"Wayback - [ INFO ] Getting links from "
|
|
3832
3742
|
+ str(totalPages)
|
|
3833
|
-
+ " Wayback Machine (archive.org) API requests (this can take a while for some domains)
|
|
3743
|
+
+ " Wayback Machine (archive.org) API requests (this can take a while for some domains)...",
|
|
3834
3744
|
"cyan",
|
|
3835
3745
|
)
|
|
3836
3746
|
)
|
|
@@ -3854,25 +3764,22 @@ def getWaybackUrls():
|
|
|
3854
3764
|
if verbose() and len(linkMimes) > 0:
|
|
3855
3765
|
linkMimes.discard("warc/revisit")
|
|
3856
3766
|
write(
|
|
3857
|
-
|
|
3858
|
-
|
|
3859
|
-
+ colored(str(linkMimes), "white")
|
|
3860
|
-
)
|
|
3767
|
+
colored("Wayback - [ INFO ] MIME types found: ", "magenta")
|
|
3768
|
+
+ colored(str(linkMimes), "white")
|
|
3861
3769
|
+ "\n"
|
|
3862
3770
|
)
|
|
3863
3771
|
linkMimes = None
|
|
3864
3772
|
|
|
3865
3773
|
if not args.xwm:
|
|
3866
|
-
|
|
3774
|
+
linkCountWayback = len(linksFoundWayback)
|
|
3867
3775
|
write(
|
|
3868
|
-
|
|
3869
|
-
|
|
3870
|
-
"Links found on Wayback Machine (archive.org): ", "cyan"
|
|
3871
|
-
)
|
|
3872
|
-
+ colored(str(linkCount), "white")
|
|
3776
|
+
colored(
|
|
3777
|
+
"Wayback - [ INFO ] Links found on Wayback Machine (archive.org): ", "cyan"
|
|
3873
3778
|
)
|
|
3874
|
-
+ "
|
|
3779
|
+
+ colored(str(linkCountWayback), "white")
|
|
3875
3780
|
)
|
|
3781
|
+
linksFound.update(linksFoundWayback)
|
|
3782
|
+
linksFoundWayback.clear()
|
|
3876
3783
|
|
|
3877
3784
|
except Exception as e:
|
|
3878
3785
|
writerr(colored("ERROR getWaybackUrls 1: " + str(e), "red"))
|
|
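Each source now accumulates into its own set (linksFoundWayback here; linksFoundURLScan, linksFoundCommonCrawl and friends elsewhere) and only merges into the global linksFound once the source finishes, so a crash or rate limit in one source cannot lose another source's links. A small sketch of that accumulate-then-merge shape, with made-up source generators:

linksFound = set()

def collect_source(fetch_links):
    """Run one source; return whatever it managed to gather, even on error."""
    found = set()
    try:
        for link in fetch_links():
            found.add(link)
    except Exception as e:
        print(f"source failed after {len(found)} links: {e}")
    return found

def wayback_links():
    yield "https://example.com/a"
    yield "https://example.com/b"

def urlscan_links():
    yield "https://example.com/b"
    raise ConnectionError("dropped mid-stream")

for source in (wayback_links, urlscan_links):
    partial = collect_source(source)
    print(f"{source.__name__}: {len(partial)} links kept")
    linksFound.update(partial)  # merge, then discard the per-source set
    partial.clear()

print(f"total unique links: {len(linksFound)}")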
@@ -3882,13 +3789,13 @@ def processCommonCrawlCollection(cdxApiUrl):
|
|
|
3882
3789
|
"""
|
|
3883
3790
|
Get URLs from a given Common Crawl index collection
|
|
3884
3791
|
"""
|
|
3885
|
-
global subs, path, linksFound, linkMimes,
|
|
3792
|
+
global subs, path, linksFound, linkMimes, stopSourceCommonCrawl, argsInput, linkCountCommonCrawl, linksFoundCommonCrawl, current_response, current_session
|
|
3886
3793
|
|
|
3887
3794
|
try:
|
|
3888
3795
|
# Get memory in case it exceeds threshold
|
|
3889
3796
|
getMemory()
|
|
3890
3797
|
|
|
3891
|
-
if not stopSource:
|
|
3798
|
+
if not stopSourceCommonCrawl:
|
|
3892
3799
|
# Set mime content type filter
|
|
3893
3800
|
if MATCH_MIME.strip() != "":
|
|
3894
3801
|
filterMIME = "&filter=~mime:("
|
|
@@ -3902,31 +3809,21 @@ def processCommonCrawlCollection(cdxApiUrl):
|
|
|
3902
3809
|
# Set status code filter
|
|
3903
3810
|
filterCode = ""
|
|
3904
3811
|
if MATCH_CODE.strip() != "":
|
|
3905
|
-
filterCode = (
|
|
3906
|
-
"&filter=~status:(" + re.escape(MATCH_CODE).replace(",", "|") + ")"
|
|
3907
|
-
)
|
|
3812
|
+
filterCode = "&filter=~status:(" + re.escape(MATCH_CODE).replace(",", "|") + ")"
|
|
3908
3813
|
else:
|
|
3909
|
-
filterCode = (
|
|
3910
|
-
"&filter=!~status:("
|
|
3911
|
-
+ re.escape(FILTER_CODE).replace(",", "|")
|
|
3912
|
-
+ ")"
|
|
3913
|
-
)
|
|
3814
|
+
filterCode = "&filter=!~status:(" + re.escape(FILTER_CODE).replace(",", "|") + ")"
|
|
3914
3815
|
|
|
3915
3816
|
# Set keywords filter if -ko argument passed
|
|
3916
3817
|
filterKeywords = ""
|
|
3917
3818
|
if args.keywords_only:
|
|
3918
3819
|
if args.keywords_only == "#CONFIG":
|
|
3919
3820
|
filterKeywords = (
|
|
3920
|
-
"&filter=~url:.*("
|
|
3921
|
-
+ re.escape(FILTER_KEYWORDS).replace(",", "|")
|
|
3922
|
-
+ ").*"
|
|
3821
|
+
"&filter=~url:.*(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ").*"
|
|
3923
3822
|
)
|
|
3924
3823
|
else:
|
|
3925
3824
|
filterKeywords = "&filter=~url:.*(" + args.keywords_only + ").*"
|
|
3926
3825
|
|
|
3927
|
-
commonCrawlUrl = (
|
|
3928
|
-
cdxApiUrl + "?output=json&fl=timestamp,url,mime,status,digest&url="
|
|
3929
|
-
)
|
|
3826
|
+
commonCrawlUrl = cdxApiUrl + "?output=json&fl=timestamp,url,mime,status,digest&url="
|
|
3930
3827
|
|
|
3931
3828
|
if args.filter_responses_only:
|
|
3932
3829
|
url = commonCrawlUrl + subs + quote(argsInput) + path
|
|
@@ -3947,25 +3844,26 @@ def processCommonCrawlCollection(cdxApiUrl):
|
|
|
3947
3844
|
session = requests.Session()
|
|
3948
3845
|
session.mount("https://", HTTP_ADAPTER_CC)
|
|
3949
3846
|
session.mount("http://", HTTP_ADAPTER_CC)
|
|
3847
|
+
try:
|
|
3848
|
+
current_session = session
|
|
3849
|
+
except Exception:
|
|
3850
|
+
pass
|
|
3950
3851
|
resp = session.get(url, stream=True, headers={"User-Agent": userAgent})
|
|
3852
|
+
try:
|
|
3853
|
+
current_response = resp
|
|
3854
|
+
except Exception:
|
|
3855
|
+
pass
|
|
3951
3856
|
except ConnectionError:
|
|
3952
3857
|
writerr(
|
|
3953
3858
|
colored(
|
|
3954
|
-
|
|
3955
|
-
"[ ERR ] Common Crawl connection error for index "
|
|
3956
|
-
+ cdxApiUrl
|
|
3957
|
-
),
|
|
3859
|
+
"CommonCrawl - [ ERR ] Connection error for index " + cdxApiUrl,
|
|
3958
3860
|
"red",
|
|
3959
3861
|
)
|
|
3960
3862
|
)
|
|
3961
3863
|
resp = None
|
|
3962
3864
|
return
|
|
3963
3865
|
except Exception as e:
|
|
3964
|
-
writerr(
|
|
3965
|
-
colored(
|
|
3966
|
-
getSPACER("[ ERR ] Error getting response - " + str(e)), "red"
|
|
3967
|
-
)
|
|
3968
|
-
)
|
|
3866
|
+
writerr(colored("CommonCrawl - [ ERR ] Error getting response - " + str(e), "red"))
|
|
3969
3867
|
resp = None
|
|
3970
3868
|
return
|
|
3971
3869
|
finally:
|
|
@@ -3975,13 +3873,11 @@ def processCommonCrawlCollection(cdxApiUrl):
|
|
|
3975
3873
|
if resp.status_code == 429:
|
|
3976
3874
|
writerr(
|
|
3977
3875
|
colored(
|
|
3978
|
-
|
|
3979
|
-
"[ 429 ] Common Crawl rate limit reached, so stopping. Links that have already been retrieved will be saved."
|
|
3980
|
-
),
|
|
3876
|
+
"CommonCrawl - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
|
|
3981
3877
|
"red",
|
|
3982
3878
|
)
|
|
3983
3879
|
)
|
|
3984
|
-
|
|
3880
|
+
stopSourceCommonCrawl = True
|
|
3985
3881
|
return
|
|
3986
3882
|
# If the response from commoncrawl.org says nothing was found...
|
|
3987
3883
|
if resp.text.lower().find("no captures found") > 0:
|
|
@@ -3992,11 +3888,7 @@ def processCommonCrawlCollection(cdxApiUrl):
|
|
|
3992
3888
|
if verbose():
|
|
3993
3889
|
writerr(
|
|
3994
3890
|
colored(
|
|
3995
|
-
|
|
3996
|
-
"[ ERR ] "
|
|
3997
|
-
+ url
|
|
3998
|
-
+ " gave an empty response."
|
|
3999
|
-
),
|
|
3891
|
+
"CommonCrawl - [ ERR ] " + url + " gave an empty response.",
|
|
4000
3892
|
"red",
|
|
4001
3893
|
)
|
|
4002
3894
|
)
|
|
@@ -4006,12 +3898,10 @@ def processCommonCrawlCollection(cdxApiUrl):
|
|
|
4006
3898
|
if verbose():
|
|
4007
3899
|
writerr(
|
|
4008
3900
|
colored(
|
|
4009
|
-
|
|
4010
|
-
|
|
4011
|
-
|
|
4012
|
-
|
|
4013
|
-
+ cdxApiUrl
|
|
4014
|
-
),
|
|
3901
|
+
"CommonCrawl - [ "
|
|
3902
|
+
+ str(resp.status_code)
|
|
3903
|
+
+ " ] Error for "
|
|
3904
|
+
+ cdxApiUrl,
|
|
4015
3905
|
"red",
|
|
4016
3906
|
)
|
|
4017
3907
|
)
|
|
@@ -4020,27 +3910,71 @@ def processCommonCrawlCollection(cdxApiUrl):
|
|
|
4020
3910
|
pass
|
|
4021
3911
|
|
|
4022
3912
|
# Get the URLs and MIME types
|
|
4023
|
-
|
|
4024
|
-
|
|
4025
|
-
|
|
4026
|
-
|
|
4027
|
-
|
|
4028
|
-
|
|
4029
|
-
|
|
4030
|
-
|
|
4031
|
-
|
|
4032
|
-
|
|
4033
|
-
|
|
4034
|
-
|
|
4035
|
-
|
|
4036
|
-
|
|
4037
|
-
|
|
4038
|
-
|
|
4039
|
-
|
|
4040
|
-
|
|
4041
|
-
|
|
3913
|
+
try:
|
|
3914
|
+
for line in resp.iter_lines():
|
|
3915
|
+
results = line.decode("utf-8")
|
|
3916
|
+
try:
|
|
3917
|
+
data = json.loads(results)
|
|
3918
|
+
# Get MIME Types if --verbose option was selected
|
|
3919
|
+
if verbose():
|
|
3920
|
+
try:
|
|
3921
|
+
if data["mime"] != "":
|
|
3922
|
+
linkMimes.add(data["mime"])
|
|
3923
|
+
except Exception:
|
|
3924
|
+
pass
|
|
3925
|
+
# If -from or -to were passed, check the timestamp of the URL.
|
|
3926
|
+
# Only continue if the URL falls within the date range specified
|
|
3927
|
+
if args.from_date is not None or args.to_date is not None:
|
|
3928
|
+
try:
|
|
3929
|
+
ts = data["timestamp"]
|
|
3930
|
+
|
|
3931
|
+
# Normalize helper: pad/truncate date string to 14 digits (YYYYMMDDhhmmss)
|
|
3932
|
+
def normalize_date(d, is_from):
|
|
3933
|
+
if d is None:
|
|
3934
|
+
return None
|
|
3935
|
+
d = d.strip()
|
|
3936
|
+
# Pad to 14 digits: from_date pads with 0s, to_date with 9s
|
|
3937
|
+
if is_from:
|
|
3938
|
+
return (d + "0" * (14 - len(d)))[:14]
|
|
3939
|
+
else:
|
|
3940
|
+
return (d + "9" * (14 - len(d)))[:14]
|
|
3941
|
+
|
|
3942
|
+
from_ts = normalize_date(args.from_date, True)
|
|
3943
|
+
to_ts = normalize_date(args.to_date, False)
|
|
3944
|
+
|
|
3945
|
+
# Compare as fixed-width digit strings (equivalent to a numeric comparison)
|
|
3946
|
+
if from_ts and ts < from_ts:
|
|
3947
|
+
continue
|
|
3948
|
+
if to_ts and ts > to_ts:
|
|
3949
|
+
continue
|
|
3950
|
+
|
|
3951
|
+
except Exception as e:
|
|
3952
|
+
writerr(
|
|
3953
|
+
colored(
|
|
3954
|
+
"ERROR processCommonCrawlCollection 3: Cannot get timestamp from line {line}: {str(e)}",
|
|
3955
|
+
"red",
|
|
3956
|
+
)
|
|
3957
|
+
)
|
|
3958
|
+
|
|
3959
|
+
linksFoundAdd(data["url"], linksFoundCommonCrawl)
|
|
3960
|
+
except Exception:
|
|
3961
|
+
if verbose():
|
|
3962
|
+
writerr(
|
|
3963
|
+
colored(
|
|
3964
|
+
"ERROR processCommonCrawlCollection 2: Cannot get URL and MIME type from line: "
|
|
3965
|
+
+ str(line),
|
|
3966
|
+
"red",
|
|
3967
|
+
)
|
|
4042
3968
|
)
|
|
4043
|
-
|
|
3969
|
+
finally:
|
|
3970
|
+
try:
|
|
3971
|
+
current_response = None
|
|
3972
|
+
except Exception:
|
|
3973
|
+
pass
|
|
3974
|
+
try:
|
|
3975
|
+
current_session = None
|
|
3976
|
+
except Exception:
|
|
3977
|
+
pass
|
|
4044
3978
|
else:
|
|
4045
3979
|
pass
|
|
4046
3980
|
except Exception as e:
|
|
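CDX timestamps are 14-digit YYYYMMDDhhmmss strings, so the normalize_date helper above pads a partial -from value with zeros and a -to value with nines before comparing; that way a value like "2023" acts as an inclusive bound for everything sharing that prefix, and because the strings are fixed-width digits, plain string comparison behaves like numeric comparison. A standalone restatement of that helper:

def normalize_cdx_date(value, is_from):
    """Pad a partial date (e.g. '2023' or '202306') to 14 digits.

    from-dates pad with '0' (earliest possible moment); to-dates pad with '9'
    so every timestamp sharing the prefix stays inside the range.
    """
    if value is None:
        return None
    value = value.strip()
    pad = "0" if is_from else "9"
    return (value + pad * (14 - len(value)))[:14]

def in_range(timestamp, from_date=None, to_date=None):
    lo = normalize_cdx_date(from_date, True)
    hi = normalize_cdx_date(to_date, False)
    if lo and timestamp < lo:   # fixed-width digit strings compare like numbers
        return False
    if hi and timestamp > hi:
        return False
    return True

assert in_range("20230615120000", from_date="2023", to_date="2023")
assert not in_range("20240101000000", to_date="2023")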
@@ -4067,10 +4001,8 @@ def getCommonCrawlIndexes():
|
|
|
4067
4001
|
except Exception as e:
|
|
4068
4002
|
writerr(
|
|
4069
4003
|
colored(
|
|
4070
|
-
|
|
4071
|
-
|
|
4072
|
-
+ str(e)
|
|
4073
|
-
),
|
|
4004
|
+
"CommonCrawl - [ ERR ] Couldn't delete local version of Common Crawl index file: "
|
|
4005
|
+
+ str(e),
|
|
4074
4006
|
"red",
|
|
4075
4007
|
)
|
|
4076
4008
|
)
|
|
@@ -4081,17 +4013,15 @@ def getCommonCrawlIndexes():
|
|
|
4081
4013
|
if not createFile:
|
|
4082
4014
|
# Read the indexes from the local file
|
|
4083
4015
|
try:
|
|
4084
|
-
with open(collinfoPath, "r") as file:
|
|
4016
|
+
with open(collinfoPath) as file:
|
|
4085
4017
|
jsonResp = file.read()
|
|
4086
4018
|
file.close()
|
|
4087
4019
|
except Exception as e:
|
|
4088
4020
|
createFile = True
|
|
4089
4021
|
writerr(
|
|
4090
4022
|
colored(
|
|
4091
|
-
|
|
4092
|
-
|
|
4093
|
-
+ str(e)
|
|
4094
|
-
),
|
|
4023
|
+
"CommonCrawl - [ ERR ] Couldn't read local version of Common Crawl index file: "
|
|
4024
|
+
+ str(e),
|
|
4095
4025
|
"red",
|
|
4096
4026
|
)
|
|
4097
4027
|
)
|
|
@@ -4104,15 +4034,11 @@ def getCommonCrawlIndexes():
|
|
|
4104
4034
|
session = requests.Session()
|
|
4105
4035
|
session.mount("https://", HTTP_ADAPTER_CC)
|
|
4106
4036
|
session.mount("http://", HTTP_ADAPTER_CC)
|
|
4107
|
-
indexes = session.get(
|
|
4108
|
-
CCRAWL_INDEX_URL, headers={"User-Agent": userAgent}
|
|
4109
|
-
)
|
|
4037
|
+
indexes = session.get(CCRAWL_INDEX_URL, headers={"User-Agent": userAgent})
|
|
4110
4038
|
except ConnectionError:
|
|
4111
4039
|
writerr(
|
|
4112
4040
|
colored(
|
|
4113
|
-
|
|
4114
|
-
"[ ERR ] Common Crawl connection error getting Index file"
|
|
4115
|
-
),
|
|
4041
|
+
"CommonCrawl - [ ERR ] Connection error getting Index file",
|
|
4116
4042
|
"red",
|
|
4117
4043
|
)
|
|
4118
4044
|
)
|
|
@@ -4120,10 +4046,8 @@ def getCommonCrawlIndexes():
|
|
|
4120
4046
|
except Exception as e:
|
|
4121
4047
|
writerr(
|
|
4122
4048
|
colored(
|
|
4123
|
-
|
|
4124
|
-
|
|
4125
|
-
+ str(e)
|
|
4126
|
-
),
|
|
4049
|
+
"CommonCrawl - [ ERR ] Error getting Common Crawl index collection - "
|
|
4050
|
+
+ str(e),
|
|
4127
4051
|
"red",
|
|
4128
4052
|
)
|
|
4129
4053
|
)
|
|
@@ -4133,9 +4057,7 @@ def getCommonCrawlIndexes():
|
|
|
4133
4057
|
if indexes.status_code == 429:
|
|
4134
4058
|
writerr(
|
|
4135
4059
|
colored(
|
|
4136
|
-
|
|
4137
|
-
"[ 429 ] Common Crawl rate limit reached so unable to get links."
|
|
4138
|
-
),
|
|
4060
|
+
"CommonCrawl - [ 429 ] Rate limit reached so unable to get links.",
|
|
4139
4061
|
"red",
|
|
4140
4062
|
)
|
|
4141
4063
|
)
|
|
@@ -4144,7 +4066,7 @@ def getCommonCrawlIndexes():
|
|
|
4144
4066
|
elif indexes.status_code == 503:
|
|
4145
4067
|
writerr(
|
|
4146
4068
|
colored(
|
|
4147
|
-
|
|
4069
|
+
"CommonCrawl - [ 503 ] Common Crawl seems to be unavailable.",
|
|
4148
4070
|
"red",
|
|
4149
4071
|
)
|
|
4150
4072
|
)
|
|
@@ -4152,11 +4074,9 @@ def getCommonCrawlIndexes():
|
|
|
4152
4074
|
elif indexes.status_code != 200:
|
|
4153
4075
|
writerr(
|
|
4154
4076
|
colored(
|
|
4155
|
-
|
|
4156
|
-
|
|
4157
|
-
|
|
4158
|
-
+ " ] Common Crawl did not retrun the indexes file."
|
|
4159
|
-
),
|
|
4077
|
+
"CommonCrawl - [ "
|
|
4078
|
+
+ str(indexes.status_code)
|
|
4079
|
+
+ " ] Common Crawl did not retrun the indexes file.",
|
|
4160
4080
|
"red",
|
|
4161
4081
|
)
|
|
4162
4082
|
)
|
|
@@ -4173,10 +4093,8 @@ def getCommonCrawlIndexes():
|
|
|
4173
4093
|
except Exception as e:
|
|
4174
4094
|
writerr(
|
|
4175
4095
|
colored(
|
|
4176
|
-
|
|
4177
|
-
|
|
4178
|
-
+ str(e)
|
|
4179
|
-
),
|
|
4096
|
+
"CommonCrawl - [ ERR ] Couldn't create local version of Common Crawl index file: "
|
|
4097
|
+
+ str(e),
|
|
4180
4098
|
"red",
|
|
4181
4099
|
)
|
|
4182
4100
|
)
|
|
@@ -4187,26 +4105,40 @@ def getCommonCrawlIndexes():
|
|
|
4187
4105
|
for values in json.loads(jsonResp):
|
|
4188
4106
|
for key in values:
|
|
4189
4107
|
if key == "cdx-api":
|
|
4190
|
-
if args.
|
|
4108
|
+
if args.from_date is not None or args.to_date is not None:
|
|
4191
4109
|
try:
|
|
4192
4110
|
indexYear = values[key].split("CC-MAIN-")[1][:4]
|
|
4193
|
-
|
|
4194
|
-
|
|
4111
|
+
|
|
4112
|
+
# Only get the indexes that fall within the date range specified
|
|
4113
|
+
if args.from_date is not None:
|
|
4114
|
+
fromYear = int(args.from_date[:4])
|
|
4115
|
+
# There are a few exceptions with the filename format at the start of Common Crawl indexes where it contains 2 years, so deal with those (e.g. CC-MAIN-2009-2010-index and CC-MAIN-2008-2009-index)
|
|
4116
|
+
if fromYear in (2009, 2010):
|
|
4117
|
+
fromYear = fromYear - 1
|
|
4118
|
+
if int(indexYear) < fromYear:
|
|
4119
|
+
continue
|
|
4120
|
+
if args.to_date is not None:
|
|
4121
|
+
toYear = int(args.to_date[:4])
|
|
4122
|
+
if int(indexYear) > toYear:
|
|
4123
|
+
continue
|
|
4124
|
+
# If it passed the date range checks then add the index URL
|
|
4125
|
+
cdxApiUrls.add(values[key])
|
|
4126
|
+
collection = collection + 1
|
|
4195
4127
|
except Exception as e:
|
|
4196
4128
|
writerr(
|
|
4197
4129
|
colored(
|
|
4198
|
-
|
|
4199
|
-
|
|
4200
|
-
|
|
4201
|
-
|
|
4202
|
-
+ str(e)
|
|
4203
|
-
),
|
|
4130
|
+
"CommonCrawl - [ ERR ] Failed to get the year from index name "
|
|
4131
|
+
+ values[key]
|
|
4132
|
+
+ " - "
|
|
4133
|
+
+ str(e),
|
|
4204
4134
|
"red",
|
|
4205
4135
|
)
|
|
4206
4136
|
)
|
|
4207
4137
|
else:
|
|
4208
4138
|
cdxApiUrls.add(values[key])
|
|
4209
|
-
|
|
4139
|
+
collection = collection + 1
|
|
4140
|
+
|
|
4141
|
+
# Only get the most recent number of indexes specified by -lcc argument
|
|
4210
4142
|
if collection == args.lcc:
|
|
4211
4143
|
break
|
|
4212
4144
|
|
|
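Common Crawl index collections are named CC-MAIN-YYYY-WW (with a few early two-year names such as CC-MAIN-2009-2010), so the code above only needs the four digits after "CC-MAIN-" to decide whether an index can contain captures in the -from/-to window, plus the -lcc cap on how many indexes to keep. A hedged sketch of that selection:

def select_indexes(index_names, from_date=None, to_date=None, limit=0):
    """Keep index names whose year falls inside the from/to window."""
    selected = []
    for name in index_names:
        year = int(name.split("CC-MAIN-")[1][:4])
        if from_date is not None:
            from_year = int(str(from_date)[:4])
            # Early indexes span two years (e.g. CC-MAIN-2009-2010), so widen the lower bound
            if from_year in (2009, 2010):
                from_year -= 1
            if year < from_year:
                continue
        if to_date is not None and year > int(str(to_date)[:4]):
            continue
        selected.append(name)
        if limit and len(selected) == limit:
            break
    return selected

print(select_indexes(
    ["CC-MAIN-2024-10", "CC-MAIN-2023-50", "CC-MAIN-2009-2010"],
    from_date="20230101", to_date="20241231",
))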
@@ -4220,12 +4152,11 @@ def getCommonCrawlUrls():
|
|
|
4220
4152
|
"""
|
|
4221
4153
|
Get all Common Crawl index collections to get all URLs from each one
|
|
4222
4154
|
"""
|
|
4223
|
-
global linksFound, linkMimes, waymorePath, subs, path,
|
|
4155
|
+
global linksFound, linkMimes, waymorePath, subs, path, stopSourceCommonCrawl, argsInput, checkCommonCrawl, linkCountCommonCrawl, linksFoundCommonCrawl
|
|
4224
4156
|
|
|
4225
4157
|
try:
|
|
4226
|
-
|
|
4227
|
-
|
|
4228
|
-
originalLinkCount = len(linksFound)
|
|
4158
|
+
stopSourceCommonCrawl = False
|
|
4159
|
+
linksFoundCommonCrawl = set()
|
|
4229
4160
|
|
|
4230
4161
|
# Set mime content type filter
|
|
4231
4162
|
if MATCH_MIME.strip() != "":
|
|
@@ -4240,13 +4171,9 @@ def getCommonCrawlUrls():
|
|
|
4240
4171
|
# Set status code filter
|
|
4241
4172
|
filterCode = ""
|
|
4242
4173
|
if MATCH_CODE.strip() != "":
|
|
4243
|
-
filterCode = (
|
|
4244
|
-
"&filter=~status:(" + re.escape(MATCH_CODE).replace(",", "|") + ")"
|
|
4245
|
-
)
|
|
4174
|
+
filterCode = "&filter=~status:(" + re.escape(MATCH_CODE).replace(",", "|") + ")"
|
|
4246
4175
|
else:
|
|
4247
|
-
filterCode = (
|
|
4248
|
-
"&filter=!~status:(" + re.escape(FILTER_CODE).replace(",", "|") + ")"
|
|
4249
|
-
)
|
|
4176
|
+
filterCode = "&filter=!~status:(" + re.escape(FILTER_CODE).replace(",", "|") + ")"
|
|
4250
4177
|
|
|
4251
4178
|
if verbose():
|
|
4252
4179
|
if args.filter_responses_only:
|
|
@@ -4267,7 +4194,7 @@ def getCommonCrawlUrls():
|
|
|
4267
4194
|
)
|
|
4268
4195
|
write(
|
|
4269
4196
|
colored(
|
|
4270
|
-
"The
|
|
4197
|
+
"CommonCrawl - [ INFO ] The index URL requested to get links (where {CDX-API-URL} is from "
|
|
4271
4198
|
+ CCRAWL_INDEX_URL
|
|
4272
4199
|
+ "): ",
|
|
4273
4200
|
"magenta",
|
|
@@ -4276,9 +4203,7 @@ def getCommonCrawlUrls():
|
|
|
4276
4203
|
)
|
|
4277
4204
|
|
|
4278
4205
|
if not args.check_only:
|
|
4279
|
-
write(
|
|
4280
|
-
colored("\rGetting commoncrawl.org index collections list...\r", "cyan")
|
|
4281
|
-
)
|
|
4206
|
+
write(colored("CommonCrawl - [ INFO ] Getting index collections list...", "cyan"))
|
|
4282
4207
|
|
|
4283
4208
|
# Get the Common Crawl index collections
|
|
4284
4209
|
cdxApiUrls = getCommonCrawlIndexes()
|
|
@@ -4291,15 +4216,15 @@ def getCommonCrawlUrls():
|
|
|
4291
4216
|
else:
|
|
4292
4217
|
checkCommonCrawl = len(cdxApiUrls) + 1
|
|
4293
4218
|
write(
|
|
4294
|
-
colored("Get URLs from Common Crawl: ", "cyan")
|
|
4219
|
+
colored("CommonCrawl - [ INFO ] Get URLs from Common Crawl: ", "cyan")
|
|
4295
4220
|
+ colored(str(checkCommonCrawl) + " requests", "white")
|
|
4296
4221
|
)
|
|
4297
4222
|
else:
|
|
4298
4223
|
write(
|
|
4299
4224
|
colored(
|
|
4300
|
-
"
|
|
4225
|
+
"CommonCrawl - [ INFO ] Getting links from the latest "
|
|
4301
4226
|
+ str(len(cdxApiUrls))
|
|
4302
|
-
+ " commoncrawl.org index collections (this can take a while for some domains)
|
|
4227
|
+
+ " commoncrawl.org index collections (this can take a while for some domains)...",
|
|
4303
4228
|
"cyan",
|
|
4304
4229
|
)
|
|
4305
4230
|
)
|
|
@@ -4315,30 +4240,18 @@ def getCommonCrawlUrls():
|
|
|
4315
4240
|
if verbose() and len(linkMimes) > 0:
|
|
4316
4241
|
linkMimes.discard("warc/revisit")
|
|
4317
4242
|
write(
|
|
4318
|
-
|
|
4319
|
-
|
|
4320
|
-
+ colored(str(linkMimes), "white")
|
|
4321
|
-
)
|
|
4243
|
+
colored("CommonCrawl - [ INFO ] MIME types found: ", "magenta")
|
|
4244
|
+
+ colored(str(linkMimes), "white")
|
|
4322
4245
|
+ "\n"
|
|
4323
4246
|
)
|
|
4324
4247
|
|
|
4325
|
-
|
|
4326
|
-
|
|
4327
|
-
|
|
4328
|
-
|
|
4329
|
-
|
|
4330
|
-
|
|
4331
|
-
|
|
4332
|
-
+ "\n"
|
|
4333
|
-
)
|
|
4334
|
-
else:
|
|
4335
|
-
write(
|
|
4336
|
-
getSPACER(
|
|
4337
|
-
colored("Extra links found on commoncrawl.org: ", "cyan")
|
|
4338
|
-
+ colored(str(linkCount), "white")
|
|
4339
|
-
)
|
|
4340
|
-
+ "\n"
|
|
4341
|
-
)
|
|
4248
|
+
linkCountCommonCrawl = len(linksFoundCommonCrawl)
|
|
4249
|
+
write(
|
|
4250
|
+
colored("CommonCrawl - [ INFO ] Links found on commoncrawl.org: ", "cyan")
|
|
4251
|
+
+ colored(str(linkCountCommonCrawl), "white")
|
|
4252
|
+
)
|
|
4253
|
+
linksFound.update(linksFoundCommonCrawl)
|
|
4254
|
+
linksFoundCommonCrawl.clear()
|
|
4342
4255
|
|
|
4343
4256
|
except Exception as e:
|
|
4344
4257
|
writerr(colored("ERROR getCommonCrawlUrls 1: " + str(e), "red"))
|
|
@@ -4348,7 +4261,7 @@ def processVirusTotalUrl(url):
|
|
|
4348
4261
|
"""
|
|
4349
4262
|
Process a specific URL from virustotal.com to determine whether to save the link
|
|
4350
4263
|
"""
|
|
4351
|
-
global argsInput, argsInputHostname
|
|
4264
|
+
global argsInput, argsInputHostname, linkCountVirusTotal, linksFoundVirusTotal
|
|
4352
4265
|
|
|
4353
4266
|
addLink = True
|
|
4354
4267
|
|
|
@@ -4394,9 +4307,7 @@ def processVirusTotalUrl(url):
|
|
|
4394
4307
|
flags=re.IGNORECASE,
|
|
4395
4308
|
)
|
|
4396
4309
|
else:
|
|
4397
|
-
match = re.search(
|
|
4398
|
-
r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE
|
|
4399
|
-
)
|
|
4310
|
+
match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
|
|
4400
4311
|
if match is None:
|
|
4401
4312
|
addLink = False
|
|
4402
4313
|
|
|
@@ -4417,7 +4328,7 @@ def processVirusTotalUrl(url):
|
|
|
4417
4328
|
flags=re.IGNORECASE,
|
|
4418
4329
|
)
|
|
4419
4330
|
if match is not None:
|
|
4420
|
-
linksFoundAdd(url)
|
|
4331
|
+
linksFoundAdd(url, linksFoundVirusTotal)
|
|
4421
4332
|
|
|
4422
4333
|
except Exception as e:
|
|
4423
4334
|
writerr(colored("ERROR processVirusTotalUrl 1: " + str(e), "red"))
|
|
@@ -4425,58 +4336,50 @@ def processVirusTotalUrl(url):
|
|
|
4425
4336
|
|
|
4426
4337
|
def getVirusTotalUrls():
|
|
4427
4338
|
"""
|
|
4428
|
-
Get URLs from the VirusTotal API v2
|
|
4339
|
+
Get URLs from the VirusTotal API v2 and process them.
|
|
4340
|
+
Each URL is normalized as a (url, scan_date) tuple. Dates are filtered according to args.from_date / args.to_date.
|
|
4429
4341
|
"""
|
|
4430
|
-
global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram,
|
|
4342
|
+
global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceVirusTotal, argsInput, checkVirusTotal, argsInputHostname, linkCountVirusTotal, linksFoundVirusTotal
|
|
4431
4343
|
|
|
4432
|
-
# Write the file of URL's for the passed domain/URL
|
|
4433
4344
|
try:
|
|
4434
|
-
|
|
4435
|
-
|
|
4436
|
-
linkMimes = set()
|
|
4437
|
-
originalLinkCount = len(linksFound)
|
|
4345
|
+
stopSourceVirusTotal = False
|
|
4346
|
+
linksFoundVirusTotal = set()
|
|
4438
4347
|
|
|
4439
|
-
#
|
|
4348
|
+
# Build the VirusTotal API URL
|
|
4440
4349
|
url = VIRUSTOTAL_URL.replace("{DOMAIN}", quote(argsInputHostname)).replace(
|
|
4441
4350
|
"{APIKEY}", VIRUSTOTAL_API_KEY
|
|
4442
4351
|
)
|
|
4443
4352
|
|
|
4444
4353
|
if verbose():
|
|
4445
4354
|
write(
|
|
4446
|
-
colored("The
|
|
4355
|
+
colored("VirusTotal - [ INFO ] The URL requested to get links: ", "magenta")
|
|
4447
4356
|
+ colored(url + "\n", "white")
|
|
4448
4357
|
)
|
|
4449
4358
|
|
|
4450
4359
|
if not args.check_only:
|
|
4451
|
-
write(colored("
|
|
4360
|
+
write(colored("VirusTotal - [ INFO ] Getting links from virustotal.com API...", "cyan"))
|
|
4452
4361
|
|
|
4453
|
-
#
|
|
4362
|
+
# Make request
|
|
4454
4363
|
try:
|
|
4455
|
-
# Choose a random user agent string to use for any requests
|
|
4456
4364
|
userAgent = random.choice(USER_AGENT)
|
|
4457
4365
|
session = requests.Session()
|
|
4458
4366
|
session.mount("https://", HTTP_ADAPTER)
|
|
4459
4367
|
session.mount("http://", HTTP_ADAPTER)
|
|
4460
4368
|
resp = session.get(url, headers={"User-Agent": userAgent})
|
|
4461
|
-
requestsMade = requestsMade + 1
|
|
4462
4369
|
except Exception as e:
|
|
4463
|
-
|
|
4370
|
+
writerr(
|
|
4464
4371
|
colored(
|
|
4465
|
-
|
|
4466
|
-
"[ ERR ] Unable to get links from virustotal.com: " + str(e)
|
|
4467
|
-
),
|
|
4372
|
+
"VirusTotal - [ ERR ] Unable to get links from virustotal.com: " + str(e),
|
|
4468
4373
|
"red",
|
|
4469
4374
|
)
|
|
4470
4375
|
)
|
|
4471
4376
|
return
|
|
4472
4377
|
|
|
4473
|
-
#
|
|
4378
|
+
# Handle HTTP errors
|
|
4474
4379
|
if resp.status_code == 429:
|
|
4475
4380
|
writerr(
|
|
4476
4381
|
colored(
|
|
4477
|
-
|
|
4478
|
-
"[ 429 ] VirusTotal rate limit reached so unable to get links."
|
|
4479
|
-
),
|
|
4382
|
+
"VirusTotal - [ 429 ] Rate limit reached so unable to get links.",
|
|
4480
4383
|
"red",
|
|
4481
4384
|
)
|
|
4482
4385
|
)
|
|
@@ -4484,9 +4387,7 @@ def getVirusTotalUrls():
|
|
|
4484
4387
|
elif resp.status_code == 403:
|
|
4485
4388
|
writerr(
|
|
4486
4389
|
colored(
|
|
4487
|
-
|
|
4488
|
-
"[ 403 ] VirusTotal: Permission denied. Check your API key is correct."
|
|
4489
|
-
),
|
|
4390
|
+
"VirusTotal - [ 403 ] Permission denied. Check your API key is correct.",
|
|
4490
4391
|
"red",
|
|
4491
4392
|
)
|
|
4492
4393
|
)
|
|
@@ -4494,101 +4395,94 @@ def getVirusTotalUrls():
|
|
|
4494
4395
|
elif resp.status_code != 200:
|
|
  writerr(
  colored(
-
-
-
- + " ] Unable to get links from virustotal.com"
- ),
+ "VirusTotal - [ ERR ] [ "
+ + str(resp.status_code)
+ + " ] Unable to get links from virustotal.com",
  "red",
  )
  )
  return

- #
+ # Parse JSON
  try:
  jsonResp = json.loads(resp.text.strip())

- #
+ # Normalize arrays as (url, scan_date) tuples
  if args.no_subs:
-
+ subdomains = []
  else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- undetectedUrls = []
- try:
- totalUrls = set(subDomains + detectedUrls + undetectedUrls)
- except Exception:
- totalUrls = []
- except Exception:
+ subdomains = [(sd, None) for sd in jsonResp.get("subdomains", [])]
+
+ detected_urls = [
+ (entry.get("url"), entry.get("scan_date"))
+ for entry in jsonResp.get("detected_urls", [])
+ ]
+
+ undetected_urls = [
+ (entry[0], entry[4]) for entry in jsonResp.get("undetected_urls", [])
+ ]
+
+ # Combine all
+ all_urls = subdomains + detected_urls + undetected_urls
+
+ except Exception as e:
  writerr(
  colored(
-
- "[ ERR ] There was an unexpected response from the VirusTotal API"
- ),
+ "VirusTotal - [ ERR ] Unexpected response from the VirusTotal API: " + str(e),
  "red",
  )
  )
-
+ all_urls = []

+ # Check only mode
  if args.check_only:
  write(
- colored("Get URLs from VirusTotal: ", "cyan")
+ colored("VirusTotal - [ INFO ] Get URLs from VirusTotal: ", "cyan")
  + colored("1 request", "white")
  )
  checkVirusTotal = 1
  else:
- #
- for
-
- if stopSource:
+ # Process each URL tuple
+ for url, scan_date in all_urls:
+ if stopSourceVirusTotal:
  break
-
- # Get memory in case it exceeds threshold
  getMemory()

- #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ # Filter by date if -from or -to was passed and we have a date for the url
+ if scan_date and (args.from_date is not None or args.to_date is not None):
+ urlDate = datetime.strptime(scan_date, "%Y-%m-%d %H:%M:%S")
+ # If from date passed, check
+ if args.from_date is not None:
+ fromDate = parseDateArg(args.from_date)
+ if urlDate < fromDate:
+ continue
+ # If to date passed, check
+ if args.to_date is not None:
+ toDate = parseDateArg(args.to_date)
+ if urlDate >= toDate:
+ continue
+
+ # Process URL
+ processVirusTotalUrl(url)
+
+ # Show links found
+ linkCountVirusTotal = len(linksFoundVirusTotal)
+ write(
+ colored("VirusTotal - [ INFO ] Links found on virustotal.com: ", "cyan")
+ + colored(str(linkCountVirusTotal), "white")
+ )
+ linksFound.update(linksFoundVirusTotal)
+ linksFoundVirusTotal.clear()

  except Exception as e:
- writerr(colored("ERROR getVirusTotalUrls
+ writerr(colored(f"ERROR getVirusTotalUrls: {e}", "red"))


  def processIntelxUrl(url):
  """
  Process a specific URL from intelx.io to determine whether to save the link
  """
- global argsInput, argsInputHostname
+ global argsInput, argsInputHostname, linkCountIntelx, linksFoundIntelx

  addLink = True

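The rewritten getVirusTotalUrls block above folds the three VirusTotal arrays (subdomains, detected_urls, undetected_urls) into (url, scan_date) tuples and then applies the -from/-to filter per URL. A minimal standalone sketch of that normalisation and date check, using a made-up sample response instead of a live virustotal.com call, and a fixed datetime in place of parseDateArg:

from datetime import datetime

# Hypothetical sample in the JSON shape the code above expects
jsonResp = {
    "subdomains": ["dev.example.com"],
    "detected_urls": [{"url": "https://example.com/a", "scan_date": "2023-01-05 10:00:00"}],
    "undetected_urls": [["https://example.com/b", "sha256", 0, 70, "2021-06-01 09:30:00"]],
}

# Normalize every entry to (url, scan_date); subdomains carry no date
all_urls = (
    [(sd, None) for sd in jsonResp.get("subdomains", [])]
    + [(e.get("url"), e.get("scan_date")) for e in jsonResp.get("detected_urls", [])]
    + [(e[0], e[4]) for e in jsonResp.get("undetected_urls", [])]
)

from_date = datetime(2022, 1, 1)  # stand-in for the parsed -from argument

for url, scan_date in all_urls:
    # Entries without a date (subdomains) are never excluded by the date filter
    if scan_date and datetime.strptime(scan_date, "%Y-%m-%d %H:%M:%S") < from_date:
        continue
    print(url)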
@@ -4634,15 +4528,13 @@ def processIntelxUrl(url):
  flags=re.IGNORECASE,
  )
  else:
- match = re.search(
- r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE
- )
+ match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
  if match is None:
  addLink = False

  # Add link if it passed filters
  if addLink:
- linksFoundAdd(url)
+ linksFoundAdd(url, linksFoundIntelx)

  except Exception as e:
  writerr(colored("ERROR processIntelxUrl 1: " + str(e), "red"))

@@ -4653,6 +4545,7 @@ def processIntelxType(target, credits):
  target: 1 - Domains
  target: 3 - URLs
  """
+ global intelxAPIIssue
  try:
  try:
  requestsMade = 0

@@ -4665,18 +4558,14 @@ def processIntelxType(target, credits):
  # Pass the API key in the X-Key header too.
  resp = session.post(
  INTELX_SEARCH_URL,
- data='{"term":"'
- + quote(argsInputHostname)
- + '","target":'
- + str(target)
- + "}",
+ data='{"term":"' + quote(argsInputHostname) + '","target":' + str(target) + "}",
  headers={"User-Agent": userAgent, "X-Key": INTELX_API_KEY},
  )
  requestsMade = requestsMade + 1
  except Exception as e:
  write(
  colored(
-
+ "IntelX - [ ERR ] Unable to get links from intelx.io: " + str(e),
  "red",
  )
  )

@@ -4684,53 +4573,47 @@ def processIntelxType(target, credits):

  # Deal with any errors
  if resp.status_code == 429:
+ intelxAPIIssue = True
  writerr(
  colored(
-
- "[ 429 ] IntelX rate limit reached so unable to get links."
- ),
+ "IntelX - [ 429 ] Rate limit reached so unable to get links.",
  "red",
  )
  )
  return
  elif resp.status_code == 401:
+ intelxAPIIssue = True
  writerr(
  colored(
-
- "[ 401 ] IntelX: Not authorized. The source requires a paid API key. Check your API key is correct."
- ),
+ "IntelX - [ 401 ] Not authorized. The source requires a paid API key. Check your API key is correct.",
  "red",
  )
  )
  return
  elif resp.status_code == 402:
+ intelxAPIIssue = True
  if credits.startswith("0/"):
  writerr(
  colored(
-
-
-
- + ")."
- ),
+ "IntelX - [ 402 ] You have run out of daily credits on Intelx ("
+ + credits
+ + ").",
  "red",
  )
  )
  else:
  writerr(
  colored(
-
- "[ 402 ] IntelX: It appears you have run out of daily credits on Intelx."
- ),
+ "IntelX - [ 402 ] It appears you have run out of daily credits on Intelx.",
  "red",
  )
  )
  return
  elif resp.status_code == 403:
+ intelxAPIIssue = True
  writerr(
  colored(
-
- "[ 403 ] IntelX: Permission denied. Check your API key is correct."
- ),
+ "IntelX - [ 403 ] Permission denied. Check your API key is correct.",
  "red",
  )
  )

@@ -4738,11 +4621,7 @@ def processIntelxType(target, credits):
  elif resp.status_code != 200:
  writerr(
  colored(
-
- "[ "
- + str(resp.status_code)
- + " ] Unable to get links from intelx.io"
- ),
+ "IntelX - [ " + str(resp.status_code) + " ] Unable to get links from intelx.io",
  "red",
  )
  )

@@ -4755,9 +4634,7 @@ def processIntelxType(target, credits):
  except Exception:
  writerr(
  colored(
-
- "[ ERR ] There was an unexpected response from the Intelligence API"
- ),
+ "IntelX - [ ERR ] There was an unexpected response from the Intelligence API",
  "red",
  )
  )

@@ -4767,7 +4644,7 @@ def processIntelxType(target, credits):
  moreResults = True
  status = 0
  while moreResults:
- if
+ if stopSourceIntelx:
  break
  try:
  resp = session.get(

@@ -4778,9 +4655,7 @@ def processIntelxType(target, credits):
  except Exception as e:
  write(
  colored(
-
- "[ ERR ] Unable to get links from intelx.io: " + str(e)
- ),
+ "IntelX - [ ERR ] Unable to get links from intelx.io: " + str(e),
  "red",
  )
  )

@@ -4793,9 +4668,7 @@ def processIntelxType(target, credits):
  except Exception:
  writerr(
  colored(
-
- "[ ERR ] There was an unexpected response from the Intelligence API"
- ),
+ "IntelX - [ ERR ] There was an unexpected response from the Intelligence API",
  "red",
  )
  )

@@ -4817,7 +4690,7 @@ def processIntelxType(target, credits):
  # Work out whether to include each url
  unique_values = list(set(selector_values + selector_valuesh))
  for ixurl in unique_values:
- if
+ if stopSourceIntelx:
  break
  processIntelxUrl(ixurl)

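processIntelxType above issues one POST to INTELX_SEARCH_URL per target type and then polls for result pages until moreResults goes false. A rough sketch of that two-step phonebook flow with requests; the result endpoint path, the selectors/selectorvalue field names and the status semantics are assumptions based on the public Intelligence X API, not shown in this diff:

import requests
from requests.utils import quote

INTELX_API_KEY = "YOUR_KEY"                                             # placeholder
INTELX_SEARCH_URL = "https://2.intelx.io/phonebook/search"              # assumed endpoint
INTELX_RESULT_URL = "https://2.intelx.io/phonebook/search/result?id="   # assumed endpoint

session = requests.Session()
headers = {"User-Agent": "waymore", "X-Key": INTELX_API_KEY}

# target 1 = domains, target 3 = URLs (see the docstring above)
body = '{"term":"' + quote("example.com") + '","target":3}'
search_id = session.post(INTELX_SEARCH_URL, data=body, headers=headers).json().get("id")

# Poll the result endpoint until the API signals there is nothing more to fetch
moreResults = True
while moreResults:
    page = session.get(INTELX_RESULT_URL + str(search_id), headers=headers).json()
    for selector in page.get("selectors", []):
        print(selector.get("selectorvalue"))   # assumed field name
    moreResults = page.get("status", 2) == 0   # status handling simplified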
@@ -4845,14 +4718,10 @@ def getIntelxAccountInfo() -> str:
  )
  jsonResp = json.loads(resp.text.strip())
  credits = str(
- jsonResp.get("paths", {})
- .get("/phonebook/search", {})
- .get("Credit", "Unknown")
+ jsonResp.get("paths", {}).get("/phonebook/search", {}).get("Credit", "Unknown")
  )
  credits_max = str(
- jsonResp.get("paths", {})
- .get("/phonebook/search", {})
- .get("CreditMax", "Unknown")
+ jsonResp.get("paths", {}).get("/phonebook/search", {}).get("CreditMax", "Unknown")
  )
  return credits + "/" + credits_max
  except Exception:

@@ -4863,25 +4732,26 @@ def getIntelxUrls():
  """
  Get URLs from the Intelligence X Phonebook search
  """
- global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram,
+ global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx

  # Write the file of URL's for the passed domain/URL
  try:
  if args.check_only:
  write(
- colored("Get URLs from Intelligence X: ", "cyan")
+ colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
  + colored("minimum 4 requests", "white")
  )
  checkIntelx = 4
  return

-
-
+ stopSourceIntelx = False
+ linksFoundIntelx = set()
+
  credits = getIntelxAccountInfo()
  if verbose():
  write(
  colored(
- "The Intelligence X URL requested to get links (Credits: "
+ "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
  + credits
  + "): ",
  "magenta",

@@ -4890,32 +4760,23 @@ def getIntelxUrls():
  )

  if not args.check_only:
- write(colored("
+ write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))

  # Get the domains from Intelligence X if the --no-subs wasn't passed
  if not args.no_subs:
  processIntelxType(1, credits)

  # Get the URLs from Intelligence X
-
+ if not intelxAPIIssue:
+ processIntelxType(3, credits)

-
-
-
-
-
-
-
- + "\n"
- )
- else:
- write(
- getSPACER(
- colored("Extra links found on intelx.io: ", "cyan")
- + colored(str(linkCount), "white")
- )
- + "\n"
- )
+ linkCountIntelx = len(linksFoundIntelx)
+ write(
+ colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
+ + colored(str(linkCountIntelx), "white")
+ )
+ linksFound.update(linksFoundIntelx)
+ linksFoundIntelx.clear()

  except Exception as e:
  writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
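A pattern repeated for each provider in 7.0 (linksFoundVirusTotal earlier, linksFoundIntelx here): collect links into a per-source set, print the per-source count, then merge into the shared linksFound set and clear the working set. A tiny illustrative sketch; linksFoundAdd here is only a stand-in for waymore's helper of the same name:

linksFound = set()          # shared result set
linksFoundIntelx = set()    # per-source working set

def linksFoundAdd(link, target):
    # Stand-in for waymore's helper, which also applies its URL filters before adding
    target.add(link)

linksFoundAdd("https://example.com/login", linksFoundIntelx)
linksFoundAdd("https://example.com/api", linksFoundIntelx)

print(f"Links found on intelx.io: {len(linksFoundIntelx)}")
linksFound.update(linksFoundIntelx)
linksFoundIntelx.clear()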
@@ -4968,27 +4829,23 @@ def processResponsesURLScan():
  indexPath = responseOutputDirectory + "waymore_index.txt"
  except Exception as e:
  if verbose():
- writerr(
- colored("ERROR processResponsesURLScan 4: " + str(e), "red")
- )
+ writerr(colored("ERROR processResponsesURLScan 4: " + str(e), "red"))

  # Get URLs from URLScan.io if the DOM ID's haven't been retrieved yet
- if
-
-
-
-
+ if stopProgram is None and not args.check_only:
+ if args.mode in ("R", "B"):
+ write(
+ colored(
+ "URLScan - [ INFO ] Getting list of response links (this can take a while for some domains)...",
+ "cyan",
+ )
  )
-
-
+ if args.mode == "R":
+ getURLScanUrls()

  # Check if a continueResp.URLScan.tmp and responses.URLScan.tmp files exists
  runPrevious = "n"
- if (
- not args.check_only
- and os.path.exists(continuePath)
- and os.path.exists(responsesPath)
- ):
+ if not args.check_only and os.path.exists(continuePath) and os.path.exists(responsesPath):

  # Load the links into the set
  with open(responsesPath, "rb") as fl:

@@ -4997,7 +4854,7 @@ def processResponsesURLScan():

  # Get the previous end position to start again at this point
  try:
- with open(continuePath
+ with open(continuePath) as fc:
  successCount = int(fc.readline().strip())
  except Exception:
  successCount = 0
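The continuePath/responsesPath handling above is waymore's resume mechanism: the responses file holds the previously gathered response links and the continue file holds how many of them were already downloaded. A small sketch of reading that state back; the file names and the use of pickle for the binary file are assumptions for illustration:

import os
import pickle

responsesPath = "responses.URLScan.tmp"     # hypothetical name; binary file of saved links
continuePath = "continueResp.URLScan.tmp"   # plain-text count of links already processed

linkRequests = []
successCount = 0

if os.path.exists(continuePath) and os.path.exists(responsesPath):
    with open(responsesPath, "rb") as fl:
        linkRequests = pickle.load(fl)      # assumed serialisation of the saved link set
    try:
        with open(continuePath) as fc:
            successCount = int(fc.readline().strip())
    except Exception:
        successCount = 0

# Resume downloading from where the previous run stopped
remaining = list(linkRequests)[successCount:]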
@@ -5082,25 +4939,6 @@ def processResponsesURLScan():
  "green",
  )
  )
- # if args.limit == 5000 and totalResponses == 5000:
- # writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests (the --limit argument defaults to '+str(DEFAULT_LIMIT)+')','cyan'))
- # else:
- # writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests','white'))
- # minutes = round(totalResponses*2.5 // 60)
- # hours = minutes // 60
- # days = hours // 24
- # if minutes < 5:
- # write(colored('\n-> Downloading the responses (depending on their size) should be quite quick!','green'))
- # elif hours < 2:
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(minutes)+' minutes.','green'))
- # elif hours < 6:
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','green'))
- # elif hours < 24:
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','yellow'))
- # elif days < 7:
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days. Consider using arguments -ko, -l, -ci, -from and -to wisely! ','red'))
- # else:
- # write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days!!! Consider using arguments -ko, -l, -ci, -from and -to wisely!','red'))
  write("")
  else:
  # If the limit has been set over the default, give a warning that this could take a long time!

@@ -5162,7 +5000,7 @@ def processResponsesURLScan():
  if failureCount > 0:
  if verbose():
  write(
- colored("
+ colored("URLScan - [ INFO ] Responses saved to ", "cyan")
  + colored(responseOutputDirectory, "white")
  + colored(" for " + subs + argsInput + ": ", "cyan")
  + colored(

@@ -5177,10 +5015,7 @@ def processResponsesURLScan():
  else:
  write(
  colored(
- "
- + subs
- + argsInput
- + ": ",
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
  "cyan",
  )
  + colored(

@@ -5195,7 +5030,10 @@ def processResponsesURLScan():
  else:
  if verbose():
  write(
- colored(
+ colored(
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
+ "cyan",
+ )
  + colored(responseOutputDirectory, "white")
  + colored(" for " + subs + argsInput + ": ", "cyan")
  + colored(

@@ -5209,10 +5047,7 @@ def processResponsesURLScan():
  else:
  write(
  colored(
- "
- + subs
- + argsInput
- + ": ",
+ "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
  "cyan",
  )
  + colored(

@@ -5225,9 +5060,7 @@ def processResponsesURLScan():
  )
  except Exception as e:
  if verbose():
- writerr(
- colored("ERROR processResponsesURLScan 5: " + str(e), "red")
- )
+ writerr(colored("ERROR processResponsesURLScan 5: " + str(e), "red"))

  totalFileCount = totalFileCount + fileCount
  except Exception as e:

@@ -5240,7 +5073,7 @@ def processResponsesWayback():
  """
  Get archived responses from Wayback Machine (archive.org)
  """
- global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount
+ global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount, current_response, current_session
  try:
  fileCount = 0
  failureCount = 0

@@ -5255,17 +5088,11 @@ def processResponsesWayback():
  indexPath = responseOutputDirectory + "waymore_index.txt"
  except Exception as e:
  if verbose():
- writerr(
- colored("ERROR processResponsesWayback 4: " + str(e), "red")
- )
+ writerr(colored("ERROR processResponsesWayback 4: " + str(e), "red"))

  # Check if a continueResp.tmp and responses.tmp files exists
  runPrevious = "n"
- if (
- not args.check_only
- and os.path.exists(continuePath)
- and os.path.exists(responsesPath)
- ):
+ if not args.check_only and os.path.exists(continuePath) and os.path.exists(responsesPath):

  # Load the links into the set
  with open(responsesPath, "rb") as fl:

@@ -5274,7 +5101,7 @@ def processResponsesWayback():

  # Get the previous end position to start again at this point
  try:
- with open(continuePath
+ with open(continuePath) as fc:
  successCount = int(fc.readline().strip())
  except Exception:
  successCount = 0

@@ -5349,9 +5176,7 @@ def processResponsesWayback():
  # Set mime content type filter
  filterMIME = ""
  if MATCH_MIME.strip() != "":
- filterMIME = "&filter=mimetype:" + re.escape(MATCH_MIME).replace(
- ",", "|"
- )
+ filterMIME = "&filter=mimetype:" + re.escape(MATCH_MIME).replace(",", "|")
  else:
  filterMIME = "&filter=!mimetype:warc/revisit"
  filterMIME = filterMIME + "|" + re.escape(FILTER_MIME).replace(",", "|")

@@ -5359,13 +5184,9 @@ def processResponsesWayback():
  # Set status code filter
  filterCode = ""
  if MATCH_CODE.strip() != "":
- filterCode = "&filter=statuscode:" + re.escape(MATCH_CODE).replace(
- ",", "|"
- )
+ filterCode = "&filter=statuscode:" + re.escape(MATCH_CODE).replace(",", "|")
  else:
- filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(
- ",", "|"
- )
+ filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(",", "|")

  # Set the collapse parameter value in the archive.org URL. From the Wayback API docs:
  # "A new form of filtering is the option to 'collapse' results based on a field, or a substring of a field.

@@ -5377,9 +5198,7 @@ def processResponsesWayback():
  collapse = "&collapse=timestamp:10"
  elif args.capture_interval == "d":  # get at most 1 capture per URL per day
  collapse = "&collapse=timestamp:8"
- elif
- args.capture_interval == "m"
- ):  # get at most 1 capture per URL per month
+ elif args.capture_interval == "m":  # get at most 1 capture per URL per month
  collapse = "&collapse=timestamp:6"

  url = (
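The filterMIME, filterCode and collapse fragments built above are appended to the Wayback Machine CDX query, and collapse=timestamp:N is what enforces the -ci capture interval (hour, day or month). A sketch of how those pieces combine into a query string; the example MATCH_/FILTER_ values and any CDX parameters beyond the ones shown in this hunk are illustrative only:

import re

FILTER_MIME = "image/png,image/jpeg"   # example values; waymore reads these from config.yml
FILTER_CODE = "404,301,302"
capture_interval = "d"                 # "h", "d" or "m", from the -ci argument

filterMIME = "&filter=!mimetype:warc/revisit|" + re.escape(FILTER_MIME).replace(",", "|")
filterCode = "&filter=!statuscode:" + re.escape(FILTER_CODE).replace(",", "|")

# collapse=timestamp:N keeps at most one capture per URL per hour/day/month
collapse = {
    "h": "&collapse=timestamp:10",
    "d": "&collapse=timestamp:8",
    "m": "&collapse=timestamp:6",
}[capture_interval]

url = (
    "https://web.archive.org/cdx/search/cdx?url=example.com/*&output=text"
    + filterMIME
    + filterCode
    + collapse
)
print(url)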
@@ -5397,18 +5216,18 @@ def processResponsesWayback():
  if verbose():
  write(
  colored(
- "
+ "Wayback - [ INFO ] The URL requested to get responses: ",
  "magenta",
  )
  + colored(url + "\n", "white")
  )

  if args.check_only:
- write(colored("
+ write(colored("Wayback - [ INFO ] Checking archived response requests...", "cyan"))
  else:
  write(
  colored(
- "
+ "Wayback - [ INFO ] Getting list of response links (this can take a while for some domains)...",
  "cyan",
  )
  )

@@ -5421,18 +5240,24 @@ def processResponsesWayback():
  session = requests.Session()
  session.mount("https://", HTTP_ADAPTER)
  session.mount("http://", HTTP_ADAPTER)
+ try:
+ current_session = session
+ except Exception:
+ pass
  resp = session.get(
  url,
  stream=True,
  headers={"User-Agent": userAgent},
  timeout=args.timeout,
  )
+ try:
+ current_response = resp
+ except Exception:
+ pass
  except ConnectionError:
  writerr(
  colored(
- getSPACER(
- "[ ERR ] Wayback Machine (archive.org) connection error"
- ),
+ getSPACER("Wayback - [ ERR ] Connection error"),
  "red",
  )
  )

@@ -5442,7 +5267,7 @@ def processResponsesWayback():
  except Exception as e:
  writerr(
  colored(
- getSPACER("[ ERR ] Couldn't get list of responses: " + str(e)),
+ getSPACER("Wayback - [ ERR ] Couldn't get list of responses: " + str(e)),
  "red",
  )
  )

@@ -5457,7 +5282,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "No archived responses were found on Wayback Machine (archive.org) for the given search parameters."
+ "Wayback - [ ERR ] No archived responses were found on Wayback Machine (archive.org) for the given search parameters."
  ),
  "red",
  )

@@ -5468,7 +5293,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
+ "Wayback - [ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
  ),
  "red",
  )

@@ -5479,7 +5304,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
+ "Wayback - [ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
  ),
  "red",
  )

@@ -5491,7 +5316,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "[ "
+ "Wayback - [ "
  + str(resp.status_code)
  + " ] Error for "
  + url

@@ -5506,7 +5331,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml"
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml"
  ),
  "red",
  )

@@ -5515,7 +5340,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed"
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed"
  ),
  "red",
  )

@@ -5525,7 +5350,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)"
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)"
  ),
  "red",
  )

@@ -5534,7 +5359,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "Failed to get links from Wayback Machine (archive.org) - check input domain and try again."
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - check input domain and try again."
  ),
  "red",
  )

@@ -5544,23 +5369,43 @@ def processResponsesWayback():
  pass

  # Go through the response to save the links found
-
+ try:
+ for line in resp.iter_lines():
+ try:
+ results = line.decode("utf-8")
+ parts = results.split(" ", 2)
+ timestamp = parts[0]
+ originalUrl = parts[1]
+ linksFoundResponseAdd(timestamp + "/" + originalUrl)
+ except Exception:
+ writerr(
+ colored(
+ getSPACER(
+ "ERROR processResponsesWayback 3: Cannot to get link from line: "
+ + str(line)
+ ),
+ "red",
+ )
+ )
+ finally:
  try:
-
- parts = results.split(" ", 2)
- timestamp = parts[0]
- originalUrl = parts[1]
- linksFoundResponseAdd(timestamp + "/" + originalUrl)
+ current_response = None
  except Exception:
-
-
-
-
-
-
-
-
-
+ pass
+ try:
+ current_session = None
+ except Exception:
+ pass
+
+ # Cleanup shared response/session references now the response has been processed
+ try:
+ current_response = None
+ except Exception:
+ pass
+ try:
+ current_session = None
+ except Exception:
+ pass

  # Remove any links that have URL exclusions
  linkRequests = []
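The new try/finally block above streams the CDX response with iter_lines and keeps only the first two space-separated fields of each line, storing them as timestamp/originalUrl for the later archived-response requests. A self-contained sketch of that per-line parsing on static sample data; linksFoundResponseAdd is a stand-in for waymore's helper:

# Two example lines in the shape the code above expects: "<timestamp> <originalUrl> ..."
sample_lines = [
    b"20230105123456 https://example.com/app.js text/javascript 200",
    b"20240218080000 https://example.com/login text/html 200",
]

linksFoundResponse = set()

def linksFoundResponseAdd(link):
    # Stand-in for waymore's helper that records a response link
    linksFoundResponse.add(link)

for line in sample_lines:
    results = line.decode("utf-8")
    parts = results.split(" ", 2)
    timestamp, originalUrl = parts[0], parts[1]
    linksFoundResponseAdd(timestamp + "/" + originalUrl)

print(sorted(linksFoundResponse))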
@@ -5574,8 +5419,7 @@ def processResponsesWayback():
  # b) it does not match the URL exclusions
  if (
  args.regex_after is None
- or re.search(args.regex_after, link, flags=re.IGNORECASE)
- is not None
+ or re.search(args.regex_after, link, flags=re.IGNORECASE) is not None
  ) and exclusionRegex.search(link) is None:
  linkRequests.append(link)

@@ -5594,7 +5438,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- 'Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'
+ 'Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'
  + originalUrl
  + "\") but they didn't match the input you gave. Check input and try again."
  ),

@@ -5605,7 +5449,7 @@ def processResponsesWayback():
  writerr(
  colored(
  getSPACER(
- "Failed to get links from Wayback Machine (archive.org) - check input and try again."
+ "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - check input and try again."
  ),
  "red",
  )

@@ -5748,7 +5592,7 @@ def processResponsesWayback():
  if failureCount > 0:
  if verbose():
  write(
- colored("
+ colored("Wayback - [ INFO ] Responses saved to ", "cyan")
  + colored(responseOutputDirectory, "white")
  + colored(" for " + subs + argsInput + ": ", "cyan")
  + colored(

@@ -5763,10 +5607,7 @@ def processResponsesWayback():
  else:
  write(
  colored(
- "
- + subs
- + argsInput
- + ": ",
+ "Wayback - [ INFO ] Responses saved for " + subs + argsInput + ": ",
  "cyan",
  )
  + colored(

@@ -5781,7 +5622,7 @@ def processResponsesWayback():
  else:
  if verbose():
  write(
- colored("
+ colored("Wayback - [ INFO ] Responses saved to ", "cyan")
  + colored(responseOutputDirectory, "white")
  + colored(" for " + subs + argsInput + ": ", "cyan")
  + colored(

@@ -5795,10 +5636,7 @@ def processResponsesWayback():
  else:
  write(
  colored(
- "
- + subs
- + argsInput
- + ": ",
+ "Wayback - [ INFO ] Responses saved for " + subs + argsInput + ": ",
  "cyan",
  )
  + colored(

@@ -5811,9 +5649,7 @@ def processResponsesWayback():
  )
  except Exception as e:
  if verbose():
- writerr(
- colored("ERROR processResponsesWayback 5: " + str(e), "red")
- )
+ writerr(colored("ERROR processResponsesWayback 5: " + str(e), "red"))

  totalFileCount = totalFileCount + fileCount
  except Exception as e:
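In the hunk at -5574 above, a response link is only kept when it matches the optional --regex-after pattern (if one was given) and does not match the URL exclusion regex. A compact sketch of that include/exclude decision with example patterns; waymore builds exclusionRegex from its configured URL exclusions rather than the literal used here:

import re

regex_after = r"\.js(\?|$)"   # e.g. only keep JavaScript responses; None disables this check
exclusionRegex = re.compile(r"\.(png|jpg|woff)(\?|$)", flags=re.IGNORECASE)

links = [
    "20230105123456/https://example.com/app.js",
    "20230105123456/https://example.com/logo.png",
    "20230105123456/https://example.com/index.html",
]

linkRequests = []
for link in links:
    if (
        regex_after is None
        or re.search(regex_after, link, flags=re.IGNORECASE) is not None
    ) and exclusionRegex.search(link) is None:
        linkRequests.append(link)

print(linkRequests)   # only the .js link survives both checks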
@@ -5911,8 +5747,7 @@ def notifyDiscord():
  writerr(
  colored(
  getSPACER(
- "WARNING: Failed to send notification to Discord - "
- + result.json()
+ "WARNING: Failed to send notification to Discord - " + result.json()
  ),
  "yellow",
  )

@@ -5920,9 +5755,7 @@ def notifyDiscord():
  except Exception as e:
  writerr(
  colored(
- getSPACER(
- "WARNING: Failed to send notification to Discord - " + str(e)
- ),
+ getSPACER("WARNING: Failed to send notification to Discord - " + str(e)),
  "yellow",
  )
  )

@@ -6037,9 +5870,7 @@ def combineInlineJS():

  totalSections = len(uniqueScripts)
  sectionCounter = 0  # Counter for inline JS sections
- currentOutputFile = os.path.join(
- responseOutputDirectory, outputFileTemplate.format(1)
- )
+ currentOutputFile = os.path.join(responseOutputDirectory, outputFileTemplate.format(1))
  currentSectionsWritten = 0  # Counter for sections written in current file

  if totalSections > 0:

@@ -6075,9 +5906,7 @@ def combineInlineJS():
  currentSectionsWritten = 1

  # Insert comment line for the beginning of the section
- inlineJSFile.write(
- f"//****** INLINE JS SECTION {sectionCounter} ******//\n\n"
- )
+ inlineJSFile.write(f"//****** INLINE JS SECTION {sectionCounter} ******//\n\n")

  # Write comments indicating the files the script was found in
  files = ""

@@ -6111,10 +5940,7 @@ def combineInlineJS():
  write(
  colored("Created files ", "cyan")
  + colored(
- responseOutputDirectory
- + "combinedInline{1-"
- + str(fileNumber)
- + "}.js",
+ responseOutputDirectory + "combinedInline{1-" + str(fileNumber) + "}.js",
  "white",
  )
  + colored(" (contents of inline JS)\n", "cyan")

@@ -6124,9 +5950,91 @@ def combineInlineJS():
  writerr(colored("ERROR combineInlineJS 1: " + str(e), "red"))


+ # Async wrapper functions for concurrent source fetching
+ async def fetch_wayback_async():
+ """Async wrapper for getWaybackUrls - runs in thread pool"""
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(None, getWaybackUrls)
+
+
+ async def fetch_commoncrawl_async():
+ """Async wrapper for getCommonCrawlUrls - runs in thread pool"""
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(None, getCommonCrawlUrls)
+
+
+ async def fetch_alienvault_async():
+ """Async wrapper for getAlienVaultUrls - runs in thread pool"""
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(None, getAlienVaultUrls)
+
+
+ async def fetch_urlscan_async():
+ """Async wrapper for getURLScanUrls - runs in thread pool"""
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(None, getURLScanUrls)
+
+
+ async def fetch_virustotal_async():
+ """Async wrapper for getVirusTotalUrls - runs in thread pool"""
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(None, getVirusTotalUrls)
+
+
+ async def fetch_intelx_async():
+ """Async wrapper for getIntelxUrls - runs in thread pool"""
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(None, getIntelxUrls)
+
+
+ async def fetch_all_sources_async():
+ """
+ Orchestrator function to fetch from all enabled sources concurrently.
+ Each source runs in its own thread pool executor while orchestration happens async.
+ """
+ global args, stopProgram, VIRUSTOTAL_API_KEY, INTELX_API_KEY, argsInput
+
+ tasks = []
+
+ # Build list of tasks for enabled sources
+ if not args.xwm and stopProgram is None:
+ tasks.append(("Wayback Machine", fetch_wayback_async()))
+ if not args.xcc and stopProgram is None:
+ tasks.append(("Common Crawl", fetch_commoncrawl_async()))
+ if not args.xav and stopProgram is None and not argsInput.startswith("."):
+ tasks.append(("AlienVault OTX", fetch_alienvault_async()))
+ if not args.xus and stopProgram is None:
+ tasks.append(("URLScan", fetch_urlscan_async()))
+ if not args.xvt and VIRUSTOTAL_API_KEY != "" and stopProgram is None:
+ tasks.append(("VirusTotal", fetch_virustotal_async()))
+ if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
+ tasks.append(("Intelligence X", fetch_intelx_async()))
+
+ if not tasks:
+ return
+
+ # Extract just the coroutines for gather
+ task_coros = [task[1] for task in tasks]
+
+ # Fetch all concurrently, capturing exceptions so one failure doesn't stop others
+ results = await asyncio.gather(*task_coros, return_exceptions=True)
+
+ # Check for any exceptions that occurred
+ for i, result in enumerate(results):
+ if isinstance(result, Exception):
+ source_name = tasks[i][0]
+ if verbose():
+ writerr(
+ colored(
+ getSPACER(f"ERROR in {source_name} during concurrent fetch: {str(result)}"),
+ "red",
+ )
+ )
+
+
  # Run waymore
  def main():
- global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY
+ global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx

  # Tell Python to run the handler() function when SIGINT is received
  signal(SIGINT, handler)
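The async wrappers added above do not make the underlying fetch functions asynchronous; each one hands the existing blocking function to the default thread-pool executor so that asyncio.gather can run the enabled sources side by side, and return_exceptions=True keeps one failing source from cancelling the rest. A self-contained sketch of the same pattern with dummy blocking functions in place of waymore's fetchers:

import asyncio
import time

def get_wayback():      # stand-in for a blocking fetcher such as getWaybackUrls
    time.sleep(1)
    return "wayback done"

def get_urlscan():      # stand-in for getURLScanUrls
    time.sleep(1)
    return "urlscan done"

async def run_blocking(func):
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(None, func)

async def fetch_all():
    tasks = [("Wayback Machine", run_blocking(get_wayback)),
             ("URLScan", run_blocking(get_urlscan))]
    # return_exceptions=True means one failing source doesn't cancel the others
    results = await asyncio.gather(*(t[1] for t in tasks), return_exceptions=True)
    for (name, _), result in zip(tasks, results):
        print(name, "->", result)

asyncio.run(fetch_all())   # finishes in about 1s rather than 2s because the sleeps overlap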
@@ -6295,13 +6203,7 @@ def main():
  action="store",
  type=int,
  help="Limit the number of Common Crawl index collections searched, e.g. '-lcc 10' will just search the latest 10 collections (default: 1). As of November 2024 there are currently 106 collections. Setting to 0 (default) will search ALL collections. If you don't want to search Common Crawl at all, use the -xcc option.",
-
- parser.add_argument(
- "-lcy",
- action="store",
- type=int,
- help="Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections or any year (but in conjuction with -lcc). For example, if you are only interested in data from 2015 and after, pass -lcy 2015. If you don't want to search Common Crawl at all, use the -xcc option.",
- default=0,
+ default=1,
  )
  parser.add_argument(
  "-t",

@@ -6316,10 +6218,10 @@ def main():
  parser.add_argument(
  "-p",
  "--processes",
- help="Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default:
+ help="Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 2)",
  action="store",
  type=validateArgProcesses,
- default=
+ default=2,
  metavar="<integer>",
  )
  parser.add_argument(

@@ -6420,13 +6322,6 @@ def main():
  showVersion()
  sys.exit()

- # If -lcc wasn't passed then set to the default of 1 if -lcy is 0. This will make them work together
- if args.lcc is None:
- if args.lcy == 0:
- args.lcc = 1
- else:
- args.lcc = 0
-
  # If --providers was passed, then manually set the exclude arguments;
  if args.providers:
  if "wayback" not in args.providers:

@@ -6531,6 +6426,12 @@ def main():
  indexFile = None
  path = ""
  stopSource = False
+ stopSourceWayback = False
+ stopSourceCommonCrawl = False
+ stopSourceAlienVault = False
+ stopSourceURLScan = False
+ stopSourceVirusTotal = False
+ stopSourceIntelx = False

  # Get the config settings from the config.yml file
  getConfig()
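Splitting stopSource into the per-provider flags initialised here is what lets the now-concurrent sources stop independently, for example when the SIGINT handler fires or when one provider hits a rate limit (see the IntelX 429 handling above). A simplified sketch of a signal handler and a worker loop sharing such a flag; the handler body and loop shown are illustrative, not waymore's actual ones:

import time
from signal import SIGINT, signal

stopSourceWayback = False
stopSourceURLScan = False

def handler(signum, frame):
    # On Ctrl-C, ask every source loop to stop at its next iteration
    global stopSourceWayback, stopSourceURLScan
    stopSourceWayback = True
    stopSourceURLScan = True

signal(SIGINT, handler)

def process_wayback_pages(pages):
    for page in pages:
        if stopSourceWayback:   # checked once per page, as in the loops above
            break
        time.sleep(0.1)         # placeholder for the real per-page work

process_wayback_pages(range(5))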
@@ -6548,29 +6449,17 @@ def main():
  # If the mode is U (URLs retrieved) or B (URLs retrieved AND Responses downloaded)
  if args.mode in ["U", "B"]:

- #
-
-
-
-
-
-
-
-
-
-
-
- # If not requested to exclude, get URLs from urlscan.io
- if not args.xus and stopProgram is None:
- getURLScanUrls()
-
- # If not requested to exclude, get URLs from virustotal.com if we have an API key
- if not args.xvt and VIRUSTOTAL_API_KEY != "" and stopProgram is None:
- getVirusTotalUrls()
-
- # If not requested to exclude, get URLs from intelx.io if we have an API key
- if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
- getIntelxUrls()
+ # Fetch from all sources concurrently using async/await
+ try:
+ asyncio.run(fetch_all_sources_async())
+ except Exception as e:
+ if verbose():
+ writerr(
+ colored(
+ getSPACER(f"ERROR during concurrent source fetching: {str(e)}"),
+ "red",
+ )
+ )

  # Output results of all searches
  processURLOutput()
|