waymore-7.7-py3-none-any.whl → waymore-8.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -1
- waymore/waymore.py +1363 -244
- {waymore-7.7.dist-info → waymore-8.1.dist-info}/METADATA +5 -2
- waymore-8.1.dist-info/RECORD +8 -0
- {waymore-7.7.dist-info → waymore-8.1.dist-info}/WHEEL +1 -1
- waymore-7.7.dist-info/RECORD +0 -8
- {waymore-7.7.dist-info → waymore-8.1.dist-info}/entry_points.txt +0 -0
- {waymore-7.7.dist-info → waymore-8.1.dist-info}/licenses/LICENSE +0 -0
- {waymore-7.7.dist-info → waymore-8.1.dist-info}/top_level.txt +0 -0
waymore/waymore.py
CHANGED
@@ -70,6 +70,7 @@ stopSourceAlienVault = False
 stopSourceURLScan = False
 stopSourceVirusTotal = False
 stopSourceIntelx = False
+stopSourceGhostArchive = False
 successCount = 0
 failureCount = 0
 fileCount = 0
@@ -79,6 +80,7 @@ totalPages = 0
 indexFile = None
 continueRespFile = None
 continueRespFileURLScan = None
+continueRespFileGhostArchive = None
 inputIsDomainANDPath = False
 inputIsSubDomain = False
 subs = "*."
@@ -102,6 +104,7 @@ checkAlienVault = 0
 checkURLScan = 0
 checkVirusTotal = 0
 checkIntelx = 0
+checkGhostArchive = 0
 argsInputHostname = ""
 responseOutputDirectory = ""
 urlscanRequestLinks = set()
@@ -112,11 +115,14 @@ linkCountAlienVault = 0
 linkCountURLScan = 0
 linkCountVirusTotal = 0
 linkCountIntelx = 0
+linkCountGhostArchive = 0
 linksFoundCommonCrawl = set()
 linksFoundAlienVault = set()
 linksFoundURLScan = set()
 linksFoundVirusTotal = set()
 linksFoundIntelx = set()
+linksFoundGhostArchive = set()
+ghostArchiveRequestLinks = set()

 # Thread lock for protecting shared state during concurrent operations
 links_lock = threading.Lock()
@@ -124,6 +130,7 @@ links_lock = threading.Lock()
 # Shared state for link collection across all sources
 linksFound = set()
 linkMimes = set()
+extraWarcLinks = set()  # Track extra URLs found in WARC files for mode B

 # Source Provider URLs
 WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
@@ -134,6 +141,8 @@ URLSCAN_DOM_URL = "https://urlscan.io/dom/"
 VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
 # Paid endpoint first, free endpoint as fallback
 INTELX_BASES = ["https://2.intelx.io", "https://free.intelx.io"]
+GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="
+GHOSTARCHIVE_DOM_URL = "https://ghostarchive.org"

 intelx_tls = threading.local()

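The two new GHOSTARCHIVE_* constants work together later in this diff: getGhostArchiveUrls pages through the search URL by appending a page number, and each /archive/{id} href captured from the results is resolved against GHOSTARCHIVE_DOM_URL. A minimal sketch of that URL construction, using a hypothetical example.com target:

from urllib.parse import quote

GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="
GHOSTARCHIVE_DOM_URL = "https://ghostarchive.org"

domain = "example.com"  # hypothetical target; waymore derives this from its input
baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(domain))
print(baseUrl + "0")                            # first search results page
print(GHOSTARCHIVE_DOM_URL + "/archive/gkOOR")  # an archive page, built from a captured href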
@@ -247,10 +256,10 @@ DEFAULT_LIMIT = 5000
 DEFAULT_TIMEOUT = 30

 # Exclusions used to exclude responses we will try to get from web.archive.org
-DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx"
+DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx,.avif"

 # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
-DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/
+DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2,image/avif"

 # Response code exclusions we will use to filter links and responses from web.archive.org through their API
 DEFAULT_FILTER_CODE = "404,301,302"
@@ -743,7 +752,7 @@ def handler(signal_received, frame):
     This function is called if Ctrl-C is called by the user
     An attempt will be made to try and clean up properly
     """
-    global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
+    global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, current_response, current_session

     if stopProgram is not None:
         stopProgramCount = stopProgramCount + 1
@@ -778,6 +787,7 @@ def handler(signal_received, frame):
         stopSourceURLScan = True
         stopSourceVirusTotal = True
         stopSourceIntelx = True
+        stopSourceGhostArchive = True
         # Try to close any active response or session to interrupt blocking network I/O
         try:
             if current_response is not None:
@@ -1324,6 +1334,46 @@ def getConfig():
         configPath = Path(waymorePath / "config.yml")
     else:
         configPath = Path(args.config)
+
+    # If the config file doesn't exist, create the default one
+    if not os.path.isfile(configPath):
+        try:
+            # Make sure the directory exists
+            os.makedirs(os.path.dirname(configPath), exist_ok=True)
+            # Create the default config content using the DEFAULT_* constants
+            defaultConfig = f"""FILTER_CODE: {DEFAULT_FILTER_CODE}
+FILTER_MIME: {DEFAULT_FILTER_MIME}
+FILTER_URL: {DEFAULT_FILTER_URL}
+FILTER_KEYWORDS: {DEFAULT_FILTER_KEYWORDS}
+URLSCAN_API_KEY:
+VIRUSTOTAL_API_KEY:
+CONTINUE_RESPONSES_IF_PIPED: True
+WEBHOOK_DISCORD: YOUR_WEBHOOK
+TELEGRAM_BOT_TOKEN: YOUR_TOKEN
+TELEGRAM_CHAT_ID: YOUR_CHAT_ID
+DEFAULT_OUTPUT_DIR:
+INTELX_API_KEY:
+SOURCE_IP:
+"""
+            with open(configPath, "w", encoding="utf-8") as f:
+                f.write(defaultConfig)
+            writerr(
+                colored(
+                    'Config file not found - created default config at "'
+                    + str(configPath)
+                    + '"',
+                    "yellow",
+                )
+            )
+        except Exception as e:
+            writerr(
+                colored(
+                    "Config file not found, but failed to create default config file: "
+                    + str(e),
+                    "red",
+                )
+            )
+
     config = yaml.safe_load(open(configPath))
     try:
         FILTER_URL = config.get("FILTER_URL")
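One detail of the new default-config creation worth spelling out: the DEFAULT_* filter values are written as plain (unquoted) YAML scalars, and the yaml.safe_load call that follows reads the comma-separated lists back as single strings, which the filter code later splits on commas. A small illustration of that round-trip (abbreviated values, not waymore's actual file):

import yaml

sample = "FILTER_CODE: 404,301,302\nCONTINUE_RESPONSES_IF_PIPED: True\n"
config = yaml.safe_load(sample)
print(type(config["FILTER_CODE"]).__name__)   # str - the commas keep it from parsing as a number
print(config["FILTER_CODE"].split(","))       # ['404', '301', '302']
print(config["CONTINUE_RESPONSES_IF_PIPED"])  # True - YAML does parse this one as a boolean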
@@ -1753,11 +1803,15 @@ def printProgressBar(

 def filehash(text):
     """
-    Generate a hash value for the passed string. This is used for the file name of a downloaded archived response
+    Generate a hash value for the passed string or bytes. This is used for the file name of a downloaded archived response
     """
     hash = 0
     for ch in text:
-        hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
+        # Handle both str (gives chars needing ord()) and bytes (gives ints directly)
+        if isinstance(ch, int):
+            hash = (hash * 281 ^ ch * 997) & 0xFFFFFFFFFFF
+        else:
+            hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
     return str(hash)

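The point of the new isinstance branch: iterating a str yields single-character strings (which need ord()), while iterating bytes yields ints directly, so the same loop can now hash downloaded binary content. A condensed restatement of the patched function and a quick check of the equivalence (not the package's exact code):

def filehash(text):
    hash = 0
    for ch in text:
        code = ch if isinstance(ch, int) else ord(ch)  # bytes yield ints, str yields chars
        hash = (hash * 281 ^ code * 997) & 0xFFFFFFFFFFF
    return str(hash)

assert filehash("abc") == filehash(b"abc")  # ASCII bytes iterate as the same code points
print(filehash(b"\x89PNG\r\n"))             # binary input now hashes instead of raising TypeError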
@@ -1945,7 +1999,7 @@ def processArchiveUrl(url):
     try:
         try:
             try:
-                if os.environ.get("USER") == "xnl":
+                if verbose() and os.environ.get("USER") == "xnl":
                     writerr(
                         colored(
                             "[ DBG ] Requesting file " + archiveUrl,
@@ -2265,7 +2319,7 @@ def processArchiveUrl(url):
                     debugText = "INTERNET ARCHIVE"
                 elif archiveHtml.lower().find("wombat") > 0:
                     debugText = "WOMBAT (JS)"
-                if debugText != "":
+                if verbose() and debugText != "":
                     writerr(
                         colored(
                             getSPACER(
@@ -2280,16 +2334,17 @@ def processArchiveUrl(url):
                 )
             )
         except Exception as e:
-            writerr(
-                colored(
-                    '[ DBG ] Error - Failed to output debug info for "'
-                    + archiveUrl
-                    + '": '
-                    + str(e),
-                    "red",
-                    attrs=["dark"],
+            if verbose():
+                writerr(
+                    colored(
+                        '[ DBG ] Error - Failed to output debug info for "'
+                        + archiveUrl
+                        + '": '
+                        + str(e),
+                        "red",
+                        attrs=["dark"],
+                    )
                 )
-            )
             pass

         successCount = successCount + 1
@@ -2760,17 +2815,20 @@ def validateArgProviders(x):
     - urlscan
     - virustotal
     - intelx
+    - ghostarchive
     """
     invalid = False
     x = x.lower()
     providers = x.split(",")
     for provider in providers:
-        if not re.fullmatch(r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider):
+        if not re.fullmatch(
+            r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider
+        ):
             invalid = True
             break
     if invalid:
         raise argparse.ArgumentTypeError(
-            "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx"
+            "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
         )
     return x
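validateArgProviders is shaped like an argparse type= converter: it normalizes the value and raises argparse.ArgumentTypeError on anything outside the allowed set. A standalone sketch of that wiring (the -p/--providers flag name here is illustrative, not confirmed by this diff):

import argparse
import re

def validateArgProviders(x):
    x = x.lower()
    for provider in x.split(","):
        if not re.fullmatch(r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider):
            raise argparse.ArgumentTypeError(
                "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
            )
    return x

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--providers", type=validateArgProviders, default="wayback,commoncrawl")
print(parser.parse_args(["-p", "Wayback,GHOSTARCHIVE"]).providers)  # accepted and lowercased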
@@ -3528,181 +3586,697 @@ def getURLScanDOM(originalUrl, domUrl):
         writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))


-def format_date_for_urlscan(date_str):
-    # Handle different lengths of input
-    if len(date_str) == 4:  # YYYY
-        date_str += "0101"
-    elif len(date_str) == 6:  # YYYYMM
-        date_str += "01"
-
-    # Convert to YYYY-MM-DD format
-    try:
-        formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
-        return formatted_date
-    except Exception:
-        return ""
-
-
-def getURLScanUrls():
+def getGhostArchiveWARC(originalUrl, domUrl):
     """
-    Get URLs from the URLSCan API, urlscan.io
+    Get the DOM for the passed GhostArchive link - parses WARC files containing multiple request/response pairs
     """
-    global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
-
-    # Write the file of URL's for the passed domain/URL
+    global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, argsInputHostname, REGEX_404, linksFound, extraWarcLinks, links_lock
     try:
-        requestsMade = 0
-        stopSourceURLScan = False
-        linksFoundURLScan = set()
-        totalUrls = 0
-        checkResponse = True
-
-        # Set the URL to just the hostname
-        url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
+        if stopProgram is None:

-        # If the --from-date or --to-date parameters were paassed then also add a date filter
-        if args.from_date or args.to_date:
-            if args.from_date:
-                fromDate = format_date_for_urlscan(str(args.from_date)[:8])
-            else:
-                fromDate = "2016-01-01"  # The year URLScan started
-            if args.to_date:
-                toDate = format_date_for_urlscan(str(args.to_date)[:8])
-            else:
-                toDate = "now"
-            url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
-        else:
-            url = url.replace("{DATERANGE}", "")
+            # The WARC files are found by replacing /archive with /chimurai4 and using the .warc file extension
+            warcUrl = domUrl.replace("/archive", "/chimurai4") + ".warc"

-
-        if verbose():
-            if args.mode == "R":
-                write(
-                    colored(
-                        "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ", "magenta"
-                    )
-                    + colored(url + "\n", "white")
-                )
-            else:
-                write(
-                    colored(
-                        "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
-                    )
-                    + colored(url + "\n", "white")
-                )
+            # Get memory usage every 100 responses
+            if (successCount + failureCount) % 100 == 0:
+                try:
+                    getMemory()
+                except Exception:
+                    pass

-        if args.mode in ("U", "B") and not args.check_only:
-            write(
-                colored(
-                    "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
-                    "cyan",
-                )
-            )
+            # Fetch content
+            try:
+                # Show progress bar
+                fillTest = (successCount + failureCount) % 2
+                fillChar = "o"
+                if fillTest == 0:
+                    fillChar = "O"
+                suffix = "Complete "

-        # Get the first page from urlscan.io
-        try:
-            # Choose a random user agent string to use for any requests
-            # For other sources we would use `random.choice(USER_AGENT)` to asignn a random user-agent, but it seems
-            # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
-            # be successful all the time
-            userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
-            session = requests.Session()
-            session.mount("https://", HTTP_ADAPTER)
-            session.mount("http://", HTTP_ADAPTER)
-            # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
-            resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
-            requestsMade = requestsMade + 1
-        except Exception as e:
-            write(
-                colored(
-                    "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
-                    "red",
+                printProgressBar(
+                    successCount + failureCount,
+                    totalResponses,
+                    prefix="Processing " + str(totalResponses) + " WARC files:",
+                    suffix=suffix,
+                    length=getProgressBarLength(),
+                    fill=fillChar,
                 )
-            )
-            return

-        # If the rate limit was reached then determine if to wait and then try again
-        if resp.status_code == 429:
-            # Get the number of seconds the rate limit resets
-            match = re.search(r"Reset in (\d+) seconds", resp.text, flags=re.IGNORECASE)
-            if match is not None:
-                seconds = int(match.group(1))
-                if seconds <= args.urlscan_rate_limit_retry * 60:
-                    writerr(
-                        colored(
-                            "URLScan - [ 429 ] Rate limit reached, so waiting for another "
-                            + str(seconds)
-                            + " seconds before continuing...",
-                            "yellow",
-                        )
-                    )
-                    # Wait can be interrupted by SIGINT via interrupt_event
-                    interrupt_event.clear()
-                    if interrupt_event.wait(seconds + 1):
-                        # Interrupted by SIGINT
-                        return
+                try:
                     try:
-                        resp = session.get(
-                            url,
-                            headers={
-                                "User-Agent": userAgent,
-                                "API-Key": URLSCAN_API_KEY,
-                            },
-                        )
-                        requestsMade = requestsMade + 1
-                    except Exception as e:
-                        write(
-                            colored(
-                                "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
-                                "red",
+                        if verbose() and os.environ.get("USER") == "xnl":
+                            writerr(
+                                colored(
+                                    "[ DBG ] Requesting file " + warcUrl,
+                                    "yellow",
+                                    attrs=["dark"],
+                                )
                             )
+                    except Exception:
+                        pass
+
+                    # Choose a random user agent string to use for any requests
+                    userAgent = random.choice(USER_AGENT)
+                    session = requests.Session()
+                    session.mount("https://", HTTP_ADAPTER)
+                    session.mount("http://", HTTP_ADAPTER)
+
+                    # Retry loop for 503 or maintenance responses
+                    maxRetries = 3
+                    warcBytes = b""
+                    for attempt in range(maxRetries):
+                        resp = session.get(
+                            warcUrl,
+                            headers={"User-Agent": userAgent},
+                            allow_redirects=True,
+                            timeout=args.timeout,
                         )
+                        warcBytes = resp.content
+
+                        # Check if we need to retry (decode just for this check)
+                        try:
+                            warcTextCheck = warcBytes.decode("utf-8", errors="replace").lower()
+                        except Exception:
+                            warcTextCheck = ""
+                        if resp.status_code == 503 or "website under maintenance" in warcTextCheck:
+                            if attempt < maxRetries - 1:
+                                import time
+
+                                time.sleep(0.5)
+                                continue
+                        break
+
+                    # Parse the WARC file to extract multiple responses
+                    # WARC header lines are text, but response bodies may be binary
+                    # Split by line separator but keep bytes for body extraction
+                    lineBytes = warcBytes.split(b"\n")
+                    lines = [lb.decode("utf-8", errors="replace") for lb in lineBytes]
+
+                    # State machine to track parsing
+                    currentTargetUri = ""
+                    inResponse = False
+                    contentType = ""
+                    responsesFound = (
+                        []
+                    )  # List of (targetUri, contentType, responseBytes, httpStatusCode)
+
+                    i = 0
+                    skipCurrentResponse = False  # Initialize before loop
+                    pendingResponseType = (
+                        False  # Track if we saw WARC-Type: response and are waiting for Target-URI
+                    )
+                    responseStartIdx = -1  # Initialize before loop
+                    httpStatusCode = ""  # Initialize before loop
+                    while i < len(lines) and stopProgram is None and not stopSourceGhostArchive:
+                        line = lines[i]
+
+                        # When we see a new WARC record start, reset pending state
+                        if line.startswith("WARC/1.0"):
+                            # If we were in a response and collecting, save it before moving to new record
+                            if inResponse and responseStartIdx >= 0:
+                                responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:i])
+                                responsesFound.append(
+                                    (
+                                        currentTargetUri,
+                                        contentType,
+                                        responseBodyBytes,
+                                        httpStatusCode if "httpStatusCode" in dir() else "",
+                                    )
+                                )
+                            inResponse = False
+                            responseStartIdx = -1
+                            contentType = ""
+                            httpStatusCode = ""
+                            pendingResponseType = False
+                            skipCurrentResponse = False
+
+                        # Look for WARC-Type: response - mark that we're in a response record header
+                        elif line.startswith("WARC-Type: response"):
+                            pendingResponseType = True
+                            inResponse = False  # Don't start capturing body yet
+                            responseStartIdx = -1
+                            contentType = ""
+
+                        # Look for WARC-Target-URI to get the request URL
+                        elif line.startswith("WARC-Target-URI:"):
+                            currentTargetUri = line.split(":", 1)[1].strip()
+                            skipCurrentResponse = False
+
+                            # Check: URL host must contain the input hostname
+                            if argsInputHostname:
+                                try:
+                                    parsed = urlparse(currentTargetUri)
+                                    host = parsed.netloc.lower()
+                                    if argsInputHostname.lower() not in host:
+                                        skipCurrentResponse = True
+                                except Exception:
+                                    skipCurrentResponse = True
+
+                            # Check: Filter by URL (FILTER_URL)
+                            if not skipCurrentResponse and FILTER_URL and currentTargetUri:
+                                filterUrls = [u.strip().lower() for u in FILTER_URL.split(",")]
+                                for filterUrl in filterUrls:
+                                    if filterUrl in currentTargetUri.lower():
+                                        skipCurrentResponse = True
+                                        break
+
+                            # If we were waiting for Target-URI after seeing WARC-Type: response, and it's valid, start response mode
+                            if pendingResponseType and not skipCurrentResponse:
+                                inResponse = True
+                                pendingResponseType = False
+
+                        # If we're in a response section (after seeing both WARC-Type: response and valid WARC-Target-URI)
+                        elif inResponse:
+                            # Check for HTTP start and capture status code
+                            if line.startswith("HTTP"):
+                                # Extract status code (e.g., "HTTP/1.1 200 OK" -> "200")
+                                try:
+                                    httpStatusCode = line.split()[1]
+                                except Exception:
+                                    httpStatusCode = ""
+
+                                # Early check: Filter by HTTP status code (FILTER_CODE)
+                                if FILTER_CODE and httpStatusCode:
+                                    filterCodes = [c.strip() for c in FILTER_CODE.split(",")]
+                                    if httpStatusCode in filterCodes:
+                                        inResponse = False
+                                        responseStartIdx = -1
+                                        i += 1
+                                        continue
+
+                                responseStartIdx = i  # Mark start of response
+                            elif responseStartIdx >= 0:
+                                # Capture Content-Type if present (case-insensitive check)
+                                if line.lower().startswith("content-type:"):
+                                    try:
+                                        contentType = (
+                                            line.split(":", 1)[1].strip().split(";")[0].lower()
+                                        )
+                                    except Exception:
+                                        pass
+
+                                    # Early check: Filter by MIME type (FILTER_MIME)
+                                    if FILTER_MIME and contentType:
+                                        filterMimes = [
+                                            m.strip().lower() for m in FILTER_MIME.split(",")
+                                        ]
+                                        if contentType in filterMimes:
+                                            inResponse = False
+                                            responseStartIdx = -1
+                                            i += 1
+                                            continue
+
+                        i += 1
+
+                    if stopProgram is not None:
                         return

-                    # If the rate limit was reached or if a 401 (which likely means the API key isn't valid), try without API key
-                    if resp.status_code in (401, 429):
-                        if URLSCAN_API_KEY != "":
-                            try:
-                                if resp.status_code == 429:
-                                    writerr(
-                                        colored(
-                                            "URLScan - [ 429 ] Rate limit reached so trying without API Key...",
-                                            "red",
-                                        )
-                                    )
-                                else:
-                                    writerr(
-                                        colored(
-                                            "URLScan - [ INF ] The API Key is invalid so trying without API Key...",
-                                            "red",
+                    # Don't forget the last response if file doesn't end with WARC/1.0
+                    if inResponse and responseStartIdx >= 0:
+                        responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:])
+                        responsesFound.append(
+                            (
+                                currentTargetUri,
+                                contentType,
+                                responseBodyBytes,
+                                httpStatusCode if "httpStatusCode" in dir() else "",
                             )
                         )
-                                # Set key to blank for further requests
-                                URLSCAN_API_KEY = ""
-                                session_no_key = requests.Session()
-                                session_no_key.mount("https://", HTTP_ADAPTER)
-                                session_no_key.mount("http://", HTTP_ADAPTER)
-                                resp = session_no_key.get(url, headers={"User-Agent": userAgent})
-                            except Exception as e:
-                                writerr(
-                                    colored(
-                                        "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
-                                        "red",
-                                    )
-                                )
-                                checkResponse = False

-                            # If the rate limit was reached end now
-                            if resp.status_code == 429:
-                                writerr(
-                                    colored(
-                                        "URLScan - [ 429 ] Rate limit reached without API Key so unable to get links.",
-                                        "red",
+                    # Process each response found
+                    for targetUri, contentType, responseBytes, httpStatusCode in responsesFound:
+                        if stopProgram is not None:
+                            break
+
+                        if not responseBytes:
+                            continue
+
+                        # Split HTTP header from body in bytes (look for \r\n\r\n or \n\n separator)
+                        if b"\r\n\r\n" in responseBytes:
+                            bodyBytes = responseBytes.split(b"\r\n\r\n", 1)[1]
+                        elif b"\n\n" in responseBytes:
+                            bodyBytes = responseBytes.split(b"\n\n", 1)[1]
+                        else:
+                            bodyBytes = responseBytes
+
+                        # Skip empty bodies or "not found" responses
+                        if not bodyBytes or bodyBytes.lower().strip() == b"not found":
+                            continue
+
+                        # If -f / --filter-responses-only is passed, track all URLs immediately (before filtering)
+                        if args.mode == "B" and args.filter_responses_only and targetUri:
+                            with links_lock:
+                                if targetUri not in linksFound and targetUri not in extraWarcLinks:
+                                    extraWarcLinks.add(targetUri)
+
+                        # Use isBinaryContent to detect if this is binary content
+                        isBinary = isBinaryContent(bodyBytes, contentType, targetUri)
+
+                        if isBinary:
+                            # Binary file - save raw bytes
+                            archiveContent = bodyBytes
+                            archiveHtml = None
+                        else:
+                            # Text file - decode to string
+                            archiveHtml = bodyBytes.decode("utf-8", errors="replace")
+                            archiveContent = None
+
+                            # Collapse multiple blank lines into one
+                            archiveHtml = re.sub(r"\n{3,}", "\n\n", archiveHtml)
+
+                            # Skip if body is empty after processing
+                            if not archiveHtml.strip():
+                                continue
+
+                        if stopProgram is not None:
+                            break
+
+                        # Determine if this is HTML or JS based on content-type or URL
+                        isHtml = (
+                            contentType in ["text/html", "application/xhtml+xml"]
+                            or targetUri.lower().endswith(".html")
+                            or targetUri.lower().endswith(".htm")
                         )
-                                )
-                                checkResponse = False
-                        else:
-                            writerr(
+                        isJs = contentType in [
+                            "text/javascript",
+                            "application/javascript",
+                            "application/x-javascript",
+                        ] or targetUri.lower().endswith(".js")
+
+                        # Add the URL as a comment at the start of the response (only for text files)
+                        if not isBinary and args.url_filename:
+                            if isHtml:
+                                archiveHtml = (
+                                    "<!-- Original URL: " + targetUri + " -->\n" + archiveHtml
+                                )
+                            elif isJs:
+                                archiveHtml = (
+                                    "/* Original URL: " + targetUri + " */\n" + archiveHtml
+                                )
+
+                        # Create file name based on url or hash value
+                        if args.url_filename:
+                            fileName = targetUri.replace("/", "-").replace(":", "")
+                            fileName = fileName[0:254]
+                            hashValue = ""
+                        else:
+                            # Hash the content to get the filename
+                            if isBinary:
+                                hashValue = filehash(archiveContent)
+                            else:
+                                hashValue = filehash(archiveHtml)
+                            fileName = hashValue
+
+                        # Determine extension of file from the content-type or URL
+                        extension = ""
+                        try:
+                            # Get path extension from URL
+                            if "://" in targetUri:
+                                targetUrl = "https://" + targetUri.split("://")[1]
+                                parsed = urlparse(targetUrl.strip())
+                                path = parsed.path
+                                extension = path[path.rindex(".") + 1 :]
+                                if "/" in extension:
+                                    extension = ""
+                                # If extension is over 6 characters, it's likely not a real extension (e.g. API endpoint ID)
+                                if len(extension) > 6:
+                                    extension = ""
+                        except Exception:
+                            pass
+
+                        # If extension is blank, determine from MIME type or content
+                        if extension == "":
+                            if isBinary:
+                                # Binary file extensions from MIME type
+                                if contentType:
+                                    if "image/png" in contentType:
+                                        extension = "png"
+                                    elif (
+                                        "image/jpeg" in contentType
+                                        or "image/jpg" in contentType
+                                    ):
+                                        extension = "jpg"
+                                    elif "image/gif" in contentType:
+                                        extension = "gif"
+                                    elif "image/webp" in contentType:
+                                        extension = "webp"
+                                    elif "application/pdf" in contentType:
+                                        extension = "pdf"
+                                    elif "application/zip" in contentType:
+                                        extension = "zip"
+                                    else:
+                                        extension = "bin"
+                                else:
+                                    extension = "bin"
+                            else:
+                                # Text file extensions
+                                if contentType and "javascript" in contentType.lower():
+                                    extension = "js"
+                                elif contentType and "html" in contentType.lower():
+                                    extension = "html"
+                                elif contentType and "json" in contentType.lower():
+                                    extension = "json"
+                                elif contentType and "text" in contentType.lower():
+                                    extension = "txt"
+                                elif archiveHtml and (
+                                    archiveHtml.lower().strip().endswith("</html>")
+                                    or archiveHtml.lower().strip().endswith("</body>")
+                                    or archiveHtml.lower().strip().startswith("<!doctype html")
+                                    or archiveHtml.lower().strip().startswith("<html")
+                                    or archiveHtml.lower().strip().startswith("<head")
+                                ):
+                                    extension = "html"
+                                else:
+                                    extension = "unknown"
+
+                        fileName = fileName + "." + extension
+
+                        # Determine file path
+                        if args.output_responses != "":
+                            filePath = args.output_responses + "/" + f"{fileName}"
+                        else:
+                            filePath = (
+                                DEFAULT_OUTPUT_DIR
+                                + "/results/"
+                                + str(argsInput).replace("/", "-")
+                                + "/"
+                                + f"{fileName}"
+                            )
+
+                        if stopProgram is not None:
+                            break
+
+                        # Write the file
+                        try:
+                            if isBinary:
+                                # Binary file - write as bytes
+                                responseFile = open(filePath, "wb")
+                                responseFile.write(archiveContent)
+                            else:
+                                # Text file - write as UTF-8
+                                responseFile = open(filePath, "w", encoding="utf8")
+                                responseFile.write(archiveHtml)
+                            responseFile.close()
+                            with links_lock:
+                                fileCount = fileCount + 1
+
+                            # Track extra URLs found in WARC files for mode B (only when -f is not passed, since we track earlier if it is)
+                            if args.mode == "B" and not args.filter_responses_only and targetUri:
+                                with links_lock:
+                                    if (
+                                        targetUri not in linksFound
+                                        and targetUri not in extraWarcLinks
+                                    ):
+                                        extraWarcLinks.add(targetUri)
+                        except Exception as e:
+                            writerr(
+                                colored(
+                                    "GhostArchive - [ ERR ] Failed to write file "
+                                    + filePath
+                                    + ": "
+                                    + str(e),
+                                    "red",
+                                )
+                            )
+
+                        # Write the hash value and URL to the index file
+                        if not args.url_filename and hashValue:
+                            try:
+                                timestamp = str(datetime.now())
+                                indexFile.write(
+                                    hashValue
+                                    + ","
+                                    + domUrl
+                                    + "#"
+                                    + targetUri
+                                    + " ,"
+                                    + timestamp
+                                    + "\n"
+                                )
+                                indexFile.flush()
+                            except Exception as e:
+                                writerr(
+                                    colored(
+                                        'GhostArchive - [ ERR ] Failed to write to waymore_index.txt for "'
+                                        + warcUrl
+                                        + '": '
+                                        + str(e),
+                                        "red",
+                                    )
+                                )
+
+                    successCount = successCount + 1
+
+                except WayBackException:
+                    failureCount = failureCount + 1
+
+                except Exception as e:
+                    failureCount = failureCount + 1
+                    if verbose():
+                        # Simplify common error messages
+                        if "connection broken" in str(e).lower():
+                            errorMsg = "Connection Broken"
+                        else:
+                            errorMsg = str(e)
+                        try:
+                            statusCode = (
+                                resp.status_code if "resp" in dir() and resp is not None else "ERR"
+                            )
+                            writerr(
+                                colored(
+                                    "GhostArchive - [ "
+                                    + str(statusCode)
+                                    + ' ] Failed to get response for "'
+                                    + warcUrl
+                                    + '": '
+                                    + errorMsg,
+                                    "red",
+                                )
+                            )
+                        except Exception:
+                            writerr(
+                                colored(
+                                    'GhostArchive - [ ERR ] Failed to get response for "'
+                                    + warcUrl
+                                    + '": '
+                                    + errorMsg,
+                                    "red",
+                                )
+                            )
+
+                # Show memory usage if -v option chosen, and check memory every 25 responses (or if its the last)
+                if (successCount + failureCount) % 25 == 1 or (
+                    successCount + failureCount
+                ) == totalResponses:
+                    try:
+                        getMemory()
+                        if verbose():
+                            suffix = (
+                                "Complete (Mem Usage "
+                                + humanReadableSize(currentMemUsage)
+                                + ", Total Mem "
+                                + str(currentMemPercent)
+                                + "%) "
+                            )
+                    except Exception:
+                        if verbose():
+                            suffix = 'Complete (To show mem use, run "pip install psutil")'
+                    printProgressBar(
+                        successCount + failureCount,
+                        totalResponses,
+                        prefix="Processing " + str(totalResponses) + " WARC files:",
+                        suffix=suffix,
+                        length=getProgressBarLength(),
+                        fill=fillChar,
+                    )
+
+            except Exception as e:
+                if verbose():
+                    writerr(
+                        colored(
+                            'GhostArchive - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red"
+                        )
+                    )
+
+    except Exception as e:
+        writerr(colored("ERROR getGhostArchiveWARC 1: " + str(e), "red"))
+
+
+def format_date_for_urlscan(date_str):
+    # Handle different lengths of input
+    if len(date_str) == 4:  # YYYY
+        date_str += "0101"
+    elif len(date_str) == 6:  # YYYYMM
+        date_str += "01"
+
+    # Convert to YYYY-MM-DD format
+    try:
+        formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
+        return formatted_date
+    except Exception:
+        return ""
+
+
+def getURLScanUrls():
+    """
+    Get URLs from the URLSCan API, urlscan.io
+    """
+    global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
+
+    # Write the file of URL's for the passed domain/URL
+    try:
+        requestsMade = 0
+        stopSourceURLScan = False
+        linksFoundURLScan = set()
+        totalUrls = 0
+        checkResponse = True
+
+        # Set the URL to just the hostname
+        url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
+
+        # If the --from-date or --to-date parameters were paassed then also add a date filter
+        if args.from_date or args.to_date:
+            if args.from_date:
+                fromDate = format_date_for_urlscan(str(args.from_date)[:8])
+            else:
+                fromDate = "2016-01-01"  # The year URLScan started
+            if args.to_date:
+                toDate = format_date_for_urlscan(str(args.to_date)[:8])
+            else:
+                toDate = "now"
+            url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
+        else:
+            url = url.replace("{DATERANGE}", "")
+
+        if verbose():
+            if args.mode == "R":
+                write(
+                    colored(
+                        "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
+                        "magenta",
+                    )
+                    + colored(url + "\n", "white")
+                )
+            else:
+                write(
+                    colored(
+                        "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
+                    )
+                    + colored(url + "\n", "white")
+                )
+
+        if args.mode in ("U", "B") and not args.check_only:
+            write(
+                colored(
+                    "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
+                    "cyan",
+                )
+            )
+
+        # Get the first page from urlscan.io
+        try:
+            # Choose a random user agent string to use for any requests
+            # For other sources we would use `random.choice(USER_AGENT)` to asignn a random user-agent, but it seems
+            # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
+            # be successful all the time
+            userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
+            session = requests.Session()
+            session.mount("https://", HTTP_ADAPTER)
+            session.mount("http://", HTTP_ADAPTER)
+            # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
+            resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
+            requestsMade = requestsMade + 1
+        except Exception as e:
+            write(
+                colored(
+                    "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
+                    "red",
+                )
+            )
+            return
+
+        # If the rate limit was reached then determine if to wait and then try again
+        if resp.status_code == 429:
+            # Get the number of seconds the rate limit resets
+            match = re.search(r"Reset in (\d+) seconds", resp.text, flags=re.IGNORECASE)
+            if match is not None:
+                seconds = int(match.group(1))
+                if seconds <= args.urlscan_rate_limit_retry * 60:
+                    writerr(
+                        colored(
+                            "URLScan - [ 429 ] Rate limit reached, so waiting for another "
+                            + str(seconds)
+                            + " seconds before continuing...",
+                            "yellow",
+                        )
+                    )
+                    # Wait can be interrupted by SIGINT via interrupt_event
+                    interrupt_event.clear()
+                    if interrupt_event.wait(seconds + 1):
+                        # Interrupted by SIGINT
+                        return
+                    try:
+                        resp = session.get(
+                            url,
+                            headers={
+                                "User-Agent": userAgent,
+                                "API-Key": URLSCAN_API_KEY,
+                            },
+                        )
+                        requestsMade = requestsMade + 1
+                    except Exception as e:
+                        write(
+                            colored(
+                                "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
+                                "red",
+                            )
+                        )
+                        return
+
+        # If the rate limit was reached or if a 401 (which likely means the API key isn't valid), try without API key
+        if resp.status_code in (401, 429):
+            if URLSCAN_API_KEY != "":
+                try:
+                    if resp.status_code == 429:
+                        writerr(
+                            colored(
+                                "URLScan - [ 429 ] Rate limit reached so trying without API Key...",
+                                "red",
+                            )
+                        )
+                    else:
+                        writerr(
+                            colored(
+                                "URLScan - [ INF ] The API Key is invalid so trying without API Key...",
+                                "red",
+                            )
+                        )
+                    # Set key to blank for further requests
+                    URLSCAN_API_KEY = ""
+                    session_no_key = requests.Session()
+                    session_no_key.mount("https://", HTTP_ADAPTER)
+                    session_no_key.mount("http://", HTTP_ADAPTER)
+                    resp = session_no_key.get(url, headers={"User-Agent": userAgent})
+                except Exception as e:
+                    writerr(
+                        colored(
+                            "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
+                            "red",
+                        )
+                    )
+                    checkResponse = False
+
+                # If the rate limit was reached end now
+                if resp.status_code == 429:
+                    writerr(
+                        colored(
+                            "URLScan - [ 429 ] Rate limit reached without API Key so unable to get links.",
+                            "red",
+                        )
+                    )
+                    checkResponse = False
+            else:
+                writerr(
                     colored(
                         "URLScan - [ 429 ] Rate limit reached so unable to get links.",
                         "red",
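The parsing strategy in getGhostArchiveWARC above - split the WARC blob on newlines, read the WARC and HTTP header lines as text, then slice the body back out of the original bytes so binary payloads survive intact - can be shown in isolation. A simplified sketch against a fabricated two-record WARC (this is not the tool's code, and like the line-based approach in the diff it ignores the Content-Length bookkeeping a full WARC reader would use):

warcBytes = (
    b"WARC/1.0\nWARC-Type: response\nWARC-Target-URI: https://example.com/app.js\n\n"
    b"HTTP/1.1 200 OK\nContent-Type: application/javascript\n\nconsole.log('hi')\n"
    b"WARC/1.0\nWARC-Type: request\nWARC-Target-URI: https://example.com/app.js\n\n"
)

lineBytes = warcBytes.split(b"\n")
lines = [lb.decode("utf-8", errors="replace") for lb in lineBytes]

records, start, uri = [], -1, ""
for i, line in enumerate(lines + ["WARC/1.0"]):  # sentinel flushes the final record
    if line.startswith("WARC/1.0") and start >= 0:
        records.append((uri, b"\n".join(lineBytes[start:i])))  # re-slice the raw bytes
        start = -1
    elif line.startswith("WARC-Target-URI:"):
        uri = line.split(":", 1)[1].strip()
    elif line.startswith("HTTP") and start < 0:  # response records begin with an HTTP status line
        start = i

for uri, responseBytes in records:
    bodyBytes = responseBytes.split(b"\n\n", 1)[1]  # header/body split, as in the diff
    print(uri, bodyBytes)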
@@ -4198,7 +4772,6 @@ def processWayBackPage(url):
                 pass
             return
         else:
-            print("DEBUG: HERE END!") # DEBUG
             pass
     except Exception as e:
         if verbose():
@@ -5380,80 +5953,373 @@ def processIntelxType(target, credits):
         writerr(colored("ERROR processIntelxType 1: " + str(e), "red"))


-def getIntelxAccountInfo() -> str:
-    """
-    Get the account info and return the number of Credits remaining from the /phonebook/search
-    """
-    initIntelxTls()
-    try:
-        resp = chooseIntelxBase(INTELX_API_KEY)
-        if resp is None or resp.status_code != 200:
-            return "Unknown"
-        jsonResp = json.loads(resp.text.strip())
-        credits = str(
-            jsonResp.get("paths", {}).get("/phonebook/search", {}).get("Credit", "Unknown")
-        )
-        credits_max = str(
-            jsonResp.get("paths", {}).get("/phonebook/search", {}).get("CreditMax", "Unknown")
-        )
-        return credits + "/" + credits_max
-    except Exception:
-        return "Unknown"
+def getIntelxAccountInfo() -> str:
+    """
+    Get the account info and return the number of Credits remaining from the /phonebook/search
+    """
+    initIntelxTls()
+    try:
+        resp = chooseIntelxBase(INTELX_API_KEY)
+        if resp is None or resp.status_code != 200:
+            return "Unknown"
+        jsonResp = json.loads(resp.text.strip())
+        credits = str(
+            jsonResp.get("paths", {}).get("/phonebook/search", {}).get("Credit", "Unknown")
+        )
+        credits_max = str(
+            jsonResp.get("paths", {}).get("/phonebook/search", {}).get("CreditMax", "Unknown")
+        )
+        return credits + "/" + credits_max
+    except Exception:
+        return "Unknown"
+
+
+def getIntelxUrls():
+    """
+    Get URLs from the Intelligence X Phonebook search
+    """
+    global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx
+
+    # Write the file of URL's for the passed domain/URL
+    try:
+        if args.check_only:
+            write(
+                colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
+                + colored("minimum 4 requests", "white")
+            )
+            checkIntelx = 4
+            return
+
+        stopSourceIntelx = False
+        linksFoundIntelx = set()
+        initIntelxTls()
+
+        credits = getIntelxAccountInfo()
+        if verbose():
+            write(
+                colored(
+                    "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
+                    + credits
+                    + "): ",
+                    "magenta",
+                )
+                + colored(intelx_tls.INTELX_SEARCH_URL + "\n", "white")
+            )
+
+        if not args.check_only:
+            write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))
+
+        # Get the domains from Intelligence X if the --no-subs wasn't passed
+        if not args.no_subs:
+            processIntelxType(1, credits)
+
+        # Get the URLs from Intelligence X
+        if not intelxAPIIssue:
+            processIntelxType(3, credits)
+
+        linkCountIntelx = len(linksFoundIntelx)
+        write(
+            colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
+            + colored(str(linkCountIntelx), "white")
+        )
+        linksFound.update(linksFoundIntelx)
+        linksFoundIntelx.clear()
+
+    except Exception as e:
+        writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
+
+
+def processGhostArchiveUrl(url, ghostArchiveID=""):
+    """
+    Process a specific URL from ghostarchive.org to determine whether to save the link
+    """
+    global argsInput, argsInputHostname, links_lock, linkCountGhostArchive, linksFoundGhostArchive
+
+    addLink = True
+
+    try:
+        # Strip Wayback Machine prefix if present (e.g., https://web.archive.org/web/20230101120000_/https://example.com)
+        waybackMatch = re.match(r"^https?://web\.archive\.org/[^/]+/[a-zA-Z0-9]+_/", url)
+        if waybackMatch:
+            url = url[waybackMatch.end() :]
+
+        # If the input has a / in it, then a URL was passed, so the link will only be added if the URL matches
+        if "/" in url:
+            if argsInput not in url:
+                addLink = False
+
+        # If filters are required then test them
+        if addLink and not args.filter_responses_only:
+
+            # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
+            if args.no_subs:
+                match = re.search(
+                    r"^[A-za-z]*\:\/\/(www\.)?" + re.escape(argsInputHostname),
+                    url,
+                    flags=re.IGNORECASE,
+                )
+                if match is None:
+                    addLink = False
+
+            # If the user didn't requested -f / --filter-responses-only then check http code
+            if addLink and not args.filter_responses_only:
+
+                # Check the URL exclusions
+                if addLink:
+                    match = re.search(
+                        r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
+                        url,
+                        flags=re.IGNORECASE,
+                    )
+                    if match is not None:
+                        addLink = False
+
+                # Set keywords filter if -ko argument passed
+                if addLink and args.keywords_only:
+                    if args.keywords_only == "#CONFIG":
+                        match = re.search(
+                            r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
+                            url,
+                            flags=re.IGNORECASE,
+                        )
+                    else:
+                        match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
+                    if match is None:
+                        addLink = False
+
+        # Add link if it passed filters
+        if addLink:
+            # Just get the hostname of the url
+            tldExtract = tldextract.extract(url)
+            subDomain = tldExtract.subdomain
+            if subDomain != "":
+                subDomain = subDomain + "."
+            domainOnly = subDomain + tldExtract.domain + "." + tldExtract.suffix
+
+            # GhostArchive might return URLs that aren't for the domain passed so we need to check for those and not process them
+            # Check the URL
+            match = re.search(
+                r"(^|\.)" + re.escape(argsInputHostname) + "$",
+                domainOnly,
+                flags=re.IGNORECASE,
+            )
+            if match is not None:
+                if args.mode in ("U", "B"):
+                    linksFoundAdd(url, linksFoundGhostArchive)
+                # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
+                if ghostArchiveID != "" and args.mode in ("R", "B"):
+                    if args.limit == 0 or len(ghostArchiveRequestLinks) < args.limit:
+                        with links_lock:
+                            ghostArchiveRequestLinks.add(
+                                (url, GHOSTARCHIVE_DOM_URL + ghostArchiveID)
+                            )
+
+    except Exception as e:
+        writerr(colored("ERROR processGhostArchiveUrl 1: " + str(e), "red"))
+
+
def getGhostArchiveUrls():
|
|
6122
|
+
"""
|
|
6123
|
+
Get URLs from GhostArchive (ghostarchive.org)
|
|
6124
|
+
This source doesn't have an API, so we crawl the HTML pages directly.
|
|
6125
|
+
"""
|
|
6126
|
+
global linksFound, path, subs, stopProgram, stopSourceGhostArchive, argsInput, checkGhostArchive, argsInputHostname, linkCountGhostArchive, linksFoundGhostArchive
|
|
6127
|
+
|
|
6128
|
+
try:
|
|
6129
|
+
stopSourceGhostArchive = False
|
|
6130
|
+
linksFoundGhostArchive = set()
|
|
6131
|
+
|
|
6132
|
+
# Build the base URL
|
|
6133
|
+
# If there is only one . in the hostname, we can guarantee that a subdoman wasn't passed, so we can prefix with . to the links quicker as it won't include other domains that end with the target domain,
|
|
6134
|
+
# Else, we need to get all and then confirm the actual host of the links later
|
|
6135
|
+
if argsInputHostname.count(".") == 1:
|
|
6136
|
+
baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", "." + quote(argsInput))
|
|
6137
|
+
else:
|
|
6138
|
+
baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(argsInput))
|
|
6139
|
+
|
|
6140
|
+
if verbose():
|
|
6141
|
+
write(
|
|
6142
|
+
colored("GhostArchive - [ INFO ] The URL requested to get links: ", "magenta")
|
|
6143
|
+
+ colored(baseUrl + "0\n", "white")
|
|
6144
|
+
)
|
|
6145
|
+
|
|
6146
|
+
if not args.check_only and args.mode == "U":
|
|
6147
|
+
write(
|
|
6148
|
+
colored(
|
|
6149
|
+
"GhostArchive - [ INFO ] Getting links from ghostarchive.org (this can take a while for some domains)...",
|
|
6150
|
+
"cyan",
|
|
     6151 +             )
     6152 +         )
     6153 +
     6154 +         # Set up session with cookie
     6155 +         session = requests.Session()
     6156 +         if HTTP_ADAPTER is not None:
     6157 +             session.mount("https://", HTTP_ADAPTER)
     6158 +             session.mount("http://", HTTP_ADAPTER)
     6159 +
     6160 +         userAgent = random.choice(USER_AGENT)
     6161 +         headers = {"User-Agent": userAgent}
     6162 +         cookies = {"theme": "original"}
     6163 +
     6164 +         pageNum = 0
     6165 +
     6166 +         while stopProgram is None and not stopSourceGhostArchive:
     6167 +             getMemory()
     6168 +
     6169 +             url = baseUrl + str(pageNum)
     6170 +
     6171 +             try:
     6172 +                 resp = session.get(url, headers=headers, cookies=cookies, timeout=DEFAULT_TIMEOUT)
     6173 +             except Exception as e:
     6174 +                 writerr(
     6175 +                     colored(
     6176 +                         "GhostArchive - [ ERR ] Unable to get page " + str(pageNum) + ": " + str(e),
     6177 +                         "red",
     6178 +                     )
     6179 +                 )
     6180 +                 break
     6181 +
     6182 +             if resp.status_code == 429:
     6183 +                 writerr(
     6184 +                     colored(
     6185 +                         "GhostArchive - [ 429 ] Rate limit reached at page " + str(pageNum) + ".",
     6186 +                         "red",
     6187 +                     )
     6188 +                 )
     6189 +                 break
     6190 +
     6191 +             # Check for maintenance/end of results indicator
     6192 +             if (
     6193 +                 resp.status_code == 503
     6194 +                 or "The site is under maintenance and will be back soon" in resp.text
     6195 +                 or "No archives for that site" in resp.text
     6196 +             ):
     6197 +                 if verbose():
     6198 +                     if pageNum == 0:
     6199 +                         if args.check_only:
     6200 +                             checkGhostArchive = 1
     6201 +                             write(
     6202 +                                 colored(
     6203 +                                     "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
     6204 +                                 )
     6205 +                                 + colored("1 request", "white")
     6206 +                             )
     6207 +                         else:
     6208 +                             write(
     6209 +                                 colored(
     6210 +                                     "GhostArchive - [ INFO ] No results found",
     6211 +                                     "cyan",
     6212 +                                 )
     6213 +                             )
     6214 +                     else:
     6215 +                         write(
     6216 +                             colored(
     6217 +                                 "GhostArchive - [ INFO ] Retrieved all results from "
     6218 +                                 + str(pageNum)
     6219 +                                 + " pages",
     6220 +                                 "cyan",
     6221 +                             )
     6222 +                         )
     6223 +                 break
     6224 +             if resp.status_code != 200:
     6225 +                 writerr(
     6226 +                     colored(
     6227 +                         "GhostArchive - [ ERR ] [ "
     6228 +                         + str(resp.status_code)
     6229 +                         + " ] at page "
     6230 +                         + str(pageNum),
     6231 +                         "red",
     6232 +                     )
     6233 +                 )
     6234 +                 break
     6235 +
     6236 +             # Check only mode - just count pages
     6237 +             if args.check_only:
     6238 +                 # For check only, we check if there are results and try to get total count
     6239 +                 if pageNum == 0:
     6240 +                     # Check if there are any results on the first page
     6241 +                     if '<a href="/archive/' in resp.text:
     6242 +                         # Try to find "out of X" to determine total results/pages
     6243 +                         outOfMatch = re.search(r"out of (\d+)", resp.text)
     6244 +                         if outOfMatch:
     6245 +                             totalResults = int(outOfMatch.group(1))
     6246 +                             checkGhostArchive = totalResults
     6247 +                             write(
     6248 +                                 colored(
     6249 +                                     "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
     6250 +                                 )
     6251 +                                 + colored(f"{totalResults} requests (pagination required)", "white")
     6252 +                             )
     6253 +                         else:
     6254 +                             checkGhostArchive = 1
     6255 +                             write(
     6256 +                                 colored(
     6257 +                                     "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
     6258 +                                 )
     6259 +                                 + colored("unknown requests (pagination required)", "white")
     6260 +                             )
     6261 +                     else:
     6262 +                         checkGhostArchive = 1
     6263 +                         write(
     6264 +                             colored("GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan")
     6265 +                             + colored("1 request (no results)", "white")
     6266 +                         )
     6267 +                 break
     6268 +
     6269 +             # Use regex to extract URLs from anchor tag text content
     6270 +             # Pattern matches: <a href="/archive/ID">URL_HERE</a> - captures both href path and URL
     6271 +             pattern = r'<a href="(/archive/[^"]*)">([^<]+)</a>'
     6272 +             matches = re.findall(pattern, resp.text)
5402 6273
     6274 +             # If no matches found, we've reached the end of results
     6275 +             if not matches:
     6276 +                 if verbose():
     6277 +                     write(
     6278 +                         colored(
     6279 +                             "GhostArchive - [ INFO ] Retrieved all results from "
     6280 +                             + str(pageNum + 1)
     6281 +                             + " pages",
     6282 +                             "cyan",
     6283 +                         )
     6284 +                     )
     6285 +                 break
5403 6286
5404      -
5405      -
5406      -
5407      -
5408      -     global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx
     6287 +             for match in matches:
     6288 +                 ghostArchiveId = match[0]  # e.g., "/archive/gkOOR"
     6289 +                 potentialUrl = match[1].strip()
     6290 +                 processGhostArchiveUrl(potentialUrl, ghostArchiveId)
5409 6291
5410      -
5411      -
5412      -
5413      -
5414      -
5415      -
5416      -
5417      -
5418      -
     6292 +             # Check if there's a "Next Page" link - if not, we've reached the last page
     6293 +             # GhostArchive resets to Page 1 when exceeding actual pages, so checking for Next Page is essential
     6294 +             if "Next Page" not in resp.text and ">»</a>" not in resp.text:
     6295 +                 if verbose():
     6296 +                     write(
     6297 +                         colored(
     6298 +                             "GhostArchive - [ INFO ] Retrieved all results from "
     6299 +                             + str(pageNum + 1)
     6300 +                             + " pages",
     6301 +                             "cyan",
     6302 +                         )
     6303 +                     )
     6304 +                 break
5419 6305
5420      -
5421      -         linksFoundIntelx = set()
5422      -         initIntelxTls()
     6306 +             pageNum += 1
5423 6307
5424      -
5425      -
     6308 +         if not args.check_only:
     6309 +             # Count links based on mode - in R mode, count response links; in U/B mode, count URL links
     6310 +             if args.mode == "R":
     6311 +                 linkCountGhostArchive = len(ghostArchiveRequestLinks)
     6312 +             else:
     6313 +                 linkCountGhostArchive = len(linksFoundGhostArchive)
5426 6314               write(
5427      -                 colored(
5428      -
5429      -                     + credits
5430      -                     + "): ",
5431      -                     "magenta",
5432      -                 )
5433      -                 + colored(intelx_tls.INTELX_SEARCH_URL + "\n", "white")
     6315 +                 colored("GhostArchive - [ INFO ] Links found on ghostarchive.org: ", "cyan")
     6316 +                 + colored(str(linkCountGhostArchive), "white")
5434 6317               )
5435      -
5436      -
5437      -         write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))
5438      -
5439      -         # Get the domains from Intelligence X if the --no-subs wasn't passed
5440      -         if not args.no_subs:
5441      -             processIntelxType(1, credits)
5442      -
5443      -         # Get the URLs from Intelligence X
5444      -         if not intelxAPIIssue:
5445      -             processIntelxType(3, credits)
5446      -
5447      -         linkCountIntelx = len(linksFoundIntelx)
5448      -         write(
5449      -             colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
5450      -             + colored(str(linkCountIntelx), "white")
5451      -         )
5452      -         linksFound.update(linksFoundIntelx)
5453      -         linksFoundIntelx.clear()
     6318 +             linksFound.update(linksFoundGhostArchive)
     6319 +             linksFoundGhostArchive.clear()
5454 6320
5455 6321       except Exception as e:
5456      -         writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
     6322 +         writerr(colored("ERROR getGhostArchiveUrls 1: " + str(e), "red"))
5457 6323
5458 6324
5459 6325   def processResponses():
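A note on what this new source is doing: unlike the Wayback and CommonCrawl CDX endpoints, GhostArchive exposes no JSON API, so getGhostArchiveUrls scrapes the HTML search pages directly. Stripped of the waymore plumbing (write/writerr colour output, stop flags, getMemory checks), the loop reduces to roughly the sketch below; fetch_ghostarchive_search is an illustrative name, not a function in the package, and the User-Agent value stands in for the random.choice(USER_AGENT) selection:

    import re
    import requests

    ARCHIVE_LINK = re.compile(r'<a href="(/archive/[^"]*)">([^<]+)</a>')

    def fetch_ghostarchive_search(domain, timeout=30):
        """Return (url, archive_path) pairs scraped from ghostarchive.org search pages."""
        session = requests.Session()
        results = []
        page = 0
        while True:
            resp = session.get(
                f"https://ghostarchive.org/search?term={domain}&page={page}",
                headers={"User-Agent": "Mozilla/5.0"},
                cookies={"theme": "original"},  # pin the theme so the markup stays predictable
                timeout=timeout,
            )
            if resp.status_code != 200 or "No archives for that site" in resp.text:
                break
            matches = ARCHIVE_LINK.findall(resp.text)
            if not matches:
                break  # an empty page means we ran past the last one
            results.extend((url.strip(), path) for path, url in matches)
            # GhostArchive wraps back to page 1 past the last page, so the
            # "Next Page" marker is the only reliable terminator.
            if "Next Page" not in resp.text and ">»</a>" not in resp.text:
                break
            page += 1
        return results

The trailing "Next Page" check is load-bearing rather than cosmetic: as the comment at line 6293 notes, GhostArchive wraps back to page 1 when asked for a page beyond the last one, so without it the loop would cycle forever on multi-page result sets.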
@@ -5463,6 +6329,10 @@ def processResponses():
5463 6329       global stopProgram, totalFileCount
5464 6330       try:
5465 6331
     6332 +         # Get responses from GhostArchive unless excluded
     6333 +         if stopProgram is None and not args.xga:
     6334 +             processResponsesGhostArchive()
     6335 +
5466 6336           # Get responses from URLScan unless excluded
5467 6337           if stopProgram is None and not args.xus:
5468 6338               processResponsesURLScan()
@@ -5484,6 +6354,235 @@ def processResponses():
5484 6354           writerr(colored(getSPACER("ERROR processResponses 1: " + str(e)), "red"))
5485 6355
5486 6356
     6357 + def processResponsesGhostArchive():
     6358 +     """
     6359 +     Get archived responses from GhostArchive (ghostarchive.org)
     6360 +     """
     6361 +     global subs, path, indexFile, totalResponses, stopProgram, argsInput, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, ghostArchiveRequestLinks, failureCount, totalFileCount, checkGhostArchive
     6362 +     try:
     6363 +         fileCount = 0
     6364 +         failureCount = 0
     6365 +         if not args.check_only:
     6366 +             # Create 'results' and domain directory if needed
     6367 +             createDirs()
     6368 +
     6369 +         # Get the path of the files, depending on whether -oR / --output_responses was passed
     6370 +         try:
     6371 +             responsesPath = responseOutputDirectory + "responses.GhostArchive.tmp"
     6372 +             indexPath = responseOutputDirectory + "waymore_index.txt"
     6373 +         except Exception as e:
     6374 +             if verbose():
     6375 +                 writerr(colored("ERROR processResponsesGhostArchive 4: " + str(e), "red"))
     6376 +
     6377 +         # Get URLs from GhostArchive if the DOM IDs haven't been retrieved yet
     6378 +         if stopProgram is None and not args.check_only:
     6379 +             if args.mode in ("R", "B"):
     6380 +                 write(
     6381 +                     colored(
     6382 +                         "GhostArchive - [ INFO ] Getting list of response links (this can take a while for some domains)...",
     6383 +                         "cyan",
     6384 +                     )
     6385 +                 )
     6386 +             if args.mode == "R":
     6387 +                 getGhostArchiveUrls()
     6388 +
     6389 +         # Check if a responses.GhostArchive.tmp file exists
     6390 +         if not args.check_only and os.path.exists(responsesPath):
     6391 +
     6392 +             # Load the links into the set
     6393 +             with open(responsesPath, "rb") as fl:
     6394 +                 linkRequests = pickle.load(fl)
     6395 +
     6396 +         # Set start point
     6397 +         successCount = 0
     6398 +
     6399 +         # Get the GhostArchive DOM links
     6400 +         linkRequests = []
     6401 +         for originalUrl, domUrl in ghostArchiveRequestLinks:
     6402 +             linkRequests.append((originalUrl, domUrl))
     6403 +
     6404 +         # Write the links to a temp file
     6405 +         if not args.check_only:
     6406 +             with open(responsesPath, "wb") as f:
     6407 +                 pickle.dump(linkRequests, f)
     6408 +
     6409 +         # Get the total number of responses we will try to get and set the current file count to the success count
     6410 +         totalResponses = len(linkRequests)
     6411 +         checkGhostArchive = checkGhostArchive + totalResponses
     6412 +
     6413 +         # If there are no responses to download, display an error and exit
     6414 +         if args.mode != "R" and totalResponses == 0:
     6415 +             writerr(
     6416 +                 colored(
     6417 +                     getSPACER(
     6418 +                         "Failed to get responses from GhostArchive (ghostarchive.org) - check input and try again."
     6419 +                     ),
     6420 +                     "red",
     6421 +                 )
     6422 +             )
     6423 +             return
     6424 +
     6425 +         fileCount = successCount
     6426 +
     6427 +         if args.check_only:
     6428 +             writerr(
     6429 +                 colored("Downloading archived responses: ", "cyan")
     6430 +                 + colored("UNKNOWN requests", "cyan")
     6431 +             )
     6432 +             writerr(
     6433 +                 colored(
     6434 +                     "\n-> Downloading the responses can vary depending on the target and the rate limiting on GhostArchive",
     6435 +                     "green",
     6436 +                 )
     6437 +             )
     6438 +             write("")
     6439 +         else:
     6440 +             # If the limit has been set over the default, give a warning that this could take a long time!
     6441 +             if totalResponses - successCount > DEFAULT_LIMIT:
     6442 +                 if successCount > 0:
     6443 +                     writerr(
     6444 +                         colored(
     6445 +                             getSPACER(
     6446 +                                 "WARNING: Downloading remaining "
     6447 +                                 + str(totalResponses - successCount)
     6448 +                                 + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
     6449 +                             ),
     6450 +                             "yellow",
     6451 +                         )
     6452 +                     )
     6453 +                 else:
     6454 +                     writerr(
     6455 +                         colored(
     6456 +                             getSPACER(
     6457 +                                 "WARNING: Downloading "
     6458 +                                 + str(totalResponses)
     6459 +                                 + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
     6460 +                             ),
     6461 +                             "yellow",
     6462 +                         )
     6463 +                     )
     6464 +
     6465 +         # Open the index file if hash value is going to be used (not URL)
     6466 +         if not args.url_filename:
     6467 +             indexFile = open(indexPath, "a")
     6468 +
     6469 +         # Process the URLs from GhostArchive
     6470 +         if stopProgram is None:
     6471 +             p = mp.Pool(
     6472 +                 args.processes * 2
     6473 +             )  # Double the number of processes to speed up the download
     6474 +             p.starmap(getGhostArchiveWARC, linkRequests[successCount:])
     6475 +             p.close()
     6476 +             p.join()
     6477 +
     6478 +         # Delete the tmp file now that it has run successfully
     6479 +         if stopProgram is None:
     6480 +             try:
     6481 +                 os.remove(responsesPath)
     6482 +             except Exception:
     6483 +                 pass
     6484 +
     6485 +         # Close the index file if hash value is going to be used (not URL)
     6486 +         if not args.url_filename:
     6487 +             indexFile.close()
     6488 +
     6489 +         if not args.check_only:
     6490 +             try:
     6491 +                 if failureCount > 0:
     6492 +                     if verbose():
     6493 +                         write(
     6494 +                             colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
     6495 +                             + colored(responseOutputDirectory, "white")
     6496 +                             + colored(" for " + subs + argsInput + ": ", "cyan")
     6497 +                             + colored(
     6498 +                                 str(fileCount) + " 🤘",
     6499 +                                 "white",
     6500 +                             )
     6501 +                             + colored(" (" + str(failureCount) + " not found)\n", "red")
     6502 +                         )
     6503 +                     else:
     6504 +                         write(
     6505 +                             colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
     6506 +                             + colored(responseOutputDirectory, "white")
     6507 +                             + colored(" for " + subs + argsInput + ": ", "cyan")
     6508 +                             + colored(str(fileCount) + " 🤘", "white")
     6509 +                             + colored(" (" + str(failureCount) + " not found)\n", "red")
     6510 +                         )
     6511 +                 else:
     6512 +                     if verbose():
     6513 +                         write(
     6514 +                             colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
     6515 +                             + colored(responseOutputDirectory, "white")
     6516 +                             + colored(" for " + subs + argsInput + ": ", "cyan")
     6517 +                             + colored(str(fileCount) + " 🤘\n", "white")
     6518 +                         )
     6519 +                     else:
     6520 +                         write(
     6521 +                             colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
     6522 +                             + colored(responseOutputDirectory, "white")
     6523 +                             + colored(" for " + subs + argsInput + ": ", "cyan")
     6524 +                             + colored(str(fileCount) + " 🤘\n", "white")
     6525 +                         )
     6526 +             except Exception as e:
     6527 +                 if verbose():
     6528 +                     writerr(colored("ERROR processResponsesGhostArchive 5: " + str(e), "red"))
     6529 +
     6530 +         # Append extra links from WARC files to URL output file (for mode B)
     6531 +         try:
     6532 +             if args.mode == "B" and len(extraWarcLinks) > 0:
     6533 +                 # Determine URL output file path (same logic as processURLOutput)
     6534 +                 if args.output_urls == "":
     6535 +                     if args.output_responses != "":
     6536 +                         urlFilePath = args.output_responses + "/waymore.txt"
     6537 +                     else:
     6538 +                         urlFilePath = (
     6539 +                             str(DEFAULT_OUTPUT_DIR)
     6540 +                             + "/results/"
     6541 +                             + str(argsInput).replace("/", "-")
     6542 +                             + "/waymore.txt"
     6543 +                         )
     6544 +                 else:
     6545 +                     urlFilePath = args.output_urls
     6546 +
     6547 +                 # Load existing URLs from file to avoid duplicates
     6548 +                 existingUrls = set()
     6549 +                 try:
     6550 +                     with open(urlFilePath) as f:
     6551 +                         for line in f:
     6552 +                             existingUrls.add(line.strip())
     6553 +                 except Exception:
     6554 +                     pass
     6555 +
     6556 +                 # Append only new unique URLs
     6557 +                 newLinks = [
     6558 +                     url
     6559 +                     for url in extraWarcLinks
     6560 +                     if url not in existingUrls and url not in linksFound
     6561 +                 ]
     6562 +                 if len(newLinks) > 0:
     6563 +                     with open(urlFilePath, "a") as f:
     6564 +                         for url in newLinks:
     6565 +                             f.write(url + "\n")
     6566 +
     6567 +                     # Display message about extra links
     6568 +                     write(
     6569 +                         colored("GhostArchive - [ INFO ] ", "cyan")
     6570 +                         + colored(str(len(newLinks)), "white")
     6571 +                         + colored(" extra links found in WARC files added to file ", "cyan")
     6572 +                         + colored(urlFilePath, "white")
     6573 +                         + "\n"
     6574 +                     )
     6575 +         except Exception as e:
     6576 +             if verbose():
     6577 +                 writerr(colored("ERROR processResponsesGhostArchive 6: " + str(e), "red"))
     6578 +
     6579 +         totalFileCount = totalFileCount + fileCount
     6580 +     except Exception as e:
     6581 +         writerr(colored(getSPACER("ERROR processResponsesGhostArchive 1: " + str(e)), "red"))
     6582 +     finally:
     6583 +         linkRequests = None
     6584 +
     6585 +
5487 6586   def processResponsesURLScan():
5488 6587       """
5489 6588       Get archived responses from URLScan (urlscan.io)
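processResponsesGhostArchive mirrors the resume-and-download pattern waymore already uses for URLScan: the pending (originalUrl, domUrl) pairs are pickled to responses.GhostArchive.tmp so an interrupted run can continue from successCount, and the downloads themselves are fanned out across a multiprocessing pool sized at twice args.processes. A condensed sketch of that pattern, with download_batch as an illustrative wrapper around a worker like getGhostArchiveWARC (names and file path here are stand-ins, not the package's API):

    import multiprocessing as mp
    import os
    import pickle

    def download_batch(link_requests, worker, tmp_path="responses.GhostArchive.tmp",
                       processes=4, already_done=0):
        # Persist the work list first so a re-run can skip what already succeeded.
        with open(tmp_path, "wb") as f:
            pickle.dump(link_requests, f)

        # The diff doubles the pool size relative to args.processes to speed up downloads.
        with mp.Pool(processes * 2) as pool:
            pool.starmap(worker, link_requests[already_done:])

        # Only reached when the whole batch completed; the resume file is then stale.
        os.remove(tmp_path)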
@@ -6699,6 +7798,12 @@ async def fetch_intelx_async():
6699 7798       await loop.run_in_executor(None, getIntelxUrls)
6700 7799
6701 7800
     7801 + async def fetch_ghostarchive_async():
     7802 +     """Async wrapper for getGhostArchiveUrls - runs in thread pool"""
     7803 +     loop = asyncio.get_event_loop()
     7804 +     await loop.run_in_executor(None, getGhostArchiveUrls)
     7805 +
     7806 +
6702 7807   async def fetch_all_sources_async():
6703 7808       """
6704 7809       Orchestrator function to fetch from all enabled sources concurrently.
@@ -6721,6 +7826,8 @@ async def fetch_all_sources_async():
6721 7826           tasks.append(("VirusTotal", fetch_virustotal_async()))
6722 7827       if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
6723 7828           tasks.append(("Intelligence X", fetch_intelx_async()))
     7829 +     if not args.xga and stopProgram is None:
     7830 +         tasks.append(("GhostArchive", fetch_ghostarchive_async()))
6724 7831
6725 7832       if not tasks:
6726 7833           return
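The two hunks above are what let GhostArchive run concurrently with the other sources: the scraper itself is plain blocking requests code, so fetch_ghostarchive_async pushes it onto the default thread pool with run_in_executor, and fetch_all_sources_async gathers it alongside the rest. Reduced to its essentials (fetcher names here are placeholders, not the package's functions), the pattern looks like this:

    import asyncio

    def get_source_urls():
        ...  # blocking HTTP work, as in getGhostArchiveUrls above

    async def run_sources(blocking_fetchers):
        loop = asyncio.get_event_loop()
        # Each blocking fetcher runs on the default thread pool; gather awaits them all.
        futures = [loop.run_in_executor(None, fn) for fn in blocking_fetchers]
        await asyncio.gather(*futures)

    # asyncio.run(run_sources([get_source_urls]))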
@@ -6746,7 +7853,7 @@ async def fetch_all_sources_async():
6746 7853
6747 7854   # Run waymore
6748 7855   def main():
6749      -     global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
     7856 +     global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, extraWarcLinks
6750 7857
6751 7858       # Tell Python to run the handler() function when SIGINT is received
6752 7859       signal(SIGINT, handler)
@@ -6902,13 +8009,19 @@ def main():
6902 8009           help="Exclude checks for links from intelx.io",
6903 8010           default=False,
6904 8011       )
     8012 +     parser.add_argument(
     8013 +         "-xga",
     8014 +         action="store_true",
     8015 +         help="Exclude checks for links from ghostarchive.org",
     8016 +         default=False,
     8017 +     )
6905 8018       parser.add_argument(
6906 8019           "--providers",
6907 8020           action="store",
6908      -         help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal and intelx. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
     8021 +         help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal,intelx and ghostarchive. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
6909 8022           default=[],
6910 8023           type=validateArgProviders,
6911      -         metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx}",
     8024 +         metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive}",
6912 8025       )
6913 8026       parser.add_argument(
6914 8027           "-lcc",
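In practice the new flag behaves like the other -x* exclusions, while --providers inverts the logic (anything not listed is excluded). Assuming waymore's usual -i input argument, "waymore -i example.com -xga" queries every source except ghostarchive.org, and "waymore -i example.com --providers ghostarchive" queries it exclusively, overriding any -x* flags also passed.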
@@ -7075,6 +8188,10 @@ def main():
7075 8188               args.xix = True
7076 8189           else:
7077 8190               args.xix = False
     8191 +         if "ghostarchive" not in args.providers:
     8192 +             args.xga = True
     8193 +         else:
     8194 +             args.xga = False
7078 8195
7079 8196       # If no input was given, raise an error
7080 8197       if sys.stdin.isatty():
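The hunk above repeats the same two-branch assignment once per source, and the new ghostarchive block follows suit. The effect is easier to see collapsed into a table-driven form. In this sketch the xus, xix and xga flag names come from this diff; the mapping for the other sources is an assumption based on the --providers help text and is illustrative only:

    # Illustrative only - the package spells each branch out explicitly.
    PROVIDER_EXCLUDE_FLAGS = {
        "wayback": "xwm",
        "commoncrawl": "xcc",
        "otx": "xav",          # assumed flag name
        "urlscan": "xus",
        "virustotal": "xvt",   # assumed flag name
        "intelx": "xix",
        "ghostarchive": "xga",
    }

    def apply_providers(args):
        # With --providers, a source is excluded exactly when it is not listed.
        for provider, flag in PROVIDER_EXCLUDE_FLAGS.items():
            setattr(args, flag, provider not in args.providers)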
@@ -7145,6 +8262,7 @@ def main():
7145 8262           # Reset global variables
7146 8263           linksFound = set()
7147 8264           linkMimes = set()
     8265 +         extraWarcLinks = set()
7148 8266           successCount = 0
7149 8267           failureCount = 0
7150 8268           fileCount = 0
@@ -7159,6 +8277,7 @@ def main():
7159 8277           stopSourceURLScan = False
7160 8278           stopSourceVirusTotal = False
7161 8279           stopSourceIntelx = False
     8280 +         stopSourceGhostArchive = False
7162 8281
7163 8282           # Get the config settings from the config.yml file
7164 8283           getConfig()