waymore 7.7__py3-none-any.whl → 8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -1
- waymore/waymore.py +1102 -23
- {waymore-7.7.dist-info → waymore-8.0.dist-info}/METADATA +5 -2
- waymore-8.0.dist-info/RECORD +8 -0
- {waymore-7.7.dist-info → waymore-8.0.dist-info}/WHEEL +1 -1
- waymore-7.7.dist-info/RECORD +0 -8
- {waymore-7.7.dist-info → waymore-8.0.dist-info}/entry_points.txt +0 -0
- {waymore-7.7.dist-info → waymore-8.0.dist-info}/licenses/LICENSE +0 -0
- {waymore-7.7.dist-info → waymore-8.0.dist-info}/top_level.txt +0 -0
waymore/waymore.py
CHANGED
|
@@ -70,6 +70,7 @@ stopSourceAlienVault = False
|
|
|
70
70
|
stopSourceURLScan = False
|
|
71
71
|
stopSourceVirusTotal = False
|
|
72
72
|
stopSourceIntelx = False
|
|
73
|
+
stopSourceGhostArchive = False
|
|
73
74
|
successCount = 0
|
|
74
75
|
failureCount = 0
|
|
75
76
|
fileCount = 0
|
|
@@ -79,6 +80,7 @@ totalPages = 0
|
|
|
79
80
|
indexFile = None
|
|
80
81
|
continueRespFile = None
|
|
81
82
|
continueRespFileURLScan = None
|
|
83
|
+
continueRespFileGhostArchive = None
|
|
82
84
|
inputIsDomainANDPath = False
|
|
83
85
|
inputIsSubDomain = False
|
|
84
86
|
subs = "*."
|
|
@@ -102,6 +104,7 @@ checkAlienVault = 0
|
|
|
102
104
|
checkURLScan = 0
|
|
103
105
|
checkVirusTotal = 0
|
|
104
106
|
checkIntelx = 0
|
|
107
|
+
checkGhostArchive = 0
|
|
105
108
|
argsInputHostname = ""
|
|
106
109
|
responseOutputDirectory = ""
|
|
107
110
|
urlscanRequestLinks = set()
|
|
@@ -112,11 +115,14 @@ linkCountAlienVault = 0
|
|
|
112
115
|
linkCountURLScan = 0
|
|
113
116
|
linkCountVirusTotal = 0
|
|
114
117
|
linkCountIntelx = 0
|
|
118
|
+
linkCountGhostArchive = 0
|
|
115
119
|
linksFoundCommonCrawl = set()
|
|
116
120
|
linksFoundAlienVault = set()
|
|
117
121
|
linksFoundURLScan = set()
|
|
118
122
|
linksFoundVirusTotal = set()
|
|
119
123
|
linksFoundIntelx = set()
|
|
124
|
+
linksFoundGhostArchive = set()
|
|
125
|
+
ghostArchiveRequestLinks = set()
|
|
120
126
|
|
|
121
127
|
# Thread lock for protecting shared state during concurrent operations
|
|
122
128
|
links_lock = threading.Lock()
|
|
@@ -124,6 +130,7 @@ links_lock = threading.Lock()
|
|
|
124
130
|
# Shared state for link collection across all sources
|
|
125
131
|
linksFound = set()
|
|
126
132
|
linkMimes = set()
|
|
133
|
+
extraWarcLinks = set() # Track extra URLs found in WARC files for mode B
|
|
127
134
|
|
|
128
135
|
# Source Provider URLs
|
|
129
136
|
WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
|
|
@@ -134,6 +141,8 @@ URLSCAN_DOM_URL = "https://urlscan.io/dom/"
|
|
|
134
141
|
VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
|
|
135
142
|
# Paid endpoint first, free endpoint as fallback
|
|
136
143
|
INTELX_BASES = ["https://2.intelx.io", "https://free.intelx.io"]
|
|
144
|
+
GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="
|
|
145
|
+
GHOSTARCHIVE_DOM_URL = "https://ghostarchive.org"
|
|
137
146
|
|
|
138
147
|
intelx_tls = threading.local()
|
|
139
148
|
|
|
@@ -247,10 +256,10 @@ DEFAULT_LIMIT = 5000
|
|
|
247
256
|
DEFAULT_TIMEOUT = 30
|
|
248
257
|
|
|
249
258
|
# Exclusions used to exclude responses we will try to get from web.archive.org
|
|
250
|
-
DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx"
|
|
259
|
+
DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx,.avif"
|
|
251
260
|
|
|
252
261
|
# MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
|
|
253
|
-
DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/
|
|
262
|
+
DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2,image/avif"
|
|
254
263
|
|
|
255
264
|
# Response code exclusions we will use to filter links and responses from web.archive.org through their API
|
|
256
265
|
DEFAULT_FILTER_CODE = "404,301,302"
|
|
@@ -743,7 +752,7 @@ def handler(signal_received, frame):
|
|
|
743
752
|
This function is called if Ctrl-C is called by the user
|
|
744
753
|
An attempt will be made to try and clean up properly
|
|
745
754
|
"""
|
|
746
|
-
global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
|
|
755
|
+
global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, current_response, current_session
|
|
747
756
|
|
|
748
757
|
if stopProgram is not None:
|
|
749
758
|
stopProgramCount = stopProgramCount + 1
|
|
@@ -778,6 +787,7 @@ def handler(signal_received, frame):
|
|
|
778
787
|
stopSourceURLScan = True
|
|
779
788
|
stopSourceVirusTotal = True
|
|
780
789
|
stopSourceIntelx = True
|
|
790
|
+
stopSourceGhostArchive = True
|
|
781
791
|
# Try to close any active response or session to interrupt blocking network I/O
|
|
782
792
|
try:
|
|
783
793
|
if current_response is not None:
|
|
@@ -1753,11 +1763,15 @@ def printProgressBar(
|
|
|
1753
1763
|
|
|
1754
1764
|
def filehash(text):
    """
    Generate a hash value for the passed string or bytes. This is used for the file name of a downloaded archived response

    Iterating a str yields characters (converted with ord()), while iterating bytes yields ints
    that are used directly, so a str and its byte-for-byte equivalent produce the same value.
    The result is returned as a decimal string; an empty input gives "0".
    """
    # Named hashValue (not "hash") so the builtin hash() is not shadowed
    hashValue = 0
    for ch in text:
        # Handle both str (gives chars needing ord()) and bytes (gives ints directly)
        code = ch if isinstance(ch, int) else ord(ch)
        # Multiplication binds tighter than XOR: (hashValue * 281) ^ (code * 997), kept to 44 bits by the mask
        hashValue = (hashValue * 281 ^ code * 997) & 0xFFFFFFFFFFF
    return str(hashValue)
|
|
1762
1776
|
|
|
1763
1777
|
|
|
@@ -1945,7 +1959,7 @@ def processArchiveUrl(url):
|
|
|
1945
1959
|
try:
|
|
1946
1960
|
try:
|
|
1947
1961
|
try:
|
|
1948
|
-
if os.environ.get("USER") == "xnl":
|
|
1962
|
+
if verbose() and os.environ.get("USER") == "xnl":
|
|
1949
1963
|
writerr(
|
|
1950
1964
|
colored(
|
|
1951
1965
|
"[ DBG ] Requesting file " + archiveUrl,
|
|
@@ -2265,7 +2279,7 @@ def processArchiveUrl(url):
|
|
|
2265
2279
|
debugText = "INTERNET ARCHIVE"
|
|
2266
2280
|
elif archiveHtml.lower().find("wombat") > 0:
|
|
2267
2281
|
debugText = "WOMBAT (JS)"
|
|
2268
|
-
if debugText != "":
|
|
2282
|
+
if verbose() and debugText != "":
|
|
2269
2283
|
writerr(
|
|
2270
2284
|
colored(
|
|
2271
2285
|
getSPACER(
|
|
@@ -2280,16 +2294,17 @@ def processArchiveUrl(url):
|
|
|
2280
2294
|
)
|
|
2281
2295
|
)
|
|
2282
2296
|
except Exception as e:
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
|
|
2286
|
-
|
|
2287
|
-
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
2297
|
+
if verbose():
|
|
2298
|
+
writerr(
|
|
2299
|
+
colored(
|
|
2300
|
+
'[ DBG ] Error - Failed to output debug info for "'
|
|
2301
|
+
+ archiveUrl
|
|
2302
|
+
+ '": '
|
|
2303
|
+
+ str(e),
|
|
2304
|
+
"red",
|
|
2305
|
+
attrs=["dark"],
|
|
2306
|
+
)
|
|
2291
2307
|
)
|
|
2292
|
-
)
|
|
2293
2308
|
pass
|
|
2294
2309
|
|
|
2295
2310
|
successCount = successCount + 1
|
|
@@ -2760,17 +2775,20 @@ def validateArgProviders(x):
|
|
|
2760
2775
|
- urlscan
|
|
2761
2776
|
- virustotal
|
|
2762
2777
|
- intelx
|
|
2778
|
+
- ghostarchive
|
|
2763
2779
|
"""
|
|
2764
2780
|
invalid = False
|
|
2765
2781
|
x = x.lower()
|
|
2766
2782
|
providers = x.split(",")
|
|
2767
2783
|
for provider in providers:
|
|
2768
|
-
if not re.fullmatch(
|
|
2784
|
+
if not re.fullmatch(
|
|
2785
|
+
r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider
|
|
2786
|
+
):
|
|
2769
2787
|
invalid = True
|
|
2770
2788
|
break
|
|
2771
2789
|
if invalid:
|
|
2772
2790
|
raise argparse.ArgumentTypeError(
|
|
2773
|
-
"Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx"
|
|
2791
|
+
"Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
|
|
2774
2792
|
)
|
|
2775
2793
|
return x
|
|
2776
2794
|
|
|
@@ -3528,6 +3546,522 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
3528
3546
|
writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))
|
|
3529
3547
|
|
|
3530
3548
|
|
|
3549
|
+
def _parseGhostArchiveWarc(warcBytes):
    """
    Split the raw bytes of a WARC file into the HTTP responses that pass the host, URL, status code and MIME filters.

    Returns a list of (targetUri, contentType, responseBytes, httpStatusCode) tuples, where
    responseBytes runs from the HTTP status line up to the start of the next WARC record (or the end of the file).
    Parsing stops early if stopProgram or stopSourceGhostArchive gets set.
    """
    # WARC header lines are text, but response bodies may be binary, so the
    # bytes are kept for body extraction and a decoded copy is used for matching
    lineBytes = warcBytes.split(b"\n")
    lines = [lb.decode("utf-8", errors="replace") for lb in lineBytes]

    # The filter lists do not change while parsing, so build them once instead of once per line
    filterUrls = [u.strip().lower() for u in FILTER_URL.split(",")] if FILTER_URL else []
    filterCodes = {c.strip() for c in FILTER_CODE.split(",")} if FILTER_CODE else set()
    filterMimes = {m.strip().lower() for m in FILTER_MIME.split(",")} if FILTER_MIME else set()

    responsesFound = []
    currentTargetUri = ""
    contentType = ""
    httpStatusCode = ""
    inResponse = False  # True once a response record with an acceptable Target-URI is being read
    pendingResponseType = False  # Saw WARC-Type: response and waiting for the Target-URI
    inHttpHeaders = False  # True between the HTTP status line and the blank line ending the HTTP headers
    responseStartIdx = -1  # Index of the HTTP status line of the response being captured

    for i, line in enumerate(lines):
        if stopProgram is not None or stopSourceGhostArchive:
            break

        # When we see a new WARC record start, save what was being collected and reset the state
        if line.startswith("WARC/1.0"):
            if inResponse and responseStartIdx >= 0:
                responsesFound.append(
                    (
                        currentTargetUri,
                        contentType,
                        b"\n".join(lineBytes[responseStartIdx:i]),
                        httpStatusCode,
                    )
                )
            inResponse = False
            inHttpHeaders = False
            responseStartIdx = -1
            contentType = ""
            httpStatusCode = ""
            pendingResponseType = False

        # Look for WARC-Type: response - mark that we're in a response record header
        elif line.startswith("WARC-Type: response"):
            pendingResponseType = True
            inResponse = False  # Don't start capturing body yet
            inHttpHeaders = False
            responseStartIdx = -1
            contentType = ""

        # Look for WARC-Target-URI to get the request URL
        elif line.startswith("WARC-Target-URI:"):
            currentTargetUri = line.split(":", 1)[1].strip()
            skipCurrentResponse = False

            # Check: URL host must contain the input hostname
            if argsInputHostname:
                try:
                    host = urlparse(currentTargetUri).netloc.lower()
                    if argsInputHostname.lower() not in host:
                        skipCurrentResponse = True
                except Exception:
                    skipCurrentResponse = True

            # Check: Filter by URL (FILTER_URL)
            if not skipCurrentResponse and FILTER_URL and currentTargetUri:
                targetLower = currentTargetUri.lower()
                if any(filterUrl in targetLower for filterUrl in filterUrls):
                    skipCurrentResponse = True

            # If we were waiting for Target-URI after seeing WARC-Type: response, and it's valid, start response mode
            if pendingResponseType and not skipCurrentResponse:
                inResponse = True
                pendingResponseType = False

        # We're in a response section (after seeing both WARC-Type: response and a valid WARC-Target-URI)
        elif inResponse:
            if responseStartIdx < 0:
                # Only the first HTTP line is the status line. A later body line that happens
                # to start with "HTTP" must not restart (and so truncate) the response
                if line.startswith("HTTP"):
                    # Extract status code (e.g., "HTTP/1.1 200 OK" -> "200")
                    try:
                        httpStatusCode = line.split()[1]
                    except Exception:
                        httpStatusCode = ""

                    # Early check: Filter by HTTP status code (FILTER_CODE)
                    if FILTER_CODE and httpStatusCode and httpStatusCode in filterCodes:
                        inResponse = False
                        continue

                    responseStartIdx = i  # Mark start of response
                    inHttpHeaders = True
            elif inHttpHeaders:
                if not line.strip():
                    # Blank line ends the HTTP headers; everything after it is body and is not
                    # inspected, so a "content-type:" line in a body cannot change the MIME type
                    inHttpHeaders = False
                elif line.lower().startswith("content-type:"):
                    # Capture Content-Type (case-insensitive check), dropping any parameters like charset
                    try:
                        contentType = line.split(":", 1)[1].strip().split(";")[0].lower()
                    except Exception:
                        pass

                    # Early check: Filter by MIME type (FILTER_MIME)
                    if FILTER_MIME and contentType and contentType in filterMimes:
                        inResponse = False
                        inHttpHeaders = False
                        responseStartIdx = -1

    # Don't forget the last response if file doesn't end with WARC/1.0
    if inResponse and responseStartIdx >= 0:
        responsesFound.append(
            (
                currentTargetUri,
                contentType,
                b"\n".join(lineBytes[responseStartIdx:]),
                httpStatusCode,
            )
        )

    return responsesFound


def _getGhostArchiveExtension(targetUri, contentType, isBinary, archiveHtml):
    """
    Work out the file extension for a saved response: from the URL path first, then the MIME type, then the content.
    """
    extension = ""
    try:
        # Get path extension from URL
        if "://" in targetUri:
            targetUrl = "https://" + targetUri.split("://")[1]
            path = urlparse(targetUrl.strip()).path
            extension = path[path.rindex(".") + 1 :]
            if "/" in extension:
                extension = ""
            # If extension is over 6 characters, it's likely not a real extension (e.g. API endpoint ID)
            if len(extension) > 6:
                extension = ""
    except Exception:
        # No "." in the path (or an unparsable URL) just means there is no URL extension
        extension = ""

    if extension != "":
        return extension

    if isBinary:
        # Binary file extensions from MIME type; checked in this order, first match wins
        binaryMimes = (
            ("image/png", "png"),
            ("image/jpeg", "jpg"),
            ("image/jpg", "jpg"),
            ("image/gif", "gif"),
            ("image/webp", "webp"),
            ("application/pdf", "pdf"),
            ("application/zip", "zip"),
        )
        if contentType:
            for mimeText, mimeExtension in binaryMimes:
                if mimeText in contentType:
                    return mimeExtension
        return "bin"

    # Text file extensions from MIME type; checked in this order, first match wins
    if contentType:
        contentTypeLower = contentType.lower()
        for mimeText, mimeExtension in (
            ("javascript", "js"),
            ("html", "html"),
            ("json", "json"),
            ("text", "txt"),
        ):
            if mimeText in contentTypeLower:
                return mimeExtension

    # No usable MIME type, so check whether the content looks like HTML
    if archiveHtml:
        trimmedHtml = archiveHtml.lower().strip()
        if trimmedHtml.endswith(("</html>", "</body>")) or trimmedHtml.startswith(
            ("<!doctype html", "<html", "<head")
        ):
            return "html"

    return "unknown"


def getGhostArchiveWARC(originalUrl, domUrl):
    """
    Get the DOM for the passed GhostArchive link - parses WARC files containing multiple request/response pairs

    The WARC file URL is derived from domUrl, downloaded (retrying on a 503 or a maintenance page),
    and each response in it that passes the host, URL, status code and MIME filters is written to the
    responses directory. When hash file names are used, a line is also written to the index file.
    In mode B the target URLs found are added to extraWarcLinks if not already known.

    originalUrl is not used by this function.
    Errors are reported with writerr rather than raised.
    """
    global successCount, failureCount, fileCount
    # Local import, done once here rather than on every retry
    import time

    try:
        if stopProgram is not None:
            return

        # The WARC files are found by replacing /archive with /chimurai4 and using the .warc file extension
        warcUrl = domUrl.replace("/archive", "/chimurai4") + ".warc"

        # Get memory usage every 100 responses
        if (successCount + failureCount) % 100 == 0:
            try:
                getMemory()
            except Exception:
                pass

        # Fetch content
        try:
            # Show progress bar, alternating the fill character on each call
            fillChar = "O" if (successCount + failureCount) % 2 == 0 else "o"
            suffix = "Complete "

            printProgressBar(
                successCount + failureCount,
                totalResponses,
                prefix="Processing " + str(totalResponses) + " WARC files:",
                suffix=suffix,
                length=getProgressBarLength(),
                fill=fillChar,
            )

            # Initialised up front so the error handler can tell whether a response was ever received
            resp = None
            try:
                try:
                    if verbose() and os.environ.get("USER") == "xnl":
                        writerr(
                            colored(
                                "[ DBG ] Requesting file " + warcUrl,
                                "yellow",
                                attrs=["dark"],
                            )
                        )
                except Exception:
                    pass

                # Choose a random user agent string to use for any requests
                userAgent = random.choice(USER_AGENT)
                # NOTE: the session is not closed because it mounts the shared HTTP_ADAPTER,
                # and closing the session would also close that adapter
                session = requests.Session()
                session.mount("https://", HTTP_ADAPTER)
                session.mount("http://", HTTP_ADAPTER)

                # Retry loop for 503 or maintenance responses
                maxRetries = 3
                warcBytes = b""
                for attempt in range(maxRetries):
                    resp = session.get(
                        warcUrl,
                        headers={"User-Agent": userAgent},
                        allow_redirects=True,
                        timeout=args.timeout,
                    )
                    warcBytes = resp.content

                    # Check if we need to retry (decode just for this check)
                    warcTextCheck = warcBytes.decode("utf-8", errors="replace").lower()
                    needsRetry = (
                        resp.status_code == 503 or "website under maintenance" in warcTextCheck
                    )
                    if needsRetry and attempt < maxRetries - 1:
                        time.sleep(0.5)
                        continue
                    break

                # Parse the WARC file to extract multiple responses
                responsesFound = _parseGhostArchiveWarc(warcBytes)

                if stopProgram is not None:
                    return

                # Process each response found
                for targetUri, contentType, responseBytes, _httpStatusCode in responsesFound:
                    if stopProgram is not None:
                        break

                    if not responseBytes:
                        continue

                    # Split HTTP header from body in bytes (look for \r\n\r\n or \n\n separator)
                    if b"\r\n\r\n" in responseBytes:
                        bodyBytes = responseBytes.split(b"\r\n\r\n", 1)[1]
                    elif b"\n\n" in responseBytes:
                        bodyBytes = responseBytes.split(b"\n\n", 1)[1]
                    else:
                        bodyBytes = responseBytes

                    # Skip empty bodies or "not found" responses
                    if not bodyBytes or bodyBytes.lower().strip() == b"not found":
                        continue

                    # If -f / --filter-responses-only is passed, track all URLs immediately (before filtering)
                    if args.mode == "B" and args.filter_responses_only and targetUri:
                        with links_lock:
                            if targetUri not in linksFound and targetUri not in extraWarcLinks:
                                extraWarcLinks.add(targetUri)

                    # Use isBinaryContent to detect if this is binary content
                    isBinary = isBinaryContent(bodyBytes, contentType, targetUri)

                    if isBinary:
                        # Binary file - save raw bytes
                        archiveContent = bodyBytes
                        archiveHtml = None
                    else:
                        # Text file - decode to string
                        archiveHtml = bodyBytes.decode("utf-8", errors="replace")
                        archiveContent = None

                        # Collapse multiple blank lines into one
                        archiveHtml = re.sub(r"\n{3,}", "\n\n", archiveHtml)

                        # Skip if body is empty after processing
                        if not archiveHtml.strip():
                            continue

                    if stopProgram is not None:
                        break

                    # Add the URL as a comment at the start of the response (only for HTML and JS text files)
                    if not isBinary and args.url_filename:
                        targetLower = targetUri.lower()
                        isHtml = contentType in [
                            "text/html",
                            "application/xhtml+xml",
                        ] or targetLower.endswith((".html", ".htm"))
                        isJs = contentType in [
                            "text/javascript",
                            "application/javascript",
                            "application/x-javascript",
                        ] or targetLower.endswith(".js")
                        if isHtml:
                            archiveHtml = "<!-- Original URL: " + targetUri + " -->\n" + archiveHtml
                        elif isJs:
                            archiveHtml = "/* Original URL: " + targetUri + " */\n" + archiveHtml

                    # Create file name based on url or hash value
                    if args.url_filename:
                        fileName = targetUri.replace("/", "-").replace(":", "")
                        fileName = fileName[0:254]
                        hashValue = ""
                    else:
                        # Hash the content to get the filename
                        hashValue = filehash(archiveContent if isBinary else archiveHtml)
                        fileName = hashValue

                    # Determine extension of file from the URL, content-type or content
                    extension = _getGhostArchiveExtension(
                        targetUri, contentType, isBinary, archiveHtml
                    )
                    fileName = fileName + "." + extension

                    # Determine file path
                    if args.output_responses != "":
                        filePath = args.output_responses + "/" + f"{fileName}"
                    else:
                        filePath = (
                            DEFAULT_OUTPUT_DIR
                            + "/results/"
                            + str(argsInput).replace("/", "-")
                            + "/"
                            + f"{fileName}"
                        )

                    if stopProgram is not None:
                        break

                    # Write the file. "with" makes sure the handle is closed even if the write fails
                    try:
                        if isBinary:
                            # Binary file - write as bytes
                            with open(filePath, "wb") as responseFile:
                                responseFile.write(archiveContent)
                        else:
                            # Text file - write as UTF-8
                            with open(filePath, "w", encoding="utf8") as responseFile:
                                responseFile.write(archiveHtml)
                        with links_lock:
                            fileCount = fileCount + 1

                        # Track extra URLs found in WARC files for mode B (only when -f is not passed, since we track earlier if it is)
                        if args.mode == "B" and not args.filter_responses_only and targetUri:
                            with links_lock:
                                if targetUri not in linksFound and targetUri not in extraWarcLinks:
                                    extraWarcLinks.add(targetUri)
                    except Exception as e:
                        writerr(
                            colored(
                                "GhostArchive - [ ERR ] Failed to write file "
                                + filePath
                                + ": "
                                + str(e),
                                "red",
                            )
                        )

                    # Write the hash value and URL to the index file
                    if not args.url_filename and hashValue:
                        try:
                            timestamp = str(datetime.now())
                            indexFile.write(
                                hashValue
                                + ","
                                + domUrl
                                + "#"
                                + targetUri
                                + " ,"
                                + timestamp
                                + "\n"
                            )
                            indexFile.flush()
                        except Exception as e:
                            writerr(
                                colored(
                                    'GhostArchive - [ ERR ] Failed to write to waymore_index.txt for "'
                                    + warcUrl
                                    + '": '
                                    + str(e),
                                    "red",
                                )
                            )

                # NOTE(review): counted once per WARC file, because the progress bar total is a
                # number of WARC files - confirm against the original indentation
                successCount = successCount + 1

            except WayBackException:
                failureCount = failureCount + 1

            except Exception as e:
                failureCount = failureCount + 1
                if verbose():
                    # Simplify common error messages
                    if "connection broken" in str(e).lower():
                        errorMsg = "Connection Broken"
                    else:
                        errorMsg = str(e)
                    statusCode = resp.status_code if resp is not None else "ERR"
                    writerr(
                        colored(
                            "GhostArchive - [ "
                            + str(statusCode)
                            + ' ] Failed to get response for "'
                            + warcUrl
                            + '": '
                            + errorMsg,
                            "red",
                        )
                    )

            # Show memory usage if -v option chosen, and check memory every 25 responses (or if it's the last)
            processedCount = successCount + failureCount
            if processedCount % 25 == 1 or processedCount == totalResponses:
                try:
                    getMemory()
                    if verbose():
                        suffix = (
                            "Complete (Mem Usage "
                            + humanReadableSize(currentMemUsage)
                            + ", Total Mem "
                            + str(currentMemPercent)
                            + "%)   "
                        )
                except Exception:
                    if verbose():
                        suffix = 'Complete (To show mem use, run "pip install psutil")'
            printProgressBar(
                successCount + failureCount,
                totalResponses,
                prefix="Processing " + str(totalResponses) + " WARC files:",
                suffix=suffix,
                length=getProgressBarLength(),
                fill=fillChar,
            )

        except Exception as e:
            if verbose():
                writerr(
                    colored(
                        'GhostArchive - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red"
                    )
                )

    except Exception as e:
        writerr(colored("ERROR getGhostArchiveWARC 1: " + str(e), "red"))
|
|
4063
|
+
|
|
4064
|
+
|
|
3531
4065
|
def format_date_for_urlscan(date_str):
|
|
3532
4066
|
# Handle different lengths of input
|
|
3533
4067
|
if len(date_str) == 4: # YYYY
|
|
@@ -4198,7 +4732,6 @@ def processWayBackPage(url):
|
|
|
4198
4732
|
pass
|
|
4199
4733
|
return
|
|
4200
4734
|
else:
|
|
4201
|
-
print("DEBUG: HERE END!") # DEBUG
|
|
4202
4735
|
pass
|
|
4203
4736
|
except Exception as e:
|
|
4204
4737
|
if verbose():
|
|
@@ -5456,13 +5989,310 @@ def getIntelxUrls():
|
|
|
5456
5989
|
writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
|
|
5457
5990
|
|
|
5458
5991
|
|
|
5459
|
-
def
|
|
5992
|
+
def processGhostArchiveUrl(url, ghostArchiveID=""):
    """
    Process a specific URL from ghostarchive.org to determine whether to save the link

    url            - the archived URL text scraped from a GhostArchive results page. A leading
                     Wayback Machine prefix is stripped before any filtering is applied.
    ghostArchiveID - the "/archive/<id>" path of the snapshot. When it is not empty and the
                     mode is R or B, the snapshot is queued in ghostArchiveRequestLinks so the
                     response can be downloaded later (subject to args.limit).

    Nothing is returned. Any error is reported with writerr() and not raised to the caller.
    """
    global argsInput, argsInputHostname, links_lock, linkCountGhostArchive, linksFoundGhostArchive

    addLink = True

    try:
        # Strip Wayback Machine prefix if present (e.g., https://web.archive.org/web/20230101120000_/https://example.com)
        waybackMatch = re.match(r"^https?://web\.archive\.org/[^/]+/[a-zA-Z0-9]+_/", url)
        if waybackMatch:
            url = url[waybackMatch.end() :]

        # The link will only be added if it contains the input value.
        # NOTE(review): the original comment says "if the input has a / in it", but the test is made
        # on the URL, not on argsInput - confirm which one is intended.
        if "/" in url:
            if argsInput not in url:
                addLink = False

        # If filters are required then test them
        if addLink and not args.filter_responses_only:

            # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
            if args.no_subs:
                # The scheme class is [A-Za-z]; the previous [A-za-z] range also matched [ \ ] ^ _ and `
                match = re.search(
                    r"^[A-Za-z]*\:\/\/(www\.)?" + re.escape(argsInputHostname),
                    url,
                    flags=re.IGNORECASE,
                )
                if match is None:
                    addLink = False

            # Check the URL exclusions (GhostArchive gives no HTTP code or MIME type to filter on)
            if addLink:
                match = re.search(
                    r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
                    url,
                    flags=re.IGNORECASE,
                )
                if match is not None:
                    addLink = False

            # Set keywords filter if -ko argument passed
            if addLink and args.keywords_only:
                if args.keywords_only == "#CONFIG":
                    match = re.search(
                        r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
                        url,
                        flags=re.IGNORECASE,
                    )
                else:
                    # The -ko value is used as a regex, so it is deliberately not escaped
                    match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
                if match is None:
                    addLink = False

        # Add link if it passed filters
        if addLink:
            # Just get the hostname of the url
            tldExtract = tldextract.extract(url)
            subDomain = tldExtract.subdomain
            if subDomain != "":
                subDomain = subDomain + "."
            domainOnly = subDomain + tldExtract.domain + "." + tldExtract.suffix

            # GhostArchive might return URLs that aren't for the domain passed so we need to check for those and not process them
            # Check the URL
            match = re.search(
                r"(^|\.)" + re.escape(argsInputHostname) + "$",
                domainOnly,
                flags=re.IGNORECASE,
            )
            if match is not None:
                if args.mode in ("U", "B"):
                    linksFoundAdd(url, linksFoundGhostArchive)
                # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
                if ghostArchiveID != "" and args.mode in ("R", "B"):
                    # Test the limit while holding the lock so the check and the add are one atomic step
                    with links_lock:
                        if args.limit == 0 or len(ghostArchiveRequestLinks) < args.limit:
                            ghostArchiveRequestLinks.add(
                                (url, GHOSTARCHIVE_DOM_URL + ghostArchiveID)
                            )

    except Exception as e:
        writerr(colored("ERROR processGhostArchiveUrl 1: " + str(e), "red"))
|
|
6079
|
+
|
|
6080
|
+
|
|
6081
|
+
def getGhostArchiveUrls():
    """
    Get URLs from GhostArchive (ghostarchive.org)
    This source doesn't have an API, so we crawl the HTML pages directly.

    Result pages are requested one at a time (page 0, 1, 2, ...) until the site reports no more
    results, a page has no archive links, there is no "Next Page" link, an error or rate limit
    is hit, or the program/source is flagged to stop. Each archive link found is passed to
    processGhostArchiveUrl(). In check only mode just the first page is requested and
    checkGhostArchive is set to the estimated number of requests.

    Nothing is returned. Any error is reported with writerr() and not raised to the caller.
    """
    global linksFound, path, subs, stopProgram, stopSourceGhostArchive, argsInput, checkGhostArchive, argsInputHostname, linkCountGhostArchive, linksFoundGhostArchive

    # Imported locally so this function does not rely on a module level import being present
    from html import unescape as htmlUnescape

    try:
        stopSourceGhostArchive = False
        linksFoundGhostArchive = set()

        # Build the base URL
        # If there is only one . in the hostname, we can guarantee that a subdoman wasn't passed, so we can prefix with . to the links quicker as it won't include other domains that end with the target domain,
        # Else, we need to get all and then confirm the actual host of the links later
        if argsInputHostname.count(".") == 1:
            baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", "." + quote(argsInput))
        else:
            baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(argsInput))

        if verbose():
            write(
                colored("GhostArchive - [ INFO ] The URL requested to get links: ", "magenta")
                + colored(baseUrl + "0\n", "white")
            )

        if not args.check_only and args.mode == "U":
            write(
                colored(
                    "GhostArchive - [ INFO ] Getting links from ghostarchive.org (this can take a while for some domains)...",
                    "cyan",
                )
            )

        userAgent = random.choice(USER_AGENT)
        headers = {"User-Agent": userAgent}
        cookies = {"theme": "original"}

        # Pattern matches: <a href="/archive/ID">URL_HERE</a> - captures both href path and URL
        # Compiled once here instead of on every page
        archiveLinkPattern = re.compile(r'<a href="(/archive/[^"]*)">([^<]+)</a>')

        pageNum = 0

        # Set up session with cookie. The context manager makes sure the session (and its pooled
        # connections) is closed however the loop below is left.
        with requests.Session() as session:
            if HTTP_ADAPTER is not None:
                session.mount("https://", HTTP_ADAPTER)
                session.mount("http://", HTTP_ADAPTER)

            while stopProgram is None and not stopSourceGhostArchive:
                getMemory()

                url = baseUrl + str(pageNum)

                try:
                    resp = session.get(
                        url, headers=headers, cookies=cookies, timeout=DEFAULT_TIMEOUT
                    )
                except Exception as e:
                    writerr(
                        colored(
                            "GhostArchive - [ ERR ] Unable to get page "
                            + str(pageNum)
                            + ": "
                            + str(e),
                            "red",
                        )
                    )
                    break

                if resp.status_code == 429:
                    writerr(
                        colored(
                            "GhostArchive - [ 429 ] Rate limit reached at page "
                            + str(pageNum)
                            + ".",
                            "red",
                        )
                    )
                    break

                # Check for maintenance/end of results indicator
                if (
                    resp.status_code == 503
                    or "The site is under maintenance and will be back soon" in resp.text
                    or "No archives for that site" in resp.text
                ):
                    if pageNum == 0 and args.check_only:
                        # Check only mode must count and report the request whether or not -v was
                        # passed, the same as the other check only branch below
                        checkGhostArchive = 1
                        write(
                            colored(
                                "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                            )
                            + colored("1 request", "white")
                        )
                    elif verbose():
                        if pageNum == 0:
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] No results found",
                                    "cyan",
                                )
                            )
                        else:
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Retrieved all results from "
                                    + str(pageNum)
                                    + " pages",
                                    "cyan",
                                )
                            )
                    break
                if resp.status_code != 200:
                    writerr(
                        colored(
                            "GhostArchive - [ ERR ] [ "
                            + str(resp.status_code)
                            + " ] at page "
                            + str(pageNum),
                            "red",
                        )
                    )
                    break

                # Check only mode - just count pages
                if args.check_only:
                    # For check only, we check if there are results and try to get total count
                    if pageNum == 0:
                        # Check if there are any results on the first page
                        if '<a href="/archive/' in resp.text:
                            # Try to find "out of X" to determine total results/pages
                            outOfMatch = re.search(r"out of (\d+)", resp.text)
                            if outOfMatch:
                                totalResults = int(outOfMatch.group(1))
                                checkGhostArchive = totalResults
                                write(
                                    colored(
                                        "GhostArchive - [ INFO ] Get URLs from GhostArchive: ",
                                        "cyan",
                                    )
                                    + colored(
                                        f"{totalResults} requests (pagination required)", "white"
                                    )
                                )
                            else:
                                checkGhostArchive = 1
                                write(
                                    colored(
                                        "GhostArchive - [ INFO ] Get URLs from GhostArchive: ",
                                        "cyan",
                                    )
                                    + colored("unknown requests (pagination required)", "white")
                                )
                        else:
                            checkGhostArchive = 1
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored("1 request (no results)", "white")
                            )
                    break

                # Use regex to extract URLs from anchor tag text content
                matches = archiveLinkPattern.findall(resp.text)

                # If no matches found, we've reached the end of results
                if not matches:
                    if verbose():
                        write(
                            colored(
                                "GhostArchive - [ INFO ] Retrieved all results from "
                                + str(pageNum + 1)
                                + " pages",
                                "cyan",
                            )
                        )
                    break

                for match in matches:
                    ghostArchiveId = match[0]  # e.g., "/archive/gkOOR"
                    # The URL is taken from HTML text content, so entities such as &amp; have to
                    # be decoded to get the real URL back
                    potentialUrl = htmlUnescape(match[1].strip())
                    processGhostArchiveUrl(potentialUrl, ghostArchiveId)

                # Check if there's a "Next Page" link - if not, we've reached the last page
                # GhostArchive resets to Page 1 when exceeding actual pages, so checking for Next Page is essential
                if "Next Page" not in resp.text and ">»</a>" not in resp.text:
                    if verbose():
                        write(
                            colored(
                                "GhostArchive - [ INFO ] Retrieved all results from "
                                + str(pageNum + 1)
                                + " pages",
                                "cyan",
                            )
                        )
                    break

                pageNum += 1

        if not args.check_only:
            # Count links based on mode - in R mode, count response links; in U/B mode, count URL links
            if args.mode == "R":
                linkCountGhostArchive = len(ghostArchiveRequestLinks)
            else:
                linkCountGhostArchive = len(linksFoundGhostArchive)
            write(
                colored("GhostArchive - [ INFO ] Links found on ghostarchive.org: ", "cyan")
                + colored(str(linkCountGhostArchive), "white")
            )
            linksFound.update(linksFoundGhostArchive)
            linksFoundGhostArchive.clear()

    except Exception as e:
        writerr(colored("ERROR getGhostArchiveUrls 1: " + str(e), "red"))
|
|
6283
|
+
|
|
6284
|
+
|
|
6285
|
+
def processResponses():
|
|
5460
6286
|
"""
|
|
5461
6287
|
Get archived responses from al sources
|
|
5462
6288
|
"""
|
|
5463
6289
|
global stopProgram, totalFileCount
|
|
5464
6290
|
try:
|
|
5465
6291
|
|
|
6292
|
+
# Get responses from GhostArchive unless excluded
|
|
6293
|
+
if stopProgram is None and not args.xga:
|
|
6294
|
+
processResponsesGhostArchive()
|
|
6295
|
+
|
|
5466
6296
|
# Get responses from URLScan unless excluded
|
|
5467
6297
|
if stopProgram is None and not args.xus:
|
|
5468
6298
|
processResponsesURLScan()
|
|
@@ -5484,6 +6314,235 @@ def processResponses():
|
|
|
5484
6314
|
writerr(colored(getSPACER("ERROR processResponses 1: " + str(e)), "red"))
|
|
5485
6315
|
|
|
5486
6316
|
|
|
6317
|
+
def processResponsesGhostArchive():
    """
    Get archived responses from GhostArchive (ghostarchive.org)

    Builds the list of (original URL, GhostArchive URL) pairs from ghostArchiveRequestLinks
    (calling getGhostArchiveUrls() first when the mode is R), downloads them with a pool of
    workers running getGhostArchiveWARC(), and then reports how many responses were saved.
    In mode B any links in extraWarcLinks that are not already known are appended to the
    URL output file. In check only mode nothing is downloaded or written.

    Nothing is returned. Any error is reported with writerr() and not raised to the caller.
    """
    global subs, path, indexFile, totalResponses, stopProgram, argsInput, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, ghostArchiveRequestLinks, failureCount, totalFileCount, checkGhostArchive
    try:
        fileCount = 0
        failureCount = 0
        if not args.check_only:
            # Create 'results' and domain directory if needed
            createDirs()

            # Get the path of the files, depending on whether -oR / --output_responses was passed
            try:
                responsesPath = responseOutputDirectory + "responses.GhostArchive.tmp"
                indexPath = responseOutputDirectory + "waymore_index.txt"
            except Exception as e:
                if verbose():
                    writerr(colored("ERROR processResponsesGhostArchive 4: " + str(e), "red"))

        # Get URLs from GhostArchive if the DOM ID's haven't been retrieved yet
        if stopProgram is None and not args.check_only:
            if args.mode in ("R", "B"):
                write(
                    colored(
                        "GhostArchive - [ INFO ] Getting list of response links (this can take a while for some domains)...",
                        "cyan",
                    )
                )
            if args.mode == "R":
                getGhostArchiveUrls()

        # An existing responses.GhostArchive.tmp file is deliberately NOT loaded. Its contents were
        # always replaced by the list built below, so unpickling it achieved nothing, but a corrupt
        # file left by an interrupted run would raise and stop any responses being downloaded.

        # Set start point
        successCount = 0

        # Get the GhostArchive links
        linkRequests = [
            (originalUrl, domUrl) for originalUrl, domUrl in ghostArchiveRequestLinks
        ]

        # Write the links to a temp file
        if not args.check_only:
            with open(responsesPath, "wb") as f:
                pickle.dump(linkRequests, f)

        # Get the total number of responses we will try to get and set the current file count to the success count
        totalResponses = len(linkRequests)
        checkGhostArchive = checkGhostArchive + totalResponses

        # If there are no reponses to download, display an error and exit
        if args.mode != "R" and totalResponses == 0:
            writerr(
                colored(
                    getSPACER(
                        "Failed to get responses from GhostArchive (ghostarchive.org) - check input and try again."
                    ),
                    "red",
                )
            )
            return

        fileCount = successCount

        if args.check_only:
            writerr(
                colored("Downloading archived responses: ", "cyan")
                + colored("UNKNOWN requests", "cyan")
            )
            writerr(
                colored(
                    "\n-> Downloading the responses can vary depending on the target and the rate limiting on GhostArchive",
                    "green",
                )
            )
            write("")
        else:
            # If the limit has been set over the default, give a warning that this could take a long time!
            if totalResponses - successCount > DEFAULT_LIMIT:
                if successCount > 0:
                    writerr(
                        colored(
                            getSPACER(
                                "WARNING: Downloading remaining "
                                + str(totalResponses - successCount)
                                + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
                            ),
                            "yellow",
                        )
                    )
                else:
                    writerr(
                        colored(
                            getSPACER(
                                "WARNING: Downloading "
                                + str(totalResponses)
                                + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
                            ),
                            "yellow",
                        )
                    )

            # Open the index file if hash value is going to be used (not URL)
            if not args.url_filename:
                indexFile = open(indexPath, "a")

            try:
                # Process the URLs from GhostArchive
                if stopProgram is None:
                    p = mp.Pool(
                        args.processes * 2
                    )  # Double the number of processes to speed up the download
                    try:
                        p.starmap(getGhostArchiveWARC, linkRequests[successCount:])
                    finally:
                        # Always release the workers, even if a download raised
                        p.close()
                        p.join()

                # Delete the tmp files now it has run successfully
                if stopProgram is None:
                    try:
                        os.remove(responsesPath)
                    except Exception:
                        # Best effort only - a tmp file left behind does no harm
                        pass
            finally:
                # Close the index file if hash value is going to be used (not URL), even on error
                if not args.url_filename:
                    indexFile.close()

        if not args.check_only:
            try:
                # The same message is shown whether or not -v was passed
                savedMessage = (
                    colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
                    + colored(responseOutputDirectory, "white")
                    + colored(" for " + subs + argsInput + ": ", "cyan")
                )
                if failureCount > 0:
                    savedMessage = (
                        savedMessage
                        + colored(str(fileCount) + " 🤘", "white")
                        + colored(" (" + str(failureCount) + " not found)\n", "red")
                    )
                else:
                    savedMessage = savedMessage + colored(str(fileCount) + " 🤘\n", "white")
                write(savedMessage)
            except Exception as e:
                if verbose():
                    writerr(colored("ERROR processResponsesGhostArchive 5: " + str(e), "red"))

        # Append extra links from WARC files to URL output file (for mode B)
        try:
            if args.mode == "B" and len(extraWarcLinks) > 0:
                # Determine URL output file path (same logic as processURLOutput)
                if args.output_urls == "":
                    if args.output_responses != "":
                        urlFilePath = args.output_responses + "/waymore.txt"
                    else:
                        urlFilePath = (
                            str(DEFAULT_OUTPUT_DIR)
                            + "/results/"
                            + str(argsInput).replace("/", "-")
                            + "/waymore.txt"
                        )
                else:
                    urlFilePath = args.output_urls

                # Load existing URLs from file to avoid duplicates
                existingUrls = set()
                try:
                    with open(urlFilePath) as f:
                        for line in f:
                            existingUrls.add(line.strip())
                except Exception:
                    # The file may not exist yet, in which case there is nothing to de-duplicate against
                    pass

                # Append only new unique URLs
                newLinks = [
                    url
                    for url in extraWarcLinks
                    if url not in existingUrls and url not in linksFound
                ]
                if len(newLinks) > 0:
                    with open(urlFilePath, "a") as f:
                        for url in newLinks:
                            f.write(url + "\n")

                    # Display message about extra links
                    # NOTE(review): assumed to be shown only when links were actually added - confirm
                    write(
                        colored("GhostArchive - [ INFO ] ", "cyan")
                        + colored(str(len(newLinks)), "white")
                        + colored(" extra links found in WARC files added to file ", "cyan")
                        + colored(urlFilePath, "white")
                        + "\n"
                    )
        except Exception as e:
            if verbose():
                writerr(colored("ERROR processResponsesGhostArchive 6: " + str(e), "red"))

        totalFileCount = totalFileCount + fileCount
    except Exception as e:
        writerr(colored(getSPACER("ERROR processResponsesGhostArchive 1: " + str(e)), "red"))
    finally:
        linkRequests = None
|
|
6544
|
+
|
|
6545
|
+
|
|
5487
6546
|
def processResponsesURLScan():
|
|
5488
6547
|
"""
|
|
5489
6548
|
Get archived responses from URLScan (urlscan.io)
|
|
@@ -6699,6 +7758,12 @@ async def fetch_intelx_async():
|
|
|
6699
7758
|
await loop.run_in_executor(None, getIntelxUrls)
|
|
6700
7759
|
|
|
6701
7760
|
|
|
7761
|
+
async def fetch_ghostarchive_async():
    """Awaitable wrapper that runs the blocking getGhostArchiveUrls in the loop's default executor"""
    await asyncio.get_event_loop().run_in_executor(None, getGhostArchiveUrls)
|
|
7765
|
+
|
|
7766
|
+
|
|
6702
7767
|
async def fetch_all_sources_async():
|
|
6703
7768
|
"""
|
|
6704
7769
|
Orchestrator function to fetch from all enabled sources concurrently.
|
|
@@ -6721,6 +7786,8 @@ async def fetch_all_sources_async():
|
|
|
6721
7786
|
tasks.append(("VirusTotal", fetch_virustotal_async()))
|
|
6722
7787
|
if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
|
|
6723
7788
|
tasks.append(("Intelligence X", fetch_intelx_async()))
|
|
7789
|
+
if not args.xga and stopProgram is None:
|
|
7790
|
+
tasks.append(("GhostArchive", fetch_ghostarchive_async()))
|
|
6724
7791
|
|
|
6725
7792
|
if not tasks:
|
|
6726
7793
|
return
|
|
@@ -6746,7 +7813,7 @@ async def fetch_all_sources_async():
|
|
|
6746
7813
|
|
|
6747
7814
|
# Run waymore
|
|
6748
7815
|
def main():
|
|
6749
|
-
global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
|
|
7816
|
+
global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, extraWarcLinks
|
|
6750
7817
|
|
|
6751
7818
|
# Tell Python to run the handler() function when SIGINT is received
|
|
6752
7819
|
signal(SIGINT, handler)
|
|
@@ -6902,13 +7969,19 @@ def main():
|
|
|
6902
7969
|
help="Exclude checks for links from intelx.io",
|
|
6903
7970
|
default=False,
|
|
6904
7971
|
)
|
|
7972
|
+
parser.add_argument(
|
|
7973
|
+
"-xga",
|
|
7974
|
+
action="store_true",
|
|
7975
|
+
help="Exclude checks for links from ghostarchive.org",
|
|
7976
|
+
default=False,
|
|
7977
|
+
)
|
|
6905
7978
|
parser.add_argument(
|
|
6906
7979
|
"--providers",
|
|
6907
7980
|
action="store",
|
|
6908
|
-
help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal and
|
|
7981
|
+
help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal,intelx and ghostarchive. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
|
|
6909
7982
|
default=[],
|
|
6910
7983
|
type=validateArgProviders,
|
|
6911
|
-
metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx}",
|
|
7984
|
+
metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive}",
|
|
6912
7985
|
)
|
|
6913
7986
|
parser.add_argument(
|
|
6914
7987
|
"-lcc",
|
|
@@ -7075,6 +8148,10 @@ def main():
|
|
|
7075
8148
|
args.xix = True
|
|
7076
8149
|
else:
|
|
7077
8150
|
args.xix = False
|
|
8151
|
+
if "ghostarchive" not in args.providers:
|
|
8152
|
+
args.xga = True
|
|
8153
|
+
else:
|
|
8154
|
+
args.xga = False
|
|
7078
8155
|
|
|
7079
8156
|
# If no input was given, raise an error
|
|
7080
8157
|
if sys.stdin.isatty():
|
|
@@ -7145,6 +8222,7 @@ def main():
|
|
|
7145
8222
|
# Reset global variables
|
|
7146
8223
|
linksFound = set()
|
|
7147
8224
|
linkMimes = set()
|
|
8225
|
+
extraWarcLinks = set()
|
|
7148
8226
|
successCount = 0
|
|
7149
8227
|
failureCount = 0
|
|
7150
8228
|
fileCount = 0
|
|
@@ -7159,6 +8237,7 @@ def main():
|
|
|
7159
8237
|
stopSourceURLScan = False
|
|
7160
8238
|
stopSourceVirusTotal = False
|
|
7161
8239
|
stopSourceIntelx = False
|
|
8240
|
+
stopSourceGhostArchive = False
|
|
7162
8241
|
|
|
7163
8242
|
# Get the config settings from the config.yml file
|
|
7164
8243
|
getConfig()
|