waymore-4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -0
- waymore/waymore.py +3239 -0
- waymore-4.0.dist-info/LICENSE +21 -0
- waymore-4.0.dist-info/METADATA +282 -0
- waymore-4.0.dist-info/RECORD +8 -0
- waymore-4.0.dist-info/WHEEL +5 -0
- waymore-4.0.dist-info/entry_points.txt +2 -0
- waymore-4.0.dist-info/top_level.txt +1 -0
waymore/waymore.py
ADDED
@@ -0,0 +1,3239 @@
#!/usr/bin/env python
# Python 3
# waymore - by @Xnl-h4ck3r: Find way more from the Wayback Machine (also get links from Common Crawl, AlienVault OTX, URLScan and VirusTotal)
# Full help here: https://github.com/xnl-h4ck3r/waymore/blob/main/README.md
# Good luck and good hunting! If you really love the tool (or any others), or they helped you find an awesome bounty, consider BUYING ME A COFFEE! (https://ko-fi.com/xnlh4ck3r) ☕ (I could use the caffeine!)

from urllib.parse import urlparse
import requests
from requests.exceptions import ConnectionError
from requests.utils import quote
from requests.adapters import HTTPAdapter, Retry
import argparse
from signal import SIGINT, signal
import multiprocessing.dummy as mp
from termcolor import colored
from datetime import datetime, timedelta
from pathlib import Path
import yaml
import os
import json
import re
import random
import sys
import math
import enum
import pickle
import time
import tldextract
try:
    from . import __version__
except:
    pass
from tqdm import tqdm

# Try to import psutil to show memory usage
try:
    import psutil
except:
    currentMemUsage = -1
    maxMemoryUsage = -1
    currentMemPercent = -1
    maxMemoryPercent = -1

# Creating stopProgram enum
class StopProgram(enum.Enum):
    SIGINT = 1
    WEBARCHIVE_PROBLEM = 2
    MEMORY_THRESHOLD = 3
stopProgram = None

# Global variables
linksFound = set()
linkMimes = set()
inputValues = set()
argsInput = ''
isInputFile = False
stopProgramCount = 0
stopSource = False
successCount = 0
failureCount = 0
fileCount = 0
totalResponses = 0
totalPages = 0
indexFile = None
continueRespFile = None
inputIsDomainANDPath = False
inputIsSubDomain = False
subs = '*.'
path = ''
waymorePath = ''
terminalWidth = 135
maxMemoryUsage = 0
currentMemUsage = 0
maxMemoryPercent = 0
currentMemPercent = 0
HTTP_ADAPTER = None
HTTP_ADAPTER_CC = None
checkWayback = 0
checkCommonCrawl = 0
checkAlienVault = 0
checkURLScan = 0
checkVirusTotal = 0
argsInputHostname = ''
responseOutputDirectory = ''

# Source Provider URLs
WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
VIRUSTOTAL_URL = 'https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}'

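# Illustrative only (the real substitution happens later in this file when each source is queried):
# the {DOMAIN} and other placeholders are filled in before the request is made, e.g.
#   WAYBACK_URL.replace('{DOMAIN}', quote('example.com/*')).replace('{COLLAPSE}', '')
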
# User Agents to use when making requests, chosen at random
USER_AGENT = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/99.0.1150.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (iPad; CPU OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4",
]
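# A random entry from this list is picked for each outgoing request made later in the script, e.g.:
#   userAgent = random.choice(USER_AGENT)
#   resp = session.get(url, headers={"User-Agent": userAgent})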

# The default maximum number of responses to download
DEFAULT_LIMIT = 5000

# The default timeout for archived responses to be retrieved in seconds
DEFAULT_TIMEOUT = 30

# Exclusions used to exclude responses we will try to get from web.archive.org
DEFAULT_FILTER_URL = '.css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap'

# MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
DEFAULT_FILTER_MIME = 'text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff'

# Response code exclusions we will use to filter links and responses from web.archive.org through their API
DEFAULT_FILTER_CODE = '404,301,302'

# Used to filter out downloaded responses that could be custom 404 pages
REGEX_404 = r'<title>[^\<]*(404|not found)[^\<]*</title>'

# Keywords
DEFAULT_FILTER_KEYWORDS = 'admin,login,logon,signin,signup,register,registration,dash,portal,ftp,panel,.js,api,robots.txt,graph,gql,config,backup,debug,db,database,git,cgi-bin,swagger,zip,rar,tar.gz,internal,jira,jenkins,confluence,atlassian,okta,corp,upload,delete,email,sql,create,edit,test,temp,cache,wsdl,log,payment,setting,mail,file,redirect,chat,billing,doc,trace,cp,ftp,gateway,import,proxy,dev,stage,stg,uat'

# Yaml config values
FILTER_URL = ''
FILTER_MIME = ''
FILTER_CODE = ''
MATCH_CODE = ''
FILTER_KEYWORDS = ''
URLSCAN_API_KEY = ''
CONTINUE_RESPONSES_IF_PIPED = True
WEBHOOK_DISCORD = ''
DEFAULT_OUTPUT_DIR = ''

API_KEY_SECRET = "aHR0cHM6Ly95b3V0dS5iZS9kUXc0dzlXZ1hjUQ=="

# When -oijs is passed, and the downloaded responses are checked for scripts, files with these extensions will be ignored
INLINE_JS_EXCLUDE = ['.js', '.csv', '.xls', '.xlsx', '.doc', '.docx', '.pdf', '.msi', '.zip', '.gzip', '.gz', '.tar', '.rar', '.json']

# Get memory usage for
def getMemory():

    global currentMemUsage, currentMemPercent, maxMemoryUsage, maxMemoryPercent, stopProgram

    try:
        currentMemUsage = process.memory_info().rss
        currentMemPercent = math.ceil(psutil.virtual_memory().percent)
        if currentMemUsage > maxMemoryUsage:
            maxMemoryUsage = currentMemUsage
        if currentMemPercent > maxMemoryPercent:
            maxMemoryPercent = currentMemPercent
        if currentMemPercent > args.memory_threshold:
            stopProgram = StopProgram.MEMORY_THRESHOLD
    except:
        pass

# Convert bytes to human readable form
def humanReadableSize(size, decimal_places=2):
    for unit in ["B", "KB", "MB", "GB", "TB", "PB"]:
        if size < 1024.0 or unit == "PB":
            break
        size /= 1024.0
    return f"{size:.{decimal_places}f} {unit}"
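# For example: humanReadableSize(1536) returns "1.50 KB", and humanReadableSize(3 * 1024**3, 1) returns "3.0 GB".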

# Display stats if -v argument was chosen
def processStats():
    if maxMemoryUsage > 0:
        write("MAX MEMORY USAGE: " + humanReadableSize(maxMemoryUsage))
    elif maxMemoryUsage < 0:
        write('MAX MEMORY USAGE: To show memory usage, run "pip install psutil"')
    if maxMemoryPercent > 0:
        write(
            "MAX TOTAL MEMORY: "
            + str(maxMemoryPercent)
            + "% (Threshold "
            + str(args.memory_threshold)
            + "%)"
        )
    elif maxMemoryUsage < 0:
        write('MAX TOTAL MEMORY: To show total memory %, run "pip install psutil"')
    write()

def write(text='',pipe=False):
    # Only send text to stdout if the tool isn't piped to pass output to something else,
    # or if the tool has been piped and the pipe parameter is True
    if sys.stdout.isatty() or (not sys.stdout.isatty() and pipe):
        # If it has carriage return in the string, don't add a newline
        if text.find('\r') > 0:
            sys.stdout.write(text)
        else:
            sys.stdout.write(text+'\n')

def writerr(text='',pipe=False):
    # Only send text to stdout if the tool isn't piped to pass output to something else,
    # or, if the tool has been piped, send the output to stderr
    if sys.stdout.isatty():
        # If it has carriage return in the string, don't add a newline
        if text.find('\r') > 0:
            sys.stdout.write(text)
        else:
            sys.stdout.write(text+'\n')
    else:
        # If it has carriage return in the string, don't add a newline
        if text.find('\r') > 0:
            sys.stderr.write(text)
        else:
            sys.stderr.write(text+'\n')

def showVersion():
    try:
        try:
            resp = requests.get('https://raw.githubusercontent.com/xnl-h4ck3r/waymore/main/waymore/__init__.py',timeout=3)
        except:
            write('Current waymore version '+__version__+' (unable to check if latest)\n')
        if __version__ == resp.text.split('=')[1].replace('"',''):
            write('Current waymore version '+__version__+' ('+colored('latest','green')+')\n')
        else:
            write('Current waymore version '+__version__+' ('+colored('outdated','red')+')\n')
    except:
        pass

def showBanner():
    write()
    write(colored(" _ _ _ _ _ ","red")+"____ ")
    write(colored("| | | |_____| | | ","red")+r"/ \ ___ ____ _____ ")
    write(colored("| | | (____ | | | ","red")+r"| | | |/ _ \ / ___) ___ |")
    write(colored("| | | / ___ | |_| ","red")+"| | | | |_| | | | |_| |")
    write(colored(r" \___/\_____|\__ ","red")+r"|_|_|_|\___/| | | ____/")
    write(colored(" (____/ ","red")+colored(" by Xnl-h4ck3r ","magenta")+r" \_____)")
    try:
        currentDate = datetime.now().date()
        if currentDate.month == 12 and currentDate.day in (24,25):
            write(colored(" *** 🎅 HAPPY CHRISTMAS! 🎅 ***","green",attrs=["blink"]))
        elif currentDate.month == 10 and currentDate.day == 31:
            write(colored(" *** 🎃 HAPPY HALLOWEEN! 🎃 ***","red",attrs=["blink"]))
        elif currentDate.month == 1 and currentDate.day in (1,2,3,4,5):
            write(colored(" *** 🥳 HAPPY NEW YEAR!! 🥳 ***","yellow",attrs=["blink"]))
    except:
        pass
    write()
    showVersion()

def verbose():
    """
    Function used when printing messages dependent on verbose option
    """
    return args.verbose

def handler(signal_received, frame):
    """
    This function is called if Ctrl-C is called by the user
    An attempt will be made to try and clean up properly
    """
    global stopSource, stopProgram, stopProgramCount

    if stopProgram is not None:
        stopProgramCount = stopProgramCount + 1
        if stopProgramCount == 1:
            writerr(colored(getSPACER(">>> Please be patient... Trying to save data and end gracefully!"),'red'))
        elif stopProgramCount == 2:
            writerr(colored(getSPACER(">>> SERIOUSLY... YOU DON'T WANT YOUR DATA SAVED?!"), 'red'))
        elif stopProgramCount == 3:
            writerr(colored(getSPACER(r">>> Patience isn't your strong suit eh? ¯\_(ツ)_/¯"), 'red'))
            sys.exit()
    else:
        stopProgram = StopProgram.SIGINT
        stopSource = True
        writerr(colored(getSPACER('>>> "Oh my God, they killed Kenny... and waymore!" - Kyle'), "red"))
        writerr(colored(getSPACER('>>> Attempting to rescue any data gathered so far...'), "red"))

def showOptions():
|
|
285
|
+
"""
|
|
286
|
+
Show the chosen options and config settings
|
|
287
|
+
"""
|
|
288
|
+
global inputIsDomainANDPath, argsInput, isInputFile
|
|
289
|
+
|
|
290
|
+
try:
|
|
291
|
+
write(colored('Selected config and settings:', 'cyan'))
|
|
292
|
+
|
|
293
|
+
if isInputFile:
|
|
294
|
+
inputArgDesc = '-i <FILE: current line>: '
|
|
295
|
+
else:
|
|
296
|
+
inputArgDesc = '-i: '
|
|
297
|
+
if inputIsDomainANDPath:
|
|
298
|
+
write(colored(inputArgDesc + argsInput, 'magenta')+colored(' The target URL to search for.','white'))
|
|
299
|
+
else: # input is a domain
|
|
300
|
+
write(colored(inputArgDesc + argsInput, 'magenta')+colored(' The target domain to search for.','white'))
|
|
301
|
+
|
|
302
|
+
if args.mode == 'U':
|
|
303
|
+
write(colored('-mode: ' + args.mode, 'magenta')+colored(' Only URLs will be retrieved for the input.','white'))
|
|
304
|
+
elif args.mode == 'R':
|
|
305
|
+
write(colored('-mode: ' + args.mode, 'magenta')+colored(' Only Responses will be downloaded for the input.','white'))
|
|
306
|
+
elif args.mode == 'B':
|
|
307
|
+
write(colored('-mode: ' + args.mode, 'magenta')+colored(' URLs will be retrieved AND Responses will be downloaded for the input.','white'))
|
|
308
|
+
|
|
309
|
+
if args.config is not None:
|
|
310
|
+
write(colored('-c: ' + args.config, 'magenta')+colored(' The path of the YML config file.','white'))
|
|
311
|
+
|
|
312
|
+
if args.no_subs:
|
|
313
|
+
write(colored('-n: ' +str(args.no_subs), 'magenta')+colored(' Sub domains are excluded in the search.','white'))
|
|
314
|
+
else:
|
|
315
|
+
write(colored('-n: ' +str(args.no_subs), 'magenta')+colored(' Sub domains are included in the search.','white'))
|
|
316
|
+
|
|
317
|
+
write(colored('-xwm: ' +str(args.xwm), 'magenta')+colored(' Whether to exclude checks for links from Wayback Machine (archive.org)','white'))
|
|
318
|
+
write(colored('-xcc: ' +str(args.xcc), 'magenta')+colored(' Whether to exclude checks for links from commoncrawl.org','white'))
|
|
319
|
+
if not args.xcc:
|
|
320
|
+
if args.lcc ==0 and args.lcy == 0:
|
|
321
|
+
write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' Search ALL Common Crawl index collections.','white'))
|
|
322
|
+
else:
|
|
323
|
+
if args.lcy == 0:
|
|
324
|
+
write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' The number of latest Common Crawl index collections to be searched.','white'))
|
|
325
|
+
else:
|
|
326
|
+
if args.lcc != 0:
|
|
327
|
+
write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' The number of latest Common Crawl index collections to be searched.','white'))
|
|
328
|
+
write(colored('-lcy: ' +str(args.lcy), 'magenta')+colored(' Search all Common Crawl index collections with data from year '+str(args.lcy)+' and after.','white'))
|
|
329
|
+
write(colored('-xav: ' +str(args.xav), 'magenta')+colored(' Whether to exclude checks for links from alienvault.com','white'))
|
|
330
|
+
write(colored('-xus: ' +str(args.xus), 'magenta')+colored(' Whether to exclude checks for links from urlscan.io','white'))
|
|
331
|
+
if URLSCAN_API_KEY == '':
|
|
332
|
+
write(colored('URLScan API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://urlscan.io/user/signup which will let you get more back, and quicker.','white'))
|
|
333
|
+
else:
|
|
334
|
+
write(colored('URLScan API Key: ', 'magenta')+colored(URLSCAN_API_KEY))
|
|
335
|
+
write(colored('-xvt: ' +str(args.xvt), 'magenta')+colored(' Whether to exclude checks for links from virustotal.com','white'))
|
|
336
|
+
if VIRUSTOTAL_API_KEY == '':
|
|
337
|
+
write(colored('VirusTotal API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://www.virustotal.com/gui/join-us which will let you get some extra URLs.','white'))
|
|
338
|
+
else:
|
|
339
|
+
write(colored('VirusTotal API Key: ', 'magenta')+colored(VIRUSTOTAL_API_KEY))
|
|
340
|
+
|
|
341
|
+
if args.mode in ['U','B']:
|
|
342
|
+
if args.output_urls != '':
|
|
343
|
+
write(colored('-oU: ' +str(args.output_urls), 'magenta')+colored(' The name of the output file for URL links.','white'))
|
|
344
|
+
write(colored('-ow: ' +str(args.output_overwrite), 'magenta')+colored(' Whether the URL output file will be overwritten if it already exists. If False (default), it will be appended to, and duplicates removed.','white'))
|
|
345
|
+
write(colored('-nlf: ' +str(args.new_links_file), 'magenta')+colored(' Whether the URL output file ".new" version will also be written. It will include only new links found for the same target on subsequent runs. This can be used for continuous monitoring of a target.','white'))
|
|
346
|
+
|
|
347
|
+
if args.mode in ['R','B']:
|
|
348
|
+
if args.output_responses != '':
|
|
349
|
+
write(colored('-oR: ' +str(args.output_responses), 'magenta')+colored(' The directory to store archived responses and index file.','white'))
|
|
350
|
+
if args.limit == 0:
|
|
351
|
+
write(colored('-l: ' +str(args.limit), 'magenta')+colored(' Save ALL responses found.','white'))
|
|
352
|
+
else:
|
|
353
|
+
if args.limit > 0:
|
|
354
|
+
write(colored('-l: ' +str(args.limit), 'magenta')+colored(' Only save the FIRST ' + str(args.limit) + ' responses found.','white'))
|
|
355
|
+
else:
|
|
356
|
+
write(colored('-l: ' +str(args.limit), 'magenta')+colored(' Only save the LAST ' + str(abs(args.limit)) + ' responses found.','white'))
|
|
357
|
+
|
|
358
|
+
if args.from_date is not None:
|
|
359
|
+
write(colored('-from: ' +str(args.from_date), 'magenta')+colored(' The date/time to get responses from.','white'))
|
|
360
|
+
if args.to_date is not None:
|
|
361
|
+
write(colored('-to: ' +str(args.to_date), 'magenta')+colored(' The date/time to get responses up to.','white'))
|
|
362
|
+
|
|
363
|
+
if args.capture_interval == 'h':
|
|
364
|
+
write(colored('-ci: ' +args.capture_interval, 'magenta')+colored(' Get at most 1 archived response per hour from Wayback Machine (archive.org)','white'))
|
|
365
|
+
elif args.capture_interval == 'd':
|
|
366
|
+
write(colored('-ci: ' +args.capture_interval, 'magenta')+colored(' Get at most 1 archived response per day from Wayback Machine (archive.org)','white'))
|
|
367
|
+
elif args.capture_interval == 'm':
|
|
368
|
+
write(colored('-ci: ' +args.capture_interval, 'magenta')+colored(' Get at most 1 archived response per month from Wayback Machine (archive.org)','white'))
|
|
369
|
+
elif args.capture_interval == 'none':
|
|
370
|
+
write(colored('-ci: ' +args.capture_interval, 'magenta')+colored(' There will not be any filtering based on the capture interval.','white'))
|
|
371
|
+
|
|
372
|
+
if args.url_filename:
|
|
373
|
+
write(colored('-url-filename: ' +str(args.url_filename), 'magenta')+colored(' The filenames of downloaded responses will be set to the URL rather than the hash value of the response.','white'))
|
|
374
|
+
|
|
375
|
+
write(colored('-oijs: '+str(args.output_inline_js), 'magenta')+colored(' Whether the combined JS of all responses will be written to one or more files.','white'))
|
|
376
|
+
|
|
377
|
+
write(colored('-f: ' +str(args.filter_responses_only), 'magenta')+colored(' If True, the initial links from wayback machine will not be filtered, only the responses that are downloaded will be filtered. It may be useful to still see all available paths even if you don\'t want to check the file for content.','white'))
|
|
378
|
+
if args.keywords_only is not None and args.keywords_only != '#CONFIG':
|
|
379
|
+
write(colored('-ko: ' +str(args.keywords_only), 'magenta')+colored(' Only get results that match the given Regex.','white'))
|
|
380
|
+
|
|
381
|
+
write(colored('-lr: ' +str(args.limit_requests), 'magenta')+colored(' The limit of requests made per source when getting links. A value of 0 (Zero) means no limit is applied.','white'))
|
|
382
|
+
if args.mc:
|
|
383
|
+
write(colored('-mc: ' +str(args.mc), 'magenta')+colored(' Only retrieve URLs and Responses that match these HTTP Status codes.','white'))
|
|
384
|
+
else:
|
|
385
|
+
if args.fc:
|
|
386
|
+
write(colored('-fc: ' +str(args.fc), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these HTTP Status codes.','white'))
|
|
387
|
+
write(colored('MIME Type exclusions: ', 'magenta')+colored(FILTER_MIME))
|
|
388
|
+
if not args.mc and args.fc:
|
|
389
|
+
write(colored('Response Code exclusions: ', 'magenta')+colored(FILTER_CODE))
|
|
390
|
+
write(colored('Response URL exclusions: ', 'magenta')+colored(FILTER_URL))
|
|
391
|
+
if args.keywords_only and args.keywords_only == '#CONFIG':
|
|
392
|
+
if FILTER_KEYWORDS == '':
|
|
393
|
+
write(colored('Keywords only: ', 'magenta')+colored('It looks like no keywords have been set in config.yml file.','red'))
|
|
394
|
+
else:
|
|
395
|
+
write(colored('Keywords only: ', 'magenta')+colored(FILTER_KEYWORDS))
|
|
396
|
+
|
|
397
|
+
if args.notify_discord:
|
|
398
|
+
if WEBHOOK_DISCORD == '' or WEBHOOK_DISCORD == 'YOUR_WEBHOOK':
|
|
399
|
+
write(colored('Discord Webhook: ', 'magenta')+colored('It looks like no Discord webhook has been set in config.yml file.','red'))
|
|
400
|
+
else:
|
|
401
|
+
write(colored('Discord Webhook: ', 'magenta')+colored(WEBHOOK_DISCORD))
|
|
402
|
+
|
|
403
|
+
write(colored('Default Output Directory: ', 'magenta')+colored(str(DEFAULT_OUTPUT_DIR)))
|
|
404
|
+
|
|
405
|
+
if args.regex_after is not None:
|
|
406
|
+
write(colored('-ra: ' + args.regex_after, 'magenta')+colored(' RegEx for filtering purposes against found links from all sources of URLs AND responses downloaded. Only positive matches will be output.','white'))
|
|
407
|
+
if args.mode in ['R','B']:
|
|
408
|
+
write(colored('-t: ' + str(args.timeout), 'magenta')+colored(' The number of seconds to wait for an archived response.','white'))
|
|
409
|
+
if args.mode in ['R','B'] or (args.mode == 'U' and not args.xcc):
|
|
410
|
+
write(colored('-p: ' + str(args.processes), 'magenta')+colored(' The number of parallel requests made.','white'))
|
|
411
|
+
write(colored('-r: ' + str(args.retries), 'magenta')+colored(' The number of retries for requests that get connection error or rate limited.','white'))
|
|
412
|
+
|
|
413
|
+
if not args.xwm:
|
|
414
|
+
write(colored('-wrlr: ' + str(args.wayback_rate_limit_retry), 'magenta')+colored(' The number of minutes to wait for a rate limit pause on Wayback Machine (archive.org) instead of stopping with a 429 error.','white'))
|
|
415
|
+
if not args.xus:
|
|
416
|
+
write(colored('-urlr: ' + str(args.urlscan_rate_limit_retry), 'magenta')+colored(' The number of minutes to wait for a rate limit pause on URLScan.io instead of stopping with a 429 error.','white'))
|
|
417
|
+
|
|
418
|
+
write()
|
|
419
|
+
|
|
420
|
+
except Exception as e:
|
|
421
|
+
writerr(colored('ERROR showOptions: ' + str(e), 'red'))
|
|
422
|
+
|
|
423
|
+
def getConfig():
|
|
424
|
+
"""
|
|
425
|
+
Try to get the values from the config file, otherwise use the defaults
|
|
426
|
+
"""
|
|
427
|
+
global FILTER_CODE, FILTER_MIME, FILTER_URL, FILTER_KEYWORDS, URLSCAN_API_KEY, VIRUSTOTAL_API_KEY, CONTINUE_RESPONSES_IF_PIPED, subs, path, waymorePath, inputIsDomainANDPath, HTTP_ADAPTER, HTTP_ADAPTER_CC, argsInput, terminalWidth, MATCH_CODE, WEBHOOK_DISCORD, DEFAULT_OUTPUT_DIR
|
|
428
|
+
try:
|
|
429
|
+
|
|
430
|
+
# Set terminal width
|
|
431
|
+
try:
|
|
432
|
+
terminalWidth = os.get_terminal_size().columns
|
|
433
|
+
except:
|
|
434
|
+
terminalWidth = 135
|
|
435
|
+
|
|
436
|
+
# If the input doesn't have a / then assume it is a domain rather than a domain AND path
|
|
437
|
+
if str(argsInput).find('/') < 0:
|
|
438
|
+
path = '/*'
|
|
439
|
+
inputIsDomainANDPath = False
|
|
440
|
+
else:
|
|
441
|
+
# If there is only one / and is the last character, remove it
|
|
442
|
+
if str(argsInput).count('/') == 1 and str(argsInput)[-1:] == '/':
|
|
443
|
+
argsInput = argsInput.replace('/','')
|
|
444
|
+
path = '/*'
|
|
445
|
+
inputIsDomainANDPath = False
|
|
446
|
+
else:
|
|
447
|
+
path = '*'
|
|
448
|
+
inputIsDomainANDPath = True
|
|
449
|
+
|
|
450
|
+
# If the -no-subs argument was passed, don't include subs
|
|
451
|
+
# Also, if a path is passed, the subs will not be used
|
|
452
|
+
if args.no_subs or inputIsDomainANDPath:
|
|
453
|
+
subs = ''
|
|
454
|
+
|
|
455
|
+
# Set up an HTTPAdaptor for retry strategy when making requests
|
|
456
|
+
try:
|
|
457
|
+
retry= Retry(
|
|
458
|
+
total=args.retries,
|
|
459
|
+
backoff_factor=1.1,
|
|
460
|
+
status_forcelist=[429, 500, 502, 503, 504],
|
|
461
|
+
raise_on_status=False,
|
|
462
|
+
respect_retry_after_header=False
|
|
463
|
+
)
|
|
464
|
+
HTTP_ADAPTER = HTTPAdapter(max_retries=retry)
|
|
465
|
+
except Exception as e:
|
|
466
|
+
writerr(colored('ERROR getConfig 2: ' + str(e), 'red'))
|
|
467
|
+
|
|
468
|
+
# Set up an HTTPAdaptor for retry strategy for Common Crawl when making requests
|
|
469
|
+
try:
|
|
470
|
+
retry= Retry(
|
|
471
|
+
total=args.retries+20,
|
|
472
|
+
backoff_factor=1.1,
|
|
473
|
+
status_forcelist=[503],
|
|
474
|
+
raise_on_status=False,
|
|
475
|
+
respect_retry_after_header=False
|
|
476
|
+
)
|
|
477
|
+
HTTP_ADAPTER_CC = HTTPAdapter(max_retries=retry)
|
|
478
|
+
except Exception as e:
|
|
479
|
+
writerr(colored('ERROR getConfig 3: ' + str(e), 'red'))
|
|
480
|
+
|
|
481
|
+
# Try to get the config file values
|
|
482
|
+
useDefaults = False
|
|
483
|
+
try:
|
|
484
|
+
# Get the path of the config file. If -c / --config argument is not passed, then it defaults to config.yml in the same directory as the run file
|
|
485
|
+
waymorePath = (
|
|
486
|
+
Path(os.path.join(os.getenv('APPDATA', ''), 'waymore')) if os.name == 'nt'
|
|
487
|
+
else Path(os.path.join(os.path.expanduser("~"), ".config", "waymore")) if os.name == 'posix'
|
|
488
|
+
else Path(os.path.join(os.path.expanduser("~"), "Library", "Application Support", "waymore")) if os.name == 'darwin'
|
|
489
|
+
else None
|
|
490
|
+
)
|
|
491
|
+
waymorePath.absolute
|
|
492
|
+
if args.config is None:
|
|
493
|
+
if waymorePath == '':
|
|
494
|
+
configPath = 'config.yml'
|
|
495
|
+
else:
|
|
496
|
+
configPath = Path(waymorePath / 'config.yml')
|
|
497
|
+
else:
|
|
498
|
+
configPath = Path(args.config)
|
|
499
|
+
config = yaml.safe_load(open(configPath))
|
|
500
|
+
try:
|
|
501
|
+
FILTER_URL = config.get('FILTER_URL')
|
|
502
|
+
if str(FILTER_URL) == 'None':
|
|
503
|
+
writerr(colored('No value for "FILTER_URL" in config.yml - default set', 'yellow'))
|
|
504
|
+
FILTER_URL = ''
|
|
505
|
+
except Exception as e:
|
|
506
|
+
writerr(colored('Unable to read "FILTER_URL" from config.yml - default set', 'red'))
|
|
507
|
+
FILTER_URL = DEFAULT_FILTER_URL
|
|
508
|
+
|
|
509
|
+
try:
|
|
510
|
+
FILTER_MIME = config.get('FILTER_MIME')
|
|
511
|
+
if str(FILTER_MIME) == 'None':
|
|
512
|
+
writerr(colored('No value for "FILTER_MIME" in config.yml - default set', 'yellow'))
|
|
513
|
+
FILTER_MIME = ''
|
|
514
|
+
except Exception as e:
|
|
515
|
+
writerr(colored('Unable to read "FILTER_MIME" from config.yml - default set', 'red'))
|
|
516
|
+
FILTER_MIME = DEFAULT_FILTER_MIME
|
|
517
|
+
|
|
518
|
+
# If the argument -fc was passed, don't try to get from the config
|
|
519
|
+
if args.fc:
|
|
520
|
+
FILTER_CODE = args.fc
|
|
521
|
+
else:
|
|
522
|
+
try:
|
|
523
|
+
FILTER_CODE = str(config.get('FILTER_CODE'))
|
|
524
|
+
if str(FILTER_CODE) == 'None':
|
|
525
|
+
writerr(colored('No value for "FILTER_CODE" in config.yml - default set', 'yellow'))
|
|
526
|
+
FILTER_CODE = ''
|
|
527
|
+
except Exception as e:
|
|
528
|
+
writerr(colored('Unable to read "FILTER_CODE" from config.yml - default set', 'red'))
|
|
529
|
+
FILTER_CODE = DEFAULT_FILTER_CODE
|
|
530
|
+
|
|
531
|
+
# Set the match codes if they were passed
|
|
532
|
+
if args.mc:
|
|
533
|
+
MATCH_CODE = args.mc
|
|
534
|
+
|
|
535
|
+
try:
|
|
536
|
+
URLSCAN_API_KEY = config.get('URLSCAN_API_KEY')
|
|
537
|
+
if str(URLSCAN_API_KEY) == 'None':
|
|
538
|
+
if not args.xus:
|
|
539
|
+
writerr(colored('No value for "URLSCAN_API_KEY" in config.yml - consider adding (you can get a FREE api key at urlscan.io)', 'yellow'))
|
|
540
|
+
URLSCAN_API_KEY = ''
|
|
541
|
+
except Exception as e:
|
|
542
|
+
writerr(colored('Unable to read "URLSCAN_API_KEY" from config.yml - consider adding (you can get a FREE api key at urlscan.io)', 'red'))
|
|
543
|
+
URLSCAN_API_KEY = ''
|
|
544
|
+
|
|
545
|
+
try:
|
|
546
|
+
VIRUSTOTAL_API_KEY = config.get('VIRUSTOTAL_API_KEY')
|
|
547
|
+
if str(VIRUSTOTAL_API_KEY) == 'None':
|
|
548
|
+
if not args.xvt:
|
|
549
|
+
writerr(colored('No value for "VIRUSTOTAL_API_KEY" in config.yml - consider adding (you can get a FREE api key at virustotal.com)', 'yellow'))
|
|
550
|
+
VIRUSTOTAL_API_KEY = ''
|
|
551
|
+
except Exception as e:
|
|
552
|
+
writerr(colored('Unable to read "VIRUSTOTAL_API_KEY" from config.yml - consider adding (you can get a FREE api key at virustotal.com)', 'red'))
|
|
553
|
+
VIRUSTOTAL_API_KEY = ''
|
|
554
|
+
|
|
555
|
+
try:
|
|
556
|
+
FILTER_KEYWORDS = config.get('FILTER_KEYWORDS')
|
|
557
|
+
if str(FILTER_KEYWORDS) == 'None':
|
|
558
|
+
writerr(colored('No value for "FILTER_KEYWORDS" in config.yml - default set', 'yellow'))
|
|
559
|
+
FILTER_KEYWORDS = ''
|
|
560
|
+
except Exception as e:
|
|
561
|
+
writerr(colored('Unable to read "FILTER_KEYWORDS" from config.yml - default set', 'red'))
|
|
562
|
+
FILTER_KEYWORDS = ''
|
|
563
|
+
|
|
564
|
+
try:
|
|
565
|
+
CONTINUE_RESPONSES_IF_PIPED = config.get('CONTINUE_RESPONSES_IF_PIPED')
|
|
566
|
+
if str(CONTINUE_RESPONSES_IF_PIPED) == 'None':
|
|
567
|
+
writerr(colored('No value for "CONTINUE_RESPONSES_IF_PIPED" in config.yml - default set', 'yellow'))
|
|
568
|
+
CONTINUE_RESPONSES_IF_PIPED = True
|
|
569
|
+
except Exception as e:
|
|
570
|
+
writerr(colored('Unable to read "CONTINUE_RESPONSES_IF_PIPED" from config.yml - default set', 'red'))
|
|
571
|
+
CONTINUE_RESPONSES_IF_PIPED = True
|
|
572
|
+
|
|
573
|
+
if args.notify_discord:
|
|
574
|
+
try:
|
|
575
|
+
WEBHOOK_DISCORD = config.get('WEBHOOK_DISCORD')
|
|
576
|
+
if str(WEBHOOK_DISCORD) == 'None' or str(WEBHOOK_DISCORD) == 'YOUR_WEBHOOK':
|
|
577
|
+
writerr(colored('No value for "WEBHOOK_DISCORD" in config.yml - default set', 'yellow'))
|
|
578
|
+
WEBHOOK_DISCORD = ''
|
|
579
|
+
except Exception as e:
|
|
580
|
+
writerr(colored('Unable to read "WEBHOOK_DISCORD" from config.yml - default set', 'red'))
|
|
581
|
+
WEBHOOK_DISCORD = ''
|
|
582
|
+
|
|
583
|
+
try:
|
|
584
|
+
DEFAULT_OUTPUT_DIR = config.get('DEFAULT_OUTPUT_DIR')
|
|
585
|
+
if str(DEFAULT_OUTPUT_DIR) == 'None' or str(DEFAULT_OUTPUT_DIR) == '':
|
|
586
|
+
DEFAULT_OUTPUT_DIR = os.path.expanduser(str(waymorePath))
|
|
587
|
+
else:
|
|
588
|
+
# Test if DEFAULT_OUTPUT_DIR is a valid directory
|
|
589
|
+
if not os.path.isdir(DEFAULT_OUTPUT_DIR):
|
|
590
|
+
writerr(colored('The "DEFAULT_OUTPUT_DIR" of "'+str(DEFAULT_OUTPUT_DIR)+'" is not a valid directory. Using "'+str(waymorePath)+'" instead.', 'yellow'))
|
|
591
|
+
DEFAULT_OUTPUT_DIR = os.path.expanduser(str(waymorePath))
|
|
592
|
+
else:
|
|
593
|
+
DEFAULT_OUTPUT_DIR = os.path.expanduser(DEFAULT_OUTPUT_DIR)
|
|
594
|
+
except Exception as e:
|
|
595
|
+
writerr(colored('Unable to read "DEFAULT_OUTPUT_DIR" from config.yml - default set', 'red'))
|
|
596
|
+
DEFAULT_OUTPUT_DIR = waymorePath
|
|
597
|
+
|
|
598
|
+
except yaml.YAMLError as e: # A scan error occurred reading the file
|
|
599
|
+
useDefaults = True
|
|
600
|
+
if args.config is None:
|
|
601
|
+
writerr(colored('WARNING: There seems to be a formatting error in "config.yml", so using default values', 'yellow'))
|
|
602
|
+
else:
|
|
603
|
+
writerr(colored('WARNING: There seems to be a formatting error in "' + args.config + '", so using default values', 'yellow'))
|
|
604
|
+
|
|
605
|
+
except FileNotFoundError as e: # The config file wasn't found
|
|
606
|
+
useDefaults = True
|
|
607
|
+
if args.config is None:
|
|
608
|
+
writerr(colored('WARNING: Cannot find file "config.yml", so using default values', 'yellow'))
|
|
609
|
+
else:
|
|
610
|
+
writerr(colored('WARNING: Cannot find file "' + args.config + '", so using default values', 'yellow'))
|
|
611
|
+
|
|
612
|
+
except Exception as e: # Another error occurred
|
|
613
|
+
useDefaults = True
|
|
614
|
+
if args.config is None:
|
|
615
|
+
writerr(colored('WARNING: Cannot read file "config.yml", so using default values. The following error occurred: ' + str(e), 'yellow'))
|
|
616
|
+
else:
|
|
617
|
+
writerr(colored('WARNING: Cannot read file "' + args.config + '", so using default values. The following error occurred: ' + str(e), 'yellow'))
|
|
618
|
+
|
|
619
|
+
# Use defaults if required
|
|
620
|
+
if useDefaults:
|
|
621
|
+
FILTER_URL = DEFAULT_FILTER_URL
|
|
622
|
+
FILTER_MIME = DEFAULT_FILTER_MIME
|
|
623
|
+
FILTER_CODE = DEFAULT_FILTER_CODE
|
|
624
|
+
URLSCAN_API_KEY = ''
|
|
625
|
+
VIRUSTOTAL_API_KEY = ''
|
|
626
|
+
FILTER_KEYWORDS = ''
|
|
627
|
+
CONTINUE_RESPONSES_IF_PIPED = True
|
|
628
|
+
WEBHOOK_DISCORD = ''
|
|
629
|
+
DEFAULT_OUTPUT_DIR = os.path.expanduser('~/.config/waymore')
|
|
630
|
+
outputInlineJSDir = DEFAULT_OUTPUT_DIR
|
|
631
|
+
|
|
632
|
+
except Exception as e:
|
|
633
|
+
writerr(colored('ERROR getConfig 1: ' + str(e), 'red'))

# Print iterations progress - copied from https://stackoverflow.com/questions/3173320/text-progress-bar-in-terminal-with-block-characters?noredirect=1&lq=1
def printProgressBar(
    iteration,
    total,
    prefix="",
    suffix="",
    decimals=1,
    length=100,
    fill="█",
    printEnd="\r",
):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    try:
        percent = ("{0:." + str(decimals) + "f}").format(
            100 * (iteration / float(total))
        ).rjust(5)
        filledLength = int(length * iteration // total)
        bar = fill * filledLength + "-" * (length - filledLength)
        # If the program is not piped with something else, write to stdout, otherwise write to stderr
        if sys.stdout.isatty():
            write(colored(f"\r{prefix} |{bar}| {percent}% {suffix}\r", "green"))
        else:
            writerr(colored(f"\r{prefix} |{bar}| {percent}% {suffix}\r", "green"))
        # Print New Line on Complete
        if iteration == total:
            # If the program is not piped with something else, write to stdout, otherwise write to stderr
            if sys.stdout.isatty():
                write()
            else:
                writerr()
    except Exception as e:
        if verbose():
            writerr(colored("ERROR printProgressBar: " + str(e), "red"))

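# Typical call, mirroring how it is used later when downloading archived responses, e.g.:
#   printProgressBar(successCount + failureCount, totalResponses,
#                    prefix="Downloading " + str(totalResponses) + " responses:",
#                    suffix=suffix, length=getProgressBarLength(), fill=fillChar)
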
def filehash(text):
    """
    Generate a hash value for the passed string. This is used for the file name of a downloaded archived response
    """
    hash=0
    for ch in text:
        hash = (hash*281 ^ ord(ch)*997) & 0xFFFFFFFFFFF
    return str(hash)

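# Example usage (as done later when saving a downloaded response): the returned decimal string
# becomes the base of the saved file name, e.g. fileName = filehash(archiveHtml) + '.' + extension
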
class WayBackException(Exception):
    """
    A custom exception to raise if archive.org responds with specific text in the response that indicates there is a problem on their side
    """
    def __init__(self):
        message = f"WayBackException"
        super().__init__(message)

def fixArchiveOrgUrl(url):
    """
    Sometimes archive.org returns a URL that has %0A at the end followed by other characters. If you try to reach the archive URL with that it will fail, but remove from the %0A (newline) onwards and it succeeds, so it doesn't seem intentionally included. In this case, strip anything from %0A onwards from the URL
    """
    newline = url.find('%0A')
    if newline > 0:
        url = url[0:newline]
    else:
        newline = url.find('%0a')
        if newline > 0:
            url = url[0:newline]
    return url

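# For example, fixArchiveOrgUrl('https://example.com/page%0A%20rest') returns 'https://example.com/page';
# a URL without an encoded newline is returned unchanged.
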
# Add a link to the linksFound collection
def linksFoundAdd(link):
    global linksFound, argsInput, argsInputHostname
    # If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
    try:
        if inputIsDomainANDPath:
            checkInput = argsInput
        else:
            checkInput = argsInputHostname
        # Don't write it if the link does not contain the requested domain (this can sometimes happen)
        if link.find(checkInput) >= 0:
            parsed = urlparse(link.strip())
            if parsed.netloc.find(':80') >= 0 or parsed.netloc.find(':443') >= 0:
                newNetloc = parsed.netloc.split(':')[0]
                parsed = parsed._replace(netloc=newNetloc).geturl()
                linksFound.add(parsed)
            else:
                linksFound.add(link)
    except:
        linksFound.add(link)

def processArchiveUrl(url):
|
|
730
|
+
"""
|
|
731
|
+
Get the passed web archive response
|
|
732
|
+
"""
|
|
733
|
+
global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, continueRespFile, REGEX_404
|
|
734
|
+
try:
|
|
735
|
+
if stopProgram is None:
|
|
736
|
+
|
|
737
|
+
archiveUrl = 'https://web.archive.org/web/' + fixArchiveOrgUrl(url)
|
|
738
|
+
hashValue = ''
|
|
739
|
+
|
|
740
|
+
# Get memory usage every 100 responses
|
|
741
|
+
if (successCount + failureCount) % 100 == 0:
|
|
742
|
+
try:
|
|
743
|
+
getMemory()
|
|
744
|
+
except:
|
|
745
|
+
pass
|
|
746
|
+
|
|
747
|
+
# Make a request to the web archive
|
|
748
|
+
try:
|
|
749
|
+
try:
|
|
750
|
+
# Choose a random user agent string to use for any requests
|
|
751
|
+
userAgent = random.choice(USER_AGENT)
|
|
752
|
+
|
|
753
|
+
session = requests.Session()
|
|
754
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
755
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
756
|
+
resp = session.get(url = archiveUrl, headers={"User-Agent":userAgent}, allow_redirects = True)
|
|
757
|
+
archiveHtml = str(resp.text)
|
|
758
|
+
try:
|
|
759
|
+
contentType = resp.headers.get("Content-Type").split(';')[0].lower()
|
|
760
|
+
except:
|
|
761
|
+
contentType = ''
|
|
762
|
+
|
|
763
|
+
# Only create a file if there is a response
|
|
764
|
+
if len(archiveHtml) != 0:
|
|
765
|
+
|
|
766
|
+
# If the FILTER_CODE includes 404, only process it if it doesn't seem to be a custom 404 page
|
|
767
|
+
if '404' in FILTER_CODE and not re.findall(REGEX_404, archiveHtml, re.DOTALL|re.IGNORECASE):
|
|
768
|
+
|
|
769
|
+
# Add the URL as a comment at the start of the response
|
|
770
|
+
if args.url_filename:
|
|
771
|
+
archiveHtml = '/* Original URL: ' + archiveUrl + ' */\n' + archiveHtml
|
|
772
|
+
|
|
773
|
+
# Remove all web archive references in the response
|
|
774
|
+
archiveHtml = re.sub(r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
|
|
775
|
+
archiveHtml = re.sub(r'\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
|
|
776
|
+
archiveHtml = re.sub(r'\<script\>window\.RufflePlayer[^\<]*\<\/script\>','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
|
|
777
|
+
archiveHtml = re.sub(r'\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
|
|
778
|
+
archiveHtml = re.sub(r'(}\n)?(\/\*|<!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|-->)','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
|
|
779
|
+
archiveHtml = re.sub(r'var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
|
|
780
|
+
archiveHtml = re.sub(r'(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
|
|
781
|
+
archiveHtml = re.sub(r'((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/','',archiveHtml,flags=re.IGNORECASE)
|
|
782
|
+
archiveHtml = re.sub(r'((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/','',archiveHtml,flags=re.IGNORECASE)
|
|
783
|
+
archiveHtml = re.sub(r'((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F','',archiveHtml,flags=re.IGNORECASE)
|
|
784
|
+
archiveHtml = re.sub(r'((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F','',archiveHtml,flags=re.IGNORECASE)
|
|
785
|
+
archiveHtml = re.sub(r'\<script type=\"text\/javascript\">\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>','',archiveHtml,flags=re.IGNORECASE)
|
|
786
|
+
archiveHtml = re.sub(r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>','',archiveHtml,flags=re.IGNORECASE)
|
|
787
|
+
archiveHtml = re.sub(r'\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>','',archiveHtml,flags=re.IGNORECASE)
|
|
788
|
+
archiveHtml = re.sub(r'\<\!-- End Wayback Rewrite JS Include --\>','',archiveHtml,re.IGNORECASE)
|
|
789
|
+
|
|
790
|
+
# If there is a specific Wayback error in the response, raise an exception
|
|
791
|
+
if archiveHtml.lower().find('wayback machine has not archived that url') > 0 or archiveHtml.lower().find('snapshot cannot be displayed due to an internal error') > 0:
|
|
792
|
+
raise WayBackException
|
|
793
|
+
|
|
794
|
+
# Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
|
|
795
|
+
if args.url_filename:
|
|
796
|
+
fileName = url.replace('/','-').replace(':','')
|
|
797
|
+
fileName = fileName[0:254]
|
|
798
|
+
else:
|
|
799
|
+
hashValue = filehash(archiveHtml)
|
|
800
|
+
fileName = hashValue
|
|
801
|
+
|
|
802
|
+
# Determine extension of file from the content-type using the mimetypes library
|
|
803
|
+
extension = ''
|
|
804
|
+
try:
|
|
805
|
+
# Get path extension
|
|
806
|
+
targetUrl = 'https://' + url.split("://")[1]
|
|
807
|
+
parsed = urlparse(targetUrl.strip())
|
|
808
|
+
path = parsed.path
|
|
809
|
+
extension = path[path.rindex('.')+1:]
|
|
810
|
+
except:
|
|
811
|
+
pass
|
|
812
|
+
|
|
813
|
+
# If the extension is blank, numeric, longer than 4 characters or not alphanumeric - then it's not a valid file type so check contentType
|
|
814
|
+
if extension == '' or extension.isnumeric() or not extension.isalnum() or len(extension) > 4:
|
|
815
|
+
# Determine the extension from the content type
|
|
816
|
+
try:
|
|
817
|
+
if contentType != '':
|
|
818
|
+
extension = contentType.split('/')[1].replace('x-','')
|
|
819
|
+
if extension == '':
|
|
820
|
+
extension = contentType.lower()
|
|
821
|
+
except:
|
|
822
|
+
pass
|
|
823
|
+
if 'html' in extension:
|
|
824
|
+
extension = 'html'
|
|
825
|
+
elif 'javascript' in extension:
|
|
826
|
+
extension = 'js'
|
|
827
|
+
elif 'json' in extension:
|
|
828
|
+
extension = 'json'
|
|
829
|
+
elif 'css' in extension:
|
|
830
|
+
extension = 'css'
|
|
831
|
+
elif 'pdf' in extension:
|
|
832
|
+
extension = 'pdf'
|
|
833
|
+
elif 'plain' == extension:
|
|
834
|
+
extension = 'txt'
|
|
835
|
+
|
|
836
|
+
# If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
|
|
837
|
+
if extension == '':
|
|
838
|
+
if archiveHtml.lower().strip().endswith('</html>') or archiveHtml.lower().strip().startswith('<!doctype html') or archiveHtml.lower().strip().startswith('<html'):
|
|
839
|
+
extension = 'html'
|
|
840
|
+
else:
|
|
841
|
+
extension = 'unknown'
|
|
842
|
+
|
|
843
|
+
fileName = fileName + '.' + extension
|
|
844
|
+
|
|
845
|
+
# If -oR / --output-responses was passed then add the file to that directory,
|
|
846
|
+
# else add to the default "results/{target.domain}" directory in the same path as the .py file
|
|
847
|
+
if args.output_responses != '':
|
|
848
|
+
filePath = args.output_responses + '/' + f'{fileName}'
|
|
849
|
+
else:
|
|
850
|
+
filePath = (DEFAULT_OUTPUT_DIR + '/results/' + str(argsInput).replace('/','-') + '/' + f'{fileName}')
|
|
851
|
+
|
|
852
|
+
# Write the file
|
|
853
|
+
try:
|
|
854
|
+
responseFile = open(filePath, 'w', encoding='utf8')
|
|
855
|
+
responseFile.write(archiveHtml)
|
|
856
|
+
responseFile.close()
|
|
857
|
+
fileCount = fileCount + 1
|
|
858
|
+
except Exception as e:
|
|
859
|
+
writerr(colored(getSPACER('[ ERR ] Failed to write file ' + filePath + ': '+ str(e)), 'red'))
|
|
860
|
+
|
|
861
|
+
# Write the hash value and URL to the index file
|
|
862
|
+
if not args.url_filename:
|
|
863
|
+
try:
|
|
864
|
+
timestamp = str(datetime.now())
|
|
865
|
+
indexFile.write(hashValue+','+archiveUrl+' ,'+timestamp+'\n')
|
|
866
|
+
indexFile.flush()
|
|
867
|
+
except Exception as e:
|
|
868
|
+
writerr(colored(getSPACER('[ ERR ] Failed to write to index.txt for "' + archiveUrl + '": '+ str(e)), 'red'))
|
|
869
|
+
|
|
870
|
+
# FOR DEBUGGING PURPOSES
|
|
871
|
+
try:
|
|
872
|
+
if os.environ.get('USER') == 'xnl':
|
|
873
|
+
debugText = ''
|
|
874
|
+
if archiveHtml.lower().find('archive.org') > 0:
|
|
875
|
+
debugText = 'ARCHIVE.ORG'
|
|
876
|
+
elif archiveHtml.lower().find('internet archive') > 0:
|
|
877
|
+
debugText = 'INTERNET ARCHIVE'
|
|
878
|
+
elif archiveHtml.lower().find('wombat') > 0:
|
|
879
|
+
debugText = 'WOMBAT (JS)'
|
|
880
|
+
if debugText != '':
|
|
881
|
+
writerr(colored(getSPACER('"' + fileName + '" CONTAINS ' + debugText + ' - CHECK ITS A VALID REFERENCE'), 'yellow'))
|
|
882
|
+
except:
|
|
883
|
+
pass
|
|
884
|
+
|
|
885
|
+
successCount = successCount + 1
|
|
886
|
+
|
|
887
|
+
except WayBackException as wbe:
|
|
888
|
+
failureCount = failureCount + 1
|
|
889
|
+
if verbose():
|
|
890
|
+
writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) returned a problem for "' + archiveUrl + '"'), 'red'))
|
|
891
|
+
except ConnectionError as ce:
|
|
892
|
+
failureCount = failureCount + 1
|
|
893
|
+
if verbose():
|
|
894
|
+
writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error for "' + archiveUrl + '"'), 'red'))
|
|
895
|
+
except Exception as e:
|
|
896
|
+
failureCount = failureCount + 1
|
|
897
|
+
if verbose():
|
|
898
|
+
try:
|
|
899
|
+
writerr(colored(getSPACER('[ ' + str(resp.status_code) +' ] Failed to get response for "' + archiveUrl + '"'), 'red'))
|
|
900
|
+
except:
|
|
901
|
+
writerr(colored(getSPACER('[ ERR ] Failed to get response for "' + archiveUrl + '": '+ str(e)), 'red'))
|
|
902
|
+
|
|
903
|
+
# Show progress bar
|
|
904
|
+
fillTest = (successCount + failureCount) % 2
|
|
905
|
+
fillChar = "o"
|
|
906
|
+
if fillTest == 0:
|
|
907
|
+
fillChar = "O"
|
|
908
|
+
suffix="Complete "
|
|
909
|
+
# Show memory usage if -v option chosen, and check memory every 25 responses (or if its the last)
|
|
910
|
+
if (successCount + failureCount) % 25 == 1 or (successCount + failureCount) == totalResponses:
|
|
911
|
+
try:
|
|
912
|
+
getMemory()
|
|
913
|
+
if verbose():
|
|
914
|
+
suffix = (
|
|
915
|
+
"Complete (Mem Usage "
|
|
916
|
+
+ humanReadableSize(currentMemUsage)
|
|
917
|
+
+ ", Total Mem "
|
|
918
|
+
+ str(currentMemPercent)
|
|
919
|
+
+ "%) "
|
|
920
|
+
)
|
|
921
|
+
except:
|
|
922
|
+
if verbose():
|
|
923
|
+
suffix = 'Complete (To show mem use, run "pip install psutil")'
|
|
924
|
+
printProgressBar(
|
|
925
|
+
successCount + failureCount,
|
|
926
|
+
totalResponses,
|
|
927
|
+
prefix="Downloading " + str(totalResponses) + " responses:",
|
|
928
|
+
suffix=suffix,
|
|
929
|
+
length=getProgressBarLength(),
|
|
930
|
+
fill=fillChar
|
|
931
|
+
)
|
|
932
|
+
|
|
933
|
+
# Write the total count to the continueResp.tmp file
|
|
934
|
+
try:
|
|
935
|
+
continueRespFile.seek(0)
|
|
936
|
+
continueRespFile.write(str(successCount + failureCount)+'\n')
|
|
937
|
+
except Exception as e:
|
|
938
|
+
if verbose():
|
|
939
|
+
writerr(colored(getSPACER('ERROR processArchiveUrl 2: ' + str(e)), 'red'))
|
|
940
|
+
|
|
941
|
+
except Exception as e:
|
|
942
|
+
if verbose():
|
|
943
|
+
writerr(colored(getSPACER('Error for "'+url+'": ' + str(e)), 'red'))
|
|
944
|
+
|
|
945
|
+
except Exception as e:
|
|
946
|
+
writerr(colored('ERROR processArchiveUrl 1: ' + str(e), 'red'))
|
|
947
|
+
|
|
948
|
+
def processURLOutput():
|
|
949
|
+
"""
|
|
950
|
+
Show results of the URL output, i.e. getting URLs from archive.org and commoncrawl.org and write results to file
|
|
951
|
+
"""
|
|
952
|
+
global linksFound, subs, path, argsInput, checkWayback, checkCommonCrawl, checkAlienVault, checkURLScan, checkVirusTotal, DEFAULT_OUTPUT_DIR
|
|
953
|
+
|
|
954
|
+
try:
|
|
955
|
+
|
|
956
|
+
if args.check_only:
|
|
957
|
+
totalRequests = checkWayback + checkCommonCrawl + checkAlienVault + checkURLScan + checkVirusTotal
|
|
958
|
+
minutes = totalRequests*1 // 60
|
|
959
|
+
hours = minutes // 60
|
|
960
|
+
days = hours // 24
|
|
961
|
+
if minutes < 5:
|
|
962
|
+
write(colored('\n-> Getting URLs (e.g. at 1 req/sec) should be quite quick!','green'))
|
|
963
|
+
elif hours < 2:
|
|
964
|
+
write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(minutes)+' minutes.','green'))
|
|
965
|
+
elif hours < 6:
|
|
966
|
+
write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(hours)+' hours.','green'))
|
|
967
|
+
elif hours < 24:
|
|
968
|
+
write(colored('\n-> Getting URLs (e.g. at 1 req/sec) take more than '+str(hours)+' hours.','yellow'))
|
|
969
|
+
elif days < 7:
|
|
970
|
+
write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(days)+' days. Consider using arguments -lr, -ci, -from and -to wisely!','red'))
|
|
971
|
+
else:
|
|
972
|
+
write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(days)+' days!!! Consider using arguments -lr, -ci, -from and -to wisely!','red'))
|
|
973
|
+
write('')
|
|
974
|
+
else:
|
|
975
|
+
linkCount = len(linksFound)
|
|
976
|
+
write(getSPACER(colored('Links found for ' + subs + argsInput + ': ', 'cyan')+colored(str(linkCount) + ' 🤘','white'))+'\n')
|
|
977
|
+
|
|
978
|
+
# If -oU / --output-urls was passed then use that file name, else use "waymore.txt" in the path of the .py file
|
|
979
|
+
if args.output_urls == '':
|
|
980
|
+
# Create 'results' and domain directory if needed
|
|
981
|
+
createDirs()
|
|
982
|
+
|
|
983
|
+
# If -oR / --output-responses was passed then set the path to that, otherwise it will be the "results/{target.domain}}" path
|
|
984
|
+
if args.output_responses != '':
|
|
985
|
+
fullPath = args.output_responses + '/'
|
|
986
|
+
else:
|
|
987
|
+
fullPath = str(DEFAULT_OUTPUT_DIR) + '/results/' + str(argsInput).replace('/','-') + '/'
|
|
988
|
+
filename = fullPath + 'waymore.txt'
|
|
989
|
+
filenameNew = fullPath + 'waymore.new'
|
|
990
|
+
filenameOld = fullPath + 'waymore.old'
|
|
991
|
+
else:
|
|
992
|
+
filename = args.output_urls
|
|
993
|
+
filenameNew = filename + '.new'
|
|
994
|
+
filenameOld = filename + '.old'
|
|
995
|
+
# If the filename has any "/" in it, remove the contents after the last one to just get the path and create the directories if necessary
|
|
996
|
+
try:
|
|
997
|
+
if filename.find('/') > 0:
|
|
998
|
+
f = os.path.basename(filename)
|
|
999
|
+
p = filename[:-(len(f))-1]
|
|
1000
|
+
if p != '' and not os.path.exists(p):
|
|
1001
|
+
os.makedirs(p)
|
|
1002
|
+
except Exception as e:
|
|
1003
|
+
if verbose():
|
|
1004
|
+
writerr(colored('ERROR processURLOutput 6: ' + str(e), 'red'))
|
|
1005
|
+
|
|
1006
|
+
# If the -ow / --output_overwrite argument was passed and the file exists already, get the contents of the file to include
|
|
1007
|
+
appendedUrls = False
|
|
1008
|
+
if not args.output_overwrite:
|
|
1009
|
+
try:
|
|
1010
|
+
with open(filename,'r') as existingLinks:
|
|
1011
|
+
for link in existingLinks.readlines():
|
|
1012
|
+
linksFound.add(link.strip())
|
|
1013
|
+
appendedUrls = True
|
|
1014
|
+
except Exception as e:
|
|
1015
|
+
pass
|
|
1016
|
+
|
|
1017
|
+
# If the -nlf / --new-links-file argument is passed, rename the old links file if it exists
|
|
1018
|
+
try:
|
|
1019
|
+
if args.new_links_file:
|
|
1020
|
+
if os.path.exists(filename):
|
|
1021
|
+
os.rename(filename, filenameOld)
|
|
1022
|
+
except Exception as e:
|
|
1023
|
+
if verbose():
|
|
1024
|
+
writerr(colored('ERROR processURLOutput 5: ' + str(e), 'red'))
|
|
1025
|
+
|
|
1026
|
+
try:
|
|
1027
|
+
# Open the output file
|
|
1028
|
+
outFile = open(filename,'w')
|
|
1029
|
+
except Exception as e:
|
|
1030
|
+
if verbose():
|
|
1031
|
+
writerr(colored('ERROR processURLOutput 2: ' + str(e), 'red'))
|
|
1032
|
+
sys.exit()
|
|
1033
|
+
|
|
1034
|
+
# Go through all links, and output what was found
|
|
1035
|
+
# If the -ra / --regex-after was passed then only output if it matches
|
|
1036
|
+
outputCount = 0
|
|
1037
|
+
for link in linksFound:
|
|
1038
|
+
try:
|
|
1039
|
+
if args.regex_after is None or re.search(args.regex_after, link, flags=re.IGNORECASE):
|
|
1040
|
+
outFile.write(link + "\n")
|
|
1041
|
+
# If the tool is piped to pass output to something else, then write the link
|
|
1042
|
+
if not sys.stdout.isatty():
|
|
1043
|
+
write(link,True)
|
|
1044
|
+
outputCount = outputCount + 1
|
|
1045
|
+
except Exception as e:
|
|
1046
|
+
if verbose():
|
|
1047
|
+
writerr(colored('ERROR processURLOutput 3: ' + str(e), 'red'))
|
|
1048
|
+
|
|
1049
|
+
# If fewer links are output because of filters, show the new total
|
|
1050
|
+
if args.regex_after is not None and linkCount > 0 and outputCount < linkCount:
|
|
1051
|
+
write(colored('Links found after applying filter "' + args.regex_after + '": ','cyan')+colored(str(outputCount) + ' 🤘\n','white'))
|
|
1052
|
+
|
|
1053
|
+
# Close the output file
|
|
1054
|
+
try:
|
|
1055
|
+
outFile.close()
|
|
1056
|
+
except Exception as e:
|
|
1057
|
+
if verbose():
|
|
1058
|
+
writerr(colored('ERROR processURLOutput 4: ' + str(e), 'red'))
|
|
1059
|
+
|
|
1060
|
+
if verbose():
|
|
1061
|
+
if outputCount == 0:
|
|
1062
|
+
write(colored('No links were found so nothing written to file.', 'cyan'))
|
|
1063
|
+
else:
|
|
1064
|
+
if appendedUrls:
|
|
1065
|
+
write(
|
|
1066
|
+
colored('Links successfully appended to file ', 'cyan')+colored(filename,
|
|
1067
|
+
'white')+colored(' and duplicates removed.','cyan'))
|
|
1068
|
+
else:
|
|
1069
|
+
write(
|
|
1070
|
+
colored('Links successfully written to file ', 'cyan')+colored(filename,
|
|
1071
|
+
'white'))
|
|
1072
|
+
|
|
1073
|
+
try:
|
|
1074
|
+
# If the -nlf / --new-links-file argument is passed, create the .new file
|
|
1075
|
+
if args.new_links_file:
|
|
1076
|
+
|
|
1077
|
+
# If the file and the .old version exist then get the difference to write to the .new file
|
|
1078
|
+
if os.path.exists(filenameOld) and os.path.exists(filename):
|
|
1079
|
+
|
|
1080
|
+
# Get all the old links
|
|
1081
|
+
with open(filenameOld,'r') as oldFile:
|
|
1082
|
+
oldLinks=set(oldFile.readlines())
|
|
1083
|
+
|
|
1084
|
+
# Get all the new links
|
|
1085
|
+
with open(filename,'r') as newFile:
|
|
1086
|
+
newLinks=set(newFile.readlines())
|
|
1087
|
+
|
|
1088
|
+
# Create a file with most recent new links
|
|
1089
|
+
with open(filenameNew,'w') as newOnly:
|
|
1090
|
+
for line in list(newLinks-oldLinks):
|
|
1091
|
+
newOnly.write(line)
|
|
1092
|
+
|
|
1093
|
+
# Delete the old file
|
|
1094
|
+
os.remove(filenameOld)
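# Illustrative example of the -nlf logic above (made-up file contents, not real output):
# if waymore.old held {"https://example.com/a", "https://example.com/b"} and the freshly written
# waymore.txt holds {"https://example.com/a", "https://example.com/c"}, then
# newLinks - oldLinks == {"https://example.com/c"}, which is all that is written to waymore.new.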
|
|
1095
|
+
|
|
1096
|
+
except Exception as e:
|
|
1097
|
+
if verbose():
|
|
1098
|
+
writerr(colored("ERROR processURLOutput 6: " + str(e), "red"))
|
|
1099
|
+
|
|
1100
|
+
except Exception as e:
|
|
1101
|
+
if verbose():
|
|
1102
|
+
writerr(colored("ERROR processURLOutput 1: " + str(e), "red"))
|
|
1103
|
+
|
|
1104
|
+
def processResponsesOutput():
|
|
1105
|
+
"""
|
|
1106
|
+
Show results of the archive responses saved
|
|
1107
|
+
"""
|
|
1108
|
+
global successCount, failureCount, subs, fileCount, argsInput, DEFAULT_OUTPUT_DIR, responseOutputDirectory
|
|
1109
|
+
try:
|
|
1110
|
+
|
|
1111
|
+
if failureCount > 0:
|
|
1112
|
+
if verbose():
|
|
1113
|
+
write(colored('\nResponses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' failed)\n','red'))
|
|
1114
|
+
else:
|
|
1115
|
+
write(colored('\nResponses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' failed)\n','red'))
|
|
1116
|
+
else:
|
|
1117
|
+
if verbose():
|
|
1118
|
+
write(colored('\nResponses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
|
|
1119
|
+
else:
|
|
1120
|
+
write(colored('\nResponses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
|
|
1121
|
+
except Exception as e:
|
|
1122
|
+
if verbose():
|
|
1123
|
+
writerr(colored("ERROR processResponsesOutput 1: " + str(e), "red"))
|
|
1124
|
+
|
|
1125
|
+
def validateArgProcesses(x):
|
|
1126
|
+
"""
|
|
1127
|
+
Validate the -p / --processes argument
|
|
1128
|
+
Only allow values between 1 and 5 inclusive
|
|
1129
|
+
"""
|
|
1130
|
+
x = int(x)
|
|
1131
|
+
if x < 1 or x > 5:
|
|
1132
|
+
raise argparse.ArgumentTypeError('The number of processes must be between 1 and 5. Be kind to Wayback Machine (archive.org) and commoncrawl.org! :)')
|
|
1133
|
+
return x
|
|
1134
|
+
|
|
1135
|
+
def stripUnwanted(url):
|
|
1136
|
+
"""
|
|
1137
|
+
Strip the scheme, port number, query string and fragment from any input values if they have them
|
|
1138
|
+
"""
|
|
1139
|
+
parsed = urlparse(url)
|
|
1140
|
+
# Strip scheme
|
|
1141
|
+
scheme = "%s://" % parsed.scheme
|
|
1142
|
+
strippedUrl = parsed.geturl().replace(scheme, '', 1)
|
|
1143
|
+
# Strip query string and fragment
|
|
1144
|
+
strippedUrl = strippedUrl.split('#')[0].split('?')[0]
|
|
1145
|
+
# Strip port number
|
|
1146
|
+
if re.search(r'^[^/]*:[0-9]+', strippedUrl):
|
|
1147
|
+
strippedUrl = re.sub(r':[0-9]+','', strippedUrl, 1)
|
|
1148
|
+
return strippedUrl
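# Illustrative examples of what stripUnwanted() is expected to return, based on the code above
# (the URLs are made up):
#   stripUnwanted('https://sub.example.com:8443/path/page.php?x=1#frag') -> 'sub.example.com/path/page.php'
#   stripUnwanted('example.com/login')                                   -> 'example.com/login'
# i.e. the scheme, port, query string and fragment are removed, but sub domains and paths are kept.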
|
|
1149
|
+
|
|
1150
|
+
def validateArgInput(x):
|
|
1151
|
+
"""
|
|
1152
|
+
Validate the -i / --input argument.
|
|
1153
|
+
Ensure it is a domain only, or a URL, but with no scheme, query parameters or fragment
|
|
1154
|
+
"""
|
|
1155
|
+
global inputValues, isInputFile
|
|
1156
|
+
# If the input was given through STDIN (piped from another program) then
|
|
1157
|
+
if x == '<stdin>':
|
|
1158
|
+
stdinFile = sys.stdin.readlines()
|
|
1159
|
+
count = 0
|
|
1160
|
+
for line in stdinFile:
|
|
1161
|
+
# Remove newline characters, and also *. if the domain starts with this
|
|
1162
|
+
inputValues.add(stripUnwanted(line.rstrip('\n').lstrip('*.')))
|
|
1163
|
+
count = count + 1
|
|
1164
|
+
if count > 1:
|
|
1165
|
+
isInputFile = True
|
|
1166
|
+
else:
|
|
1167
|
+
# Determine if a single input was given, or a file
|
|
1168
|
+
if os.path.isfile(x):
|
|
1169
|
+
isInputFile = True
|
|
1170
|
+
# Open file and put all values in input list
|
|
1171
|
+
with open(x, 'r') as inputFile:
|
|
1172
|
+
lines = inputFile.readlines()
|
|
1173
|
+
# Check if any lines start with a *. and replace without the *.
|
|
1174
|
+
for line in lines:
|
|
1175
|
+
inputValues.add(stripUnwanted(line.rstrip('\n').lstrip('*.')))
|
|
1176
|
+
else:
|
|
1177
|
+
# Just add the input value to the input list
|
|
1178
|
+
inputValues.add(stripUnwanted(x))
|
|
1179
|
+
return x
|
|
1180
|
+
|
|
1181
|
+
def validateArgStatusCodes(x):
|
|
1182
|
+
"""
|
|
1183
|
+
Validate the -fc and -mc arguments
|
|
1184
|
+
Only allow 3 digit numbers separated by a comma
|
|
1185
|
+
"""
|
|
1186
|
+
invalid = False
|
|
1187
|
+
codes = x.split(',')
|
|
1188
|
+
for code in codes:
|
|
1189
|
+
if len(code) != 3 or not code.isdigit():
|
|
1190
|
+
invalid = True
|
|
1191
|
+
break
|
|
1192
|
+
if invalid:
|
|
1193
|
+
raise argparse.ArgumentTypeError('Pass HTTP status codes separated by a comma')
|
|
1194
|
+
return x
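# Illustrative examples of the -fc / -mc validation above (made-up values):
#   '200,301,302' -> returned unchanged (every entry is exactly 3 digits)
#   '200,30x'     -> raises argparse.ArgumentTypeError
#   '200, 301'    -> raises argparse.ArgumentTypeError (' 301' is 4 characters because of the space)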
|
|
1195
|
+
|
|
1196
|
+
def processAlienVaultPage(url):
|
|
1197
|
+
"""
|
|
1198
|
+
Get URLs from a specific page of otx.alienvault.org API for the input domain
|
|
1199
|
+
"""
|
|
1200
|
+
global totalPages, linkMimes, linksFound, stopSource, argsInput
|
|
1201
|
+
try:
|
|
1202
|
+
# Get memory in case it exceeds threshold
|
|
1203
|
+
getMemory()
|
|
1204
|
+
|
|
1205
|
+
if not stopSource:
|
|
1206
|
+
try:
|
|
1207
|
+
# Choose a random user agent string to use for any requests
|
|
1208
|
+
userAgent = random.choice(USER_AGENT)
|
|
1209
|
+
page = url.split('page=')[1]
|
|
1210
|
+
session = requests.Session()
|
|
1211
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
1212
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
1213
|
+
resp = session.get(url, headers={"User-Agent":userAgent})
|
|
1214
|
+
except ConnectionError as ce:
|
|
1215
|
+
writerr(colored(getSPACER('[ ERR ] alienvault.com connection error for page ' + page), 'red'))
|
|
1216
|
+
resp = None
|
|
1217
|
+
return
|
|
1218
|
+
except Exception as e:
|
|
1219
|
+
writerr(colored(getSPACER('[ ERR ] Error getting response for page ' + page + ' - ' + str(e)),'red'))
|
|
1220
|
+
resp = None
|
|
1221
|
+
return
|
|
1222
|
+
finally:
|
|
1223
|
+
try:
|
|
1224
|
+
if resp is not None:
|
|
1225
|
+
# If a status of 429 is returned, stop processing Alien Vault
|
|
1226
|
+
if resp.status_code == 429:
|
|
1227
|
+
writerr(colored(getSPACER('[ 429 ] Alien Vault rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
|
|
1228
|
+
stopSource = True
|
|
1229
|
+
return
|
|
1230
|
+
# If the response from alienvault.com is empty then skip
|
|
1231
|
+
if resp.text == '' and totalPages == 0:
|
|
1232
|
+
if verbose():
|
|
1233
|
+
writerr(colored(getSPACER('[ ERR ] '+url+' gave an empty response.'),'red'))
|
|
1234
|
+
return
|
|
1235
|
+
# If a status other than 200, then stop
|
|
1236
|
+
if resp.status_code != 200:
|
|
1237
|
+
if verbose():
|
|
1238
|
+
writerr(colored(getSPACER('[ '+str(resp.status_code)+' ] Error for '+url),'red'))
|
|
1239
|
+
return
|
|
1240
|
+
except:
|
|
1241
|
+
pass
|
|
1242
|
+
|
|
1243
|
+
# Get the JSON response
|
|
1244
|
+
jsonResp = json.loads(resp.text.strip())
|
|
1245
|
+
|
|
1246
|
+
# Go through each URL in the list
|
|
1247
|
+
for urlSection in jsonResp['url_list']:
|
|
1248
|
+
# Get the URL
|
|
1249
|
+
try:
|
|
1250
|
+
foundUrl = urlSection['url']
|
|
1251
|
+
except:
|
|
1252
|
+
foundUrl = ''
|
|
1253
|
+
|
|
1254
|
+
# If a URL was found
|
|
1255
|
+
if foundUrl != '':
|
|
1256
|
+
# If filters are not required and subs are wanted then just add the URL to the list
|
|
1257
|
+
if args.filter_responses_only and not args.no_subs:
|
|
1258
|
+
linksFoundAdd(foundUrl)
|
|
1259
|
+
else:
|
|
1260
|
+
addLink = True
|
|
1261
|
+
|
|
1262
|
+
# If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
|
|
1263
|
+
if args.no_subs:
|
|
1264
|
+
match = re.search(r'\:\/\/(www\.)?'+re.escape(argsInput), foundUrl, flags=re.IGNORECASE)
|
|
1265
|
+
if match is None:
|
|
1266
|
+
addLink = False
|
|
1267
|
+
|
|
1268
|
+
# If the user didn't request -f / --filter-responses-only then check the http code
|
|
1269
|
+
# Note we can't check MIME filter because it is not returned by Alien Vault API
|
|
1270
|
+
if addLink and not args.filter_responses_only:
|
|
1271
|
+
# Get the HTTP code
|
|
1272
|
+
try:
|
|
1273
|
+
httpCode = str(urlSection['httpcode'])
|
|
1274
|
+
except:
|
|
1275
|
+
httpCode = 'UNKNOWN'
|
|
1276
|
+
|
|
1277
|
+
# Compare the HTTP code against the code exclusions and matches
|
|
1278
|
+
if MATCH_CODE != '':
|
|
1279
|
+
match = re.search(r'('+re.escape(MATCH_CODE).replace(',','|')+')', httpCode, flags=re.IGNORECASE)
|
|
1280
|
+
if match is None:
|
|
1281
|
+
addLink = False
|
|
1282
|
+
else:
|
|
1283
|
+
match = re.search(r'('+re.escape(FILTER_CODE).replace(',','|')+')', httpCode, flags=re.IGNORECASE)
|
|
1284
|
+
if match is not None:
|
|
1285
|
+
addLink = False
|
|
1286
|
+
|
|
1287
|
+
# Check the URL exclusions
|
|
1288
|
+
if addLink:
|
|
1289
|
+
match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
|
|
1290
|
+
if match is not None:
|
|
1291
|
+
addLink = False
|
|
1292
|
+
|
|
1293
|
+
# Set keywords filter if -ko argument passed
|
|
1294
|
+
if addLink and args.keywords_only:
|
|
1295
|
+
if args.keywords_only == '#CONFIG':
|
|
1296
|
+
match = re.search(r'('+re.escape(FILTER_KEYWORDS).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
|
|
1297
|
+
else:
|
|
1298
|
+
match = re.search(r'('+args.keywords_only+')', foundUrl, flags=re.IGNORECASE)
|
|
1299
|
+
if match is None:
|
|
1300
|
+
addLink = False
|
|
1301
|
+
|
|
1302
|
+
# Add link if it passed filters
|
|
1303
|
+
if addLink:
|
|
1304
|
+
linksFoundAdd(foundUrl)
|
|
1305
|
+
else:
|
|
1306
|
+
pass
|
|
1307
|
+
except Exception as e:
|
|
1308
|
+
if verbose():
|
|
1309
|
+
writerr(colored("ERROR processLAlienVaultPage 1: " + str(e), "red"))
|
|
1310
|
+
|
|
1311
|
+
def getAlienVaultUrls():
|
|
1312
|
+
"""
|
|
1313
|
+
Get URLs from the Alien Vault OTX, otx.alienvault.com
|
|
1314
|
+
"""
|
|
1315
|
+
global linksFound, waymorePath, subs, path, stopProgram, totalPages, stopSource, argsInput, checkAlienVault, inputIsSubDomain, argsInputHostname
|
|
1316
|
+
|
|
1317
|
+
# Write the file of URL's for the passed domain/URL
|
|
1318
|
+
try:
|
|
1319
|
+
stopSource = False
|
|
1320
|
+
originalLinkCount = len(linksFound)
|
|
1321
|
+
|
|
1322
|
+
# Set the Alien Vault API indicator types of domain or hostname (has subdomain)
|
|
1323
|
+
if inputIsSubDomain:
|
|
1324
|
+
indicatorType = 'hostname'
|
|
1325
|
+
else:
|
|
1326
|
+
indicatorType = 'domain'
|
|
1327
|
+
|
|
1328
|
+
url = ALIENVAULT_URL.replace('{TYPE}',indicatorType).replace('{DOMAIN}',quote(argsInputHostname))+'&page='
|
|
1329
|
+
|
|
1330
|
+
# Get the number of pages (i.e. separate requests) that are going to be made to alienvault.com
|
|
1331
|
+
totalPages = 0
|
|
1332
|
+
try:
|
|
1333
|
+
if not args.check_only:
|
|
1334
|
+
write(colored('\rGetting the number of alienvault.com pages to search...\r','cyan'))
|
|
1335
|
+
# Choose a random user agent string to use for any requests
|
|
1336
|
+
userAgent = random.choice(USER_AGENT)
|
|
1337
|
+
session = requests.Session()
|
|
1338
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
1339
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
1340
|
+
resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
|
|
1341
|
+
except Exception as e:
|
|
1342
|
+
writerr(colored(getSPACER('[ ERR ] Unable to get links from alienvault.com: ' + str(e)), 'red'))
|
|
1343
|
+
return
|
|
1344
|
+
|
|
1345
|
+
# If the rate limit was reached end now
|
|
1346
|
+
if resp.status_code == 429:
|
|
1347
|
+
writerr(colored(getSPACER('[ 429 ] Alien Vault rate limit reached so unable to get links.'),'red'))
|
|
1348
|
+
return
|
|
1349
|
+
|
|
1350
|
+
if verbose():
|
|
1351
|
+
write(getSPACER(colored('The Alien Vault URL requested to get links: ','magenta')+colored(url,'white'))+'\n')
|
|
1352
|
+
|
|
1353
|
+
# Carry on if something was found
|
|
1354
|
+
if resp.text.lower().find('"error": "') < 0:
|
|
1355
|
+
|
|
1356
|
+
# Get the JSON response
|
|
1357
|
+
jsonResp = json.loads(resp.text.strip())
|
|
1358
|
+
|
|
1359
|
+
# Try to get the number of results
|
|
1360
|
+
totalUrls = jsonResp['full_size']
|
|
1361
|
+
|
|
1362
|
+
# If there are results, carry on
|
|
1363
|
+
if totalUrls > 0 or args.check_only:
|
|
1364
|
+
|
|
1365
|
+
# Get total pages
|
|
1366
|
+
totalPages = math.ceil(totalUrls / 500)
|
|
1367
|
+
|
|
1368
|
+
# If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
|
|
1369
|
+
if args.limit_requests != 0 and totalPages > args.limit_requests:
|
|
1370
|
+
totalPages = args.limit_requests
|
|
1371
|
+
|
|
1372
|
+
if args.check_only:
|
|
1373
|
+
if totalPages == 0:
|
|
1374
|
+
checkAlienVault = 1
|
|
1375
|
+
else:
|
|
1376
|
+
checkAlienVault = totalPages
|
|
1377
|
+
write(colored('Get URLs from Alien Vault: ','cyan')+colored(str(checkAlienVault)+' requests','white'))
|
|
1378
|
+
else:
|
|
1379
|
+
# Show how many API requests will be made to get the links
|
|
1380
|
+
write(colored('\rGetting links from ' + str(totalPages) + ' alienvault.com API requests (this can take a while for some domains)...\r','cyan'))
|
|
1381
|
+
|
|
1382
|
+
# Get a list of all the page URLs we need to visit
|
|
1383
|
+
pages = []
|
|
1384
|
+
for page in range(1, totalPages + 1):
|
|
1385
|
+
pages.append(url+str(page))
|
|
1386
|
+
|
|
1387
|
+
# Process the URLs from alien vault
|
|
1388
|
+
if stopProgram is None:
|
|
1389
|
+
p = mp.Pool(args.processes)
|
|
1390
|
+
p.map(processAlienVaultPage, pages)
|
|
1391
|
+
p.close()
|
|
1392
|
+
p.join()
|
|
1393
|
+
else:
|
|
1394
|
+
if verbose():
|
|
1395
|
+
writerr(colored(getSPACER('[ ERR ] An error was returned in the alienvault.com response.')+'\n', 'red'))
|
|
1396
|
+
|
|
1397
|
+
if not args.check_only:
|
|
1398
|
+
linkCount = len(linksFound) - originalLinkCount
|
|
1399
|
+
if args.xwm and args.xcc:
|
|
1400
|
+
write(getSPACER(colored('Links found on alienvault.com: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
1401
|
+
else:
|
|
1402
|
+
write(getSPACER(colored('Extra links found on alienvault.com: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
1403
|
+
|
|
1404
|
+
except Exception as e:
|
|
1405
|
+
writerr(colored('ERROR getAlienVaultUrls 1: ' + str(e), 'red'))
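# Illustrative note on the Alien Vault paging above (made-up numbers): the API returns up to 500 URLs
# per page, so the number of requests is math.ceil(full_size / 500), optionally capped by
# -lr / --limit-requests.
#   full_size = 1374             -> math.ceil(1374 / 500) = 3 pages -> url+'1', url+'2', url+'3'
#   full_size = 1374 with -lr 2  -> only url+'1' and url+'2' are requested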
|
|
1406
|
+
|
|
1407
|
+
def processURLScanUrl(url, httpCode, mimeType):
|
|
1408
|
+
"""
|
|
1409
|
+
Process a specific URL from urlscan.io to determine whether to save the link
|
|
1410
|
+
"""
|
|
1411
|
+
global argsInput, argsInputHostname
|
|
1412
|
+
|
|
1413
|
+
addLink = True
|
|
1414
|
+
|
|
1415
|
+
try:
|
|
1416
|
+
# If filters are required then test them
|
|
1417
|
+
if not args.filter_responses_only:
|
|
1418
|
+
|
|
1419
|
+
# If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
|
|
1420
|
+
if args.no_subs:
|
|
1421
|
+
match = re.search(r'^[A-Za-z]*\:\/\/(www\.)?'+re.escape(argsInputHostname), url, flags=re.IGNORECASE)
|
|
1422
|
+
if match is None:
|
|
1423
|
+
addLink = False
|
|
1424
|
+
|
|
1425
|
+
# If the user didn't request -f / --filter-responses-only then check the http code
|
|
1426
|
+
# Note we can't check MIME filter because it is not returned by URLScan API
|
|
1427
|
+
if addLink and not args.filter_responses_only:
|
|
1428
|
+
|
|
1429
|
+
# Compare the HTTP code against the Code exclusions and matches
|
|
1430
|
+
if MATCH_CODE != '':
|
|
1431
|
+
match = re.search(r'('+re.escape(MATCH_CODE).replace(',','|')+')', httpCode, flags=re.IGNORECASE)
|
|
1432
|
+
if match is None:
|
|
1433
|
+
addLink = False
|
|
1434
|
+
else:
|
|
1435
|
+
match = re.search(r'('+re.escape(FILTER_CODE).replace(',','|')+')', httpCode, flags=re.IGNORECASE)
|
|
1436
|
+
if match is not None:
|
|
1437
|
+
addLink = False
|
|
1438
|
+
|
|
1439
|
+
# Check the URL exclusions
|
|
1440
|
+
if addLink:
|
|
1441
|
+
match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', url, flags=re.IGNORECASE)
|
|
1442
|
+
if match is not None:
|
|
1443
|
+
addLink = False
|
|
1444
|
+
|
|
1445
|
+
# Set keywords filter if -ko argument passed
|
|
1446
|
+
if addLink and args.keywords_only:
|
|
1447
|
+
if args.keywords_only == '#CONFIG':
|
|
1448
|
+
match = re.search(r'('+re.escape(FILTER_KEYWORDS).replace(',','|')+')', url, flags=re.IGNORECASE)
|
|
1449
|
+
else:
|
|
1450
|
+
match = re.search(r'('+args.keywords_only+')', url, flags=re.IGNORECASE)
|
|
1451
|
+
if match is None:
|
|
1452
|
+
addLink = False
|
|
1453
|
+
|
|
1454
|
+
# Check the MIME exclusions
|
|
1455
|
+
if mimeType != '':
|
|
1456
|
+
match = re.search(r'('+re.escape(FILTER_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
|
|
1457
|
+
if match is not None:
|
|
1458
|
+
addLink = False
|
|
1459
|
+
else:
|
|
1460
|
+
# Add MIME Types if --verbose option was selected
|
|
1461
|
+
if verbose():
|
|
1462
|
+
linkMimes.add(mimeType)
|
|
1463
|
+
|
|
1464
|
+
# Add link if it passed filters
|
|
1465
|
+
if addLink:
|
|
1466
|
+
# Just get the hostname of the url
|
|
1467
|
+
tldExtract = tldextract.extract(url)
|
|
1468
|
+
subDomain = tldExtract.subdomain
|
|
1469
|
+
if subDomain != '':
|
|
1470
|
+
subDomain = subDomain+'.'
|
|
1471
|
+
domainOnly = subDomain+tldExtract.domain+'.'+tldExtract.suffix
|
|
1472
|
+
|
|
1473
|
+
# URLScan might return URLs that aren't for the domain passed so we need to check for those and not process them
|
|
1474
|
+
# Check the URL
|
|
1475
|
+
match = re.search(r'(^|\.)'+re.escape(argsInputHostname)+'$', domainOnly, flags=re.IGNORECASE)
|
|
1476
|
+
if match is not None:
|
|
1477
|
+
linksFoundAdd(url)
|
|
1478
|
+
|
|
1479
|
+
except Exception as e:
|
|
1480
|
+
writerr(colored('ERROR processURLScanUrl 1: ' + str(e), 'red'))
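# A minimal sketch (with a made-up helper name) of the tldextract check above, which decides whether a
# URL returned by urlscan.io actually belongs to the requested target; urlscan.io can return results
# for unrelated sites, so only hosts equal to, or ending in, the input hostname are kept.
def _sketchBelongsToTarget(url, targetHostname='example.com'):
    import re
    import tldextract
    parts = tldextract.extract(url)
    sub = parts.subdomain + '.' if parts.subdomain != '' else ''
    host = sub + parts.domain + '.' + parts.suffix
    return re.search(r'(^|\.)' + re.escape(targetHostname) + r'$', host, flags=re.IGNORECASE) is not None
# e.g. _sketchBelongsToTarget('https://api.example.com/v1')   -> True
#      _sketchBelongsToTarget('https://example.com.evil.io/') -> False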
|
|
1481
|
+
|
|
1482
|
+
def getURLScanUrls():
|
|
1483
|
+
"""
|
|
1484
|
+
Get URLs from the URLSCan API, urlscan.io
|
|
1485
|
+
"""
|
|
1486
|
+
global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkURLScan, argsInputHostname
|
|
1487
|
+
|
|
1488
|
+
# Write the file of URL's for the passed domain/URL
|
|
1489
|
+
try:
|
|
1490
|
+
requestsMade = 0
|
|
1491
|
+
stopSource = False
|
|
1492
|
+
linkMimes = set()
|
|
1493
|
+
originalLinkCount = len(linksFound)
|
|
1494
|
+
|
|
1495
|
+
# Set the URL to just the hostname
|
|
1496
|
+
url = URLSCAN_URL.replace('{DOMAIN}',quote(argsInputHostname))
|
|
1497
|
+
|
|
1498
|
+
if verbose():
|
|
1499
|
+
write(colored('The URLScan URL requested to get links: ','magenta')+colored(url+'\n','white'))
|
|
1500
|
+
|
|
1501
|
+
if not args.check_only:
|
|
1502
|
+
write(colored('\rGetting links from urlscan.io API (this can take a while for some domains)...\r','cyan'))
|
|
1503
|
+
|
|
1504
|
+
# Get the first page from urlscan.io
|
|
1505
|
+
try:
|
|
1506
|
+
# Choose a random user agent string to use for any requests
|
|
1507
|
+
userAgent = random.choice(USER_AGENT)
|
|
1508
|
+
session = requests.Session()
|
|
1509
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
1510
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
1511
|
+
# Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
|
|
1512
|
+
resp = session.get(url, headers={'User-Agent':userAgent, 'API-Key':URLSCAN_API_KEY})
|
|
1513
|
+
requestsMade = requestsMade + 1
|
|
1514
|
+
except Exception as e:
|
|
1515
|
+
write(colored(getSPACER('[ ERR ] Unable to get links from urlscan.io: ' + str(e)), 'red'))
|
|
1516
|
+
return
|
|
1517
|
+
|
|
1518
|
+
# If the rate limit was reached then determine if to wait and then try again
|
|
1519
|
+
if resp.status_code == 429:
|
|
1520
|
+
# Get the number of seconds the rate limit resets
|
|
1521
|
+
match = re.search(r'Reset in (\d+) seconds', resp.text, flags=re.IGNORECASE)
|
|
1522
|
+
if match is not None:
|
|
1523
|
+
seconds = int(match.group(1))
|
|
1524
|
+
if seconds <= args.urlscan_rate_limit_retry * 60:
|
|
1525
|
+
writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached, so waiting for another '+str(seconds)+' seconds before continuing...'),'yellow'))
|
|
1526
|
+
time.sleep(seconds+1)
|
|
1527
|
+
try:
|
|
1528
|
+
resp = session.get(url, headers={'User-Agent':userAgent, 'API-Key':URLSCAN_API_KEY})
|
|
1529
|
+
requestsMade = requestsMade + 1
|
|
1530
|
+
except Exception as e:
|
|
1531
|
+
write(colored(getSPACER('[ ERR ] Unable to get links from urlscan.io: ' + str(e)), 'red'))
|
|
1532
|
+
return
|
|
1533
|
+
|
|
1534
|
+
# If the rate limit was reached or if a 401 (which likely means the API key isn't valid), try without API key
|
|
1535
|
+
if resp.status_code in (401,429):
|
|
1536
|
+
if URLSCAN_API_KEY != '':
|
|
1537
|
+
try:
|
|
1538
|
+
if resp.status_code == 429:
|
|
1539
|
+
writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached so trying without API Key...'),'red'))
|
|
1540
|
+
else:
|
|
1541
|
+
writerr(colored(getSPACER('The URLScan API Key is invalid so trying without API Key...'),'red'))
|
|
1542
|
+
# Set key to blank for further requests
|
|
1543
|
+
URLSCAN_API_KEY = ''
|
|
1544
|
+
resp = requests.get(url, headers={'User-Agent':userAgent})
|
|
1545
|
+
except Exception as e:
|
|
1546
|
+
writerr(colored(getSPACER('[ ERR ] Unable to get links from urlscan.io: ' + str(e)), 'red'))
|
|
1547
|
+
return
|
|
1548
|
+
|
|
1549
|
+
# If the rate limit was reached end now
|
|
1550
|
+
if resp.status_code == 429:
|
|
1551
|
+
writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached without API Key so unable to get links.'),'red'))
|
|
1552
|
+
return
|
|
1553
|
+
else:
|
|
1554
|
+
writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached so unable to get links.'),'red'))
|
|
1555
|
+
return
|
|
1556
|
+
elif resp.status_code != 200:
|
|
1557
|
+
writerr(colored(getSPACER('[ ' + str(resp.status_code) + ' ] Unable to get links from urlscan.io'),'red'))
|
|
1558
|
+
return
|
|
1559
|
+
|
|
1560
|
+
# Get the JSON response
|
|
1561
|
+
jsonResp = json.loads(resp.text.strip())
|
|
1562
|
+
|
|
1563
|
+
# Get the number of results
|
|
1564
|
+
totalUrls = jsonResp['total']
|
|
1565
|
+
|
|
1566
|
+
if args.check_only:
|
|
1567
|
+
hasMore = jsonResp['has_more']
|
|
1568
|
+
if hasMore:
|
|
1569
|
+
write(colored('Get URLs from URLScan: ','cyan')+colored('UNKNOWN requests','white'))
|
|
1570
|
+
else:
|
|
1571
|
+
write(colored('Get URLs from URLScan: ','cyan')+colored('1 request','white'))
|
|
1572
|
+
checkURLScan = 1
|
|
1573
|
+
else:
|
|
1574
|
+
# Carry on if something was found
|
|
1575
|
+
if int(totalUrls) > 0:
|
|
1576
|
+
|
|
1577
|
+
while not stopSource:
|
|
1578
|
+
|
|
1579
|
+
searchAfter = ''
|
|
1580
|
+
|
|
1581
|
+
# Get memory in case it exceeds threshold
|
|
1582
|
+
getMemory()
|
|
1583
|
+
|
|
1584
|
+
# Go through each URL in the list
|
|
1585
|
+
for urlSection in jsonResp['results']:
|
|
1586
|
+
|
|
1587
|
+
# Get the URL
|
|
1588
|
+
try:
|
|
1589
|
+
foundUrl = urlSection['page']['url']
|
|
1590
|
+
except:
|
|
1591
|
+
foundUrl = ''
|
|
1592
|
+
|
|
1593
|
+
# Also get the "ptr" field which can also be a url we want
|
|
1594
|
+
try:
|
|
1595
|
+
pointer = urlSection['page']['ptr']
|
|
1596
|
+
if not pointer.startswith('http'):
|
|
1597
|
+
pointer = 'http://' + pointer
|
|
1598
|
+
except:
|
|
1599
|
+
pointer = ''
|
|
1600
|
+
|
|
1601
|
+
# Also get the "task" url field
|
|
1602
|
+
try:
|
|
1603
|
+
taskUrl = urlSection['task']['url']
|
|
1604
|
+
if not taskUrl.startswith('http'):
|
|
1605
|
+
taskUrl = 'http://' + taskUrl
|
|
1606
|
+
except:
|
|
1607
|
+
taskUrl = ''
|
|
1608
|
+
|
|
1609
|
+
# Get the sort value used for the search_after parameter to get to the next page later
|
|
1610
|
+
try:
|
|
1611
|
+
sort = urlSection['sort']
|
|
1612
|
+
except:
|
|
1613
|
+
sort = ''
|
|
1614
|
+
searchAfter = '&search_after='+str(sort[0])+','+str(sort[1])
|
|
1615
|
+
|
|
1616
|
+
# Get the HTTP code
|
|
1617
|
+
try:
|
|
1618
|
+
httpCode = str(urlSection['page']['status'])
|
|
1619
|
+
except:
|
|
1620
|
+
httpCode = 'UNKNOWN'
|
|
1621
|
+
|
|
1622
|
+
# Get the MIME type
|
|
1623
|
+
try:
|
|
1624
|
+
mimeType = urlSection['page']['mimeType']
|
|
1625
|
+
except:
|
|
1626
|
+
mimeType = ''
|
|
1627
|
+
|
|
1628
|
+
# If a URL was found then process it
|
|
1629
|
+
if foundUrl != '':
|
|
1630
|
+
processURLScanUrl(foundUrl, httpCode, mimeType)
|
|
1631
|
+
|
|
1632
|
+
# If a pointer was found then process it
|
|
1633
|
+
if pointer != '':
|
|
1634
|
+
processURLScanUrl(pointer, httpCode, mimeType)
|
|
1635
|
+
|
|
1636
|
+
# If a task url was found then process it
|
|
1637
|
+
if taskUrl != '':
|
|
1638
|
+
processURLScanUrl(taskUrl, httpCode, mimeType)
|
|
1639
|
+
|
|
1640
|
+
# If we have the field value to go to the next page...
|
|
1641
|
+
if searchAfter != '':
|
|
1642
|
+
|
|
1643
|
+
keepTrying = True
|
|
1644
|
+
while not stopSource and keepTrying:
|
|
1645
|
+
keepTrying = False
|
|
1646
|
+
# Get the next page from urlscan.io
|
|
1647
|
+
try:
|
|
1648
|
+
# Choose a random user agent string to use for any requests
|
|
1649
|
+
userAgent = random.choice(USER_AGENT)
|
|
1650
|
+
session = requests.Session()
|
|
1651
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
1652
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
1653
|
+
# Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
|
|
1654
|
+
resp = session.get(url+searchAfter, headers={'User-Agent':userAgent, 'API-Key':URLSCAN_API_KEY})
|
|
1655
|
+
requestsMade = requestsMade + 1
|
|
1656
|
+
except Exception as e:
|
|
1657
|
+
writerr(colored(getSPACER('[ ERR ] Unable to get links from urlscan.io: ' + str(e)), 'red'))
|
|
1658
|
+
pass
|
|
1659
|
+
|
|
1660
|
+
# If the rate limit was reached
|
|
1661
|
+
if resp.status_code == 429:
|
|
1662
|
+
# Get the number of seconds the rate limit resets
|
|
1663
|
+
match = re.search(r'Reset in (\d+) seconds', resp.text, flags=re.IGNORECASE)
|
|
1664
|
+
if match is not None:
|
|
1665
|
+
seconds = int(match.group(1))
|
|
1666
|
+
if seconds <= args.urlscan_rate_limit_retry * 60:
|
|
1667
|
+
writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached, so waiting for another '+str(seconds)+' seconds before continuing...'),'yellow'))
|
|
1668
|
+
time.sleep(seconds+1)
|
|
1669
|
+
keepTrying = True
|
|
1670
|
+
continue
|
|
1671
|
+
else:
|
|
1672
|
+
writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached (waiting time of '+str(seconds)+' seconds exceeds the retry limit), so stopping. Links that have already been retrieved will be saved.'),'red'))
|
|
1673
|
+
stopSource = True
|
|
1674
|
+
pass
|
|
1675
|
+
else:
|
|
1676
|
+
writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
|
|
1677
|
+
stopSource = True
|
|
1678
|
+
pass
|
|
1679
|
+
elif resp.status_code != 200:
|
|
1680
|
+
writerr(colored(getSPACER('[ ' + str(resp.status_code) + ' ] Unable to get links from urlscan.io'),'red'))
|
|
1681
|
+
stopSource = True
|
|
1682
|
+
pass
|
|
1683
|
+
|
|
1684
|
+
if not stopSource:
|
|
1685
|
+
# Get the JSON response
|
|
1686
|
+
jsonResp = json.loads(resp.text.strip())
|
|
1687
|
+
|
|
1688
|
+
# If there are no more results, or if the requests limit was specified and has been exceeded, then stop
|
|
1689
|
+
if jsonResp['results'] is None or len(jsonResp['results']) == 0 or (args.limit_requests != 0 and requestsMade > args.limit_requests):
|
|
1690
|
+
stopSource = True
|
|
1691
|
+
|
|
1692
|
+
# Show the MIME types found (in case user wants to exclude more)
|
|
1693
|
+
if verbose() and len(linkMimes) > 0:
|
|
1694
|
+
linkMimes.discard('warc/revisit')
|
|
1695
|
+
write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
|
|
1696
|
+
|
|
1697
|
+
linkCount = len(linksFound) - originalLinkCount
|
|
1698
|
+
if args.xwm and args.xcc and args.xav:
|
|
1699
|
+
write(getSPACER(colored('Links found on urlscan.io: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
1700
|
+
else:
|
|
1701
|
+
write(getSPACER(colored('Extra links found on urlscan.io: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
1702
|
+
|
|
1703
|
+
except Exception as e:
|
|
1704
|
+
writerr(colored('ERROR getURLScanUrls 1: ' + str(e), 'red'))
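# Illustrative note on the urlscan.io paging above (made-up values): each result carries a "sort"
# field, and the last one seen on a page is passed back via &search_after= to fetch the next page,
# until the API returns no more results or -lr / --limit-requests is exceeded.
#   "sort": [1693526400000, "f3b2c1"] -> the next request URL ends with &search_after=1693526400000,f3b2c1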
|
|
1705
|
+
|
|
1706
|
+
def processWayBackPage(url):
|
|
1707
|
+
"""
|
|
1708
|
+
Get URLs from a specific page of archive.org CDX API for the input domain
|
|
1709
|
+
"""
|
|
1710
|
+
global totalPages, linkMimes, linksFound, stopSource
|
|
1711
|
+
try:
|
|
1712
|
+
# Get memory in case it exceeds threshold
|
|
1713
|
+
getMemory()
|
|
1714
|
+
|
|
1715
|
+
if not stopSource:
|
|
1716
|
+
try:
|
|
1717
|
+
# Choose a random user agent string to use for any requests
|
|
1718
|
+
userAgent = random.choice(USER_AGENT)
|
|
1719
|
+
page = url.split('page=')[1]
|
|
1720
|
+
session = requests.Session()
|
|
1721
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
1722
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
1723
|
+
resp = session.get(url, headers={"User-Agent":userAgent})
|
|
1724
|
+
except ConnectionError as ce:
|
|
1725
|
+
writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error for page ' + page), 'red'))
|
|
1726
|
+
resp = None
|
|
1727
|
+
return
|
|
1728
|
+
except Exception as e:
|
|
1729
|
+
writerr(colored(getSPACER('[ ERR ] Error getting response for page ' + page + ' - ' + str(e)),'red'))
|
|
1730
|
+
resp = None
|
|
1731
|
+
return
|
|
1732
|
+
finally:
|
|
1733
|
+
try:
|
|
1734
|
+
if resp is not None:
|
|
1735
|
+
# If a status of 429 is returned, stop processing Wayback Machine
|
|
1736
|
+
if resp.status_code == 429:
|
|
1737
|
+
if args.wayback_rate_limit_retry > 0:
|
|
1738
|
+
seconds = args.wayback_rate_limit_retry * 60
|
|
1739
|
+
if args.processes == 1:
|
|
1740
|
+
writerr(colored('\r[ 429 ] Wayback Machine (archive.org) rate limit reached on page '+str(page)+' of '+str(totalPages)+', so waiting for '+str(seconds)+' seconds before continuing...\r','yellow'))
|
|
1741
|
+
else:
|
|
1742
|
+
writerr(colored('\r[ 429 ] Wayback Machine (archive.org) rate limit reached, so waiting for '+str(seconds)+' seconds before continuing...\r','yellow'))
|
|
1743
|
+
time.sleep(seconds)
|
|
1744
|
+
try:
|
|
1745
|
+
resp = session.get(url, headers={"User-Agent":userAgent})
|
|
1746
|
+
except ConnectionError as ce:
|
|
1747
|
+
writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error for page ' + page), 'red'))
|
|
1748
|
+
resp = None
|
|
1749
|
+
return
|
|
1750
|
+
except Exception as e:
|
|
1751
|
+
writerr(colored(getSPACER('[ ERR ] Error getting response for page ' + page + ' - ' + str(e)),'red'))
|
|
1752
|
+
resp = None
|
|
1753
|
+
return
|
|
1754
|
+
|
|
1755
|
+
if resp.status_code == 429:
|
|
1756
|
+
writerr(colored(getSPACER('[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
|
|
1757
|
+
stopSource = True
|
|
1758
|
+
return
|
|
1759
|
+
# If a status of 503 is returned, then the site is unavailable
|
|
1760
|
+
if resp.status_code == 503:
|
|
1761
|
+
writerr(colored(getSPACER('[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.'),'red'))
|
|
1762
|
+
stopSource = True
|
|
1763
|
+
return
|
|
1764
|
+
# If the response from archive.org is empty then skip
|
|
1765
|
+
if resp.text == '' and totalPages == 0:
|
|
1766
|
+
if verbose():
|
|
1767
|
+
writerr(colored(getSPACER('[ ERR ] '+url+' gave an empty response.'),'red'))
|
|
1768
|
+
return
|
|
1769
|
+
# If a status other than 200, then stop
|
|
1770
|
+
if resp.status_code != 200:
|
|
1771
|
+
if verbose():
|
|
1772
|
+
writerr(colored(getSPACER('[ '+str(resp.status_code)+' ] Error for '+url),'red'))
|
|
1773
|
+
return
|
|
1774
|
+
except ConnectionError as ce:
|
|
1775
|
+
writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error for page ' + page), 'red'))
|
|
1776
|
+
resp = None
|
|
1777
|
+
return
|
|
1778
|
+
except Exception as e:
|
|
1779
|
+
writerr(colored(getSPACER('[ ERR ] Error getting response for page ' + page + ' - ' + str(e)),'red'))
|
|
1780
|
+
resp = None
|
|
1781
|
+
return
|
|
1782
|
+
|
|
1783
|
+
# Get the URLs and MIME types. Each line is a separate JSON string
|
|
1784
|
+
for line in resp.iter_lines():
|
|
1785
|
+
results = line.decode("utf-8")
|
|
1786
|
+
# Only get MIME Types if --verbose option was selected
|
|
1787
|
+
if verbose():
|
|
1788
|
+
try:
|
|
1789
|
+
linkMimes.add(str(results).split(' ')[2])
|
|
1790
|
+
except Exception as e:
|
|
1791
|
+
if verbose():
|
|
1792
|
+
writerr(colored(getSPACER('ERROR processWayBackPage 2: Cannot get MIME type from line: ' + str(line)),'red'))
|
|
1793
|
+
write(resp.text)
|
|
1794
|
+
try:
|
|
1795
|
+
foundUrl = fixArchiveOrgUrl(str(results).split(' ')[1])
|
|
1796
|
+
linksFoundAdd(foundUrl)
|
|
1797
|
+
except Exception as e:
|
|
1798
|
+
if verbose():
|
|
1799
|
+
writerr(colored(getSPACER('ERROR processWayBackPage 3: Cannot get link from line: ' + str(line)),'red'))
|
|
1800
|
+
write(resp.text)
|
|
1801
|
+
else:
|
|
1802
|
+
pass
|
|
1803
|
+
except Exception as e:
|
|
1804
|
+
if verbose():
|
|
1805
|
+
writerr(colored("ERROR processWayBackPage 1: " + str(e), "red"))
|
|
1806
|
+
|
|
1807
|
+
def getWaybackUrls():
|
|
1808
|
+
"""
|
|
1809
|
+
Get URLs from the Wayback Machine, archive.org
|
|
1810
|
+
"""
|
|
1811
|
+
global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages, stopSource, argsInput, checkWayback
|
|
1812
|
+
|
|
1813
|
+
# Write the file of URL's for the passed domain/URL
|
|
1814
|
+
try:
|
|
1815
|
+
stopSource = False
|
|
1816
|
+
# If there are any + in the MIME types, e.g. image/svg+xml, then replace the + with a . otherwise the wayback API does not recognise it
|
|
1817
|
+
filterMIME = '&filter=!mimetype:warc/revisit|' + re.escape(FILTER_MIME).replace(',','|').replace('+','.')
|
|
1818
|
+
if MATCH_CODE != '':
|
|
1819
|
+
filterCode = '&filter=statuscode:' + re.escape(MATCH_CODE).replace(',','|')
|
|
1820
|
+
else:
|
|
1821
|
+
filterCode = '&filter=!statuscode:' + re.escape(FILTER_CODE).replace(',','|')
|
|
1822
|
+
|
|
1823
|
+
# Set keywords filter if -ko argument passed
|
|
1824
|
+
filterKeywords = ''
|
|
1825
|
+
if args.keywords_only:
|
|
1826
|
+
if args.keywords_only == '#CONFIG':
|
|
1827
|
+
filterKeywords = '&filter=original:.*(' + re.escape(FILTER_KEYWORDS).replace(',','|') + ').*'
|
|
1828
|
+
else:
|
|
1829
|
+
filterKeywords = '&filter=original:.*(' + args.keywords_only + ').*'
|
|
1830
|
+
|
|
1831
|
+
if args.filter_responses_only:
|
|
1832
|
+
url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}','') + '&page='
|
|
1833
|
+
else:
|
|
1834
|
+
url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}','') + filterMIME + filterCode + filterKeywords + '&page='
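# Illustrative example of the filter parts built above (made-up config values, and assuming a recent
# Python where re.escape() leaves ',' and '/' alone):
#   FILTER_MIME = 'image/jpeg,image/svg+xml' -> '&filter=!mimetype:warc/revisit|image/jpeg|image/svg\.xml'
#                                               (the + is escaped and then swapped for a . as noted above)
#   FILTER_CODE = '404,301'                  -> '&filter=!statuscode:404|301'
#   FILTER_KEYWORDS = 'admin,login' with -ko -> '&filter=original:.*(admin|login).*'
# These are only appended when -f / --filter-responses-only was not passed.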
|
|
1835
|
+
|
|
1836
|
+
# Get the number of pages (i.e. separate requests) that are going to be made to archive.org
|
|
1837
|
+
totalPages = 0
|
|
1838
|
+
try:
|
|
1839
|
+
if not args.check_only:
|
|
1840
|
+
write(colored('\rGetting the number of Wayback Machine (archive.org) pages to search...\r','cyan'))
|
|
1841
|
+
# Choose a random user agent string to use for any requests
|
|
1842
|
+
userAgent = random.choice(USER_AGENT)
|
|
1843
|
+
session = requests.Session()
|
|
1844
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
1845
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
1846
|
+
resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
|
|
1847
|
+
totalPages = int(resp.text.strip())
|
|
1848
|
+
|
|
1849
|
+
# If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
|
|
1850
|
+
if args.limit_requests != 0 and totalPages > args.limit_requests:
|
|
1851
|
+
totalPages = args.limit_requests
|
|
1852
|
+
except Exception as e:
|
|
1853
|
+
try:
|
|
1854
|
+
# If the rate limit was reached end now
|
|
1855
|
+
if resp.status_code == 429:
|
|
1856
|
+
writerr(colored(getSPACER('[ 429 ] Wayback Machine (Archive.org) rate limit reached so unable to get links.'),'red'))
|
|
1857
|
+
return
|
|
1858
|
+
|
|
1859
|
+
# If a status of 503 is returned, then the site is unavailable
|
|
1860
|
+
if resp.status_code == 503:
|
|
1861
|
+
writerr(colored(getSPACER('[ 503 ] Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.'),'red'))
|
|
1862
|
+
return
|
|
1863
|
+
|
|
1864
|
+
if resp.text.lower().find('blocked site error') > 0:
|
|
1865
|
+
writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)'), 'red'))
|
|
1866
|
+
else:
|
|
1867
|
+
writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(resp.text.strip())), 'red'))
|
|
1868
|
+
except:
|
|
1869
|
+
if str(e).lower().find('alert access denied') >= 0:
|
|
1870
|
+
writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don\'t know, but it has happened!)'), 'red'))
|
|
1871
|
+
elif str(e).lower().find('connection refused') >= 0:
|
|
1872
|
+
writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)'), 'red'))
|
|
1873
|
+
else:
|
|
1874
|
+
writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
|
|
1875
|
+
return
|
|
1876
|
+
|
|
1877
|
+
if args.check_only:
|
|
1878
|
+
checkWayback = totalPages
|
|
1879
|
+
write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
|
|
1880
|
+
else:
|
|
1881
|
+
if verbose():
|
|
1882
|
+
write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
|
|
1883
|
+
|
|
1884
|
+
# Show how many API requests will be made to get the links
|
|
1885
|
+
write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
|
|
1886
|
+
|
|
1887
|
+
# Get a list of all the page URLs we need to visit
|
|
1888
|
+
pages = []
|
|
1889
|
+
if totalPages == 1:
|
|
1890
|
+
pages.append(url)
|
|
1891
|
+
else:
|
|
1892
|
+
for page in range(0, totalPages):
|
|
1893
|
+
pages.append(url+str(page))
|
|
1894
|
+
|
|
1895
|
+
# Process the URLs from web archive
|
|
1896
|
+
if stopProgram is None:
|
|
1897
|
+
p = mp.Pool(args.processes)
|
|
1898
|
+
p.map(processWayBackPage, pages)
|
|
1899
|
+
p.close()
|
|
1900
|
+
p.join()
|
|
1901
|
+
|
|
1902
|
+
# Show the MIME types found (in case user wants to exclude more)
|
|
1903
|
+
if verbose() and len(linkMimes) > 0:
|
|
1904
|
+
linkMimes.discard('warc/revisit')
|
|
1905
|
+
write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
|
|
1906
|
+
linkMimes = None
|
|
1907
|
+
|
|
1908
|
+
if not args.xwm:
|
|
1909
|
+
linkCount = len(linksFound)
|
|
1910
|
+
write(getSPACER(colored('Links found on Wayback Machine (archive.org): ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
1911
|
+
|
|
1912
|
+
except Exception as e:
|
|
1913
|
+
writerr(colored('ERROR getWaybackUrls 1: ' + str(e), 'red'))
|
|
1914
|
+
|
|
1915
|
+
def processCommonCrawlCollection(cdxApiUrl):
|
|
1916
|
+
"""
|
|
1917
|
+
Get URLs from a given Common Crawl index collection
|
|
1918
|
+
"""
|
|
1919
|
+
global subs, path, linksFound, linkMimes, stopSource, argsInput
|
|
1920
|
+
|
|
1921
|
+
try:
|
|
1922
|
+
# Get memory in case it exceeds threshold
|
|
1923
|
+
getMemory()
|
|
1924
|
+
|
|
1925
|
+
if not stopSource:
|
|
1926
|
+
# Set mime content type filter
|
|
1927
|
+
filterMIME = '&filter=!~mime:(warc/revisit|'
|
|
1928
|
+
if FILTER_MIME.strip() != '':
|
|
1929
|
+
filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
|
|
1930
|
+
filterMIME = filterMIME + ')'
|
|
1931
|
+
|
|
1932
|
+
# Set status code filter
|
|
1933
|
+
filterCode = ''
|
|
1934
|
+
if MATCH_CODE.strip() != '':
|
|
1935
|
+
filterCode = '&filter=~status:(' + re.escape(MATCH_CODE).replace(',','|') + ')'
|
|
1936
|
+
else:
|
|
1937
|
+
filterCode = '&filter=!~status:(' + re.escape(FILTER_CODE).replace(',','|') + ')'
|
|
1938
|
+
|
|
1939
|
+
# Set keywords filter if -ko argument passed
|
|
1940
|
+
filterKeywords = ''
|
|
1941
|
+
if args.keywords_only:
|
|
1942
|
+
if args.keywords_only == '#CONFIG':
|
|
1943
|
+
filterKeywords = '&filter=~url:.*(' + re.escape(FILTER_KEYWORDS).replace(',','|') + ').*'
|
|
1944
|
+
else:
|
|
1945
|
+
filterKeywords = '&filter=~url:.*(' + args.keywords_only + ').*'
|
|
1946
|
+
|
|
1947
|
+
commonCrawlUrl = cdxApiUrl + '?output=json&fl=timestamp,url,mime,status,digest&url='
|
|
1948
|
+
|
|
1949
|
+
if args.filter_responses_only:
|
|
1950
|
+
url = commonCrawlUrl + subs + quote(argsInput) + path
|
|
1951
|
+
else:
|
|
1952
|
+
url = commonCrawlUrl + subs + quote(argsInput) + path + filterMIME + filterCode + filterKeywords
|
|
1953
|
+
|
|
1954
|
+
try:
|
|
1955
|
+
# Choose a random user agent string to use for any requests
|
|
1956
|
+
userAgent = random.choice(USER_AGENT)
|
|
1957
|
+
session = requests.Session()
|
|
1958
|
+
session.mount('https://', HTTP_ADAPTER_CC)
|
|
1959
|
+
session.mount('http://', HTTP_ADAPTER_CC)
|
|
1960
|
+
resp = session.get(url, stream=True, headers={"User-Agent":userAgent})
|
|
1961
|
+
except ConnectionError as ce:
|
|
1962
|
+
writerr(colored(getSPACER('[ ERR ] Common Crawl connection error for index '+cdxApiUrl), 'red'))
|
|
1963
|
+
resp = None
|
|
1964
|
+
return
|
|
1965
|
+
except Exception as e:
|
|
1966
|
+
writerr(colored(getSPACER('[ ERR ] Error getting response - ' + str(e)),'red'))
|
|
1967
|
+
resp = None
|
|
1968
|
+
return
|
|
1969
|
+
finally:
|
|
1970
|
+
try:
|
|
1971
|
+
if resp is not None:
|
|
1972
|
+
# If a status of 429 is returned, stop processing Common Crawl
|
|
1973
|
+
if resp.status_code == 429:
|
|
1974
|
+
writerr(colored(getSPACER('[ 429 ] Common Crawl rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
|
|
1975
|
+
stopSource = True
|
|
1976
|
+
return
|
|
1977
|
+
# If the response from commoncrawl.org says nothing was found...
|
|
1978
|
+
if resp.text.lower().find('no captures found') > 0:
|
|
1979
|
+
# Don't output any messages, just exit function
|
|
1980
|
+
return
|
|
1981
|
+
# If the response from commoncrawl.org is empty, then stop
|
|
1982
|
+
if resp.text == '':
|
|
1983
|
+
if verbose():
|
|
1984
|
+
writerr(colored(getSPACER('[ ERR ] '+url+' gave an empty response.'),'red'))
|
|
1985
|
+
return
|
|
1986
|
+
# If a status other than 200, then stop
|
|
1987
|
+
if resp.status_code != 200:
|
|
1988
|
+
if verbose():
|
|
1989
|
+
writerr(colored(getSPACER('[ '+str(resp.status_code)+' ] Error for '+cdxApiUrl),'red'))
|
|
1990
|
+
return
|
|
1991
|
+
except:
|
|
1992
|
+
pass
|
|
1993
|
+
|
|
1994
|
+
# Get the URLs and MIME types
|
|
1995
|
+
for line in resp.iter_lines():
|
|
1996
|
+
results = line.decode("utf-8")
|
|
1997
|
+
try:
|
|
1998
|
+
data = json.loads(results)
|
|
1999
|
+
# Get MIME Types if --verbose option was selected
|
|
2000
|
+
if verbose():
|
|
2001
|
+
try:
|
|
2002
|
+
linkMimes.add(data['mime'])
|
|
2003
|
+
except:
|
|
2004
|
+
pass
|
|
2005
|
+
linksFoundAdd(data['url'])
|
|
2006
|
+
except Exception as e:
|
|
2007
|
+
if verbose():
|
|
2008
|
+
writerr(colored('ERROR processCommonCrawlCollection 2: Cannot get URL and MIME type from line: ' + str(line),'red'))
|
|
2009
|
+
else:
|
|
2010
|
+
pass
|
|
2011
|
+
except Exception as e:
|
|
2012
|
+
writerr(colored('ERROR processCommonCrawlCollection 1: ' + str(e), 'red'))
|
|
2013
|
+
|
|
2014
|
+
def getCommonCrawlIndexes():
|
|
2015
|
+
"""
|
|
2016
|
+
Requests the Common Crawl index file "collinfo.json" if it is not cached locally, or if the local file is older than a month.
|
|
2017
|
+
"""
|
|
2018
|
+
try:
|
|
2019
|
+
# Check if a local copy of the index file exists
|
|
2020
|
+
createFile = False
|
|
2021
|
+
collinfoPath = str(Path(__file__).parent.resolve())+'/collinfo.json'
|
|
2022
|
+
if os.path.exists(collinfoPath):
|
|
2023
|
+
# Check if the file was created over a month ago
|
|
2024
|
+
monthAgo = datetime.now() - timedelta(days=30)
|
|
2025
|
+
fileModTime = datetime.fromtimestamp(os.path.getctime(collinfoPath))
|
|
2026
|
+
if fileModTime < monthAgo:
|
|
2027
|
+
createFile = True
|
|
2028
|
+
# Delete the current file
|
|
2029
|
+
try:
|
|
2030
|
+
os.remove(collinfoPath)
|
|
2031
|
+
except Exception as e:
|
|
2032
|
+
writerr(colored(getSPACER('[ ERR ] Couldn\'t delete local version of Common Crawl index file: ' + str(e)), 'red'))
|
|
2033
|
+
else:
|
|
2034
|
+
createFile = True
|
|
2035
|
+
|
|
2036
|
+
# If the local file exists then read that instead of requesting the index file again
|
|
2037
|
+
if not createFile:
|
|
2038
|
+
# Read the indexes from the local file
|
|
2039
|
+
try:
|
|
2040
|
+
with open(collinfoPath,'r') as file:
|
|
2041
|
+
jsonResp = file.read()
|
|
2042
|
+
file.close()
|
|
2043
|
+
except Exception as e:
|
|
2044
|
+
createFile = True
|
|
2045
|
+
writerr(colored(getSPACER('[ ERR ] Couldn\'t read local version of Common Crawl index file: ' + str(e)),'red'))
|
|
2046
|
+
|
|
2047
|
+
# If the local file needs creating again then make a new request
|
|
2048
|
+
if createFile:
|
|
2049
|
+
try:
|
|
2050
|
+
# Choose a random user agent string to use for any requests
|
|
2051
|
+
userAgent = random.choice(USER_AGENT)
|
|
2052
|
+
session = requests.Session()
|
|
2053
|
+
session.mount('https://', HTTP_ADAPTER_CC)
|
|
2054
|
+
session.mount('http://', HTTP_ADAPTER_CC)
|
|
2055
|
+
indexes = session.get(CCRAWL_INDEX_URL, headers={"User-Agent":userAgent})
|
|
2056
|
+
except ConnectionError as ce:
|
|
2057
|
+
writerr(colored(getSPACER('[ ERR ] Common Crawl connection error getting Index file'), 'red'))
|
|
2058
|
+
return
|
|
2059
|
+
except Exception as e:
|
|
2060
|
+
writerr(colored(getSPACER('[ ERR ] Error getting Common Crawl index collection - ' + str(e)),'red'))
|
|
2061
|
+
return
|
|
2062
|
+
|
|
2063
|
+
# If the rate limit was reached end now
|
|
2064
|
+
if indexes.status_code == 429:
|
|
2065
|
+
writerr(colored(getSPACER('[ 429 ] Common Crawl rate limit reached so unable to get links.'),'red'))
|
|
2066
|
+
return
|
|
2067
|
+
# If Common Crawl is unavailable, end now
|
|
2068
|
+
elif indexes.status_code == 503:
|
|
2069
|
+
writerr(colored(getSPACER('[ 503 ] Common Crawl seems to be unavailable.'),'red'))
|
|
2070
|
+
return
|
|
2071
|
+
elif indexes.status_code != 200:
|
|
2072
|
+
writerr(colored(getSPACER('[ '+str(indexes.status_code)+' ] Common Crawl did not return the indexes file.'),'red'))
|
|
2073
|
+
return
|
|
2074
|
+
|
|
2075
|
+
# Get the returned JSON
|
|
2076
|
+
jsonResp = indexes.text
|
|
2077
|
+
|
|
2078
|
+
# Write the contents of the response to a local file so we don't request in future. Overwrite it if it exists
|
|
2079
|
+
try:
|
|
2080
|
+
f = open(collinfoPath, 'w')
|
|
2081
|
+
f.write(jsonResp)
|
|
2082
|
+
f.close()
|
|
2083
|
+
except Exception as e:
|
|
2084
|
+
writerr(colored(getSPACER('[ ERR ] Couldn\'t create local version of Common Crawl index file: ' + str(e)),'red'))
|
|
2085
|
+
|
|
2086
|
+
# Get the API URLs from the returned JSON
|
|
2087
|
+
cdxApiUrls = set()
|
|
2088
|
+
collection = 0
|
|
2089
|
+
for values in json.loads(jsonResp):
|
|
2090
|
+
for key in values:
|
|
2091
|
+
if key == 'cdx-api':
|
|
2092
|
+
if args.lcy != 0:
|
|
2093
|
+
try:
|
|
2094
|
+
indexYear = values[key].split("CC-MAIN-")[1][:4]
|
|
2095
|
+
if int(indexYear) >= args.lcy:
|
|
2096
|
+
cdxApiUrls.add(values[key])
|
|
2097
|
+
except Exception as e:
|
|
2098
|
+
writerr(colored(getSPACER('[ ERR ] Failed to get the year from index name ' + values[key] + ' - ' + str(e)),'red'))
|
|
2099
|
+
else:
|
|
2100
|
+
cdxApiUrls.add(values[key])
|
|
2101
|
+
collection = collection + 1
|
|
2102
|
+
if collection == args.lcc: break
|
|
2103
|
+
|
|
2104
|
+
return cdxApiUrls
|
|
2105
|
+
|
|
2106
|
+
except Exception as e:
|
|
2107
|
+
writerr(colored('ERROR getCommonCrawlIndexes 1: ' + str(e), 'red'))
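# Illustrative notes on the index handling above (made-up values): the local collinfo.json is only
# re-fetched when it is more than 30 days old, and with -lcy the collection year is parsed from the
# cdx-api URL before deciding whether to use it.
#   'https://index.commoncrawl.org/CC-MAIN-2023-50-index'.split('CC-MAIN-')[1][:4] -> '2023'
#   so with -lcy 2024 that collection is skipped, and with -lcy 2023 (or the default 0) it is used.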
|
|
2108
|
+
|
|
2109
|
+
def getCommonCrawlUrls():
|
|
2110
|
+
"""
|
|
2111
|
+
Get all Common Crawl index collections to get all URLs from each one
|
|
2112
|
+
"""
|
|
2113
|
+
global linksFound, linkMimes, waymorePath, subs, path, stopSource, argsInput, checkCommonCrawl
|
|
2114
|
+
|
|
2115
|
+
try:
|
|
2116
|
+
stopSource = False
|
|
2117
|
+
linkMimes = set()
|
|
2118
|
+
originalLinkCount = len(linksFound)
|
|
2119
|
+
|
|
2120
|
+
# Set mime content type filter
|
|
2121
|
+
filterMIME = '&filter=!~mime:(warc/revisit|'
|
|
2122
|
+
if FILTER_MIME.strip() != '':
|
|
2123
|
+
filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
|
|
2124
|
+
filterMIME = filterMIME + ')'
|
|
2125
|
+
|
|
2126
|
+
# Set status code filter
|
|
2127
|
+
filterCode = ''
|
|
2128
|
+
if MATCH_CODE.strip() != '':
|
|
2129
|
+
filterCode = '&filter=~status:(' + re.escape(MATCH_CODE).replace(',','|') + ')'
|
|
2130
|
+
else:
|
|
2131
|
+
filterCode = '&filter=!~status:(' + re.escape(FILTER_CODE).replace(',','|') + ')'
|
|
2132
|
+
|
|
2133
|
+
if verbose():
|
|
2134
|
+
if args.filter_responses_only:
|
|
2135
|
+
url = '{CDX-API-URL}?output=json&fl=timestamp,url,mime,status,digest&url=' + subs + quote(argsInput) + path
|
|
2136
|
+
else:
|
|
2137
|
+
url = '{CDX-API-URL}?output=json&fl=timestamp,url,mime,status,digest&url=' + subs + quote(argsInput) + path + filterMIME + filterCode
|
|
2138
|
+
write(colored('The commoncrawl index URL requested to get links (where {CDX-API-URL} is from ' + CCRAWL_INDEX_URL + '): ','magenta')+colored(url+'\n','white'))
|
|
2139
|
+
|
|
2140
|
+
if not args.check_only:
|
|
2141
|
+
write(colored('\rGetting commoncrawl.org index collections list...\r','cyan'))
|
|
2142
|
+
|
|
2143
|
+
# Get the Common Crawl index collections
|
|
2144
|
+
cdxApiUrls = getCommonCrawlIndexes()
|
|
2145
|
+
|
|
2146
|
+
if args.check_only:
|
|
2147
|
+
if args.lcc < len(cdxApiUrls):
|
|
2148
|
+
checkCommonCrawl = args.lcc+1
|
|
2149
|
+
else:
|
|
2150
|
+
checkCommonCrawl = len(cdxApiUrls)+1
|
|
2151
|
+
write(colored('Get URLs from Common Crawl: ','cyan')+colored(str(checkCommonCrawl)+' requests','white'))
|
|
2152
|
+
else:
|
|
2153
|
+
write(colored('\rGetting links from the latest ' + str(len(cdxApiUrls)) + ' commoncrawl.org index collections (this can take a while for some domains)...\r','cyan'))
|
|
2154
|
+
|
|
2155
|
+
# Process the URLs from common crawl
|
|
2156
|
+
if stopProgram is None:
|
|
2157
|
+
p = mp.Pool(args.processes)
|
|
2158
|
+
p.map(processCommonCrawlCollection, cdxApiUrls)
|
|
2159
|
+
p.close()
|
|
2160
|
+
p.join()
|
|
2161
|
+
|
|
2162
|
+
# Show the MIME types found (in case user wants to exclude more)
|
|
2163
|
+
if verbose() and len(linkMimes) > 0:
|
|
2164
|
+
linkMimes.discard('warc/revisit')
|
|
2165
|
+
write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
|
|
2166
|
+
|
|
2167
|
+
linkCount = len(linksFound) - originalLinkCount
|
|
2168
|
+
if args.xwm:
|
|
2169
|
+
write(getSPACER(colored('Links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
2170
|
+
else:
|
|
2171
|
+
write(getSPACER(colored('Extra links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
2172
|
+
|
|
2173
|
+
except Exception as e:
|
|
2174
|
+
writerr(colored('ERROR getCommonCrawlUrls 1: ' + str(e), 'red'))
|
|
2175
|
+
|
|
2176
|
+
def processVirusTotalUrl(url):
|
|
2177
|
+
"""
|
|
2178
|
+
Process a specific URL from virustotal.com to determine whether to save the link
|
|
2179
|
+
"""
|
|
2180
|
+
global argsInput, argsInputHostname
|
|
2181
|
+
|
|
2182
|
+
addLink = True
|
|
2183
|
+
|
|
2184
|
+
# If the url passed doesn't have a scheme, prefix with http://
|
|
2185
|
+
match = re.search(r'^[A-Za-z]*\:\/\/', url, flags=re.IGNORECASE)
|
|
2186
|
+
if match is None:
|
|
2187
|
+
url = 'http://'+url
|
|
2188
|
+
|
|
2189
|
+
try:
|
|
2190
|
+
# If filters are required then test them
|
|
2191
|
+
if not args.filter_responses_only:
|
|
2192
|
+
|
|
2193
|
+
# If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
|
|
2194
|
+
if args.no_subs:
|
|
2195
|
+
match = re.search(r'^[A-Za-z]*\:\/\/(www\.)?'+re.escape(argsInputHostname), url, flags=re.IGNORECASE)
|
|
2196
|
+
if match is None:
|
|
2197
|
+
addLink = False
|
|
2198
|
+
|
|
2199
|
+
# If the user didn't request -f / --filter-responses-only then check the http code
|
|
2200
|
+
# Note we can't check MIME filter because it is not returned by VirusTotal API
|
|
2201
|
+
if addLink and not args.filter_responses_only:
|
|
2202
|
+
|
|
2203
|
+
# Check the URL exclusions
|
|
2204
|
+
if addLink:
|
|
2205
|
+
match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', url, flags=re.IGNORECASE)
|
|
2206
|
+
if match is not None:
|
|
2207
|
+
addLink = False
|
|
2208
|
+
|
|
2209
|
+
# Set keywords filter if -ko argument passed
|
|
2210
|
+
if addLink and args.keywords_only:
|
|
2211
|
+
if args.keywords_only == '#CONFIG':
|
|
2212
|
+
match = re.search(r'('+re.escape(FILTER_KEYWORDS).replace(',','|')+')', url, flags=re.IGNORECASE)
|
|
2213
|
+
else:
|
|
2214
|
+
match = re.search(r'('+args.keywords_only+')', url, flags=re.IGNORECASE)
|
|
2215
|
+
if match is None:
|
|
2216
|
+
addLink = False
|
|
2217
|
+
|
|
2218
|
+
# Add link if it passed filters
|
|
2219
|
+
if addLink:
|
|
2220
|
+
# Just get the hostname of the url
|
|
2221
|
+
tldExtract = tldextract.extract(url)
|
|
2222
|
+
subDomain = tldExtract.subdomain
|
|
2223
|
+
if subDomain != '':
|
|
2224
|
+
subDomain = subDomain+'.'
|
|
2225
|
+
domainOnly = subDomain+tldExtract.domain+'.'+tldExtract.suffix
|
|
2226
|
+
|
|
2227
|
+
# VirusTotal might return URLs that aren't for the domain passed so we need to check for those and not process them
|
|
2228
|
+
# Check the URL
|
|
2229
|
+
match = re.search(r'(^|\.)'+re.escape(argsInputHostname)+'$', domainOnly, flags=re.IGNORECASE)
|
|
2230
|
+
if match is not None:
|
|
2231
|
+
linksFoundAdd(url)
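# Illustrative note (not part of the original source): tldextract splits a URL into
# subdomain / domain / suffix, e.g. tldextract.extract('http://api.dev.example.co.uk/x')
# gives subdomain 'api.dev', domain 'example' and suffix 'co.uk', so domainOnly becomes
# 'api.dev.example.co.uk'. The regex (^|\.)example.co.uk$ (built from argsInputHostname,
# shown here with a hypothetical target) then matches that host but not 'notexample.co.uk',
# which is how unrelated VirusTotal results are dropped.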
|
|
2232
|
+
|
|
2233
|
+
except Exception as e:
|
|
2234
|
+
writerr(colored('ERROR processVirusTotalUrl 1: ' + str(e), 'red'))
|
|
2235
|
+
|
|
2236
|
+
def getVirusTotalUrls():
|
|
2237
|
+
"""
|
|
2238
|
+
Get URLs from the VirusTotal API v2
|
|
2239
|
+
"""
|
|
2240
|
+
global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkVirusTotal, argsInputHostname
|
|
2241
|
+
|
|
2242
|
+
# Write the file of URLs for the passed domain/URL
|
|
2243
|
+
try:
|
|
2244
|
+
requestsMade = 0
|
|
2245
|
+
stopSource = False
|
|
2246
|
+
linkMimes = set()
|
|
2247
|
+
originalLinkCount = len(linksFound)
|
|
2248
|
+
|
|
2249
|
+
# Just pass the hostname in the URL
|
|
2250
|
+
url = VIRUSTOTAL_URL.replace('{DOMAIN}',quote(argsInputHostname)).replace('{APIKEY}',VIRUSTOTAL_API_KEY)
|
|
2251
|
+
|
|
2252
|
+
if verbose():
|
|
2253
|
+
write(colored('The VirusTotal URL requested to get links: ','magenta')+colored(url+'\n','white'))
|
|
2254
|
+
|
|
2255
|
+
if not args.check_only:
|
|
2256
|
+
write(colored('\rGetting links from virustotal.com API...\r','cyan'))
|
|
2257
|
+
|
|
2258
|
+
# Get the domain report from virustotal
|
|
2259
|
+
try:
|
|
2260
|
+
# Choose a random user agent string to use for any requests
|
|
2261
|
+
userAgent = random.choice(USER_AGENT)
|
|
2262
|
+
session = requests.Session()
|
|
2263
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
2264
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
2265
|
+
# Make the request to the VirusTotal API (the API key is passed in the URL, not a header)
|
|
2266
|
+
resp = session.get(url, headers={'User-Agent':userAgent})
|
|
2267
|
+
requestsMade = requestsMade + 1
|
|
2268
|
+
except Exception as e:
|
|
2269
|
+
write(colored(getSPACER('[ ERR ] Unable to get links from virustotal.com: ' + str(e)), 'red'))
|
|
2270
|
+
return
|
|
2271
|
+
|
|
2272
|
+
# Deal with any errors
|
|
2273
|
+
if resp.status_code == 429:
|
|
2274
|
+
writerr(colored(getSPACER('[ 429 ] VirusTotal rate limit reached so unable to get links.'),'red'))
|
|
2275
|
+
return
|
|
2276
|
+
elif resp.status_code == 403:
|
|
2277
|
+
writerr(colored(getSPACER('[ 403 ] VirusTotal: Permission denied. Check your API key is correct.'),'red'))
|
|
2278
|
+
return
|
|
2279
|
+
elif resp.status_code != 200:
|
|
2280
|
+
writerr(colored(getSPACER('[ ' + str(resp.status_code) + ' ] Unable to get links from virustotal.com'),'red'))
|
|
2281
|
+
return
|
|
2282
|
+
|
|
2283
|
+
# Get the JSON response
|
|
2284
|
+
jsonResp = json.loads(resp.text.strip())
|
|
2285
|
+
|
|
2286
|
+
# Get the different URLs
|
|
2287
|
+
if args.no_subs:
|
|
2288
|
+
subDomains = []
|
|
2289
|
+
else:
|
|
2290
|
+
try:
|
|
2291
|
+
subDomains = jsonResp['subdomains']
|
|
2292
|
+
except Exception as e:
|
|
2293
|
+
subDomains = []
|
|
2294
|
+
try:
|
|
2295
|
+
detectedUrls = [entry['url'] for entry in jsonResp.get('detected_urls', [])]
|
|
2296
|
+
except Exception as e:
|
|
2297
|
+
detectedUrls = []
|
|
2298
|
+
try:
|
|
2299
|
+
undetectedUrls = [entry[0] for entry in jsonResp.get('undetected_urls', [])]
|
|
2300
|
+
except Exception as e:
|
|
2301
|
+
undetectedUrls = []
|
|
2302
|
+
try:
|
|
2303
|
+
totalUrls = set(subDomains + detectedUrls + undetectedUrls)
|
|
2304
|
+
except Exception as e:
|
|
2305
|
+
totalUrls = []
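# Illustrative sketch (not part of the original source) of the VirusTotal v2 domain report
# fields used above, based on how they are parsed here; values are hypothetical:
#   'subdomains':      ['api.example.com', 'dev.example.com']
#   'detected_urls':   [{'url': 'http://example.com/x', ...}, ...]            -> entry['url']
#   'undetected_urls': [['http://example.com/y', '<sha256>', ...], ...]       -> entry[0]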
|
|
2306
|
+
|
|
2307
|
+
if args.check_only:
|
|
2308
|
+
write(colored('Get URLs from VirusTotal: ','cyan')+colored('1 request','white'))
|
|
2309
|
+
checkVirusTotal = 1
|
|
2310
|
+
else:
|
|
2311
|
+
# Carry on if something was found
|
|
2312
|
+
for vturl in totalUrls:
|
|
2313
|
+
|
|
2314
|
+
if stopSource:
|
|
2315
|
+
break
|
|
2316
|
+
|
|
2317
|
+
# Get memory in case it exceeds threshold
|
|
2318
|
+
getMemory()
|
|
2319
|
+
|
|
2320
|
+
# Work out whether to include it
|
|
2321
|
+
processVirusTotalUrl(vturl)
|
|
2322
|
+
|
|
2323
|
+
linkCount = len(linksFound) - originalLinkCount
|
|
2324
|
+
if args.xwm and args.xcc and args.xav and args.xus:
|
|
2325
|
+
write(getSPACER(colored('Links found on virustotal.com: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
2326
|
+
else:
|
|
2327
|
+
write(getSPACER(colored('Extra links found on virustotal.com: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
|
|
2328
|
+
|
|
2329
|
+
except Exception as e:
|
|
2330
|
+
writerr(colored('ERROR getVirusTotalUrls 1: ' + str(e), 'red'))
|
|
2331
|
+
|
|
2332
|
+
def processResponses():
|
|
2333
|
+
"""
|
|
2334
|
+
Get archived responses from Wayback Machine (archive.org)
|
|
2335
|
+
"""
|
|
2336
|
+
global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory
|
|
2337
|
+
try:
|
|
2338
|
+
if not args.check_only:
|
|
2339
|
+
# Create 'results' and domain directory if needed
|
|
2340
|
+
createDirs()
|
|
2341
|
+
|
|
2342
|
+
# Get the path of the files, depending on whether -oR / --output_responses was passed
|
|
2343
|
+
try:
|
|
2344
|
+
continuePath = responseOutputDirectory + 'continueResp.tmp'
|
|
2345
|
+
responsesPath = responseOutputDirectory + 'responses.tmp'
|
|
2346
|
+
indexPath = responseOutputDirectory + 'index.txt'
|
|
2347
|
+
except Exception as e:
|
|
2348
|
+
if verbose():
|
|
2349
|
+
writerr(colored('ERROR processResponses 4: ' + str(e), 'red'))
|
|
2350
|
+
|
|
2351
|
+
# Check if a continueResp.tmp and responses.tmp files exists
|
|
2352
|
+
runPrevious = 'n'
|
|
2353
|
+
if not args.check_only and os.path.exists(continuePath) and os.path.exists(responsesPath):
|
|
2354
|
+
|
|
2355
|
+
# Load the links into the set
|
|
2356
|
+
with open(responsesPath,'rb') as fl:
|
|
2357
|
+
linkRequests = pickle.load(fl)
|
|
2358
|
+
totalPrevResponses = len(linkRequests)
|
|
2359
|
+
|
|
2360
|
+
# Get the previous end position to start again at this point
|
|
2361
|
+
try:
|
|
2362
|
+
with open(continuePath,'r') as fc:
|
|
2363
|
+
successCount = int(fc.readline().strip())
|
|
2364
|
+
except Exception as e:
|
|
2365
|
+
successCount = 0
|
|
2366
|
+
|
|
2367
|
+
# Ask the user if we should continue with previous run if the current starting position is greater than 0 and less than the total
|
|
2368
|
+
if successCount > 0 and successCount < totalPrevResponses:
|
|
2369
|
+
# If the program is not piped from or to another process, then ask whether to continue with previous run
|
|
2370
|
+
if sys.stdout.isatty() and sys.stdin.isatty():
|
|
2371
|
+
write(colored('The previous run to get archived responses for ' + argsInput + ' was not completed.\nYou can start from response ' + str(successCount) + ' of ' + str(totalPrevResponses) + ' for the previous run, or you can start a new run with your specified arguments.', 'yellow'))
|
|
2372
|
+
runPrevious = input('Continue with previous run? y/n: ')
|
|
2373
|
+
else:
|
|
2374
|
+
if CONTINUE_RESPONSES_IF_PIPED:
|
|
2375
|
+
runPrevious = 'y'
|
|
2376
|
+
writerr(colored('The previous run to get archived responses for ' + argsInput + ' was not completed. Starting from response ' + str(successCount) + ' of ' + str(totalPrevResponses) + '... ', 'yellow'))
|
|
2377
|
+
else:
|
|
2378
|
+
runPrevious = 'n'
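# Illustrative note (not part of the original source): the resume mechanism above relies on
# two files in the response output directory -
#   responses.tmp    - a pickled list of 'timestamp/originalUrl' strings to download
#   continueResp.tmp - a plain text integer, the index of the last successfully saved response
# so a run that stopped at 1500 of 4000 responses can pick up again from item 1500 next time.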
|
|
2379
|
+
|
|
2380
|
+
# If we are going to run a new run
|
|
2381
|
+
if runPrevious.lower() == 'n':
|
|
2382
|
+
|
|
2383
|
+
# Set start point
|
|
2384
|
+
successCount = 0
|
|
2385
|
+
|
|
2386
|
+
# Set up filters
|
|
2387
|
+
filterLimit = '&limit=' + str(args.limit)
|
|
2388
|
+
if args.from_date is None:
|
|
2389
|
+
filterFrom = ''
|
|
2390
|
+
else:
|
|
2391
|
+
filterFrom = '&from=' + str(args.from_date)
|
|
2392
|
+
if args.to_date is None:
|
|
2393
|
+
filterTo = ''
|
|
2394
|
+
else:
|
|
2395
|
+
filterTo = '&to=' + str(args.to_date)
|
|
2396
|
+
|
|
2397
|
+
# Set keywords filter if -ko argument passed
|
|
2398
|
+
filterKeywords = ''
|
|
2399
|
+
if args.keywords_only:
|
|
2400
|
+
if args.keywords_only == '#CONFIG':
|
|
2401
|
+
filterKeywords = '&filter=original:.*(' + re.escape(FILTER_KEYWORDS).replace(',','|') + ').*'
|
|
2402
|
+
else:
|
|
2403
|
+
filterKeywords = '&filter=original:.*(' + args.keywords_only + ').*'
|
|
2404
|
+
|
|
2405
|
+
# Get the list again with filters and include timestamp
|
|
2406
|
+
linksFound = set()
|
|
2407
|
+
|
|
2408
|
+
# Set mime content type filter
|
|
2409
|
+
filterMIME = '&filter=!mimetype:warc/revisit'
|
|
2410
|
+
if FILTER_MIME.strip() != '':
|
|
2411
|
+
filterMIME = filterMIME + '|' + re.escape(FILTER_MIME).replace(',','|')
|
|
2412
|
+
|
|
2413
|
+
# Set status code filter
|
|
2414
|
+
filterCode = ''
|
|
2415
|
+
if MATCH_CODE.strip() != '':
|
|
2416
|
+
filterCode = '&filter=statuscode:' + re.escape(MATCH_CODE).replace(',','|')
|
|
2417
|
+
else:
|
|
2418
|
+
filterCode = '&filter=!statuscode:' + re.escape(FILTER_CODE).replace(',','|')
|
|
2419
|
+
|
|
2420
|
+
# Set the collapse parameter value in the archive.org URL. From the Wayback API docs:
|
|
2421
|
+
# "A new form of filtering is the option to 'collapse' results based on a field, or a substring of a field.
|
|
2422
|
+
# Collapsing is done on adjacent cdx lines where all captures after the first one that are duplicate are filtered out.
|
|
2423
|
+
# This is useful for filtering out captures that are 'too dense' or when looking for unique captures."
|
|
2424
|
+
if args.capture_interval == 'none': # get all
|
|
2425
|
+
collapse = ''
|
|
2426
|
+
elif args.capture_interval == 'h': # get at most 1 capture per hour
|
|
2427
|
+
collapse = 'timestamp:10'
|
|
2428
|
+
elif args.capture_interval == 'd': # get at most 1 capture per day
|
|
2429
|
+
collapse = 'timestamp:8'
|
|
2430
|
+
elif args.capture_interval == 'm': # get at most 1 capture per month
|
|
2431
|
+
collapse = 'timestamp:6'
|
|
2432
|
+
|
|
2433
|
+
url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
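# Illustrative note (not part of the original source): assuming WAYBACK_URL points at the
# archive.org CDX API (it is defined elsewhere in this file), the fully substituted request
# for a hypothetical target with the default daily capture interval would look roughly like:
#   https://web.archive.org/cdx/search/cdx?url=*.example.com&collapse=timestamp:8&filter=!mimetype:warc/revisit&filter=!statuscode:<FILTER_CODE values>&limit=5000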
|
|
2434
|
+
|
|
2435
|
+
if verbose():
|
|
2436
|
+
write(colored('The archive URL requested to get responses: ','magenta')+colored(url+'\n','white'))
|
|
2437
|
+
|
|
2438
|
+
if args.check_only:
|
|
2439
|
+
write(colored('\rChecking archived response requests...\r','cyan'))
|
|
2440
|
+
else:
|
|
2441
|
+
write(colored('\rGetting list of response links (this can take a while for some domains)...\r','cyan'))
|
|
2442
|
+
|
|
2443
|
+
# Build the list of links, concatenating timestamp and URL
|
|
2444
|
+
try:
|
|
2445
|
+
# Choose a random user agent string to use for any requests
|
|
2446
|
+
success = True
|
|
2447
|
+
userAgent = random.choice(USER_AGENT)
|
|
2448
|
+
session = requests.Session()
|
|
2449
|
+
session.mount('https://', HTTP_ADAPTER)
|
|
2450
|
+
session.mount('http://', HTTP_ADAPTER)
|
|
2451
|
+
resp = session.get(url, stream=True, headers={"User-Agent":userAgent}, timeout=args.timeout)
|
|
2452
|
+
except ConnectionError as ce:
|
|
2453
|
+
writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error'), 'red'))
|
|
2454
|
+
resp = None
|
|
2455
|
+
success = False
|
|
2456
|
+
return
|
|
2457
|
+
except Exception as e:
|
|
2458
|
+
writerr(colored(getSPACER('[ ERR ] Couldn\'t get list of responses: ' + str(e)),'red'))
|
|
2459
|
+
resp = None
|
|
2460
|
+
success = False
|
|
2461
|
+
return
|
|
2462
|
+
finally:
|
|
2463
|
+
try:
|
|
2464
|
+
if resp is not None:
|
|
2465
|
+
# If the response from archive.org is empty, then no responses were found
|
|
2466
|
+
if resp.text == '':
|
|
2467
|
+
writerr(colored(getSPACER('No archived responses were found on Wayback Machine (archive.org) for the given search parameters.'),'red'))
|
|
2468
|
+
success = False
|
|
2469
|
+
# If a status of 429 was returned, stop processing Wayback Machine responses
|
|
2470
|
+
if resp.status_code == 429:
|
|
2471
|
+
writerr(colored(getSPACER('[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
|
|
2472
|
+
success = False
|
|
2473
|
+
# If a status of 503 was returned, then the site is unavailable
|
|
2474
|
+
elif resp.status_code == 503:
|
|
2475
|
+
writerr(colored(getSPACER('[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.'),'red'))
|
|
2476
|
+
success = False
|
|
2477
|
+
# If a status other than 200, then stop
|
|
2478
|
+
elif resp.status_code != 200:
|
|
2479
|
+
if verbose():
|
|
2480
|
+
writerr(colored(getSPACER('[ '+str(resp.status_code)+' ] Error for '+url),'red'))
|
|
2481
|
+
success = False
|
|
2482
|
+
if not success:
|
|
2483
|
+
if args.keywords_only:
|
|
2484
|
+
if args.keywords_only == '#CONFIG':
|
|
2485
|
+
writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml'), 'red'))
|
|
2486
|
+
else:
|
|
2487
|
+
writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed'), 'red'))
|
|
2488
|
+
else:
|
|
2489
|
+
if resp.text.lower().find('blocked site error') > 0:
|
|
2490
|
+
writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)'), 'red'))
|
|
2491
|
+
else:
|
|
2492
|
+
writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - check input domain and try again.'), 'red'))
|
|
2493
|
+
return
|
|
2494
|
+
except:
|
|
2495
|
+
pass
|
|
2496
|
+
|
|
2497
|
+
# Go through the response to save the links found
|
|
2498
|
+
for line in resp.iter_lines():
|
|
2499
|
+
try:
|
|
2500
|
+
results = line.decode("utf-8")
|
|
2501
|
+
timestamp = results.split(' ')[0]
|
|
2502
|
+
originalUrl = results.split(' ')[1]
|
|
2503
|
+
linksFoundAdd(timestamp+'/'+originalUrl)
|
|
2504
|
+
except Exception as e:
|
|
2505
|
+
writerr(colored(getSPACER('ERROR processResponses 3: Cannot get link from line: '+str(line)), 'red'))
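# Illustrative note (not part of the original source): each line returned here is assumed to
# be a space separated timestamp and original URL, e.g. a (hypothetical) line of
#   '20230115123456 http://example.com/login'
# is stored in linksFound as '20230115123456/http://example.com/login' so that exact
# timestamped snapshot can be requested later.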
|
|
2506
|
+
|
|
2507
|
+
# Remove any links that have URL exclusions
|
|
2508
|
+
linkRequests = []
|
|
2509
|
+
exclusionRegex = re.compile(r'('+re.escape(FILTER_URL).replace(',','|')+')',flags=re.IGNORECASE)
|
|
2510
|
+
for link in linksFound:
|
|
2511
|
+
# Only add the link if:
|
|
2512
|
+
# a) if the -ra --regex-after was passed that it matches that
|
|
2513
|
+
# b) it does not match the URL exclusions
|
|
2514
|
+
if (args.regex_after is None or re.search(args.regex_after, link, flags=re.IGNORECASE) is not None) and exclusionRegex.search(link) is None:
|
|
2515
|
+
linkRequests.append(link)
|
|
2516
|
+
|
|
2517
|
+
# Write the links to a temp file
|
|
2518
|
+
if not args.check_only:
|
|
2519
|
+
with open(responsesPath,'wb') as f:
|
|
2520
|
+
pickle.dump(linkRequests, f)
|
|
2521
|
+
|
|
2522
|
+
# Get the total number of responses we will try to get and set the current file count to the success count
|
|
2523
|
+
totalResponses = len(linkRequests)
|
|
2524
|
+
fileCount = successCount
|
|
2525
|
+
|
|
2526
|
+
if args.check_only:
|
|
2527
|
+
if args.limit == 5000 and totalResponses+1 == 5000:
|
|
2528
|
+
writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests (the --limit argument defaults to '+str(DEFAULT_LIMIT)+')','cyan'))
|
|
2529
|
+
else:
|
|
2530
|
+
writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests','white'))
|
|
2531
|
+
minutes = round(totalResponses*2.5 // 60)
|
|
2532
|
+
hours = minutes // 60
|
|
2533
|
+
days = hours // 24
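# Illustrative worked example (not part of the original source): the estimate assumes roughly
# 2.5 seconds per response, e.g. 5000 responses -> 12500s -> 208 minutes -> 3 hours -> 0 days,
# which falls into the 'could take more than 3 hours' message below.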
|
|
2534
|
+
if minutes < 5:
|
|
2535
|
+
write(colored('\n-> Downloading the responses (depending on their size) should be quite quick!','green'))
|
|
2536
|
+
elif hours < 2:
|
|
2537
|
+
write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(minutes)+' minutes.','green'))
|
|
2538
|
+
elif hours < 6:
|
|
2539
|
+
write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','green'))
|
|
2540
|
+
elif hours < 24:
|
|
2541
|
+
write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','yellow'))
|
|
2542
|
+
elif days < 7:
|
|
2543
|
+
write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days. Consider using arguments -ko, -l, -ci, -from and -to wisely! ','red'))
|
|
2544
|
+
else:
|
|
2545
|
+
write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days!!! Consider using arguments -ko, -l, -ci, -from and -to wisely!','red'))
|
|
2546
|
+
write('')
|
|
2547
|
+
else:
|
|
2548
|
+
# If the limit has been set over the default, give a warning that this could take a long time!
|
|
2549
|
+
if totalResponses - successCount > DEFAULT_LIMIT:
|
|
2550
|
+
if successCount > 0:
|
|
2551
|
+
writerr(colored(getSPACER('WARNING: Downloading remaining ' + str(totalResponses - successCount) + ' responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!'),'yellow'))
|
|
2552
|
+
else:
|
|
2553
|
+
writerr(colored(getSPACER('WARNING: Downloading ' + str(totalResponses) + ' responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!'),'yellow'))
|
|
2554
|
+
|
|
2555
|
+
# Open the index file if hash value is going to be used (not URL)
|
|
2556
|
+
if not args.url_filename:
|
|
2557
|
+
indexFile = open(indexPath,'a')
|
|
2558
|
+
|
|
2559
|
+
# Open the continueResp.tmp file to store what record we are up to
|
|
2560
|
+
continueRespFile = open(continuePath,'w+')
|
|
2561
|
+
|
|
2562
|
+
# Process the URLs from web archive
|
|
2563
|
+
if stopProgram is None:
|
|
2564
|
+
p = mp.Pool(args.processes)
|
|
2565
|
+
p.map(processArchiveUrl, linkRequests[successCount:])
|
|
2566
|
+
p.close()
|
|
2567
|
+
p.join()
|
|
2568
|
+
|
|
2569
|
+
# Delete the tmp files now it has run successfully
|
|
2570
|
+
if stopProgram is None:
|
|
2571
|
+
try:
|
|
2572
|
+
os.remove(continuePath)
|
|
2573
|
+
os.remove(responsesPath)
|
|
2574
|
+
except:
|
|
2575
|
+
pass
|
|
2576
|
+
|
|
2577
|
+
# Close the index file if hash value is going to be used (not URL)
|
|
2578
|
+
if not args.url_filename:
|
|
2579
|
+
indexFile.close()
|
|
2580
|
+
|
|
2581
|
+
# Close the continueResp.tmp file
|
|
2582
|
+
continueRespFile.close()
|
|
2583
|
+
|
|
2584
|
+
except Exception as e:
|
|
2585
|
+
writerr(colored(getSPACER('ERROR processResponses 1: ' + str(e)), 'red'))
|
|
2586
|
+
finally:
|
|
2587
|
+
linkRequests = None
|
|
2588
|
+
|
|
2589
|
+
def createDirs():
|
|
2590
|
+
"""
|
|
2591
|
+
Create a directory for the 'results' and the sub directory for the passed domain/URL, unless
|
|
2592
|
+
-oR / --output-responses was passed, in which case just create those directories
|
|
2593
|
+
"""
|
|
2594
|
+
global DEFAULT_OUTPUT_DIR, argsInput
|
|
2595
|
+
try:
|
|
2596
|
+
if (args.mode in ['R','B'] and args.output_responses == '') or (args.mode in ['U','B'] and args.output_urls == ''):
|
|
2597
|
+
# Create a directory for "results" if it doesn't already exist
|
|
2598
|
+
try:
|
|
2599
|
+
results_dir = Path(DEFAULT_OUTPUT_DIR+'/results')
|
|
2600
|
+
results_dir.mkdir(exist_ok=True)
|
|
2601
|
+
except:
|
|
2602
|
+
pass
|
|
2603
|
+
# Create a directory for the target domain
|
|
2604
|
+
try:
|
|
2605
|
+
domain_dir = Path(DEFAULT_OUTPUT_DIR + '/results/' + str(argsInput).replace('/','-'))
|
|
2606
|
+
domain_dir.mkdir(parents=True, exist_ok=True)
|
|
2607
|
+
except Exception as e:
|
|
2608
|
+
pass
|
|
2609
|
+
try:
|
|
2610
|
+
# Create specified directory for -oR if required
|
|
2611
|
+
if args.output_responses != '':
|
|
2612
|
+
responseDir = Path(args.output_responses)
|
|
2613
|
+
responseDir.mkdir(parents=True, exist_ok=True)
|
|
2614
|
+
# If -oU was passed and is prefixed with a directory, create it
|
|
2615
|
+
if args.output_urls != '' and '/' in args.output_urls:
|
|
2616
|
+
directoriesOnly = os.path.dirname(args.output_urls)
|
|
2617
|
+
responseDir = Path(directoriesOnly)
|
|
2618
|
+
responseDir.mkdir(parents=True, exist_ok=True)
|
|
2619
|
+
except Exception as e:
|
|
2620
|
+
pass
|
|
2621
|
+
except Exception as e:
|
|
2622
|
+
writerr(colored(getSPACER('ERROR createDirs 1: ' + str(e)), 'red'))
|
|
2623
|
+
|
|
2624
|
+
# Get width of the progress bar based on the width of the terminal
|
|
2625
|
+
def getProgressBarLength():
|
|
2626
|
+
global terminalWidth
|
|
2627
|
+
try:
|
|
2628
|
+
if verbose():
|
|
2629
|
+
offset = 90
|
|
2630
|
+
else:
|
|
2631
|
+
offset = 50
|
|
2632
|
+
progressBarLength = terminalWidth - offset
|
|
2633
|
+
except:
|
|
2634
|
+
progressBarLength = 20
|
|
2635
|
+
return progressBarLength
|
|
2636
|
+
|
|
2637
|
+
# Get the length of the space to add to a string to fill the line up to the width of the terminal
|
|
2638
|
+
def getSPACER(text):
|
|
2639
|
+
global terminalWidth
|
|
2640
|
+
lenSpacer = terminalWidth - len(text) +5
|
|
2641
|
+
SPACER = ' ' * lenSpacer
|
|
2642
|
+
return text + SPACER
|
|
2643
|
+
|
|
2644
|
+
# For validating -m / --memory-threshold argument
|
|
2645
|
+
def argcheckPercent(value):
|
|
2646
|
+
ivalue = int(value)
|
|
2647
|
+
if ivalue > 99:
|
|
2648
|
+
raise argparse.ArgumentTypeError(
|
|
2649
|
+
"A valid integer percentage less than 100 must be entered."
|
|
2650
|
+
)
|
|
2651
|
+
return ivalue
|
|
2652
|
+
|
|
2653
|
+
def notifyDiscord():
|
|
2654
|
+
global WEBHOOK_DISCORD, args
|
|
2655
|
+
try:
|
|
2656
|
+
data = {
|
|
2657
|
+
'content': 'waymore has finished for `-i ' + args.input + ' -mode ' + args.mode + '` ! 🤘',
|
|
2658
|
+
'username': 'waymore',
|
|
2659
|
+
}
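# Illustrative note (not part of the original source): this payload follows Discord's incoming
# webhook format ('content' is the message text, 'username' overrides the webhook name), and
# WEBHOOK_DISCORD in config.yml is expected to be a full webhook URL, typically of the form
#   https://discord.com/api/webhooks/<id>/<token>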
|
|
2660
|
+
try:
|
|
2661
|
+
result = requests.post(WEBHOOK_DISCORD, json=data)
|
|
2662
|
+
if not 200 <= result.status_code < 300:
|
|
2663
|
+
writerr(colored(getSPACER('WARNING: Failed to send notification to Discord - ' + result.text), 'yellow'))
|
|
2664
|
+
except Exception as e:
|
|
2665
|
+
writerr(colored(getSPACER('WARNING: Failed to send notification to Discord - ' + str(e)), 'yellow'))
|
|
2666
|
+
except Exception as e:
|
|
2667
|
+
writerr(colored('ERROR notifyDiscord 1: ' + str(e), 'red'))
|
|
2668
|
+
|
|
2669
|
+
def checkScript(script):
|
|
2670
|
+
try:
|
|
2671
|
+
if script.replace('\n','').strip() != '':
|
|
2672
|
+
return True
|
|
2673
|
+
except Exception as e:
|
|
2674
|
+
writerr(colored('ERROR checkScript 1: ' + str(e), 'red'))
|
|
2675
|
+
|
|
2676
|
+
def extractScripts(filePath):
|
|
2677
|
+
try:
|
|
2678
|
+
with open(filePath, 'rb') as file:
|
|
2679
|
+
content = file.read().decode('utf-8', errors='ignore')
|
|
2680
|
+
scripts = re.findall(r'<script[^>]*>(.*?)</script>', content, re.DOTALL)
|
|
2681
|
+
scripts = list(filter(checkScript, scripts))
|
|
2682
|
+
return scripts
|
|
2683
|
+
except Exception as e:
|
|
2684
|
+
writerr(colored('ERROR extractScripts 1: ' + str(e), 'red'))
|
|
2685
|
+
|
|
2686
|
+
def extractExternalScripts(filePath):
|
|
2687
|
+
try:
|
|
2688
|
+
with open(filePath, 'rb') as file:
|
|
2689
|
+
content = file.read().decode('utf-8', errors='ignore')
|
|
2690
|
+
scripts = re.findall(r'<script[^>]* src="(.*?)".*?>', content, re.DOTALL)
|
|
2691
|
+
scripts = list(filter(checkScript, scripts))
|
|
2692
|
+
return scripts
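# Illustrative note (not part of the original source): the regex above only captures
# double-quoted src attributes, e.g. for '<script src="/static/app.js" defer></script>' it
# returns '/static/app.js'; inline <script>...</script> bodies are handled by extractScripts()
# instead.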
|
|
2693
|
+
except Exception as e:
|
|
2694
|
+
writerr(colored('ERROR extractExternalScripts 1: ' + str(e), 'red'))
|
|
2695
|
+
|
|
2696
|
+
def combineInlineJS():
|
|
2697
|
+
global responseOutputDirectory, INLINE_JS_EXCLUDE
|
|
2698
|
+
try:
|
|
2699
|
+
write(colored('Creating combined inline JS files...', 'cyan'))
|
|
2700
|
+
outputFileTemplate = "combinedInline{}.js"
|
|
2701
|
+
excludedNames = ['index.txt', 'continueResp.tmp', 'responses.tmp']
|
|
2702
|
+
fileList = [name for name in os.listdir(responseOutputDirectory)
|
|
2703
|
+
if os.path.isfile(os.path.join(responseOutputDirectory, name))
|
|
2704
|
+
and not any(name.lower().endswith(ext) for ext in INLINE_JS_EXCLUDE)
|
|
2705
|
+
and name not in excludedNames
|
|
2706
|
+
and 'combinedInline' not in name]
|
|
2707
|
+
|
|
2708
|
+
allScripts = {} # To store all scripts from all files
|
|
2709
|
+
allExternalScripts = [] # To store all external script sources from all files
|
|
2710
|
+
|
|
2711
|
+
fileCount = len(fileList)
|
|
2712
|
+
currentFile = 1
|
|
2713
|
+
for filename in fileList:
|
|
2714
|
+
filePath = os.path.join(responseOutputDirectory, filename)
|
|
2715
|
+
scripts = extractScripts(filePath)
|
|
2716
|
+
if scripts:
|
|
2717
|
+
allScripts[filename] = scripts
|
|
2718
|
+
allExternalScripts.extend(extractExternalScripts(filePath))
|
|
2719
|
+
|
|
2720
|
+
# Show progress bar
|
|
2721
|
+
fillTest = currentFile % 2
|
|
2722
|
+
fillChar = "o"
|
|
2723
|
+
if fillTest == 0:
|
|
2724
|
+
fillChar = "O"
|
|
2725
|
+
suffix="Complete "
|
|
2726
|
+
printProgressBar(
|
|
2727
|
+
currentFile,
|
|
2728
|
+
fileCount,
|
|
2729
|
+
prefix="Checking "+str(fileCount)+" files:",
|
|
2730
|
+
suffix=suffix,
|
|
2731
|
+
length=getProgressBarLength(),
|
|
2732
|
+
fill=fillChar
|
|
2733
|
+
)
|
|
2734
|
+
currentFile += 1
|
|
2735
|
+
|
|
2736
|
+
# Write a file of the external javascript files referenced in the downloaded responses
|
|
2737
|
+
totalExternal = len(allExternalScripts)
|
|
2738
|
+
if totalExternal > 0:
|
|
2739
|
+
uniqueExternalScripts = set(allExternalScripts)
|
|
2740
|
+
outputFile = os.path.join(responseOutputDirectory, 'combinedInlineSrc.txt')
|
|
2741
|
+
inlineExternalFile = open(outputFile, 'w', encoding='utf-8')
|
|
2742
|
+
for script in uniqueExternalScripts:
|
|
2743
|
+
inlineExternalFile.write(script.strip() + '\n')
|
|
2744
|
+
inlineExternalFile.close()
write(colored('Created file ','cyan')+colored(responseOutputDirectory+'combinedInlineSrc.txt','white')+colored(' (src of external JS)','cyan'))
|
|
2745
|
+
|
|
2746
|
+
# Write files for all combined inline JS
|
|
2747
|
+
uniqueScripts = set()
|
|
2748
|
+
for scriptsList in allScripts.values():
|
|
2749
|
+
uniqueScripts.update(scriptsList)
|
|
2750
|
+
|
|
2751
|
+
totalSections = len(uniqueScripts)
|
|
2752
|
+
sectionCounter = 0 # Counter for inline JS sections
|
|
2753
|
+
currentOutputFile = os.path.join(responseOutputDirectory, outputFileTemplate.format(1))
|
|
2754
|
+
currentSectionsWritten = 0 # Counter for sections written in current file
|
|
2755
|
+
|
|
2756
|
+
fileNumber = 1
if totalSections > 0:
|
|
2757
|
+
fileNumber = 1
|
|
2758
|
+
with open(currentOutputFile, 'w', encoding='utf-8') as inlineJSFile:
|
|
2759
|
+
currentScript = 1
|
|
2760
|
+
for script in uniqueScripts:
|
|
2761
|
+
# Show progress bar
|
|
2762
|
+
fillTest = currentScript % 2
|
|
2763
|
+
fillChar = "o"
|
|
2764
|
+
if fillTest == 0:
|
|
2765
|
+
fillChar = "O"
|
|
2766
|
+
suffix="Complete "
|
|
2767
|
+
printProgressBar(
|
|
2768
|
+
currentScript,
|
|
2769
|
+
totalSections,
|
|
2770
|
+
prefix="Writing "+str(totalSections)+" unique scripts:",
|
|
2771
|
+
suffix=suffix,
|
|
2772
|
+
length=getProgressBarLength(),
|
|
2773
|
+
fill=fillChar
|
|
2774
|
+
)
|
|
2775
|
+
sectionCounter += 1
|
|
2776
|
+
currentSectionsWritten += 1
|
|
2777
|
+
if currentSectionsWritten > 1000:
|
|
2778
|
+
# If 1000 sections have been written, switch to the next output file
|
|
2779
|
+
inlineJSFile.close()
|
|
2780
|
+
fileNumber = sectionCounter // 1000 + 1
|
|
2781
|
+
currentOutputFile = os.path.join(responseOutputDirectory, outputFileTemplate.format(fileNumber))
|
|
2782
|
+
inlineJSFile = open(currentOutputFile, 'w', encoding='utf-8')
|
|
2783
|
+
currentSectionsWritten = 1
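# Illustrative worked example (not part of the original source): with this rollover logic each
# combinedInline{n}.js file holds up to 1000 unique scripts, e.g. 2500 unique scripts would be
# split as sections 1-1000 in combinedInline1.js, 1001-2000 in combinedInline2.js and
# 2001-2500 in combinedInline3.js.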
|
|
2784
|
+
|
|
2785
|
+
# Insert comment line for the beginning of the section
|
|
2786
|
+
inlineJSFile.write(f"//****** INLINE JS SECTION {sectionCounter} ******//\n\n")
|
|
2787
|
+
|
|
2788
|
+
# Write comments indicating the files the script was found in
|
|
2789
|
+
files = ''
|
|
2790
|
+
for filename, scripts_list in allScripts.items():
|
|
2791
|
+
if script in scripts_list:
|
|
2792
|
+
files = files + filename + ', '
|
|
2793
|
+
|
|
2794
|
+
# Write the files the script appears in
|
|
2795
|
+
inlineJSFile.write('// ' + files.rstrip(', ') + '\n')
|
|
2796
|
+
|
|
2797
|
+
# Write the script content
|
|
2798
|
+
inlineJSFile.write('\n' + script.strip() + '\n\n')
|
|
2799
|
+
|
|
2800
|
+
currentScript += 1
|
|
2801
|
+
|
|
2802
|
+
if totalExternal == 0 and totalSections == 0:
|
|
2803
|
+
write(colored('No scripts found, so no combined JS files written.\n','cyan'))
|
|
2804
|
+
elif fileNumber == 1:
|
|
2805
|
+
write(colored('Created file ','cyan')+colored(responseOutputDirectory+'combinedInline1.js','white')+colored(' (contents of inline JS)\n','cyan'))
|
|
2806
|
+
else:
|
|
2807
|
+
write(colored('Created files ','cyan')+colored(responseOutputDirectory+'combinedInline{1-'+str(fileNumber)+'}.js','white')+colored(' (contents of inline JS)\n','cyan'))
|
|
2808
|
+
|
|
2809
|
+
except Exception as e:
|
|
2810
|
+
writerr(colored('ERROR combineInlineJS 1: ' + str(e), 'red'))
|
|
2811
|
+
|
|
2812
|
+
# Run waymore
|
|
2813
|
+
def main():
|
|
2814
|
+
global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount
|
|
2815
|
+
|
|
2816
|
+
# Tell Python to run the handler() function when SIGINT is received
|
|
2817
|
+
signal(SIGINT, handler)
|
|
2818
|
+
|
|
2819
|
+
# Parse command line arguments
|
|
2820
|
+
parser = argparse.ArgumentParser(
|
|
2821
|
+
description='waymore - by @Xnl-h4ck3r: Find way more from the Wayback Machine'
|
|
2822
|
+
)
|
|
2823
|
+
parser.add_argument(
|
|
2824
|
+
'-i',
|
|
2825
|
+
'--input',
|
|
2826
|
+
action='store',
|
|
2827
|
+
help='The target domain (or file of domains) to find links for. This can be a domain only, or a domain with a specific path. If it is a domain only to get everything for that domain, don\'t prefix with "www."',
|
|
2828
|
+
type=validateArgInput
|
|
2829
|
+
)
|
|
2830
|
+
parser.add_argument(
|
|
2831
|
+
'-n',
|
|
2832
|
+
'--no-subs',
|
|
2833
|
+
action='store_true',
|
|
2834
|
+
help='Don\'t include subdomains of the target domain (only used if input is not a domain with a specific path).',
|
|
2835
|
+
)
|
|
2836
|
+
parser.add_argument(
|
|
2837
|
+
'-mode',
|
|
2838
|
+
action='store',
|
|
2839
|
+
help='The mode to run: U (retrieve URLs only), R (download Responses only) or B (Both).',
|
|
2840
|
+
choices = ['U','R','B'],
|
|
2841
|
+
default='B'
|
|
2842
|
+
)
|
|
2843
|
+
parser.add_argument(
|
|
2844
|
+
'-oU',
|
|
2845
|
+
'--output-urls',
|
|
2846
|
+
action='store',
|
|
2847
|
+
help='The file to save the Links output to, including path if necessary. If the "-oR" argument is not passed, a "results" directory will be created in the path specified by the DEFAULT_OUTPUT_DIR key in config.yml file (typically defaults to "~/.config/waymore/"). Within that, a directory will be created with target domain (or domain with path) passed with "-i" (or for each line of a file passed with "-i").' ,
|
|
2848
|
+
default='',
|
|
2849
|
+
)
|
|
2850
|
+
parser.add_argument(
|
|
2851
|
+
'-oR',
|
|
2852
|
+
'--output-responses',
|
|
2853
|
+
action='store',
|
|
2854
|
+
help='The directory to save the response output files to, including path if necessary. If the argument is not passed, a "results" directory will be created in the path specified by the DEFAULT_OUTPUT_DIR key in config.yml file (typically defaults to "~/.config/waymore/"). Within that, a directory will be created with target domain (or domain with path) passed with "-i" (or for each line of a file passed with "-i").' ,
|
|
2855
|
+
default='',
|
|
2856
|
+
)
|
|
2857
|
+
parser.add_argument(
|
|
2858
|
+
'-f',
|
|
2859
|
+
'--filter-responses-only',
|
|
2860
|
+
action='store_true',
|
|
2861
|
+
help='The initial links from Wayback Machine will not be filtered (MIME Type and Response Code), only the responses that are downloaded, e.g. it may be useful to still see all available paths from the links even if you don\'t want to check the content.',
|
|
2862
|
+
)
|
|
2863
|
+
parser.add_argument(
|
|
2864
|
+
'-fc',
|
|
2865
|
+
action='store',
|
|
2866
|
+
help='Filter HTTP status codes for retrieved URLs and responses. Comma separated list of codes (default: the FILTER_CODE values from config.yml). Passing this argument will override the value from config.yml',
|
|
2867
|
+
type=validateArgStatusCodes,
|
|
2868
|
+
)
|
|
2869
|
+
parser.add_argument(
|
|
2870
|
+
'-mc',
|
|
2871
|
+
action='store',
|
|
2872
|
+
help='Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config FILTER_CODE and -fc.',
|
|
2873
|
+
type=validateArgStatusCodes,
|
|
2874
|
+
)
|
|
2875
|
+
parser.add_argument(
|
|
2876
|
+
'-l',
|
|
2877
|
+
'--limit',
|
|
2878
|
+
action='store',
|
|
2879
|
+
type=int,
|
|
2880
|
+
help='How many responses will be saved (if -mode is R or B). A positive value will get the first N results, a negative value will get the last N results. A value of 0 will get ALL responses (default: '+str(DEFAULT_LIMIT)+')',
|
|
2881
|
+
default=DEFAULT_LIMIT,
|
|
2882
|
+
metavar='<signed integer>'
|
|
2883
|
+
)
|
|
2884
|
+
parser.add_argument(
|
|
2885
|
+
'-from',
|
|
2886
|
+
'--from-date',
|
|
2887
|
+
action='store',
|
|
2888
|
+
type=int,
|
|
2889
|
+
help='What date to get responses from. If not specified it will get from the earliest possible results. A partial value can be passed, e.g. 2016, 201805, etc.',
|
|
2890
|
+
metavar='<yyyyMMddhhmmss>'
|
|
2891
|
+
)
|
|
2892
|
+
parser.add_argument(
|
|
2893
|
+
'-to',
|
|
2894
|
+
'--to-date',
|
|
2895
|
+
action='store',
|
|
2896
|
+
type=int,
|
|
2897
|
+
help='What date to get responses to. If not specified it will get to the latest possible results. A partial value can be passed, e.g. 2016, 201805, etc.',
|
|
2898
|
+
metavar='<yyyyMMddhhmmss>'
|
|
2899
|
+
)
|
|
2900
|
+
parser.add_argument(
|
|
2901
|
+
'-ci',
|
|
2902
|
+
'--capture-interval',
|
|
2903
|
+
action='store',
|
|
2904
|
+
choices=['h', 'd', 'm', 'none'],
|
|
2905
|
+
help='Filters the search on Wayback Machine (archive.org) to only get at most 1 capture per hour (h), day (d) or month (m). This filter is used for responses only. The default is \'d\' but can also be set to \'none\' to not filter anything and get all responses.',
|
|
2906
|
+
default='d'
|
|
2907
|
+
)
|
|
2908
|
+
parser.add_argument(
|
|
2909
|
+
'-ra',
|
|
2910
|
+
'--regex-after',
|
|
2911
|
+
help='RegEx for filtering purposes against links found from all sources of URLs AND responses downloaded. Only positive matches will be output.',
|
|
2912
|
+
action='store',
|
|
2913
|
+
)
|
|
2914
|
+
parser.add_argument(
|
|
2915
|
+
'-url-filename',
|
|
2916
|
+
action='store_true',
|
|
2917
|
+
help='Set the file name of downloaded responses to the URL that generated the response, otherwise it will be set to the hash value of the response. Using the hash value means multiple URLs that generated the same response will only result in one file being saved for that response.',
|
|
2918
|
+
default=False
|
|
2919
|
+
)
|
|
2920
|
+
parser.add_argument(
|
|
2921
|
+
'-xwm',
|
|
2922
|
+
action='store_true',
|
|
2923
|
+
help='Exclude checks for links from Wayback Machine (archive.org)',
|
|
2924
|
+
default=False
|
|
2925
|
+
)
|
|
2926
|
+
parser.add_argument(
|
|
2927
|
+
'-xcc',
|
|
2928
|
+
action='store_true',
|
|
2929
|
+
help='Exclude checks for links from commoncrawl.org',
|
|
2930
|
+
default=False
|
|
2931
|
+
)
|
|
2932
|
+
parser.add_argument(
|
|
2933
|
+
'-xav',
|
|
2934
|
+
action='store_true',
|
|
2935
|
+
help='Exclude checks for links from alienvault.com',
|
|
2936
|
+
default=False
|
|
2937
|
+
)
|
|
2938
|
+
parser.add_argument(
|
|
2939
|
+
'-xus',
|
|
2940
|
+
action='store_true',
|
|
2941
|
+
help='Exclude checks for links from urlscan.io',
|
|
2942
|
+
default=False
|
|
2943
|
+
)
|
|
2944
|
+
parser.add_argument(
|
|
2945
|
+
'-xvt',
|
|
2946
|
+
action='store_true',
|
|
2947
|
+
help='Exclude checks for links from virustotal.com',
|
|
2948
|
+
default=False
|
|
2949
|
+
)
|
|
2950
|
+
parser.add_argument(
|
|
2951
|
+
'-lcc',
|
|
2952
|
+
action='store',
|
|
2953
|
+
type=int,
|
|
2954
|
+
help='Limit the number of Common Crawl index collections searched, e.g. \'-lcc 10\' will just search the latest 10 collections (default: 3). As of July 2023 there are currently 95 collections. Setting to 0 will search ALL collections. If you don\'t want to search Common Crawl at all, use the -xcc option.'
|
|
2955
|
+
)
|
|
2956
|
+
parser.add_argument(
|
|
2957
|
+
'-lcy',
|
|
2958
|
+
action='store',
|
|
2959
|
+
type=int,
|
|
2960
|
+
help='Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections of any year (but in conjunction with -lcc). For example, if you are only interested in data from 2015 and after, pass -lcy 2015. If you don\'t want to search Common Crawl at all, use the -xcc option.',
|
|
2961
|
+
default=0
|
|
2962
|
+
)
|
|
2963
|
+
parser.add_argument(
|
|
2964
|
+
'-t',
|
|
2965
|
+
'--timeout',
|
|
2966
|
+
help='This is for archived responses only! How many seconds to wait for the server to send data before giving up (default: '+str(DEFAULT_TIMEOUT)+' seconds)',
|
|
2967
|
+
default=DEFAULT_TIMEOUT,
|
|
2968
|
+
type=int,
|
|
2969
|
+
metavar="<seconds>",
|
|
2970
|
+
)
|
|
2971
|
+
parser.add_argument(
|
|
2972
|
+
'-p',
|
|
2973
|
+
'--processes',
|
|
2974
|
+
help='Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1)',
|
|
2975
|
+
action='store',
|
|
2976
|
+
type=validateArgProcesses,
|
|
2977
|
+
default=1,
|
|
2978
|
+
metavar="<integer>",
|
|
2979
|
+
)
|
|
2980
|
+
parser.add_argument(
|
|
2981
|
+
'-r',
|
|
2982
|
+
'--retries',
|
|
2983
|
+
action='store',
|
|
2984
|
+
type=int,
|
|
2985
|
+
help='The number of retries for requests that get connection error or rate limited (default: 1).',
|
|
2986
|
+
default=1
|
|
2987
|
+
)
|
|
2988
|
+
parser.add_argument(
|
|
2989
|
+
"-m",
|
|
2990
|
+
"--memory-threshold",
|
|
2991
|
+
action="store",
|
|
2992
|
+
help="The memory threshold percentage. If the machines memory goes above the threshold, the program will be stopped and ended gracefully before running out of memory (default: 95)",
|
|
2993
|
+
default=95,
|
|
2994
|
+
metavar="<integer>",
|
|
2995
|
+
type=argcheckPercent,
|
|
2996
|
+
)
|
|
2997
|
+
parser.add_argument(
|
|
2998
|
+
'-ko',
|
|
2999
|
+
'--keywords-only',
|
|
3000
|
+
action='store',
|
|
3001
|
+
help=r'Only return links and responses that contain keywords that you are interested in. This can reduce the time it takes to get results. If you provide the flag with no value, keywords are taken from the comma separated list in the "config.yml" file with the "FILTER_KEYWORDS" key, otherwise you can pass a specific Regex value to use, e.g. -ko "admin" to only get links containing the word admin, or -ko "\.js(\?|$)" to only get JS files. The Regex check is NOT case sensitive.',
|
|
3002
|
+
nargs='?',
|
|
3003
|
+
const="#CONFIG"
|
|
3004
|
+
)
|
|
3005
|
+
parser.add_argument(
|
|
3006
|
+
'-lr',
|
|
3007
|
+
'--limit-requests',
|
|
3008
|
+
type=int,
|
|
3009
|
+
help='Limit the number of requests that will be made when getting links from a source (this doesn\'t apply to Common Crawl). Some targets can require a huge number of requests that are just not feasible to make, so this can be used to manage that situation. This defaults to 0 (Zero) which means there is no limit.',
|
|
3010
|
+
default=0,
|
|
3011
|
+
)
|
|
3012
|
+
parser.add_argument(
|
|
3013
|
+
"-ow",
|
|
3014
|
+
"--output-overwrite",
|
|
3015
|
+
action="store_true",
|
|
3016
|
+
help="If the URL output file (default waymore.txt) already exists, it will be overwritten instead of being appended to.",
|
|
3017
|
+
)
|
|
3018
|
+
parser.add_argument(
|
|
3019
|
+
"-nlf",
|
|
3020
|
+
"--new-links-file",
|
|
3021
|
+
action="store_true",
|
|
3022
|
+
help="If this argument is passed, a .new file will also be written that will contain links for the latest run. This is only relevant for mode U.",
|
|
3023
|
+
)
|
|
3024
|
+
parser.add_argument(
|
|
3025
|
+
"-c",
|
|
3026
|
+
"--config",
|
|
3027
|
+
action="store",
|
|
3028
|
+
help="Path to the YML config file. If not passed, it looks for file 'config.yml' in the same directory as runtime file 'waymore.py'.",
|
|
3029
|
+
)
|
|
3030
|
+
parser.add_argument(
|
|
3031
|
+
'-wrlr',
|
|
3032
|
+
'--wayback-rate-limit-retry',
|
|
3033
|
+
action='store',
|
|
3034
|
+
type=int,
|
|
3035
|
+
help='The number of minutes the user wants to wait for a rate limit pause on Wayback Machine (archive.org) instead of stopping with a 429 error (default: 3).',
|
|
3036
|
+
default=3
|
|
3037
|
+
)
|
|
3038
|
+
parser.add_argument(
|
|
3039
|
+
'-urlr',
|
|
3040
|
+
'--urlscan-rate-limit-retry',
|
|
3041
|
+
action='store',
|
|
3042
|
+
type=int,
|
|
3043
|
+
help='The number of minutes the user wants to wait for a rate limit pause on URLScan.io instead of stopping with a 429 error (default: 1).',
|
|
3044
|
+
default=1
|
|
3045
|
+
)
|
|
3046
|
+
parser.add_argument(
|
|
3047
|
+
"-co",
|
|
3048
|
+
"--check-only",
|
|
3049
|
+
action="store_true",
|
|
3050
|
+
help="This will make a few minimal requests to show you how many requests, and roughly how long it could take, to get URLs from the sources and downloaded responses from Wayback Machine.",
|
|
3051
|
+
)
|
|
3052
|
+
parser.add_argument(
|
|
3053
|
+
"-nd",
|
|
3054
|
+
"--notify-discord",
|
|
3055
|
+
action="store_true",
|
|
3056
|
+
help="Whether to send a notification to Discord when waymore completes. It requires WEBHOOK_DISCORD to be provided in the config.yml file.",
|
|
3057
|
+
)
|
|
3058
|
+
parser.add_argument(
|
|
3059
|
+
'-oijs',
|
|
3060
|
+
'--output-inline-js',
|
|
3061
|
+
action="store_true",
|
|
3062
|
+
help='Whether to save combined inline javascript of all relevant files in the response directory when "-mode R" (or "-mode B") has been used. The files are saved with the name "combinedInline{}.js" where "{}" is the number of the file, saving 1000 unique scripts per file.'
|
|
3063
|
+
)
|
|
3064
|
+
parser.add_argument('-v', '--verbose', action='store_true', help="Verbose output")
|
|
3065
|
+
parser.add_argument('--version', action='store_true', help="Show version number")
|
|
3066
|
+
args = parser.parse_args()
|
|
3067
|
+
|
|
3068
|
+
# If --version was passed, display version and exit
|
|
3069
|
+
if args.version:
|
|
3070
|
+
write(colored('Waymore - v' + __version__,'cyan'))
|
|
3071
|
+
sys.exit()
|
|
3072
|
+
|
|
3073
|
+
# If -lcc wasn't passed then set to the default of 3 if -lcy is 0. This will make them work together
|
|
3074
|
+
if args.lcc is None:
|
|
3075
|
+
if args.lcy == 0:
|
|
3076
|
+
args.lcc = 3
|
|
3077
|
+
else:
|
|
3078
|
+
args.lcc = 0
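# Illustrative note (not part of the original source): with neither -lcc nor -lcy passed this
# defaults to the latest 3 Common Crawl collections; passing only -lcy (e.g. -lcy 2015) sets
# -lcc to 0 so every collection from 2015 onwards is searched.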
|
|
3079
|
+
|
|
3080
|
+
# If no input was given, raise an error
|
|
3081
|
+
if sys.stdin.isatty():
|
|
3082
|
+
if args.input is None:
|
|
3083
|
+
writerr(colored('You need to provide an input with -i argument or through <stdin>.', 'red'))
|
|
3084
|
+
sys.exit()
|
|
3085
|
+
else:
|
|
3086
|
+
validateArgInput('<stdin>')
|
|
3087
|
+
|
|
3088
|
+
# Get the current Process ID to use to get memory usage that is displayed with -vv option
|
|
3089
|
+
try:
|
|
3090
|
+
process = psutil.Process(os.getpid())
|
|
3091
|
+
except:
|
|
3092
|
+
pass
|
|
3093
|
+
|
|
3094
|
+
showBanner()
|
|
3095
|
+
|
|
3096
|
+
try:
|
|
3097
|
+
|
|
3098
|
+
# For each input (maybe multiple if a file was passed)
|
|
3099
|
+
for inpt in inputValues:
|
|
3100
|
+
|
|
3101
|
+
argsInput = inpt.strip().rstrip('\n').strip('.').lower()
|
|
3102
|
+
|
|
3103
|
+
# Get the input hostname
|
|
3104
|
+
tldExtract = tldextract.extract(argsInput)
|
|
3105
|
+
subDomain = tldExtract.subdomain
|
|
3106
|
+
inputIsSubDomain = False
|
|
3107
|
+
if subDomain != '':
|
|
3108
|
+
inputIsSubDomain = True
|
|
3109
|
+
subDomain = subDomain+'.'
|
|
3110
|
+
argsInputHostname = subDomain+tldExtract.domain+'.'+tldExtract.suffix
|
|
3111
|
+
|
|
3112
|
+
# Warn the user if a subdomain may have been passed
|
|
3113
|
+
if inputIsSubDomain:
|
|
3114
|
+
writerr(colored(getSPACER('IMPORTANT: It looks like you may be passing a subdomain. If you want ALL subs for a domain, then pass the domain only. It will be a LOT quicker, and you won\'t miss anything. NEVER pass a file of subdomains if you want everything, just the domains.\n'),'yellow'))
|
|
3115
|
+
|
|
3116
|
+
# Reset global variables
|
|
3117
|
+
linksFound = set()
|
|
3118
|
+
linkMimes = set()
|
|
3119
|
+
successCount = 0
|
|
3120
|
+
failureCount = 0
|
|
3121
|
+
fileCount = 0
|
|
3122
|
+
totalResponses = 0
|
|
3123
|
+
totalPages = 0
|
|
3124
|
+
indexFile = None
|
|
3125
|
+
path = ''
|
|
3126
|
+
stopSource = False
|
|
3127
|
+
|
|
3128
|
+
# Get the config settings from the config.yml file
|
|
3129
|
+
getConfig()
|
|
3130
|
+
|
|
3131
|
+
if verbose():
|
|
3132
|
+
showOptions()
|
|
3133
|
+
|
|
3134
|
+
if args.check_only:
|
|
3135
|
+
write(colored('*** Checking requests needed for ','cyan')+colored(argsInput,'white')+colored(' ***\n','cyan'))
|
|
3136
|
+
|
|
3137
|
+
# If the mode is U (URLs retrieved) or B (URLs retrieved AND Responses downloaded)
|
|
3138
|
+
if args.mode in ['U','B']:
|
|
3139
|
+
|
|
3140
|
+
# If not requested to exclude, get URLs from the Wayback Machine (archive.org)
|
|
3141
|
+
if not args.xwm and stopProgram is None:
|
|
3142
|
+
getWaybackUrls()
|
|
3143
|
+
|
|
3144
|
+
# If not requested to exclude, get URLs from commoncrawl.org
|
|
3145
|
+
if not args.xcc and stopProgram is None:
|
|
3146
|
+
getCommonCrawlUrls()
|
|
3147
|
+
|
|
3148
|
+
# If not requested to exclude and a TLD wasn't passed, get URLs from alienvault.com
|
|
3149
|
+
if not args.xav and stopProgram is None and not inpt.startswith('.'):
|
|
3150
|
+
getAlienVaultUrls()
|
|
3151
|
+
|
|
3152
|
+
# If not requested to exclude, get URLs from urlscan.io
|
|
3153
|
+
if not args.xus and stopProgram is None:
|
|
3154
|
+
getURLScanUrls()
|
|
3155
|
+
|
|
3156
|
+
# If not requested to exclude, get URLs from virustotal.com if we have an API key
|
|
3157
|
+
if not args.xvt and VIRUSTOTAL_API_KEY != '' and stopProgram is None:
|
|
3158
|
+
getVirusTotalUrls()
|
|
3159
|
+
|
|
3160
|
+
# Output results of all searches
|
|
3161
|
+
processURLOutput()
|
|
3162
|
+
|
|
3163
|
+
# Clean up
|
|
3164
|
+
linkMimes = None
|
|
3165
|
+
|
|
3166
|
+
# If we want to get actual archived responses from archive.org...
|
|
3167
|
+
if (args.mode in ['R','B']) and stopProgram is None:
|
|
3168
|
+
|
|
3169
|
+
# Get the output directory for responses
|
|
3170
|
+
if args.output_responses != '':
|
|
3171
|
+
responseOutputDirectory = args.output_responses + '/'
|
|
3172
|
+
else:
|
|
3173
|
+
responseOutputDirectory = str(DEFAULT_OUTPUT_DIR) + '/results/' + str(argsInput).replace('/','-') + '/'
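# Illustrative note (not part of the original source): with the typical default of
# DEFAULT_OUTPUT_DIR being ~/.config/waymore and an input of 'example.com/api', responses
# would be written under ~/.config/waymore/results/example.com-api/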
|
|
3174
|
+
|
|
3175
|
+
processResponses()
|
|
3176
|
+
|
|
3177
|
+
# Output details of the responses downloaded
|
|
3178
|
+
if not args.check_only:
|
|
3179
|
+
processResponsesOutput()
|
|
3180
|
+
|
|
3181
|
+
# If requested, generate the combined inline JS files
|
|
3182
|
+
if stopProgram is None and fileCount > 0 and args.output_inline_js:
|
|
3183
|
+
combineInlineJS()
|
|
3184
|
+
|
|
3185
|
+
if args.check_only:
|
|
3186
|
+
write(colored('NOTE: The time frames are a very rough guide and don\'t take into account additional time for rate limiting.','magenta'))
|
|
3187
|
+
|
|
3188
|
+
# Output stats if -v option was selected
|
|
3189
|
+
if verbose():
|
|
3190
|
+
processStats()
|
|
3191
|
+
|
|
3192
|
+
# If the program was stopped then alert the user
|
|
3193
|
+
if stopProgram is not None:
|
|
3194
|
+
if stopProgram == StopProgram.MEMORY_THRESHOLD:
|
|
3195
|
+
writerr(
|
|
3196
|
+
colored(
|
|
3197
|
+
"YOUR MEMORY USAGE REACHED "
|
|
3198
|
+
+ str(maxMemoryPercent)
|
|
3199
|
+
+ "% SO THE PROGRAM WAS STOPPED. DATA IS LIKELY TO BE INCOMPLETE.\n",
|
|
3200
|
+
"red",
|
|
3201
|
+
)
|
|
3202
|
+
)
|
|
3203
|
+
elif stopProgram == StopProgram.WEBARCHIVE_PROBLEM:
|
|
3204
|
+
writerr(
|
|
3205
|
+
colored(
|
|
3206
|
+
"THE PROGRAM WAS STOPPED DUE TO PROBLEM GETTING DATA FROM WAYBACK MACHINE (ARCHIVE.ORG)\n",
|
|
3207
|
+
"red",
|
|
3208
|
+
)
|
|
3209
|
+
)
|
|
3210
|
+
else:
|
|
3211
|
+
writerr(
|
|
3212
|
+
colored(
|
|
3213
|
+
"THE PROGRAM WAS STOPPED. DATA IS LIKELY TO BE INCOMPLETE.\n",
|
|
3214
|
+
"red",
|
|
3215
|
+
)
|
|
3216
|
+
)
|
|
3217
|
+
|
|
3218
|
+
except Exception as e:
|
|
3219
|
+
writerr(colored('ERROR main 1: ' + str(e), 'red'))
|
|
3220
|
+
|
|
3221
|
+
finally:
|
|
3222
|
+
# Send a notification to discord if requested
|
|
3223
|
+
try:
|
|
3224
|
+
if args.notify_discord and WEBHOOK_DISCORD != '':
|
|
3225
|
+
notifyDiscord()
|
|
3226
|
+
except:
|
|
3227
|
+
pass
|
|
3228
|
+
try:
|
|
3229
|
+
if sys.stdout.isatty():
|
|
3230
|
+
writerr(colored('✅ Want to buy me a coffee? ☕ https://ko-fi.com/xnlh4ck3r 🤘', 'green'))
|
|
3231
|
+
except:
|
|
3232
|
+
pass
|
|
3233
|
+
# Clean up
|
|
3234
|
+
linksFound = None
|
|
3235
|
+
linkMimes = None
|
|
3236
|
+
inputValues = None
|
|
3237
|
+
|
|
3238
|
+
if __name__ == '__main__':
|
|
3239
|
+
main()
|