waymore-4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/waymore.py ADDED
@@ -0,0 +1,3239 @@
1
+ #!/usr/bin/env python
2
+ # Python 3
3
+ # waymore - by @Xnl-h4ck3r: Find way more from the Wayback Machine (also get links from Common Crawl, AlienVault OTX, URLScan and VirusTotal)
4
+ # Full help here: https://github.com/xnl-h4ck3r/waymore/blob/main/README.md
5
+ # Good luck and good hunting! If you really love the tool (or any others), or they helped you find an awesome bounty, consider BUYING ME A COFFEE! (https://ko-fi.com/xnlh4ck3r) ☕ (I could use the caffeine!)
6
+
7
+ from urllib.parse import urlparse
8
+ import requests
9
+ from requests.exceptions import ConnectionError
10
+ from requests.utils import quote
11
+ from requests.adapters import HTTPAdapter, Retry
12
+ import argparse
13
+ from signal import SIGINT, signal
14
+ import multiprocessing.dummy as mp
15
+ from termcolor import colored
16
+ from datetime import datetime, timedelta
17
+ from pathlib import Path
18
+ import yaml
19
+ import os
20
+ import json
21
+ import re
22
+ import random
23
+ import sys
24
+ import math
25
+ import enum
26
+ import pickle
27
+ import time
28
+ import tldextract
29
+ try:
30
+ from . import __version__
31
+ except:
32
+ pass
33
+ from tqdm import tqdm
34
+
35
+ # Try to import psutil to show memory usage
36
+ try:
37
+ import psutil
38
+ except:
39
+ currentMemUsage = -1
40
+ maxMemoryUsage = -1
41
+ currentMemPercent = -1
42
+ maxMemoryPercent = -1
43
+
44
+ # Creating stopProgram enum
45
+ class StopProgram(enum.Enum):
46
+ SIGINT = 1
47
+ WEBARCHIVE_PROBLEM = 2
48
+ MEMORY_THRESHOLD = 3
49
+ stopProgram = None
50
+
51
+ # Global variables
52
+ linksFound = set()
53
+ linkMimes = set()
54
+ inputValues = set()
55
+ argsInput = ''
56
+ isInputFile = False
57
+ stopProgramCount = 0
58
+ stopSource = False
59
+ successCount = 0
60
+ failureCount = 0
61
+ fileCount = 0
62
+ totalResponses = 0
63
+ totalPages = 0
64
+ indexFile = None
65
+ continueRespFile = None
66
+ inputIsDomainANDPath = False
67
+ inputIsSubDomain = False
68
+ subs = '*.'
69
+ path = ''
70
+ waymorePath = ''
71
+ terminalWidth = 135
72
+ maxMemoryUsage = 0
73
+ currentMemUsage = 0
74
+ maxMemoryPercent = 0
75
+ currentMemPercent = 0
76
+ HTTP_ADAPTER = None
77
+ HTTP_ADAPTER_CC = None
78
+ checkWayback = 0
79
+ checkCommonCrawl = 0
80
+ checkAlienVault = 0
81
+ checkURLScan = 0
82
+ checkVirusTotal = 0
83
+ argsInputHostname = ''
84
+ responseOutputDirectory = ''
85
+
86
+ # Source Provider URLs
87
+ WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
88
+ CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
89
+ ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
90
+ URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
91
+ VIRUSTOTAL_URL = 'https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}'
92
+
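+ # Editorial illustration (not part of the published source): the {DOMAIN} and {COLLAPSE}
+ # placeholders are substituted before a request is made, e.g. something along the lines of
+ #   WAYBACK_URL.replace('{DOMAIN}', quote('*.example.com/*')).replace('{COLLAPSE}', 'timestamp:6')
+ # where 'example.com' and the collapse value 'timestamp:6' are hypothetical example values.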
93
+ # User Agents to use when making requests, chosen at random
94
+ USER_AGENT = [
95
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
96
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
97
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
98
+ "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
99
+ "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
100
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/99.0.1150.36",
101
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
102
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
103
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
104
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
105
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
106
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
107
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0",
108
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
109
+ "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
110
+ "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
111
+ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
112
+ "Mozilla/5.0 (iPad; CPU OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
113
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4",
114
+ ]
115
+
116
+ # The default maximum number of responses to download
117
+ DEFAULT_LIMIT = 5000
118
+
119
+ # The default timeout for archived responses to be retrieved in seconds
120
+ DEFAULT_TIMEOUT = 30
121
+
122
+ # URL exclusions used to filter which responses we will try to get from web.archive.org
123
+ DEFAULT_FILTER_URL = '.css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap'
124
+
125
+ # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
126
+ DEFAULT_FILTER_MIME = 'text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff'
127
+
128
+ # Response code exclusions we will use to filter links and responses from web.archive.org through their API
129
+ DEFAULT_FILTER_CODE = '404,301,302'
130
+
131
+ # Used to filter out downloaded responses that could be custom 404 pages
132
+ REGEX_404 = r'<title>[^\<]*(404|not found)[^\<]*</title>'
133
+
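+ # Editorial illustration (not part of the published source): REGEX_404 is applied to a downloaded
+ # response with the DOTALL and IGNORECASE flags, so for example:
+ #   re.findall(REGEX_404, '<title>404 Not Found</title>', re.DOTALL|re.IGNORECASE)   # one match
+ #   re.findall(REGEX_404, '<title>Welcome</title>', re.DOTALL|re.IGNORECASE)         # no match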
134
+ # Keywords
135
+ DEFAULT_FILTER_KEYWORDS = 'admin,login,logon,signin,signup,register,registration,dash,portal,ftp,panel,.js,api,robots.txt,graph,gql,config,backup,debug,db,database,git,cgi-bin,swagger,zip,rar,tar.gz,internal,jira,jenkins,confluence,atlassian,okta,corp,upload,delete,email,sql,create,edit,test,temp,cache,wsdl,log,payment,setting,mail,file,redirect,chat,billing,doc,trace,cp,ftp,gateway,import,proxy,dev,stage,stg,uat'
136
+
137
+ # Yaml config values
138
+ FILTER_URL = ''
139
+ FILTER_MIME = ''
140
+ FILTER_CODE = ''
141
+ MATCH_CODE = ''
142
+ FILTER_KEYWORDS = ''
143
+ URLSCAN_API_KEY = ''
144
+ CONTINUE_RESPONSES_IF_PIPED = True
145
+ WEBHOOK_DISCORD = ''
146
+ DEFAULT_OUTPUT_DIR = ''
147
+
148
+ API_KEY_SECRET = "aHR0cHM6Ly95b3V0dS5iZS9kUXc0dzlXZ1hjUQ=="
149
+
150
+ # When -oijs is passed, and the downloaded responses are checked for scripts, files with these extensions will be ignored
151
+ INLINE_JS_EXCLUDE = ['.js', '.csv', '.xls', '.xlsx', '.doc', '.docx', '.pdf', '.msi', '.zip', '.gzip', '.gz', '.tar', '.rar', '.json']
152
+
153
+ # Get memory usage for the current process, and flag if the memory threshold has been exceeded
154
+ def getMemory():
155
+
156
+ global currentMemUsage, currentMemPercent, maxMemoryUsage, maxMemoryPercent, stopProgram
157
+
158
+ try:
159
+ currentMemUsage = process.memory_info().rss
160
+ currentMemPercent = math.ceil(psutil.virtual_memory().percent)
161
+ if currentMemUsage > maxMemoryUsage:
162
+ maxMemoryUsage = currentMemUsage
163
+ if currentMemPercent > maxMemoryPercent:
164
+ maxMemoryPercent = currentMemPercent
165
+ if currentMemPercent > args.memory_threshold:
166
+ stopProgram = StopProgram.MEMORY_THRESHOLD
167
+ except:
168
+ pass
169
+
170
+ # Convert bytes to human readable form
171
+ def humanReadableSize(size, decimal_places=2):
172
+ for unit in ["B", "KB", "MB", "GB", "TB", "PB"]:
173
+ if size < 1024.0 or unit == "PB":
174
+ break
175
+ size /= 1024.0
176
+ return f"{size:.{decimal_places}f} {unit}"
177
+
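+ # Editorial usage sketch (not part of the published source):
+ #   humanReadableSize(1536)      # -> '1.50 KB'
+ #   humanReadableSize(1536, 1)   # -> '1.5 KB'
+ #   humanReadableSize(500)       # -> '500.00 B'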
178
+ # Display stats if -v argument was chosen
179
+ def processStats():
180
+ if maxMemoryUsage > 0:
181
+ write("MAX MEMORY USAGE: " + humanReadableSize(maxMemoryUsage))
182
+ elif maxMemoryUsage < 0:
183
+ write('MAX MEMORY USAGE: To show memory usage, run "pip install psutil"')
184
+ if maxMemoryPercent > 0:
185
+ write(
186
+ "MAX TOTAL MEMORY: "
187
+ + str(maxMemoryPercent)
188
+ + "% (Threshold "
189
+ + str(args.memory_threshold)
190
+ + "%)"
191
+ )
192
+ elif maxMemoryPercent < 0:
193
+ write('MAX TOTAL MEMORY: To show total memory %, run "pip install psutil"')
194
+ write()
195
+
196
+ def write(text='',pipe=False):
197
+ # Only send text to stdout if the tool isn't piped to pass output to something else,
198
+ # or if the tool has been piped and the pipe parameter is True
199
+ if sys.stdout.isatty() or (not sys.stdout.isatty() and pipe):
200
+ # If it has carriage return in the string, don't add a newline
201
+ if text.find('\r') > 0:
202
+ sys.stdout.write(text)
203
+ else:
204
+ sys.stdout.write(text+'\n')
205
+
206
+ def writerr(text='',pipe=False):
207
+ # Only send text to stdout if the tool isn't piped to pass output to something else,
208
+ # or, if the tool has been piped, send the output to stderr instead
209
+ if sys.stdout.isatty():
210
+ # If it has carriage return in the string, don't add a newline
211
+ if text.find('\r') > 0:
212
+ sys.stdout.write(text)
213
+ else:
214
+ sys.stdout.write(text+'\n')
215
+ else:
216
+ # If it has carriage return in the string, don't add a newline
217
+ if text.find('\r') > 0:
218
+ sys.stderr.write(text)
219
+ else:
220
+ sys.stderr.write(text+'\n')
221
+
222
+ def showVersion():
223
+ try:
224
+ try:
225
+ resp = requests.get('https://raw.githubusercontent.com/xnl-h4ck3r/waymore/main/waymore/__init__.py',timeout=3)
226
+ except:
227
+ write('Current waymore version '+__version__+' (unable to check if latest)\n')
+ return
228
+ if __version__ == resp.text.split('=')[1].replace('"',''):
229
+ write('Current waymore version '+__version__+' ('+colored('latest','green')+')\n')
230
+ else:
231
+ write('Current waymore version '+__version__+' ('+colored('outdated','red')+')\n')
232
+ except:
233
+ pass
234
+
235
+ def showBanner():
236
+ write()
237
+ write(colored(" _ _ _ _ _ ","red")+"____ ")
238
+ write(colored("| | | |_____| | | ","red")+r"/ \ ___ ____ _____ ")
239
+ write(colored("| | | (____ | | | ","red")+r"| | | |/ _ \ / ___) ___ |")
240
+ write(colored("| | | / ___ | |_| ","red")+"| | | | |_| | | | |_| |")
241
+ write(colored(r" \___/\_____|\__ ","red")+r"|_|_|_|\___/| | | ____/")
242
+ write(colored(" (____/ ","red")+colored(" by Xnl-h4ck3r ","magenta")+r" \_____)")
243
+ try:
244
+ currentDate = datetime.now().date()
245
+ if currentDate.month == 12 and currentDate.day in (24,25):
246
+ write(colored(" *** 🎅 HAPPY CHRISTMAS! 🎅 ***","green",attrs=["blink"]))
247
+ elif currentDate.month == 10 and currentDate.day == 31:
248
+ write(colored(" *** 🎃 HAPPY HALLOWEEN! 🎃 ***","red",attrs=["blink"]))
249
+ elif currentDate.month == 1 and currentDate.day in (1,2,3,4,5):
250
+ write(colored(" *** 🥳 HAPPY NEW YEAR!! 🥳 ***","yellow",attrs=["blink"]))
251
+ except:
252
+ pass
253
+ write()
254
+ showVersion()
255
+
256
+ def verbose():
257
+ """
258
+ Function used when printing messages dependent on the verbose option
259
+ """
260
+ return args.verbose
261
+
262
+ def handler(signal_received, frame):
263
+ """
264
+ This function is called if Ctrl-C is pressed by the user
265
+ An attempt will be made to try and clean up properly
266
+ """
267
+ global stopSource, stopProgram, stopProgramCount
268
+
269
+ if stopProgram is not None:
270
+ stopProgramCount = stopProgramCount + 1
271
+ if stopProgramCount == 1:
272
+ writerr(colored(getSPACER(">>> Please be patient... Trying to save data and end gracefully!"),'red'))
273
+ elif stopProgramCount == 2:
274
+ writerr(colored(getSPACER(">>> SERIOUSLY... YOU DON'T WANT YOUR DATA SAVED?!"), 'red'))
275
+ elif stopProgramCount == 3:
276
+ writerr(colored(getSPACER(r">>> Patience isn't your strong suit eh? ¯\_(ツ)_/¯"), 'red'))
277
+ sys.exit()
278
+ else:
279
+ stopProgram = StopProgram.SIGINT
280
+ stopSource = True
281
+ writerr(colored(getSPACER('>>> "Oh my God, they killed Kenny... and waymore!" - Kyle'), "red"))
282
+ writerr(colored(getSPACER('>>> Attempting to rescue any data gathered so far...'), "red"))
283
+
284
+ def showOptions():
285
+ """
286
+ Show the chosen options and config settings
287
+ """
288
+ global inputIsDomainANDPath, argsInput, isInputFile
289
+
290
+ try:
291
+ write(colored('Selected config and settings:', 'cyan'))
292
+
293
+ if isInputFile:
294
+ inputArgDesc = '-i <FILE: current line>: '
295
+ else:
296
+ inputArgDesc = '-i: '
297
+ if inputIsDomainANDPath:
298
+ write(colored(inputArgDesc + argsInput, 'magenta')+colored(' The target URL to search for.','white'))
299
+ else: # input is a domain
300
+ write(colored(inputArgDesc + argsInput, 'magenta')+colored(' The target domain to search for.','white'))
301
+
302
+ if args.mode == 'U':
303
+ write(colored('-mode: ' + args.mode, 'magenta')+colored(' Only URLs will be retrieved for the input.','white'))
304
+ elif args.mode == 'R':
305
+ write(colored('-mode: ' + args.mode, 'magenta')+colored(' Only Responses will be downloaded for the input.','white'))
306
+ elif args.mode == 'B':
307
+ write(colored('-mode: ' + args.mode, 'magenta')+colored(' URLs will be retrieved AND Responses will be downloaded for the input.','white'))
308
+
309
+ if args.config is not None:
310
+ write(colored('-c: ' + args.config, 'magenta')+colored(' The path of the YML config file.','white'))
311
+
312
+ if args.no_subs:
313
+ write(colored('-n: ' +str(args.no_subs), 'magenta')+colored(' Sub domains are excluded in the search.','white'))
314
+ else:
315
+ write(colored('-n: ' +str(args.no_subs), 'magenta')+colored(' Sub domains are included in the search.','white'))
316
+
317
+ write(colored('-xwm: ' +str(args.xwm), 'magenta')+colored(' Whether to exclude checks for links from Wayback Machine (archive.org)','white'))
318
+ write(colored('-xcc: ' +str(args.xcc), 'magenta')+colored(' Whether to exclude checks for links from commoncrawl.org','white'))
319
+ if not args.xcc:
320
+ if args.lcc ==0 and args.lcy == 0:
321
+ write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' Search ALL Common Crawl index collections.','white'))
322
+ else:
323
+ if args.lcy == 0:
324
+ write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' The number of latest Common Crawl index collections to be searched.','white'))
325
+ else:
326
+ if args.lcc != 0:
327
+ write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' The number of latest Common Crawl index collections to be searched.','white'))
328
+ write(colored('-lcy: ' +str(args.lcy), 'magenta')+colored(' Search all Common Crawl index collections with data from year '+str(args.lcy)+' and after.','white'))
329
+ write(colored('-xav: ' +str(args.xav), 'magenta')+colored(' Whether to exclude checks for links from alienvault.com','white'))
330
+ write(colored('-xus: ' +str(args.xus), 'magenta')+colored(' Whether to exclude checks for links from urlscan.io','white'))
331
+ if URLSCAN_API_KEY == '':
332
+ write(colored('URLScan API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://urlscan.io/user/signup which will let you get more back, and quicker.','white'))
333
+ else:
334
+ write(colored('URLScan API Key: ', 'magenta')+colored(URLSCAN_API_KEY))
335
+ write(colored('-xvt: ' +str(args.xvt), 'magenta')+colored(' Whether to exclude checks for links from virustotal.com','white'))
336
+ if VIRUSTOTAL_API_KEY == '':
337
+ write(colored('VirusTotal API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://www.virustotal.com/gui/join-us which will let you get some extra URLs.','white'))
338
+ else:
339
+ write(colored('VirusTotal API Key: ', 'magenta')+colored(VIRUSTOTAL_API_KEY))
340
+
341
+ if args.mode in ['U','B']:
342
+ if args.output_urls != '':
343
+ write(colored('-oU: ' +str(args.output_urls), 'magenta')+colored(' The name of the output file for URL links.','white'))
344
+ write(colored('-ow: ' +str(args.output_overwrite), 'magenta')+colored(' Whether the URL output file will be overwritten if it already exists. If False (default), it will be appended to, and duplicates removed.','white'))
345
+ write(colored('-nlf: ' +str(args.new_links_file), 'magenta')+colored(' Whether the URL output file ".new" version will also be written. It will include only new links found for the same target on subsequent runs. This can be used for continuous monitoring of a target.','white'))
346
+
347
+ if args.mode in ['R','B']:
348
+ if args.output_responses != '':
349
+ write(colored('-oR: ' +str(args.output_responses), 'magenta')+colored(' The directory to store archived responses and index file.','white'))
350
+ if args.limit == 0:
351
+ write(colored('-l: ' +str(args.limit), 'magenta')+colored(' Save ALL responses found.','white'))
352
+ else:
353
+ if args.limit > 0:
354
+ write(colored('-l: ' +str(args.limit), 'magenta')+colored(' Only save the FIRST ' + str(args.limit) + ' responses found.','white'))
355
+ else:
356
+ write(colored('-l: ' +str(args.limit), 'magenta')+colored(' Only save the LAST ' + str(abs(args.limit)) + ' responses found.','white'))
357
+
358
+ if args.from_date is not None:
359
+ write(colored('-from: ' +str(args.from_date), 'magenta')+colored(' The date/time to get responses from.','white'))
360
+ if args.to_date is not None:
361
+ write(colored('-to: ' +str(args.to_date), 'magenta')+colored(' The date/time to get responses up to.','white'))
362
+
363
+ if args.capture_interval == 'h':
364
+ write(colored('-ci: ' +args.capture_interval, 'magenta')+colored(' Get at most 1 archived response per hour from Wayback Machine (archive.org)','white'))
365
+ elif args.capture_interval == 'd':
366
+ write(colored('-ci: ' +args.capture_interval, 'magenta')+colored(' Get at most 1 archived response per day from Wayback Machine (archive.org)','white'))
367
+ elif args.capture_interval == 'm':
368
+ write(colored('-ci: ' +args.capture_interval, 'magenta')+colored(' Get at most 1 archived response per month from Wayback Machine (archive.org)','white'))
369
+ elif args.capture_interval == 'none':
370
+ write(colored('-ci: ' +args.capture_interval, 'magenta')+colored(' There will not be any filtering based on the capture interval.','white'))
371
+
372
+ if args.url_filename:
373
+ write(colored('-url-filename: ' +str(args.url_filename), 'magenta')+colored(' The filenames of downloaded responses will be set to the URL rather than the hash value of the response.','white'))
374
+
375
+ write(colored('-oijs: '+str(args.output_inline_js), 'magenta')+colored(' Whether the combined JS of all responses will be written to one or more files.','white'))
376
+
377
+ write(colored('-f: ' +str(args.filter_responses_only), 'magenta')+colored(' If True, the initial links from the Wayback Machine will not be filtered, only the responses that are downloaded will be filtered. It may be useful to still see all available paths even if you don\'t want to check the file for content.','white'))
378
+ if args.keywords_only is not None and args.keywords_only != '#CONFIG':
379
+ write(colored('-ko: ' +str(args.keywords_only), 'magenta')+colored(' Only get results that match the given Regex.','white'))
380
+
381
+ write(colored('-lr: ' +str(args.limit_requests), 'magenta')+colored(' The limit of requests made per source when getting links. A value of 0 (Zero) means no limit is applied.','white'))
382
+ if args.mc:
383
+ write(colored('-mc: ' +str(args.mc), 'magenta')+colored(' Only retrieve URLs and Responses that match these HTTP Status codes.','white'))
384
+ else:
385
+ if args.fc:
386
+ write(colored('-fc: ' +str(args.fc), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these HTTP Status codes.','white'))
387
+ write(colored('MIME Type exclusions: ', 'magenta')+colored(FILTER_MIME))
388
+ if not args.mc and args.fc:
389
+ write(colored('Response Code exclusions: ', 'magenta')+colored(FILTER_CODE))
390
+ write(colored('Response URL exclusions: ', 'magenta')+colored(FILTER_URL))
391
+ if args.keywords_only and args.keywords_only == '#CONFIG':
392
+ if FILTER_KEYWORDS == '':
393
+ write(colored('Keywords only: ', 'magenta')+colored('It looks like no keywords have been set in config.yml file.','red'))
394
+ else:
395
+ write(colored('Keywords only: ', 'magenta')+colored(FILTER_KEYWORDS))
396
+
397
+ if args.notify_discord:
398
+ if WEBHOOK_DISCORD == '' or WEBHOOK_DISCORD == 'YOUR_WEBHOOK':
399
+ write(colored('Discord Webhook: ', 'magenta')+colored('It looks like no Discord webhook has been set in config.yml file.','red'))
400
+ else:
401
+ write(colored('Discord Webhook: ', 'magenta')+colored(WEBHOOK_DISCORD))
402
+
403
+ write(colored('Default Output Directory: ', 'magenta')+colored(str(DEFAULT_OUTPUT_DIR)))
404
+
405
+ if args.regex_after is not None:
406
+ write(colored('-ra: ' + args.regex_after, 'magenta')+colored(' RegEx for filtering purposes against found links from all sources of URLs AND responses downloaded. Only positive matches will be output.','white'))
407
+ if args.mode in ['R','B']:
408
+ write(colored('-t: ' + str(args.timeout), 'magenta')+colored(' The number of seconds to wait for an archived response.','white'))
409
+ if args.mode in ['R','B'] or (args.mode == 'U' and not args.xcc):
410
+ write(colored('-p: ' + str(args.processes), 'magenta')+colored(' The number of parallel requests made.','white'))
411
+ write(colored('-r: ' + str(args.retries), 'magenta')+colored(' The number of retries for requests that get connection error or rate limited.','white'))
412
+
413
+ if not args.xwm:
414
+ write(colored('-wrlr: ' + str(args.wayback_rate_limit_retry), 'magenta')+colored(' The number of minutes to wait for a rate limit pause on Wayback Machine (archive.org) instead of stopping with a 429 error.','white'))
415
+ if not args.xus:
416
+ write(colored('-urlr: ' + str(args.urlscan_rate_limit_retry), 'magenta')+colored(' The number of minutes to wait for a rate limit pause on URLScan.io instead of stopping with a 429 error.','white'))
417
+
418
+ write()
419
+
420
+ except Exception as e:
421
+ writerr(colored('ERROR showOptions: ' + str(e), 'red'))
422
+
423
+ def getConfig():
424
+ """
425
+ Try to get the values from the config file, otherwise use the defaults
426
+ """
427
+ global FILTER_CODE, FILTER_MIME, FILTER_URL, FILTER_KEYWORDS, URLSCAN_API_KEY, VIRUSTOTAL_API_KEY, CONTINUE_RESPONSES_IF_PIPED, subs, path, waymorePath, inputIsDomainANDPath, HTTP_ADAPTER, HTTP_ADAPTER_CC, argsInput, terminalWidth, MATCH_CODE, WEBHOOK_DISCORD, DEFAULT_OUTPUT_DIR
428
+ try:
429
+
430
+ # Set terminal width
431
+ try:
432
+ terminalWidth = os.get_terminal_size().columns
433
+ except:
434
+ terminalWidth = 135
435
+
436
+ # If the input doesn't have a / then assume it is a domain rather than a domain AND path
437
+ if str(argsInput).find('/') < 0:
438
+ path = '/*'
439
+ inputIsDomainANDPath = False
440
+ else:
441
+ # If there is only one / and is the last character, remove it
442
+ if str(argsInput).count('/') == 1 and str(argsInput)[-1:] == '/':
443
+ argsInput = argsInput.replace('/','')
444
+ path = '/*'
445
+ inputIsDomainANDPath = False
446
+ else:
447
+ path = '*'
448
+ inputIsDomainANDPath = True
449
+
450
+ # If the -no-subs argument was passed, don't include subs
451
+ # Also, if a path is passed, the subs will not be used
452
+ if args.no_subs or inputIsDomainANDPath:
453
+ subs = ''
454
+
455
+ # Set up an HTTPAdaptor for retry strategy when making requests
456
+ try:
457
+ retry= Retry(
458
+ total=args.retries,
459
+ backoff_factor=1.1,
460
+ status_forcelist=[429, 500, 502, 503, 504],
461
+ raise_on_status=False,
462
+ respect_retry_after_header=False
463
+ )
464
+ HTTP_ADAPTER = HTTPAdapter(max_retries=retry)
465
+ except Exception as e:
466
+ writerr(colored('ERROR getConfig 2: ' + str(e), 'red'))
467
+
468
+ # Set up an HTTPAdaptor for retry strategy for Common Crawl when making requests
469
+ try:
470
+ retry= Retry(
471
+ total=args.retries+20,
472
+ backoff_factor=1.1,
473
+ status_forcelist=[503],
474
+ raise_on_status=False,
475
+ respect_retry_after_header=False
476
+ )
477
+ HTTP_ADAPTER_CC = HTTPAdapter(max_retries=retry)
478
+ except Exception as e:
479
+ writerr(colored('ERROR getConfig 3: ' + str(e), 'red'))
480
+
481
+ # Try to get the config file values
482
+ useDefaults = False
483
+ try:
484
+ # Get the path of the config file. If -c / --config argument is not passed, then it defaults to config.yml in the same directory as the run file
485
+ waymorePath = (
486
+ Path(os.path.join(os.getenv('APPDATA', ''), 'waymore')) if os.name == 'nt'
487
+ else Path(os.path.join(os.path.expanduser("~"), ".config", "waymore")) if os.name == 'posix'
488
+ else Path(os.path.join(os.path.expanduser("~"), "Library", "Application Support", "waymore")) if os.name == 'darwin'
489
+ else None
490
+ )
491
+ waymorePath.absolute
492
+ if args.config is None:
493
+ if waymorePath == '':
494
+ configPath = 'config.yml'
495
+ else:
496
+ configPath = Path(waymorePath / 'config.yml')
497
+ else:
498
+ configPath = Path(args.config)
499
+ config = yaml.safe_load(open(configPath))
500
+ try:
501
+ FILTER_URL = config.get('FILTER_URL')
502
+ if str(FILTER_URL) == 'None':
503
+ writerr(colored('No value for "FILTER_URL" in config.yml - default set', 'yellow'))
504
+ FILTER_URL = ''
505
+ except Exception as e:
506
+ writerr(colored('Unable to read "FILTER_URL" from config.yml - default set', 'red'))
507
+ FILTER_URL = DEFAULT_FILTER_URL
508
+
509
+ try:
510
+ FILTER_MIME = config.get('FILTER_MIME')
511
+ if str(FILTER_MIME) == 'None':
512
+ writerr(colored('No value for "FILTER_MIME" in config.yml - default set', 'yellow'))
513
+ FILTER_MIME = ''
514
+ except Exception as e:
515
+ writerr(colored('Unable to read "FILTER_MIME" from config.yml - default set', 'red'))
516
+ FILTER_MIME = DEFAULT_FILTER_MIME
517
+
518
+ # If the argument -fc was passed, don't try to get from the config
519
+ if args.fc:
520
+ FILTER_CODE = args.fc
521
+ else:
522
+ try:
523
+ FILTER_CODE = str(config.get('FILTER_CODE'))
524
+ if str(FILTER_CODE) == 'None':
525
+ writerr(colored('No value for "FILTER_CODE" in config.yml - default set', 'yellow'))
526
+ FILTER_CODE = ''
527
+ except Exception as e:
528
+ writerr(colored('Unable to read "FILTER_CODE" from config.yml - default set', 'red'))
529
+ FILTER_CODE = DEFAULT_FILTER_CODE
530
+
531
+ # Set the match codes if they were passed
532
+ if args.mc:
533
+ MATCH_CODE = args.mc
534
+
535
+ try:
536
+ URLSCAN_API_KEY = config.get('URLSCAN_API_KEY')
537
+ if str(URLSCAN_API_KEY) == 'None':
538
+ if not args.xus:
539
+ writerr(colored('No value for "URLSCAN_API_KEY" in config.yml - consider adding (you can get a FREE api key at urlscan.io)', 'yellow'))
540
+ URLSCAN_API_KEY = ''
541
+ except Exception as e:
542
+ writerr(colored('Unable to read "URLSCAN_API_KEY" from config.yml - consider adding (you can get a FREE api key at urlscan.io)', 'red'))
543
+ URLSCAN_API_KEY = ''
544
+
545
+ try:
546
+ VIRUSTOTAL_API_KEY = config.get('VIRUSTOTAL_API_KEY')
547
+ if str(VIRUSTOTAL_API_KEY) == 'None':
548
+ if not args.xvt:
549
+ writerr(colored('No value for "VIRUSTOTAL_API_KEY" in config.yml - consider adding (you can get a FREE api key at virustotal.com)', 'yellow'))
550
+ VIRUSTOTAL_API_KEY = ''
551
+ except Exception as e:
552
+ writerr(colored('Unable to read "VIRUSTOTAL_API_KEY" from config.yml - consider adding (you can get a FREE api key at virustotal.com)', 'red'))
553
+ VIRUSTOTAL_API_KEY = ''
554
+
555
+ try:
556
+ FILTER_KEYWORDS = config.get('FILTER_KEYWORDS')
557
+ if str(FILTER_KEYWORDS) == 'None':
558
+ writerr(colored('No value for "FILTER_KEYWORDS" in config.yml - default set', 'yellow'))
559
+ FILTER_KEYWORDS = ''
560
+ except Exception as e:
561
+ writerr(colored('Unable to read "FILTER_KEYWORDS" from config.yml - default set', 'red'))
562
+ FILTER_KEYWORDS = ''
563
+
564
+ try:
565
+ CONTINUE_RESPONSES_IF_PIPED = config.get('CONTINUE_RESPONSES_IF_PIPED')
566
+ if str(CONTINUE_RESPONSES_IF_PIPED) == 'None':
567
+ writerr(colored('No value for "CONTINUE_RESPONSES_IF_PIPED" in config.yml - default set', 'yellow'))
568
+ CONTINUE_RESPONSES_IF_PIPED = True
569
+ except Exception as e:
570
+ writerr(colored('Unable to read "CONTINUE_RESPONSES_IF_PIPED" from config.yml - default set', 'red'))
571
+ CONTINUE_RESPONSES_IF_PIPED = True
572
+
573
+ if args.notify_discord:
574
+ try:
575
+ WEBHOOK_DISCORD = config.get('WEBHOOK_DISCORD')
576
+ if str(WEBHOOK_DISCORD) == 'None' or str(WEBHOOK_DISCORD) == 'YOUR_WEBHOOK':
577
+ writerr(colored('No value for "WEBHOOK_DISCORD" in config.yml - default set', 'yellow'))
578
+ WEBHOOK_DISCORD = ''
579
+ except Exception as e:
580
+ writerr(colored('Unable to read "WEBHOOK_DISCORD" from config.yml - default set', 'red'))
581
+ WEBHOOK_DISCORD = ''
582
+
583
+ try:
584
+ DEFAULT_OUTPUT_DIR = config.get('DEFAULT_OUTPUT_DIR')
585
+ if str(DEFAULT_OUTPUT_DIR) == 'None' or str(DEFAULT_OUTPUT_DIR) == '':
586
+ DEFAULT_OUTPUT_DIR = os.path.expanduser(str(waymorePath))
587
+ else:
588
+ # Test if DEFAULT_OUTPUT_DIR is a valid directory
589
+ if not os.path.isdir(DEFAULT_OUTPUT_DIR):
590
+ writerr(colored('The "DEFAULT_OUTPUT_DIR" of "'+str(DEFAULT_OUTPUT_DIR)+'" is not a valid directory. Using "'+str(waymorePath)+'" instead.', 'yellow'))
591
+ DEFAULT_OUTPUT_DIR = os.path.expanduser(str(waymorePath))
592
+ else:
593
+ DEFAULT_OUTPUT_DIR = os.path.expanduser(DEFAULT_OUTPUT_DIR)
594
+ except Exception as e:
595
+ writerr(colored('Unable to read "DEFAULT_OUTPUT_DIR" from config.yml - default set', 'red'))
596
+ DEFAULT_OUTPUT_DIR = waymorePath
597
+
598
+ except yaml.YAMLError as e: # A scan error occurred reading the file
599
+ useDefaults = True
600
+ if args.config is None:
601
+ writerr(colored('WARNING: There seems to be a formatting error in "config.yml", so using default values', 'yellow'))
602
+ else:
603
+ writerr(colored('WARNING: There seems to be a formatting error in "' + args.config + '", so using default values', 'yellow'))
604
+
605
+ except FileNotFoundError as e: # The config file wasn't found
606
+ useDefaults = True
607
+ if args.config is None:
608
+ writerr(colored('WARNING: Cannot find file "config.yml", so using default values', 'yellow'))
609
+ else:
610
+ writerr(colored('WARNING: Cannot find file "' + args.config + '", so using default values', 'yellow'))
611
+
612
+ except Exception as e: # Another error occurred
613
+ useDefaults = True
614
+ if args.config is None:
615
+ writerr(colored('WARNING: Cannot read file "config.yml", so using default values. The following error occurred: ' + str(e), 'yellow'))
616
+ else:
617
+ writerr(colored('WARNING: Cannot read file "' + args.config + '", so using default values. The following error occurred: ' + str(e), 'yellow'))
618
+
619
+ # Use defaults if required
620
+ if useDefaults:
621
+ FILTER_URL = DEFAULT_FILTER_URL
622
+ FILTER_MIME = DEFAULT_FILTER_MIME
623
+ FILTER_CODE = DEFAULT_FILTER_CODE
624
+ URLSCAN_API_KEY = ''
625
+ VIRUSTOTAL_API_KEY = ''
626
+ FILTER_KEYWORDS = ''
627
+ CONTINUE_RESPONSES_IF_PIPED = True
628
+ WEBHOOK_DISCORD = ''
629
+ DEFAULT_OUTPUT_DIR = os.path.expanduser('~/.config/waymore')
630
+ outputInlineJSDir = DEFAULT_OUTPUT_DIR
631
+
632
+ except Exception as e:
633
+ writerr(colored('ERROR getConfig 1: ' + str(e), 'red'))
634
+
635
+ # Print iterations progress - copied from https://stackoverflow.com/questions/3173320/text-progress-bar-in-terminal-with-block-characters?noredirect=1&lq=1
636
+ def printProgressBar(
637
+ iteration,
638
+ total,
639
+ prefix="",
640
+ suffix="",
641
+ decimals=1,
642
+ length=100,
643
+ fill="█",
644
+ printEnd="\r",
645
+ ):
646
+ """
647
+ Call in a loop to create terminal progress bar
648
+ @params:
649
+ iteration - Required : current iteration (Int)
650
+ total - Required : total iterations (Int)
651
+ prefix - Optional : prefix string (Str)
652
+ suffix - Optional : suffix string (Str)
653
+ decimals - Optional : positive number of decimals in percent complete (Int)
654
+ length - Optional : character length of bar (Int)
655
+ fill - Optional : bar fill character (Str)
656
+ printEnd - Optional : end character (e.g. "\r", "\r\n") (Str)
657
+ """
658
+ try:
659
+ percent = ("{0:." + str(decimals) + "f}").format(
660
+ 100 * (iteration / float(total))
661
+ ).rjust(5)
662
+ filledLength = int(length * iteration // total)
663
+ bar = fill * filledLength + "-" * (length - filledLength)
664
+ # If the program is not piped with something else, write to stdout, otherwise write to stderr
665
+ if sys.stdout.isatty():
666
+ write(colored(f"\r{prefix} |{bar}| {percent}% {suffix}\r", "green"))
667
+ else:
668
+ writerr(colored(f"\r{prefix} |{bar}| {percent}% {suffix}\r", "green"))
669
+ # Print New Line on Complete
670
+ if iteration == total:
671
+ # If the program is not piped with something else, write to stdout, otherwise write to stderr
672
+ if sys.stdout.isatty():
673
+ write()
674
+ else:
675
+ writerr()
676
+ except Exception as e:
677
+ if verbose():
678
+ writerr(colored("ERROR printProgressBar: " + str(e), "red"))
679
+
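+ # Editorial usage sketch (not part of the published source): the bar is updated inside a loop, e.g.
+ #   for i in range(1, total + 1):
+ #       printProgressBar(i, total, prefix='Downloading:', suffix='Complete', length=40)
+ # where 'total' is the number of items being processed.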
680
+ def filehash(text):
681
+ """
682
+ Generate a hash value for the passed string. This is used for the file name of a downloaded archived response
683
+ """
684
+ hash=0
685
+ for ch in text:
686
+ hash = (hash*281 ^ ord(ch)*997) & 0xFFFFFFFFFFF
687
+ return str(hash)
688
+
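+ # Editorial illustration (not part of the published source): the hash is deterministic, so the
+ # same response body always maps to the same file name, e.g.
+ #   filehash('a')   # -> '96709'  (0*281 ^ ord('a')*997 = 96709)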
689
+ class WayBackException(Exception):
690
+ """
691
+ A custom exception to raise if archive.org respond with specific text in the response that indicate there is a problem on their side
692
+ """
693
+ def __init__(self):
694
+ message = f"WayBackException"
695
+ super().__init__(message)
696
+
697
+ def fixArchiveOrgUrl(url):
698
+ """
699
+ Sometimes archive.org returns a URL that has %0A at the end followed by other characters. Requesting the archive URL as-is will fail, but removing everything from the %0A (newline) onwards succeeds, so the trailing characters don't appear to be intentional. In this case, strip anything from %0A onwards from the URL
700
+ """
701
+ newline = url.find('%0A')
702
+ if newline > 0:
703
+ url = url[0:newline]
704
+ else:
705
+ newline = url.find('%0a')
706
+ if newline > 0:
707
+ url = url[0:newline]
708
+ return url
709
+
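+ # Editorial usage sketch (not part of the published source), using a hypothetical snapshot URL:
+ #   fixArchiveOrgUrl('20230101000000/https://example.com/page%0Agarbage')
+ #   # -> '20230101000000/https://example.com/page'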
710
+ # Add a link to the linksFound collection
711
+ def linksFoundAdd(link):
712
+ global linksFound, argsInput, argsInputHostname
713
+ # If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
714
+ try:
715
+ if inputIsDomainANDPath:
716
+ checkInput = argsInput
717
+ else:
718
+ checkInput = argsInputHostname
719
+ # Don't write it if the link does not contain the requested domain (this can sometimes happen)
720
+ if link.find(checkInput) >= 0:
721
+ parsed = urlparse(link.strip())
722
+ if parsed.netloc.find(':80') >= 0 or parsed.netloc.find(':443') >= 0:
723
+ newNetloc = parsed.netloc.split(':')[0]
724
+ parsed = parsed._replace(netloc=newNetloc).geturl()
725
+ linksFound.add(parsed)
726
+ except:
727
+ linksFound.add(link)
728
+
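+ # Editorial illustration (not part of the published source): a hypothetical link such as
+ # 'http://example.com:80/login' is stored as 'http://example.com/login', while a link that
+ # does not contain the requested domain is not added at all.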
729
+ def processArchiveUrl(url):
730
+ """
731
+ Get the passed web archive response
732
+ """
733
+ global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, continueRespFile, REGEX_404
734
+ try:
735
+ if stopProgram is None:
736
+
737
+ archiveUrl = 'https://web.archive.org/web/' + fixArchiveOrgUrl(url)
738
+ hashValue = ''
739
+
740
+ # Get memory usage every 100 responses
741
+ if (successCount + failureCount) % 100 == 0:
742
+ try:
743
+ getMemory()
744
+ except:
745
+ pass
746
+
747
+ # Make a request to the web archive
748
+ try:
749
+ try:
750
+ # Choose a random user agent string to use for any requests
751
+ userAgent = random.choice(USER_AGENT)
752
+
753
+ session = requests.Session()
754
+ session.mount('https://', HTTP_ADAPTER)
755
+ session.mount('http://', HTTP_ADAPTER)
756
+ resp = session.get(url = archiveUrl, headers={"User-Agent":userAgent}, allow_redirects = True)
757
+ archiveHtml = str(resp.text)
758
+ try:
759
+ contentType = resp.headers.get("Content-Type").split(';')[0].lower()
760
+ except:
761
+ contentType = ''
762
+
763
+ # Only create a file if there is a response
764
+ if len(archiveHtml) != 0:
765
+
766
+ # If the FILTER_CODE includes 404, then only process it if it doesn't seem to be a custom 404 page
767
+ if '404' in FILTER_CODE and not re.findall(REGEX_404, archiveHtml, re.DOTALL|re.IGNORECASE):
768
+
769
+ # Add the URL as a comment at the start of the response
770
+ if args.url_filename:
771
+ archiveHtml = '/* Original URL: ' + archiveUrl + ' */\n' + archiveHtml
772
+
773
+ # Remove all web archive references in the response
774
+ archiveHtml = re.sub(r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
775
+ archiveHtml = re.sub(r'\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
776
+ archiveHtml = re.sub(r'\<script\>window\.RufflePlayer[^\<]*\<\/script\>','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
777
+ archiveHtml = re.sub(r'\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
778
+ archiveHtml = re.sub(r'(}\n)?(\/\*|<!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|-->)','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
779
+ archiveHtml = re.sub(r'var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
780
+ archiveHtml = re.sub(r'(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)','',archiveHtml,1,flags=re.DOTALL|re.IGNORECASE)
781
+ archiveHtml = re.sub(r'((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/','',archiveHtml,flags=re.IGNORECASE)
782
+ archiveHtml = re.sub(r'((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/','',archiveHtml,flags=re.IGNORECASE)
783
+ archiveHtml = re.sub(r'((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F','',archiveHtml,flags=re.IGNORECASE)
784
+ archiveHtml = re.sub(r'((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F','',archiveHtml,flags=re.IGNORECASE)
785
+ archiveHtml = re.sub(r'\<script type=\"text\/javascript\">\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>','',archiveHtml,flags=re.IGNORECASE)
786
+ archiveHtml = re.sub(r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>','',archiveHtml,flags=re.IGNORECASE)
787
+ archiveHtml = re.sub(r'\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>','',archiveHtml,flags=re.IGNORECASE)
788
+ archiveHtml = re.sub(r'\<\!-- End Wayback Rewrite JS Include --\>','',archiveHtml,flags=re.IGNORECASE)
789
+
790
+ # If there is a specific Wayback error in the response, raise an exception
791
+ if archiveHtml.lower().find('wayback machine has not archived that url') > 0 or archiveHtml.lower().find('snapshot cannot be displayed due to an internal error') > 0:
792
+ raise WayBackException
793
+
794
+ # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
795
+ if args.url_filename:
796
+ fileName = url.replace('/','-').replace(':','')
797
+ fileName = fileName[0:254]
798
+ else:
799
+ hashValue = filehash(archiveHtml)
800
+ fileName = hashValue
801
+
802
+ # Determine extension of file from the content-type using the mimetypes library
803
+ extension = ''
804
+ try:
805
+ # Get path extension
806
+ targetUrl = 'https://' + url.split("://")[1]
807
+ parsed = urlparse(targetUrl.strip())
808
+ path = parsed.path
809
+ extension = path[path.rindex('.')+1:]
810
+ except:
811
+ pass
812
+
813
+ # If the extension is blank, numeric, longer than 4 characters or not alphanumeric - then it's not a valid file type so check contentType
814
+ if extension == '' or extension.isnumeric() or not extension.isalnum() or len(extension) > 4:
815
+ # Determine the extension from the content type
816
+ try:
817
+ if contentType != '':
818
+ extension = contentType.split('/')[1].replace('x-','')
819
+ if extension == '':
820
+ extension = contentType.lower()
821
+ except:
822
+ pass
823
+ if 'html' in extension:
824
+ extension = 'html'
825
+ elif 'javascript' in extension:
826
+ extension = 'js'
827
+ elif 'json' in extension:
828
+ extension = 'json'
829
+ elif 'css' in extension:
830
+ extension = 'css'
831
+ elif 'pdf' in extension:
832
+ extension = 'pdf'
833
+ elif 'plain' == extension:
834
+ extension = 'txt'
835
+
836
+ # If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
837
+ if extension == '':
838
+ if archiveHtml.lower().strip().endswith('</html>') or archiveHtml.lower().strip().startswith('<!doctype html') or archiveHtml.lower().strip().startswith('<html'):
839
+ extension = 'html'
840
+ else:
841
+ extension = 'unknown'
842
+
843
+ fileName = fileName + '.' + extension
844
+
845
+ # If -oR / --output-responses was passed then add the file to that directory,
846
+ # else add to the default "results/{target.domain}" directory under the default output directory
847
+ if args.output_responses != '':
848
+ filePath = args.output_responses + '/' + f'{fileName}'
849
+ else:
850
+ filePath = (DEFAULT_OUTPUT_DIR + '/results/' + str(argsInput).replace('/','-') + '/' + f'{fileName}')
851
+
852
+ # Write the file
853
+ try:
854
+ responseFile = open(filePath, 'w', encoding='utf8')
855
+ responseFile.write(archiveHtml)
856
+ responseFile.close()
857
+ fileCount = fileCount + 1
858
+ except Exception as e:
859
+ writerr(colored(getSPACER('[ ERR ] Failed to write file ' + filePath + ': '+ str(e)), 'red'))
860
+
861
+ # Write the hash value and URL to the index file
862
+ if not args.url_filename:
863
+ try:
864
+ timestamp = str(datetime.now())
865
+ indexFile.write(hashValue+','+archiveUrl+' ,'+timestamp+'\n')
866
+ indexFile.flush()
867
+ except Exception as e:
868
+ writerr(colored(getSPACER('[ ERR ] Failed to write to index.txt for "' + archiveUrl + '": '+ str(e)), 'red'))
869
+
870
+ # FOR DEBUGGING PURPOSES
871
+ try:
872
+ if os.environ.get('USER') == 'xnl':
873
+ debugText = ''
874
+ if archiveHtml.lower().find('archive.org') > 0:
875
+ debugText = 'ARCHIVE.ORG'
876
+ elif archiveHtml.lower().find('internet archive') > 0:
877
+ debugText = 'INTERNET ARCHIVE'
878
+ elif archiveHtml.lower().find('wombat') > 0:
879
+ debugText = 'WOMBAT (JS)'
880
+ if debugText != '':
881
+ writerr(colored(getSPACER('"' + fileName + '" CONTAINS ' + debugText + ' - CHECK ITS A VALID REFERENCE'), 'yellow'))
882
+ except:
883
+ pass
884
+
885
+ successCount = successCount + 1
886
+
887
+ except WayBackException as wbe:
888
+ failureCount = failureCount + 1
889
+ if verbose():
890
+ writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) returned a problem for "' + archiveUrl + '"'), 'red'))
891
+ except ConnectionError as ce:
892
+ failureCount = failureCount + 1
893
+ if verbose():
894
+ writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error for "' + archiveUrl + '"'), 'red'))
895
+ except Exception as e:
896
+ failureCount = failureCount + 1
897
+ if verbose():
898
+ try:
899
+ writerr(colored(getSPACER('[ ' + str(resp.status_code) +' ] Failed to get response for "' + archiveUrl + '"'), 'red'))
900
+ except:
901
+ writerr(colored(getSPACER('[ ERR ] Failed to get response for "' + archiveUrl + '": '+ str(e)), 'red'))
902
+
903
+ # Show progress bar
904
+ fillTest = (successCount + failureCount) % 2
905
+ fillChar = "o"
906
+ if fillTest == 0:
907
+ fillChar = "O"
908
+ suffix="Complete "
909
+ # Show memory usage if -v option chosen, and check memory every 25 responses (or if it's the last)
910
+ if (successCount + failureCount) % 25 == 1 or (successCount + failureCount) == totalResponses:
911
+ try:
912
+ getMemory()
913
+ if verbose():
914
+ suffix = (
915
+ "Complete (Mem Usage "
916
+ + humanReadableSize(currentMemUsage)
917
+ + ", Total Mem "
918
+ + str(currentMemPercent)
919
+ + "%) "
920
+ )
921
+ except:
922
+ if verbose():
923
+ suffix = 'Complete (To show mem use, run "pip install psutil")'
924
+ printProgressBar(
925
+ successCount + failureCount,
926
+ totalResponses,
927
+ prefix="Downloading " + str(totalResponses) + " responses:",
928
+ suffix=suffix,
929
+ length=getProgressBarLength(),
930
+ fill=fillChar
931
+ )
932
+
933
+ # Write the total count to the continueResp.tmp file
934
+ try:
935
+ continueRespFile.seek(0)
936
+ continueRespFile.write(str(successCount + failureCount)+'\n')
937
+ except Exception as e:
938
+ if verbose():
939
+ writerr(colored(getSPACER('ERROR processArchiveUrl 2: ' + str(e)), 'red'))
940
+
941
+ except Exception as e:
942
+ if verbose():
943
+ writerr(colored(getSPACER('Error for "'+url+'": ' + str(e)), 'red'))
944
+
945
+ except Exception as e:
946
+ writerr(colored('ERROR processArchiveUrl 1: ' + str(e), 'red'))
947
+
948
+ def processURLOutput():
949
+ """
950
+ Show results of the URL output, i.e. getting URLs from archive.org and commoncrawl.org and write results to file
951
+ """
952
+ global linksFound, subs, path, argsInput, checkWayback, checkCommonCrawl, checkAlienVault, checkURLScan, checkVirusTotal, DEFAULT_OUTPUT_DIR
953
+
954
+ try:
955
+
956
+ if args.check_only:
957
+ totalRequests = checkWayback + checkCommonCrawl + checkAlienVault + checkURLScan + checkVirusTotal
958
+ minutes = totalRequests*1 // 60
959
+ hours = minutes // 60
960
+ days = hours // 24
961
+ if minutes < 5:
962
+ write(colored('\n-> Getting URLs (e.g. at 1 req/sec) should be quite quick!','green'))
963
+ elif hours < 2:
964
+ write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(minutes)+' minutes.','green'))
965
+ elif hours < 6:
966
+ write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(hours)+' hours.','green'))
967
+ elif hours < 24:
968
+ write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(hours)+' hours.','yellow'))
969
+ elif days < 7:
970
+ write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(days)+' days. Consider using arguments -lr, -ci, -from and -to wisely!','red'))
971
+ else:
972
+ write(colored('\n-> Getting URLs (e.g. at 1 req/sec) could take more than '+str(days)+' days!!! Consider using arguments -lr, -ci, -from and -to wisely!','red'))
973
+ write('')
974
+ else:
975
+ linkCount = len(linksFound)
976
+ write(getSPACER(colored('Links found for ' + subs + argsInput + ': ', 'cyan')+colored(str(linkCount) + ' 🤘','white'))+'\n')
977
+
978
+ # If -oU / --output-urls was passed then use that file name, else use "waymore.txt" in the default output directory
979
+ if args.output_urls == '':
980
+ # Create 'results' and domain directory if needed
981
+ createDirs()
982
+
983
+ # If -oR / --output-responses was passed then set the path to that, otherwise it will be the "results/{target.domain}" path
984
+ if args.output_responses != '':
985
+ fullPath = args.output_responses + '/'
986
+ else:
987
+ fullPath = str(DEFAULT_OUTPUT_DIR) + '/results/' + str(argsInput).replace('/','-') + '/'
988
+ filename = fullPath + 'waymore.txt'
989
+ filenameNew = fullPath + 'waymore.new'
990
+ filenameOld = fullPath + 'waymore.old'
991
+ else:
992
+ filename = args.output_urls
993
+ filenameNew = filename + '.new'
994
+ filenameOld = filename + '.old'
995
+ # If the filename has any "/" in it, remove the contents after the last one to just get the path and create the directories if necessary
996
+ try:
997
+ if filename.find('/') > 0:
998
+ f = os.path.basename(filename)
999
+ p = filename[:-(len(f))-1]
1000
+ if p != '' and not os.path.exists(p):
1001
+ os.makedirs(p)
1002
+ except Exception as e:
1003
+ if verbose():
1004
+ writerr(colored('ERROR processURLOutput 6: ' + str(e), 'red'))
1005
+
1006
+ # If the -ow / --output_overwrite argument was passed and the file exists already, get the contents of the file to include
1007
+ appendedUrls = False
1008
+ if not args.output_overwrite:
1009
+ try:
1010
+ with open(filename,'r') as existingLinks:
1011
+ for link in existingLinks.readlines():
1012
+ linksFound.add(link.strip())
1013
+ appendedUrls = True
1014
+ except Exception as e:
1015
+ pass
1016
+
1017
+ # If the -nlf / --new-links-file argument is passed, rename the old links file if it exists
1018
+ try:
1019
+ if args.new_links_file:
1020
+ if os.path.exists(filename):
1021
+ os.rename(filename, filenameOld)
1022
+ except Exception as e:
1023
+ if verbose():
1024
+ writerr(colored('ERROR processURLOutput 5: ' + str(e), 'red'))
1025
+
1026
+ try:
1027
+ # Open the output file
1028
+ outFile = open(filename,'w')
1029
+ except Exception as e:
1030
+ if verbose():
1031
+ writerr(colored('ERROR processURLOutput 2: ' + str(e), 'red'))
1032
+ sys.exit()
1033
+
1034
+ # Go through all links, and output what was found
1035
+ # If the -ra --regex-after was passed then only output if it matches
1036
+ outputCount = 0
1037
+ for link in linksFound:
1038
+ try:
1039
+ if args.regex_after is None or re.search(args.regex_after, link, flags=re.IGNORECASE):
1040
+ outFile.write(link + "\n")
1041
+ # If the tool is piped to pass output to something else, then write the link
1042
+ if not sys.stdout.isatty():
1043
+ write(link,True)
1044
+ outputCount = outputCount + 1
1045
+ except Exception as e:
1046
+ if verbose():
1047
+ writerr(colored('ERROR processURLOutput 3: ' + str(e), 'red'))
1048
+
1049
+ # If there are less links output because of filters, show the new total
1050
+ if args.regex_after is not None and linkCount > 0 and outputCount < linkCount:
1051
+ write(colored('Links found after applying filter "' + args.regex_after + '": ','cyan')+colored(str(outputCount) + ' 🤘\n','white'))
1052
+
1053
+ # Close the output file
1054
+ try:
1055
+ outFile.close()
1056
+ except Exception as e:
1057
+ if verbose():
1058
+ writerr(colored('ERROR processURLOutput 4: ' + str(e), 'red'))
1059
+
1060
+ if verbose():
1061
+ if outputCount == 0:
1062
+ write(colored('No links were found so nothing written to file.', 'cyan'))
1063
+ else:
1064
+ if appendedUrls:
1065
+ write(
1066
+ colored('Links successfully appended to file ', 'cyan')+colored(filename,
1067
+ 'white')+colored(' and duplicates removed.','cyan'))
1068
+ else:
1069
+ write(
1070
+ colored('Links successfully written to file ', 'cyan')+colored(filename,
1071
+ 'white'))
1072
+
1073
+ try:
1074
+ # If the -nlf / --new-links-file argument is passed, create the .new file
1075
+ if args.new_links_file:
1076
+
1077
+ # If the file and .old version exists then get the difference to write to .new file
1078
+ if os.path.exists(filenameOld) and os.path.exists(filename):
1079
+
1080
+ # Get all the old links
1081
+ with open(filenameOld,'r') as oldFile:
1082
+ oldLinks=set(oldFile.readlines())
1083
+
1084
+ # Get all the new links
1085
+ with open(filename,'r') as newFile:
1086
+ newLinks=set(newFile.readlines())
1087
+
1088
+ # Create a file with most recent new links
1089
+ with open(filenameNew,'w') as newOnly:
1090
+ for line in list(newLinks-oldLinks):
1091
+ newOnly.write(line)
1092
+
1093
+ # Delete the old file
1094
+ os.remove(filenameOld)
1095
+
1096
+ except Exception as e:
1097
+ if verbose():
1098
+ writerr(colored("ERROR processURLOutput 6: " + str(e), "red"))
1099
+
1100
+ except Exception as e:
1101
+ if verbose():
1102
+ writerr(colored("ERROR processURLOutput 1: " + str(e), "red"))
1103
+
1104
+ def processResponsesOutput():
1105
+ """
1106
+ Show results of the archive responses saved
1107
+ """
1108
+ global successCount, failureCount, subs, fileCount, argsInput, DEFAULT_OUTPUT_DIR, responseOutputDirectory
1109
+ try:
1110
+
1111
+ if failureCount > 0:
1112
+ if verbose():
1113
+ write(colored('\nResponses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' failed)\n','red'))
1114
+ else:
1115
+ write(colored('\nResponses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘','white')+colored(' (' + str(failureCount) + ' failed)\n','red'))
1116
+ else:
1117
+ if verbose():
1118
+ write(colored('\nResponses saved to ','cyan')+colored(responseOutputDirectory,'white') + colored(' for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
1119
+ else:
1120
+ write(colored('\nResponses saved for ' + subs + argsInput + ': ', 'cyan')+colored(str(fileCount) +' (' +str(successCount-fileCount) + ' empty responses) 🤘\n','white'))
1121
+ except Exception as e:
1122
+ if verbose():
1123
+ writerr(colored("ERROR processResponsesOutput 1: " + str(e), "red"))
1124
+
1125
+ def validateArgProcesses(x):
1126
+ """
1127
+ Validate the -p / --processes argument
1128
+ Only allow values between 1 and 5 inclusive
1129
+ """
1130
+ x = int(x)
1131
+ if x < 1 or x > 5:
1132
+ raise argparse.ArgumentTypeError('The number of processes must be between 1 and 5. Be kind to Wayback Machine (archive.org) and commoncrawl.org! :)')
1133
+ return x
1134
+
1135
+ def stripUnwanted(url):
1136
+ """
1137
+ Strip the scheme, port number, query string and fragment from any input values if they have them
1138
+ """
1139
+ parsed = urlparse(url)
1140
+ # Strip scheme
1141
+ scheme = "%s://" % parsed.scheme
1142
+ strippedUrl = parsed.geturl().replace(scheme, '', 1)
1143
+ # Strip query string and fragment
1144
+ strippedUrl = strippedUrl.split('#')[0].split('?')[0]
1145
+ # Strip port number
1146
+ if re.search(r'^[^/]*:[0-9]+', strippedUrl):
1147
+ strippedUrl = re.sub(r':[0-9]+','', strippedUrl, 1)
1148
+ return strippedUrl
1149
+
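+ # Editorial usage sketch (not part of the published source), with a hypothetical input:
+ #   stripUnwanted('https://sub.example.com:8443/path/page.php?q=1#frag')
+ #   # -> 'sub.example.com/path/page.php'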
1150
+ def validateArgInput(x):
1151
+ """
1152
+ Validate the -i / --input argument.
1153
+ Ensure it is a domain only, or a URL, but with no scheme, query parameters or fragment
1154
+ """
1155
+ global inputValues, isInputFile
1156
+ # If the input was given through STDIN (piped from another program) then
1157
+ if x == '<stdin>':
1158
+ stdinFile = sys.stdin.readlines()
1159
+ count = 0
1160
+ for line in stdinFile:
1161
+ # Remove newline characters, and also *. if the domain starts with this
1162
+ inputValues.add(stripUnwanted(line.rstrip('\n').lstrip('*.')))
1163
+ count = count + 1
1164
+ if count > 1:
1165
+ isInputFile = True
1166
+ else:
1167
+ # Determine if a single input was given, or a file
1168
+ if os.path.isfile(x):
1169
+ isInputFile = True
1170
+ # Open file and put all values in input list
1171
+ with open(x, 'r') as inputFile:
1172
+ lines = inputFile.readlines()
1173
+ # Check if any lines start with a *. and replace without the *.
1174
+ for line in lines:
1175
+ inputValues.add(stripUnwanted(line.lstrip('*.')))
1176
+ else:
1177
+ # Just add the input value to the input list
1178
+ inputValues.add(stripUnwanted(x))
1179
+ return x
1180
+
1181
+ def validateArgStatusCodes(x):
1182
+ """
1183
+ Validate the -fc and -mc arguments
1184
+ Only allow 3 digit numbers separated by a comma
1185
+ """
1186
+ invalid = False
1187
+ codes = x.split(',')
1188
+ for code in codes:
1189
+ if len(code) != 3 or not code.isdigit():
1190
+ invalid = True
1191
+ break
1192
+ if invalid:
1193
+ raise argparse.ArgumentTypeError('Pass HTTP status codes separated by a comma')
1194
+ return x
1195
+
1196
+ def processAlienVaultPage(url):
1197
+ """
1198
+     Get URLs from a specific page of the otx.alienvault.com API for the input domain
1199
+ """
1200
+ global totalPages, linkMimes, linksFound, stopSource, argsInput
1201
+ try:
1202
+ # Get memory in case it exceeds threshold
1203
+ getMemory()
1204
+
1205
+ if not stopSource:
1206
+ try:
1207
+ # Choose a random user agent string to use for any requests
1208
+ userAgent = random.choice(USER_AGENT)
1209
+ page = url.split('page=')[1]
1210
+ session = requests.Session()
1211
+ session.mount('https://', HTTP_ADAPTER)
1212
+ session.mount('http://', HTTP_ADAPTER)
1213
+ resp = session.get(url, headers={"User-Agent":userAgent})
1214
+ except ConnectionError as ce:
1215
+                 writerr(colored(getSPACER('[ ERR ] alienvault.com connection error for page ' + page), 'red'))
1216
+ resp = None
1217
+ return
1218
+ except Exception as e:
1219
+ writerr(colored(getSPACER('[ ERR ] Error getting response for page ' + page + ' - ' + str(e)),'red'))
1220
+ resp = None
1221
+ return
1222
+ finally:
1223
+ try:
1224
+ if resp is not None:
1225
+                         # If the status is 429, stop processing Alien Vault
1226
+ if resp.status_code == 429:
1227
+ writerr(colored(getSPACER('[ 429 ] Alien Vault rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
1228
+ stopSource = True
1229
+ return
1230
+ # If the response from alienvault.com is empty then skip
1231
+ if resp.text == '' and totalPages == 0:
1232
+ if verbose():
1233
+ writerr(colored(getSPACER('[ ERR ] '+url+' gave an empty response.'),'red'))
1234
+ return
1235
+ # If a status other than 200, then stop
1236
+ if resp.status_code != 200:
1237
+ if verbose():
1238
+ writerr(colored(getSPACER('[ '+str(resp.status_code)+' ] Error for '+url),'red'))
1239
+ return
1240
+ except:
1241
+ pass
1242
+
1243
+ # Get the JSON response
1244
+ jsonResp = json.loads(resp.text.strip())
1245
+
1246
+ # Go through each URL in the list
1247
+ for urlSection in jsonResp['url_list']:
1248
+ # Get the URL
1249
+ try:
1250
+ foundUrl = urlSection['url']
1251
+ except:
1252
+ foundUrl = ''
1253
+
1254
+ # If a URL was found
1255
+ if foundUrl != '':
1256
+ # If filters are not required and subs are wanted then just add the URL to the list
1257
+ if args.filter_responses_only and not args.no_subs:
1258
+ linksFoundAdd(foundUrl)
1259
+ else:
1260
+ addLink = True
1261
+
1262
+ # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
1263
+ if args.no_subs:
1264
+ match = re.search(r'\:\/\/(www\.)?'+re.escape(argsInput), foundUrl, flags=re.IGNORECASE)
1265
+ if match is None:
1266
+ addLink = False
1267
+
1268
+                         # If the user didn't request -f / --filter-responses-only then check the HTTP code
1269
+ # Note we can't check MIME filter because it is not returned by Alien Vault API
1270
+ if addLink and not args.filter_responses_only:
1271
+ # Get the HTTP code
1272
+ try:
1273
+ httpCode = str(urlSection['httpcode'])
1274
+ except:
1275
+ httpCode = 'UNKNOWN'
1276
+
1277
+                             # Compare the HTTP code against the code exclusions and matches
1278
+ if MATCH_CODE != '':
1279
+ match = re.search(r'('+re.escape(MATCH_CODE).replace(',','|')+')', httpCode, flags=re.IGNORECASE)
1280
+ if match is None:
1281
+ addLink = False
1282
+ else:
1283
+ match = re.search(r'('+re.escape(FILTER_CODE).replace(',','|')+')', httpCode, flags=re.IGNORECASE)
1284
+ if match is not None:
1285
+ addLink = False
1286
+
1287
+ # Check the URL exclusions
1288
+ if addLink:
1289
+ match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
1290
+ if match is not None:
1291
+ addLink = False
1292
+
1293
+ # Set keywords filter if -ko argument passed
1294
+ if addLink and args.keywords_only:
1295
+ if args.keywords_only == '#CONFIG':
1296
+ match = re.search(r'('+re.escape(FILTER_KEYWORDS).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
1297
+ else:
1298
+ match = re.search(r'('+args.keywords_only+')', foundUrl, flags=re.IGNORECASE)
1299
+ if match is None:
1300
+ addLink = False
1301
+
1302
+ # Add link if it passed filters
1303
+ if addLink:
1304
+ linksFoundAdd(foundUrl)
1305
+ else:
1306
+ pass
1307
+ except Exception as e:
1308
+ if verbose():
1309
+             writerr(colored("ERROR processAlienVaultPage 1: " + str(e), "red"))
1310
+
1311
+ def getAlienVaultUrls():
1312
+ """
1313
+ Get URLs from the Alien Vault OTX, otx.alienvault.com
1314
+ """
1315
+ global linksFound, waymorePath, subs, path, stopProgram, totalPages, stopSource, argsInput, checkAlienVault, inputIsSubDomain, argsInputHostname
1316
+
1317
+ # Write the file of URL's for the passed domain/URL
1318
+ try:
1319
+ stopSource = False
1320
+ originalLinkCount = len(linksFound)
1321
+
1322
+ # Set the Alien Vault API indicator types of domain or hostname (has subdomain)
1323
+ if inputIsSubDomain:
1324
+ indicatorType = 'hostname'
1325
+ else:
1326
+ indicatorType = 'domain'
1327
+
1328
+ url = ALIENVAULT_URL.replace('{TYPE}',indicatorType).replace('{DOMAIN}',quote(argsInputHostname))+'&page='
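+         # Illustrative example only: for an input of example.com this would build
+         #   https://otx.alienvault.com/api/v1/indicators/domain/example.com/url_list?limit=500&page=
+         # while a subdomain input such as dev.example.com would use the 'hostname' indicator type instead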
1329
+
1330
+ # Get the number of pages (i.e. separate requests) that are going to be made to alienvault.com
1331
+ totalPages = 0
1332
+ try:
1333
+ if not args.check_only:
1334
+ write(colored('\rGetting the number of alienvault.com pages to search...\r','cyan'))
1335
+ # Choose a random user agent string to use for any requests
1336
+ userAgent = random.choice(USER_AGENT)
1337
+ session = requests.Session()
1338
+ session.mount('https://', HTTP_ADAPTER)
1339
+ session.mount('http://', HTTP_ADAPTER)
1340
+ resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
1341
+ except Exception as e:
1342
+ writerr(colored(getSPACER('[ ERR ] Unable to get links from alienvault.com: ' + str(e)), 'red'))
1343
+ return
1344
+
1345
+ # If the rate limit was reached end now
1346
+ if resp.status_code == 429:
1347
+ writerr(colored(getSPACER('[ 429 ] Alien Vault rate limit reached so unable to get links.'),'red'))
1348
+ return
1349
+
1350
+ if verbose():
1351
+ write(getSPACER(colored('The Alien Vault URL requested to get links: ','magenta')+colored(url,'white'))+'\n')
1352
+
1353
+ # Carry on if something was found
1354
+ if resp.text.lower().find('"error": "') < 0:
1355
+
1356
+ # Get the JSON response
1357
+ jsonResp = json.loads(resp.text.strip())
1358
+
1359
+ # Try to get the number of results
1360
+ totalUrls = jsonResp['full_size']
1361
+
1362
+ # If there are results, carry on
1363
+ if totalUrls > 0 or args.check_only:
1364
+
1365
+ # Get total pages
1366
+ totalPages = math.ceil(totalUrls / 500)
1367
+
1368
+ # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
1369
+ if args.limit_requests != 0 and totalPages > args.limit_requests:
1370
+ totalPages = args.limit_requests
1371
+
1372
+ if args.check_only:
1373
+ if totalPages == 0:
1374
+ checkAlienVault = 1
1375
+ else:
1376
+ checkAlienVault = totalPages
1377
+ write(colored('Get URLs from Alien Vault: ','cyan')+colored(str(checkAlienVault)+' requests','white'))
1378
+ else:
1379
+ # if the page number was found then display it, but otherwise we will just try to increment until we have everything
1380
+ write(colored('\rGetting links from ' + str(totalPages) + ' alienvault.com API requests (this can take a while for some domains)...\r','cyan'))
1381
+
1382
+ # Get a list of all the page URLs we need to visit
1383
+ pages = []
1384
+ for page in range(1, totalPages + 1):
1385
+ pages.append(url+str(page))
1386
+
1387
+ # Process the URLs from alien vault
1388
+ if stopProgram is None:
1389
+ p = mp.Pool(args.processes)
1390
+ p.map(processAlienVaultPage, pages)
1391
+ p.close()
1392
+ p.join()
1393
+ else:
1394
+ if verbose():
1395
+ writerr(colored(getSPACER('[ ERR ] An error was returned in the alienvault.com response.')+'\n', 'red'))
1396
+
1397
+ if not args.check_only:
1398
+ linkCount = len(linksFound) - originalLinkCount
1399
+ if args.xwm and args.xcc:
1400
+ write(getSPACER(colored('Links found on alienvault.com: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
1401
+ else:
1402
+ write(getSPACER(colored('Extra links found on alienvault.com: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
1403
+
1404
+ except Exception as e:
1405
+ writerr(colored('ERROR getAlienVaultUrls 1: ' + str(e), 'red'))
1406
+
1407
+ def processURLScanUrl(url, httpCode, mimeType):
1408
+ """
1409
+ Process a specific URL from urlscan.io to determine whether to save the link
1410
+ """
1411
+ global argsInput, argsInputHostname
1412
+
1413
+ addLink = True
1414
+
1415
+ try:
1416
+ # If filters are required then test them
1417
+ if not args.filter_responses_only:
1418
+
1419
+ # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
1420
+ if args.no_subs:
1421
+ match = re.search(r'^[A-za-z]*\:\/\/(www\.)?'+re.escape(argsInputHostname), url, flags=re.IGNORECASE)
1422
+ if match is None:
1423
+ addLink = False
1424
+
1425
+             # If the user didn't request -f / --filter-responses-only then check the HTTP code
1426
+ # Note we can't check MIME filter because it is not returned by URLScan API
1427
+ if addLink and not args.filter_responses_only:
1428
+
1429
+ # Compare the HTTP code against the Code exclusions and matches
1430
+ if MATCH_CODE != '':
1431
+ match = re.search(r'('+re.escape(MATCH_CODE).replace(',','|')+')', httpCode, flags=re.IGNORECASE)
1432
+ if match is None:
1433
+ addLink = False
1434
+ else:
1435
+ match = re.search(r'('+re.escape(FILTER_CODE).replace(',','|')+')', httpCode, flags=re.IGNORECASE)
1436
+ if match is not None:
1437
+ addLink = False
1438
+
1439
+ # Check the URL exclusions
1440
+ if addLink:
1441
+ match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', url, flags=re.IGNORECASE)
1442
+ if match is not None:
1443
+ addLink = False
1444
+
1445
+ # Set keywords filter if -ko argument passed
1446
+ if addLink and args.keywords_only:
1447
+ if args.keywords_only == '#CONFIG':
1448
+ match = re.search(r'('+re.escape(FILTER_KEYWORDS).replace(',','|')+')', url, flags=re.IGNORECASE)
1449
+ else:
1450
+ match = re.search(r'('+args.keywords_only+')', url, flags=re.IGNORECASE)
1451
+ if match is None:
1452
+ addLink = False
1453
+
1454
+ # Check the MIME exclusions
1455
+ if mimeType != '':
1456
+ match = re.search(r'('+re.escape(FILTER_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
1457
+ if match is not None:
1458
+ addLink = False
1459
+ else:
1460
+ # Add MIME Types if --verbose option was selected
1461
+ if verbose():
1462
+ linkMimes.add(mimeType)
1463
+
1464
+ # Add link if it passed filters
1465
+ if addLink:
1466
+ # Just get the hostname of the url
1467
+ tldExtract = tldextract.extract(url)
1468
+ subDomain = tldExtract.subdomain
1469
+ if subDomain != '':
1470
+ subDomain = subDomain+'.'
1471
+ domainOnly = subDomain+tldExtract.domain+'.'+tldExtract.suffix
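+             # Illustrative example (hypothetical URL): for 'https://api.dev.example.co.uk/login',
+             # tldextract gives subdomain='api.dev', domain='example', suffix='co.uk', so domainOnly
+             # becomes 'api.dev.example.co.uk' and is then matched against the input hostname below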
1472
+
1473
+ # URLScan might return URLs that aren't for the domain passed so we need to check for those and not process them
1474
+ # Check the URL
1475
+ match = re.search(r'(^|\.)'+re.escape(argsInputHostname)+'$', domainOnly, flags=re.IGNORECASE)
1476
+ if match is not None:
1477
+ linksFoundAdd(url)
1478
+
1479
+ except Exception as e:
1480
+ writerr(colored('ERROR processURLScanUrl 1: ' + str(e), 'red'))
1481
+
1482
+ def getURLScanUrls():
1483
+ """
1484
+ Get URLs from the URLSCan API, urlscan.io
1485
+ """
1486
+ global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkURLScan, argsInputHostname
1487
+
1488
+ # Write the file of URL's for the passed domain/URL
1489
+ try:
1490
+ requestsMade = 0
1491
+ stopSource = False
1492
+ linkMimes = set()
1493
+ originalLinkCount = len(linksFound)
1494
+
1495
+ # Set the URL to just the hostname
1496
+ url = URLSCAN_URL.replace('{DOMAIN}',quote(argsInputHostname))
1497
+
1498
+ if verbose():
1499
+ write(colored('The URLScan URL requested to get links: ','magenta')+colored(url+'\n','white'))
1500
+
1501
+ if not args.check_only:
1502
+ write(colored('\rGetting links from urlscan.io API (this can take a while for some domains)...\r','cyan'))
1503
+
1504
+ # Get the first page from urlscan.io
1505
+ try:
1506
+ # Choose a random user agent string to use for any requests
1507
+ userAgent = random.choice(USER_AGENT)
1508
+ session = requests.Session()
1509
+ session.mount('https://', HTTP_ADAPTER)
1510
+ session.mount('http://', HTTP_ADAPTER)
1511
+ # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
1512
+ resp = session.get(url, headers={'User-Agent':userAgent, 'API-Key':URLSCAN_API_KEY})
1513
+ requestsMade = requestsMade + 1
1514
+ except Exception as e:
1515
+ write(colored(getSPACER('[ ERR ] Unable to get links from urlscan.io: ' + str(e)), 'red'))
1516
+ return
1517
+
1518
+         # If the rate limit was reached then determine whether to wait and then try again
1519
+ if resp.status_code == 429:
1520
+ # Get the number of seconds the rate limit resets
1521
+ match = re.search(r'Reset in (\d+) seconds', resp.text, flags=re.IGNORECASE)
1522
+ if match is not None:
1523
+ seconds = int(match.group(1))
1524
+ if seconds <= args.urlscan_rate_limit_retry * 60:
1525
+ writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached, so waiting for another '+str(seconds)+' seconds before continuing...'),'yellow'))
1526
+ time.sleep(seconds+1)
1527
+ try:
1528
+ resp = session.get(url, headers={'User-Agent':userAgent, 'API-Key':URLSCAN_API_KEY})
1529
+ requestsMade = requestsMade + 1
1530
+ except Exception as e:
1531
+ write(colored(getSPACER('[ ERR ] Unable to get links from urlscan.io: ' + str(e)), 'red'))
1532
+ return
1533
+
1534
+ # If the rate limit was reached or if a 401 (which likely means the API key isn't valid), try without API key
1535
+ if resp.status_code in (401,429):
1536
+ if URLSCAN_API_KEY != '':
1537
+ try:
1538
+ if resp.status_code == 429:
1539
+ writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached so trying without API Key...'),'red'))
1540
+ else:
1541
+ writerr(colored(getSPACER('The URLScan API Key is invalid so trying without API Key...'),'red'))
1542
+ # Set key to blank for further requests
1543
+ URLSCAN_API_KEY = ''
1544
+ resp = requests.get(url, headers={'User-Agent':userAgent})
1545
+ except Exception as e:
1546
+ writerr(colored(getSPACER('[ ERR ] Unable to get links from urlscan.io: ' + str(e)), 'red'))
1547
+ return
1548
+
1549
+ # If the rate limit was reached end now
1550
+ if resp.status_code == 429:
1551
+ writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached without API Key so unable to get links.'),'red'))
1552
+ return
1553
+ else:
1554
+ writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached so unable to get links.'),'red'))
1555
+ return
1556
+ elif resp.status_code != 200:
1557
+ writerr(colored(getSPACER('[ ' + str(resp.status_code) + ' ] Unable to get links from urlscan.io'),'red'))
1558
+ return
1559
+
1560
+ # Get the JSON response
1561
+ jsonResp = json.loads(resp.text.strip())
1562
+
1563
+ # Get the number of results
1564
+ totalUrls = jsonResp['total']
1565
+
1566
+ if args.check_only:
1567
+ hasMore = jsonResp['has_more']
1568
+ if hasMore:
1569
+ write(colored('Get URLs from URLScan: ','cyan')+colored('UNKNOWN requests','white'))
1570
+ else:
1571
+ write(colored('Get URLs from URLScan: ','cyan')+colored('1 request','white'))
1572
+ checkURLScan = 1
1573
+ else:
1574
+ # Carry on if something was found
1575
+ if int(totalUrls) > 0:
1576
+
1577
+ while not stopSource:
1578
+
1579
+ searchAfter = ''
1580
+
1581
+ # Get memory in case it exceeds threshold
1582
+ getMemory()
1583
+
1584
+ # Go through each URL in the list
1585
+ for urlSection in jsonResp['results']:
1586
+
1587
+ # Get the URL
1588
+ try:
1589
+ foundUrl = urlSection['page']['url']
1590
+ except:
1591
+ foundUrl = ''
1592
+
1593
+ # Also get the "ptr" field which can also be a url we want
1594
+ try:
1595
+ pointer = urlSection['page']['ptr']
1596
+ if not pointer.startswith('http'):
1597
+ pointer = 'http://' + pointer
1598
+ except:
1599
+ pointer = ''
1600
+
1601
+ # Also get the "task" url field
1602
+ try:
1603
+ taskUrl = urlSection['task']['url']
1604
+ if not taskUrl.startswith('http'):
1605
+ taskUrl = 'http://' + taskUrl
1606
+ except:
1607
+ taskUrl = ''
1608
+
1609
+ # Get the sort value used for the search_after parameter to get to the next page later
1610
+ try:
1611
+ sort = urlSection['sort']
1612
+ except:
1613
+ sort = ''
1614
+ searchAfter = '&search_after='+str(sort[0])+','+str(sort[1])
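+                         # Illustrative example (hypothetical values): if sort is [1712345678901, 'abc123'],
+                         # searchAfter becomes '&search_after=1712345678901,abc123', which urlscan.io uses as
+                         # a cursor to return the next page of results on the subsequent request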
1615
+
1616
+ # Get the HTTP code
1617
+ try:
1618
+ httpCode = str(urlSection['page']['status'])
1619
+ except:
1620
+ httpCode = 'UNKNOWN'
1621
+
1622
+ # Get the MIME type
1623
+ try:
1624
+ mimeType = urlSection['page']['mimeType']
1625
+ except:
1626
+ mimeType = ''
1627
+
1628
+                         # If a URL was found then process it
1629
+ if foundUrl != '':
1630
+ processURLScanUrl(foundUrl, httpCode, mimeType)
1631
+
1632
+                         # If a pointer was found then process it
1633
+ if pointer != '':
1634
+ processURLScanUrl(pointer, httpCode, mimeType)
1635
+
1636
+                         # If a task URL was found then process it
1637
+ if taskUrl != '':
1638
+ processURLScanUrl(taskUrl, httpCode, mimeType)
1639
+
1640
+ # If we have the field value to go to the next page...
1641
+ if searchAfter != '':
1642
+
1643
+ keepTrying = True
1644
+ while not stopSource and keepTrying:
1645
+ keepTrying = False
1646
+ # Get the next page from urlscan.io
1647
+ try:
1648
+ # Choose a random user agent string to use for any requests
1649
+ userAgent = random.choice(USER_AGENT)
1650
+ session = requests.Session()
1651
+ session.mount('https://', HTTP_ADAPTER)
1652
+ session.mount('http://', HTTP_ADAPTER)
1653
+ # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
1654
+ resp = session.get(url+searchAfter, headers={'User-Agent':userAgent, 'API-Key':URLSCAN_API_KEY})
1655
+ requestsMade = requestsMade + 1
1656
+ except Exception as e:
1657
+ writerr(colored(getSPACER('[ ERR ] Unable to get links from urlscan.io: ' + str(e)), 'red'))
1658
+ pass
1659
+
1660
+ # If the rate limit was reached
1661
+ if resp.status_code == 429:
1662
+ # Get the number of seconds the rate limit resets
1663
+ match = re.search(r'Reset in (\d+) seconds', resp.text, flags=re.IGNORECASE)
1664
+ if match is not None:
1665
+ seconds = int(match.group(1))
1666
+ if seconds <= args.urlscan_rate_limit_retry * 60:
1667
+ writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached, so waiting for another '+str(seconds)+' seconds before continuing...'),'yellow'))
1668
+ time.sleep(seconds+1)
1669
+ keepTrying = True
1670
+ continue
1671
+ else:
1672
+ writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached (waiting time of '+str(seconds)+'), so stopping. Links that have already been retrieved will be saved.'),'red'))
1673
+ stopSource = True
1674
+ pass
1675
+ else:
1676
+ writerr(colored(getSPACER('[ 429 ] URLScan rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
1677
+ stopSource = True
1678
+ pass
1679
+ elif resp.status_code != 200:
1680
+ writerr(colored(getSPACER('[ ' + str(resp.status_code) + ' ] Unable to get links from urlscan.io'),'red'))
1681
+ stopSource = True
1682
+ pass
1683
+
1684
+ if not stopSource:
1685
+ # Get the JSON response
1686
+ jsonResp = json.loads(resp.text.strip())
1687
+
1688
+ # If there are no more results, or if the requests limit was specified and has been exceeded, then stop
1689
+ if jsonResp['results'] is None or len(jsonResp['results']) == 0 or (args.limit_requests != 0 and requestsMade > args.limit_requests):
1690
+ stopSource = True
1691
+
1692
+ # Show the MIME types found (in case user wants to exclude more)
1693
+ if verbose() and len(linkMimes) > 0:
1694
+ linkMimes.discard('warc/revisit')
1695
+ write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
1696
+
1697
+ linkCount = len(linksFound) - originalLinkCount
1698
+ if args.xwm and args.xcc and args.xav:
1699
+ write(getSPACER(colored('Links found on urlscan.io: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
1700
+ else:
1701
+ write(getSPACER(colored('Extra links found on urlscan.io: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
1702
+
1703
+ except Exception as e:
1704
+ writerr(colored('ERROR getURLScanUrls 1: ' + str(e), 'red'))
1705
+
1706
+ def processWayBackPage(url):
1707
+ """
1708
+ Get URLs from a specific page of archive.org CDX API for the input domain
1709
+ """
1710
+ global totalPages, linkMimes, linksFound, stopSource
1711
+ try:
1712
+ # Get memory in case it exceeds threshold
1713
+ getMemory()
1714
+
1715
+ if not stopSource:
1716
+ try:
1717
+ # Choose a random user agent string to use for any requests
1718
+ userAgent = random.choice(USER_AGENT)
1719
+ page = url.split('page=')[1]
1720
+ session = requests.Session()
1721
+ session.mount('https://', HTTP_ADAPTER)
1722
+ session.mount('http://', HTTP_ADAPTER)
1723
+ resp = session.get(url, headers={"User-Agent":userAgent})
1724
+ except ConnectionError as ce:
1725
+ writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error for page ' + page), 'red'))
1726
+ resp = None
1727
+ return
1728
+ except Exception as e:
1729
+ writerr(colored(getSPACER('[ ERR ] Error getting response for page ' + page + ' - ' + str(e)),'red'))
1730
+ resp = None
1731
+ return
1732
+ finally:
1733
+ try:
1734
+ if resp is not None:
1735
+                         # If the status is 429, wait and retry if configured, otherwise stop processing the Wayback Machine
1736
+ if resp.status_code == 429:
1737
+ if args.wayback_rate_limit_retry > 0:
1738
+ seconds = args.wayback_rate_limit_retry * 60
1739
+ if args.processes == 1:
1740
+ writerr(colored('\r[ 429 ] Wayback Machine (archive.org) rate limit reached on page '+str(page)+' of '+str(totalPages)+', so waiting for '+str(seconds)+' seconds before continuing...\r','yellow'))
1741
+ else:
1742
+ writerr(colored('\r[ 429 ] Wayback Machine (archive.org) rate limit reached, so waiting for '+str(seconds)+' seconds before continuing...\r','yellow'))
1743
+ time.sleep(seconds)
1744
+ try:
1745
+ resp = session.get(url, headers={"User-Agent":userAgent})
1746
+ except ConnectionError as ce:
1747
+ writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error for page ' + page), 'red'))
1748
+ resp = None
1749
+ return
1750
+ except Exception as e:
1751
+ writerr(colored(getSPACER('[ ERR ] Error getting response for page ' + page + ' - ' + str(e)),'red'))
1752
+ resp = None
1753
+ return
1754
+
1755
+ if resp.status_code == 429:
1756
+ writerr(colored(getSPACER('[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
1757
+ stopSource = True
1758
+ return
1759
+                         # If the status is 503, the site is unavailable
1760
+ if resp.status_code == 503:
1761
+ writerr(colored(getSPACER('[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.'),'red'))
1762
+ stopSource = True
1763
+ return
1764
+ # If the response from archive.org is empty then skip
1765
+ if resp.text == '' and totalPages == 0:
1766
+ if verbose():
1767
+ writerr(colored(getSPACER('[ ERR ] '+url+' gave an empty response.'),'red'))
1768
+ return
1769
+ # If a status other than 200, then stop
1770
+ if resp.status_code != 200:
1771
+ if verbose():
1772
+ writerr(colored(getSPACER('[ '+str(resp.status_code)+' ] Error for '+url),'red'))
1773
+ return
1774
+ except ConnectionError as ce:
1775
+ writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error for page ' + page), 'red'))
1776
+ resp = None
1777
+ return
1778
+ except Exception as e:
1779
+ writerr(colored(getSPACER('[ ERR ] Error getting response for page ' + page + ' - ' + str(e)),'red'))
1780
+ resp = None
1781
+ return
1782
+
1783
+             # Get the URLs and MIME types. Each line is a space-separated CDX record
1784
+ for line in resp.iter_lines():
1785
+ results = line.decode("utf-8")
1786
+ # Only get MIME Types if --verbose option was selected
1787
+ if verbose():
1788
+ try:
1789
+ linkMimes.add(str(results).split(' ')[2])
1790
+ except Exception as e:
1791
+ if verbose():
1792
+ writerr(colored(getSPACER('ERROR processWayBackPage 2: Cannot get MIME type from line: ' + str(line)),'red'))
1793
+ write(resp.text)
1794
+ try:
1795
+ foundUrl = fixArchiveOrgUrl(str(results).split(' ')[1])
1796
+ linksFoundAdd(foundUrl)
1797
+ except Exception as e:
1798
+ if verbose():
1799
+ writerr(colored(getSPACER('ERROR processWayBackPage 3: Cannot get link from line: ' + str(line)),'red'))
1800
+ write(resp.text)
1801
+ else:
1802
+ pass
1803
+ except Exception as e:
1804
+ if verbose():
1805
+ writerr(colored("ERROR processWayBackPage 1: " + str(e), "red"))
1806
+
1807
+ def getWaybackUrls():
1808
+ """
1809
+ Get URLs from the Wayback Machine, archive.org
1810
+ """
1811
+ global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages, stopSource, argsInput, checkWayback
1812
+
1813
+ # Write the file of URL's for the passed domain/URL
1814
+ try:
1815
+ stopSource = False
1816
+         # If there are any + characters in the MIME types, e.g. image/svg+xml, replace the + with a . because the Wayback API does not recognise a literal +
1817
+ filterMIME = '&filter=!mimetype:warc/revisit|' + re.escape(FILTER_MIME).replace(',','|').replace('+','.')
1818
+ if MATCH_CODE != '':
1819
+ filterCode = '&filter=statuscode:' + re.escape(MATCH_CODE).replace(',','|')
1820
+ else:
1821
+ filterCode = '&filter=!statuscode:' + re.escape(FILTER_CODE).replace(',','|')
1822
+
1823
+ # Set keywords filter if -ko argument passed
1824
+ filterKeywords = ''
1825
+ if args.keywords_only:
1826
+ if args.keywords_only == '#CONFIG':
1827
+ filterKeywords = '&filter=original:.*(' + re.escape(FILTER_KEYWORDS).replace(',','|') + ').*'
1828
+ else:
1829
+ filterKeywords = '&filter=original:.*(' + args.keywords_only + ').*'
1830
+
1831
+ if args.filter_responses_only:
1832
+ url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}','') + '&page='
1833
+ else:
1834
+ url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}','') + filterMIME + filterCode + filterKeywords + '&page='
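+         # Illustrative example of the resulting URL for an input of example.com including subdomains
+         # (the actual filter values depend on config.yml):
+         #   https://web.archive.org/cdx/search/cdx?url=*.example.com&collapse=&fl=timestamp,original,mimetype,statuscode,digest&filter=!mimetype:warc/revisit|...&filter=!statuscode:...&page=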
1835
+
1836
+ # Get the number of pages (i.e. separate requests) that are going to be made to archive.org
1837
+ totalPages = 0
1838
+ try:
1839
+ if not args.check_only:
1840
+ write(colored('\rGetting the number of Wayback Machine (archive.org) pages to search...\r','cyan'))
1841
+ # Choose a random user agent string to use for any requests
1842
+ userAgent = random.choice(USER_AGENT)
1843
+ session = requests.Session()
1844
+ session.mount('https://', HTTP_ADAPTER)
1845
+ session.mount('http://', HTTP_ADAPTER)
1846
+ resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
1847
+ totalPages = int(resp.text.strip())
1848
+
1849
+ # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
1850
+ if args.limit_requests != 0 and totalPages > args.limit_requests:
1851
+ totalPages = args.limit_requests
1852
+ except Exception as e:
1853
+ try:
1854
+ # If the rate limit was reached end now
1855
+ if resp.status_code == 429:
1856
+ writerr(colored(getSPACER('[ 429 ] Wayback Machine (Archive.org) rate limit reached so unable to get links.'),'red'))
1857
+ return
1858
+
1859
+                 # If the status is 503, the site is unavailable
1860
+ if resp.status_code == 503:
1861
+ writerr(colored(getSPACER('[ 503 ] Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.'),'red'))
1862
+ return
1863
+
1864
+ if resp.text.lower().find('blocked site error') > 0:
1865
+ writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)'), 'red'))
1866
+ else:
1867
+ writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(resp.text.strip())), 'red'))
1868
+ except:
1869
+                 if 'alert access denied' in str(e).lower():
1870
+ writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don\'t know, but it has happened!)'), 'red'))
1871
+                 elif 'connection refused' in str(e).lower():
1872
+ writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)'), 'red'))
1873
+ else:
1874
+ writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
1875
+ return
1876
+
1877
+ if args.check_only:
1878
+ checkWayback = totalPages
1879
+ write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
1880
+ else:
1881
+ if verbose():
1882
+ write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
1883
+
1884
+ # if the page number was found then display it, but otherwise we will just try to increment until we have everything
1885
+ write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
1886
+
1887
+ # Get a list of all the page URLs we need to visit
1888
+ pages = []
1889
+ if totalPages == 1:
1890
+ pages.append(url)
1891
+ else:
1892
+ for page in range(0, totalPages):
1893
+ pages.append(url+str(page))
1894
+
1895
+ # Process the URLs from web archive
1896
+ if stopProgram is None:
1897
+ p = mp.Pool(args.processes)
1898
+ p.map(processWayBackPage, pages)
1899
+ p.close()
1900
+ p.join()
1901
+
1902
+ # Show the MIME types found (in case user wants to exclude more)
1903
+ if verbose() and len(linkMimes) > 0 :
1904
+ linkMimes.discard('warc/revisit')
1905
+ write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
1906
+ linkMimes = None
1907
+
1908
+ if not args.xwm:
1909
+ linkCount = len(linksFound)
1910
+ write(getSPACER(colored('Links found on Wayback Machine (archive.org): ', 'cyan')+colored(str(linkCount),'white'))+'\n')
1911
+
1912
+ except Exception as e:
1913
+ writerr(colored('ERROR getWaybackUrls 1: ' + str(e), 'red'))
1914
+
1915
+ def processCommonCrawlCollection(cdxApiUrl):
1916
+ """
1917
+ Get URLs from a given Common Crawl index collection
1918
+ """
1919
+ global subs, path, linksFound, linkMimes, stopSource, argsInput
1920
+
1921
+ try:
1922
+ # Get memory in case it exceeds threshold
1923
+ getMemory()
1924
+
1925
+ if not stopSource:
1926
+ # Set mime content type filter
1927
+ filterMIME = '&filter=!~mime:(warc/revisit|'
1928
+ if FILTER_MIME.strip() != '':
1929
+ filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
1930
+ filterMIME = filterMIME + ')'
1931
+
1932
+ # Set status code filter
1933
+ filterCode = ''
1934
+ if MATCH_CODE.strip() != '':
1935
+ filterCode = '&filter=~status:(' + re.escape(MATCH_CODE).replace(',','|') + ')'
1936
+ else:
1937
+ filterCode = '&filter=!~status:(' + re.escape(FILTER_CODE).replace(',','|') + ')'
1938
+
1939
+ # Set keywords filter if -ko argument passed
1940
+ filterKeywords = ''
1941
+ if args.keywords_only:
1942
+ if args.keywords_only == '#CONFIG':
1943
+ filterKeywords = '&filter=~url:.*(' + re.escape(FILTER_KEYWORDS).replace(',','|') + ').*'
1944
+ else:
1945
+ filterKeywords = '&filter=~url:.*(' + args.keywords_only + ').*'
1946
+
1947
+ commonCrawlUrl = cdxApiUrl + '?output=json&fl=timestamp,url,mime,status,digest&url='
1948
+
1949
+ if args.filter_responses_only:
1950
+ url = commonCrawlUrl + subs + quote(argsInput) + path
1951
+ else:
1952
+ url = commonCrawlUrl + subs + quote(argsInput) + path + filterMIME + filterCode + filterKeywords
1953
+
1954
+ try:
1955
+ # Choose a random user agent string to use for any requests
1956
+ userAgent = random.choice(USER_AGENT)
1957
+ session = requests.Session()
1958
+ session.mount('https://', HTTP_ADAPTER_CC)
1959
+ session.mount('http://', HTTP_ADAPTER_CC)
1960
+ resp = session.get(url, stream=True, headers={"User-Agent":userAgent})
1961
+ except ConnectionError as ce:
1962
+ writerr(colored(getSPACER('[ ERR ] Common Crawl connection error for index '+cdxApiUrl), 'red'))
1963
+ resp = None
1964
+ return
1965
+ except Exception as e:
1966
+ writerr(colored(getSPACER('[ ERR ] Error getting response - ' + str(e)),'red'))
1967
+ resp = None
1968
+ return
1969
+ finally:
1970
+ try:
1971
+ if resp is not None:
1972
+                         # If the status is 429, stop processing Common Crawl
1973
+ if resp.status_code == 429:
1974
+ writerr(colored(getSPACER('[ 429 ] Common Crawl rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
1975
+ stopSource = True
1976
+ return
1977
+ # If the response from commoncrawl.org says nothing was found...
1978
+ if resp.text.lower().find('no captures found') > 0:
1979
+ # Don't output any messages, just exit function
1980
+ return
1981
+ # If the response from commoncrawl.org is empty, then stop
1982
+ if resp.text == '':
1983
+ if verbose():
1984
+ writerr(colored(getSPACER('[ ERR ] '+url+' gave an empty response.'),'red'))
1985
+ return
1986
+ # If a status other than 200, then stop
1987
+ if resp.status_code != 200:
1988
+ if verbose():
1989
+ writerr(colored(getSPACER('[ '+str(resp.status_code)+' ] Error for '+cdxApiUrl),'red'))
1990
+ return
1991
+ except:
1992
+ pass
1993
+
1994
+ # Get the URLs and MIME types
1995
+ for line in resp.iter_lines():
1996
+ results = line.decode("utf-8")
1997
+ try:
1998
+ data = json.loads(results)
1999
+                     # Get MIME Types if the --verbose option was selected
2000
+ if verbose():
2001
+ try:
2002
+ linkMimes.add(data['mime'])
2003
+ except:
2004
+ pass
2005
+ linksFoundAdd(data['url'])
2006
+ except Exception as e:
2007
+ if verbose():
2008
+ writerr(colored('ERROR processCommonCrawlCollection 2: Cannot get URL and MIME type from line: ' + str(line),'red'))
2009
+ else:
2010
+ pass
2011
+ except Exception as e:
2012
+ writerr(colored('ERROR processCommonCrawlCollection 1: ' + str(e), 'red'))
2013
+
2014
+ def getCommonCrawlIndexes():
2015
+ """
2016
+ Requests the Common Crawl index file "collinfo.json" if it is not cached locally, or if the local file is older than a month.
2017
+ """
2018
+ try:
2019
+ # Check if a local copy of the index file exists
2020
+ createFile = False
2021
+ collinfoPath = str(Path(__file__).parent.resolve())+'/collinfo.json'
2022
+ if os.path.exists(collinfoPath):
2023
+ # Check if the file was created over a month ago
2024
+ monthAgo = datetime.now() - timedelta(days=30)
2025
+ fileModTime = datetime.fromtimestamp(os.path.getctime(collinfoPath))
2026
+ if fileModTime < monthAgo:
2027
+ createFile = True
2028
+ # Delete the current file
2029
+ try:
2030
+ os.remove(collinfoPath)
2031
+ except Exception as e:
2032
+ writerr(colored(getSPACER('[ ERR ] Couldn\'t delete local version of Common Crawl index file: ' + str(e)), 'red'))
2033
+ else:
2034
+ createFile = True
2035
+
2036
+ # If the local file exists then read that instead of requesting the index file again
2037
+ if not createFile:
2038
+ # Read the indexes from the local file
2039
+ try:
2040
+ with open(collinfoPath,'r') as file:
2041
+ jsonResp = file.read()
2042
+ file.close()
2043
+ except Exception as e:
2044
+ createFile = True
2045
+ writerr(colored(getSPACER('[ ERR ] Couldn\'t read local version of Common Crawl index file: ' + str(e)),'red'))
2046
+
2047
+ # If the local file needs creating again then make a new request
2048
+ if createFile:
2049
+ try:
2050
+ # Choose a random user agent string to use for any requests
2051
+ userAgent = random.choice(USER_AGENT)
2052
+ session = requests.Session()
2053
+ session.mount('https://', HTTP_ADAPTER_CC)
2054
+ session.mount('http://', HTTP_ADAPTER_CC)
2055
+ indexes = session.get(CCRAWL_INDEX_URL, headers={"User-Agent":userAgent})
2056
+ except ConnectionError as ce:
2057
+ writerr(colored(getSPACER('[ ERR ] Common Crawl connection error getting Index file'), 'red'))
2058
+ return
2059
+ except Exception as e:
2060
+ writerr(colored(getSPACER('[ ERR ] Error getting Common Crawl index collection - ' + str(e)),'red'))
2061
+ return
2062
+
2063
+ # If the rate limit was reached end now
2064
+ if indexes.status_code == 429:
2065
+ writerr(colored(getSPACER('[ 429 ] Common Crawl rate limit reached so unable to get links.'),'red'))
2066
+ return
2067
+             # If Common Crawl is unavailable, end now
2068
+ elif indexes.status_code == 503:
2069
+ writerr(colored(getSPACER('[ 503 ] Common Crawl seems to be unavailable.'),'red'))
2070
+ return
2071
+ elif indexes.status_code != 200:
2072
+                 writerr(colored(getSPACER('[ '+str(indexes.status_code)+' ] Common Crawl did not return the indexes file.'),'red'))
2073
+ return
2074
+
2075
+             # Get the returned JSON
2076
+ jsonResp = indexes.text
2077
+
2078
+ # Write the contents of the response to a local file so we don't request in future. Overwrite it if it exists
2079
+ try:
2080
+ f = open(collinfoPath, 'w')
2081
+ f.write(jsonResp)
2082
+ f.close()
2083
+ except Exception as e:
2084
+ writerr(colored(getSPACER('[ ERR ] Couldn\'t create local version of Common Crawl index file: ' + str(e)),'red'))
2085
+
2086
+ # Get the API URLs from the returned JSON
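+         # Each entry in collinfo.json looks roughly like this (illustrative; other fields omitted):
+         #   {"id": "CC-MAIN-2024-10", "name": "...", "cdx-api": "https://index.commoncrawl.org/CC-MAIN-2024-10-index"}
+         # Only the 'cdx-api' values are collected, optionally limited by year (-lcy) and count (-lcc)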
2087
+ cdxApiUrls = set()
2088
+ collection = 0
2089
+ for values in json.loads(jsonResp):
2090
+ for key in values:
2091
+ if key == 'cdx-api':
2092
+ if args.lcy != 0:
2093
+ try:
2094
+ indexYear = values[key].split("CC-MAIN-")[1][:4]
2095
+ if int(indexYear) >= args.lcy:
2096
+ cdxApiUrls.add(values[key])
2097
+ except Exception as e:
2098
+ writerr(colored(getSPACER('[ ERR ] Failed to get the year from index name ' + values[key] + ' - ' + str(e)),'red'))
2099
+ else:
2100
+ cdxApiUrls.add(values[key])
2101
+ collection = collection + 1
2102
+ if collection == args.lcc: break
2103
+
2104
+ return cdxApiUrls
2105
+
2106
+ except Exception as e:
2107
+ writerr(colored('ERROR getCommonCrawlIndexes 1: ' + str(e), 'red'))
2108
+
2109
+ def getCommonCrawlUrls():
2110
+ """
2111
+ Get all Common Crawl index collections to get all URLs from each one
2112
+ """
2113
+ global linksFound, linkMimes, waymorePath, subs, path, stopSource, argsInput, checkCommonCrawl
2114
+
2115
+ try:
2116
+ stopSource = False
2117
+ linkMimes = set()
2118
+ originalLinkCount = len(linksFound)
2119
+
2120
+ # Set mime content type filter
2121
+ filterMIME = '&filter=!~mime:(warc/revisit|'
2122
+ if FILTER_MIME.strip() != '':
2123
+ filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
2124
+ filterMIME = filterMIME + ')'
2125
+
2126
+ # Set status code filter
2127
+ filterCode = ''
2128
+ if MATCH_CODE.strip() != '':
2129
+ filterCode = '&filter=~status:(' + re.escape(MATCH_CODE).replace(',','|') + ')'
2130
+ else:
2131
+ filterCode = '&filter=!~status:(' + re.escape(FILTER_CODE).replace(',','|') + ')'
2132
+
2133
+ if verbose():
2134
+ if args.filter_responses_only:
2135
+ url = '{CDX-API-URL}?output=json&fl=timestamp,url,mime,status,digest&url=' + subs + quote(argsInput) + path
2136
+ else:
2137
+ url = '{CDX-API-URL}?output=json&fl=timestamp,url,mime,status,digest&url=' + subs + quote(argsInput) + path + filterMIME + filterCode
2138
+ write(colored('The commoncrawl index URL requested to get links (where {CDX-API-URL} is from ' + CCRAWL_INDEX_URL + '): ','magenta')+colored(url+'\n','white'))
2139
+
2140
+ if not args.check_only:
2141
+ write(colored('\rGetting commoncrawl.org index collections list...\r','cyan'))
2142
+
2143
+ # Get the Common Crawl index collections
2144
+ cdxApiUrls = getCommonCrawlIndexes()
2145
+
2146
+ if args.check_only:
2147
+ if args.lcc < len(cdxApiUrls):
2148
+ checkCommonCrawl = args.lcc+1
2149
+ else:
2150
+ checkCommonCrawl = len(cdxApiUrls)+1
2151
+ write(colored('Get URLs from Common Crawl: ','cyan')+colored(str(checkCommonCrawl)+' requests','white'))
2152
+ else:
2153
+ write(colored('\rGetting links from the latest ' + str(len(cdxApiUrls)) + ' commoncrawl.org index collections (this can take a while for some domains)...\r','cyan'))
2154
+
2155
+ # Process the URLs from common crawl
2156
+ if stopProgram is None:
2157
+ p = mp.Pool(args.processes)
2158
+ p.map(processCommonCrawlCollection, cdxApiUrls)
2159
+ p.close()
2160
+ p.join()
2161
+
2162
+ # Show the MIME types found (in case user wants to exclude more)
2163
+ if verbose() and len(linkMimes) > 0:
2164
+ linkMimes.discard('warc/revisit')
2165
+ write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
2166
+
2167
+ linkCount = len(linksFound) - originalLinkCount
2168
+ if args.xwm:
2169
+ write(getSPACER(colored('Links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2170
+ else:
2171
+ write(getSPACER(colored('Extra links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2172
+
2173
+ except Exception as e:
2174
+ writerr(colored('ERROR getCommonCrawlUrls 1: ' + str(e), 'red'))
2175
+
2176
+ def processVirusTotalUrl(url):
2177
+     Process a specific URL from virustotal.com to determine whether to save the link
2178
+ Process a specific URL from virustotal.io to determine whether to save the link
2179
+ """
2180
+ global argsInput, argsInputHostname
2181
+
2182
+ addLink = True
2183
+
2184
+ # If the url passed doesn't have a scheme, prefix with http://
2185
+ match = re.search(r'^[A-za-z]*\:\/\/', url, flags=re.IGNORECASE)
2186
+ if match is None:
2187
+ url = 'http://'+url
2188
+
2189
+ try:
2190
+ # If filters are required then test them
2191
+ if not args.filter_responses_only:
2192
+
2193
+ # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
2194
+ if args.no_subs:
2195
+ match = re.search(r'^[A-za-z]*\:\/\/(www\.)?'+re.escape(argsInputHostname), url, flags=re.IGNORECASE)
2196
+ if match is None:
2197
+ addLink = False
2198
+
2199
+             # If the user didn't request -f / --filter-responses-only then check the HTTP code
2200
+ # Note we can't check MIME filter because it is not returned by VirusTotal API
2201
+ if addLink and not args.filter_responses_only:
2202
+
2203
+ # Check the URL exclusions
2204
+ if addLink:
2205
+ match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', url, flags=re.IGNORECASE)
2206
+ if match is not None:
2207
+ addLink = False
2208
+
2209
+ # Set keywords filter if -ko argument passed
2210
+ if addLink and args.keywords_only:
2211
+ if args.keywords_only == '#CONFIG':
2212
+ match = re.search(r'('+re.escape(FILTER_KEYWORDS).replace(',','|')+')', url, flags=re.IGNORECASE)
2213
+ else:
2214
+ match = re.search(r'('+args.keywords_only+')', url, flags=re.IGNORECASE)
2215
+ if match is None:
2216
+ addLink = False
2217
+
2218
+ # Add link if it passed filters
2219
+ if addLink:
2220
+             # Just get the hostname of the url
2221
+ tldExtract = tldextract.extract(url)
2222
+ subDomain = tldExtract.subdomain
2223
+ if subDomain != '':
2224
+ subDomain = subDomain+'.'
2225
+ domainOnly = subDomain+tldExtract.domain+'.'+tldExtract.suffix
2226
+
2227
+ # VirusTotal might return URLs that aren't for the domain passed so we need to check for those and not process them
2228
+ # Check the URL
2229
+ match = re.search(r'(^|\.)'+re.escape(argsInputHostname)+'$', domainOnly, flags=re.IGNORECASE)
2230
+ if match is not None:
2231
+ linksFoundAdd(url)
2232
+
2233
+ except Exception as e:
2234
+ writerr(colored('ERROR processVirusTotalUrl 1: ' + str(e), 'red'))
2235
+
2236
+ def getVirusTotalUrls():
2237
+ """
2238
+ Get URLs from the VirusTotal API v2
2239
+ """
2240
+ global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkVirusTotal, argsInputHostname
2241
+
2242
+ # Write the file of URL's for the passed domain/URL
2243
+ try:
2244
+ requestsMade = 0
2245
+ stopSource = False
2246
+ linkMimes = set()
2247
+ originalLinkCount = len(linksFound)
2248
+
2249
+ # Just pass the hostname in the URL
2250
+ url = VIRUSTOTAL_URL.replace('{DOMAIN}',quote(argsInputHostname)).replace('{APIKEY}',VIRUSTOTAL_API_KEY)
2251
+
2252
+ if verbose():
2253
+ write(colored('The VirusTotal URL requested to get links: ','magenta')+colored(url+'\n','white'))
2254
+
2255
+ if not args.check_only:
2256
+ write(colored('\rGetting links from virustotal.com API...\r','cyan'))
2257
+
2258
+ # Get the domain report from virustotal
2259
+ try:
2260
+ # Choose a random user agent string to use for any requests
2261
+ userAgent = random.choice(USER_AGENT)
2262
+ session = requests.Session()
2263
+ session.mount('https://', HTTP_ADAPTER)
2264
+ session.mount('http://', HTTP_ADAPTER)
2265
+             # For VirusTotal the API key is passed in the URL itself, so only the User-Agent header is needed here
2266
+ resp = session.get(url, headers={'User-Agent':userAgent})
2267
+ requestsMade = requestsMade + 1
2268
+ except Exception as e:
2269
+             write(colored(getSPACER('[ ERR ] Unable to get links from virustotal.com: ' + str(e)), 'red'))
2270
+ return
2271
+
2272
+ # Deal with any errors
2273
+ if resp.status_code == 429:
2274
+ writerr(colored(getSPACER('[ 429 ] VirusTotal rate limit reached so unable to get links.'),'red'))
2275
+ return
2276
+ elif resp.status_code == 403:
2277
+ writerr(colored(getSPACER('[ 403 ] VirusTotal: Permission denied. Check your API key is correct.'),'red'))
2278
+ return
2279
+ elif resp.status_code != 200:
2280
+ writerr(colored(getSPACER('[ ' + str(resp.status_code) + ' ] Unable to get links from virustotal.com'),'red'))
2281
+ return
2282
+
2283
+ # Get the JSON response
2284
+ jsonResp = json.loads(resp.text.strip())
2285
+
2286
+ # Get the different URLs
2287
+ if args.no_subs:
2288
+ subDomains = []
2289
+ else:
2290
+ try:
2291
+ subDomains = jsonResp['subdomains']
2292
+ except Exception as e:
2293
+ subDomains = []
2294
+ try:
2295
+ detectedUrls = [entry['url'] for entry in jsonResp.get('detected_urls', [])]
2296
+ except Exception as e:
2297
+ detectedUrls = []
2298
+ try:
2299
+ undetectedUrls = [entry[0] for entry in jsonResp.get('undetected_urls', [])]
2300
+ except Exception as e:
2301
+ undetectedUrls = []
2302
+ try:
2303
+ totalUrls = set(subDomains + detectedUrls + undetectedUrls)
2304
+ except Exception as e:
2305
+ totalUrls = []
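+         # Rough shape of the v2 domain report used above (illustrative): 'subdomains' is a list of
+         # hostnames, 'detected_urls' is a list of objects with a 'url' key, and 'undetected_urls' is a
+         # list of arrays whose first element is the URL, hence entry[0] above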
2306
+
2307
+ if args.check_only:
2308
+ write(colored('Get URLs from VirusTotal: ','cyan')+colored('1 request','white'))
2309
+ checkVirusTotal = 1
2310
+ else:
2311
+ # Carry on if something was found
2312
+ for vturl in totalUrls:
2313
+
2314
+ if stopSource:
2315
+ break
2316
+
2317
+ # Get memory in case it exceeds threshold
2318
+ getMemory()
2319
+
2320
+ # Work out whether to include it
2321
+ processVirusTotalUrl(vturl)
2322
+
2323
+ linkCount = len(linksFound) - originalLinkCount
2324
+ if args.xwm and args.xcc and args.xav and args.xus:
2325
+ write(getSPACER(colored('Links found on virustotal.com: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2326
+ else:
2327
+ write(getSPACER(colored('Extra links found on virustotal.com: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2328
+
2329
+ except Exception as e:
2330
+ writerr(colored('ERROR getVirusTotalUrls 1: ' + str(e), 'red'))
2331
+
2332
+ def processResponses():
2333
+ """
2334
+ Get archived responses from Wayback Machine (archive.org)
2335
+ """
2336
+ global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory
2337
+ try:
2338
+ if not args.check_only:
2339
+ # Create 'results' and domain directory if needed
2340
+ createDirs()
2341
+
2342
+ # Get the path of the files, depending on whether -oR / --output_responses was passed
2343
+ try:
2344
+ continuePath = responseOutputDirectory + 'continueResp.tmp'
2345
+ responsesPath = responseOutputDirectory + 'responses.tmp'
2346
+ indexPath = responseOutputDirectory + 'index.txt'
2347
+ except Exception as e:
2348
+ if verbose():
2349
+ writerr(colored('ERROR processResponses 4: ' + str(e), 'red'))
2350
+
2351
+ # Check if a continueResp.tmp and responses.tmp files exists
2352
+ runPrevious = 'n'
2353
+ if not args.check_only and os.path.exists(continuePath) and os.path.exists(responsesPath):
2354
+
2355
+ # Load the links into the set
2356
+ with open(responsesPath,'rb') as fl:
2357
+ linkRequests = pickle.load(fl)
2358
+ totalPrevResponses = len(linkRequests)
2359
+
2360
+ # Get the previous end position to start again at this point
2361
+ try:
2362
+ with open(continuePath,'r') as fc:
2363
+ successCount = int(fc.readline().strip())
2364
+ except Exception as e:
2365
+ successCount = 0
2366
+
2367
+ # Ask the user if we should continue with previous run if the current starting position is greater than 0 and less than the total
2368
+ if successCount > 0 and successCount < totalPrevResponses:
2369
+ # If the program is not piped from or to another process, then ask whether to continue with previous run
2370
+ if sys.stdout.isatty() and sys.stdin.isatty():
2371
+ write(colored('The previous run to get archived responses for ' + argsInput + ' was not completed.\nYou can start from response ' + str(successCount) + ' of ' + str(totalPrevResponses) + ' for the previous run, or you can start a new run with your specified arguments.', 'yellow'))
2372
+ runPrevious = input('Continue with previous run? y/n: ')
2373
+ else:
2374
+ if CONTINUE_RESPONSES_IF_PIPED:
2375
+ runPrevious = 'y'
2376
+ writerr(colored('The previous run to get archived responses for ' + argsInput + ' was not completed. Starting from response ' + str(successCount) + ' of ' + str(totalPrevResponses) + '... ', 'yellow'))
2377
+ else:
2378
+ runPrevious = 'n'
2379
+
2380
+ # If we are going to run a new run
2381
+ if runPrevious.lower() == 'n':
2382
+
2383
+ # Set start point
2384
+ successCount = 0
2385
+
2386
+ # Set up filters
2387
+ filterLimit = '&limit=' + str(args.limit)
2388
+ if args.from_date is None:
2389
+ filterFrom = ''
2390
+ else:
2391
+ filterFrom = '&from=' + str(args.from_date)
2392
+ if args.to_date is None:
2393
+ filterTo = ''
2394
+ else:
2395
+ filterTo = '&to=' + str(args.to_date)
2396
+
2397
+ # Set keywords filter if -ko argument passed
2398
+ filterKeywords = ''
2399
+ if args.keywords_only:
2400
+ if args.keywords_only == '#CONFIG':
2401
+ filterKeywords = '&filter=original:.*(' + re.escape(FILTER_KEYWORDS).replace(',','|') + ').*'
2402
+ else:
2403
+ filterKeywords = '&filter=original:.*(' + args.keywords_only + ').*'
2404
+
2405
+ # Get the list again with filters and include timestamp
2406
+ linksFound = set()
2407
+
2408
+ # Set mime content type filter
2409
+ filterMIME = '&filter=!mimetype:warc/revisit'
2410
+ if FILTER_MIME.strip() != '':
2411
+ filterMIME = filterMIME + '|' + re.escape(FILTER_MIME).replace(',','|')
2412
+
2413
+ # Set status code filter
2414
+ filterCode = ''
2415
+ if MATCH_CODE.strip() != '':
2416
+ filterCode = '&filter=statuscode:' + re.escape(MATCH_CODE).replace(',','|')
2417
+ else:
2418
+ filterCode = '&filter=!statuscode:' + re.escape(FILTER_CODE).replace(',','|')
2419
+
2420
+ # Set the collapse parameter value in the archive.org URL. From the Wayback API docs:
2421
+ # "A new form of filtering is the option to 'collapse' results based on a field, or a substring of a field.
2422
+ # Collapsing is done on adjacent cdx lines where all captures after the first one that are duplicate are filtered out.
2423
+ # This is useful for filtering out captures that are 'too dense' or when looking for unique captures."
2424
+ if args.capture_interval == 'none': # get all
2425
+ collapse = ''
2426
+ elif args.capture_interval == 'h': # get at most 1 capture per hour
2427
+ collapse = 'timestamp:10'
2428
+ elif args.capture_interval == 'd': # get at most 1 capture per day
2429
+ collapse = 'timestamp:8'
2430
+ elif args.capture_interval == 'm': # get at most 1 capture per month
2431
+ collapse = 'timestamp:6'
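+             # Illustrative example: a CDX timestamp such as 20240131093015 is in YYYYMMDDhhmmss format,
+             # so collapsing on timestamp:10 keeps at most one capture per hour, timestamp:8 one per day,
+             # and timestamp:6 one per month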
2432
+
2433
+ url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
2434
+
2435
+ if verbose():
2436
+ write(colored('The archive URL requested to get responses: ','magenta')+colored(url+'\n','white'))
2437
+
2438
+ if args.check_only:
2439
+ write(colored('\rChecking archived response requests...\r','cyan'))
2440
+ else:
2441
+ write(colored('\rGetting list of response links (this can take a while for some domains)...\r','cyan'))
2442
+
2443
+ # Build the list of links, concatenating timestamp and URL
2444
+ try:
2445
+ # Choose a random user agent string to use for any requests
2446
+ success = True
2447
+ userAgent = random.choice(USER_AGENT)
2448
+ session = requests.Session()
2449
+ session.mount('https://', HTTP_ADAPTER)
2450
+ session.mount('http://', HTTP_ADAPTER)
2451
+ resp = session.get(url, stream=True, headers={"User-Agent":userAgent}, timeout=args.timeout)
2452
+ except ConnectionError as ce:
2453
+ writerr(colored(getSPACER('[ ERR ] Wayback Machine (archive.org) connection error'), 'red'))
2454
+ resp = None
2455
+ success = False
2456
+ return
2457
+ except Exception as e:
2458
+ writerr(colored(getSPACER('[ ERR ] Couldn\'t get list of responses: ' + str(e)),'red'))
2459
+ resp = None
2460
+ success = False
2461
+ return
2462
+ finally:
2463
+ try:
2464
+ if resp is not None:
2465
+ # If the response from archive.org is empty, then no responses were found
2466
+ if resp.text == '':
2467
+ writerr(colored(getSPACER('No archived responses were found on Wayback Machine (archive.org) for the given search parameters.'),'red'))
2468
+ success = False
2469
+                         # If the status is 429, stop processing the Wayback Machine responses
2470
+ if resp.status_code == 429:
2471
+ writerr(colored(getSPACER('[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved.'),'red'))
2472
+ success = False
2473
+                         # If the status is 503, the site is unavailable
2474
+ elif resp.status_code == 503:
2475
+ writerr(colored(getSPACER('[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.'),'red'))
2476
+ success = False
2477
+ # If a status other than 200, then stop
2478
+ elif resp.status_code != 200:
2479
+ if verbose():
2480
+ writerr(colored(getSPACER('[ '+str(resp.status_code)+' ] Error for '+url),'red'))
2481
+ success = False
2482
+ if not success:
2483
+ if args.keywords_only:
2484
+ if args.keywords_only == '#CONFIG':
2485
+ writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml'), 'red'))
2486
+ else:
2487
+ writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed'), 'red'))
2488
+ else:
2489
+ if resp.text.lower().find('blocked site error') > 0:
2490
+ writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)'), 'red'))
2491
+ else:
2492
+ writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - check input domain and try again.'), 'red'))
2493
+ return
2494
+ except:
2495
+ pass
2496
+
2497
+ # Go through the response to save the links found
2498
+ for line in resp.iter_lines():
2499
+ try:
2500
+ results = line.decode("utf-8")
2501
+ timestamp = results.split(' ')[0]
2502
+ originalUrl = results.split(' ')[1]
2503
+ linksFoundAdd(timestamp+'/'+originalUrl)
2504
+ except Exception as e:
2505
+                     writerr(colored(getSPACER('ERROR processResponses 3: Cannot get link from line: '+str(line)), 'red'))
2506
+
2507
+ # Remove any links that have URL exclusions
2508
+ linkRequests = []
2509
+ exclusionRegex = re.compile(r'('+re.escape(FILTER_URL).replace(',','|')+')',flags=re.IGNORECASE)
2510
+ for link in linksFound:
2511
+ # Only add the link if:
2512
+ # a) the -ra / --regex-after argument, if passed, matches it
2513
+ # b) it does not match the URL exclusions
2514
+ if (args.regex_after is None or re.search(args.regex_after, link, flags=re.IGNORECASE) is not None) and exclusionRegex.search(link) is None:
2515
+ linkRequests.append(link)
2516
+
2517
+ # Write the links to a temp file
2518
+ if not args.check_only:
2519
+ with open(responsesPath,'wb') as f:
2520
+ pickle.dump(linkRequests, f)
2521
+
2522
+ # Get the total number of responses we will try to get and set the current file count to the success count
2523
+ totalResponses = len(linkRequests)
2524
+ fileCount = successCount
2525
+
2526
+ if args.check_only:
2527
+ if args.limit == 5000 and totalResponses+1 == 5000:
2528
+ writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests (the --limit argument defaults to '+str(DEFAULT_LIMIT)+')','cyan'))
2529
+ else:
2530
+ writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests','white'))
2531
+ minutes = round(totalResponses*2.5 // 60)
2532
+ hours = minutes // 60
2533
+ days = hours // 24
2534
+ if minutes < 5:
2535
+ write(colored('\n-> Downloading the responses (depending on their size) should be quite quick!','green'))
2536
+ elif hours < 2:
2537
+ write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(minutes)+' minutes.','green'))
2538
+ elif hours < 6:
2539
+ write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','green'))
2540
+ elif hours < 24:
2541
+ write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','yellow'))
2542
+ elif days < 7:
2543
+ write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days. Consider using arguments -ko, -l, -ci, -from and -to wisely! ','red'))
2544
+ else:
2545
+ write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days!!! Consider using arguments -ko, -l, -ci, -from and -to wisely!','red'))
2546
+ write('')
2547
+ else:
2548
+ # If the limit has been set over the default, give a warning that this could take a long time!
2549
+ if totalResponses - successCount > DEFAULT_LIMIT:
2550
+ if successCount > 0:
2551
+ writerr(colored(getSPACER('WARNING: Downloading remaining ' + str(totalResponses - successCount) + ' responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!'),'yellow'))
2552
+ else:
2553
+ writerr(colored(getSPACER('WARNING: Downloading ' + str(totalResponses) + ' responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!'),'yellow'))
2554
+
2555
+ # Open the index file if hash value is going to be used (not URL)
2556
+ if not args.url_filename:
2557
+ indexFile = open(indexPath,'a')
2558
+
2559
+ # Open the continueResp.tmp file to store which record we are up to
2560
+ continueRespFile = open(continuePath,'w+')
2561
+
2562
+ # Process the URLs from web archive
2563
+ if stopProgram is None:
2564
+ p = mp.Pool(args.processes)
2565
+ p.map(processArchiveUrl, linkRequests[successCount:])
2566
+ p.close()
2567
+ p.join()
2568
+
2569
+ # Delete the tmp files now that it has run successfully
2570
+ if stopProgram is None:
2571
+ try:
2572
+ os.remove(continuePath)
2573
+ os.remove(responsesPath)
2574
+ except:
2575
+ pass
2576
+
2577
+ # Close the index file if hash value is going to be used (not URL)
2578
+ if not args.url_filename:
2579
+ indexFile.close()
2580
+
2581
+ # Close the continueResp.tmp file
2582
+ continueRespFile.close()
2583
+
2584
+ except Exception as e:
2585
+ writerr(colored(getSPACER('ERROR processResponses 1: ' + str(e)), 'red'))
2586
+ finally:
2587
+ linkRequests = None
2588
+
2589
+ def createDirs():
2590
+ """
2591
+ Create the 'results' directory and the sub-directory for the passed domain/URL, unless
2592
+ -oR / --output-responses was passed, in which case just create those directories
2593
+ """
2594
+ global DEFAULT_OUTPUT_DIR, argsInput
2595
+ try:
2596
+ if (args.mode in 'R,B' and args.output_responses == '') or (args.mode in 'U,B' and args.output_urls == ''):
2597
+ # Create a directory for "results" if it doesn't already exist
2598
+ try:
2599
+ results_dir = Path(DEFAULT_OUTPUT_DIR+'/results')
2600
+ results_dir.mkdir(exist_ok=True)
2601
+ except:
2602
+ pass
2603
+ # Create a directory for the target domain
2604
+ try:
2605
+ domain_dir = Path(DEFAULT_OUTPUT_DIR + '/results/' + str(argsInput).replace('/','-'))
2606
+ domain_dir.mkdir(parents=True, exist_ok=True)
2607
+ except Exception as e:
2608
+ pass
2609
+ try:
2610
+ # Create specified directory for -oR if required
2611
+ if args.output_responses != '':
2612
+ responseDir = Path(args.output_responses)
2613
+ responseDir.mkdir(parents=True, exist_ok=True)
2614
+ # If -oU was passed and is prefixed with a directory, create it
2615
+ if args.output_urls != '' and '/' in args.output_urls:
2616
+ directoriesOnly = os.path.dirname(args.output_urls)
2617
+ responseDir = Path(directoriesOnly)
2618
+ responseDir.mkdir(parents=True, exist_ok=True)
2619
+ except Exception as e:
2620
+ pass
2621
+ except Exception as e:
2622
+ writerr(colored(getSPACER('ERROR createDirs 1: ' + str(e)), 'red'))
2623
+
2624
+ # Get width of the progress bar based on the width of the terminal
2625
+ def getProgressBarLength():
2626
+ global terminalWidth
2627
+ try:
2628
+ if verbose():
2629
+ offset = 90
2630
+ else:
2631
+ offset = 50
2632
+ progressBarLength = terminalWidth - offset
2633
+ except:
2634
+ progressBarLength = 20
2635
+ return progressBarLength
2636
+
2637
+ # Get the length of the space to add to a string to fill line up to width of terminal
2638
+ def getSPACER(text):
2639
+ global terminalWidth
2640
+ lenSpacer = terminalWidth - len(text) +5
2641
+ SPACER = ' ' * lenSpacer
2642
+ return text + SPACER
2643
+
2644
+ # For validating -m / --memory-threshold argument
2645
+ def argcheckPercent(value):
2646
+ ivalue = int(value)
2647
+ if ivalue > 99:
2648
+ raise argparse.ArgumentTypeError(
2649
+ "A valid integer percentage less than 100 must be entered."
2650
+ )
2651
+ return ivalue
2652
+
2653
+ def notifyDiscord():
2654
+ global WEBHOOK_DISCORD, args
2655
+ try:
2656
+ data = {
2657
+ 'content': 'waymore has finished for `-i ' + args.input + ' -mode ' + args.mode + '` ! 🤘',
2658
+ 'username': 'waymore',
2659
+ }
2660
+ try:
2661
+ result = requests.post(WEBHOOK_DISCORD, json=data)
2662
+ if not (200 <= result.status_code < 300):
2663
+ writerr(colored(getSPACER('WARNING: Failed to send notification to Discord - ' + str(result.json())), 'yellow'))
2664
+ except Exception as e:
2665
+ writerr(colored(getSPACER('WARNING: Failed to send notification to Discord - ' + str(e)), 'yellow'))
2666
+ except Exception as e:
2667
+ writerr(colored('ERROR notifyDiscord 1: ' + str(e), 'red'))
2668
+
2669
+ def checkScript(script):
2670
+ try:
2671
+ if script.replace('\n','').strip() != '':
2672
+ return True
2673
+ except Exception as e:
2674
+ writerr(colored('ERROR checkScript 1: ' + str(e), 'red'))
2675
+
2676
+ def extractScripts(filePath):
2677
+ try:
2678
+ with open(filePath, 'rb') as file:
2679
+ content = file.read().decode('utf-8', errors='ignore')
2680
+ scripts = re.findall(r'<script[^>]*>(.*?)</script>', content, re.DOTALL)
2681
+ scripts = list(filter(checkScript, scripts))
2682
+ return scripts
2683
+ except Exception as e:
2684
+ writerr(colored('ERROR extractScripts 1: ' + str(e), 'red'))
2685
+
2686
+ def extractExternalScripts(filePath):
2687
+ try:
2688
+ with open(filePath, 'rb') as file:
2689
+ content = file.read().decode('utf-8', errors='ignore')
2690
+ scripts = re.findall(r'<script[^>]* src="(.*?)".*?>', content, re.DOTALL)
2691
+ scripts = list(filter(checkScript, scripts))
2692
+ return scripts
2693
+ except Exception as e:
2694
+ writerr(colored('ERROR extractExternalScripts 1: ' + str(e), 'red'))
2695
+
2696
+ def combineInlineJS():
2697
+ global responseOutputDirectory, INLINE_JS_EXCLUDE
2698
+ try:
2699
+ write(colored('Creating combined inline JS files...', 'cyan'))
2700
+ outputFileTemplate = "combinedInline{}.js"
2701
+ excludedNames = ['index.txt', 'continueResp.tmp', 'responses.tmp']
2702
+ fileList = [name for name in os.listdir(responseOutputDirectory)
2703
+ if os.path.isfile(os.path.join(responseOutputDirectory, name))
2704
+ and not any(name.lower().endswith(ext) for ext in INLINE_JS_EXCLUDE)
2705
+ and name not in excludedNames
2706
+ and 'combinedInline' not in name]
2707
+
2708
+ allScripts = {} # To store all scripts from all files
2709
+ allExternalScripts = [] # To store all external script sources from all files
2710
+
2711
+ fileCount = len(fileList)
2712
+ currentFile = 1
2713
+ for filename in fileList:
2714
+ filePath = os.path.join(responseOutputDirectory, filename)
2715
+ scripts = extractScripts(filePath)
2716
+ if scripts:
2717
+ allScripts[filename] = scripts
2718
+ allExternalScripts.extend(extractExternalScripts(filePath))
2719
+
2720
+ # Show progress bar
2721
+ fillTest = currentFile % 2
2722
+ fillChar = "o"
2723
+ if fillTest == 0:
2724
+ fillChar = "O"
2725
+ suffix="Complete "
2726
+ printProgressBar(
2727
+ currentFile,
2728
+ fileCount,
2729
+ prefix="Checking "+str(fileCount)+" files:",
2730
+ suffix=suffix,
2731
+ length=getProgressBarLength(),
2732
+ fill=fillChar
2733
+ )
2734
+ currentFile += 1
2735
+
2736
+ # Write a file of external javascript files referenced in the inline scripts
2737
+ totalExternal = len(allExternalScripts)
2738
+ if totalExternal > 0:
2739
+ uniqueExternalScripts = set(allExternalScripts)
2740
+ outputFile = os.path.join(responseOutputDirectory, 'combinedInlineSrc.txt')
2741
+ inlineExternalFile = open(outputFile, 'w', encoding='utf-8')
2742
+ for script in uniqueExternalScripts:
2743
+ inlineExternalFile.write(script.strip() + '\n')
2744
+ inlineExternalFile.close()
+ write(colored('Created file ','cyan')+colored(responseOutputDirectory+'combinedInlineSrc.txt','white')+colored(' (src of external JS)','cyan'))
2745
+
2746
+ # Write files for all combined inline JS
2747
+ uniqueScripts = set()
2748
+ for scriptsList in allScripts.values():
2749
+ uniqueScripts.update(scriptsList)
2750
+
2751
+ totalSections = len(uniqueScripts)
2752
+ sectionCounter = 0 # Counter for inline JS sections
2753
+ currentOutputFile = os.path.join(responseOutputDirectory, outputFileTemplate.format(1))
2754
+ currentSectionsWritten = 0 # Counter for sections written in current file
2755
+
2756
+ if totalSections > 0:
2757
+ fileNumber = 1
2758
+ with open(currentOutputFile, 'w', encoding='utf-8') as inlineJSFile:
2759
+ currentScript = 1
2760
+ for script in uniqueScripts:
2761
+ # Show progress bar
2762
+ fillTest = currentScript % 2
2763
+ fillChar = "o"
2764
+ if fillTest == 0:
2765
+ fillChar = "O"
2766
+ suffix="Complete "
2767
+ printProgressBar(
2768
+ currentScript,
2769
+ totalSections,
2770
+ prefix="Writing "+str(totalSections)+" unique scripts:",
2771
+ suffix=suffix,
2772
+ length=getProgressBarLength(),
2773
+ fill=fillChar
2774
+ )
2775
+ sectionCounter += 1
2776
+ currentSectionsWritten += 1
2777
+ if currentSectionsWritten > 1000:
2778
+ # If 1000 sections have been written, switch to the next output file
2779
+ inlineJSFile.close()
2780
+ fileNumber = sectionCounter // 1000 + 1
2781
+ currentOutputFile = os.path.join(responseOutputDirectory, outputFileTemplate.format(fileNumber))
2782
+ inlineJSFile = open(currentOutputFile, 'w', encoding='utf-8')
2783
+ currentSectionsWritten = 1
2784
+
2785
+ # Insert comment line for the beginning of the section
2786
+ inlineJSFile.write(f"//****** INLINE JS SECTION {sectionCounter} ******//\n\n")
2787
+
2788
+ # Write comments indicating the files the script was found in
2789
+ files = ''
2790
+ for filename, scripts_list in allScripts.items():
2791
+ if script in scripts_list:
2792
+ files = files + filename + ', '
2793
+
2794
+ # Write the files the script appears in
2795
+ inlineJSFile.write('// ' + files.rstrip(', ') + '\n')
2796
+
2797
+ # Write the script content
2798
+ inlineJSFile.write('\n' + script.strip() + '\n\n')
2799
+
2800
+ currentScript += 1
2801
+
2802
+ if totalSections == 0:
2803
+ write(colored('No inline scripts found, so no combined inline JS files written.\n','cyan'))
2804
+ elif fileNumber == 1:
2805
+ write(colored('Created file ','cyan')+colored(responseOutputDirectory+'combinedInline1.js','white')+colored(' (contents of inline JS)\n','cyan'))
2806
+ else:
2807
+ write(colored('Created files ','cyan')+colored(responseOutputDirectory+'combinedInline{1-'+str(fileNumber)+'}.js','white')+colored(' (contents of inline JS)\n','cyan'))
2808
+
2809
+ except Exception as e:
2810
+ writerr(colored('ERROR combineInlineJS 1: ' + str(e), 'red'))
2811
+
2812
+ # Run waymore
2813
+ def main():
2814
+ global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory
2815
+
2816
+ # Tell Python to run the handler() function when SIGINT is received
2817
+ signal(SIGINT, handler)
2818
+
2819
+ # Parse command line arguments
2820
+ parser = argparse.ArgumentParser(
2821
+ description='waymore - by @Xnl-h4ck3r: Find way more from the Wayback Machine'
2822
+ )
2823
+ parser.add_argument(
2824
+ '-i',
2825
+ '--input',
2826
+ action='store',
2827
+ help='The target domain (or file of domains) to find links for. This can be a domain only, or a domain with a specific path. If it is a domain only to get everything for that domain, don\'t prefix with "www."',
2828
+ type=validateArgInput
2829
+ )
2830
+ parser.add_argument(
2831
+ '-n',
2832
+ '--no-subs',
2833
+ action='store_true',
2834
+ help='Don\'t include subdomains of the target domain (only used if input is not a domain with a specific path).',
2835
+ )
2836
+ parser.add_argument(
2837
+ '-mode',
2838
+ action='store',
2839
+ help='The mode to run: U (retrieve URLs only), R (download Responses only) or B (Both).',
2840
+ choices = ['U','R','B'],
2841
+ default='B'
2842
+ )
2843
+ parser.add_argument(
2844
+ '-oU',
2845
+ '--output-urls',
2846
+ action='store',
2847
+ help='The file to save the Links output to, including path if necessary. If the "-oR" argument is not passed, a "results" directory will be created in the path specified by the DEFAULT_OUTPUT_DIR key in config.yml file (typically defaults to "~/.config/waymore/"). Within that, a directory will be created with target domain (or domain with path) passed with "-i" (or for each line of a file passed with "-i").' ,
2848
+ default='',
2849
+ )
2850
+ parser.add_argument(
2851
+ '-oR',
2852
+ '--output-responses',
2853
+ action='store',
2854
+ help='The directory to save the response output files to, including path if necessary. If the argument is not passed, a "results" directory will be created in the path specified by the DEFAULT_OUTPUT_DIR key in config.yml file (typically defaults to "~/.config/waymore/"). Within that, a directory will be created with target domain (or domain with path) passed with "-i" (or for each line of a file passed with "-i").' ,
2855
+ default='',
2856
+ )
2857
+ parser.add_argument(
2858
+ '-f',
2859
+ '--filter-responses-only',
2860
+ action='store_true',
2861
+ help='The initial links from Wayback Machine will not be filtered (MIME Type and Response Code), only the responses that are downloaded, e.g. it may be useful to still see all available paths from the links even if you don\'t want to check the content.',
2862
+ )
2863
+ parser.add_argument(
2864
+ '-fc',
2865
+ action='store',
2866
+ help='Filter HTTP status codes for retrieved URLs and responses. Comma separated list of codes (default: the FILTER_CODE values from config.yml). Passing this argument will override the value from config.yml',
2867
+ type=validateArgStatusCodes,
2868
+ )
2869
+ parser.add_argument(
2870
+ '-mc',
2871
+ action='store',
2872
+ help='Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config FILTER_CODE and -fc.',
2873
+ type=validateArgStatusCodes,
2874
+ )
2875
+ parser.add_argument(
2876
+ '-l',
2877
+ '--limit',
2878
+ action='store',
2879
+ type=int,
2880
+ help='How many responses will be saved (if -mode is R or B). A positive value will get the first N results, a negative value will get the last N results. A value of 0 will get ALL responses (default: '+str(DEFAULT_LIMIT)+')',
2881
+ default=DEFAULT_LIMIT,
2882
+ metavar='<signed integer>'
2883
+ )
2884
+ parser.add_argument(
2885
+ '-from',
2886
+ '--from-date',
2887
+ action='store',
2888
+ type=int,
2889
+ help='What date to get responses from. If not specified it will get from the earliest possible results. A partial value can be passed, e.g. 2016, 201805, etc.',
2890
+ metavar='<yyyyMMddhhmmss>'
2891
+ )
2892
+ parser.add_argument(
2893
+ '-to',
2894
+ '--to-date',
2895
+ action='store',
2896
+ type=int,
2897
+ help='What date to get responses to. If not specified it will get to the latest possible results. A partial value can be passed, e.g. 2016, 201805, etc.',
2898
+ metavar='<yyyyMMddhhmmss>'
2899
+ )
2900
+ parser.add_argument(
2901
+ '-ci',
2902
+ '--capture-interval',
2903
+ action='store',
2904
+ choices=['h', 'd', 'm', 'none'],
2905
+ help='Filters the search on Wayback Machine (archive.org) to only get at most 1 capture per hour (h), day (d) or month (m). This filter is used for responses only. The default is \'d\' but can also be set to \'none\' to not filter anything and get all responses.',
2906
+ default='d'
2907
+ )
2908
+ parser.add_argument(
2909
+ '-ra',
2910
+ '--regex-after',
2911
+ help='RegEx for filtering purposes against links found from all sources of URLs AND responses downloaded. Only positive matches will be output.',
2912
+ action='store',
2913
+ )
2914
+ parser.add_argument(
2915
+ '-url-filename',
2916
+ action='store_true',
2917
+ help='Set the file name of downloaded responses to the URL that generated the response, otherwise it will be set to the hash value of the response. Using the hash value means multiple URLs that generated the same response will only result in one file being saved for that response.',
2918
+ default=False
2919
+ )
2920
+ parser.add_argument(
2921
+ '-xwm',
2922
+ action='store_true',
2923
+ help='Exclude checks for links from Wayback Machine (archive.org)',
2924
+ default=False
2925
+ )
2926
+ parser.add_argument(
2927
+ '-xcc',
2928
+ action='store_true',
2929
+ help='Exclude checks for links from commoncrawl.org',
2930
+ default=False
2931
+ )
2932
+ parser.add_argument(
2933
+ '-xav',
2934
+ action='store_true',
2935
+ help='Exclude checks for links from alienvault.com',
2936
+ default=False
2937
+ )
2938
+ parser.add_argument(
2939
+ '-xus',
2940
+ action='store_true',
2941
+ help='Exclude checks for links from urlscan.io',
2942
+ default=False
2943
+ )
2944
+ parser.add_argument(
2945
+ '-xvt',
2946
+ action='store_true',
2947
+ help='Exclude checks for links from virustotal.com',
2948
+ default=False
2949
+ )
2950
+ parser.add_argument(
2951
+ '-lcc',
2952
+ action='store',
2953
+ type=int,
2954
+ help='Limit the number of Common Crawl index collections searched, e.g. \'-lcc 10\' will just search the latest 10 collections (default: 3). As of July 2023 there are currently 95 collections. Setting to 0 will search ALL collections. If you don\'t want to search Common Crawl at all, use the -xcc option.'
2955
+ )
2956
+ parser.add_argument(
2957
+ '-lcy',
2958
+ action='store',
2959
+ type=int,
2960
+ help='Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections of any year (but in conjunction with -lcc). For example, if you are only interested in data from 2015 and after, pass -lcy 2015. If you don\'t want to search Common Crawl at all, use the -xcc option.',
2961
+ default=0
2962
+ )
2963
+ parser.add_argument(
2964
+ '-t',
2965
+ '--timeout',
2966
+ help='This is for archived responses only! How many seconds to wait for the server to send data before giving up (default: '+str(DEFAULT_TIMEOUT)+' seconds)',
2967
+ default=DEFAULT_TIMEOUT,
2968
+ type=int,
2969
+ metavar="<seconds>",
2970
+ )
2971
+ parser.add_argument(
2972
+ '-p',
2973
+ '--processes',
2974
+ help='Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1)',
2975
+ action='store',
2976
+ type=validateArgProcesses,
2977
+ default=1,
2978
+ metavar="<integer>",
2979
+ )
2980
+ parser.add_argument(
2981
+ '-r',
2982
+ '--retries',
2983
+ action='store',
2984
+ type=int,
2985
+ help='The number of retries for requests that get connection error or rate limited (default: 1).',
2986
+ default=1
2987
+ )
2988
+ parser.add_argument(
2989
+ "-m",
2990
+ "--memory-threshold",
2991
+ action="store",
2992
+ help="The memory threshold percentage. If the machine's memory goes above the threshold, the program will be stopped and ended gracefully before running out of memory (default: 95)",
2993
+ default=95,
2994
+ metavar="<integer>",
2995
+ type=argcheckPercent,
2996
+ )
2997
+ parser.add_argument(
2998
+ '-ko',
2999
+ '--keywords-only',
3000
+ action='store',
3001
+ help=r'Only return links and responses that contain keywords that you are interested in. This can reduce the time it takes to get results. If you provide the flag with no value, keywords are taken from the comma separated list in the "config.yml" file with the "FILTER_KEYWORDS" key, otherwise you can pass a specific Regex value to use, e.g. -ko "admin" to only get links containing the word admin, or -ko "\.js(\?|$)" to only get JS files. The Regex check is NOT case sensitive.',
3002
+ nargs='?',
3003
+ const="#CONFIG"
3004
+ )
3005
+ parser.add_argument(
3006
+ '-lr',
3007
+ '--limit-requests',
3008
+ type=int,
3009
+ help='Limit the number of requests that will be made when getting links from a source (this doesn\'t apply to Common Crawl). Some targets can require a huge number of requests that are just not feasible to make, so this can be used to manage that situation. This defaults to 0 (Zero) which means there is no limit.',
3010
+ default=0,
3011
+ )
3012
+ parser.add_argument(
3013
+ "-ow",
3014
+ "--output-overwrite",
3015
+ action="store_true",
3016
+ help="If the URL output file (default waymore.txt) already exists, it will be overwritten instead of being appended to.",
3017
+ )
3018
+ parser.add_argument(
3019
+ "-nlf",
3020
+ "--new-links-file",
3021
+ action="store_true",
3022
+ help="If this argument is passed, a .new file will also be written that will contain links for the latest run. This is only relevant for mode U.",
3023
+ )
3024
+ parser.add_argument(
3025
+ "-c",
3026
+ "--config",
3027
+ action="store",
3028
+ help="Path to the YML config file. If not passed, it looks for file 'config.yml' in the same directory as runtime file 'waymore.py'.",
3029
+ )
3030
+ parser.add_argument(
3031
+ '-wrlr',
3032
+ '--wayback-rate-limit-retry',
3033
+ action='store',
3034
+ type=int,
3035
+ help='The number of minutes the user wants to wait for a rate limit pause on Wayback Machine (archive.org) instead of stopping with a 429 error (default: 3).',
3036
+ default=3
3037
+ )
3038
+ parser.add_argument(
3039
+ '-urlr',
3040
+ '--urlscan-rate-limit-retry',
3041
+ action='store',
3042
+ type=int,
3043
+ help='The number of minutes the user wants to wait for a rate limit pause on URLScan.io instead of stopping with a 429 error (default: 1).',
3044
+ default=1
3045
+ )
3046
+ parser.add_argument(
3047
+ "-co",
3048
+ "--check-only",
3049
+ action="store_true",
3050
+ help="This will make a few minimal requests to show you how many requests, and roughly how long it could take, to get URLs from the sources and to download responses from the Wayback Machine.",
3051
+ )
3052
+ parser.add_argument(
3053
+ "-nd",
3054
+ "--notify-discord",
3055
+ action="store_true",
3056
+ help="Whether to send a notification to Discord when waymore completes. It requires WEBHOOK_DISCORD to be provided in the config.yml file.",
3057
+ )
3058
+ parser.add_argument(
3059
+ '-oijs',
3060
+ '--output-inline-js',
3061
+ action="store_true",
3062
+ help='Whether to save combined inline JavaScript of all relevant files in the response directory when "-mode R" (or "-mode B") has been used. The files are saved with the name "combinedInline{}.js" where "{}" is the number of the file, saving 1000 unique scripts per file.'
3063
+ )
3064
+ parser.add_argument('-v', '--verbose', action='store_true', help="Verbose output")
3065
+ parser.add_argument('--version', action='store_true', help="Show version number")
3066
+ args = parser.parse_args()
3067
+
3068
+ # If --version was passed, display version and exit
3069
+ if args.version:
3070
+ write(colored('Waymore - v' + __version__,'cyan'))
3071
+ sys.exit()
3072
+
3073
+ # If -lcc wasn't passed, set it to the default of 3 if -lcy is 0, otherwise set it to 0. This makes the two arguments work together
3074
+ if args.lcc is None:
3075
+ if args.lcy == 0:
3076
+ args.lcc = 3
3077
+ else:
3078
+ args.lcc = 0
3079
+
3080
+ # If no input was given, raise an error
3081
+ if sys.stdin.isatty():
3082
+ if args.input is None:
3083
+ writerr(colored('You need to provide an input with -i argument or through <stdin>.', 'red'))
3084
+ sys.exit()
3085
+ else:
3086
+ validateArgInput('<stdin>')
3087
+
3088
+ # Get the current Process ID to use to get memory usage that is displayed with -vv option
3089
+ try:
3090
+ process = psutil.Process(os.getpid())
3091
+ except:
3092
+ pass
3093
+
3094
+ showBanner()
3095
+
3096
+ try:
3097
+
3098
+ # For each input (maybe multiple if a file was passed)
3099
+ for inpt in inputValues:
3100
+
3101
+ argsInput = inpt.strip().rstrip('\n').strip('.').lower()
3102
+
3103
+ # Get the input hostname
3104
+ tldExtract = tldextract.extract(argsInput)
3105
+ subDomain = tldExtract.subdomain
3106
+ inputIsSubDomain = False
3107
+ if subDomain != '':
3108
+ inputIsSubDomain = True
3109
+ subDomain = subDomain+'.'
3110
+ argsInputHostname = subDomain+tldExtract.domain+'.'+tldExtract.suffix
3111
+
3112
+ # Warn the user if a subdomain may have been passed
3113
+ if inputIsSubDomain:
3114
+ writerr(colored(getSPACER('IMPORTANT: It looks like you may be passing a subdomain. If you want ALL subs for a domain, then pass the domain only. It will be a LOT quicker, and you won\'t miss anything. NEVER pass a file of subdomains if you want everything, just the domains.\n'),'yellow'))
3115
+
3116
+ # Reset global variables
3117
+ linksFound = set()
3118
+ linkMimes = set()
3119
+ successCount = 0
3120
+ failureCount = 0
3121
+ fileCount = 0
3122
+ totalResponses = 0
3123
+ totalPages = 0
3124
+ indexFile = None
3125
+ path = ''
3126
+ stopSource = False
3127
+
3128
+ # Get the config settings from the config.yml file
3129
+ getConfig()
3130
+
3131
+ if verbose():
3132
+ showOptions()
3133
+
3134
+ if args.check_only:
3135
+ write(colored('*** Checking requests needed for ','cyan')+colored(argsInput,'white')+colored(' ***\n','cyan'))
3136
+
3137
+ # If the mode is U (URLs retrieved) or B (URLs retrieved AND Responses downloaded)
3138
+ if args.mode in ['U','B']:
3139
+
3140
+ # If not requested to exclude, get URLs from the Wayback Machine (archive.org)
3141
+ if not args.xwm and stopProgram is None:
3142
+ getWaybackUrls()
3143
+
3144
+ # If not requested to exclude, get URLs from commoncrawl.org
3145
+ if not args.xcc and stopProgram is None:
3146
+ getCommonCrawlUrls()
3147
+
3148
+ # If not requested to exclude and a TLD wasn't passed, get URLs from alienvault.com
3149
+ if not args.xav and stopProgram is None and not inpt.startswith('.'):
3150
+ getAlienVaultUrls()
3151
+
3152
+ # If not requested to exclude, get URLs from urlscan.io
3153
+ if not args.xus and stopProgram is None:
3154
+ getURLScanUrls()
3155
+
3156
+ # If not requested to exclude, get URLs from virustotal.com if we have an API key
3157
+ if not args.xvt and VIRUSTOTAL_API_KEY != '' and stopProgram is None:
3158
+ getVirusTotalUrls()
3159
+
3160
+ # Output results of all searches
3161
+ processURLOutput()
3162
+
3163
+ # Clean up
3164
+ linkMimes = None
3165
+
3166
+ # If we want to get actual archived responses from archive.org...
3167
+ if (args.mode in ['R','B']) and stopProgram is None:
3168
+
3169
+ # Get the output directory for responses
3170
+ if args.output_responses != '':
3171
+ responseOutputDirectory = args.output_responses + '/'
3172
+ else:
3173
+ responseOutputDirectory = str(DEFAULT_OUTPUT_DIR) + '/results/' + str(argsInput).replace('/','-') + '/'
3174
+
3175
+ processResponses()
3176
+
3177
+ # Output details of the responses downloaded
3178
+ if not args.check_only:
3179
+ processResponsesOutput()
3180
+
3181
+ # If requested, generate the combined inline JS files
3182
+ if stopProgram is None and fileCount > 0 and args.output_inline_js:
3183
+ combineInlineJS()
3184
+
3185
+ if args.check_only:
3186
+ write(colored('NOTE: The time frames are a very rough guide and don\'t take into account additional time for rate limiting.','magenta'))
3187
+
3188
+ # Output stats if -v option was selected
3189
+ if verbose():
3190
+ processStats()
3191
+
3192
+ # If the program was stopped then alert the user
3193
+ if stopProgram is not None:
3194
+ if stopProgram == StopProgram.MEMORY_THRESHOLD:
3195
+ writerr(
3196
+ colored(
3197
+ "YOUR MEMORY USAGE REACHED "
3198
+ + str(maxMemoryPercent)
3199
+ + "% SO THE PROGRAM WAS STOPPED. DATA IS LIKELY TO BE INCOMPLETE.\n",
3200
+ "red",
3201
+ )
3202
+ )
3203
+ elif stopProgram == StopProgram.WEBARCHIVE_PROBLEM:
3204
+ writerr(
3205
+ colored(
3206
+ "THE PROGRAM WAS STOPPED DUE TO PROBLEM GETTING DATA FROM WAYBACK MACHINE (ARCHIVE.ORG)\n",
3207
+ "red",
3208
+ )
3209
+ )
3210
+ else:
3211
+ writerr(
3212
+ colored(
3213
+ "THE PROGRAM WAS STOPPED. DATA IS LIKELY TO BE INCOMPLETE.\n",
3214
+ "red",
3215
+ )
3216
+ )
3217
+
3218
+ except Exception as e:
3219
+ writerr(colored('ERROR main 1: ' + str(e), 'red'))
3220
+
3221
+ finally:
3222
+ # Send a notification to discord if requested
3223
+ try:
3224
+ if args.notify_discord and WEBHOOK_DISCORD != '':
3225
+ notifyDiscord()
3226
+ except:
3227
+ pass
3228
+ try:
3229
+ if sys.stdout.isatty():
3230
+ writerr(colored('✅ Want to buy me a coffee? ☕ https://ko-fi.com/xnlh4ck3r 🤘', 'green'))
3231
+ except:
3232
+ pass
3233
+ # Clean up
3234
+ linksFound = None
3235
+ linkMimes = None
3236
+ inputValues = None
3237
+
3238
+ if __name__ == '__main__':
3239
+ main()