waymore 6.6-py3-none-any.whl → 7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -1
- waymore/waymore.py +831 -737
- {waymore-6.6.dist-info → waymore-7.1.dist-info}/METADATA +8 -5
- waymore-7.1.dist-info/RECORD +8 -0
- {waymore-6.6.dist-info → waymore-7.1.dist-info}/WHEEL +1 -1
- waymore-6.6.dist-info/RECORD +0 -8
- {waymore-6.6.dist-info → waymore-7.1.dist-info}/entry_points.txt +0 -0
- {waymore-6.6.dist-info → waymore-7.1.dist-info/licenses}/LICENSE +0 -0
- {waymore-6.6.dist-info → waymore-7.1.dist-info}/top_level.txt +0 -0
waymore/waymore.py
CHANGED
@@ -5,6 +5,7 @@
 # Good luck and good hunting! If you really love the tool (or any others), or they helped you find an awesome bounty, consider BUYING ME A COFFEE! (https://ko-fi.com/xnlh4ck3r) ☕ (I could use the caffeine!)
 
 import argparse
+import asyncio
 import enum
 import json
 import math
@@ -14,7 +15,7 @@ import pickle
 import random
 import re
 import sys
-import
+import threading
 from datetime import datetime, timedelta
 from pathlib import Path
 from signal import SIGINT, signal
@@ -60,6 +61,12 @@ argsInput = ""
 isInputFile = False
 stopProgramCount = 0
 stopSource = False
+stopSourceWayback = False
+stopSourceCommonCrawl = False
+stopSourceAlienVault = False
+stopSourceURLScan = False
+stopSourceVirusTotal = False
+stopSourceIntelx = False
 successCount = 0
 failureCount = 0
 fileCount = 0
@@ -80,6 +87,10 @@ currentMemUsage = 0
 maxMemoryPercent = 0
 currentMemPercent = 0
 process = None
+current_response = None
+current_session = None
+# Event used to interrupt long sleeps (e.g., rate-limit waits) when SIGINT is received
+interrupt_event = threading.Event()
 HTTP_ADAPTER = None
 HTTP_ADAPTER_CC = None
 checkWayback = 0
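The new `interrupt_event` exists so that rate-limit waits can be cut short: `threading.Event.wait(timeout)` returns `True` as soon as the event is set, instead of sleeping the full duration like `time.sleep()`. A minimal standalone sketch of the pattern (the `wait_or_abort` name and timings are illustrative, not from the package):

```python
import threading

# Module-level event, set from a signal handler to abort long waits early
interrupt_event = threading.Event()

def wait_or_abort(seconds):
    """Sleep for `seconds`, but return True immediately if interrupted.

    Event.wait() returns True when the event was set before the timeout
    (i.e. we were interrupted) and False when the timeout elapsed normally.
    """
    interrupt_event.clear()
    return interrupt_event.wait(seconds)

if wait_or_abort(5):
    print("interrupted - stopping early")
else:
    print("waited the full 5 seconds")
```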
@@ -91,6 +102,20 @@ checkIntelx = 0
 argsInputHostname = ""
 responseOutputDirectory = ""
 urlscanRequestLinks = set()
+intelxAPIIssue = False
+linkCountWayback = 0
+linkCountCommonCrawl = 0
+linkCountAlienVault = 0
+linkCountURLScan = 0
+linkCountVirusTotal = 0
+linkCountIntelx = 0
+
+# Thread lock for protecting shared state during concurrent operations
+links_lock = threading.Lock()
+
+# Shared state for link collection across all sources
+linksFound = set()
+linkMimes = set()
 
 # Source Provider URLs
 WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
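With each source now running in parallel and writing into `linksFound` and `linkMimes`, every mutation is serialized behind `links_lock`. A small illustration of the idea (names and counts illustrative, not the package's):

```python
import threading

links_lock = threading.Lock()
linksFound = set()

def add_links(worker_id):
    # Serialize all mutations of the shared set behind one lock so that
    # concurrent writers cannot interleave mid-update
    for n in range(1000):
        with links_lock:
            linksFound.add(f"https://example.com/{worker_id}/{n}")

threads = [threading.Thread(target=add_links, args=(i,)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(len(linksFound))  # 4000 - no lost updates
```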
@@ -133,7 +158,7 @@ DEFAULT_LIMIT = 5000
 DEFAULT_TIMEOUT = 30
 
 # Exclusions used to exclude responses we will try to get from web.archive.org
-DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap"
+DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource"
 
 # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
 DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff"
@@ -329,7 +354,7 @@ def handler(signal_received, frame):
     This function is called if Ctrl-C is called by the user
     An attempt will be made to try and clean up properly
     """
-    global stopSource, stopProgram, stopProgramCount
+    global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
 
     if stopProgram is not None:
         stopProgramCount = stopProgramCount + 1
@@ -358,6 +383,34 @@ def handler(signal_received, frame):
     else:
         stopProgram = StopProgram.SIGINT
         stopSource = True
+        stopSourceWayback = True
+        stopSourceCommonCrawl = True
+        stopSourceAlienVault = True
+        stopSourceURLScan = True
+        stopSourceVirusTotal = True
+        stopSourceIntelx = True
+        # Try to close any active response or session to interrupt blocking network I/O
+        try:
+            if current_response is not None:
+                try:
+                    current_response.close()
+                except Exception:
+                    pass
+        except Exception:
+            pass
+        try:
+            if current_session is not None:
+                try:
+                    current_session.close()
+                except Exception:
+                    pass
+        except Exception:
+            pass
+        # Signal any waits to stop early
+        try:
+            interrupt_event.set()
+        except Exception:
+            pass
         writerr(
             colored(
                 getSPACER('>>> "Oh my God, they killed Kenny... and waymore!" - Kyle'),
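Setting the per-source stop flags only takes effect between requests; closing the live response/session is what actually breaks a worker out of a blocking socket read. A rough standalone sketch of that idea (URL and names illustrative, error handling trimmed):

```python
import signal
import requests

current_response = None

def handler(signum, frame):
    # Closing the in-flight response makes a blocked iter_lines()/read()
    # raise in the worker thread, so it unblocks instead of hanging
    if current_response is not None:
        try:
            current_response.close()
        except Exception:
            pass

signal.signal(signal.SIGINT, handler)

try:
    current_response = requests.get("https://example.com/big", stream=True, timeout=30)
    for line in current_response.iter_lines():
        pass  # process each line
except Exception:
    pass  # interrupted by Ctrl-C or the connection dropped
```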
@@ -760,7 +813,7 @@ def showOptions():
     if args.mode in ["R", "B"] or (args.mode == "U" and not args.xcc):
         write(
             colored("-p: " + str(args.processes), "magenta")
-            + colored(" The number of parallel requests made.", "white")
+            + colored(" The number of parallel requests made per source.", "white")
         )
         write(
             colored("-r: " + str(args.retries), "magenta")
@@ -1251,7 +1304,7 @@ def fixArchiveOrgUrl(url):
 
 # Add a link to the linksFound collection for archived responses (included timestamp preifx)
 def linksFoundResponseAdd(link):
-    global linksFound, argsInput, argsInputHostname
+    global linksFound, argsInput, argsInputHostname, links_lock
 
     try:
         if inputIsDomainANDPath:
@@ -1272,20 +1325,22 @@ def linksFoundResponseAdd(link):
 
         # Don't write it if the link does not contain the requested domain (this can sometimes happen)
         if parsed_url.lower().find(checkInput.lower()) >= 0:
-            linksFound.add(link)
+            with links_lock:
+                linksFound.add(link)
             # If streaming is enabled and mode is 'U', print the link to stdout
             if args.stream and args.mode == "U":
                 write(link, pipe=True)
     except Exception:
-        linksFound.add(link)
+        with links_lock:
+            linksFound.add(link)
         # If streaming is enabled and mode is 'U', print the link to stdout
         if args.stream and args.mode == "U":
             write(link, pipe=True)
 
 
 # Add a link to the linksFound collection
-def linksFoundAdd(link):
-    global linksFound, argsInput, argsInputHostname
+def linksFoundAdd(link, source_set=None):
+    global linksFound, argsInput, argsInputHostname, links_lock
 
     try:
         if inputIsDomainANDPath:
@@ -1303,12 +1358,20 @@ def linksFoundAdd(link):
 
         # Don't write it if the link does not contain the requested domain (this can sometimes happen)
         if parsed_url.find(checkInput) >= 0:
-            linksFound.add(link)
+            with links_lock:
+                if source_set is not None:
+                    source_set.add(link)
+                else:
+                    linksFound.add(link)
             # If streaming is enabled and mode is 'U', print the link to stdout
             if args.stream and args.mode == "U":
                 write(link, pipe=True)
     except Exception:
-        linksFound.add(link)
+        with links_lock:
+            if source_set is not None:
+                source_set.add(link)
+            else:
+                linksFound.add(link)
         # If streaming is enabled and mode is 'U', print the link to stdout
         if args.stream and args.mode == "U":
             write(link, pipe=True)
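The optional `source_set` argument means each source can accumulate into its own private set while the default behaviour (adding to the global set) is unchanged. The shape of the pattern, reduced to its essentials (the real function also filters by domain and handles streaming output):

```python
import threading

links_lock = threading.Lock()
linksFound = set()

def links_found_add(link, source_set=None):
    # Route the link to the caller's per-source set when given,
    # otherwise fall back to the shared global set
    with links_lock:
        if source_set is not None:
            source_set.add(link)
        else:
            linksFound.add(link)

linksFoundWayback = set()
links_found_add("https://example.com/a", linksFoundWayback)
links_found_add("https://example.com/b")
print(len(linksFoundWayback), len(linksFound))  # 1 1
```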
@@ -1567,12 +1630,10 @@ def processArchiveUrl(url):
                 except Exception as e:
                     writerr(
                         colored(
-
-
-
-
-                            + str(e)
-                        ),
+                            "Wayback - [ ERR ] Failed to write file "
+                            + filePath
+                            + ": "
+                            + str(e),
                             "red",
                         )
                     )
@@ -1588,12 +1649,10 @@ def processArchiveUrl(url):
                 except Exception as e:
                     writerr(
                         colored(
-
-
-
-
-                            + str(e)
-                        ),
+                            'Wayback - [ ERR ] Failed to write to waymore_index.txt for "'
+                            + archiveUrl
+                            + '": '
+                            + str(e),
                             "red",
                         )
                     )
@@ -1631,11 +1690,7 @@ def processArchiveUrl(url):
             if verbose():
                 writerr(
                     colored(
-                        getSPACER(
-                            '[ ERR ] Wayback Machine (archive.org) returned a problem for "'
-                            + archiveUrl
-                            + '"'
-                        ),
+                        'Wayback - [ ERR ] returned a problem for "' + archiveUrl + '"',
                         "red",
                     )
                 )
@@ -1644,11 +1699,7 @@ def processArchiveUrl(url):
             if verbose():
                 writerr(
                     colored(
-                        getSPACER(
-                            '[ ERR ] Wayback Machine (archive.org) connection error for "'
-                            + archiveUrl
-                            + '"'
-                        ),
+                        'Wayback - [ ERR ] connection error for "' + archiveUrl + '"',
                         "red",
                     )
                 )
@@ -1658,25 +1709,21 @@ def processArchiveUrl(url):
             try:
                 writerr(
                     colored(
-
-
-
-
-
-                        + '"'
-                    ),
+                        "Wayback - [ "
+                        + str(resp.status_code)
+                        + ' ] Failed to get response for "'
+                        + archiveUrl
+                        + '"',
                         "red",
                     )
                 )
             except Exception:
                 writerr(
                     colored(
-
-
-
-
-                        + str(e)
-                    ),
+                        'Wayback - [ ERR ] Failed to get response for "'
+                        + archiveUrl
+                        + '": '
+                        + str(e),
                         "red",
                     )
                 )
@@ -1728,7 +1775,7 @@ def processArchiveUrl(url):
 
         except Exception as e:
             if verbose():
-                writerr(colored(
+                writerr(colored('Wayback - [ ERR ] Error for "' + url + '": ' + str(e), "red"))
 
     except Exception as e:
         writerr(colored("ERROR processArchiveUrl 1: " + str(e), "red"))
@@ -1813,7 +1860,7 @@ def processURLOutput():
     linkCount = len(linksFound)
     write(
         getSPACER(
-            colored("
+            colored("\nTotal unique links found for " + subs + argsInput + ": ", "cyan")
             + colored(str(linkCount) + " 🤘", "white")
         )
         + "\n"
@@ -2139,12 +2186,12 @@ def processAlienVaultPage(url):
     """
     Get URLs from a specific page of otx.alienvault.org API for the input domain
     """
-    global totalPages, linkMimes, linksFound,
+    global totalPages, linkMimes, linksFound, stopSourceAlienVault, argsInput, linkCountAlienVault
     try:
         # Get memory in case it exceeds threshold
        getMemory()
 
-        if not
+        if not stopSourceAlienVault:
             try:
                 # Choose a random user agent string to use for any requests
                 userAgent = random.choice(USER_AGENT)
@@ -2156,7 +2203,7 @@ def processAlienVaultPage(url):
             except ConnectionError:
                 writerr(
                     colored(
-                        getSPACER("[ ERR ]
+                        getSPACER("AlienVault - [ ERR ] Connection error for page " + page),
                         "red",
                     )
                 )
@@ -2165,9 +2212,10 @@ def processAlienVaultPage(url):
             except Exception as e:
                 writerr(
                     colored(
-
-
-
+                        "AlienVault -[ ERR ] Error getting response for page "
+                        + page
+                        + " - "
+                        + str(e),
                         "red",
                     )
                 )
@@ -2178,22 +2226,21 @@ def processAlienVaultPage(url):
         if resp is not None:
             # If a status other of 429, then stop processing Alien Vault
             if resp.status_code == 429:
-
-
-
-                "[ 429 ]
-
-
+                if not stopSourceAlienVault:  # Only print message once
+                    writerr(
+                        colored(
+                            "AlienVault - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
+                            "red",
+                        )
                     )
-
-                stopSource = True
+                stopSourceAlienVault = True
                 return
             # If the response from alienvault.com is empty then skip
             if resp.text == "" and totalPages == 0:
                 if verbose():
                     writerr(
                         colored(
-
+                            "AlienVault - [ ERR ] " + url + " gave an empty response.",
                             "red",
                         )
                     )
@@ -2203,9 +2250,10 @@ def processAlienVaultPage(url):
             if verbose():
                 writerr(
                     colored(
-
-
-
+                        "AlienVauilt - [ "
+                        + str(resp.status_code)
+                        + " ] Error for "
+                        + url,
                         "red",
                     )
                 )
@@ -2228,6 +2276,7 @@ def processAlienVaultPage(url):
                 if foundUrl != "":
                     # If filters are not required and subs are wanted then just add the URL to the list
                     if args.filter_responses_only and not args.no_subs:
+                        linkCountAlienVault = linkCountAlienVault + 1
                         linksFoundAdd(foundUrl)
                     else:
                         addLink = True
@@ -2328,7 +2377,7 @@ def processAlienVaultPage(url):
 
                     # Add link if it passed filters
                     if addLink:
-                        linksFoundAdd(foundUrl)
+                        linksFoundAdd(foundUrl, linksFoundAlienVault)
                     else:
                         pass
             except Exception as e:
@@ -2340,12 +2389,12 @@ def getAlienVaultUrls():
     """
     Get URLs from the Alien Vault OTX, otx.alienvault.com
     """
-    global linksFound, waymorePath, subs, path, stopProgram, totalPages,
+    global linksFound, waymorePath, subs, path, stopProgram, totalPages, stopSourceAlienVault, argsInput, checkAlienVault, inputIsSubDomain, argsInputHostname, linkCountAlienVault, linksFoundAlienVault
 
     # Write the file of URL's for the passed domain/URL
     try:
-
-
+        stopSourceAlienVault = False
+        linksFoundAlienVault = set()
 
         # Set the Alien Vault API indicator types of domain or hostname (has subdomain)
         if inputIsSubDomain:
@@ -2362,11 +2411,12 @@ def getAlienVaultUrls():
 
         # Get the number of pages (i.e. separate requests) that are going to be made to alienvault.com
         totalPages = 0
+        resp = None
        try:
             if not args.check_only:
                 write(
                     colored(
-                        "
+                        "AlienVault - [ INFO ] Getting the number of alienvault.com pages to search...",
                         "cyan",
                     )
                 )
@@ -2379,33 +2429,35 @@ def getAlienVaultUrls():
         except Exception as e:
             writerr(
                 colored(
-
+                    "AlienVault - [ ERR ] Unable to get links from alienvault.com: " + str(e),
                     "red",
                 )
             )
-            return
+            # Don't return - continue to show link count at the end
 
         # If the rate limit was reached end now
-        if resp.status_code == 429:
+        if resp is not None and resp.status_code == 429:
             writerr(
                 colored(
-
+                    "AlienVault - [ 429 ] Rate limit reached so unable to get links.",
                     "red",
                 )
             )
-            return
+            # Don't return - continue to show link count at the end
 
-        if verbose():
+        if resp is not None and verbose():
             write(
-
-
-                + colored(url, "white")
-            )
+                colored("AlienVault - [ INFO ] The URL requested to get links: ", "magenta")
+                + colored(url, "white")
                 + "\n"
             )
 
         # Carry on if something was found
-        if
+        if (
+            resp is not None
+            and resp.status_code != 429
+            and resp.text.lower().find('"error": "') < 0
+        ):
 
             try:
                 # Get the JSON response
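Initialising `resp = None` before the request and guarding the later checks with `resp is not None` is what lets the function fall through to the final link count instead of returning early on failure. A condensed sketch of that control flow (URL illustrative):

```python
import requests

links = set()
resp = None
try:
    resp = requests.get("https://example.com/api", timeout=30)
except Exception as e:
    print(f"request failed: {e}")
    # No early return: fall through so the summary below still runs

if resp is not None and resp.status_code == 200:
    links.update(resp.text.split())

# Always reached, even when the request failed or was rate limited
print(f"links found: {len(links)}")
```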
@@ -2416,9 +2468,7 @@ def getAlienVaultUrls():
             except Exception:
                 writerr(
                     colored(
-                        getSPACER(
-                            "[ ERR ] There was an unexpected response from the Alien Vault API"
-                        ),
+                        "AlienVault - [ ERR ] There was an unexpected response from the API",
                         "red",
                     )
                 )
@@ -2440,16 +2490,16 @@ def getAlienVaultUrls():
             else:
                 checkAlienVault = totalPages
                 write(
-                    colored("
+                    colored("AlienVault - [ INFO ] Getting URLs from Alien Vault: ", "cyan")
                     + colored(str(checkAlienVault) + " requests", "white")
                 )
         else:
             # if the page number was found then display it, but otherwise we will just try to increment until we have everything
             write(
                 colored(
-                    "
+                    "AlienVault - [ INFO ] Getting links from "
                     + str(totalPages)
-                    + " alienvault.com API requests (this can take a while for some domains)
+                    + " alienvault.com API requests (this can take a while for some domains)...",
                     "cyan",
                 )
             )
@@ -2469,30 +2519,19 @@ def getAlienVaultUrls():
             if verbose():
                 writerr(
                     colored(
-
-                        + "\n",
+                        "AlienVault - [ ERR ] An error was returned in the response." + "\n",
                         "red",
                     )
                 )
 
         if not args.check_only:
-
-
-
-
-
-
-
-                + "\n"
-            )
-        else:
-            write(
-                getSPACER(
-                    colored("Extra links found on alienvault.com: ", "cyan")
-                    + colored(str(linkCount), "white")
-                )
-                + "\n"
-            )
+            linkCountAlienVault = len(linksFoundAlienVault)
+            write(
+                colored("AlienVault - [ INFO ] Links found on alienvault.com: ", "cyan")
+                + colored(str(linkCountAlienVault), "white")
+            )
+            linksFound.update(linksFoundAlienVault)
+            linksFoundAlienVault.clear()
 
     except Exception as e:
         writerr(colored("ERROR getAlienVaultUrls 1: " + str(e), "red"))
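Each source now ends with the same epilogue: count its private set, merge it into the global collection, then clear it to release memory. Condensed to its essentials (data illustrative):

```python
linksFound = set()
linksFoundAlienVault = {"https://example.com/x", "https://example.com/y"}

# Per-source epilogue: report, merge into the global set, release memory
linkCountAlienVault = len(linksFoundAlienVault)
print(f"AlienVault - [ INFO ] Links found on alienvault.com: {linkCountAlienVault}")
linksFound.update(linksFoundAlienVault)
linksFoundAlienVault.clear()
```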
@@ -2502,7 +2541,7 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
     """
     Process a specific URL from urlscan.io to determine whether to save the link
     """
-    global argsInput, argsInputHostname, urlscanRequestLinks
+    global argsInput, argsInputHostname, urlscanRequestLinks, links_lock, linkCountURLScan, linksFoundURLScan
 
     addLink = True
 
@@ -2591,7 +2630,8 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
         # Add MIME Types if --verbose option was selected
         if verbose():
             if mimeType.strip() != "":
-                linkMimes.add(mimeType)
+                with links_lock:
+                    linkMimes.add(mimeType)
 
         # Add link if it passed filters
         if addLink:
@@ -2611,11 +2651,12 @@ def processURLScanUrl(url, httpCode, mimeType, urlscanID=""):
             )
             if match is not None:
                 if args.mode in ("U", "B"):
-                    linksFoundAdd(url)
+                    linksFoundAdd(url, linksFoundURLScan)
                 # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
                 if urlscanID != "" and args.mode in ("R", "B"):
                     if args.limit == 0 or len(urlscanRequestLinks) < args.limit:
-                        urlscanRequestLinks.add((url, URLSCAN_DOM_URL + urlscanID))
+                        with links_lock:
+                            urlscanRequestLinks.add((url, URLSCAN_DOM_URL + urlscanID))
 
     except Exception as e:
         writerr(colored("ERROR processURLScanUrl 1: " + str(e), "red"))
@@ -2721,9 +2762,10 @@ def getURLScanDOM(originalUrl, domUrl):
                 except Exception as e:
                     writerr(
                         colored(
-
-
-
+                            "URLScan - [ ERR ] Failed to write file "
+                            + filePath
+                            + ": "
+                            + str(e),
                             "red",
                         )
                     )
@@ -2746,12 +2788,10 @@ def getURLScanDOM(originalUrl, domUrl):
                 except Exception as e:
                     writerr(
                         colored(
-
-
-
-
-                            + str(e)
-                        ),
+                            'URLScan - [ ERR ] Failed to write to waymore_index.txt for "'
+                            + domUrl
+                            + '": '
+                            + str(e),
                             "red",
                         )
                     )
@@ -2767,25 +2807,21 @@ def getURLScanDOM(originalUrl, domUrl):
             try:
                 writerr(
                     colored(
-
-
-
-
-
-                        + '"'
-                    ),
+                        "URLScan - [ "
+                        + str(resp.status_code)
+                        + ' ] Failed to get response for "'
+                        + domUrl
+                        + '"',
                         "red",
                     )
                 )
             except Exception:
                 writerr(
                     colored(
-
-
-
-
-                        + str(e)
-                    ),
+                        'URLScan - [ ERR ] Failed to get response for "'
+                        + domUrl
+                        + '": '
+                        + str(e),
                         "red",
                     )
                 )
@@ -2832,7 +2868,9 @@ def getURLScanDOM(originalUrl, domUrl):
 
         except Exception as e:
             if verbose():
-                writerr(
+                writerr(
+                    colored('URLScan - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red")
+                )
 
     except Exception as e:
         writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))
@@ -2857,14 +2895,15 @@ def getURLScanUrls():
     """
     Get URLs from the URLSCan API, urlscan.io
     """
-    global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram,
+    global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
 
     # Write the file of URL's for the passed domain/URL
     try:
         requestsMade = 0
-
-
-
+        stopSourceURLScan = False
+        linksFoundURLScan = set()
+        totalUrls = 0
+        checkResponse = True
 
         # Set the URL to just the hostname
         url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
@@ -2887,21 +2926,23 @@ def getURLScanUrls():
         if args.mode == "R":
             write(
                 colored(
-                    "The URLScan URL requested to get links for responses: ",
+                    "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
                     "magenta",
                 )
                 + colored(url + "\n", "white")
             )
         else:
             write(
-                colored(
+                colored(
+                    "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
+                )
                 + colored(url + "\n", "white")
             )
 
-        if not args.check_only:
+        if args.mode in ("U", "B") and not args.check_only:
             write(
                 colored(
-                    "
+                    "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
                     "cyan",
                 )
             )
@@ -2922,7 +2963,7 @@ def getURLScanUrls():
         except Exception as e:
             write(
                 colored(
-
+                    "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
                     "red",
                 )
             )
@@ -2937,15 +2978,17 @@ def getURLScanUrls():
             if seconds <= args.urlscan_rate_limit_retry * 60:
                 writerr(
                     colored(
-
-
-
-                        + " seconds before continuing..."
-                    ),
+                        "URLScan - [ 429 ] Rate limit reached, so waiting for another "
+                        + str(seconds)
+                        + " seconds before continuing...",
                         "yellow",
                     )
                 )
-
+                # Wait can be interrupted by SIGINT via interrupt_event
+                interrupt_event.clear()
+                if interrupt_event.wait(seconds + 1):
+                    # Interrupted by SIGINT
+                    return
             try:
                 resp = session.get(
                     url,
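Where the old code slept unconditionally with `time.sleep()`, the retry path now waits on `interrupt_event`, so Ctrl-C ends the wait immediately and the caller keeps whatever it has already collected. A trimmed sketch of the loop shape (function names, status handling and retry count are illustrative, not the package's):

```python
import threading

interrupt_event = threading.Event()

def fetch_with_rate_limit_retry(do_request, wait_seconds, max_tries=3):
    """Call do_request(); on a 429, wait between tries but abort on interrupt."""
    for _ in range(max_tries):
        status = do_request()
        if status != 429:
            return status
        interrupt_event.clear()
        # wait() returns True if the event was set (SIGINT) before the timeout
        if interrupt_event.wait(wait_seconds):
            return None  # interrupted - caller saves what it already has
    return 429
```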
@@ -2958,7 +3001,7 @@ def getURLScanUrls():
         except Exception as e:
             write(
                 colored(
-
+                    "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
                     "red",
                 )
             )
@@ -2971,18 +3014,14 @@ def getURLScanUrls():
             if resp.status_code == 429:
                 writerr(
                     colored(
-                        getSPACER(
-                            "[ 429 ] URLScan rate limit reached so trying without API Key..."
-                        ),
+                        "URLScan - [ 429 ] Rate limit reached so trying without API Key...",
                         "red",
                     )
                 )
             else:
                 writerr(
                     colored(
-                        getSPACER(
-                            "The URLScan API Key is invalid so trying without API Key..."
-                        ),
+                        "URLScan - [ INF ] The API Key is invalid so trying without API Key...",
                         "red",
                     )
                 )
@@ -2992,56 +3031,54 @@ def getURLScanUrls():
             except Exception as e:
                 writerr(
                     colored(
-
+                        "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
                         "red",
                     )
                 )
-
+                checkResponse = False
 
             # If the rate limit was reached end now
             if resp.status_code == 429:
                 writerr(
                     colored(
-                        getSPACER(
-                            "[ 429 ] URLScan rate limit reached without API Key so unable to get links."
-                        ),
+                        "URLScan - [ 429 ] Rate limit reached without API Key so unable to get links.",
                         "red",
                     )
                 )
-
+                checkResponse = False
             else:
                 writerr(
                     colored(
-
+                        "URLScan - [ 429 ] Rate limit reached so unable to get links.",
                         "red",
                     )
                 )
-
+                checkResponse = False
         elif resp.status_code != 200:
             writerr(
                 colored(
-
-
-
+                    "URLScan - [ "
+                    + str(resp.status_code)
+                    + " ] Unable to get links from urlscan.io",
                     "red",
                 )
             )
-
+            checkResponse = False
 
         try:
-
-
+            if checkResponse:
+                # Get the JSON response
+                jsonResp = json.loads(resp.text.strip())
 
-
-
+                # Get the number of results
+                totalUrls = int(jsonResp["total"])
         except Exception:
             writerr(
                 colored(
-
+                    "URLScan - [ ERR ] There was an unexpected response from the API",
                     "red",
                 )
             )
-            totalUrls = 0
 
         # Carry on if something was found
         if args.check_only and args.mode != "R":
@@ -3049,12 +3086,13 @@ def getURLScanUrls():
                 hasMore = jsonResp["has_more"]
                 if hasMore:
                     write(
-                        colored("Get URLs from URLScan: ", "cyan")
+                        colored("URLScan - [ INFO ] Get URLs from URLScan: ", "cyan")
                         + colored("UNKNOWN requests", "white")
                     )
                 else:
                     write(
-                        colored("Get URLs from URLScan: ", "cyan")
+                        colored("URLScan - [ INFO ] Get URLs from URLScan: ", "cyan")
+                        + colored("1 request", "white")
                     )
         except Exception:
             pass
@@ -3064,7 +3102,7 @@ def getURLScanUrls():
         # Carry on if something was found
         if int(totalUrls) > 0:
 
-            while not
+            while not stopSourceURLScan:
 
                 searchAfter = ""
 
@@ -3139,7 +3177,7 @@ def getURLScanUrls():
                 if searchAfter != "":
 
                     keepTrying = True
-                    while not
+                    while not stopSourceURLScan and keepTrying:
                         keepTrying = False
                         # Get the next page from urlscan.io
                         try:
@@ -3159,9 +3197,8 @@ def getURLScanUrls():
                         except Exception as e:
                             writerr(
                                 colored(
-
-
-                                ),
+                                    "URLScan - [ ERR ] Unable to get links from urlscan.io: "
+                                    + str(e),
                                     "red",
                                 )
                             )
@@ -3180,56 +3217,53 @@ def getURLScanUrls():
                             if seconds <= args.urlscan_rate_limit_retry * 60:
                                 writerr(
                                     colored(
-
-
-
-                                        + " seconds before continuing..."
-                                    ),
+                                        "URLScan - [ 429 ] Rate limit reached, so waiting for another "
+                                        + str(seconds)
+                                        + " seconds before continuing...",
                                         "yellow",
                                     )
                                 )
-
+                                # Wait can be interrupted by SIGINT via interrupt_event
+                                interrupt_event.clear()
+                                if interrupt_event.wait(seconds + 1):
+                                    # Interrupted by SIGINT
+                                    keepTrying = False
+                                    break
                                 keepTrying = True
                                 continue
                             else:
                                 writerr(
                                     colored(
-
-
-
-                                        + "), so stopping. Links that have already been retrieved will be saved."
-                                    ),
+                                        "URLScan - [ 429 ] Rate limit reached (waiting time of "
+                                        + str(seconds)
+                                        + "), so stopping. Links that have already been retrieved will be saved.",
                                         "red",
                                     )
                                 )
-
+                                stopSourceURLScan = True
                                 pass
                         else:
                             writerr(
                                 colored(
-                                    getSPACER(
-                                        "[ 429 ] URLScan rate limit reached, so stopping. Links that have already been retrieved will be saved."
-                                    ),
+                                    "URLScan - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
                                     "red",
                                 )
                             )
-
+                            stopSourceURLScan = True
                             pass
                     elif resp.status_code != 200:
                         writerr(
                             colored(
-
-
-
-                                + " ] Unable to get links from urlscan.io"
-                            ),
+                                "URLScan - [ "
+                                + str(resp.status_code)
+                                + " ] Unable to get links from urlscan.io",
                                 "red",
                             )
                         )
-
+                        stopSourceURLScan = True
                         pass
 
-                    if not
+                    if not stopSourceURLScan:
                         # Get the JSON response
                         jsonResp = json.loads(resp.text.strip())
 
@@ -3244,36 +3278,25 @@ def getURLScanUrls():
                             and requestsMade > args.limit
                         )
                     ):
-
+                        stopSourceURLScan = True
 
         # Show the MIME types found (in case user wants to exclude more)
         if verbose() and len(linkMimes) > 0 and args.mode != "R":
             linkMimes.discard("warc/revisit")
             write(
-
-
-            )
+                colored("URLScan - [ INFO ] MIME types found: ", "magenta")
+                + colored(str(linkMimes), "white")
                 + "\n"
             )
 
-        linkCount = len(linksFound) - originalLinkCount
         if args.mode != "R":
-
-
-
-
-
-
-
-            )
-        else:
-            write(
-                getSPACER(
-                    colored("Extra links found on urlscan.io: ", "cyan")
-                    + colored(str(linkCount), "white")
-                )
-                + "\n"
-            )
+            linkCountURLScan = len(linksFoundURLScan)
+            write(
+                colored("URLScan - [ INFO ] Links found on urlscan.io: ", "cyan")
+                + colored(str(linkCountURLScan), "white")
+            )
+            linksFound.update(linksFoundURLScan)
+            linksFoundURLScan.clear()
 
     except Exception as e:
         writerr(colored("ERROR getURLScanUrls 1: " + str(e), "red"))
@@ -3283,12 +3306,11 @@ def processWayBackPage(url):
     """
     Get URLs from a specific page of archive.org CDX API for the input domain
     """
-    global totalPages, linkMimes, linksFound,
+    global totalPages, linkMimes, linksFound, stopSourceWayback, linkCountWayback, linksFoundWayback, current_response, current_session
    try:
         # Get memory in case it exceeds threshold
         getMemory()
-
-        if not stopSource:
+        if not stopSourceWayback:
             try:
                 # Choose a random user agent string to use for any requests
                 resp = None
@@ -3297,210 +3319,231 @@ def processWayBackPage(url):
                 session = requests.Session()
                 session.mount("https://", HTTP_ADAPTER)
                 session.mount("http://", HTTP_ADAPTER)
-
-
-
-
-
-
-
-
-                            "red",
-                        )
-                    )
-                    resp = None
-                    return
-                except Exception as e:
-                    writerr(
-                        colored(
-                            getSPACER(
-                                "[ ERR ] Error getting response for page " + page + " - " + str(e)
-                            ),
-                            "red",
-                        )
+                # expose session so SIGINT handler can close it to interrupt blocking network I/O
+                try:
+                    current_session = session
+                except Exception:
+                    pass
+
+                resp = session.get(
+                    url, headers={"User-Agent": userAgent}, stream=True, timeout=args.timeout
                 )
-
-                return
-            finally:
+                # expose live response so SIGINT handler can close it to interrupt blocking I/O
                 try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        colored(
-                            "\r[ 429 ] Wayback Machine (archive.org) rate limit reached, so waiting for "
-                            + str(seconds)
-                            + " seconds before continuing...\r",
-                            "yellow",
-                        )
-                    )
-                    time.sleep(seconds)
-                    try:
-                        resp = session.get(url, headers={"User-Agent": userAgent})
-                    except ConnectionError:
-                        writerr(
-                            colored(
-                                getSPACER(
-                                    "[ ERR ] Wayback Machine (archive.org) connection error for page "
-                                    + page
-                                ),
-                                "red",
-                            )
+                    current_response = resp
+                except Exception:
+                    pass
+                # Check response status in the finally block
+                if resp is not None:
+                    # If a status other of 429, then stop processing Wayback Machine
+                    if resp.status_code == 429:
+                        if args.wayback_rate_limit_retry > 0:
+                            seconds = args.wayback_rate_limit_retry * 60
+                            if args.processes == 1:
+                                writerr(
+                                    colored(
+                                        "Wayback - [ 429 ] Rate limit reached on page "
+                                        + str(page)
+                                        + " of "
+                                        + str(totalPages)
+                                        + ", so waiting for "
+                                        + str(seconds)
+                                        + " seconds before continuing...",
+                                        "yellow",
                                     )
-
-
-
-
-
-
-
-
-                        + " - "
-                        + str(e)
-                    ),
-                    "red",
-                )
+                                )
+                            else:
+                                writerr(
+                                    colored(
+                                        "Wayback - [ 429 ] Rate limit reached, so waiting for "
+                                        + str(seconds)
+                                        + " seconds before continuing...",
+                                        "yellow",
                                     )
-
-            resp = None
-            return
-
-            if resp.status_code == 429:
-                writerr(
-                    colored(
-                        getSPACER(
-                            "[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
-                        ),
-                        "red",
                                 )
-
-
-
-
-
-
-
-
-
-
-            "red",
+                            # Wait can be interrupted by SIGINT via interrupt_event
+                            interrupt_event.clear()
+                            if interrupt_event.wait(seconds):
+                                return
+                            try:
+                                resp = session.get(
+                                    url,
+                                    headers={"User-Agent": userAgent},
+                                    stream=True,
+                                    timeout=args.timeout,
                                 )
-
-
-
-
-
-            if verbose():
+                                try:
+                                    current_response = resp
+                                except Exception:
+                                    pass
+                            except ConnectionError:
                                 writerr(
                                     colored(
-
+                                        "Wayback - [ ERR ] Connection error for page " + page,
                                         "red",
                                     )
                                 )
-
-
-
-            if verbose():
+                                resp = None
+                                return
+                            except Exception as e:
                                 writerr(
                                     colored(
-
-
-
+                                        "Wayback - [ ERR ] Error getting response for page "
+                                        + page
+                                        + " - "
+                                        + str(e),
                                         "red",
                                     )
                                 )
-
-
-
-
-
-
-
-
-
+                                resp = None
+                                return
+
+                    if resp.status_code == 429:
+                        writerr(
+                            colored(
+                                "Wayback - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
+                                "red",
+                            )
                         )
-
-
-
-
-
-
-
-                    "
-                )
-                "red",
+                        stopSourceWayback = True
+                        return
+                    # If a status other of 503, then the site is unavailable
+                    if resp.status_code == 503:
+                        writerr(
+                            colored(
+                                "Wayback - [ 503 ] The Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.",
+                                "red",
+                            )
                         )
-
-
-
+                        stopSourceWayback = True
+                        return
+                    # If a status other than 200, then stop
+                    if resp.status_code != 200:
+                        if verbose():
+                            writerr(
+                                colored(
+                                    "Wayback - [ " + str(resp.status_code) + " ] Error for " + url,
+                                    "red",
+                                )
+                            )
+                        try:
+                            current_response = None
+                        except Exception:
+                            pass
+                        try:
+                            current_session = None
+                        except Exception:
+                            pass
+                        return
 
-
-
+                    # Get the URLs and MIME types. Each line is a separate JSON string
+                    # Process lines as they arrive - if connection drops, we keep what we've already processed
                     for line in resp.iter_lines():
-
-
+                        try:
+                            results = line.decode("utf-8")
+                            foundUrl = fixArchiveOrgUrl(str(results).split(" ")[1])
 
-
-
-
-
-
-
-
-
-
-
-
-
+                            # If --filter-responses-only wasn't used, then check the URL exclusions
+                            if args.filter_responses_only:
+                                match = None
+                            else:
+                                match = re.search(
+                                    r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
+                                    foundUrl,
+                                    flags=re.IGNORECASE,
+                                )
+                            if match is None:
+                                # Only get MIME Types if --verbose option was selected
+                                if verbose():
+                                    try:
+                                        mimeType = str(results).split(" ")[2]
+                                        if mimeType != "":
+                                            linkMimes.add(mimeType)
+                                    except Exception:
+                                        if verbose():
+                                            writerr(
+                                                colored(
+                                                    getSPACER(
+                                                        "ERROR processWayBackPage 2: Cannot get MIME type from line: "
+                                                        + str(line)
+                                                    ),
+                                                    "red",
+                                                )
+                                            )
                                 try:
-
-
-                                    linkMimes.add(mimeType)
+                                    linksFoundAdd(foundUrl, linksFoundWayback)
+
                                 except Exception:
                                     if verbose():
                                         writerr(
                                             colored(
                                                 getSPACER(
-                                                    "ERROR processWayBackPage
+                                                    "ERROR processWayBackPage 3: Cannot get link from line: "
                                                     + str(line)
                                                 ),
                                                 "red",
                                             )
                                         )
-
-
-
-
-
-                                writerr(
-                                    colored(
-                                        getSPACER(
-                                            "ERROR processWayBackPage 3: Cannot get link from line: "
-                                            + str(line)
-                                        ),
-                                        "red",
-                                    )
+                        except Exception:
+                            if verbose():
+                                writerr(
+                                    colored(
+                                        getSPACER("ERROR processWayBackPage 4: " + str(line)), "red"
                                     )
-
-
-
-
+                                )
+
+                except ConnectionError:
+                    writerr(
+                        colored(
+                            "Wayback - [ ERR ] Connection error for page "
+                            + page
+                            + (
+                                f" (saved {len(linksFoundWayback)} URLs before error)"
+                                if len(linksFoundWayback) > 0
+                                else ""
+                            ),
+                            "red",
+                        )
+                    )
+                    try:
+                        current_response = None
+                    except Exception:
+                        pass
+                    try:
+                        current_session = None
+                    except Exception:
+                        pass
+                    return
+                except Exception as e:
+                    # Even if connection drops, we've already saved the URLs processed so far
+                    if len(linksFoundWayback) > 0:
+                        writerr(
+                            colored(
+                                f"Wayback - [ WARN ] Error getting response for page {page} - {str(e)} (saved {len(linksFoundWayback)} URLs before error)",
+                                "yellow",
+                            )
+                        )
+                    else:
+                        writerr(
+                            colored(
+                                "Wayback - [ ERR ] Error getting response for page "
+                                + page
+                                + " - "
+                                + str(e),
+                                "red",
+                            )
+                        )
+                    try:
+                        current_response = None
+                    except Exception:
+                        pass
+                    try:
+                        current_session = None
+                    except Exception:
+                        pass
+                    return
             else:
+                print("DEBUG: HERE END!")  # DEBUG
                 pass
     except Exception as e:
         if verbose():
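The rewritten page fetch streams the CDX response (`stream=True`) and parses it line by line inside the `try`, so a dropped connection keeps every URL already collected rather than discarding the whole page. A reduced sketch of that streaming loop (helper names illustrative; the real code also applies URL filters and MIME handling):

```python
import requests

def fetch_cdx_lines(url, found):
    """Stream a CDX response; on error, `found` keeps the lines parsed so far."""
    try:
        resp = requests.get(url, stream=True, timeout=30)
        for line in resp.iter_lines():
            try:
                fields = line.decode("utf-8").split(" ")
                found.add(fields[1])  # the 'original' URL field
            except Exception:
                continue  # skip malformed lines, keep streaming
    except Exception as e:
        print(f"connection dropped after {len(found)} URLs: {e}")

found = set()
fetch_cdx_lines("https://web.archive.org/cdx/search/cdx?url=example.com", found)
print(len(found))
```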
@@ -3511,11 +3554,12 @@ def getWaybackUrls():
|
|
|
3511
3554
|
"""
|
|
3512
3555
|
Get URLs from the Wayback Machine, archive.org
|
|
3513
3556
|
"""
|
|
3514
|
-
global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages,
|
|
3557
|
+
global linksFound, linkMimes, waymorePath, subs, path, stopProgram, totalPages, stopSourceWayback, argsInput, checkWayback, linkCountWayback, linksFoundWayback
|
|
3515
3558
|
|
|
3516
3559
|
# Write the file of URL's for the passed domain/URL
|
|
3517
3560
|
try:
|
|
3518
|
-
|
|
3561
|
+
stopSourceWayback = False
|
|
3562
|
+
linksFoundWayback = set()
|
|
3519
3563
|
|
|
3520
3564
|
if MATCH_MIME != "":
|
|
3521
3565
|
filterMIME = "&filter=mimetype:" + re.escape(MATCH_MIME).replace(",", "|")
|
|
@@ -3577,7 +3621,7 @@ def getWaybackUrls():
|
|
|
3577
3621
|
if not args.check_only:
|
|
3578
3622
|
write(
|
|
3579
3623
|
colored(
|
|
3580
|
-
"
|
|
3624
|
+
"Wayback - [ INFO ] Getting the number of pages to search...",
|
|
3581
3625
|
"cyan",
|
|
3582
3626
|
)
|
|
3583
3627
|
)
|
|
@@ -3602,9 +3646,7 @@ def getWaybackUrls():
|
|
|
3602
3646
|
if resp.status_code == 429:
|
|
3603
3647
|
writerr(
|
|
3604
3648
|
colored(
|
|
3605
|
-
|
|
3606
|
-
"[ 429 ] Wayback Machine (Archive.org) rate limit reached so unable to get links."
|
|
3607
|
-
),
|
|
3649
|
+
"Wayback - [ 429 ] Rate limit reached so unable to get links.",
|
|
3608
3650
|
"red",
|
|
3609
3651
|
)
|
|
3610
3652
|
)
|
|
@@ -3614,9 +3656,7 @@ def getWaybackUrls():
|
|
|
3614
3656
|
if resp.status_code == 503:
|
|
3615
3657
|
writerr(
|
|
3616
3658
|
colored(
|
|
3617
|
-
|
|
3618
|
-
"[ 503 ] Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
|
|
3619
|
-
),
|
|
3659
|
+
"Wayback - [ 503 ] The Wayback Machine (Archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify.",
|
|
3620
3660
|
"red",
|
|
3621
3661
|
)
|
|
3622
3662
|
)
|
|
@@ -3625,19 +3665,15 @@ def getWaybackUrls():
|
|
|
3625
3665
|
if resp.text.lower().find("blocked site error") > 0:
|
|
3626
3666
|
writerr(
|
|
3627
3667
|
colored(
|
|
3628
|
-
|
|
3629
|
-
"[ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)"
|
|
3630
|
-
),
|
|
3668
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Blocked Site Error (they block the target site)",
|
|
3631
3669
|
"red",
|
|
3632
3670
|
)
|
|
3633
3671
|
)
|
|
3634
3672
|
else:
|
|
3635
3673
|
writerr(
|
|
3636
3674
|
colored(
|
|
3637
|
-
|
|
3638
|
-
|
|
3639
|
-
+ str(resp.text.strip())
|
|
3640
|
-
),
|
|
3675
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): "
|
|
3676
|
+
+ str(resp.text.strip()),
|
|
3641
3677
|
"red",
|
|
3642
3678
|
)
|
|
3643
3679
|
)
|
|
@@ -3645,28 +3681,22 @@ def getWaybackUrls():
|
|
|
3645
3681
|
if str(e).lower().find("alert access denied"):
|
|
3646
3682
|
writerr(
|
|
3647
3683
|
colored(
|
|
3648
|
-
|
|
3649
|
-
"[ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don't know, but it has happened!)"
|
|
3650
|
-
),
|
|
3684
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Access Denied. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking you, e.g. your adult content filter is on (why it triggers that filter I don't know, but it has happened!)",
|
|
3651
3685
|
"red",
|
|
3652
3686
|
)
|
|
3653
3687
|
)
|
|
3654
3688
|
elif str(e).lower().find("connection refused"):
|
|
3655
3689
|
writerr(
|
|
3656
3690
|
colored(
|
|
3657
|
-
|
|
3658
|
-
"[ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)"
|
|
3659
|
-
),
|
|
3691
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): Connection Refused. Are you able to manually visit https://web.archive.org/? Your ISP may be blocking your IP)",
|
|
3660
3692
|
"red",
|
|
3661
3693
|
)
|
|
3662
3694
|
)
|
|
3663
3695
|
else:
|
|
3664
3696
|
writerr(
|
|
3665
3697
|
colored(
|
|
3666
|
-
|
|
3667
|
-
|
|
3668
|
-
+ str(e)
|
|
3669
|
-
),
|
|
3698
|
+
"Wayback - [ ERR ] Unable to get links from Wayback Machine (archive.org): "
|
|
3699
|
+
+ str(e),
|
|
3670
3700
|
"red",
|
|
3671
3701
|
)
|
|
3672
3702
|
)
|
|
@@ -3676,27 +3706,29 @@ def getWaybackUrls():
|
|
|
3676
3706
|
if totalPages < 0:
|
|
3677
3707
|
write(
|
|
3678
3708
|
colored(
|
|
3679
|
-
"Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.",
|
|
3709
|
+
"Wayback - [ INFO ] Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.",
|
|
3680
3710
|
"cyan",
|
|
3681
3711
|
)
|
|
3682
3712
|
)
|
|
3683
3713
|
else:
|
|
3684
3714
|
checkWayback = totalPages
|
|
3685
3715
|
write(
|
|
3686
|
-
colored("Get URLs from Wayback Machine: ", "cyan")
|
|
3716
|
+
colored("Wayback - [ INFO ] Get URLs from Wayback Machine: ", "cyan")
|
|
3687
3717
|
+ colored(str(checkWayback) + " requests", "white")
|
|
3688
3718
|
)
|
|
3689
3719
|
else:
|
|
3690
3720
|
if verbose():
|
|
3691
3721
|
write(
|
|
3692
|
-
colored(
|
|
3722
|
+
colored(
|
|
3723
|
+
"Wayback - [ INFO ] The archive URL requested to get links: ", "magenta"
|
|
3724
|
+
)
|
|
3693
3725
|
+ colored(url + "\n", "white")
|
|
3694
3726
|
)
|
|
3695
3727
|
|
|
3696
3728
|
if totalPages < 0:
|
|
3697
3729
|
write(
|
|
3698
3730
|
colored(
|
|
3699
|
-
"
|
|
3731
|
+
"Wayback - [ INFO ] Getting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...",
|
|
3700
3732
|
"cyan",
|
|
3701
3733
|
)
|
|
3702
3734
|
)
|
|
@@ -3706,9 +3738,9 @@ def getWaybackUrls():
|
|
|
3706
3738
|
# if the page number was found then display it, but otherwise we will just try to increment until we have everything
|
|
3707
3739
|
write(
|
|
3708
3740
|
colored(
|
|
3709
|
-
"
|
|
3741
|
+
"Wayback - [ INFO ] Getting links from "
|
|
3710
3742
|
+ str(totalPages)
|
|
3711
|
-
+ " Wayback Machine (archive.org) API requests (this can take a while for some domains)
|
|
3743
|
+
+ " Wayback Machine (archive.org) API requests (this can take a while for some domains)...",
|
|
3712
3744
|
"cyan",
|
|
3713
3745
|
)
|
|
3714
3746
|
)
|
|
@@ -3732,22 +3764,22 @@ def getWaybackUrls():
             if verbose() and len(linkMimes) > 0:
                 linkMimes.discard("warc/revisit")
                 write(
-                    getSPACER(
-                        colored("MIME types found: ", "magenta") + colored(str(linkMimes), "white")
-                    )
+                    colored("Wayback - [ INFO ] MIME types found: ", "magenta")
+                    + colored(str(linkMimes), "white")
                     + "\n"
                 )
             linkMimes = None

             if not args.xwm:
-                linkCount = len(linksFound) - originalLinkCount
+                linkCountWayback = len(linksFoundWayback)
                 write(
-                    getSPACER(
-                        colored("Links found on Wayback Machine (archive.org): ", "cyan")
-                        + colored(str(linkCount), "white")
-                    )
-                    + "\n"
+                    colored(
+                        "Wayback - [ INFO ] Links found on Wayback Machine (archive.org): ", "cyan"
+                    )
+                    + colored(str(linkCountWayback), "white")
                 )
+                linksFound.update(linksFoundWayback)
+                linksFoundWayback.clear()

     except Exception as e:
         writerr(colored("ERROR getWaybackUrls 1: " + str(e), "red"))
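The per-source sets introduced here (`linksFoundWayback`, and the CommonCrawl/VirusTotal/IntelX equivalents later in this diff) let each provider run on its own thread and be merged into the shared `linksFound` set when it finishes. A minimal sketch of the intended pattern, assuming the `links_lock` declared at the top of the module guards the merge (the diff itself merges after the worker completes; the lock is shown for the fully concurrent case):

    import threading

    links_lock = threading.Lock()
    linksFound = set()

    def merge_source_links(source_set):
        # Fold one source's results into the shared set under the lock,
        # then clear the per-source set to release memory promptly.
        with links_lock:
            linksFound.update(source_set)
        source_set.clear()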
@@ -3757,13 +3789,13 @@ def processCommonCrawlCollection(cdxApiUrl):
     """
     Get URLs from a given Common Crawl index collection
     """
-    global subs, path, linksFound, linkMimes, stopSource, argsInput
+    global subs, path, linksFound, linkMimes, stopSourceCommonCrawl, argsInput, linkCountCommonCrawl, linksFoundCommonCrawl, current_response, current_session

     try:
         # Get memory in case it exceeds threshold
         getMemory()

-        if not stopSource:
+        if not stopSourceCommonCrawl:
             # Set mime content type filter
             if MATCH_MIME.strip() != "":
                 filterMIME = "&filter=~mime:("
@@ -3812,18 +3844,26 @@ def processCommonCrawlCollection(cdxApiUrl):
             session = requests.Session()
             session.mount("https://", HTTP_ADAPTER_CC)
             session.mount("http://", HTTP_ADAPTER_CC)
+            try:
+                current_session = session
+            except Exception:
+                pass
             resp = session.get(url, stream=True, headers={"User-Agent": userAgent})
+            try:
+                current_response = resp
+            except Exception:
+                pass
         except ConnectionError:
             writerr(
                 colored(
-                    "[ ERR ] Common Crawl connection error for index " + cdxApiUrl,
+                    "CommonCrawl - [ ERR ] Connection error for index " + cdxApiUrl,
                     "red",
                 )
             )
             resp = None
             return
         except Exception as e:
-            writerr(colored("[ ERR ] Error getting response - " + str(e), "red"))
+            writerr(colored("CommonCrawl - [ ERR ] Error getting response - " + str(e), "red"))
             resp = None
             return
         finally:
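The `current_session` / `current_response` globals published around the streamed request give the SIGINT handler something it can close from the main thread, so a Ctrl-C doesn't sit behind a blocking `iter_lines()` read. A sketch of that interaction, assuming a handler wired up like waymore's `signal(SIGINT, handler)` (the function below is illustrative, not the package's actual handler):

    import requests

    current_response = None  # set by the worker thread, read by the signal handler

    def abort_current_download():
        # Closing the response from the signal handler makes the worker's
        # resp.iter_lines() loop terminate instead of blocking on the socket.
        global current_response
        if current_response is not None:
            try:
                current_response.close()
            except Exception:
                pass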
@@ -3833,13 +3873,11 @@ def processCommonCrawlCollection(cdxApiUrl):
             if resp.status_code == 429:
                 writerr(
                     colored(
-                        getSPACER(
-                            "[ 429 ] Common Crawl rate limit reached, so stopping. Links that have already been retrieved will be saved."
-                        ),
+                        "CommonCrawl - [ 429 ] Rate limit reached, so stopping. Links that have already been retrieved will be saved.",
                         "red",
                     )
                 )
-                stopSource = True
+                stopSourceCommonCrawl = True
                 return
             # If the response from commoncrawl.org says nothing was found...
             if resp.text.lower().find("no captures found") > 0:
@@ -3850,7 +3888,7 @@ def processCommonCrawlCollection(cdxApiUrl):
                 if verbose():
                     writerr(
                         colored(
-                            getSPACER("[ ERR ] " + url + " gave an empty response."),
+                            "CommonCrawl - [ ERR ] " + url + " gave an empty response.",
                             "red",
                         )
                     )
@@ -3860,12 +3898,10 @@ def processCommonCrawlCollection(cdxApiUrl):
                 if verbose():
                     writerr(
                         colored(
-                            getSPACER(
-                                "[ "
-                                + str(resp.status_code)
-                                + " ] Error for "
-                                + cdxApiUrl
-                            ),
+                            "CommonCrawl - [ "
+                            + str(resp.status_code)
+                            + " ] Error for "
+                            + cdxApiUrl,
                             "red",
                         )
                     )
@@ -3874,63 +3910,71 @@ def processCommonCrawlCollection(cdxApiUrl):
                 pass

             # Get the URLs and MIME types
-            […previous single-pass parsing loop; its body is not shown in the source render…]
+            try:
+                for line in resp.iter_lines():
+                    results = line.decode("utf-8")
+                    try:
+                        data = json.loads(results)
+                        # Get MIME Types if --verbose option was selected
+                        if verbose():
+                            try:
+                                if data["mime"] != "":
+                                    linkMimes.add(data["mime"])
+                            except Exception:
+                                pass
+                        # If -from or -to were passed, check the timestamp of the URL.
+                        # Only continue if the URL falls within the date range specified
+                        if args.from_date is not None or args.to_date is not None:
+                            try:
+                                ts = data["timestamp"]
+
+                                # Normalize helper: pad/truncate date string to 14 digits (YYYYMMDDhhmmss)
+                                def normalize_date(d, is_from):
+                                    if d is None:
+                                        return None
+                                    d = d.strip()
+                                    # Pad to 14 digits: from_date pads with 0s, to_date with 9s
+                                    if is_from:
+                                        return (d + "0" * (14 - len(d)))[:14]
+                                    else:
+                                        return (d + "9" * (14 - len(d)))[:14]
+
+                                from_ts = normalize_date(args.from_date, True)
+                                to_ts = normalize_date(args.to_date, False)
+
+                                # Compare the fixed-width digit strings
+                                if from_ts and ts < from_ts:
+                                    continue
+                                if to_ts and ts > to_ts:
+                                    continue
+
+                            except Exception as e:
+                                writerr(
+                                    colored(
+                                        f"ERROR processCommonCrawlCollection 3: Cannot get timestamp from line {line}: {str(e)}",
+                                        "red",
+                                    )
+                                )
+
+                        linksFoundAdd(data["url"], linksFoundCommonCrawl)
+                    except Exception:
+                        if verbose():
+                            writerr(
+                                colored(
+                                    "ERROR processCommonCrawlCollection 2: Cannot get URL and MIME type from line: "
+                                    + str(line),
+                                    "red",
+                                )
+                            )
+            finally:
+                try:
+                    current_response = None
+                except Exception:
+                    pass
+                try:
+                    current_session = None
+                except Exception:
+                    pass
         else:
             pass
     except Exception as e:
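`normalize_date` makes -from/-to values comparable against CDX timestamps by padding both to the full 14-digit `YYYYMMDDhhmmss` form: the lower bound pads with zeros and the upper bound with nines, so comparing equal-length digit strings is equivalent to comparing datetimes. For example:

    # "2023" as a -from value becomes the earliest instant of 2023,
    # and as a -to value the latest representable instant:
    assert ("2023" + "0" * 10)[:14] == "20230000000000"
    assert ("2023" + "9" * 10)[:14] == "20239999999999"

    ts = "20230615120000"  # a CDX capture timestamp
    assert "20230000000000" <= ts <= "20239999999999"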
@@ -3957,10 +4001,8 @@ def getCommonCrawlIndexes():
         except Exception as e:
             writerr(
                 colored(
-                    getSPACER(
-                        "[ ERR ] Couldn't delete local version of Common Crawl index file: "
-                        + str(e)
-                    ),
+                    "CommonCrawl - [ ERR ] Couldn't delete local version of Common Crawl index file: "
+                    + str(e),
                     "red",
                 )
             )
@@ -3978,10 +4020,8 @@ def getCommonCrawlIndexes():
             createFile = True
             writerr(
                 colored(
-                    getSPACER(
-                        "[ ERR ] Couldn't read local version of Common Crawl index file: "
-                        + str(e)
-                    ),
+                    "CommonCrawl - [ ERR ] Couldn't read local version of Common Crawl index file: "
+                    + str(e),
                     "red",
                 )
             )
@@ -3998,7 +4038,7 @@ def getCommonCrawlIndexes():
         except ConnectionError:
             writerr(
                 colored(
-                    "[ ERR ] Common Crawl connection error getting Index file",
+                    "CommonCrawl - [ ERR ] Connection error getting Index file",
                     "red",
                 )
             )
@@ -4006,9 +4046,8 @@ def getCommonCrawlIndexes():
         except Exception as e:
             writerr(
                 colored(
-                    getSPACER(
-                        "[ ERR ] Error getting Common Crawl index collection - " + str(e)
-                    ),
+                    "CommonCrawl - [ ERR ] Error getting Common Crawl index collection - "
+                    + str(e),
                     "red",
                 )
             )
@@ -4018,9 +4057,7 @@ def getCommonCrawlIndexes():
         if indexes.status_code == 429:
             writerr(
                 colored(
-                    getSPACER(
-                        "[ 429 ] Common Crawl rate limit reached so unable to get links."
-                    ),
+                    "CommonCrawl - [ 429 ] Rate limit reached so unable to get links.",
                     "red",
                 )
             )
@@ -4029,7 +4066,7 @@ def getCommonCrawlIndexes():
         elif indexes.status_code == 503:
             writerr(
                 colored(
-                    getSPACER("[ 503 ] Common Crawl seems to be unavailable."),
+                    "CommonCrawl - [ 503 ] Common Crawl seems to be unavailable.",
                     "red",
                 )
             )
@@ -4037,11 +4074,9 @@ def getCommonCrawlIndexes():
         elif indexes.status_code != 200:
             writerr(
                 colored(
-                    getSPACER(
-                        "[ "
-                        + str(indexes.status_code)
-                        + " ] Common Crawl did not return the indexes file."
-                    ),
+                    "CommonCrawl - [ "
+                    + str(indexes.status_code)
+                    + " ] Common Crawl did not return the indexes file.",
                     "red",
                 )
             )
@@ -4058,10 +4093,8 @@ def getCommonCrawlIndexes():
         except Exception as e:
             writerr(
                 colored(
-                    getSPACER(
-                        "[ ERR ] Couldn't create local version of Common Crawl index file: "
-                        + str(e)
-                    ),
+                    "CommonCrawl - [ ERR ] Couldn't create local version of Common Crawl index file: "
+                    + str(e),
                     "red",
                 )
             )
@@ -4094,12 +4127,10 @@ def getCommonCrawlIndexes():
         except Exception as e:
             writerr(
                 colored(
-                    getSPACER(
-                        "[ ERR ] Failed to get the year from index name "
-                        + values[key]
-                        + " - "
-                        + str(e)
-                    ),
+                    "CommonCrawl - [ ERR ] Failed to get the year from index name "
+                    + values[key]
+                    + " - "
+                    + str(e),
                     "red",
                 )
             )
@@ -4121,12 +4152,11 @@ def getCommonCrawlUrls():
     """
     Get all Common Crawl index collections to get all URLs from each one
     """
-    global linksFound, linkMimes, waymorePath, subs, path, stopSource, argsInput, checkCommonCrawl
+    global linksFound, linkMimes, waymorePath, subs, path, stopSourceCommonCrawl, argsInput, checkCommonCrawl, linkCountCommonCrawl, linksFoundCommonCrawl

     try:
-        stopSource = False
-        linkMimes = set()
-        originalLinkCount = len(linksFound)
+        stopSourceCommonCrawl = False
+        linksFoundCommonCrawl = set()

         # Set mime content type filter
         if MATCH_MIME.strip() != "":
@@ -4164,7 +4194,7 @@ def getCommonCrawlUrls():
             )
         write(
             colored(
-                "The index URL requested to get links (where {CDX-API-URL} is from "
+                "CommonCrawl - [ INFO ] The index URL requested to get links (where {CDX-API-URL} is from "
                 + CCRAWL_INDEX_URL
                 + "): ",
                 "magenta",
@@ -4173,7 +4203,7 @@ def getCommonCrawlUrls():
         )

     if not args.check_only:
-        write(colored("Getting index collections list...", "cyan"))
+        write(colored("CommonCrawl - [ INFO ] Getting index collections list...", "cyan"))

         # Get the Common Crawl index collections
         cdxApiUrls = getCommonCrawlIndexes()
@@ -4186,15 +4216,15 @@ def getCommonCrawlUrls():
         else:
             checkCommonCrawl = len(cdxApiUrls) + 1
             write(
-                colored("Get URLs from Common Crawl: ", "cyan")
+                colored("CommonCrawl - [ INFO ] Get URLs from Common Crawl: ", "cyan")
                 + colored(str(checkCommonCrawl) + " requests", "white")
             )
     else:
         write(
             colored(
-                "Getting links from the latest "
+                "CommonCrawl - [ INFO ] Getting links from the latest "
                 + str(len(cdxApiUrls))
-                + " commoncrawl.org index collections (this can take a while for some domains)",
+                + " commoncrawl.org index collections (this can take a while for some domains)...",
                 "cyan",
             )
         )
@@ -4210,30 +4240,18 @@ def getCommonCrawlUrls():
         if verbose() and len(linkMimes) > 0:
             linkMimes.discard("warc/revisit")
             write(
-                getSPACER(
-                    colored("MIME types found: ", "magenta")
-                    + colored(str(linkMimes), "white")
-                )
+                colored("CommonCrawl - [ INFO ] MIME types found: ", "magenta")
+                + colored(str(linkMimes), "white")
                 + "\n"
             )

-        linkCount = len(linksFound) - originalLinkCount
-        if originalLinkCount == 0:
-            write(
-                getSPACER(
-                    colored("Links found on commoncrawl.org: ", "cyan")
-                    + colored(str(linkCount), "white")
-                )
-                + "\n"
-            )
-        else:
-            write(
-                getSPACER(
-                    colored("Extra links found on commoncrawl.org: ", "cyan")
-                    + colored(str(linkCount), "white")
-                )
-                + "\n"
-            )
+        linkCountCommonCrawl = len(linksFoundCommonCrawl)
+        write(
+            colored("CommonCrawl - [ INFO ] Links found on commoncrawl.org: ", "cyan")
+            + colored(str(linkCountCommonCrawl), "white")
+        )
+        linksFound.update(linksFoundCommonCrawl)
+        linksFoundCommonCrawl.clear()

     except Exception as e:
         writerr(colored("ERROR getCommonCrawlUrls 1: " + str(e), "red"))
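Replacing the single shared `stopSource` flag with one flag per provider (here `stopSourceCommonCrawl`) means a 429 from one API only short-circuits that provider's remaining requests; the other sources, now running concurrently, are unaffected. A minimal sketch of the pattern (names mirror the diff; the `fetch` callable is hypothetical):

    stopSourceCommonCrawl = False

    def process_collection(fetch):
        # Each collection checks its own source's flag; a rate limit trips
        # the flag so later collections are skipped, while Wayback, URLScan,
        # etc. are governed by their own independent flags.
        global stopSourceCommonCrawl
        if stopSourceCommonCrawl:
            return
        if fetch() == 429:
            stopSourceCommonCrawl = True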
@@ -4243,7 +4261,7 @@ def processVirusTotalUrl(url):
     """
     Process a specific URL from virustotal.com to determine whether to save the link
     """
-    global argsInput, argsInputHostname
+    global argsInput, argsInputHostname, linkCountVirusTotal, linksFoundVirusTotal

     addLink = True

@@ -4310,7 +4328,7 @@ def processVirusTotalUrl(url):
             flags=re.IGNORECASE,
         )
         if match is not None:
-            linksFoundAdd(url)
+            linksFoundAdd(url, linksFoundVirusTotal)

     except Exception as e:
         writerr(colored("ERROR processVirusTotalUrl 1: " + str(e), "red"))
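`linksFoundAdd` is now called with the destination set as a second argument rather than always writing to the global `linksFound`, so the same filtering path can feed any per-source set. The real helper applies waymore's URL filters before adding; a reduced sketch of the new shape (filter details elided):

    def linksFoundAdd(url, dest):
        # Store a link in the caller's per-source set; in waymore the real
        # helper also applies the FILTER_URL exclusions before adding.
        dest.add(url.strip())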
@@ -4321,12 +4339,11 @@ def getVirusTotalUrls():
     """
     Get URLs from the VirusTotal API v2 and process them.
     Each URL is normalized as (url, scan_date) tuple. Dates are filtered according to args.from_date / args.to_date.
     """
-    global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSource, argsInput, checkVirusTotal, argsInputHostname
+    global VIRUSTOTAL_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceVirusTotal, argsInput, checkVirusTotal, argsInputHostname, linkCountVirusTotal, linksFoundVirusTotal

     try:
-        stopSource = False
-
-        originalLinkCount = len(linksFound)
+        stopSourceVirusTotal = False
+        linksFoundVirusTotal = set()

         # Build the VirusTotal API URL
         url = VIRUSTOTAL_URL.replace("{DOMAIN}", quote(argsInputHostname)).replace(
@@ -4335,12 +4352,12 @@ def getVirusTotalUrls():

         if verbose():
             write(
-                colored("The URL requested to get links: ", "magenta")
+                colored("VirusTotal - [ INFO ] The URL requested to get links: ", "magenta")
                 + colored(url + "\n", "white")
             )

         if not args.check_only:
-            write(colored("Getting links from virustotal.com API...", "cyan"))
+            write(colored("VirusTotal - [ INFO ] Getting links from virustotal.com API...", "cyan"))

             # Make request
             try:
@@ -4352,7 +4369,7 @@ def getVirusTotalUrls():
             except Exception as e:
                 writerr(
                     colored(
-                        "[ ERR ] Unable to get links from virustotal.com: " + str(e),
+                        "VirusTotal - [ ERR ] Unable to get links from virustotal.com: " + str(e),
                         "red",
                     )
                 )
@@ -4362,7 +4379,7 @@ def getVirusTotalUrls():
             if resp.status_code == 429:
                 writerr(
                     colored(
-                        "[ 429 ] VirusTotal rate limit reached so unable to get links.",
+                        "VirusTotal - [ 429 ] Rate limit reached so unable to get links.",
                         "red",
                     )
                 )
@@ -4370,9 +4387,7 @@ def getVirusTotalUrls():
             elif resp.status_code == 403:
                 writerr(
                     colored(
-                        getSPACER(
-                            "[ 403 ] VirusTotal: Permission denied. Check your API key is correct."
-                        ),
+                        "VirusTotal - [ 403 ] Permission denied. Check your API key is correct.",
                         "red",
                     )
                 )
@@ -4380,7 +4395,9 @@ def getVirusTotalUrls():
             elif resp.status_code != 200:
                 writerr(
                     colored(
-                        "[ " + str(resp.status_code) + " ] Unable to get links from virustotal.com",
+                        "VirusTotal - [ ERR ] [ "
+                        + str(resp.status_code)
+                        + " ] Unable to get links from virustotal.com",
                         "red",
                     )
                 )
@@ -4411,7 +4428,7 @@ def getVirusTotalUrls():
             except Exception as e:
                 writerr(
                     colored(
-                        "[ ERR ] Unexpected response from the VirusTotal API: " + str(e),
+                        "VirusTotal - [ ERR ] Unexpected response from the VirusTotal API: " + str(e),
                         "red",
                     )
                 )
@@ -4419,12 +4436,15 @@ def getVirusTotalUrls():

         # Check only mode
         if args.check_only:
-            write(colored("Get URLs from VirusTotal: ", "cyan") + colored("1 request", "white"))
+            write(
+                colored("VirusTotal - [ INFO ] Get URLs from VirusTotal: ", "cyan")
+                + colored("1 request", "white")
+            )
             checkVirusTotal = 1
         else:
             # Process each URL tuple
             for url, scan_date in all_urls:
-                if stopSource:
+                if stopSourceVirusTotal:
                     break
                 getMemory()
@@ -4445,24 +4465,14 @@ def getVirusTotalUrls():
                 # Process URL
                 processVirusTotalUrl(url)

-            # Show links found
-            linkCount = len(linksFound) - originalLinkCount
-            if originalLinkCount == 0:
-                write(
-                    getSPACER(
-                        colored("Links found on virustotal.com: ", "cyan")
-                        + colored(str(linkCount), "white")
-                    )
-                    + "\n"
-                )
-            else:
-                write(
-                    getSPACER(
-                        colored("Extra links found on virustotal.com: ", "cyan")
-                        + colored(str(linkCount), "white")
-                    )
-                    + "\n"
-                )
+            # Show links found
+            linkCountVirusTotal = len(linksFoundVirusTotal)
+            write(
+                colored("VirusTotal - [ INFO ] Links found on virustotal.com: ", "cyan")
+                + colored(str(linkCountVirusTotal), "white")
+            )
+            linksFound.update(linksFoundVirusTotal)
+            linksFoundVirusTotal.clear()

     except Exception as e:
         writerr(colored(f"ERROR getVirusTotalUrls: {e}", "red"))
@@ -4472,7 +4482,7 @@ def processIntelxUrl(url):
     """
     Process a specific URL from intelx.io to determine whether to save the link
     """
-    global argsInput, argsInputHostname
+    global argsInput, argsInputHostname, linkCountIntelx, linksFoundIntelx

     addLink = True

@@ -4524,7 +4534,7 @@ def processIntelxUrl(url):

         # Add link if it passed filters
         if addLink:
-            linksFoundAdd(url)
+            linksFoundAdd(url, linksFoundIntelx)

     except Exception as e:
         writerr(colored("ERROR processIntelxUrl 1: " + str(e), "red"))
@@ -4535,6 +4545,7 @@ def processIntelxType(target, credits):
     target: 1 - Domains
     target: 3 - URLs
     """
+    global intelxAPIIssue
     try:
         try:
             requestsMade = 0
@@ -4554,7 +4565,7 @@ def processIntelxType(target, credits):
         except Exception as e:
             write(
                 colored(
-                    "[ ERR ] Unable to get links from intelx.io: " + str(e),
+                    "IntelX - [ ERR ] Unable to get links from intelx.io: " + str(e),
                     "red",
                 )
             )
@@ -4562,49 +4573,47 @@ def processIntelxType(target, credits):

         # Deal with any errors
         if resp.status_code == 429:
+            intelxAPIIssue = True
             writerr(
                 colored(
-                    "[ 429 ] Intelx rate limit reached so unable to get links.",
+                    "IntelX - [ 429 ] Rate limit reached so unable to get links.",
                     "red",
                 )
             )
             return
         elif resp.status_code == 401:
+            intelxAPIIssue = True
             writerr(
                 colored(
-                    getSPACER(
-                        "[ 401 ] IntelX: Not authorized. The source requires a paid API key. Check your API key is correct."
-                    ),
+                    "IntelX - [ 401 ] Not authorized. The source requires a paid API key. Check your API key is correct.",
                     "red",
                 )
             )
             return
         elif resp.status_code == 402:
+            intelxAPIIssue = True
             if credits.startswith("0/"):
                 writerr(
                     colored(
-                        getSPACER(
-                            "[ 402 ] You have run out of daily credits on Intelx ("
-                            + credits
-                            + ")."
-                        ),
+                        "IntelX - [ 402 ] You have run out of daily credits on Intelx ("
+                        + credits
+                        + ").",
                         "red",
                     )
                 )
             else:
                 writerr(
                     colored(
-                        getSPACER(
-                            "[ 402 ] IntelX: It appears you have run out of daily credits on Intelx."
-                        ),
+                        "IntelX - [ 402 ] It appears you have run out of daily credits on Intelx.",
                         "red",
                     )
                 )
             return
         elif resp.status_code == 403:
+            intelxAPIIssue = True
             writerr(
                 colored(
-                    "[ 403 ] IntelX: Permission denied. Check your API key is correct.",
+                    "IntelX - [ 403 ] Permission denied. Check your API key is correct.",
                     "red",
                 )
             )
@@ -4612,9 +4621,7 @@ def processIntelxType(target, credits):
         elif resp.status_code != 200:
             writerr(
                 colored(
-                    getSPACER(
-                        "[ " + str(resp.status_code) + " ] Unable to get links from intelx.io"
-                    ),
+                    "IntelX - [ " + str(resp.status_code) + " ] Unable to get links from intelx.io",
                     "red",
                 )
             )
@@ -4627,7 +4634,7 @@ def processIntelxType(target, credits):
         except Exception:
             writerr(
                 colored(
-                    "[ ERR ] There was an unexpected response from the Intelligence API",
+                    "IntelX - [ ERR ] There was an unexpected response from the Intelligence API",
                     "red",
                 )
             )
@@ -4637,7 +4644,7 @@ def processIntelxType(target, credits):
         moreResults = True
         status = 0
         while moreResults:
-            if stopSource:
+            if stopSourceIntelx:
                 break
             try:
                 resp = session.get(
@@ -4648,7 +4655,7 @@ def processIntelxType(target, credits):
             except Exception as e:
                 write(
                     colored(
-                        "[ ERR ] Unable to get links from intelx.io: " + str(e),
+                        "IntelX - [ ERR ] Unable to get links from intelx.io: " + str(e),
                         "red",
                     )
                 )
@@ -4661,9 +4668,7 @@ def processIntelxType(target, credits):
             except Exception:
                 writerr(
                     colored(
-                        getSPACER(
-                            "[ ERR ] There was an unexpected response from the Intelligence API"
-                        ),
+                        "IntelX - [ ERR ] There was an unexpected response from the Intelligence API",
                         "red",
                     )
                 )
@@ -4685,7 +4690,7 @@ def processIntelxType(target, credits):
             # Work out whether to include each url
             unique_values = list(set(selector_values + selector_valuesh))
             for ixurl in unique_values:
-                if stopSource:
+                if stopSourceIntelx:
                     break
                 processIntelxUrl(ixurl)
@@ -4727,56 +4732,51 @@ def getIntelxUrls():
     """
     Get URLs from the Intelligence X Phonebook search
     """
-    global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSource, argsInput, checkIntelx, argsInputHostname
+    global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx

     # Write the file of URL's for the passed domain/URL
     try:
         if args.check_only:
             write(
-                colored("Get URLs from Intelligence X: ", "cyan")
+                colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
                 + colored("minimum 4 requests", "white")
             )
             checkIntelx = 4
             return

-        stopSource = False
-        originalLinkCount = len(linksFound)
+        stopSourceIntelx = False
+        linksFoundIntelx = set()
+
         credits = getIntelxAccountInfo()
         if verbose():
             write(
                 colored(
-                    "The Intelligence X URL requested to get links (Credits: " + credits + "): ",
+                    "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
+                    + credits
+                    + "): ",
                     "magenta",
                 )
                 + colored(INTELX_SEARCH_URL + "\n", "white")
             )

         if not args.check_only:
-            write(colored("Getting links from intelx.io API...", "cyan"))
+            write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))

             # Get the domains from Intelligence X if the --no-subs wasn't passed
             if not args.no_subs:
                 processIntelxType(1, credits)

             # Get the URLs from Intelligence X
-            processIntelxType(3, credits)
+            if not intelxAPIIssue:
+                processIntelxType(3, credits)

-            if originalLinkCount == 0:
-                write(
-                    getSPACER(
-                        colored("Links found on intelx.io: ", "cyan")
-                        + colored(str(linkCount), "white")
-                    )
-                    + "\n"
-                )
-            else:
-                write(
-                    getSPACER(
-                        colored("Extra links found on intelx.io: ", "cyan")
-                        + colored(str(linkCount), "white")
-                    )
-                    + "\n"
-                )
+            linkCountIntelx = len(linksFoundIntelx)
+            write(
+                colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
+                + colored(str(linkCountIntelx), "white")
+            )
+            linksFound.update(linksFoundIntelx)
+            linksFoundIntelx.clear()

     except Exception as e:
         writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
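Intelligence X is queried in two phases: `processIntelxType(1, ...)` for domain selectors and `processIntelxType(3, ...)` for URL selectors. The new `intelxAPIIssue` flag is set by any 401/402/403/429 in the first phase so the second phase is skipped rather than spending more credits against an account that has already failed. A runnable reduction of the control flow (the stub stands in for the real request logic):

    intelxAPIIssue = False

    def processIntelxTypeStub(target, credits):
        # Stand-in for the real API call; a 401/402/403/429 response
        # would set the module-level flag, as in the diff above.
        global intelxAPIIssue
        if credits.startswith("0/"):
            intelxAPIIssue = True

    def getIntelxUrlsSketch(credits, no_subs=False):
        if not no_subs:
            processIntelxTypeStub(1, credits)   # phase 1: domains
        if not intelxAPIIssue:
            processIntelxTypeStub(3, credits)   # phase 2: URLs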
@@ -4832,14 +4832,16 @@ def processResponsesURLScan():
         writerr(colored("ERROR processResponsesURLScan 4: " + str(e), "red"))

     # Get URLs from URLScan.io if the DOM ID's haven't been retrieved yet
-    if …:
-        write(
-            colored(
-                "Getting list of response links (this can take a while for some domains)...",
-                "cyan",
-            )
-        )
-        getURLScanUrls()
+    if stopProgram is None and not args.check_only:
+        if args.mode in ("R", "B"):
+            write(
+                colored(
+                    "URLScan - [ INFO ] Getting list of response links (this can take a while for some domains)...",
+                    "cyan",
+                )
+            )
+        if args.mode == "R":
+            getURLScanUrls()

     # Check if a continueResp.URLScan.tmp and responses.URLScan.tmp files exists
     runPrevious = "n"
@@ -4937,25 +4939,6 @@ def processResponsesURLScan():
                         "green",
                     )
                 )
-                # if args.limit == 5000 and totalResponses == 5000:
-                #     writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests (the --limit argument defaults to '+str(DEFAULT_LIMIT)+')','cyan'))
-                # else:
-                #     writerr(colored('Downloading archived responses: ','cyan')+colored(str(totalResponses+1)+' requests','white'))
-                # minutes = round(totalResponses*2.5 // 60)
-                # hours = minutes // 60
-                # days = hours // 24
-                # if minutes < 5:
-                #     write(colored('\n-> Downloading the responses (depending on their size) should be quite quick!','green'))
-                # elif hours < 2:
-                #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(minutes)+' minutes.','green'))
-                # elif hours < 6:
-                #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','green'))
-                # elif hours < 24:
-                #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(hours)+' hours.','yellow'))
-                # elif days < 7:
-                #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days. Consider using arguments -ko, -l, -ci, -from and -to wisely! ','red'))
-                # else:
-                #     write(colored('\n-> Downloading the responses (depending on their size) could take more than '+str(days)+' days!!! Consider using arguments -ko, -l, -ci, -from and -to wisely!','red'))
                 write("")
             else:
                 # If the limit has been set over the default, give a warning that this could take a long time!
@@ -5017,7 +5000,7 @@ def processResponsesURLScan():
         if failureCount > 0:
             if verbose():
                 write(
-                    colored("Responses saved to ", "cyan")
+                    colored("URLScan - [ INFO ] Responses saved to ", "cyan")
                     + colored(responseOutputDirectory, "white")
                     + colored(" for " + subs + argsInput + ": ", "cyan")
                     + colored(
@@ -5032,7 +5015,7 @@ def processResponsesURLScan():
             else:
                 write(
                     colored(
-                        "Responses saved for " + subs + argsInput + ": ",
+                        "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
                         "cyan",
                     )
                     + colored(
@@ -5047,7 +5030,10 @@ def processResponsesURLScan():
         else:
             if verbose():
                 write(
-                    colored("Responses saved to ", "cyan")
+                    colored(
+                        "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
+                        "cyan",
+                    )
                     + colored(responseOutputDirectory, "white")
                     + colored(" for " + subs + argsInput + ": ", "cyan")
                     + colored(
@@ -5061,7 +5047,7 @@ def processResponsesURLScan():
             else:
                 write(
                     colored(
-                        "Responses saved for " + subs + argsInput + ": ",
+                        "URLScan - [ INFO ] Responses saved for " + subs + argsInput + ": ",
                         "cyan",
                     )
                     + colored(
@@ -5087,7 +5073,7 @@ def processResponsesWayback():
     """
     Get archived responses from Wayback Machine (archive.org)
     """
-    global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount
+    global linksFound, subs, path, indexFile, totalResponses, stopProgram, argsInput, continueRespFile, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, failureCount, totalFileCount, current_response, current_session
     try:
         fileCount = 0
         failureCount = 0
@@ -5230,18 +5216,18 @@ def processResponsesWayback():
         if verbose():
             write(
                 colored(
-                    "The URL requested to get responses: ",
+                    "Wayback - [ INFO ] The URL requested to get responses: ",
                     "magenta",
                 )
                 + colored(url + "\n", "white")
             )

         if args.check_only:
-            write(colored("Checking archived response requests...", "cyan"))
+            write(colored("Wayback - [ INFO ] Checking archived response requests...", "cyan"))
         else:
             write(
                 colored(
-                    "Getting list of response links (this can take a while for some domains)...",
+                    "Wayback - [ INFO ] Getting list of response links (this can take a while for some domains)...",
                     "cyan",
                 )
             )
@@ -5254,16 +5240,24 @@ def processResponsesWayback():
         session = requests.Session()
         session.mount("https://", HTTP_ADAPTER)
         session.mount("http://", HTTP_ADAPTER)
+        try:
+            current_session = session
+        except Exception:
+            pass
         resp = session.get(
             url,
             stream=True,
             headers={"User-Agent": userAgent},
             timeout=args.timeout,
         )
+        try:
+            current_response = resp
+        except Exception:
+            pass
     except ConnectionError:
         writerr(
             colored(
-                getSPACER("[ ERR ] Connection error"),
+                getSPACER("Wayback - [ ERR ] Connection error"),
                 "red",
             )
         )
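Both response-download paths mount a shared adapter (`HTTP_ADAPTER` here, `HTTP_ADAPTER_CC` for Common Crawl) onto a fresh `requests.Session` before streaming. That is the standard way to attach retry/backoff behaviour in requests; a generic sketch (the retry numbers are illustrative, not waymore's configuration):

    import requests
    from urllib3.util.retry import Retry
    from requests.adapters import HTTPAdapter

    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503])
    adapter = HTTPAdapter(max_retries=retries)

    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    # stream=True defers the body so very large CDX result sets
    # can be consumed line by line instead of loaded into memory
    resp = session.get(
        "https://web.archive.org/cdx/search/cdx?url=example.com",
        stream=True,
        timeout=30,
    )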
@@ -5273,7 +5267,7 @@ def processResponsesWayback():
     except Exception as e:
         writerr(
             colored(
-                getSPACER("[ ERR ] Couldn't get list of responses: " + str(e)),
+                getSPACER("Wayback - [ ERR ] Couldn't get list of responses: " + str(e)),
                 "red",
             )
         )
@@ -5288,7 +5282,7 @@ def processResponsesWayback():
         writerr(
             colored(
                 getSPACER(
-                    "No archived responses were found on Wayback Machine (archive.org) for the given search parameters."
+                    "Wayback - [ ERR ] No archived responses were found on Wayback Machine (archive.org) for the given search parameters."
                 ),
                 "red",
             )
@@ -5299,7 +5293,7 @@ def processResponsesWayback():
         writerr(
             colored(
                 getSPACER(
-                    "[ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
+                    "Wayback - [ 429 ] Wayback Machine (archive.org) rate limit reached, so stopping. Links that have already been retrieved will be saved."
                 ),
                 "red",
             )
@@ -5310,7 +5304,7 @@ def processResponsesWayback():
         writerr(
             colored(
                 getSPACER(
-                    "[ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
+                    "Wayback - [ 503 ] Wayback Machine (archive.org) is currently unavailable. It may be down for maintenance. You can check https://web.archive.org/cdx/ to verify."
                 ),
                 "red",
             )
@@ -5322,7 +5316,10 @@ def processResponsesWayback():
         writerr(
             colored(
                 getSPACER(
-                    "[ " + str(resp.status_code) + " ] Error for " + url
+                    "Wayback - [ "
+                    + str(resp.status_code)
+                    + " ] Error for "
+                    + url
                 ),
                 "red",
             )
@@ -5334,7 +5331,7 @@ def processResponsesWayback():
         writerr(
             colored(
                 getSPACER(
-                    "Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml"
+                    "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing FILTER_KEYWORDS in config.yml"
                 ),
                 "red",
             )
@@ -5343,7 +5340,7 @@ def processResponsesWayback():
         writerr(
             colored(
                 getSPACER(
-                    "Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed"
+                    "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - consider removing -ko / --keywords-only argument, or changing the Regex value you passed"
                 ),
                 "red",
             )
@@ -5353,7 +5350,7 @@ def processResponsesWayback():
         writerr(
             colored(
                 getSPACER(
-                    "Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)"
+                    "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - Blocked Site Error (they block the target site)"
                 ),
                 "red",
             )
@@ -5362,7 +5359,7 @@ def processResponsesWayback():
         writerr(
             colored(
                 getSPACER(
-                    "Failed to get links from Wayback Machine (archive.org) - check input domain and try again."
+                    "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - check input domain and try again."
                 ),
                 "red",
             )
@@ -5372,23 +5369,43 @@ def processResponsesWayback():
         pass

     # Go through the response to save the links found
-    for line in resp.iter_lines():
-        try:
-            results = line.decode("utf-8")
-            parts = results.split(" ", 2)
-            timestamp = parts[0]
-            originalUrl = parts[1]
-            linksFoundResponseAdd(timestamp + "/" + originalUrl)
-        except Exception:
-            writerr(
-                colored(
-                    getSPACER(
-                        "ERROR processResponsesWayback 3: Cannot get link from line: "
-                        + str(line)
-                    ),
-                    "red",
-                )
-            )
+    try:
+        for line in resp.iter_lines():
+            try:
+                results = line.decode("utf-8")
+                parts = results.split(" ", 2)
+                timestamp = parts[0]
+                originalUrl = parts[1]
+                linksFoundResponseAdd(timestamp + "/" + originalUrl)
+            except Exception:
+                writerr(
+                    colored(
+                        getSPACER(
+                            "ERROR processResponsesWayback 3: Cannot get link from line: "
+                            + str(line)
+                        ),
+                        "red",
+                    )
+                )
+    finally:
+        try:
+            current_response = None
+        except Exception:
+            pass
+        try:
+            current_session = None
+        except Exception:
+            pass
+
+    # Cleanup shared response/session references now the response has been processed
+    try:
+        current_response = None
+    except Exception:
+        pass
+    try:
+        current_session = None
+    except Exception:
+        pass

     # Remove any links that have URL exclusions
     linkRequests = []
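Each line of the CDX response-list query is a space-separated record whose first two fields are the capture timestamp and the original URL (matching the `fl=timestamp,original,...` field list in `WAYBACK_URL`). Splitting at most twice keeps any spaces in later fields intact, and `timestamp/originalUrl` is exactly the path fragment web.archive.org expects when fetching the archived copy. For example:

    line = b"20240101120000 https://example.com/page?id=1 text/html 200 ABCDEF"
    parts = line.decode("utf-8").split(" ", 2)
    timestamp, originalUrl = parts[0], parts[1]
    archive_path = timestamp + "/" + originalUrl
    # -> "20240101120000/https://example.com/page?id=1", usable under
    #    https://web.archive.org/web/<archive_path>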
@@ -5421,7 +5438,7 @@ def processResponsesWayback():
             writerr(
                 colored(
                     getSPACER(
-                        'Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'
+                        'Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'
                         + originalUrl
                         + "\") but they didn't match the input you gave. Check input and try again."
                     ),
@@ -5432,7 +5449,7 @@ def processResponsesWayback():
             writerr(
                 colored(
                     getSPACER(
-                        "Failed to get links from Wayback Machine (archive.org) - check input and try again."
+                        "Wayback - [ ERR ] Failed to get links from Wayback Machine (archive.org) - check input and try again."
                     ),
                     "red",
                 )
@@ -5575,7 +5592,7 @@ def processResponsesWayback():
         if failureCount > 0:
             if verbose():
                 write(
-                    colored("Responses saved to ", "cyan")
+                    colored("Wayback - [ INFO ] Responses saved to ", "cyan")
                     + colored(responseOutputDirectory, "white")
                     + colored(" for " + subs + argsInput + ": ", "cyan")
                     + colored(
@@ -5590,7 +5607,7 @@ def processResponsesWayback():
             else:
                 write(
                     colored(
-                        "Responses saved for " + subs + argsInput + ": ",
+                        "Wayback - [ INFO ] Responses saved for " + subs + argsInput + ": ",
                         "cyan",
                     )
                     + colored(
@@ -5605,7 +5622,7 @@ def processResponsesWayback():
         else:
             if verbose():
                 write(
-                    colored("Responses saved to ", "cyan")
+                    colored("Wayback - [ INFO ] Responses saved to ", "cyan")
                     + colored(responseOutputDirectory, "white")
                     + colored(" for " + subs + argsInput + ": ", "cyan")
                     + colored(
@@ -5619,7 +5636,7 @@ def processResponsesWayback():
             else:
                 write(
                     colored(
-                        "Responses saved for " + subs + argsInput + ": ",
+                        "Wayback - [ INFO ] Responses saved for " + subs + argsInput + ": ",
                         "cyan",
                     )
                     + colored(
@@ -5933,9 +5950,91 @@ def combineInlineJS():
         writerr(colored("ERROR combineInlineJS 1: " + str(e), "red"))


+# Async wrapper functions for concurrent source fetching
+async def fetch_wayback_async():
+    """Async wrapper for getWaybackUrls - runs in thread pool"""
+    loop = asyncio.get_event_loop()
+    await loop.run_in_executor(None, getWaybackUrls)
+
+
+async def fetch_commoncrawl_async():
+    """Async wrapper for getCommonCrawlUrls - runs in thread pool"""
+    loop = asyncio.get_event_loop()
+    await loop.run_in_executor(None, getCommonCrawlUrls)
+
+
+async def fetch_alienvault_async():
+    """Async wrapper for getAlienVaultUrls - runs in thread pool"""
+    loop = asyncio.get_event_loop()
+    await loop.run_in_executor(None, getAlienVaultUrls)
+
+
+async def fetch_urlscan_async():
+    """Async wrapper for getURLScanUrls - runs in thread pool"""
+    loop = asyncio.get_event_loop()
+    await loop.run_in_executor(None, getURLScanUrls)
+
+
+async def fetch_virustotal_async():
+    """Async wrapper for getVirusTotalUrls - runs in thread pool"""
+    loop = asyncio.get_event_loop()
+    await loop.run_in_executor(None, getVirusTotalUrls)
+
+
+async def fetch_intelx_async():
+    """Async wrapper for getIntelxUrls - runs in thread pool"""
+    loop = asyncio.get_event_loop()
+    await loop.run_in_executor(None, getIntelxUrls)
+
+
+async def fetch_all_sources_async():
+    """
+    Orchestrator function to fetch from all enabled sources concurrently.
+    Each source runs in its own thread pool executor while orchestration happens async.
+    """
+    global args, stopProgram, VIRUSTOTAL_API_KEY, INTELX_API_KEY, argsInput
+
+    tasks = []
+
+    # Build list of tasks for enabled sources
+    if not args.xwm and stopProgram is None:
+        tasks.append(("Wayback Machine", fetch_wayback_async()))
+    if not args.xcc and stopProgram is None:
+        tasks.append(("Common Crawl", fetch_commoncrawl_async()))
+    if not args.xav and stopProgram is None and not argsInput.startswith("."):
+        tasks.append(("AlienVault OTX", fetch_alienvault_async()))
+    if not args.xus and stopProgram is None:
+        tasks.append(("URLScan", fetch_urlscan_async()))
+    if not args.xvt and VIRUSTOTAL_API_KEY != "" and stopProgram is None:
+        tasks.append(("VirusTotal", fetch_virustotal_async()))
+    if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
+        tasks.append(("Intelligence X", fetch_intelx_async()))
+
+    if not tasks:
+        return
+
+    # Extract just the coroutines for gather
+    task_coros = [task[1] for task in tasks]
+
+    # Fetch all concurrently, capturing exceptions so one failure doesn't stop others
+    results = await asyncio.gather(*task_coros, return_exceptions=True)
+
+    # Check for any exceptions that occurred
+    for i, result in enumerate(results):
+        if isinstance(result, Exception):
+            source_name = tasks[i][0]
+            if verbose():
+                writerr(
+                    colored(
+                        getSPACER(f"ERROR in {source_name} during concurrent fetch: {str(result)}"),
+                        "red",
+                    )
+                )
+
+
 # Run waymore
 def main():
-    global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY
+    global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx

     # Tell Python to run the handler() function when SIGINT is received
     signal(SIGINT, handler)
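The wrappers above are deliberately thin: each one just pushes its blocking fetch function onto the default thread pool, and `asyncio.gather(..., return_exceptions=True)` collects the outcomes without letting one source's exception cancel the others. A self-contained reduction of the pattern:

    import asyncio

    def blocking_fetch(name):
        # Stand-in for getWaybackUrls / getCommonCrawlUrls etc.
        if name == "bad":
            raise RuntimeError("simulated source failure")
        return f"{name}: done"

    async def fetch_all(names):
        loop = asyncio.get_running_loop()
        futures = [loop.run_in_executor(None, blocking_fetch, n) for n in names]
        # return_exceptions=True: failures come back as values, the rest still finish
        return await asyncio.gather(*futures, return_exceptions=True)

    print(asyncio.run(fetch_all(["wayback", "bad", "urlscan"])))
    # ['wayback: done', RuntimeError('simulated source failure'), 'urlscan: done']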
@@ -6104,6 +6203,7 @@ def main():
         action="store",
         type=int,
         help="Limit the number of Common Crawl index collections searched, e.g. '-lcc 10' will just search the latest 10 collections (default: 1). As of November 2024 there are currently 106 collections. Setting to 0 will search ALL collections. If you don't want to search Common Crawl at all, use the -xcc option.",
+        default=1,
     )
     parser.add_argument(
         "-t",
@@ -6118,10 +6218,10 @@ def main():
     parser.add_argument(
         "-p",
         "--processes",
-        help="Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: …)",
+        help="Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 2)",
         action="store",
         type=validateArgProcesses,
-        default=…,
+        default=2,
         metavar="<integer>",
     )
     parser.add_argument(
@@ -6326,6 +6426,12 @@ def main():
     indexFile = None
     path = ""
     stopSource = False
+    stopSourceWayback = False
+    stopSourceCommonCrawl = False
+    stopSourceAlienVault = False
+    stopSourceURLScan = False
+    stopSourceVirusTotal = False
+    stopSourceIntelx = False

     # Get the config settings from the config.yml file
     getConfig()
@@ -6343,29 +6449,17 @@ def main():
     # If the mode is U (URLs retrieved) or B (URLs retrieved AND Responses downloaded)
     if args.mode in ["U", "B"]:

-        # If not requested to exclude, get URLs from Wayback Machine (archive.org)
-        if not args.xwm and stopProgram is None:
-            getWaybackUrls()
-
-        # If not requested to exclude, get URLs from commoncrawl.org
-        if not args.xcc and stopProgram is None:
-            getCommonCrawlUrls()
-
-        # If not requested to exclude, get URLs from alienvault.com
-        if not args.xav and stopProgram is None:
-            getAlienVaultUrls()
-
-        # If not requested to exclude, get URLs from urlscan.io
-        if not args.xus and stopProgram is None:
-            getURLScanUrls()
-
-        # If not requested to exclude, get URLs from virustotal.com if we have an API key
-        if not args.xvt and VIRUSTOTAL_API_KEY != "" and stopProgram is None:
-            getVirusTotalUrls()
-
-        # If not requested to exclude, get URLs from intelx.io if we have an API key
-        if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
-            getIntelxUrls()
+        # Fetch from all sources concurrently using async/await
+        try:
+            asyncio.run(fetch_all_sources_async())
+        except Exception as e:
+            if verbose():
+                writerr(
+                    colored(
+                        getSPACER(f"ERROR during concurrent source fetching: {str(e)}"),
+                        "red",
+                    )
+                )

         # Output results of all searches
         processURLOutput()