@paa1997/metho 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/metho.js +3 -0
- package/package.json +25 -0
- package/scripts/findsomething_secrets.py +1389 -0
- package/src/cli.js +76 -0
- package/src/constants.js +20 -0
- package/src/engine.js +180 -0
- package/src/gui.html +772 -0
- package/src/logger.js +95 -0
- package/src/server.js +261 -0
- package/src/signals.js +101 -0
- package/src/steps/base-step.js +111 -0
- package/src/steps/filter.js +43 -0
- package/src/steps/findsomething.js +42 -0
- package/src/steps/gau.js +86 -0
- package/src/steps/katana.js +61 -0
- package/src/steps/subdomain-probe.js +19 -0
- package/src/steps/subfinder.js +22 -0
- package/src/tools/installer.js +111 -0
- package/src/tools/manager.js +114 -0
- package/src/tools/registry.js +45 -0
- package/src/utils/dedup.js +37 -0
- package/src/utils/file-splitter.js +48 -0
- package/src/utils/filter-extensions.js +52 -0
|
@@ -0,0 +1,1389 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import base64
|
|
4
|
+
import json
|
|
5
|
+
import tempfile
|
|
6
|
+
import os
|
|
7
|
+
import random
|
|
8
|
+
import re
|
|
9
|
+
import shutil
|
|
10
|
+
import signal
|
|
11
|
+
import subprocess
|
|
12
|
+
from requests.adapters import HTTPAdapter
|
|
13
|
+
import sys
|
|
14
|
+
import zlib
|
|
15
|
+
from urllib3.util.retry import Retry
|
|
16
|
+
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED, as_completed, TimeoutError
|
|
17
|
+
from typing import List, Dict, Any, Tuple, Optional, Union
|
|
18
|
+
import time
|
|
19
|
+
import threading
|
|
20
|
+
|
|
21
|
+
import requests
|
|
22
|
+
import urllib3
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from rich.progress import (
|
|
26
|
+
Progress,
|
|
27
|
+
BarColumn,
|
|
28
|
+
TextColumn,
|
|
29
|
+
TimeElapsedColumn,
|
|
30
|
+
TimeRemainingColumn,
|
|
31
|
+
MofNCompleteColumn,
|
|
32
|
+
SpinnerColumn,
|
|
33
|
+
)
|
|
34
|
+
except ImportError:
|
|
35
|
+
Progress = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ==============================
|
|
39
|
+
# 1. Generic secret patterns (same style as before)
|
|
40
|
+
# We'll treat critical/high/medium as [secret]
|
|
41
|
+
# and low as [secret/low]
|
|
42
|
+
# ==============================
|
|
43
|
+
|
|
44
|
+
# Raw detection rules grouped by severity bucket.
# Each entry is a (regex source, re flags) pair; flags is 0 or an re.* constant.
SEVERITY_PATTERNS_RAW: Dict[str, List[Tuple[str, int]]] = {
    # Fixed-prefix / fixed-shape credentials and credential-bearing URIs.
    "critical": [
        (r"sk-[a-zA-Z0-9]{20}T3BlbkFJ[a-zA-Z0-9]{20}", 0),
        (r"sk-proj-[a-zA-Z0-9\-_]{80,}", 0),
        (r"sk-ant-api[a-zA-Z0-9\-_]{80,}", 0),
        (r"AKIA[A-Z0-9]{16}", 0),
        (r"ASIA[A-Z0-9]{16}", 0),
        (r"sk_live_[a-zA-Z0-9]{24,}", 0),
        (r"rk_live_[a-zA-Z0-9]{24,}", 0),
        (r"ghp_[a-zA-Z0-9]{36}", 0),
        (r"gho_[a-zA-Z0-9]{36}", 0),
        (r"github_pat_[a-zA-Z0-9_]{36,}", 0),
        (r"glpat-[a-zA-Z0-9\-=_]{20,}", 0),
        (r"-----BEGIN\s*(RSA|DSA|EC|OPENSSH|PGP)?\s*PRIVATE KEY-----", 0),
        # Connection strings with inline user:password credentials.
        (r"mongodb(\+srv)?://[^:]+:[^@]+@", 0),
        (r"postgres(ql)?://[^:]+:[^@]+@", 0),
        (r"mysql://[^:]+:[^@]+@", 0),
        (r"xox[baprs]-[a-zA-Z0-9\-]{10,}", 0),
        (r"[MN][A-Za-z0-9]{23,}\.[A-Za-z0-9_-]{6}\.[A-Za-z0-9_-]{27}", 0),
        (r"SK[a-f0-9]{32}", 0),
        (r"SG\.[a-zA-Z0-9_-]{22}\.[a-zA-Z0-9_-]{43}", 0),
    ],
    # Tokens, auth headers, test/publishable keys and webhook URLs.
    "high": [
        (r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*", 0),
        (r"api[_-]?key[\"'\s]*[:=][\"'\s]*[a-zA-Z0-9]{32,}", re.IGNORECASE),
        (r"api[_-]?secret[\"'\s]*[:=][\"'\s]*[a-zA-Z0-9]{32,}", re.IGNORECASE),
        (r"[Bb]earer\s+[a-zA-Z0-9\-=._+/\\]{20,500}", 0),
        (r"[Bb]asic\s+[A-Za-z0-9+/]{18,}={0,2}", 0),
        (r"AIza[0-9A-Za-z_\-]{35}", 0),
        (r"sk_test_[a-zA-Z0-9]{24,}", 0),
        (r"pk_live_[a-zA-Z0-9]{24,}", 0),
        (r"[0-9]+-[a-z0-9]{32}\.apps\.googleusercontent\.com", 0),
        (r"https://hooks\.slack\.com/services/[a-zA-Z0-9/_-]+", 0),
        (r"https://discord(app)?\.com/api/webhooks/\d+/[a-zA-Z0-9_-]+", 0),
        (r"https://oapi\.dingtalk\.com/robot/send\?access_token=[a-z0-9]+", 0),
        (r"sbp_[a-f0-9]{40}", 0),
        (r"hf_[a-zA-Z0-9]{30,}", 0),
        (r"shpat_[a-fA-F0-9]{32}", 0),
        (r"shpss_[a-fA-F0-9]{32}", 0),
    ],
    # Quoted assignments, private-range IPs, internal-looking paths, e-mail addresses.
    "medium": [
        (r"[\"']?password[\"']?\s*[:=]\s*[\"'][^\"']{8,}[\"']", re.IGNORECASE),
        (r"[\"']?secret[\"']?\s*[:=]\s*[\"'][^\"']{8,}[\"']", re.IGNORECASE),
        (r"[\"']?token[\"']?\s*[:=]\s*[\"'][^\"']{16,}[\"']", re.IGNORECASE),
        (r"pk_test_[a-zA-Z0-9]{24,}", 0),
        (r"[\"'](10\.\d{1,3}\.\d{1,3}\.\d{1,3})[\"']", 0),
        (r"[\"'](172\.(1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3})[\"']", 0),
        (r"[\"'](192\.168\.\d{1,3}\.\d{1,3})[\"']", 0),
        (r"[\"']/admin[/a-zA-Z0-9_-]*[\"']", 0),
        (r"[\"']/debug[/a-zA-Z0-9_-]*[\"']", 0),
        (r"[\"']/api/internal[/a-zA-Z0-9_-]*[\"']", 0),
        (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", 0),
    ],
    # Very loose name-based heuristics; per the section header these map to [secret/low].
    "low": [
        (r"[\"']?[a-z_]*key[\"']?\s*[:=]\s*[a-z]", re.IGNORECASE),
        (r"[\"']?[a-z_]*token[\"']?\s*[:=]\s*[a-z]", re.IGNORECASE),
        (r"[\"']?[a-z_]*secret[\"']?\s*[:=]\s*[a-z]", re.IGNORECASE),
        (r"=\s*function", 0),
        (r":\s*function", 0),
    ],
}

# Same grouping, compiled once at import time; each entry keeps the regex
# source next to the compiled pattern as (compiled, source).
COMPILED_SEVERITY: Dict[str, List[Tuple[re.Pattern, str]]] = {
    sev: [(re.compile(p, flags), p) for (p, flags) in rules]
    for sev, rules in SEVERITY_PATTERNS_RAW.items()
}
|
|
110
|
+
|
|
111
|
+
# ==============================
|
|
112
|
+
# 2. False positive filters (from the extension)
|
|
113
|
+
# ==============================
|
|
114
|
+
|
|
115
|
+
# Patterns that mark a match's surrounding context as noise.
# Each entry is a (regex source, re flags) pair; is_false_positive() searches
# the compiled forms against the context window of a candidate finding.
FALSE_POSITIVE_PATTERNS_RAW: List[Tuple[str, int]] = [
    (r"/cdn-cgi/challenge-platform/", 0),
    (r"\d+\.\d+:\d+:[a-zA-Z0-9_-]+ivzz", 0),
    (r"[\"']secret:[a-z]+_[a-z]+_[a-z]+[\"']", 0),
    # JavaScript identifiers that merely contain "Key"/"Secret" in their name.
    (r"[a-zA-Z]+Key\s*=\s*[a-z]\.define\(", 0),
    (r"[a-zA-Z]+Key\s*=\s*function", 0),
    (r"[a-zA-Z]+Secret\s*=\s*function", 0),
    (r"PrivateKey\s*=\s*[a-z]\.define\(", 0),
    (r"PublicKey\s*=\s*[a-z]\.define\(", 0),
    # Obvious placeholder values assigned to password/secret.
    (
        r"[\"']?password[\"']?\s*[:=]\s*[\"']?(test|demo|example|placeholder|changeme|password|123456)[\"']?",
        re.IGNORECASE,
    ),
    (
        r"[\"']?secret[\"']?\s*[:=]\s*[\"']?(test|demo|example|placeholder)[\"']?",
        re.IGNORECASE,
    ),
    (r"tokenize\s*:\s*function", 0),
    (r"getToken\s*:\s*[a-z]$", 0),
    (r"setPassword\s*:\s*function", 0),
    # HTML head/meta blocks that aren't secrets (e.g., BMW homepage metadata)
    (r"<meta\s+name=['\"]generator['\"]\s+content=['\"]bmwcom:[^>]+>", re.IGNORECASE),
    (r"<meta\s+property=['\"]og:title['\"][^>]+bmw\.com", re.IGNORECASE),
    (r"BMWTypeNextLatin\.woff2", re.IGNORECASE),
    (r"The international BMW Website", re.IGNORECASE),
    (r"og:image.*BMW_OG\.jpg", re.IGNORECASE),
]

# Compiled once at import time, in the same order as the raw list.
COMPILED_FP: List[re.Pattern] = [
    re.compile(p, flags) for (p, flags) in FALSE_POSITIVE_PATTERNS_RAW
]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# ==============================
|
|
149
|
+
# 3. Embedded nuclei_regex from background.js (compressed)
|
|
150
|
+
# ==============================
|
|
151
|
+
|
|
152
|
+
NUCLEI_PATTERNS_B64 = """
|
|
153
|
+

|
|
154
|
+
""".strip()
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# ==============================
|
|
158
|
+
# 4. Core scanning helpers (multi-threaded)
|
|
159
|
+
# ==============================
|
|
160
|
+
|
|
161
|
+
# Default number of patterns per chunk (see chunk_patterns / scan_text_grouped).
REGEX_CHUNK_SIZE = 50
# Numeric rank per output label ("secret" ranks before "secret/low").
LABEL_ORDER = {"secret": 0, "secret/low": 1}

# Defaults for fetching and scanning.
DEFAULT_CONNECT_TIMEOUT = 5
DEFAULT_READ_TIMEOUT = 90
DEFAULT_USE_CURL = True  # prefer curl path that worked without flags; may disable for high parallelism
DEFAULT_SCAN_TIMEOUT = 80.0
DEFAULT_FETCH_TIMEOUT = 30.0  # wall-clock timeout per fetch batch
DEFAULT_REQUIRE_STATUS_200 = False
DEFAULT_MAX_BODY_BYTES = 100_000_000  # 100 MB guardrail to avoid OOM on giant responses
MAX_PARALLEL_HARD_LIMIT = 50  # hard cap on simultaneous fetch threads
PROGRESS_BAR_WIDTH = 20
# Resolved against the working directory at import time.
FAILED_FETCH_LOG = os.path.join(os.getcwd(), "failed_to_fetch.txt")

# ANSI escape sequences used by format_finding when color output is on.
ANSI_RESET = "\033[0m"
ANSI_COLORS_LABEL = {
    "secret": "\033[93m",  # yellow
    "secret/low": "\033[90m",  # gray
}
ANSI_MATCH = "\033[96m"  # cyan
ANSI_SOURCE = "\033[95m"  # magenta

# Set by the SIGINT handler.
STOP_EVENT = threading.Event()
# When True, debug_log() writes nothing.
QUIET = False
# Per-thread storage; get_thread_session() keeps one requests session here.
thread_local = threading.local()
# Path to the ripgrep binary, or None when rg is not on PATH.
RG_PATH = shutil.which("rg")
# Semaphore held around each rg subprocess call (initial value 8).
RG_SEM = threading.Semaphore(8) if RG_PATH else None
# Guards RG_ENABLED, which scan_one_chunk() clears after an rg failure.
RG_ENABLED_LOCK = threading.Lock()
RG_ENABLED = bool(RG_PATH)

# Proxy management
PROXY_LIST: List[str] = []
PROXY_LOCK = threading.Lock()
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def parse_proxy_string(proxy_raw: str) -> Optional[str]:
    """
    Convert one proxy specification into a proxy URL.

    Accepted inputs:
      - ``ip:port:username:password`` -> ``http://username:password@ip:port``
      - ``ip:port``                   -> ``http://ip:port``
      - anything already containing ``://`` (returned unchanged)

    The password may itself contain ``:``. Username and password are
    percent-encoded so reserved characters such as ``@`` or ``/`` cannot
    corrupt the resulting URL.

    Returns None for blank or unrecognised input.
    """
    # Local import: the module's import block does not provide urllib.parse.
    from urllib.parse import quote

    proxy_raw = proxy_raw.strip()
    if not proxy_raw:
        return None
    if "://" in proxy_raw:
        # Already a URL; splitting it on ":" would scramble scheme and credentials.
        return proxy_raw

    # maxsplit=3 keeps any further ":" inside the password field.
    parts = proxy_raw.split(":", 3)
    if len(parts) == 4:
        ip, port, username, password = parts
        user_enc = quote(username, safe="")
        pass_enc = quote(password, safe="")
        return f"http://{user_enc}:{pass_enc}@{ip}:{port}"
    if len(parts) == 2:
        # Simple ip:port format without auth
        ip, port = parts
        return f"http://{ip}:{port}"

    debug_log(f"[!] Invalid proxy format: {proxy_raw} (expected ip:port:user:pass or ip:port)\n")
    return None
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def load_proxies(proxy_input: str) -> List[str]:
    """
    Build the list of proxy URLs described by *proxy_input*.

    *proxy_input* is either the path of an existing file (one proxy per line;
    blank lines and lines starting with ``#`` are skipped) or a
    comma-separated string of proxies. Every entry is converted with
    parse_proxy_string(); entries it rejects are left out.

    A file that cannot be read is reported through debug_log() and whatever
    was parsed before the error is returned.
    """
    if not os.path.isfile(proxy_input):
        # Not a file on disk: treat the argument as a comma-separated list.
        converted = (parse_proxy_string(piece) for piece in proxy_input.split(","))
        return [proxy_url for proxy_url in converted if proxy_url]

    collected: List[str] = []
    try:
        with open(proxy_input, "r", encoding="utf-8", errors="ignore") as handle:
            for raw_line in handle:
                entry = raw_line.strip()
                if not entry or entry.startswith("#"):
                    continue
                proxy_url = parse_proxy_string(entry)
                if proxy_url:
                    collected.append(proxy_url)
    except OSError as e:
        debug_log(f"[!] Failed to read proxy file {proxy_input}: {e}\n")
    return collected
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def get_random_proxy() -> Optional[str]:
    """Pick one entry of PROXY_LIST at random, or return None when the list is empty."""
    with PROXY_LOCK:
        return random.choice(PROXY_LIST) if PROXY_LIST else None
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# Ensure Ctrl+C stops all work immediately
|
|
260
|
+
def _sigint_handler(signum, frame):
    """
    SIGINT handler: flag shutdown, log it, then interrupt the main thread.

    STOP_EVENT is set before anything else so code polling it sees the
    request even if logging fails; KeyboardInterrupt is then raised in the
    thread that received the signal.
    """
    STOP_EVENT.set()
    debug_log("[!] Interrupt received, stopping...\n")
    raise KeyboardInterrupt()
|
|
264
|
+
|
|
265
|
+
signal.signal(signal.SIGINT, _sigint_handler)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def debug_log(msg: str) -> None:
    """
    Write *msg* verbatim to stderr and flush, unless QUIET is set.

    No newline is appended. Any error while writing is swallowed on purpose:
    diagnostics are best-effort and must never abort a scan.
    """
    if not QUIET:
        try:
            stream = sys.stderr
            stream.write(msg)
            stream.flush()
        except Exception:
            pass
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def is_false_positive(context: str, use_fp: bool) -> bool:
    """
    Tell whether *context* matches any compiled false-positive pattern.

    Always returns False when *use_fp* is falsy (filtering disabled).
    """
    if use_fp:
        for fp_rx in COMPILED_FP:
            if fp_rx.search(context):
                return True
    return False
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def load_embedded_nuclei_patterns() -> List[Tuple[re.Pattern, str]]:
    """
    Decode NUCLEI_PATTERNS_B64 into a list of ``(compiled_regex, source)`` pairs.

    The blob is base64-decoded, zlib-decompressed and parsed as JSON; the
    result is iterated as ``[pattern, flags]`` pairs.

    Behaviour on bad data:
      - an empty blob yields an empty list (with a warning) instead of
        crashing inside zlib;
      - a blob that is present but corrupt still raises, since that is a
        packaging error worth surfacing;
      - entries that are not 2-item pairs, have unusable flags, or fail to
        compile are logged and skipped so the scanner keeps running.
    """
    if not NUCLEI_PATTERNS_B64:
        debug_log("[!] Embedded nuclei pattern blob is empty; no nuclei patterns loaded.\n")
        return []

    raw_bytes = zlib.decompress(base64.b64decode(NUCLEI_PATTERNS_B64))
    raw_patterns: List[List[Any]] = json.loads(raw_bytes.decode("utf-8"))

    compiled: List[Tuple[re.Pattern, str]] = []
    for entry in raw_patterns:
        try:
            pattern, flags = entry
        except (TypeError, ValueError):
            debug_log(f"[!] Skipping malformed nuclei pattern entry: {entry!r}\n")
            continue
        pattern_str = str(pattern).strip()
        if not pattern_str:
            continue
        try:
            # A null/absent flags value is treated as "no flags".
            compiled.append((re.compile(pattern_str, int(flags or 0)), pattern_str))
        except (re.error, TypeError, ValueError) as e:
            # Skip malformed ones so the scanner keeps running
            debug_log(f"[!] Failed to compile nuclei regex: /{pattern_str}/ : {e}\n")
    return compiled
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def filter_nuclei_patterns(
    patterns: List[Tuple[re.Pattern, str]], keep_all: bool
) -> List[Tuple[re.Pattern, str]]:
    """
    Narrow *patterns* to those whose regex source mentions a secret-related keyword.

    With *keep_all* set, the input list is returned untouched. Otherwise a new
    list is built, in the original order, containing every pair whose source
    string (compared case-insensitively) contains at least one keyword.
    """
    if keep_all:
        return patterns

    keywords = (
        "key",
        "secret",
        "token",
        "bearer",
        "api",
        "auth",
        "password",
        "passphrase",
        "private",
        "credential",
        "aws",
        "github",
        "gitlab",
        "slack",
        "discord",
        "webhook",
        "xox",
        "sg.",
        "sk_",
        "pk_",
        "rk_",
        "ssh",
        "jwt",
    )

    def _mentions_keyword(source_text: str) -> bool:
        lowered = source_text.lower()
        return any(word in lowered for word in keywords)

    return [(rx, pat) for rx, pat in patterns if _mentions_keyword(pat)]
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def chunk_patterns(
    patterns: List[Tuple[re.Pattern, str]], chunk_size: int
) -> List[List[Tuple[re.Pattern, str]]]:
    """
    Split *patterns* into consecutive slices of at most *chunk_size* items.

    Order is preserved and the last slice may be shorter. An empty input
    yields an empty list.

    Raises:
        ValueError: if *chunk_size* is less than 1. A negative size would
            otherwise produce an empty result and silently drop every pattern.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [patterns[i : i + chunk_size] for i in range(0, len(patterns), chunk_size)]
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def deduplicate_findings(findings: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Drop repeated findings, keeping the first occurrence of each.

    Two findings count as the same when their ``severity``, ``match`` and
    ``source`` values are equal; other keys (index, pattern, context) are
    ignored. Output order follows first appearance in the input.
    """
    # dict preserves insertion order, so setdefault keeps the earliest record per key.
    first_by_identity: Dict[Tuple[Any, Any, Any], Dict[str, Any]] = {}
    for finding in findings:
        identity = (finding.get("severity"), finding.get("match"), finding.get("source"))
        first_by_identity.setdefault(identity, finding)
    return list(first_by_identity.values())
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def inline_flags_pattern(pattern: str, flags: int) -> str:
    """
    Express Python regex flags as inline groups in front of *pattern*.

    Only IGNORECASE, MULTILINE and DOTALL are translated, to ``(?i)``,
    ``(?m)`` and ``(?s)`` in that order; any other flag bits are ignored.
    The result is intended for engines (such as rg) that take flags inline.
    """
    flag_markers = (
        (re.IGNORECASE, "(?i)"),
        (re.MULTILINE, "(?m)"),
        (re.DOTALL, "(?s)"),
    )
    prefix = "".join(marker for bit, marker in flag_markers if flags & bit)
    return prefix + pattern
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def ensure_rg_pcre2_available() -> None:
    """
    Fail fast if rg is present but lacks PCRE2 support.

    Does nothing when rg was not found (RG_PATH is None). Otherwise runs
    ``rg --pcre2-version``, which per ripgrep's documentation exits with an
    error when the binary was built without PCRE2. The previous probe,
    ``rg -P --version``, prints the version and succeeds regardless, so it
    could not detect a missing PCRE2 build.

    Raises:
        SystemExit: with installation guidance when the probe exits non-zero.
    """
    if not RG_PATH:
        return
    cmd = [RG_PATH, "--pcre2-version"]
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
    except (OSError, subprocess.SubprocessError) as e:
        # Could not even launch rg; report it and let the caller carry on.
        debug_log(f"[!] Failed to run rg for PCRE2 check: {e}\n")
        return
    if proc.returncode != 0:
        msg = (proc.stderr or proc.stdout or "").strip()
        raise SystemExit(
            "[!] ripgrep is installed without PCRE2 support. Install a PCRE2-enabled ripgrep "
            "(e.g., package 'ripgrep-pcre2' or 'cargo install ripgrep --features \"pcre2\" --locked').\n"
            f"rg output: {msg}"
        )
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _rg_match_context(
    text: str, line_text: str, abs_start: int, rel_start: int, match_text: str
) -> str:
    """Return roughly 60 characters of context on each side of an rg match, newlines flattened."""
    char_end = abs_start + len(match_text)
    if text[abs_start:char_end] == match_text:
        # rg's byte offset happens to equal the character index (e.g. ASCII-only
        # content before the match), so slice the in-memory text directly.
        window = text[max(0, abs_start - 60) : min(len(text), char_end + 60)]
    else:
        # rg reports byte offsets; with multi-byte characters they do not index
        # into a str. Slice the UTF-8 bytes of the line(s) rg returned instead.
        raw_line = line_text.encode("utf-8", errors="ignore")
        rel_end = rel_start + len(match_text.encode("utf-8", errors="ignore"))
        window = raw_line[max(0, rel_start - 60) : rel_end + 60].decode("utf-8", errors="ignore")
    return window.replace("\n", " ")


def _rg_findings(
    stdout: str,
    label: str,
    pattern_str: str,
    text: str,
    source: str,
    use_fp: bool,
) -> List[Dict[str, Any]]:
    """Turn rg ``--json`` output for one pattern into finding records."""
    found: List[Dict[str, Any]] = []
    # Split on "\n" only: str.splitlines() would also break on U+2028/U+0085,
    # which may appear unescaped inside a JSON string.
    for line in stdout.split("\n"):
        if not line:
            continue
        try:
            rec = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(rec, dict) or rec.get("type") != "match":
            continue
        data = rec.get("data") or {}
        # "absolute_offset" is the byte offset of the matched line(s) in the file;
        # each submatch carries "start"/"end" relative to that line text.
        line_offset = data.get("absolute_offset") or 0
        line_text = (data.get("lines") or {}).get("text") or ""
        for sub in data.get("submatches") or []:
            match_text = (sub.get("match") or {}).get("text")
            rel_start = sub.get("start")
            if match_text is None or rel_start is None:
                # Non-UTF-8 matches have no "text" field; skip them.
                continue
            start = line_offset + rel_start
            context = _rg_match_context(text, line_text, start, rel_start, match_text)
            if is_false_positive(context, use_fp):
                continue
            found.append(
                {
                    "severity": label,
                    "source": source,
                    "index": start,
                    "match": match_text,
                    "pattern": pattern_str,
                    "context": context.strip(),
                }
            )
    return found


def scan_one_chunk(
    label: str,
    chunk: List[Tuple[re.Pattern, str]],
    text: str,
    source: str,
    use_fp: bool,
    tmp_path: Optional[str],
    use_rg: bool,
) -> List[Dict[str, Any]]:
    """
    Scan *text* with every pattern in *chunk*, one pattern at a time.

    When *use_rg* is set, *tmp_path* is given, and rg is available and still
    enabled, each pattern is run through rg against *tmp_path*. Otherwise the
    compiled Python regexes are applied to *text*. Each hit becomes a dict
    with keys ``severity`` (= *label*), ``source``, ``index``, ``match``,
    ``pattern`` and ``context``. Hits whose context is judged a false
    positive are dropped when *use_fp* is set.

    If rg cannot be launched or exits with a code other than 0/1, rg is
    disabled process-wide (RG_ENABLED), any partial rg results for this chunk
    are discarded, and the whole chunk is rescanned with Python regexes.

    Note: ``index`` is a byte offset on the rg path and a character offset on
    the Python path.
    """
    global RG_ENABLED
    results: List[Dict[str, Any]] = []
    with RG_ENABLED_LOCK:
        rg_allowed = bool(use_rg and tmp_path and RG_PATH and RG_ENABLED)

    if rg_allowed:
        for rx, pattern_str in chunk:
            cmd = [
                RG_PATH,
                "--pcre2",
                "--multiline",
                "--text",
                "--json",
                "--byte-offset",
                "-o",
                "-e",
                # Carry the compiled pattern's flags over to rg; without this,
                # re.IGNORECASE rules would match case-sensitively under rg.
                inline_flags_pattern(pattern_str, rx.flags),
                tmp_path,
            ]
            proc = None
            # Acquire outside the try so the finally never releases an unheld permit.
            if RG_SEM:
                RG_SEM.acquire()
            try:
                proc = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    encoding="utf-8",
                    errors="ignore",
                )
            except Exception as e:
                debug_log(f"[!] rg failed for {source}: {e}\n")
            finally:
                if RG_SEM:
                    RG_SEM.release()

            if proc is not None and proc.returncode not in (0, 1):  # 1 = no matches
                debug_log(f"[!] rg exit {proc.returncode} for {source}: {proc.stderr.strip()}\n")
                proc = None
            if proc is None:
                with RG_ENABLED_LOCK:
                    RG_ENABLED = False
                rg_allowed = False
                # The fallback below rescans the whole chunk; drop partial rg
                # results so nothing is reported twice.
                results.clear()
                break

            results.extend(_rg_findings(proc.stdout, label, pattern_str, text, source, use_fp))

    if not rg_allowed:
        for rx, pattern_str in chunk:
            for m in rx.finditer(text):
                start, end = m.start(), m.end()
                ctx_start = max(0, start - 60)
                ctx_end = min(len(text), end + 60)
                context = text[ctx_start:ctx_end].replace("\n", " ")
                if is_false_positive(context, use_fp):
                    continue
                results.append(
                    {
                        "severity": label,  # "secret" or "secret/low"
                        "source": source,
                        "index": start,
                        "match": m.group(0),
                        "pattern": pattern_str,
                        "context": context.strip(),
                    }
                )
    return results
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def scan_text_grouped(
    text: str,
    source: str,
    pattern_groups: Dict[str, List[Tuple[re.Pattern, str]]],
    use_fp: bool,
    chunk_size: int = REGEX_CHUNK_SIZE,
) -> Tuple[List[Dict[str, Any]], int]:
    """
    Deprecated stub; always raises NotImplementedError.

    Superseded by explicit chunk submission in the main loop. The signature
    is kept only so that stale callers fail loudly.
    """
    raise NotImplementedError("scan_text_grouped is not used in the current pipeline")
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
# ==============================
|
|
511
|
+
# 5. Fetching, formatting, CLI
|
|
512
|
+
# ==============================
|
|
513
|
+
|
|
514
|
+
def build_session(
    retries: int, backoff: float, trust_env: bool, pool_maxsize: Optional[int] = None
) -> requests.Session:
    """
    Build a requests session whose HTTP and HTTPS adapters retry with backoff.

    *retries* is used for the total, connect, read and status retry budgets.
    Retries apply to GET/HEAD/OPTIONS and to responses with status 429, 500,
    502, 503 or 504; exhausting status retries does not raise. When
    *pool_maxsize* is truthy it sets both the pool size and the number of
    pools on the adapter. *trust_env* is copied onto the session.
    """
    retry_policy = Retry(
        total=retries,
        connect=retries,
        read=retries,
        status=retries,
        backoff_factor=backoff,
        allowed_methods=["GET", "HEAD", "OPTIONS"],
        status_forcelist=[429, 500, 502, 503, 504],
        raise_on_status=False,
    )
    if pool_maxsize:
        adapter = HTTPAdapter(
            max_retries=retry_policy,
            pool_maxsize=pool_maxsize,
            pool_connections=pool_maxsize,
        )
    else:
        adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.trust_env = trust_env
    # One shared adapter serves both schemes.
    for scheme in ("http://", "https://"):
        session.mount(scheme, adapter)
    return session
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def get_thread_session(
    retries: int,
    backoff: float,
    trust_env: bool,
    pool_maxsize: Optional[int],
) -> requests.Session:
    """
    Return the calling thread's session, creating it on first use.

    The session is cached on ``thread_local``; the arguments only matter for
    the call that creates it and are ignored once a session exists.
    """
    try:
        cached = thread_local.session
    except AttributeError:
        cached = None
    if cached is not None:
        return cached

    created = build_session(retries, backoff, trust_env=trust_env, pool_maxsize=pool_maxsize)
    thread_local.session = created
    return created
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def _remove_quietly(path: Optional[str]) -> None:
    """Delete *path* if it exists; ignore filesystem errors."""
    if not path:
        return
    try:
        if os.path.exists(path):
            os.remove(path)
    except OSError:
        pass


def fetch_with_curl(
    url: str,
    connect_timeout: float,
    read_timeout: float,
    verify: bool,
    user_agent: Optional[str],
    use_proxy_env: bool,
    curl_path: Optional[str],
    verbose: bool,
    max_body_bytes: Optional[int],
    require_status_200: bool,
    dest_path: Optional[str] = None,
    proxy: Optional[str] = None,
) -> str:
    """
    Fetch *url* by running curl and return what curl wrote to stdout.

    Redirects are followed (``-L``). ``--max-time`` is the integer sum of the
    connect and read timeouts. When *dest_path* is given the body is written
    there with ``-o``, so the returned string does not contain the body.
    When *require_status_200* is set, the HTTP status is appended to stdout
    via ``-w``, stripped again here, and anything other than 200 is an error.

    Proxy handling: an explicit *proxy* is passed with ``-x``. Without one,
    and with *use_proxy_env* false, the HTTP(S)/ALL proxy variables are
    removed from the child environment and ``NO_PROXY=*`` is set.

    The URL is passed after ``--`` so a value beginning with ``-`` cannot be
    interpreted as a curl option. On any failure a partially written
    *dest_path* is removed, matching the requests-based path.

    Raises:
        RuntimeError: curl exited non-zero, the status could not be parsed,
            or the status was not 200 while *require_status_200* is set.
    """
    total_timeout = max(int(connect_timeout + read_timeout), int(read_timeout))
    curl_bin = curl_path or shutil.which("curl") or "curl"
    cmd = [
        curl_bin,
        "-L",
        "--connect-timeout",
        str(connect_timeout),
        "--max-time",
        str(total_timeout),
    ]
    if max_body_bytes is not None:
        cmd.extend(["--max-filesize", str(max_body_bytes)])
    if not verify:
        cmd.append("-k")
    if user_agent:
        cmd.extend(["-A", user_agent])
    if verbose:
        cmd.append("-v")
    if require_status_200:
        cmd.extend(["-w", "\\n%{http_code}"])
    if dest_path:
        cmd.extend(["-o", dest_path])
    if proxy:
        cmd.extend(["-x", proxy])
    # "--" ends option parsing: URLs come from crawled lists and are untrusted.
    cmd.extend(["--", url])

    env = None
    if not use_proxy_env and not proxy:
        proxy_vars = {"HTTP_PROXY", "HTTPS_PROXY", "ALL_PROXY"}
        env = {k: v for k, v in os.environ.items() if k.upper() not in proxy_vars}
        env["NO_PROXY"] = "*"

    result = subprocess.run(
        cmd, capture_output=True, text=True, env=env, encoding="utf-8", errors="ignore"
    )
    if result.returncode != 0:
        _remove_quietly(dest_path)
        stderr_msg = result.stderr.strip()
        raise RuntimeError(f"curl exited with code {result.returncode}: {stderr_msg}")

    stdout = result.stdout
    if require_status_200:
        # The -w suffix is "\n<code>", so the status is whatever follows the last newline.
        stdout, _, status_str = stdout.rpartition("\n")
        try:
            status_code = int(status_str.strip())
        except ValueError:
            _remove_quietly(dest_path)
            raise RuntimeError(
                f"Could not parse HTTP status from curl output for {url}: {status_str!r}"
            )
        if status_code != 200:
            _remove_quietly(dest_path)
            raise RuntimeError(f"HTTP status {status_code} for {url}, skipping.")
    return stdout
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
def _iter_capped_chunks(resp, max_body_bytes: Optional[int]):
    """Yield non-empty body chunks from *resp*, raising RuntimeError once the byte cap is exceeded."""
    total_bytes = 0
    for chunk in resp.iter_content(chunk_size=16384):
        if not chunk:
            continue
        total_bytes += len(chunk)
        if max_body_bytes is not None and total_bytes > max_body_bytes:
            raise RuntimeError(
                f"Body size exceeded limit ({max_body_bytes} bytes), skipping."
            )
        yield chunk


def fetch_url_body(
    url: str,
    session: requests.Session,
    connect_timeout: float,
    read_timeout: float,
    verify: bool,
    user_agent: Optional[str],
    use_curl: bool,
    curl_path: Optional[str],
    curl_verbose: bool,
    trust_env: bool,
    max_body_bytes: Optional[int],
    dest_path: Optional[str],
    require_status_200: bool,
    proxy: Optional[str] = None,
) -> str:
    """
    Fetch *url* through curl or through *session*.

    With *use_curl* set, the call is delegated to fetch_with_curl() and its
    result is returned as-is (*trust_env* becomes its ``use_proxy_env``).

    Otherwise the body is streamed with requests:
      - if *dest_path* is given, bytes are written to that file and
        *dest_path* is returned; on any error the partial file is removed
        and the error re-raised;
      - if not, the body is collected in memory and returned decoded with the
        response's declared encoding (UTF-8 when none is declared or when the
        declared charset is unknown to Python), ignoring undecodable bytes.

    Raises:
        RuntimeError: status is not 200 while *require_status_200* is set, or
            the body exceeds *max_body_bytes*.
    """
    if use_curl:
        return fetch_with_curl(
            url=url,
            connect_timeout=connect_timeout,
            read_timeout=read_timeout,
            verify=verify,
            user_agent=user_agent,
            use_proxy_env=trust_env,
            curl_path=curl_path,
            verbose=curl_verbose,
            max_body_bytes=max_body_bytes,
            require_status_200=require_status_200,
            dest_path=dest_path,
            proxy=proxy,
        )

    headers = {}
    if user_agent:
        headers["User-Agent"] = user_agent
    proxies = {"http": proxy, "https": proxy} if proxy else None

    if dest_path:
        # Stream to file to avoid buffering huge bodies in memory
        try:
            with session.get(
                url,
                headers=headers,
                timeout=(connect_timeout, read_timeout),
                verify=verify,
                stream=True,
                proxies=proxies,
            ) as resp, open(dest_path, "wb") as out:
                if require_status_200 and resp.status_code != 200:
                    raise RuntimeError(f"HTTP status {resp.status_code} for {url}, skipping.")
                for chunk in _iter_capped_chunks(resp, max_body_bytes):
                    out.write(chunk)
            return dest_path
        except Exception:
            # On failure, clean partial file if present
            try:
                if os.path.exists(dest_path):
                    os.remove(dest_path)
            except OSError:
                pass
            raise

    with session.get(
        url,
        headers=headers,
        timeout=(connect_timeout, read_timeout),
        verify=verify,
        stream=True,
        proxies=proxies,
    ) as resp:
        if require_status_200 and resp.status_code != 200:
            raise RuntimeError(f"HTTP status {resp.status_code} for {url}, skipping.")

        payload = b"".join(_iter_capped_chunks(resp, max_body_bytes))
        encoding = resp.encoding or "utf-8"
        try:
            return payload.decode(encoding, errors="ignore")
        except LookupError:
            # The server declared a charset Python has no codec for.
            return payload.decode("utf-8", errors="ignore")
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def normalize_url(raw: str, auto_http: bool) -> Optional[str]:
    """
    Tidy one line of URL input.

    Surrounding whitespace is stripped. Blank lines and ``#`` comments give
    None. Anything containing ``://`` is returned as-is. Scheme-relative
    (``//host/...``) and bare host-like entries get an ``http`` scheme when
    *auto_http* is set; otherwise they are rejected, and a skipped bare entry
    is reported through debug_log().
    """
    candidate = raw.strip()
    if candidate == "" or candidate.startswith("#"):
        return None

    if candidate.startswith("//"):
        if auto_http:
            return f"http:{candidate}"
        return None

    if "://" in candidate:
        return candidate

    if auto_http and re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(:\d+)?(/.*)?$", candidate):
        return f"http://{candidate}"

    debug_log(f"[!] Skipping URL without scheme: {candidate}\n")
    return None
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
def record_failed_urls(urls: List[str], log_path: str = FAILED_FETCH_LOG) -> None:
    """
    Append each non-empty URL in *urls* to *log_path*, one per line.

    URLs are stripped before writing. Nothing is opened when *urls* is empty.
    Any failure is reported through debug_log() instead of being raised.
    """
    if not urls:
        return
    try:
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.writelines(entry.strip() + "\n" for entry in urls if entry)
    except Exception as e:
        debug_log(f"[!] Failed to record failed URLs to {log_path}: {e}\n")
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
def load_urls_from_file(path: str, auto_http: bool) -> List[str]:
    """
    Read *path* line by line and return the URLs normalize_url() accepts.

    Lines that normalize to None (blank, comments, rejected entries) are
    dropped; file order is preserved. Undecodable bytes are ignored.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        normalized = (normalize_url(raw_line, auto_http=auto_http) for raw_line in handle)
        # Materialize inside the with-block, while the file is still open.
        return [entry for entry in normalized if entry]
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
def format_finding(rec: Dict[str, Any], color: bool) -> str:
    """
    Render one finding as ``[label] 'match' [source]``.

    The match is shown via repr(). With *color* set, label, match and source
    are each wrapped in their ANSI color and a reset; a label with no
    configured color is printed plain.
    """
    label, match_repr, src = rec["severity"], repr(rec["match"]), rec["source"]
    if not color:
        return f"[{label}] {match_repr} [{src}]"

    tint = ANSI_COLORS_LABEL.get(label, "")
    shown_label = f"{tint}{label}{ANSI_RESET}" if tint else label
    shown_match = f"{ANSI_MATCH}{match_repr}{ANSI_RESET}"
    shown_source = f"{ANSI_SOURCE}{src}{ANSI_RESET}"
    return f"[{shown_label}] {shown_match} [{shown_source}]"
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
def main():
    """Command-line entry point: fetch URLs in batches and scan them for secrets.

    Workflow, as implemented below:

    1. Parse CLI arguments and derive default output paths.
    2. Reset per-run module state (``STOP_EVENT``, ``RG_ENABLED``, ``QUIET``,
       ``PROXY_LIST``).
    3. Assemble the regex groups and write them as chunked pattern files for
       ripgrep under ``./patterns_split``.
    4. Fetch URL bodies into ``./temp`` / ``./temp2`` in alternating batches;
       while ripgrep scans one directory the next batch is downloaded into
       the other.
    5. If ripgrep has been disabled, scan the batch with the Python regex
       fallback instead.
    6. Print findings and/or append them to the output file, then optionally
       invoke ``html_secret_gen.py`` to build an HTML report.

    Raises:
        SystemExit: on argument errors, when the output file cannot be
            opened, when ripgrep reports a PCRE2 problem, or when HTML
            report generation fails.
    """
    # All three names are rebound in this function. RG_ENABLED in particular
    # must be declared global here: the nested run_rg_on_dir() clears the
    # module-level flag when rg fails, and without this declaration main()
    # would keep reading its own stale local copy and never switch to the
    # Python fallback scanner.
    global QUIET, PROXY_LIST, RG_ENABLED

    parser = argparse.ArgumentParser(
        description=(
            "Web Secret Scanner - Detect exposed secrets, API keys, tokens, and credentials in web responses.\n\n"
            "Scans URLs for sensitive data leaks using pattern matching with nuclei-regex signatures\n"
            "and custom secret detection rules. Supports single URL (-u) or batch scanning from file (-l).\n\n"
            "Output format: [secret] 'matched_value' [source_url]"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("-u", "--url", help="Single URL to scan.")
    parser.add_argument("-l", "--list", help="File containing URLs (one per line).")
    parser.add_argument(
        "--timeout",
        type=float,
        default=None,
        help="Request timeout in seconds (connect + read). Default: 95s total.",
    )
    parser.add_argument(
        "--retries",
        type=int,
        default=3,
        help="Number of HTTP retries for fetch errors. Default: 3.",
    )
    parser.add_argument(
        "--secure",
        dest="insecure",
        action="store_false",
        default=True,
        help="Enable TLS certificate verification. Default: disabled (insecure).",
    )
    parser.add_argument(
        "--proxy",
        type=str,
        default=None,
        help=(
            "Proxy configuration: comma-separated proxies or path to file. "
            "Format: ip:port:username:password (e.g., 185.195.222.170:3199:user:pass). "
            "Proxies are randomly shuffled for each request."
        ),
    )
    parser.add_argument(
        "--user-agent",
        type=str,
        default=None,
        nargs="?",
        const="",
        help="Override User-Agent header (empty to use client default).",
    )
    parser.add_argument(
        "--npb",
        dest="progress",
        action="store_false",
        default=True,
        help="No Progress Bar - disable the progress bar display.",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress all terminal output except the progress bar.",
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=REGEX_CHUNK_SIZE,
        help="Number of regex patterns per processing chunk. Default: 50.",
    )
    parser.add_argument(
        "--all-patterns",
        action="store_true",
        help="Include noisy patterns (IPs, endpoints, emails). Default: secrets-only.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default=None,
        help="Output file path. Default: <list_file>_secrets.txt when --list is used.",
    )
    parser.add_argument(
        "--html-output",
        nargs="?",
        const="",
        help="Generate HTML report. Optional path (defaults to <output>.html).",
    )
    parser.add_argument(
        "--append",
        action="store_true",
        help="Append to the output file instead of overwriting.",
    )
    parser.add_argument(
        "--nou",
        dest="number_of_urls",
        type=int,
        default=6,
        help=(
            "Number of URLs to fetch concurrently. Default: 6. "
            "NOTE: This controls URL fetching parallelism only; actual thread count may be "
            "significantly higher due to regex processing workers."
        ),
    )
    parser.add_argument(
        "--no-color",
        dest="color",
        action="store_false",
        default=True,
        help="Disable colored output.",
    )
    args = parser.parse_args()

    # Derive the output file name from the list file when none was given.
    if args.output is None and args.list:
        list_stem = os.path.splitext(os.path.basename(args.list.rstrip(os.sep)))[0] or "scan"
        args.output = f"{list_stem}_secrets.txt"

    # --html-output may be passed bare (const=""), so "requested" is "not None".
    html_requested = args.html_output is not None
    html_output_path: Optional[str] = None
    if html_requested:
        if not args.output:
            parser.error("--html-output requires --output to be set.")
        base, _ = os.path.splitext(args.output)
        html_output_path = args.html_output if args.html_output else (base or args.output) + ".html"

    # Reset globals per run
    STOP_EVENT.clear()
    with RG_ENABLED_LOCK:
        RG_ENABLED = bool(RG_PATH)

    # Ensure rg supports PCRE2; fail fast with guidance if not
    ensure_rg_pcre2_available()

    if not args.url and not args.list:
        parser.error("You must provide --url or --list")

    if args.progress and Progress is None:
        debug_log("[i] Disabling progress (rich not installed).\n")
        args.progress = False

    # Set global quiet flag for helpers
    QUIET = args.quiet

    # Initialize proxy list if provided
    if args.proxy:
        PROXY_LIST = load_proxies(args.proxy)
        if PROXY_LIST:
            debug_log(f"[i] Loaded {len(PROXY_LIST)} proxies for rotation.\n")
        else:
            debug_log("[!] No valid proxies loaded from --proxy argument.\n")
    else:
        PROXY_LIST = []

    max_parallel = max(1, args.number_of_urls)
    if max_parallel > MAX_PARALLEL_HARD_LIMIT:
        debug_log(
            f"[i] Limiting parallel URL workers to {MAX_PARALLEL_HARD_LIMIT} "
            f"(requested {max_parallel})."
        )
        max_parallel = MAX_PARALLEL_HARD_LIMIT
    fetch_workers = max_parallel
    pool_maxsize = fetch_workers * 2  # allow multiple connections per worker

    # Automatically avoid spawning tons of curl processes at high concurrency
    use_curl = DEFAULT_USE_CURL
    if use_curl and max_parallel > 8:
        debug_log("[i] Disabling curl for high concurrency to reduce process overhead.")
        use_curl = False

    if args.insecure:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    nuclei_patterns = filter_nuclei_patterns(
        load_embedded_nuclei_patterns(), keep_all=False
    )

    # Build pattern groups:
    #   [secret]      => nuclei_regex + critical/high/medium
    #   [secret/low]  => low-generic patterns
    # Noisy medium patterns are excluded unless --all-patterns is given.
    noisy_medium_patterns = {
        r"[\"'](10\.\d{1,3}\.\d{1,3}\.\d{1,3})[\"']",
        r"[\"'](172\.(1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3})[\"']",
        r"[\"'](192\.168\.\d{1,3}\.\d{1,3})[\"']",
        r"[\"']/admin[/a-zA-Z0-9_-]*[\"']",
        r"[\"']/debug[/a-zA-Z0-9_-]*[\"']",
        r"[\"']/api/internal[/a-zA-Z0-9_-]*[\"']",
        r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    }

    secret_patterns: List[Tuple[re.Pattern, str]] = []
    secret_patterns.extend(nuclei_patterns)
    secret_patterns.extend(COMPILED_SEVERITY["critical"])
    secret_patterns.extend(COMPILED_SEVERITY["high"])

    for rx, pat in COMPILED_SEVERITY["medium"]:
        if args.all_patterns or pat not in noisy_medium_patterns:
            secret_patterns.append((rx, pat))

    secret_low_patterns: List[Tuple[re.Pattern, str]] = (
        COMPILED_SEVERITY["low"] if args.all_patterns else []
    )

    pattern_groups: Dict[str, List[Tuple[re.Pattern, str]]] = {
        "secret": secret_patterns,
        "secret/low": secret_low_patterns,
    }

    # Collect URLs
    urls: List[str] = []
    auto_http = True  # Always auto-prepend http:// for URLs without scheme
    if args.url:
        norm = normalize_url(args.url, auto_http=auto_http)
        if norm:
            urls.append(norm)
    if args.list:
        urls.extend(load_urls_from_file(args.list, auto_http=auto_http))

    # Prepare pattern chunks for rg to avoid PCRE2 size issues
    max_pattern_len = 512  # tighter cap to avoid PCRE2 size errors
    patterns_dir = os.path.join(os.getcwd(), "patterns_split")
    os.makedirs(patterns_dir, exist_ok=True)
    # Drop pattern files left over from a previous run (best effort).
    for entry in os.scandir(patterns_dir):
        try:
            os.remove(entry.path)
        except OSError:
            pass
    patterns_per_file = 50
    pattern_lines: List[str] = []
    skipped_patterns = 0
    # Feed rg the same filtered groups the Python fallback uses. Building the
    # files from the raw severity tables would make rg scan the noisy and
    # low-severity patterns even without --all-patterns.
    for rules in (secret_patterns, secret_low_patterns):
        for rx, pat in rules:
            pat_inline = inline_flags_pattern(pat, rx.flags)
            if len(pat_inline) > max_pattern_len:
                skipped_patterns += 1
                continue
            pattern_lines.append(pat_inline)
    pattern_files: List[str] = []
    for i in range(0, len(pattern_lines), patterns_per_file):
        chunk_lines = pattern_lines[i : i + patterns_per_file]
        p_path = os.path.join(patterns_dir, f"patterns_{i//patterns_per_file}.txt")
        with open(p_path, "w", encoding="utf-8", errors="ignore") as pf:
            pf.write("\n".join(chunk_lines))
        pattern_files.append(p_path)
    debug_log(f"[i] Wrote {len(pattern_lines)} patterns into {len(pattern_files)} chunk file(s) under {patterns_dir}")
    if skipped_patterns:
        debug_log(f"[i] Skipped {skipped_patterns} patterns that exceeded {max_pattern_len} chars (PCRE2 limit)")

    # Build pattern chunks for Python fallback scanning
    pattern_chunks: List[Tuple[str, List[Tuple[re.Pattern, str]]]] = []
    for label, rules in pattern_groups.items():
        if not rules:
            continue
        for chunk in chunk_patterns(rules, args.chunk_size):
            pattern_chunks.append((label, chunk))

    # Two scratch directories: one is scanned while the other is being filled.
    temp_dir = os.path.join(os.getcwd(), "temp")
    temp_dir2 = os.path.join(os.getcwd(), "temp2")
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(temp_dir2, exist_ok=True)

    def clear_dir(path: str):
        """Remove every entry directly inside ``path``, ignoring OS errors."""
        for entry in os.scandir(path):
            try:
                os.remove(entry.path)
            except OSError:
                pass

    def fetch_batch(batch_urls: List[str], directory: str) -> Tuple[Dict[str, str], List[str]]:
        """Download ``batch_urls`` concurrently into ``directory``.

        Returns ``(mapping, failed_urls)`` where ``mapping`` maps each saved
        file path to its URL and ``failed_urls`` lists URLs whose fetch
        raised or did not finish within ``DEFAULT_FETCH_TIMEOUT``. Failed
        URLs are also appended to the failed-fetch log.
        """
        os.makedirs(directory, exist_ok=True)
        mapping: Dict[str, str] = {}
        failed_urls: List[str] = []
        if not batch_urls:
            return mapping, failed_urls
        batch_timeout = DEFAULT_FETCH_TIMEOUT

        def worker(url: str) -> Optional[Tuple[str, str]]:
            """Fetch one URL to a file; return ``(file, url)`` or ``None``."""
            if STOP_EVENT.is_set():
                return None
            session = get_thread_session(**session_params)
            # Get a random proxy for this request (shuffled rotation)
            proxy = get_random_proxy()
            try:
                fname = os.path.join(directory, f"u{hash(url)}_{int(time.time()*1000)}.txt")
                fetch_url_body(
                    url,
                    session=session,
                    connect_timeout=connect_timeout,
                    read_timeout=read_timeout,
                    verify=not args.insecure,
                    user_agent=user_agent,
                    use_curl=use_curl,
                    curl_path=None,
                    curl_verbose=False,
                    trust_env=trust_env,
                    max_body_bytes=max_body_bytes,
                    dest_path=fname,
                    require_status_200=DEFAULT_REQUIRE_STATUS_200,
                    proxy=proxy,
                )
            except Exception:
                # Any fetch error just marks the URL as failed; the caller
                # reports it and it is written to the failed-fetch log.
                failed_urls.append(url)
                return None
            return fname, url

        with ThreadPoolExecutor(max_workers=fetch_workers) as ex:
            fut_map = {ex.submit(worker, u): u for u in batch_urls}
            try:
                for fut in as_completed(fut_map, timeout=batch_timeout):
                    res = fut.result()
                    if res:
                        fname, url = res
                        mapping[fname] = url
            except TimeoutError:
                # The batch deadline passed: everything still pending counts
                # as failed for this run.
                for fut, url in fut_map.items():
                    if not fut.done():
                        fut.cancel()
                        failed_urls.append(url)
            except KeyboardInterrupt:
                STOP_EVENT.set()
                ex.shutdown(wait=False, cancel_futures=True)
                raise
        if failed_urls:
            record_failed_urls(sorted(set(failed_urls)))
        return mapping, failed_urls

    def run_rg_on_dir(directory: str, file_map: Dict[str, str], pattern_files: List[str]) -> Dict[str, List[Dict[str, Any]]]:
        """Run ripgrep over ``directory`` once per pattern file.

        Returns a dict mapping every URL in ``file_map`` to the list of
        finding records parsed from rg's JSON output. Clears the module-level
        ``RG_ENABLED`` flag when rg cannot be started or exits with an
        unexpected status, so the caller can fall back to Python scanning.
        """
        # Normalize mapping keys to absolute, case-folded paths for rg output
        def _norm(p: str) -> str:
            return os.path.normcase(os.path.abspath(p))

        abs_map: Dict[str, str] = {_norm(p): u for p, u in file_map.items()}
        results: Dict[str, List[Dict[str, Any]]] = {url: [] for url in abs_map.values()}
        if not abs_map or not pattern_files:
            return results
        global RG_ENABLED
        with RG_ENABLED_LOCK:
            if not RG_ENABLED:
                return results
        rg_timeout = DEFAULT_SCAN_TIMEOUT
        for pf in pattern_files:
            if STOP_EVENT.is_set():
                break
            cmd = [
                RG_PATH or "rg",
                "--pcre2",
                "--multiline",
                "--text",
                "--json",
                "--byte-offset",
                "-f",
                pf,
                directory,
            ]
            try:
                proc = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    encoding="utf-8",
                    errors="ignore",
                    timeout=rg_timeout,
                )
            except subprocess.TimeoutExpired:
                debug_log(f"[!] rg timed out after {rg_timeout}s using {pf}; skipping remaining pattern chunks.\n")
                break
            except Exception as e:
                debug_log(f"[!] rg failed to start: {e}\n")
                with RG_ENABLED_LOCK:
                    RG_ENABLED = False
                break

            if proc.returncode not in (0, 1):  # 1 = no matches
                stderr_msg = proc.stderr.strip()
                debug_log(f"[!] rg exit {proc.returncode} on {pf}: {stderr_msg}\n")
                # NOTE(review): the second clause subsumes the first, so any
                # stderr text mentioning "PCRE2" aborts the whole run.
                if "PCRE2 is not available" in stderr_msg or "PCRE2" in stderr_msg:
                    raise SystemExit(
                        "[!] ripgrep is installed without PCRE2 support. Install a PCRE2-enabled ripgrep "
                        "(e.g., package 'ripgrep-pcre2' or 'cargo install ripgrep --features \"pcre2\" --locked')."
                    )
                with RG_ENABLED_LOCK:
                    RG_ENABLED = False
                break

            # rg --json emits one JSON object per line; only "match" records
            # carry findings.
            for line in proc.stdout.splitlines():
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if rec.get("type") != "match":
                    continue
                path = rec.get("data", {}).get("path", {}).get("text")
                if not path:
                    continue
                abs_path = _norm(path)
                if abs_path not in abs_map:
                    continue
                url = abs_map[abs_path]
                for sub in rec.get("data", {}).get("submatches", []):
                    if "match" not in sub:
                        continue
                    start = sub.get("byte_offset", sub.get("start"))
                    if start is None:
                        continue
                    match_text = sub["match"].get("text", "")
                    line_text = rec.get("data", {}).get("lines", {}).get("text", "")
                    if len(line_text) > 500:
                        line_text = line_text[:500] + "...(truncated)"
                    results[url].append(
                        {
                            "severity": "secret",
                            "source": url,
                            "index": start,
                            "match": match_text,
                            "pattern": rec.get("data", {}).get("pattern", ""),
                            "context": line_text.strip(),
                        }
                    )
        return results

    def scan_files_python(file_map: Dict[str, str]) -> Dict[str, List[Dict[str, Any]]]:
        """Scan the saved bodies with the compiled Python regex chunks.

        Returns a dict mapping every URL in ``file_map`` to its findings.
        Files that cannot be read are logged and skipped.
        """
        results: Dict[str, List[Dict[str, Any]]] = {url: [] for url in file_map.values()}
        for path, url in file_map.items():
            if STOP_EVENT.is_set():
                break
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    body = f.read()
            except OSError as e:
                debug_log(f"[!] Failed to read {path} for {url}: {e}\n")
                continue
            for label, chunk in pattern_chunks:
                results[url].extend(
                    scan_one_chunk(
                        label,
                        chunk,
                        body,
                        url,
                        True,  # use false-positive filtering
                        None,
                        False,
                    )
                )
        return results

    # Config (read by the nested helpers above through closure)
    connect_timeout = args.timeout if args.timeout is not None else DEFAULT_CONNECT_TIMEOUT
    read_timeout = args.timeout if args.timeout is not None else DEFAULT_READ_TIMEOUT
    trust_env = True  # Honor proxy environment variables
    user_agent = None if args.user_agent in (None, "") else args.user_agent
    max_body_bytes = DEFAULT_MAX_BODY_BYTES
    session_params = dict(
        retries=args.retries,
        backoff=1.0,  # Default backoff factor
        trust_env=trust_env,
        pool_maxsize=pool_maxsize,
    )

    total_urls = len(urls)
    progress_enabled = args.progress and total_urls > 0 and Progress is not None
    progress_cm = None
    progress = None
    progress_task = None
    if progress_enabled:
        progress_cm = Progress(
            SpinnerColumn(),
            BarColumn(bar_width=PROGRESS_BAR_WIDTH),
            MofNCompleteColumn(),
            TimeElapsedColumn(),
            TimeRemainingColumn(),
            TextColumn("{task.fields[url]}", justify="left"),
            transient=False,
            console=None,
        )
        # Entered manually; the matching __exit__ is in the finally block below.
        progress = progress_cm.__enter__()
        progress_task = progress.add_task("scan", total=total_urls, url="")

    # Determine output behavior: default overwrites once, then appends per write
    output_mode = "a"
    if args.output and not args.append:
        try:
            with open(args.output, "w", encoding="utf-8") as _:
                pass  # truncate existing file for a fresh run
        except OSError as e:
            raise SystemExit(f"[!] Failed to open output file {args.output}: {e}")

    def advance_progress(url_msg: str):
        """Advance the progress bar by one step, showing ``url_msg``."""
        if not progress_enabled or progress is None or progress_task is None:
            return
        progress.update(progress_task, advance=1, url=url_msg)

    remaining = list(urls)
    processed = 0
    try:
        batch_dir = temp_dir
        buffer_dir = temp_dir2
        clear_dir(batch_dir)
        clear_dir(buffer_dir)
        batch_map: Dict[str, str] = {}
        buffer_map: Dict[str, str] = {}
        buffer_failed: List[str] = []
        while remaining or batch_map or buffer_map:
            # Bound up front so the Ctrl-C handler can tell whether an rg
            # executor exists for this iteration.
            rg_thread: Optional[ThreadPoolExecutor] = None
            try:
                if STOP_EVENT.is_set():
                    break
                # Fill batch if empty
                if not batch_map and remaining:
                    take = remaining[:max_parallel]
                    remaining = remaining[max_parallel:]
                    batch_map, failed_batch = fetch_batch(take, batch_dir)
                    for url in failed_batch:
                        advance_progress(f"{url} (failed)")
                        processed += 1
                    # If this batch yielded nothing (all failed), try next chunk
                    if not batch_map:
                        continue
                if not batch_map and not remaining and not buffer_map:
                    # nothing left anywhere
                    break

                # Start rg on batch
                rg_results: Dict[str, List[Dict[str, Any]]] = {}
                rg_thread = ThreadPoolExecutor(max_workers=1)
                rg_future = rg_thread.submit(run_rg_on_dir, batch_dir, batch_map, pattern_files)

                # While rg runs, fill buffer up to nou
                buffer_map = {}
                if remaining:
                    take_buf = remaining[:max_parallel]
                    remaining = remaining[max_parallel:]
                    buffer_map, buffer_failed = fetch_batch(take_buf, buffer_dir)
                    for url in buffer_failed:
                        advance_progress(f"{url} (failed)")
                        processed += 1

                rg_results = rg_future.result()
                rg_thread.shutdown(wait=True, cancel_futures=True)
                # run_rg_on_dir clears RG_ENABLED when rg is unusable; in
                # that case rescan this batch with the Python patterns.
                with RG_ENABLED_LOCK:
                    rg_active = RG_ENABLED
                if not rg_active:
                    rg_results = scan_files_python(batch_map)
            except KeyboardInterrupt:
                STOP_EVENT.set()
                if rg_thread is not None:
                    rg_thread.shutdown(wait=False, cancel_futures=True)
                break

            # Emit results per URL after rg finishes (dedupe per URL only)
            for url in batch_map.values():
                findings = deduplicate_findings(rg_results.get(url, []))
                if findings:
                    if args.output:
                        with open(args.output, output_mode, encoding="utf-8") as tf:
                            tf.write(f"=== Results for {url} ===\n")
                            for r in findings:
                                tf.write(format_finding(r, color=False) + "\n")
                            tf.write("\n")
                    if not args.quiet:
                        print(f"=== Results for {url} ===")
                        for r in findings:
                            print(format_finding(r, color=args.color))
                        print()
                advance_progress(url)
                processed += 1

            clear_dir(batch_dir)
            # Swap batch and buffer for next cycle
            batch_dir, buffer_dir = buffer_dir, batch_dir
            batch_map = buffer_map
            buffer_map = {}
            buffer_failed = []
    except KeyboardInterrupt:
        STOP_EVENT.set()
    finally:
        # Leave the progress display in a consistent state even on interrupt.
        if progress_enabled and progress is not None and progress_task is not None:
            try:
                progress.update(progress_task, completed=processed)
            except Exception:
                pass
        if progress_cm is not None:
            try:
                progress_cm.__exit__(None, None, None)
            except Exception:
                pass

    if html_requested and html_output_path:
        # The HTML generator is a sibling script run with the same interpreter.
        html_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "html_secret_gen.py")
        cmd = [
            sys.executable or "python3",
            html_script,
            "-i",
            args.output,
            "-o",
            html_output_path,
        ]
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as e:
            raise SystemExit(f"[!] Failed to generate HTML report: {e}")
|
|
1386
|
+
|
|
1387
|
+
|
|
1388
|
+
# Run the scanner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|