@paa1997/metho 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1389 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import base64
4
+ import json
5
+ import tempfile
6
+ import os
7
+ import random
8
+ import re
9
+ import shutil
10
+ import signal
11
+ import subprocess
12
+ from requests.adapters import HTTPAdapter
13
+ import sys
14
+ import zlib
15
+ from urllib3.util.retry import Retry
16
+ from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED, as_completed, TimeoutError
17
+ from typing import List, Dict, Any, Tuple, Optional, Union
18
+ import time
19
+ import threading
20
+
21
+ import requests
22
+ import urllib3
23
+
24
+ try:
25
+ from rich.progress import (
26
+ Progress,
27
+ BarColumn,
28
+ TextColumn,
29
+ TimeElapsedColumn,
30
+ TimeRemainingColumn,
31
+ MofNCompleteColumn,
32
+ SpinnerColumn,
33
+ )
34
+ except ImportError:
35
+ Progress = None
36
+
37
+
38
+ # ==============================
39
+ # 1. Generic secret patterns (same style as before)
40
+ # We'll treat critical/high/medium as [secret]
41
+ # and low as [secret/low]
42
+ # ==============================
43
+
44
+ SEVERITY_PATTERNS_RAW: Dict[str, List[Tuple[str, int]]] = {
45
+ "critical": [
46
+ (r"sk-[a-zA-Z0-9]{20}T3BlbkFJ[a-zA-Z0-9]{20}", 0),
47
+ (r"sk-proj-[a-zA-Z0-9\-_]{80,}", 0),
48
+ (r"sk-ant-api[a-zA-Z0-9\-_]{80,}", 0),
49
+ (r"AKIA[A-Z0-9]{16}", 0),
50
+ (r"ASIA[A-Z0-9]{16}", 0),
51
+ (r"sk_live_[a-zA-Z0-9]{24,}", 0),
52
+ (r"rk_live_[a-zA-Z0-9]{24,}", 0),
53
+ (r"ghp_[a-zA-Z0-9]{36}", 0),
54
+ (r"gho_[a-zA-Z0-9]{36}", 0),
55
+ (r"github_pat_[a-zA-Z0-9_]{36,}", 0),
56
+ (r"glpat-[a-zA-Z0-9\-=_]{20,}", 0),
57
+ (r"-----BEGIN\s*(RSA|DSA|EC|OPENSSH|PGP)?\s*PRIVATE KEY-----", 0),
58
+ (r"mongodb(\+srv)?://[^:]+:[^@]+@", 0),
59
+ (r"postgres(ql)?://[^:]+:[^@]+@", 0),
60
+ (r"mysql://[^:]+:[^@]+@", 0),
61
+ (r"xox[baprs]-[a-zA-Z0-9\-]{10,}", 0),
62
+ (r"[MN][A-Za-z0-9]{23,}\.[A-Za-z0-9_-]{6}\.[A-Za-z0-9_-]{27}", 0),
63
+ (r"SK[a-f0-9]{32}", 0),
64
+ (r"SG\.[a-zA-Z0-9_-]{22}\.[a-zA-Z0-9_-]{43}", 0),
65
+ ],
66
+ "high": [
67
+ (r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*", 0),
68
+ (r"api[_-]?key[\"'\s]*[:=][\"'\s]*[a-zA-Z0-9]{32,}", re.IGNORECASE),
69
+ (r"api[_-]?secret[\"'\s]*[:=][\"'\s]*[a-zA-Z0-9]{32,}", re.IGNORECASE),
70
+ (r"[Bb]earer\s+[a-zA-Z0-9\-=._+/\\]{20,500}", 0),
71
+ (r"[Bb]asic\s+[A-Za-z0-9+/]{18,}={0,2}", 0),
72
+ (r"AIza[0-9A-Za-z_\-]{35}", 0),
73
+ (r"sk_test_[a-zA-Z0-9]{24,}", 0),
74
+ (r"pk_live_[a-zA-Z0-9]{24,}", 0),
75
+ (r"[0-9]+-[a-z0-9]{32}\.apps\.googleusercontent\.com", 0),
76
+ (r"https://hooks\.slack\.com/services/[a-zA-Z0-9/_-]+", 0),
77
+ (r"https://discord(app)?\.com/api/webhooks/\d+/[a-zA-Z0-9_-]+", 0),
78
+ (r"https://oapi\.dingtalk\.com/robot/send\?access_token=[a-z0-9]+", 0),
79
+ (r"sbp_[a-f0-9]{40}", 0),
80
+ (r"hf_[a-zA-Z0-9]{30,}", 0),
81
+ (r"shpat_[a-fA-F0-9]{32}", 0),
82
+ (r"shpss_[a-fA-F0-9]{32}", 0),
83
+ ],
84
+ "medium": [
85
+ (r"[\"']?password[\"']?\s*[:=]\s*[\"'][^\"']{8,}[\"']", re.IGNORECASE),
86
+ (r"[\"']?secret[\"']?\s*[:=]\s*[\"'][^\"']{8,}[\"']", re.IGNORECASE),
87
+ (r"[\"']?token[\"']?\s*[:=]\s*[\"'][^\"']{16,}[\"']", re.IGNORECASE),
88
+ (r"pk_test_[a-zA-Z0-9]{24,}", 0),
89
+ (r"[\"'](10\.\d{1,3}\.\d{1,3}\.\d{1,3})[\"']", 0),
90
+ (r"[\"'](172\.(1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3})[\"']", 0),
91
+ (r"[\"'](192\.168\.\d{1,3}\.\d{1,3})[\"']", 0),
92
+ (r"[\"']/admin[/a-zA-Z0-9_-]*[\"']", 0),
93
+ (r"[\"']/debug[/a-zA-Z0-9_-]*[\"']", 0),
94
+ (r"[\"']/api/internal[/a-zA-Z0-9_-]*[\"']", 0),
95
+ (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", 0),
96
+ ],
97
+ "low": [
98
+ (r"[\"']?[a-z_]*key[\"']?\s*[:=]\s*[a-z]", re.IGNORECASE),
99
+ (r"[\"']?[a-z_]*token[\"']?\s*[:=]\s*[a-z]", re.IGNORECASE),
100
+ (r"[\"']?[a-z_]*secret[\"']?\s*[:=]\s*[a-z]", re.IGNORECASE),
101
+ (r"=\s*function", 0),
102
+ (r":\s*function", 0),
103
+ ],
104
+ }
105
+
106
+ COMPILED_SEVERITY: Dict[str, List[Tuple[re.Pattern, str]]] = {
107
+ sev: [(re.compile(p, flags), p) for (p, flags) in rules]
108
+ for sev, rules in SEVERITY_PATTERNS_RAW.items()
109
+ }
110
+
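+ # For illustration only (not part of the published file): a minimal sketch of how the
+ # compiled severity groups above flag a response body. The key below is the well-known
+ # AWS documentation placeholder, not a real credential.
sample_body = 'const cfg = { accessKeyId: "AKIAIOSFODNN7EXAMPLE" };'
for severity, rules in COMPILED_SEVERITY.items():
    for rx, pattern_str in rules:
        for m in rx.finditer(sample_body):
            print(severity, repr(m.group(0)), "via", pattern_str)
# Expected: a single "critical" hit from the AKIA[A-Z0-9]{16} rule.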
111
+ # ==============================
112
+ # 2. False positive filters (from the extension)
113
+ # ==============================
114
+
115
+ FALSE_POSITIVE_PATTERNS_RAW: List[Tuple[str, int]] = [
116
+ (r"/cdn-cgi/challenge-platform/", 0),
117
+ (r"\d+\.\d+:\d+:[a-zA-Z0-9_-]+ivzz", 0),
118
+ (r"[\"']secret:[a-z]+_[a-z]+_[a-z]+[\"']", 0),
119
+ (r"[a-zA-Z]+Key\s*=\s*[a-z]\.define\(", 0),
120
+ (r"[a-zA-Z]+Key\s*=\s*function", 0),
121
+ (r"[a-zA-Z]+Secret\s*=\s*function", 0),
122
+ (r"PrivateKey\s*=\s*[a-z]\.define\(", 0),
123
+ (r"PublicKey\s*=\s*[a-z]\.define\(", 0),
124
+ (
125
+ r"[\"']?password[\"']?\s*[:=]\s*[\"']?(test|demo|example|placeholder|changeme|password|123456)[\"']?",
126
+ re.IGNORECASE,
127
+ ),
128
+ (
129
+ r"[\"']?secret[\"']?\s*[:=]\s*[\"']?(test|demo|example|placeholder)[\"']?",
130
+ re.IGNORECASE,
131
+ ),
132
+ (r"tokenize\s*:\s*function", 0),
133
+ (r"getToken\s*:\s*[a-z]$", 0),
134
+ (r"setPassword\s*:\s*function", 0),
135
+ # HTML head/meta blocks that aren't secrets (e.g., BMW homepage metadata)
136
+ (r"<meta\s+name=['\"]generator['\"]\s+content=['\"]bmwcom:[^>]+>", re.IGNORECASE),
137
+ (r"<meta\s+property=['\"]og:title['\"][^>]+bmw\.com", re.IGNORECASE),
138
+ (r"BMWTypeNextLatin\.woff2", re.IGNORECASE),
139
+ (r"The international BMW Website", re.IGNORECASE),
140
+ (r"og:image.*BMW_OG\.jpg", re.IGNORECASE),
141
+ ]
142
+
143
+ COMPILED_FP: List[re.Pattern] = [
144
+ re.compile(p, flags) for (p, flags) in FALSE_POSITIVE_PATTERNS_RAW
145
+ ]
146
+
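+ # For illustration only: a placeholder credential that the false-positive filters above
+ # are meant to suppress.
context = 'password = "changeme"'
print(any(fp.search(context) for fp in COMPILED_FP))  # True -> treated as a false positive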
147
+
148
+ # ==============================
149
+ # 3. Embedded nuclei_regex from background.js (compressed)
150
+ # ==============================
151
+
152
+ NUCLEI_PATTERNS_B64 = """
153
+ eJzFXQ1z2ziS/Su6uY+NJ0rGVsbZTKpSWU+Snc0mk0klubnaszwuiIRIWBRBA6RlOc5/P4AESJCS7cTz2peq2LJEvQc0Go3uxtfh4XeH0+/+cvT8QhZieXj84Og5iyJZ5WX9esHXzceHf0ynH6dTNZ3mR98fPnt61H+jeWY6XT04ut/88d14NDkaj1r4tMoEy5MaNUnrX6Vc8BwEz3Mtc1auC14wrVdSxTDgmOtFU17FzoRuaiDKtJphKNaNqDVXZ1w1LVAIrPQdRcFUmTsOxeeKa3hDbGGKMsFzX8dI8RLKRItO1RJhH6NtiZCJSlamX+u0yon69jZ4bCdfrVYNuJRJxk2bN508kksQfuGE7lq5UOKMlbzVLPt7xjR//CMNXyTznEeg1vbgihdSEfUSz1EYEqYaQcVCYcHjWf2rMoaXBBiso0baCQnuSTPsF2lR5aIMOkLzkstIxpyonU01TBtrTdQg2+DB4hN5xFVJghwnvAEuue75CGj8+UAy9o9Hd8IyIWdB4xs7LaLNiuzfFRFqiLiRCK8AVxDhdWALEYaClSbKoAWO+ZmIOC2H8QeMGdGsFES1WRjoJdGYcaYjDvQuTURXscx5NJFMzBDopYIvulCVLmXp+Aw+ELoIA9NGk4pMrntOpm3nu+KjoplVIotbD5SOuRlw5zGFIRmAE8Vl1pFyOQsYXqO4WvNSh+V1kaZ3tWCtsI1ywGN/C1C7GLo2cAIB1nicOQdur/kza1SZLdmFzNkKHG5WasZyvDa1qEtTG9csNAQ49THWfO09Z2HMLhgVaxXKlSiNYCWryrRRcuA4twmObDuH7r0LXS0JFORKEpi+GIacwvcyuJmQ6MRYgKpRFjDANCKei6RSnZtIwpKyzUSl97xsvGu9YTRphw/VziE8Ti8VP68R5aJkQ5cBqVGehwq+nUChQTXtWbl0Ic8bJ+uMKZD7E/AUVeZDhtPKJmfQBPgsegDOJ5wQ3boxdOgzxfII5JwFsL7H0hQ68FiRBDVUI3Kn9CfsjIHAU64aSyP1Vrcbw8JbT9JgaoqR16dOS1w39ZBBUIpsVq4jomFEn3XpHhBipRK4MelAM5kIFGipROF8u2qWiQgP62b1cLjDCBunCC16LQudslnG0RRSLQtWphuqjHa8rmdCuay6NI5E6qRmgyi41vcIAmcY5nVbhs4W3kH2ZpMwk1ETThDhz6pogdOrIbrP1BBmoAypMXnNRK9dElC/qJTzNqvcjA0FysToZVkEkR4KNQUOLyqKsv6aKGSPO6384gLFWewzFvGizS5rYWwLRapHn7r0C66vnepGLXGIZkBzmrhkIqOQQiFLMV/32pcoF76NisYt1wUzsHdiXjsmSotkBqI4ymQVU2hAH5yq8UmWhnrUQIFqYw1bvnIFAWEtiKEJYHN+XlHEjT2SpOhWXhWpssMyHYnvvzlbAmkU3My1oIWSJ3Z1Ida2tfBSJSwXF10WGsgRKebyqWjt6ZBZUcBln68XZJgUjlYu5UqxwvlVJMudexzg1gyhqYaozC8Ibqe8ee11odCdxGN+xjMTQjhf13oQ4E51LRWGQiS590zbhALM3dkGjmzoLfhgdR0wYEB5iQ5YeBlrGlB0ILTpzsOxSeIDN3u6kVBCqrMxhgq89CrAJCoqz+NCihwPHPM5q7LSOy5wfJLEo5HGyk5lwNUvjxMl4nZUA3q0Q2gCWLBRDqFpRIyfLvLoILSsXUfCM56LqpknTiVqYi5gyGQz/mXWB/HeCIolWfLBOhWgyK29w4kcPLhaOPjg2htU27w3GvwnOOITOOJf4YiP4YigjS4BImhHS4AI2roSIIL2qASIe3t4yF08JBwRVcYYn9GKVFWKXFxwPDSrOj8c7+ezPJ7J894UJfXUwxZKyjmIkI5m1oZlXM+lcs00qzKXYQumyivl133fJTHYD2XdbIv1XvCgw+aBh3HXkWEYHvUTcdDSP6JLJWxAu00zVOgusW18bSyHtx4ExW+hKcuOxOPnpQ2cM+fX1GWn2TXkCN0KmnYGiqghttJQNQrlyLSVAQKtqtk64UtNlvhRZWNF7QI69CStx8bvPFWyKvk+ecMqWejSz7gPzmvBEIh8wWdrQoJ22Ro6S2GXRiH1sD5Jxk9ZrPgsZs3eDbvujZahWKHUhWfcL9eDiqaDxe8JUTwROki1270zeGBg16eZOlY8FrpzKoFaF/t9SWWV5y4RaeBR4jBRkVpWWkSmh8M3jCo2m4lyeUoQhJxWAr4m4lQIt+ACCFqsnem09ZcrVP2LSqfyDJ9mcIv54UFLiAsbREJQN45jcIPlSMiFzQY3JovdWnBsR2thCR2l0HcJlx6ga7KxFhk7pViYwYUrY7JpsLtTS2pPHgMrdZmYAfE0A4+zA+B4hoVtAlu3P7f3JlhnbuSD1Syr7EY1gipksszaVes4c5N1K8fJYMEWsun9uNliBxizkuEmHAuu7MrVDD8Eh8A4oXIVwTPaHagfijGwbF24+hOtwfRdF3lmUsGUC99OwOmhgiVcxVVJYB1YtDDgXUQEVA6ptUoJhtgOF6kRHSrWrPdx0ZgnQlEsapcUY7QcpDdhwXd7Wm4QDiSKxW6f851sIvFlCHjD4txNGQqer1LhjrCH2QgL2yTTCLBzbgMJ1rn5fR0hIoK7LHIp/HlyBJgmfqZAJcDEQfrTj+pTzCaBLwA1+ley4McX2uOc5NzBQ8vsQW2SF7WTvwXF6UpEtIIiAPYTlmBXZpMAXQO06zHrxrR2LtFoXEyJjzzqYUDAEh5qeD0845p3Oxe4iSnmi/NqOTNhqLMoqndONYohQWdWO0hkiNRDBRe4gJ+f5iEpMtceG6u/HhW4B9JDkvQMj43fOzuExjWcXMHL6owlFDL2gZObBV+Hx2KBNyNcyUZz1NuVdG21UJM0LZNNg2MQEyXhm9A7UJJeakNssJ0iitpzvnIeSOZusphx/PR2zstseCgOri+xUpxxfmajMwzicq1PM2QoUwM2p5njNhHWoARhQR8XjamkpAiWOgIqXDshRCFk7LzQsredGWqIl1VWNl3XiHfRnegAOxGhTzDPnN9AgG+XYbmr6ogYiOFncgaGFrk/4Z3Ad+/QKdYvL4XLeV9wJcHQFJMyy/YAUVw5E5+q9cM4fJh1FHhcVszkOXxVYQPbFPlutqptYSRcDRWy4YPBEJ1k69OS5WK+kd5EzcltRceWPlYiI5rOMegs4XbvP8Vy2w6dQCo+yQh1njtYoIhFllT5hlnAN2XHU7i4nhQcOqiE+EqcEZYeO74GyFTyINEUN/+Ml0aUimWBL28HSyIOEmEkxvBJYuRukYkLzEeEHNhqZLIshTsfsMgkc8fl+puH24Pqejv57pyejJRw13TAkhqXhC7Y2koE76CZNI7D4Nh6ZTcfuWvTaNYxOJdLp8wvoMdVKOtCKA1eI5aJvDp3ATvBmYmZ3eEZu3XhG5KXarSp4Xh9EElaptJfbInH5wtTkyZ7093Cis18bucAGxvO8iJzR5LBpLM4l1mp80dnMoll8dMEtFV/Uc04cmtTjTdPQkWs71PBgMszRriN2
t4bRbB/fcHmC7/MyZlRmNnpsEWuS5a7hC2uw3QELF46+wMr/ckKP36cxLPoaZN7h+G1swNgSD8xABOo6C4Ug095Cre+OpaRHhpQJE0enFLSKAaoH1rkJLiCNFwah8rZXcMBHi1FHrsLNnGdXZw4M+VcSuzYO0THjrrt1fHngk0o8OWSz5SbSw/WcBCkZlMZGTUhSGqmXMlFRYgKXGMVoML7Tto0XSTjetH+RuBJxxJe/wFiYeCcowekWDaTKNNzWtOCwnTbR2gWVoUbfci2pF9Bgp5RCjcPEZ3vcAUFTDt9B+qnVUB9qe2dFNEF1c0/Hleucq5KVaG81OASIZwPQHczUSJl0m6M6zef/X1PxDvP4UxLVhDtUQ9IiBJt2xlgpmYrOtA/CAjCCynqTZAggoRiOaYZpTI3wlJczzMcBFGWkWZs7VDrqAAPSnKSWsAAO/QtxETrmwcm2cmzgY9GxZnUDjOtcn9ZPDbG3c5BI+4m/0KxzGODpKnQzOd/wCFMQEc11gXxOm0fIOkCBDmGLegUHY1maZkPIgkgoaaAYHB3KYXlUpTeuuCL3McnqIDtJZKq9AE4suh1/p+mv6R4z8pNb9tJoZlzk7vhn8rK4g1r2k8Z46FpfKIhNq3Ac35e/v819zeww4LLAfNdVRVqTohsiZucQx5pkEQUqxaT7lCw8AQAoCS6i+aRh8cFuEghz0uKA0BCVCxisQLjQZXLo2bSHqYJhcStA5hLrRmNGZhnIlqoHjbSHm6BBxaduVPAKJZOzoXic7fvBSoRgzsjyDX1cMMT39yV2AQsUFm3qAQx7Fb8E406SGruNx1iTRM/pzB4vFLcL9RaafBtnf7A494pg+DyewraHXwDGigmvJCUSyA8B3V6mOfBzcoUahOpddEutwKDm5GoFJHmTEUUOzEcvgtc2ovoYBnE2PZXFY4cMPUJoIEuVqxkkbne705Oxkvdkvidr53Kz0wjo1zkWJrCMn8fBUGniu0CMWX6LVgyHpZC6jU2jSCa4oaH8cMW1IYEVPKogWlg4aA4A9JhEimcMEMby2TE3ZYBHSxFapJRKPHcxDSTMUpqQyqa+ZWYaq3SYF8PHBQpgw4VeZ0Z2a6jOE6ioUeHFEfcaES7oJ13+3dBmaArKDDY3YobXHItAIUCwgRKYldn4CyYA8QekhX7tdf1GU2ws6VtIWPpj8kB77fvg4PjvVa8BN1gCE0Aa+/cpIDFdo0Qmki+wH7H8sR5YLRLTaJ1obyTonhkxQ1V7cjefyjc5TU6YvCDY2sClmVwP6sPTDHL3WegaFwZnrwF8qiN2TbF1t02OqzB2oDHmgEDX/K8nFeNe3Q2IfTUB1yt6+Svi4mW8JNeB5RF2gyDw0OnfEnomAeMXrk3BF1bYw7yqW5ZBgru4V5TumO/BsStUtG3MZkgdbXkCjgENYDogS3n/lYX5ClXkYz9DjOoUGMzuJ+RwGZi6Tf00IySMRvcMwhniCQrZOxUWVU5/MDzqyiAUXS0eZwewRxdtJHaQu+DjLKql6jbtmHMetyPf8TydRvrUfnpetpI5EytW9y6YiVLbBen4sDhzjPmDlxB6mkfGX/b21UEhHVAJwRqfNaulewW4GETMT0aY1xtn3bnelESYR36PrS6gwqYsjtvgpCkPTIID01YamaMUUnfAnbOX5zR8ZD16hOmKKbNiVZWG9jGck44WcCaMbGETz0HqLoqkLd+BMioS+VDSKwqiy7rixawaIcm+EYnh20v5dOlQk9IOvQwlMA5Te56C6ppxChVcukjrbniGr54JWCgsikpNyqTw6eEozmFwLkqxVxEPv5Ao+d6rfESZmXpz6shWKQdwNfnIfe9ldbjhnNRBLNMJfh8BTMq3kif7jTj2TlB6t2BYpV8VuVxxrW4cDffkSyAmFUiayQ8U0YN06EZxsm93lTE/U3ld3OzyXZSwhTMTMmV0SzTpaMFXa0aEp8SIWViVoCDiTySFdWzrOJLQdI5A2TYwSY9UGy3HyB7p4sKHo6KG8R6qOiYcibyUrE1Gg/eLwJYkrKGiyKBWhwwICd5BrhgsTRz2sj9rw4SvqoKvxvKIN7NREhHRDgQGxL4FcGd03InzsuQjlZaQY1AY8IWH49ANnfUBLgg02PCC+nuO+jKC776ewsPWRe7A12n1BeHjcEMDhWiuul+42yhmiWOQZlDC79LmRrrCLxoGiI0B9q5My6XsOkflwpibfPiWtam4VhUEtixHjJRgZGuWB/5rkaQraSU5s0RSgWOGWzGD75kzfjFLkspYoIoxMATGAL8DQYesikmKO/hQW3eRoVLv7Bq4VhcT63bkCvQNU7GEiarYY+Byl349T/4MZXggJqwkDU2zGoMgYkKjQHsFudjAXElzLXwR8adsSqjWL5jb3KWzmoSX17Ecmac/hi+hIEt2YW8m5xDxxQYKlyOmWUiPPeNIOANCUgklAnWLbBBYSbSwDqhtMeNeCOzR85CiY83awHPpj2uecl5lhFq5L6CBA/e3t9HW4cNmsldkKC6iFCHD46N7a4PMrWvbJvY312bWNT61cMGYwgRX3C1IGrPtt7Alcbh9Rk4P4nGDe1QoY7XZpQJhq8FDAHD+kbYCWT7mRHe90Eh3Rtg/K7YNPhBQpiGwKsXDbpPBtGgu/6MBTcf/Pzql9fvRuah+6P3H17/fvDp1ejNq3/ZTwYPpxVbcTGdPpRam5/32OJSLy6bSpu/rWJc8jwupMjLy0xGLDPv2luId0BdkPZ+qdAe36srY0j+PPjbTwevDw8e/C97cDGdxkef9ybjR7tfzMe79ccHb16/7H38aPxj8PE/X744Ptx98JN54ujzZL/3WUN07/nTg0efLIR57OjS4B1cHvzy3vx4/dL8+PCbfWX/fFf/+N38+Pj6YMd/4fPe4y9tiRtci/jm7ScD9en9zqEpmH/00f54v1f0t5+6j81gbbDGkyfBA68vmCu8eex4On1gMYIK/Dw7qs9/m071/Q7JPPfs4fH9H6bmn6n0riHd7X+JaRHV32mQzXfu/2DYn4y/PPu8O570RfSX6fTQNNEBO6onEcQF8weTmE+Ojr43QN8fPn12VP8273139Hw6nRkhHH4qj2yfs0xGD8LyHVs+VzL3nU58SWaU/kG/Psf145PJl53uOUORpMVlkkrzvzL/tfmvLpt1dMcGY+e4AzEAj4x09/cDhIP3RnfC9pmMfxxU/uje6vywEZER0P5478mXHe899Z5afc1TSXrcPbY33ns0fMxUqfaVjuuh6NKPR5f2W6Z84739L8ftKHVphxP3pHu5ii/991fxjlWA54dWur1G6t52hfm3v/37f/znf31/9Hl/PNn90n3c9sG0LAv9dDo1KvXD6dq4h8YwGVN2LkzbPjw9NT8iuTSfRYn55MHMvv3Dis9SKRfmlTZGzUAaa/Cs16h1h2w6xCaNbFjsjt+SZYuWQsmZLDvQZsQ7rm37s1a2+7vjJ1fhFlYhH8650GllUfP2zQeG0hjmHxqCs4n54WrggG8qs33aWnadsagrsdukph1O1weOPj8e702+fMP7RrUmP3bc
fP1P9fpEis2uZQSwZ7rWoDcnWXTcdXn/rNH6ybZnNQseto99sX//nT2Y138Hdmrzxejg9eijr3c+Mv/MR3uPG+Rx/Zd78DcjevOw6YqjN3z9Vc++bzYofN13DvIyVbIQ0cbjrp5ifIt3Rr9zFfHsNt/8WBX1RhZIOcJ3fPXFeBtdXX3TBUfGLtqbyofF+rs7lBperCsF8T5jOS+1cXP4ljpc99aVkC8yE8LeSmjjDZhR4+tamQ2/v+Vhd2msDb9737gG3x5D8W0k9TeuYvL6bmfKQQ2khF6y23zzgwnyV+zmrjb6YKy4cV9upBhvfukmhX7RbkTGSOMDr4ec6wtmH7mhYFug/7vQJdPpbUr1jst89PLnmx/8VCktb8PwVuTGx2zfnWypd/OIr/dGEaX1F/98r2xwRsHxMaN62L+2SdyX6gev0pQDoeq82Teqof/aqODKHiudjRpnZGupRu+lLpfs+sK6Z64S5EtpJ9M3est4yzNdmW4Wkf+K81O2F/8fRj9FJFUx+t1Od13fUvUj1+Jte37Gyijdzv6rzBMZaPmT+m3rIP3x9Oj+08M//nZ0/282otTWm+7i7k3HxB+3fO8023nu3LZvBlmu9Wl22y8rHgt9T38ze/097b8V87kV2rcQuyj9trPW/Tjpy9YUwJ9Z/fU1+OEl8yQEg3P5vy0Bfj3BoB1MY7ZHCiLPuw1gXQT2p4F/fXcUZlgmj8YmNnnYPPz5cfB68tctMUEYILnS3WNFsfO8DZDqSM/Hi7p+Mr5ve0ZTnBtVuuSZHRGatXUmeMPJ1BTk8xMTSX15GiQSBkmYzRc6NQONDfTnJlzysdPWxyL2VY8VX/eYiYNvfGwguiJjboUBwXV3HThmVqJTqflxr7Pt9tJ6gyqmVWLPFJqzCHjJUWeVn/SL8nh7UYYaKxNephx+E1U3vupFr1z7u+PHX1GuSJpCwQ/uGeqc8RbtEXHENCvOzoRPd9PRnMaKDU5RxJMsRXZWAWclA9ed5cnHpSi/LfjY6vW1UFf5r79ImRh3uQ7MRvdYHAs70LJsBxOgHVxUihPgvmzOt75ddLYyQWEmom8MLdrvXSXKt6zKo/QlU4vsuih7e0oo/PLo48s32whetOdZ3irHxXJR3ibTNvpYKusO3KaZhOJRWembn/zFUKQvfv3YvdV0j8S+Hy3hs//pukYm6L7tvJ5euvuY0OsXOio7g3bYzqVY/7ae6Hvz6l+XH1+9+PDq0+Wn3968enf5/uDjx//57cPLS/Pey1fvPr0+eHtptHinR/msXwAfvhhnq8luH/0f7Qin0A==
154
+ """.strip()
155
+
156
+
157
+ # ==============================
158
+ # 4. Core scanning helpers (multi-threaded)
159
+ # ==============================
160
+
161
+ REGEX_CHUNK_SIZE = 50
162
+ LABEL_ORDER = {"secret": 0, "secret/low": 1}
163
+
164
+ DEFAULT_CONNECT_TIMEOUT = 5
165
+ DEFAULT_READ_TIMEOUT = 90
166
+ DEFAULT_USE_CURL = True # prefer the curl fetch path by default; automatically disabled at high parallelism (see main)
167
+ DEFAULT_SCAN_TIMEOUT = 80.0
168
+ DEFAULT_FETCH_TIMEOUT = 30.0 # wall-clock timeout per fetch batch
169
+ DEFAULT_REQUIRE_STATUS_200 = False
170
+ DEFAULT_MAX_BODY_BYTES = 100_000_000 # 100 MB guardrail to avoid OOM on giant responses
171
+ MAX_PARALLEL_HARD_LIMIT = 50 # hard cap on simultaneous fetch threads
172
+ PROGRESS_BAR_WIDTH = 20
173
+ FAILED_FETCH_LOG = os.path.join(os.getcwd(), "failed_to_fetch.txt")
174
+
175
+ ANSI_RESET = "\033[0m"
176
+ ANSI_COLORS_LABEL = {
177
+ "secret": "\033[93m", # yellow
178
+ "secret/low": "\033[90m", # gray
179
+ }
180
+ ANSI_MATCH = "\033[96m" # cyan
181
+ ANSI_SOURCE = "\033[95m" # magenta
182
+
183
+ STOP_EVENT = threading.Event()
184
+ QUIET = False
185
+ thread_local = threading.local()
186
+ RG_PATH = shutil.which("rg")
187
+ RG_SEM = threading.Semaphore(8) if RG_PATH else None
188
+ RG_ENABLED_LOCK = threading.Lock()
189
+ RG_ENABLED = bool(RG_PATH)
190
+
191
+ # Proxy management
192
+ PROXY_LIST: List[str] = []
193
+ PROXY_LOCK = threading.Lock()
194
+
195
+
196
+ def parse_proxy_string(proxy_raw: str) -> Optional[str]:
197
+ """
198
+ Parse a proxy string in format ip:port:username:password
199
+ and return a URL in format http://username:password@ip:port
200
+ Returns None if parsing fails.
201
+ """
202
+ proxy_raw = proxy_raw.strip()
203
+ if not proxy_raw:
204
+ return None
205
+ parts = proxy_raw.split(":")
206
+ if len(parts) == 4:
207
+ ip, port, username, password = parts
208
+ return f"http://{username}:{password}@{ip}:{port}"
209
+ elif len(parts) == 2:
210
+ # Simple ip:port format without auth
211
+ ip, port = parts
212
+ return f"http://{ip}:{port}"
213
+ else:
214
+ debug_log(f"[!] Invalid proxy format: {proxy_raw} (expected ip:port:user:pass or ip:port)\n")
215
+ return None
216
+
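+ # For illustration only (hypothetical addresses): the two accepted proxy formats and the
+ # rejection path.
print(parse_proxy_string("203.0.113.10:3199:user:pass"))  # http://user:pass@203.0.113.10:3199
print(parse_proxy_string("203.0.113.10:3199"))            # http://203.0.113.10:3199
print(parse_proxy_string("not-a-proxy"))                  # None (logged as an invalid format)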
217
+
218
+ def load_proxies(proxy_input: str) -> List[str]:
219
+ """
220
+ Load proxies from either:
221
+ - A comma-separated string of proxies
222
+ - A file path containing one proxy per line
223
+
224
+ Proxy format: ip:port:username:password
225
+ Returns list of proxy URLs in format http://username:password@ip:port
226
+ """
227
+ proxies: List[str] = []
228
+
229
+ # Check if it's a file path
230
+ if os.path.isfile(proxy_input):
231
+ try:
232
+ with open(proxy_input, "r", encoding="utf-8", errors="ignore") as f:
233
+ for line in f:
234
+ line = line.strip()
235
+ if line and not line.startswith("#"):
236
+ proxy_url = parse_proxy_string(line)
237
+ if proxy_url:
238
+ proxies.append(proxy_url)
239
+ except OSError as e:
240
+ debug_log(f"[!] Failed to read proxy file {proxy_input}: {e}\n")
241
+ else:
242
+ # Treat as comma-separated string
243
+ for proxy_raw in proxy_input.split(","):
244
+ proxy_url = parse_proxy_string(proxy_raw)
245
+ if proxy_url:
246
+ proxies.append(proxy_url)
247
+
248
+ return proxies
249
+
250
+
251
+ def get_random_proxy() -> Optional[str]:
252
+ """Get a random proxy from the proxy list. Returns None if no proxies configured."""
253
+ with PROXY_LOCK:
254
+ if not PROXY_LIST:
255
+ return None
256
+ return random.choice(PROXY_LIST)
257
+
258
+
259
+ # Ensure Ctrl+C stops all work immediately
260
+ def _sigint_handler(signum, frame):
261
+ STOP_EVENT.set()
262
+ debug_log("[!] Interrupt received, stopping...\n")
263
+ raise KeyboardInterrupt
264
+
265
+ signal.signal(signal.SIGINT, _sigint_handler)
266
+
267
+
268
+ def debug_log(msg: str) -> None:
269
+ if QUIET:
270
+ return
271
+ try:
272
+ sys.stderr.write(msg)
273
+ sys.stderr.flush()
274
+ except Exception:
275
+ pass
276
+
277
+
278
+ def is_false_positive(context: str, use_fp: bool) -> bool:
279
+ if not use_fp:
280
+ return False
281
+ return any(fp.search(context) for fp in COMPILED_FP)
282
+
283
+
284
+ def load_embedded_nuclei_patterns() -> List[Tuple[re.Pattern, str]]:
285
+ raw_bytes = zlib.decompress(base64.b64decode(NUCLEI_PATTERNS_B64))
286
+ raw_patterns: List[List[Any]] = json.loads(raw_bytes.decode("utf-8"))
287
+ compiled: List[Tuple[re.Pattern, str]] = []
288
+ for pattern, flags in raw_patterns:
289
+ pattern_str = str(pattern).strip()
290
+ if not pattern_str:
291
+ continue
292
+ try:
293
+ compiled.append((re.compile(pattern_str, flags), pattern_str))
294
+ except re.error as e:
295
+ # Skip malformed ones so the scanner keeps running
296
+ debug_log(f"[!] Failed to compile nuclei regex: /{pattern_str}/ : {e}\n")
297
+ return compiled
298
+
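+ # For illustration only: per the loader above, the embedded blob decodes
+ # (base64 -> zlib -> JSON) into a list of [pattern, flags] pairs.
raw_patterns = json.loads(zlib.decompress(base64.b64decode(NUCLEI_PATTERNS_B64)))
print(type(raw_patterns), len(raw_patterns))  # list of embedded signatures
print(raw_patterns[0])                        # e.g. ["<regex source>", 0]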
299
+
300
+ def filter_nuclei_patterns(
301
+ patterns: List[Tuple[re.Pattern, str]], keep_all: bool
302
+ ) -> List[Tuple[re.Pattern, str]]:
303
+ if keep_all:
304
+ return patterns
305
+ keywords = [
306
+ "key",
307
+ "secret",
308
+ "token",
309
+ "bearer",
310
+ "api",
311
+ "auth",
312
+ "password",
313
+ "passphrase",
314
+ "private",
315
+ "credential",
316
+ "aws",
317
+ "github",
318
+ "gitlab",
319
+ "slack",
320
+ "discord",
321
+ "webhook",
322
+ "xox",
323
+ "sg.",
324
+ "sk_",
325
+ "pk_",
326
+ "rk_",
327
+ "ssh",
328
+ "jwt",
329
+ ]
330
+ filtered: List[Tuple[re.Pattern, str]] = []
331
+ for rx, pat in patterns:
332
+ low = pat.lower()
333
+ if any(k in low for k in keywords):
334
+ filtered.append((rx, pat))
335
+ return filtered
336
+
337
+
338
+ def chunk_patterns(
339
+ patterns: List[Tuple[re.Pattern, str]], chunk_size: int
340
+ ) -> List[List[Tuple[re.Pattern, str]]]:
341
+ return [patterns[i : i + chunk_size] for i in range(0, len(patterns), chunk_size)]
342
+
343
+
344
+ def deduplicate_findings(findings: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
345
+ """Remove duplicate findings while preserving order."""
346
+ seen = set()
347
+ unique: List[Dict[str, Any]] = []
348
+ for f in findings:
349
+ key = (f.get("severity"), f.get("match"), f.get("source"))
350
+ if key in seen:
351
+ continue
352
+ seen.add(key)
353
+ unique.append(f)
354
+ return unique
355
+
356
+
357
+ def inline_flags_pattern(pattern: str, flags: int) -> str:
358
+ """
359
+ Prepend inline PCRE flags for rg based on Python regex flags.
360
+ Only adds the commonly used flags we support.
361
+ """
362
+ prefix = ""
363
+ if flags & re.IGNORECASE:
364
+ prefix += "(?i)"
365
+ if flags & re.MULTILINE:
366
+ prefix += "(?m)"
367
+ if flags & re.DOTALL:
368
+ prefix += "(?s)"
369
+ return prefix + pattern if prefix else pattern
370
+
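+ # For illustration only: Python flag constants become inline PCRE groups that ripgrep understands.
print(inline_flags_pattern(r"api[_-]?key", re.IGNORECASE))                 # (?i)api[_-]?key
print(inline_flags_pattern(r"^secret=.+$", re.IGNORECASE | re.MULTILINE))  # (?i)(?m)^secret=.+$
print(inline_flags_pattern(r"AKIA[A-Z0-9]{16}", 0))                        # unchanged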
371
+
372
+ def ensure_rg_pcre2_available() -> None:
373
+ """Fail fast if rg is present but lacks PCRE2 support."""
374
+ if not RG_PATH:
375
+ return
376
+ cmd = [RG_PATH, "-P", "--version"]
377
+ try:
378
+ proc = subprocess.run(
379
+ cmd,
380
+ capture_output=True,
381
+ text=True,
382
+ encoding="utf-8",
383
+ errors="ignore",
384
+ )
385
+ except Exception as e:
386
+ debug_log(f"[!] Failed to run rg for PCRE2 check: {e}\n")
387
+ return
388
+ if proc.returncode != 0:
389
+ msg = (proc.stderr or proc.stdout or "").strip()
390
+ raise SystemExit(
391
+ "[!] ripgrep is installed without PCRE2 support. Install a PCRE2-enabled ripgrep "
392
+ "(e.g., package 'ripgrep-pcre2' or 'cargo install ripgrep --features \"pcre2\" --locked').\n"
393
+ f"rg output: {msg}"
394
+ )
395
+
396
+
397
+ def scan_one_chunk(
398
+ label: str,
399
+ chunk: List[Tuple[re.Pattern, str]],
400
+ text: str,
401
+ source: str,
402
+ use_fp: bool,
403
+ tmp_path: Optional[str],
404
+ use_rg: bool,
405
+ ) -> List[Dict[str, Any]]:
406
+ """Scan a chunk of regex patterns sequentially."""
407
+ global RG_ENABLED
408
+ results: List[Dict[str, Any]] = []
409
+ with RG_ENABLED_LOCK:
410
+ rg_allowed = bool(use_rg and tmp_path and RG_PATH and RG_ENABLED)
411
+ if rg_allowed:
412
+ for rx, pattern_str in chunk:
413
+ cmd = [
414
+ RG_PATH,
415
+ "--pcre2",
416
+ "--multiline",
417
+ "--text",
418
+ "--json",
419
+ "--byte-offset",
420
+ "-o",
421
+ "-e",
422
+ pattern_str,
423
+ tmp_path,
424
+ ]
425
+ try:
426
+ if RG_SEM:
427
+ RG_SEM.acquire()
428
+ proc = subprocess.run(
429
+ cmd,
430
+ capture_output=True,
431
+ text=True,
432
+ encoding="utf-8",
433
+ errors="ignore",
434
+ )
435
+ except Exception as e:
436
+ debug_log(f"[!] rg failed for {source}: {e}\n")
437
+ with RG_ENABLED_LOCK:
438
+ RG_ENABLED = False
439
+ rg_allowed = False
440
+ break
441
+ finally:
442
+ if RG_SEM:
443
+ RG_SEM.release()
444
+ if proc.returncode not in (0, 1): # 1 = no matches
445
+ debug_log(f"[!] rg exit {proc.returncode} for {source}: {proc.stderr.strip()}\n")
446
+ with RG_ENABLED_LOCK:
447
+ RG_ENABLED = False
448
+ rg_allowed = False
449
+ break
450
+ for line in proc.stdout.splitlines():
451
+ try:
452
+ rec = json.loads(line)
453
+ except json.JSONDecodeError:
454
+ continue
455
+ if rec.get("type") != "match":
456
+ continue
457
+ for sub in rec.get("data", {}).get("submatches", []):
458
+ if "match" not in sub or "byte_offset" not in sub:
459
+ continue
460
+ start = sub["byte_offset"]
461
+ match_text = sub["match"]["text"]
462
+ ctx_start = max(0, start - 60)
463
+ ctx_end = min(len(text), start + len(match_text) + 60)
464
+ context = text[ctx_start:ctx_end].replace("\n", " ")
465
+ if is_false_positive(context, use_fp):
466
+ continue
467
+ results.append(
468
+ {
469
+ "severity": label,
470
+ "source": source,
471
+ "index": start,
472
+ "match": match_text,
473
+ "pattern": pattern_str,
474
+ "context": context.strip(),
475
+ }
476
+ )
477
+ if not rg_allowed:
478
+ for rx, pattern_str in chunk:
479
+ for m in rx.finditer(text):
480
+ start, end = m.start(), m.end()
481
+ ctx_start = max(0, start - 60)
482
+ ctx_end = min(len(text), end + 60)
483
+ context = text[ctx_start:ctx_end].replace("\n", " ")
484
+ if is_false_positive(context, use_fp):
485
+ continue
486
+ results.append(
487
+ {
488
+ "severity": label, # "secret" or "secret/low"
489
+ "source": source,
490
+ "index": start,
491
+ "match": m.group(0),
492
+ "pattern": pattern_str,
493
+ "context": context.strip(),
494
+ }
495
+ )
496
+ return results
497
+
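+ # For illustration only (hypothetical values): the shape of a single record returned by
+ # scan_one_chunk.
example_finding = {
    "severity": "secret",                      # or "secret/low"
    "source": "https://example.com/app.js",
    "index": 1024,                             # offset of the match in the body
    "match": "AKIAIOSFODNN7EXAMPLE",
    "pattern": r"AKIA[A-Z0-9]{16}",
    "context": '... accessKeyId: "AKIAIOSFODNN7EXAMPLE" ...',
}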
498
+
499
+ def scan_text_grouped(
500
+ text: str,
501
+ source: str,
502
+ pattern_groups: Dict[str, List[Tuple[re.Pattern, str]]],
503
+ use_fp: bool,
504
+ chunk_size: int = REGEX_CHUNK_SIZE,
505
+ ) -> Tuple[List[Dict[str, Any]], int]:
506
+ # Deprecated in favor of explicit chunk submission in main loop
507
+ raise NotImplementedError("scan_text_grouped is not used in the current pipeline")
508
+
509
+
510
+ # ==============================
511
+ # 5. Fetching, formatting, CLI
512
+ # ==============================
513
+
514
+ def build_session(
515
+ retries: int, backoff: float, trust_env: bool, pool_maxsize: Optional[int] = None
516
+ ) -> requests.Session:
517
+ """Create a requests session with retry/backoff to survive flaky endpoints."""
518
+ session = requests.Session()
519
+ session.trust_env = trust_env
520
+ retry_cfg = Retry(
521
+ total=retries,
522
+ connect=retries,
523
+ read=retries,
524
+ status=retries,
525
+ backoff_factor=backoff,
526
+ allowed_methods=["GET", "HEAD", "OPTIONS"],
527
+ status_forcelist=[429, 500, 502, 503, 504],
528
+ raise_on_status=False,
529
+ )
530
+ adapter_kwargs = {"max_retries": retry_cfg}
531
+ if pool_maxsize:
532
+ adapter_kwargs["pool_maxsize"] = pool_maxsize
533
+ adapter_kwargs["pool_connections"] = pool_maxsize
534
+ adapter = HTTPAdapter(**adapter_kwargs)
535
+ session.mount("http://", adapter)
536
+ session.mount("https://", adapter)
537
+ return session
538
+
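+ # For illustration only (hypothetical URL): one retrying session as configured above;
+ # 429/5xx responses are retried with exponentially growing backoff.
session = build_session(retries=3, backoff=1.0, trust_env=True, pool_maxsize=12)
resp = session.get("https://example.com", timeout=(5, 90), verify=False)  # insecure, matching the tool's default
print(resp.status_code, len(resp.content))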
539
+
540
+ def get_thread_session(
541
+ retries: int,
542
+ backoff: float,
543
+ trust_env: bool,
544
+ pool_maxsize: Optional[int],
545
+ ) -> requests.Session:
546
+ """Provide one session per thread to avoid cross-thread contention."""
547
+ sess = getattr(thread_local, "session", None)
548
+ if sess is None:
549
+ sess = build_session(retries, backoff, trust_env=trust_env, pool_maxsize=pool_maxsize)
550
+ thread_local.session = sess
551
+ return sess
552
+
553
+
554
+ def fetch_with_curl(
555
+ url: str,
556
+ connect_timeout: float,
557
+ read_timeout: float,
558
+ verify: bool,
559
+ user_agent: Optional[str],
560
+ use_proxy_env: bool,
561
+ curl_path: Optional[str],
562
+ verbose: bool,
563
+ max_body_bytes: Optional[int],
564
+ require_status_200: bool,
565
+ dest_path: Optional[str] = None,
566
+ proxy: Optional[str] = None,
567
+ ) -> str:
568
+ total_timeout = max(int(connect_timeout + read_timeout), int(read_timeout))
569
+ curl_bin = curl_path or shutil.which("curl") or "curl"
570
+ cmd = [
571
+ curl_bin,
572
+ "-L",
573
+ "--connect-timeout",
574
+ str(connect_timeout),
575
+ "--max-time",
576
+ str(total_timeout),
577
+ ]
578
+ if max_body_bytes is not None:
579
+ cmd.extend(["--max-filesize", str(max_body_bytes)])
580
+ if not verify:
581
+ cmd.append("-k")
582
+ if user_agent:
583
+ cmd.extend(["-A", user_agent])
584
+ if verbose:
585
+ cmd.append("-v")
586
+ if require_status_200:
587
+ cmd.extend(["-w", "\\n%{http_code}"])
588
+ if dest_path:
589
+ cmd.extend(["-o", dest_path])
590
+ if proxy:
591
+ cmd.extend(["-x", proxy])
592
+ cmd.append(url)
593
+
594
+ env = None
595
+ if not use_proxy_env and not proxy:
596
+ env = dict(os.environ)
597
+ for k in list(env.keys()):
598
+ if k.upper() in {"HTTP_PROXY", "HTTPS_PROXY", "ALL_PROXY"}:
599
+ env.pop(k, None)
600
+ env["NO_PROXY"] = "*"
601
+
602
+ result = subprocess.run(
603
+ cmd, capture_output=True, text=True, env=env, encoding="utf-8", errors="ignore"
604
+ )
605
+ if result.returncode != 0:
606
+ stderr_msg = result.stderr.strip()
607
+ raise RuntimeError(f"curl exited with code {result.returncode}: {stderr_msg}")
608
+ stdout = result.stdout
609
+ if require_status_200:
610
+ stdout, _, status_str = stdout.rpartition("\n")
611
+ try:
612
+ status_code = int(status_str.strip())
613
+ except ValueError:
614
+ raise RuntimeError(
615
+ f"Could not parse HTTP status from curl output for {url}: {status_str!r}"
616
+ )
617
+ if status_code != 200:
618
+ raise RuntimeError(f"HTTP status {status_code} for {url}, skipping.")
619
+ return stdout
620
+
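+ # For illustration only (hypothetical URL and path): a call using the defaults above,
+ # and roughly the curl command it assembles.
fetch_with_curl(
    url="https://example.com",
    connect_timeout=DEFAULT_CONNECT_TIMEOUT,
    read_timeout=DEFAULT_READ_TIMEOUT,
    verify=False,
    user_agent=None,
    use_proxy_env=False,
    curl_path=None,
    verbose=False,
    max_body_bytes=DEFAULT_MAX_BODY_BYTES,
    require_status_200=False,
    dest_path="/tmp/body.txt",
    proxy=None,
)
# Roughly: curl -L --connect-timeout 5 --max-time 95 --max-filesize 100000000 -k -o /tmp/body.txt https://example.com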
621
+
622
+ def fetch_url_body(
623
+ url: str,
624
+ session: requests.Session,
625
+ connect_timeout: float,
626
+ read_timeout: float,
627
+ verify: bool,
628
+ user_agent: Optional[str],
629
+ use_curl: bool,
630
+ curl_path: Optional[str],
631
+ curl_verbose: bool,
632
+ trust_env: bool,
633
+ max_body_bytes: Optional[int],
634
+ dest_path: Optional[str],
635
+ require_status_200: bool,
636
+ proxy: Optional[str] = None,
637
+ ) -> str:
638
+ if use_curl:
639
+ return fetch_with_curl(
640
+ url=url,
641
+ connect_timeout=connect_timeout,
642
+ read_timeout=read_timeout,
643
+ verify=verify,
644
+ user_agent=user_agent,
645
+ use_proxy_env=trust_env,
646
+ curl_path=curl_path,
647
+ verbose=curl_verbose,
648
+ max_body_bytes=max_body_bytes,
649
+ require_status_200=require_status_200,
650
+ dest_path=dest_path,
651
+ proxy=proxy,
652
+ )
653
+
654
+ headers = {}
655
+ if user_agent:
656
+ headers["User-Agent"] = user_agent
657
+ proxies = {"http": proxy, "https": proxy} if proxy else None
658
+ if dest_path:
659
+ tmp_path = dest_path
660
+ # Stream to file to avoid buffering huge bodies in memory
661
+ try:
662
+ with session.get(
663
+ url,
664
+ headers=headers,
665
+ timeout=(connect_timeout, read_timeout),
666
+ verify=verify,
667
+ stream=True,
668
+ proxies=proxies,
669
+ ) as resp, open(tmp_path, "wb") as out:
670
+ if require_status_200 and resp.status_code != 200:
671
+ raise RuntimeError(f"HTTP status {resp.status_code} for {url}, skipping.")
672
+ total_bytes = 0
673
+ for chunk in resp.iter_content(chunk_size=16384):
674
+ if not chunk:
675
+ continue
676
+ total_bytes += len(chunk)
677
+ if max_body_bytes is not None and total_bytes > max_body_bytes:
678
+ raise RuntimeError(
679
+ f"Body size exceeded limit ({max_body_bytes} bytes), skipping."
680
+ )
681
+ out.write(chunk)
682
+ return tmp_path
683
+ except Exception:
684
+ # On failure, clean partial file if present
685
+ try:
686
+ if os.path.exists(dest_path):
687
+ os.remove(dest_path)
688
+ except OSError:
689
+ pass
690
+ raise
691
+
692
+ with session.get(
693
+ url,
694
+ headers=headers,
695
+ timeout=(connect_timeout, read_timeout),
696
+ verify=verify,
697
+ stream=True,
698
+ proxies=proxies,
699
+ ) as resp:
700
+ if require_status_200 and resp.status_code != 200:
701
+ raise RuntimeError(f"HTTP status {resp.status_code} for {url}, skipping.")
702
+
703
+ body_chunks: List[bytes] = []
704
+ total_bytes = 0
705
+ for chunk in resp.iter_content(chunk_size=16384, decode_unicode=False):
706
+ if not chunk:
707
+ continue
708
+ total_bytes += len(chunk)
709
+ if max_body_bytes is not None and total_bytes > max_body_bytes:
710
+ raise RuntimeError(
711
+ f"Body size exceeded limit ({max_body_bytes} bytes), skipping."
712
+ )
713
+ body_chunks.append(chunk)
714
+
715
+ encoding = resp.encoding or "utf-8"
716
+ return b"".join(body_chunks).decode(encoding, errors="ignore")
717
+
718
+
719
+ def normalize_url(raw: str, auto_http: bool) -> Optional[str]:
720
+ """
721
+ Clean up a URL string. If the scheme is missing and auto_http is True,
722
+ prepend http:// for host-like entries. Returns None for empty/invalid lines.
723
+ """
724
+ url = raw.strip()
725
+ if not url or url.startswith("#"):
726
+ return None
727
+ if url.startswith("//"):
728
+ return f"http:{url}" if auto_http else None
729
+ if "://" not in url:
730
+ if auto_http and re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(:\d+)?(/.*)?$", url):
731
+ return f"http://{url}"
732
+ debug_log(f"[!] Skipping URL without scheme: {url}\n")
733
+ return None
734
+ return url
735
+
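+ # For illustration only (hypothetical hosts): scheme handling with auto_http enabled.
print(normalize_url("example.com/login", auto_http=True))         # http://example.com/login
print(normalize_url("//cdn.example.com/app.js", auto_http=True))  # http://cdn.example.com/app.js
print(normalize_url("# comment line", auto_http=True))            # None
print(normalize_url("https://example.com", auto_http=True))       # returned unchanged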
736
+
737
+ def record_failed_urls(urls: List[str], log_path: str = FAILED_FETCH_LOG) -> None:
738
+ """Append failed URLs to a log file so they can be retried later."""
739
+ if not urls:
740
+ return
741
+ try:
742
+ with open(log_path, "a", encoding="utf-8") as f:
743
+ for url in urls:
744
+ if url:
745
+ f.write(url.strip() + "\n")
746
+ except Exception as e:
747
+ debug_log(f"[!] Failed to record failed URLs to {log_path}: {e}\n")
748
+
749
+
750
+ def load_urls_from_file(path: str, auto_http: bool) -> List[str]:
751
+ urls: List[str] = []
752
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
753
+ for line in f:
754
+ norm = normalize_url(line, auto_http=auto_http)
755
+ if norm:
756
+ urls.append(norm)
757
+ return urls
758
+
759
+
760
+ def format_finding(rec: Dict[str, Any], color: bool) -> str:
761
+ """
762
+ Format as:
763
+ [secret] 'match' [source]
764
+ with different colors when color=True.
765
+ """
766
+ label = rec["severity"] # "secret" or "secret/low"
767
+ match_repr = repr(rec["match"])
768
+ src = rec["source"]
769
+
770
+ if color:
771
+ lab = label
772
+ lab_color = ANSI_COLORS_LABEL.get(label, "")
773
+ if lab_color:
774
+ lab = f"{lab_color}{lab}{ANSI_RESET}"
775
+ match_repr = f"{ANSI_MATCH}{match_repr}{ANSI_RESET}"
776
+ src = f"{ANSI_SOURCE}{src}{ANSI_RESET}"
777
+ return f"[{lab}] {match_repr} [{src}]"
778
+ else:
779
+ return f"[{label}] {match_repr} [{src}]"
780
+
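+ # For illustration only (hypothetical finding): how a record is rendered without color.
rec = {"severity": "secret", "match": "AKIAIOSFODNN7EXAMPLE", "source": "https://example.com/app.js"}
print(format_finding(rec, color=False))
# [secret] 'AKIAIOSFODNN7EXAMPLE' [https://example.com/app.js]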
781
+
782
+ def main():
783
+ parser = argparse.ArgumentParser(
784
+ description=(
785
+ "Web Secret Scanner - Detect exposed secrets, API keys, tokens, and credentials in web responses.\n\n"
786
+ "Scans URLs for sensitive data leaks using pattern matching with nuclei-regex signatures\n"
787
+ "and custom secret detection rules. Supports single URL (-u) or batch scanning from file (-l).\n\n"
788
+ "Output format: [secret] 'matched_value' [source_url]"
789
+ ),
790
+ formatter_class=argparse.RawDescriptionHelpFormatter,
791
+ )
792
+ parser.add_argument("-u", "--url", help="Single URL to scan.")
793
+ parser.add_argument("-l", "--list", help="File containing URLs (one per line).")
794
+ parser.add_argument(
795
+ "--timeout",
796
+ type=float,
797
+ default=None,
798
+ help="Request timeout in seconds (connect + read). Default: 95s total.",
799
+ )
800
+ parser.add_argument(
801
+ "--retries",
802
+ type=int,
803
+ default=3,
804
+ help="Number of HTTP retries for fetch errors. Default: 3.",
805
+ )
806
+ parser.add_argument(
807
+ "--secure",
808
+ dest="insecure",
809
+ action="store_false",
810
+ default=True,
811
+ help="Enable TLS certificate verification. Default: disabled (insecure).",
812
+ )
813
+ parser.add_argument(
814
+ "--proxy",
815
+ type=str,
816
+ default=None,
817
+ help=(
818
+ "Proxy configuration: comma-separated proxies or path to file. "
819
+ "Format: ip:port:username:password (e.g., 185.195.222.170:3199:user:pass). "
820
+ "Proxies are randomly shuffled for each request."
821
+ ),
822
+ )
823
+ parser.add_argument(
824
+ "--user-agent",
825
+ type=str,
826
+ default=None,
827
+ nargs="?",
828
+ const="",
829
+ help="Override User-Agent header (empty to use client default).",
830
+ )
831
+ parser.add_argument(
832
+ "--npb",
833
+ dest="progress",
834
+ action="store_false",
835
+ default=True,
836
+ help="No Progress Bar - disable the progress bar display.",
837
+ )
838
+ parser.add_argument(
839
+ "--quiet",
840
+ action="store_true",
841
+ help="Suppress all terminal output except the progress bar.",
842
+ )
843
+ parser.add_argument(
844
+ "--chunk-size",
845
+ type=int,
846
+ default=REGEX_CHUNK_SIZE,
847
+ help="Number of regex patterns per processing chunk. Default: 50.",
848
+ )
849
+ parser.add_argument(
850
+ "--all-patterns",
851
+ action="store_true",
852
+ help="Include noisy patterns (IPs, endpoints, emails). Default: secrets-only.",
853
+ )
854
+ parser.add_argument(
855
+ "-o",
856
+ "--output",
857
+ type=str,
858
+ default=None,
859
+ help="Output file path. Default: <list_file>_secrets.txt when --list is used.",
860
+ )
861
+ parser.add_argument(
862
+ "--html-output",
863
+ nargs="?",
864
+ const="",
865
+ help="Generate HTML report. Optional path (defaults to <output>.html).",
866
+ )
867
+ parser.add_argument(
868
+ "--append",
869
+ action="store_true",
870
+ help="Append to the output file instead of overwriting.",
871
+ )
872
+ parser.add_argument(
873
+ "--nou",
874
+ dest="number_of_urls",
875
+ type=int,
876
+ default=6,
877
+ help=(
878
+ "Number of URLs to fetch concurrently. Default: 6. "
879
+ "NOTE: This controls URL fetching parallelism only; actual thread count may be "
880
+ "significantly higher due to regex processing workers."
881
+ ),
882
+ )
883
+ parser.add_argument(
884
+ "--no-color",
885
+ dest="color",
886
+ action="store_false",
887
+ default=True,
888
+ help="Disable colored output.",
889
+ )
890
+ args = parser.parse_args()
891
+
892
+ if args.output is None and args.list:
893
+ list_stem = os.path.splitext(os.path.basename(args.list.rstrip(os.sep)))[0] or "scan"
894
+ args.output = f"{list_stem}_secrets.txt"
895
+
896
+ html_requested = args.html_output is not None
897
+ html_output_path: Optional[str] = None
898
+ if html_requested:
899
+ if not args.output:
900
+ parser.error("--html-output requires --output to be set.")
901
+ base, _ = os.path.splitext(args.output)
902
+ html_output_path = args.html_output if args.html_output else (base or args.output) + ".html"
903
+
904
+ # Reset globals per run
905
+ global RG_ENABLED # needed so the reset below updates the module-level flag rather than a local
+ STOP_EVENT.clear()
906
+ with RG_ENABLED_LOCK:
907
+ RG_ENABLED = bool(RG_PATH)
908
+
909
+ # Ensure rg supports PCRE2; fail fast with guidance if not
910
+ ensure_rg_pcre2_available()
911
+
912
+ if not args.url and not args.list:
913
+ parser.error("You must provide --url or --list")
914
+
915
+ if args.progress and Progress is None:
916
+ debug_log("[i] Disabling progress (rich not installed).\n")
917
+ args.progress = False
918
+
919
+ # Set global quiet flag for helpers
920
+ global QUIET
921
+ QUIET = args.quiet
922
+
923
+ # Initialize proxy list if provided
924
+ global PROXY_LIST
925
+ if args.proxy:
926
+ PROXY_LIST = load_proxies(args.proxy)
927
+ if PROXY_LIST:
928
+ debug_log(f"[i] Loaded {len(PROXY_LIST)} proxies for rotation.\n")
929
+ else:
930
+ debug_log("[!] No valid proxies loaded from --proxy argument.\n")
931
+ else:
932
+ PROXY_LIST = []
933
+
934
+ max_parallel = max(1, args.number_of_urls)
935
+ if max_parallel > MAX_PARALLEL_HARD_LIMIT:
936
+ debug_log(
937
+ f"[i] Limiting parallel URL workers to {MAX_PARALLEL_HARD_LIMIT} "
938
+ f"(requested {max_parallel})."
939
+ )
940
+ max_parallel = MAX_PARALLEL_HARD_LIMIT
941
+ fetch_workers = max_parallel
942
+ pool_maxsize = fetch_workers * 2 # allow multiple connections per worker
943
+
944
+ # Automatically avoid spawning tons of curl processes at high concurrency
945
+ use_curl = DEFAULT_USE_CURL
946
+ if use_curl and max_parallel > 8:
947
+ debug_log("[i] Disabling curl for high concurrency to reduce process overhead.")
948
+ use_curl = False
949
+
950
+ if args.insecure:
951
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
952
+
953
+ nuclei_patterns = filter_nuclei_patterns(
954
+ load_embedded_nuclei_patterns(), keep_all=False
955
+ )
956
+
957
+ # Build pattern groups:
958
+ # [secret] => nuclei_regex + critical/high/medium
959
+ # [secret/low] => low-generic patterns
960
+ # Build pattern groups with optional exclusion of noisy patterns
961
+ noisy_medium_patterns = {
962
+ r"[\"'](10\.\d{1,3}\.\d{1,3}\.\d{1,3})[\"']",
963
+ r"[\"'](172\.(1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3})[\"']",
964
+ r"[\"'](192\.168\.\d{1,3}\.\d{1,3})[\"']",
965
+ r"[\"']/admin[/a-zA-Z0-9_-]*[\"']",
966
+ r"[\"']/debug[/a-zA-Z0-9_-]*[\"']",
967
+ r"[\"']/api/internal[/a-zA-Z0-9_-]*[\"']",
968
+ r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
969
+ }
970
+
971
+ secret_patterns: List[Tuple[re.Pattern, str]] = []
972
+ secret_patterns.extend(nuclei_patterns)
973
+ secret_patterns.extend(COMPILED_SEVERITY["critical"])
974
+ secret_patterns.extend(COMPILED_SEVERITY["high"])
975
+
976
+ for rx, pat in COMPILED_SEVERITY["medium"]:
977
+ if args.all_patterns or pat not in noisy_medium_patterns:
978
+ secret_patterns.append((rx, pat))
979
+
980
+ secret_low_patterns: List[Tuple[re.Pattern, str]] = (
981
+ COMPILED_SEVERITY["low"] if args.all_patterns else []
982
+ )
983
+
984
+ pattern_groups: Dict[str, List[Tuple[re.Pattern, str]]] = {
985
+ "secret": secret_patterns,
986
+ "secret/low": secret_low_patterns,
987
+ }
988
+
989
+ # Collect URLs
990
+ urls: List[str] = []
991
+ auto_http = True # Always auto-prepend http:// for URLs without scheme
992
+ if args.url:
993
+ norm = normalize_url(args.url, auto_http=auto_http)
994
+ if norm:
995
+ urls.append(norm)
996
+ if args.list:
997
+ urls.extend(load_urls_from_file(args.list, auto_http=auto_http))
998
+
999
+ # Prepare pattern chunks for rg to avoid PCRE2 size issues
1000
+ max_pattern_len = 512 # tighter cap to avoid PCRE2 size errors
1001
+ patterns_dir = os.path.join(os.getcwd(), "patterns_split")
1002
+ os.makedirs(patterns_dir, exist_ok=True)
1003
+ for entry in os.scandir(patterns_dir):
1004
+ try:
1005
+ os.remove(entry.path)
1006
+ except OSError:
1007
+ pass
1008
+ patterns_per_file = 50
1009
+ pattern_lines: List[str] = []
1010
+ skipped_patterns = 0
1011
+ for rules in [
1012
+ nuclei_patterns,
1013
+ COMPILED_SEVERITY["critical"],
1014
+ COMPILED_SEVERITY["high"],
1015
+ COMPILED_SEVERITY["medium"],
1016
+ COMPILED_SEVERITY["low"],
1017
+ ]:
1018
+ for rx, pat in rules:
1019
+ pat_inline = inline_flags_pattern(pat, rx.flags)
1020
+ if len(pat_inline) > max_pattern_len:
1021
+ skipped_patterns += 1
1022
+ continue
1023
+ pattern_lines.append(pat_inline)
1024
+ pattern_files: List[str] = []
1025
+ for i in range(0, len(pattern_lines), patterns_per_file):
1026
+ chunk_lines = pattern_lines[i : i + patterns_per_file]
1027
+ p_path = os.path.join(patterns_dir, f"patterns_{i//patterns_per_file}.txt")
1028
+ with open(p_path, "w", encoding="utf-8", errors="ignore") as pf:
1029
+ pf.write("\n".join(chunk_lines))
1030
+ pattern_files.append(p_path)
1031
+ debug_log(f"[i] Wrote {len(pattern_lines)} patterns into {len(pattern_files)} chunk file(s) under {patterns_dir}")
1032
+ if skipped_patterns:
1033
+ debug_log(f"[i] Skipped {skipped_patterns} patterns that exceeded {max_pattern_len} chars (PCRE2 limit)")
1034
+
1035
+ # Build pattern chunks for Python fallback scanning
1036
+ pattern_chunks: List[Tuple[str, List[Tuple[re.Pattern, str]]]] = []
1037
+ for label, rules in pattern_groups.items():
1038
+ if not rules:
1039
+ continue
1040
+ for chunk in chunk_patterns(rules, args.chunk_size):
1041
+ pattern_chunks.append((label, chunk))
1042
+
1043
+ temp_dir = os.path.join(os.getcwd(), "temp")
1044
+ temp_dir2 = os.path.join(os.getcwd(), "temp2")
1045
+ os.makedirs(temp_dir, exist_ok=True)
1046
+ os.makedirs(temp_dir2, exist_ok=True)
1047
+
1048
+ def clear_dir(path: str):
1049
+ for entry in os.scandir(path):
1050
+ try:
1051
+ os.remove(entry.path)
1052
+ except OSError:
1053
+ pass
1054
+
1055
+ def fetch_batch(batch_urls: List[str], directory: str) -> Tuple[Dict[str, str], List[str]]:
1056
+ os.makedirs(directory, exist_ok=True)
1057
+ mapping: Dict[str, str] = {}
1058
+ failed_urls: List[str] = []
1059
+ if not batch_urls:
1060
+ return mapping, failed_urls
1061
+ batch_timeout = DEFAULT_FETCH_TIMEOUT
1062
+ def worker(url: str) -> Optional[Tuple[str, str]]:
1063
+ if STOP_EVENT.is_set():
1064
+ return None
1065
+ session = get_thread_session(**session_params)
1066
+ # Get a random proxy for this request (shuffled rotation)
1067
+ proxy = get_random_proxy()
1068
+ try:
1069
+ fname = os.path.join(directory, f"u{hash(url)}_{int(time.time()*1000)}.txt")
1070
+ fetch_url_body(
1071
+ url,
1072
+ session=session,
1073
+ connect_timeout=connect_timeout,
1074
+ read_timeout=read_timeout,
1075
+ verify=not args.insecure,
1076
+ user_agent=user_agent,
1077
+ use_curl=use_curl,
1078
+ curl_path=None,
1079
+ curl_verbose=False,
1080
+ trust_env=trust_env,
1081
+ max_body_bytes=max_body_bytes,
1082
+ dest_path=fname,
1083
+ require_status_200=DEFAULT_REQUIRE_STATUS_200,
1084
+ proxy=proxy,
1085
+ )
1086
+ except Exception as e:
1087
+ failed_urls.append(url)
1088
+ return None
1089
+ return fname, url
1090
+
1091
+ with ThreadPoolExecutor(max_workers=fetch_workers) as ex:
1092
+ fut_map = {ex.submit(worker, u): u for u in batch_urls}
1093
+ try:
1094
+ for fut in as_completed(fut_map, timeout=batch_timeout):
1095
+ res = fut.result()
1096
+ if res:
1097
+ fname, url = res
1098
+ mapping[fname] = url
1099
+ except TimeoutError:
1100
+ for fut, url in fut_map.items():
1101
+ if not fut.done():
1102
+ fut.cancel()
1103
+ failed_urls.append(url)
1104
+ except KeyboardInterrupt:
1105
+ STOP_EVENT.set()
1106
+ ex.shutdown(wait=False, cancel_futures=True)
1107
+ raise
1108
+ if failed_urls:
1109
+ record_failed_urls(sorted(set(failed_urls)))
1110
+ return mapping, failed_urls
1111
+
1112
+ def run_rg_on_dir(directory: str, file_map: Dict[str, str], pattern_files: List[str]) -> Dict[str, List[Dict[str, Any]]]:
1113
+ # Normalize mapping keys to absolute, case-folded paths for rg output
1114
+ def _norm(p: str) -> str:
1115
+ return os.path.normcase(os.path.abspath(p))
1116
+ abs_map: Dict[str, str] = {_norm(p): u for p, u in file_map.items()}
1117
+ results: Dict[str, List[Dict[str, Any]]] = {url: [] for url in abs_map.values()}
1118
+ if not abs_map or not pattern_files:
1119
+ return results
1120
+ global RG_ENABLED
1121
+ with RG_ENABLED_LOCK:
1122
+ if not RG_ENABLED:
1123
+ return results
1124
+ rg_timeout = DEFAULT_SCAN_TIMEOUT
1125
+ for pf in pattern_files:
1126
+ if STOP_EVENT.is_set():
1127
+ break
1128
+ cmd = [
1129
+ RG_PATH or "rg",
1130
+ "--pcre2",
1131
+ "--multiline",
1132
+ "--text",
1133
+ "--json",
1134
+ "--byte-offset",
1135
+ "-f",
1136
+ pf,
1137
+ directory,
1138
+ ]
1139
+ try:
1140
+ proc = subprocess.run(
1141
+ cmd,
1142
+ capture_output=True,
1143
+ text=True,
1144
+ encoding="utf-8",
1145
+ errors="ignore",
1146
+ timeout=rg_timeout,
1147
+ )
1148
+ except subprocess.TimeoutExpired:
1149
+ debug_log(f"[!] rg timed out after {rg_timeout}s using {pf}; skipping remaining pattern chunks.\n")
1150
+ break
1151
+ except Exception as e:
1152
+ debug_log(f"[!] rg failed to start: {e}\n")
1153
+ with RG_ENABLED_LOCK:
1154
+ RG_ENABLED = False
1155
+ break
1156
+
1157
+ if proc.returncode not in (0, 1): # 1 = no matches
1158
+ stderr_msg = proc.stderr.strip()
1159
+ debug_log(f"[!] rg exit {proc.returncode} on {pf}: {stderr_msg}\n")
1160
+ if "PCRE2 is not available" in stderr_msg or "PCRE2" in stderr_msg:
1161
+ raise SystemExit(
1162
+ "[!] ripgrep is installed without PCRE2 support. Install a PCRE2-enabled ripgrep "
1163
+ "(e.g., package 'ripgrep-pcre2' or 'cargo install ripgrep --features \"pcre2\" --locked')."
1164
+ )
1165
+ with RG_ENABLED_LOCK:
1166
+ RG_ENABLED = False
1167
+ break
1168
+
1169
+ for line in proc.stdout.splitlines():
1170
+ try:
1171
+ rec = json.loads(line)
1172
+ except json.JSONDecodeError:
1173
+ continue
1174
+ if rec.get("type") != "match":
1175
+ continue
1176
+ path = rec.get("data", {}).get("path", {}).get("text")
1177
+ if not path:
1178
+ continue
1179
+ abs_path = _norm(path)
1180
+ if abs_path not in abs_map:
1181
+ continue
1182
+ url = abs_map[abs_path]
1183
+ for sub in rec.get("data", {}).get("submatches", []):
1184
+ if "match" not in sub:
1185
+ continue
1186
+ start = sub.get("byte_offset", sub.get("start"))
1187
+ if start is None:
1188
+ continue
1189
+ match_text = sub["match"].get("text", "")
1190
+ line_text = rec.get("data", {}).get("lines", {}).get("text", "")
1191
+ if len(line_text) > 500:
1192
+ line_text = line_text[:500] + "...(truncated)"
1193
+ results[url].append(
1194
+ {
1195
+ "severity": "secret",
1196
+ "source": url,
1197
+ "index": start,
1198
+ "match": match_text,
1199
+ "pattern": rec.get("data", {}).get("pattern", ""),
1200
+ "context": line_text.strip(),
1201
+ }
1202
+ )
1203
+ return results
1204
+
1205
+ def scan_files_python(file_map: Dict[str, str]) -> Dict[str, List[Dict[str, Any]]]:
1206
+ results: Dict[str, List[Dict[str, Any]]] = {url: [] for url in file_map.values()}
1207
+ for path, url in file_map.items():
1208
+ if STOP_EVENT.is_set():
1209
+ break
1210
+ try:
1211
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
1212
+ body = f.read()
1213
+ except OSError as e:
1214
+ debug_log(f"[!] Failed to read {path} for {url}: {e}\n")
1215
+ continue
1216
+ for label, chunk in pattern_chunks:
1217
+ results[url].extend(
1218
+ scan_one_chunk(
1219
+ label,
1220
+ chunk,
1221
+ body,
1222
+ url,
1223
+ True, # use false-positive filtering
1224
+ None,
1225
+ False,
1226
+ )
1227
+ )
1228
+ return results
1229
+
1230
+ # Config
1231
+ connect_timeout = args.timeout if args.timeout is not None else DEFAULT_CONNECT_TIMEOUT
1232
+ read_timeout = args.timeout if args.timeout is not None else DEFAULT_READ_TIMEOUT
1233
+ trust_env = True # Honor proxy environment variables
1234
+ user_agent = None if args.user_agent in (None, "") else args.user_agent
1235
+ max_body_bytes = DEFAULT_MAX_BODY_BYTES
1236
+ session_params = dict(
1237
+ retries=args.retries,
1238
+ backoff=1.0, # Default backoff factor
1239
+ trust_env=trust_env,
1240
+ pool_maxsize=pool_maxsize,
1241
+ )
1242
+
1243
+ total_urls = len(urls)
1244
+ progress_enabled = args.progress and total_urls > 0 and Progress is not None
1245
+ progress_cm = None
1246
+ progress = None
1247
+ progress_task = None
1248
+ if progress_enabled:
1249
+ progress_cm = Progress(
1250
+ SpinnerColumn(),
1251
+ BarColumn(bar_width=PROGRESS_BAR_WIDTH),
1252
+ MofNCompleteColumn(),
1253
+ TimeElapsedColumn(),
1254
+ TimeRemainingColumn(),
1255
+ TextColumn("{task.fields[url]}", justify="left"),
1256
+ transient=False,
1257
+ console=None,
1258
+ )
1259
+ progress = progress_cm.__enter__()
1260
+ progress_task = progress.add_task("scan", total=total_urls, url="")
1261
+
1262
+ # Determine output behavior: default overwrites once, then appends per write
1263
+ output_mode = "a"
1264
+ if args.output and not args.append:
1265
+ try:
1266
+ with open(args.output, "w", encoding="utf-8") as _:
1267
+ pass # truncate existing file for a fresh run
1268
+ except OSError as e:
1269
+ raise SystemExit(f"[!] Failed to open output file {args.output}: {e}")
1270
+
1271
+ def advance_progress(url_msg: str):
1272
+ if not progress_enabled or progress is None or progress_task is None:
1273
+ return
1274
+ progress.update(progress_task, advance=1, url=url_msg)
1275
+
1276
+ remaining = list(urls)
1277
+ processed = 0
1278
+ try:
1279
+ batch_dir = temp_dir
1280
+ buffer_dir = temp_dir2
1281
+ clear_dir(batch_dir)
1282
+ clear_dir(buffer_dir)
1283
+ batch_map: Dict[str, str] = {}
1284
+ buffer_map: Dict[str, str] = {}
1285
+ buffer_failed: List[str] = []
1286
+ while remaining or batch_map or buffer_map:
1287
+ try:
1288
+ if STOP_EVENT.is_set():
1289
+ break
1290
+ # Fill batch if empty
1291
+ if not batch_map and remaining:
1292
+ take = remaining[:max_parallel]
1293
+ remaining = remaining[max_parallel:]
1294
+ batch_map, failed_batch = fetch_batch(take, batch_dir)
1295
+ for url in failed_batch:
1296
+ advance_progress(f"{url} (failed)")
1297
+ processed += 1
1298
+ # If this batch yielded nothing (all failed), try next chunk
1299
+ if not batch_map:
1300
+ continue
1301
+ if not batch_map and not remaining and not buffer_map:
1302
+ # nothing left anywhere
1303
+ break
1304
+
1305
+ # Start rg on batch
1306
+ rg_results: Dict[str, List[Dict[str, Any]]] = {}
1307
+ rg_thread = ThreadPoolExecutor(max_workers=1)
1308
+ rg_future = rg_thread.submit(run_rg_on_dir, batch_dir, batch_map, pattern_files)
1309
+
1310
+ # While rg runs, fill buffer up to nou
1311
+ buffer_map = {}
1312
+ if remaining:
1313
+ take_buf = remaining[:max_parallel]
1314
+ remaining = remaining[max_parallel:]
1315
+ buffer_map, buffer_failed = fetch_batch(take_buf, buffer_dir)
1316
+ for url in buffer_failed:
1317
+ advance_progress(f"{url} (failed)")
1318
+ processed += 1
1319
+
1320
+ rg_results = rg_future.result()
1321
+ rg_thread.shutdown(wait=True, cancel_futures=True)
1322
+ with RG_ENABLED_LOCK:
1323
+ rg_active = RG_ENABLED
1324
+ if not rg_active:
1325
+ rg_results = scan_files_python(batch_map)
1326
+ except KeyboardInterrupt:
1327
+ STOP_EVENT.set()
1328
+ try:
1329
+ rg_thread.shutdown(wait=False, cancel_futures=True) # type: ignore[name-defined]
1330
+ except Exception:
1331
+ pass
1332
+ break
1333
+
1334
+ # Emit results per URL after rg finishes (dedupe per URL only)
1335
+ for url in batch_map.values():
1336
+ findings = deduplicate_findings(rg_results.get(url, []))
1337
+ if findings:
1338
+ if args.output:
1339
+ with open(args.output, output_mode, encoding="utf-8") as tf:
1340
+ tf.write(f"=== Results for {url} ===\n")
1341
+ for r in findings:
1342
+ tf.write(format_finding(r, color=False) + "\n")
1343
+ tf.write("\n")
1344
+ if not args.quiet:
1345
+ print(f"=== Results for {url} ===")
1346
+ for r in findings:
1347
+ print(format_finding(r, color=args.color))
1348
+ print()
1349
+ advance_progress(url)
1350
+ processed += 1
1351
+
1352
+ clear_dir(batch_dir)
1353
+ # Swap batch and buffer for next cycle
1354
+ batch_dir, buffer_dir = buffer_dir, batch_dir
1355
+ batch_map = buffer_map
1356
+ buffer_map = {}
1357
+ buffer_failed = []
1358
+ except KeyboardInterrupt:
1359
+ STOP_EVENT.set()
1360
+ finally:
1361
+ if progress_enabled and progress is not None and progress_task is not None:
1362
+ try:
1363
+ progress.update(progress_task, completed=processed)
1364
+ except Exception:
1365
+ pass
1366
+ if progress_cm is not None:
1367
+ try:
1368
+ progress_cm.__exit__(None, None, None)
1369
+ except Exception:
1370
+ pass
1371
+
1372
+ if html_requested and html_output_path:
1373
+ html_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "html_secret_gen.py")
1374
+ cmd = [
1375
+ sys.executable or "python3",
1376
+ html_script,
1377
+ "-i",
1378
+ args.output,
1379
+ "-o",
1380
+ html_output_path,
1381
+ ]
1382
+ try:
1383
+ subprocess.run(cmd, check=True)
1384
+ except subprocess.CalledProcessError as e:
1385
+ raise SystemExit(f"[!] Failed to generate HTML report: {e}")
1386
+
1387
+
1388
+ if __name__ == "__main__":
1389
+ main()