deadpush 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deadpush/__init__.py +1 -0
- deadpush/churn.py +189 -0
- deadpush/cli.py +1584 -0
- deadpush/comments.py +265 -0
- deadpush/complexity.py +254 -0
- deadpush/config.py +284 -0
- deadpush/crawler.py +133 -0
- deadpush/deadness.py +477 -0
- deadpush/debris.py +729 -0
- deadpush/deps.py +323 -0
- deadpush/deps_guard.py +382 -0
- deadpush/entrypoints.py +193 -0
- deadpush/graph.py +401 -0
- deadpush/guard.py +1386 -0
- deadpush/hooks.py +369 -0
- deadpush/importgraph.py +122 -0
- deadpush/imports.py +239 -0
- deadpush/intercept.py +995 -0
- deadpush/languages/__init__.py +143 -0
- deadpush/languages/base.py +70 -0
- deadpush/languages/cpp.py +150 -0
- deadpush/languages/go_.py +177 -0
- deadpush/languages/java.py +185 -0
- deadpush/languages/javascript.py +202 -0
- deadpush/languages/python_.py +278 -0
- deadpush/languages/rust.py +147 -0
- deadpush/languages/typescript.py +192 -0
- deadpush/layers.py +197 -0
- deadpush/mcp_server.py +1061 -0
- deadpush/reachability.py +183 -0
- deadpush/registration.py +280 -0
- deadpush/report.py +113 -0
- deadpush/rules.py +190 -0
- deadpush/sarif.py +123 -0
- deadpush/scorer.py +151 -0
- deadpush/security.py +187 -0
- deadpush/session.py +224 -0
- deadpush/tests.py +333 -0
- deadpush/ui.py +156 -0
- deadpush/verifier.py +168 -0
- deadpush/watch.py +103 -0
- deadpush-0.2.0.dist-info/METADATA +230 -0
- deadpush-0.2.0.dist-info/RECORD +46 -0
- deadpush-0.2.0.dist-info/WHEEL +4 -0
- deadpush-0.2.0.dist-info/entry_points.txt +2 -0
- deadpush-0.2.0.dist-info/licenses/LICENSE +21 -0
deadpush/debris.py
ADDED
|
@@ -0,0 +1,729 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Semantic Debris Detection - Production Level
|
|
3
|
+
|
|
4
|
+
Includes:
|
|
5
|
+
- LLM context files, vibe scratchpads, env files, chat exports
|
|
6
|
+
- Content-based + filename-based detection
|
|
7
|
+
- **Structural duplicate detection** using Python AST (detects AI-regenerated files)
|
|
8
|
+
- Content hash + name similarity fallback
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import ast
|
|
14
|
+
import hashlib
|
|
15
|
+
import re
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from .config import Config
|
|
22
|
+
from .crawler import FileInfo
|
|
23
|
+
from .graph import DebrisFile, content_hash
|
|
24
|
+
|
|
25
|
+
# Compiled regexes used by DebrisDetector._check_anti_patterns to spot
# exception handlers whose body discards the error. The first two use
# re.DOTALL so that `.*?` can run across lines from `try:` to `except`.
SILENT_FAILURE_PATTERNS = [
    # try ... except <anything>: then, on a following line, pass / a comment / an ellipsis
    re.compile(r'try\s*:.*?except\s*[^:]*:\s*\n\s*(?:pass|#.*|\.{3}|\.\.\.)', re.DOTALL),
    # Same shape, but with an extra line break before pass / return None / return ""
    re.compile(r'try\s*:.*?except\s*[^:]*:\s*\n\s*\n\s*(?:pass|return\s+None|return\s+"")', re.DOTALL),
    # Single-line handler: except [Exception|BaseException|Error|RuntimeError|(...)]: pass-or-comment
    re.compile(r'except\s*(?:Exception|BaseException|Error|RuntimeError|\(.*?\))?\s*:\s*(?:pass|#.*)'),
    # Bare except: followed by a TODO/FIXME/HACK/ignore comment (case-insensitive)
    re.compile(r'except\s*:\s*#\s*(?:TODO|FIXME|HACK|ignore)', re.IGNORECASE),
]
|
|
31
|
+
|
|
32
|
+
import math
|
|
33
|
+
import string
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# =============================================================================
|
|
37
|
+
# Category Definitions
|
|
38
|
+
# =============================================================================
|
|
39
|
+
# Lower-case file names that DebrisDetector._check_filename compares against
# the lower-cased file name and relative path.
LLM_CONTEXT_FILES = {
    "claude.md", "claude_context.md", ".claude_instructions",
    ".cursorrules", "cursor_rules.md", ".cursorignore",
    ".copilot-instructions.md", "agents.md", "windsurf_rules.md",
    "llm_context.txt", "ai_prompt.md", "system_prompt.txt",
}

# Fragments looked for as substrings of the lower-cased file name.
VIBE_SCRATCHPAD_NAMES = {
    "scratch", "playground", "temp", "tmp", "untitled", "copy_of",
    "backup", "old", "new", "v2", "final", "todo_delete", "debug",
}

# Speaker-prefix regexes; _check_content applies them with
# re.IGNORECASE | re.MULTILINE, so `^` anchors at every line start.
CHAT_PATTERNS = [
    r"^\s*(User|Assistant|Human|System):\s",
    r"^(Human|Assistant):",
]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class _FileSignature:
    """Lightweight structural signature for duplicate detection."""
    # One entry per function definition: the function name immediately
    # followed by the tuple repr of its positional argument names.
    functions: frozenset[str]
    # Names of every class definition found in the file.
    classes: frozenset[str]
    # The `name` of every alias in `import` / `from ... import` statements.
    imports: frozenset[str]
    # Number of AST nodes visited while walking the parsed file.
    total_nodes: int
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class DebrisDetector:
|
|
67
|
+
def __init__(self, config: Config):
|
|
68
|
+
self.config = config
|
|
69
|
+
self._content_hashes: dict[str, list[Path]] = defaultdict(list)
|
|
70
|
+
self._signatures: dict[Path, _FileSignature] = {}
|
|
71
|
+
|
|
72
|
+
def scan(self, files: list[FileInfo]) -> list[DebrisFile]:
    """Scan *files* and return one DebrisFile for every file that raised a flag.

    The scan runs in two passes: the first builds the content-hash and
    structural-signature indexes, the second checks every file against them.

    Args:
        files: Crawled files; each must expose ``path``, ``size`` and ``is_text``.

    Returns:
        Debris entries sorted with push-blocking items first, then by
        descending confidence, then by path.
    """
    # Rebuild the indexes from scratch: without this, a second scan() on the
    # same detector would still compare against paths from the earlier call.
    self._content_hashes.clear()
    self._signatures.clear()

    size_limit = 2 * 1024 * 1024
    for f in files:
        # Only text files under the size limit are hashed and parsed.
        if not (f.is_text and f.size < size_limit):
            continue
        digest = content_hash(f.path)
        if digest:
            self._content_hashes[digest].append(f.path)
        if f.path.suffix == ".py":
            sig = self._extract_python_signature(f.path)
            if sig:
                self._signatures[f.path] = sig

    results: list[DebrisFile] = []
    for f in files:
        flags = self._check_file(f, files)
        if flags:
            results.append(self._build_debris_file(f, flags))

    return sorted(results, key=lambda d: (not d.block_push, -d.confidence, d.path))
|
|
93
|
+
|
|
94
|
+
# -------------------------------------------------------------------------
|
|
95
|
+
# Per-file checking
|
|
96
|
+
# -------------------------------------------------------------------------
|
|
97
|
+
def _check_file(self, f: FileInfo, all_files: list[FileInfo]) -> list[dict[str, Any]]:
|
|
98
|
+
flags = []
|
|
99
|
+
flags += self._check_filename(f)
|
|
100
|
+
content = None
|
|
101
|
+
if f.is_text and f.size < 2 * 1024 * 1024:
|
|
102
|
+
try:
|
|
103
|
+
content = f.path.read_text(encoding="utf-8", errors="ignore")
|
|
104
|
+
except Exception:
|
|
105
|
+
pass
|
|
106
|
+
if content is not None:
|
|
107
|
+
flags += self._check_content(f, content)
|
|
108
|
+
flags += self._detect_hardcoded_secrets(f, content)
|
|
109
|
+
flags += self._check_anti_patterns(f, content)
|
|
110
|
+
flags += self._check_prompt_injection(f, content)
|
|
111
|
+
flags += self._check_duplicates(f, all_files)
|
|
112
|
+
flags += self._check_structural_duplicates(f, all_files)
|
|
113
|
+
flags += self._check_git_status(f)
|
|
114
|
+
return flags
|
|
115
|
+
|
|
116
|
+
def _check_filename(self, f: FileInfo) -> list[dict[str, Any]]:
    """Flag files whose name or location marks them as AI/dev leftovers.

    Three independent checks are applied: known LLM context files (blocks
    the push), scratchpad-style names, and copy/backup-style suffixes.

    Args:
        f: File to inspect; ``path`` and ``rel_path`` are read.

    Returns:
        Zero or more flag dicts with ``category``, ``confidence``,
        ``reason``, ``block`` and ``suggestion`` keys.
    """
    flags = []
    name_lower = f.path.name.lower()
    # Compare whole path components rather than substrings of the path, so
    # "docs/user_agents.md" is not blocked just because it contains
    # "agents.md". Both separators are split so Windows paths work too.
    rel_parts = set(re.split(r"[\\/]+", str(f.rel_path).lower()))

    if name_lower in LLM_CONTEXT_FILES or not LLM_CONTEXT_FILES.isdisjoint(rel_parts):
        flags.append({
            "category": "llm_context_file",
            "confidence": 0.99,
            "reason": f"Known LLM/AI coding assistant context file: {f.path.name}",
            "block": True,
            "suggestion": "Add to .gitignore. These should never be committed.",
        })

    # One scratchpad flag per file is enough, however many fragments match.
    if any(bad in name_lower for bad in VIBE_SCRATCHPAD_NAMES):
        flags.append({
            "category": "vibe_scratchpad",
            "confidence": 0.82,
            "reason": f"Looks like a temporary/AI scratch file: {f.path.name}",
            "block": False,
            "suggestion": "Delete or move to a gitignored location.",
        })

    copy_markers = ("_copy", "_backup", "_old", "_v2", "_final", "_new")
    if any(marker in name_lower for marker in copy_markers):
        flags.append({
            "category": "duplicate_file",
            "confidence": 0.75,
            "reason": f"Filename suggests copy/regenerated version: {f.path.name}",
            "block": False,
            "suggestion": "Compare with original and remove duplicate.",
        })

    return flags
|
|
151
|
+
|
|
152
|
+
def _check_content(self, f: FileInfo, content: str | None = None) -> list[dict[str, Any]]:
    """Flag AI prompt text, chat-log formatting and local ``.env`` files.

    Only the first 55 lines are inspected for prompt and chat markers.

    Args:
        f: File being checked; ``path`` is read.
        content: Full file text if the caller already has it. When ``None``
            the head of the file is read from disk.

    Returns:
        Flag dicts; empty when the file cannot be read.
    """
    flags = []
    if content is not None:
        head = "\n".join(content.splitlines()[:55])
    else:
        from itertools import islice

        try:
            with f.path.open("r", encoding="utf-8", errors="ignore") as fh:
                # islice simply stops at end of file. Calling next() 55 times
                # raised StopIteration on shorter files, which the handler
                # below swallowed, so those files were never checked.
                head = "".join(islice(fh, 55))
        except Exception:
            # Best-effort: an unreadable file produces no content flags.
            return flags

    content_lower = head.lower()

    prompt_markers = (
        "you are a helpful assistant", "you are an expert software engineer",
        "as an ai coding assistant", "claude", "cursor rules",
    )
    if any(marker in content_lower for marker in prompt_markers):
        flags.append({
            "category": "llm_context_file",
            "confidence": 0.96,
            "reason": "Contains AI system prompt or context instructions",
            "block": True,
            "suggestion": "This appears to be an exported AI context/prompt file.",
        })

    # A single chat_export flag is reported even if several patterns match.
    if any(re.search(pattern, head, re.IGNORECASE | re.MULTILINE) for pattern in CHAT_PATTERNS):
        flags.append({
            "category": "chat_export",
            "confidence": 0.88,
            "reason": "Matches exported LLM chat log format",
            "block": False,
            "suggestion": "Remove chat export files from the repository.",
        })

    file_name = f.path.name
    if file_name.startswith(".env") and any(x in file_name.lower() for x in ["local", "dev", "development"]):
        flags.append({
            "category": "env_file",
            "confidence": 0.97,
            "reason": "Committed local/development environment file",
            "block": True,
            "suggestion": "Add to .gitignore and rotate any exposed secrets.",
        })

    return flags
|
|
199
|
+
|
|
200
|
+
# -------------------------------------------------------------------------
|
|
201
|
+
# Content + Name Duplicate Detection
|
|
202
|
+
# -------------------------------------------------------------------------
|
|
203
|
+
def _check_duplicates(self, f: FileInfo, all_files: list[FileInfo]) -> list[dict[str, Any]]:
|
|
204
|
+
flags = []
|
|
205
|
+
my_hash = content_hash(f.path)
|
|
206
|
+
if my_hash:
|
|
207
|
+
duplicates = [p for p in self._content_hashes.get(my_hash, []) if p != f.path]
|
|
208
|
+
if duplicates:
|
|
209
|
+
flags.append({
|
|
210
|
+
"category": "duplicate_file",
|
|
211
|
+
"confidence": 0.99,
|
|
212
|
+
"reason": f"Exact content duplicate of: {duplicates[0].name}",
|
|
213
|
+
"block": False,
|
|
214
|
+
"suggestion": "Delete all but one copy.",
|
|
215
|
+
})
|
|
216
|
+
|
|
217
|
+
# Name similarity
|
|
218
|
+
for other in all_files:
|
|
219
|
+
if other.path == f.path:
|
|
220
|
+
continue
|
|
221
|
+
if self._name_similarity(f.path.name, other.path.name) > 0.80:
|
|
222
|
+
flags.append({
|
|
223
|
+
"category": "duplicate_file",
|
|
224
|
+
"confidence": 0.70,
|
|
225
|
+
"reason": f"Very similar filename to existing file: {other.rel_path}",
|
|
226
|
+
"block": False,
|
|
227
|
+
"suggestion": "Review — this may be an AI-regenerated duplicate.",
|
|
228
|
+
})
|
|
229
|
+
break
|
|
230
|
+
return flags
|
|
231
|
+
|
|
232
|
+
def _name_similarity(self, a: str, b: str) -> float:
|
|
233
|
+
def bigrams(s): return {s[i:i+2] for i in range(len(s)-1)}
|
|
234
|
+
inter = len(bigrams(a) & bigrams(b))
|
|
235
|
+
union = len(bigrams(a) | bigrams(b))
|
|
236
|
+
return inter / union if union else 0.0
|
|
237
|
+
|
|
238
|
+
# -------------------------------------------------------------------------
|
|
239
|
+
# NEW: Structural / AST-based Duplicate Detection (The Wow Feature)
|
|
240
|
+
# -------------------------------------------------------------------------
|
|
241
|
+
def _extract_python_signature(self, path: Path) -> _FileSignature | None:
|
|
242
|
+
"""Extract structural signature of a Python file using AST."""
|
|
243
|
+
try:
|
|
244
|
+
with path.open("r", encoding="utf-8", errors="ignore") as f:
|
|
245
|
+
tree = ast.parse(f.read(), filename=str(path))
|
|
246
|
+
except Exception:
|
|
247
|
+
return None
|
|
248
|
+
|
|
249
|
+
functions = set()
|
|
250
|
+
classes = set()
|
|
251
|
+
imports = set()
|
|
252
|
+
total_nodes = 0
|
|
253
|
+
|
|
254
|
+
for node in ast.walk(tree):
|
|
255
|
+
total_nodes += 1
|
|
256
|
+
if isinstance(node, ast.FunctionDef):
|
|
257
|
+
args = tuple(arg.arg for arg in node.args.args)
|
|
258
|
+
functions.add(f"{node.name}{args}")
|
|
259
|
+
elif isinstance(node, ast.ClassDef):
|
|
260
|
+
classes.add(node.name)
|
|
261
|
+
elif isinstance(node, (ast.Import, ast.ImportFrom)):
|
|
262
|
+
for alias in node.names:
|
|
263
|
+
imports.add(alias.name)
|
|
264
|
+
|
|
265
|
+
return _FileSignature(
|
|
266
|
+
functions=frozenset(functions),
|
|
267
|
+
classes=frozenset(classes),
|
|
268
|
+
imports=frozenset(imports),
|
|
269
|
+
total_nodes=total_nodes
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
def _check_structural_duplicates(self, f: FileInfo, all_files: list[FileInfo]) -> list[dict[str, Any]]:
|
|
273
|
+
"""
|
|
274
|
+
Detect files that have very similar structure to existing files.
|
|
275
|
+
This catches cases where an LLM was asked to "rewrite" or "improve" a file
|
|
276
|
+
and created a near-duplicate instead of editing the original.
|
|
277
|
+
"""
|
|
278
|
+
if f.path.suffix != ".py" or f.path not in self._signatures:
|
|
279
|
+
return []
|
|
280
|
+
|
|
281
|
+
flags = []
|
|
282
|
+
my_sig = self._signatures[f.path]
|
|
283
|
+
|
|
284
|
+
for other_path, other_sig in self._signatures.items():
|
|
285
|
+
if other_path == f.path:
|
|
286
|
+
continue
|
|
287
|
+
|
|
288
|
+
func_overlap = len(my_sig.functions & other_sig.functions)
|
|
289
|
+
class_overlap = len(my_sig.classes & other_sig.classes)
|
|
290
|
+
total_unique = len(my_sig.functions | other_sig.functions) + len(my_sig.classes | other_sig.classes)
|
|
291
|
+
|
|
292
|
+
if total_unique == 0:
|
|
293
|
+
continue
|
|
294
|
+
|
|
295
|
+
similarity = (func_overlap + class_overlap) / total_unique
|
|
296
|
+
|
|
297
|
+
if similarity > 0.75 and len(my_sig.functions) > 1:
|
|
298
|
+
flags.append({
|
|
299
|
+
"category": "ai_regenerated_duplicate",
|
|
300
|
+
"confidence": min(0.92, 0.65 + similarity * 0.3),
|
|
301
|
+
"reason": f"Structurally very similar to {other_path.name} (likely AI-regenerated copy)",
|
|
302
|
+
"block": False,
|
|
303
|
+
"suggestion": "Compare with original. Delete the regenerated version and edit the original instead.",
|
|
304
|
+
})
|
|
305
|
+
break
|
|
306
|
+
|
|
307
|
+
return flags
|
|
308
|
+
|
|
309
|
+
# -------------------------------------------------------------------------
|
|
310
|
+
# ADVANCED Hardcoded Secrets Detection (Production-Grade)
|
|
311
|
+
# -------------------------------------------------------------------------
|
|
312
|
+
def _detect_hardcoded_secrets(self, f: FileInfo, content: str | None = None) -> list[dict[str, Any]]:
|
|
313
|
+
"""
|
|
314
|
+
Advanced, multi-layered secret detection engine.
|
|
315
|
+
|
|
316
|
+
Techniques used:
|
|
317
|
+
- High-order entropy analysis (bigram/trigram aware)
|
|
318
|
+
- Keyword proximity scoring (how close "secret"/"key" is to candidate)
|
|
319
|
+
- AST-aware context analysis (for Python)
|
|
320
|
+
- Known high-value secret formats with validation
|
|
321
|
+
- Obfuscation detection (concatenation, base64-ish strings)
|
|
322
|
+
- Strong false-positive filtering
|
|
323
|
+
"""
|
|
324
|
+
if not f.is_text or f.size > 800_000:
|
|
325
|
+
return []
|
|
326
|
+
|
|
327
|
+
flags = []
|
|
328
|
+
if content is None:
|
|
329
|
+
try:
|
|
330
|
+
content = f.path.read_text(encoding="utf-8", errors="ignore")
|
|
331
|
+
except Exception:
|
|
332
|
+
return flags
|
|
333
|
+
lines = content.splitlines()
|
|
334
|
+
|
|
335
|
+
# === Layer 1: High-confidence known formats ===
|
|
336
|
+
known_formats = self._get_known_secret_formats()
|
|
337
|
+
for i, line in enumerate(lines):
|
|
338
|
+
for pattern, secret_type, confidence in known_formats:
|
|
339
|
+
if re.search(pattern, line):
|
|
340
|
+
flags.append({
|
|
341
|
+
"category": "hardcoded_secret",
|
|
342
|
+
"confidence": confidence,
|
|
343
|
+
"reason": f"High-confidence {secret_type} detected (line {i+1})",
|
|
344
|
+
"block": True,
|
|
345
|
+
"suggestion": f"Remove the hardcoded {secret_type}. Use a secrets manager or environment variables immediately.",
|
|
346
|
+
})
|
|
347
|
+
|
|
348
|
+
# === Layer 2: Advanced entropy + context scoring ===
|
|
349
|
+
candidates = self._extract_potential_secrets(content, lines)
|
|
350
|
+
|
|
351
|
+
for candidate in candidates:
|
|
352
|
+
score, reasons = self._score_secret_candidate(candidate, content)
|
|
353
|
+
|
|
354
|
+
if score >= 0.78: # High confidence threshold
|
|
355
|
+
flags.append({
|
|
356
|
+
"category": "hardcoded_secret",
|
|
357
|
+
"confidence": round(score, 3),
|
|
358
|
+
"reason": " | ".join(reasons),
|
|
359
|
+
"block": True,
|
|
360
|
+
"suggestion": "This appears to be a hardcoded secret. Move it to environment variables or a proper secrets manager (AWS Secrets Manager, Doppler, Infisical, etc.).",
|
|
361
|
+
})
|
|
362
|
+
|
|
363
|
+
# === Layer 3: Python AST-based deep analysis (most accurate) ===
|
|
364
|
+
if f.path.suffix == ".py":
|
|
365
|
+
ast_flags = self._detect_secrets_via_ast(content, str(f.path))
|
|
366
|
+
flags.extend(ast_flags)
|
|
367
|
+
|
|
368
|
+
# === Layer 4: Obfuscation & reconstruction ===
|
|
369
|
+
obfuscated_flags = self._detect_obfuscated_secrets(content, lines)
|
|
370
|
+
flags.extend(obfuscated_flags)
|
|
371
|
+
|
|
372
|
+
return flags
|
|
373
|
+
|
|
374
|
+
def _get_known_secret_formats(self):
|
|
375
|
+
"""High-precision patterns for well-known secret types."""
|
|
376
|
+
return [
|
|
377
|
+
(r'AKIA[0-9A-Z]{16}', "AWS Access Key ID", 0.97),
|
|
378
|
+
(r'(?i)aws.*secret.*access.*key["\']?\s*[:=]\s*["\']?[A-Za-z0-9/+=]{40}', "AWS Secret Access Key", 0.95),
|
|
379
|
+
(r'ghp_[a-zA-Z0-9]{36}', "GitHub Personal Access Token", 0.98),
|
|
380
|
+
(r'gho_[a-zA-Z0-9]{36}', "GitHub OAuth Token", 0.96),
|
|
381
|
+
(r'ghs_[a-zA-Z0-9]{36}', "GitHub App Token", 0.96),
|
|
382
|
+
(r'sk-[a-zA-Z0-9]{48}', "OpenAI API Key", 0.97),
|
|
383
|
+
(r'sk-ant-[a-zA-Z0-9]{48}', "Anthropic API Key", 0.97),
|
|
384
|
+
(r'AIza[0-9A-Za-z\-_]{35}', "Google API Key", 0.94),
|
|
385
|
+
(r'-----BEGIN (RSA|EC|DSA|OPENSSH) PRIVATE KEY-----', "Private Key", 0.99),
|
|
386
|
+
(r'eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}', "JWT Token", 0.90),
|
|
387
|
+
]
|
|
388
|
+
|
|
389
|
+
def _extract_potential_secrets(self, content: str, lines: list[str]) -> list[dict]:
|
|
390
|
+
"""Extract candidate strings that could be secrets using multiple strategies."""
|
|
391
|
+
candidates = []
|
|
392
|
+
|
|
393
|
+
# Strategy 1: String literals assigned to sensitive-looking variables
|
|
394
|
+
sensitive_keywords = {
|
|
395
|
+
"key", "token", "secret", "password", "passwd", "credential", "auth",
|
|
396
|
+
"api", "private", "access", "bearer", "jwt", "oauth", "client_secret"
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
# Python/JS/TS style assignments
|
|
400
|
+
assignment_pattern = re.compile(
|
|
401
|
+
r'([a-zA-Z_][a-zA-Z0-9_]*)\s*[:=]\s*["\']([A-Za-z0-9_\-/.+=@#$%^&*!~|]{16,})["\']'
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
for i, line in enumerate(lines):
|
|
405
|
+
for match in assignment_pattern.finditer(line):
|
|
406
|
+
var_name = match.group(1).lower()
|
|
407
|
+
value = match.group(2)
|
|
408
|
+
|
|
409
|
+
# Check if variable name contains sensitive keywords
|
|
410
|
+
if any(kw in var_name for kw in sensitive_keywords):
|
|
411
|
+
candidates.append({
|
|
412
|
+
"value": value,
|
|
413
|
+
"line": i + 1,
|
|
414
|
+
"context": line.strip(),
|
|
415
|
+
"variable": match.group(1),
|
|
416
|
+
"type": "assignment"
|
|
417
|
+
})
|
|
418
|
+
|
|
419
|
+
# Strategy 2: High-entropy standalone strings (even without assignment)
|
|
420
|
+
string_literal_pattern = re.compile(r'["\']([A-Za-z0-9_\-/.+=@#$%^&*!~|]{24,})["\']')
|
|
421
|
+
for i, line in enumerate(lines):
|
|
422
|
+
for match in string_literal_pattern.finditer(line):
|
|
423
|
+
value = match.group(1)
|
|
424
|
+
if self._advanced_entropy(value) > 4.2:
|
|
425
|
+
candidates.append({
|
|
426
|
+
"value": value,
|
|
427
|
+
"line": i + 1,
|
|
428
|
+
"context": line.strip(),
|
|
429
|
+
"variable": None,
|
|
430
|
+
"type": "standalone"
|
|
431
|
+
})
|
|
432
|
+
|
|
433
|
+
return candidates
|
|
434
|
+
|
|
435
|
+
def _advanced_entropy(self, s: str) -> float:
|
|
436
|
+
"""More sophisticated entropy calculation using character distribution + bigrams."""
|
|
437
|
+
if len(s) < 16:
|
|
438
|
+
return 0.0
|
|
439
|
+
|
|
440
|
+
# Character-level entropy
|
|
441
|
+
prob = [float(s.count(c)) / len(s) for c in set(s)]
|
|
442
|
+
char_entropy = -sum(p * math.log2(p) for p in prob if p > 0)
|
|
443
|
+
|
|
444
|
+
# Bigram entropy (detects structured vs random strings)
|
|
445
|
+
if len(s) > 2:
|
|
446
|
+
bigrams = [s[i:i+2] for i in range(len(s)-1)]
|
|
447
|
+
bigram_prob = [float(bigrams.count(b)) / len(bigrams) for b in set(bigrams)]
|
|
448
|
+
bigram_entropy = -sum(p * math.log2(p) for p in bigram_prob if p > 0)
|
|
449
|
+
return (char_entropy * 0.6) + (bigram_entropy * 0.4)
|
|
450
|
+
|
|
451
|
+
return char_entropy
|
|
452
|
+
|
|
453
|
+
def _score_secret_candidate(self, candidate: dict, full_content: str) -> tuple[float, list[str]]:
|
|
454
|
+
"""Multi-factor scoring for secret likelihood."""
|
|
455
|
+
value = candidate["value"]
|
|
456
|
+
context = candidate.get("context", "")
|
|
457
|
+
var_name = candidate.get("variable", "") or ""
|
|
458
|
+
|
|
459
|
+
score = 0.0
|
|
460
|
+
reasons = []
|
|
461
|
+
|
|
462
|
+
# Factor 1: Advanced entropy
|
|
463
|
+
entropy = self._advanced_entropy(value)
|
|
464
|
+
if entropy > 4.8:
|
|
465
|
+
score += 0.35
|
|
466
|
+
reasons.append("Very high entropy")
|
|
467
|
+
elif entropy > 4.2:
|
|
468
|
+
score += 0.25
|
|
469
|
+
reasons.append("High entropy")
|
|
470
|
+
|
|
471
|
+
# Factor 2: Character diversity
|
|
472
|
+
diversity = self._character_diversity_score(value)
|
|
473
|
+
if diversity >= 3.5:
|
|
474
|
+
score += 0.20
|
|
475
|
+
reasons.append("High character diversity")
|
|
476
|
+
|
|
477
|
+
# Factor 3: Keyword proximity (very important)
|
|
478
|
+
keyword_score = self._keyword_proximity_score(context, var_name)
|
|
479
|
+
score += keyword_score * 0.25
|
|
480
|
+
if keyword_score > 0.6:
|
|
481
|
+
reasons.append("Strong keyword context")
|
|
482
|
+
|
|
483
|
+
# Factor 4: Looks like base64 / hex
|
|
484
|
+
if self._looks_like_encoded(value):
|
|
485
|
+
score += 0.10
|
|
486
|
+
reasons.append("Looks like encoded/encrypted data")
|
|
487
|
+
|
|
488
|
+
# Factor 5: Length
|
|
489
|
+
if 32 <= len(value) <= 128:
|
|
490
|
+
score += 0.10
|
|
491
|
+
|
|
492
|
+
# Factor 6: Penalize common false positives
|
|
493
|
+
if self._is_common_false_positive(value):
|
|
494
|
+
score -= 0.40
|
|
495
|
+
reasons.append("Common test/example value (penalized)")
|
|
496
|
+
|
|
497
|
+
return min(max(score, 0.0), 1.0), reasons
|
|
498
|
+
|
|
499
|
+
def _character_diversity_score(self, s: str) -> float:
|
|
500
|
+
has_upper = bool(re.search(r'[A-Z]', s))
|
|
501
|
+
has_lower = bool(re.search(r'[a-z]', s))
|
|
502
|
+
has_digit = bool(re.search(r'[0-9]', s))
|
|
503
|
+
has_special = bool(re.search(r'[^A-Za-z0-9]', s))
|
|
504
|
+
return sum([has_upper, has_lower, has_digit, has_special])
|
|
505
|
+
|
|
506
|
+
def _keyword_proximity_score(self, context: str, var_name: str) -> float:
|
|
507
|
+
"""How strongly the surrounding context indicates a secret."""
|
|
508
|
+
keywords = ["secret", "key", "token", "password", "credential", "auth", "private", "api"]
|
|
509
|
+
context_lower = (context + " " + var_name).lower()
|
|
510
|
+
|
|
511
|
+
matches = sum(1 for kw in keywords if kw in context_lower)
|
|
512
|
+
return min(matches / 3.0, 1.0)
|
|
513
|
+
|
|
514
|
+
def _looks_like_encoded(self, s: str) -> bool:
|
|
515
|
+
"""Detect base64-like or hex-like strings."""
|
|
516
|
+
if len(s) % 4 == 0 and re.match(r'^[A-Za-z0-9+/=]+$', s):
|
|
517
|
+
return True
|
|
518
|
+
if re.match(r'^[0-9a-fA-F]+$', s) and len(s) > 32:
|
|
519
|
+
return True
|
|
520
|
+
return False
|
|
521
|
+
|
|
522
|
+
def _is_common_false_positive(self, s: str) -> bool:
|
|
523
|
+
"""Filter out common example/test values."""
|
|
524
|
+
false_positives = {
|
|
525
|
+
"your_api_key_here", "insert_token_here", "example", "test", "demo",
|
|
526
|
+
"placeholder", "changeme", "secret123", "password123", "AKIAIOSFODNN7EXAMPLE"
|
|
527
|
+
}
|
|
528
|
+
s_lower = s.lower()
|
|
529
|
+
return any(fp in s_lower for fp in false_positives) or s.startswith("EXAMPLE")
|
|
530
|
+
|
|
531
|
+
# -------------------------------------------------------------------------
|
|
532
|
+
# Python AST-based Secret Detection (Deep & Accurate)
|
|
533
|
+
# -------------------------------------------------------------------------
|
|
534
|
+
def _detect_secrets_via_ast(self, content: str, filepath: str) -> list[dict[str, Any]]:
|
|
535
|
+
"""Use Python AST to find secrets assigned to sensitive variables more accurately."""
|
|
536
|
+
flags = []
|
|
537
|
+
try:
|
|
538
|
+
tree = ast.parse(content, filename=filepath)
|
|
539
|
+
except Exception:
|
|
540
|
+
return flags
|
|
541
|
+
|
|
542
|
+
sensitive_names = {
|
|
543
|
+
"key", "token", "secret", "password", "credential", "auth",
|
|
544
|
+
"apikey", "api_key", "private_key", "access_key", "bearer_token"
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
for node in ast.walk(tree):
|
|
548
|
+
if isinstance(node, ast.Assign):
|
|
549
|
+
for target in node.targets:
|
|
550
|
+
if isinstance(target, ast.Name):
|
|
551
|
+
var_name = target.id.lower()
|
|
552
|
+
if any(s in var_name for s in sensitive_names):
|
|
553
|
+
val = getattr(node, "value", None)
|
|
554
|
+
if isinstance(val, ast.Constant) and isinstance(val.value, str):
|
|
555
|
+
value = node.value.value
|
|
556
|
+
if len(value) > 16 and self._advanced_entropy(value) > 4.0:
|
|
557
|
+
if not self._is_common_false_positive(value):
|
|
558
|
+
flags.append({
|
|
559
|
+
"category": "hardcoded_secret",
|
|
560
|
+
"confidence": 0.91,
|
|
561
|
+
"reason": f"Secret-like value assigned to '{target.id}' via AST analysis",
|
|
562
|
+
"block": True,
|
|
563
|
+
"suggestion": "Move this secret out of source code into environment variables or a secrets manager.",
|
|
564
|
+
})
|
|
565
|
+
|
|
566
|
+
# Also detect os.environ / os.getenv usage with string literals (sometimes people do bad things)
|
|
567
|
+
if isinstance(node, ast.Call):
|
|
568
|
+
if isinstance(node.func, ast.Attribute) and node.func.attr in ("getenv", "environ"):
|
|
569
|
+
for arg in node.args:
|
|
570
|
+
if isinstance(arg, ast.Constant) and isinstance(getattr(arg, "value", None), str):
|
|
571
|
+
if any(s in arg.value.lower() for s in ["key", "token", "secret"]):
|
|
572
|
+
# This is actually good practice, so we don't flag it
|
|
573
|
+
pass
|
|
574
|
+
|
|
575
|
+
return flags
|
|
576
|
+
|
|
577
|
+
# -------------------------------------------------------------------------
|
|
578
|
+
# Obfuscation Detection (Concatenation, Base64, etc.)
|
|
579
|
+
# -------------------------------------------------------------------------
|
|
580
|
+
def _detect_obfuscated_secrets(self, content: str, lines: list[str] | None = None) -> list[dict[str, Any]]:
|
|
581
|
+
"""Detect secrets that are split or encoded to evade simple detection."""
|
|
582
|
+
flags = []
|
|
583
|
+
|
|
584
|
+
# Detect string concatenation patterns (common obfuscation)
|
|
585
|
+
concat_pattern = re.compile(
|
|
586
|
+
r'["\']([A-Za-z0-9_\-/.+=]{8,})["\']\s*\+\s*["\']([A-Za-z0-9_\-/.+=]{8,})["\']'
|
|
587
|
+
)
|
|
588
|
+
|
|
589
|
+
for i, line in enumerate(lines):
|
|
590
|
+
matches = concat_pattern.findall(line)
|
|
591
|
+
for part1, part2 in matches:
|
|
592
|
+
combined = part1 + part2
|
|
593
|
+
if len(combined) >= 24 and self._advanced_entropy(combined) > 4.3:
|
|
594
|
+
flags.append({
|
|
595
|
+
"category": "hardcoded_secret",
|
|
596
|
+
"confidence": 0.85,
|
|
597
|
+
"reason": f"Secret reconstructed from string concatenation (line {i+1})",
|
|
598
|
+
"block": True,
|
|
599
|
+
"suggestion": "Secrets should never be constructed via string concatenation in source code.",
|
|
600
|
+
})
|
|
601
|
+
|
|
602
|
+
# Attempt base64 decoding on suspicious long strings
|
|
603
|
+
b64_pattern = re.compile(r'["\']([A-Za-z0-9+/=]{40,})["\']')
|
|
604
|
+
for i, line in enumerate(lines):
|
|
605
|
+
for match in b64_pattern.finditer(line):
|
|
606
|
+
try:
|
|
607
|
+
import base64
|
|
608
|
+
decoded = base64.b64decode(match.group(1) + "==").decode("utf-8", errors="ignore")
|
|
609
|
+
if self._advanced_entropy(decoded) > 3.8 or any(kw in decoded.lower() for kw in ["key", "token", "secret"]):
|
|
610
|
+
flags.append({
|
|
611
|
+
"category": "hardcoded_secret",
|
|
612
|
+
"confidence": 0.80,
|
|
613
|
+
"reason": f"Potential base64-encoded secret detected (line {i+1})",
|
|
614
|
+
"block": True,
|
|
615
|
+
"suggestion": "Avoid storing encoded secrets directly in source code.",
|
|
616
|
+
})
|
|
617
|
+
except Exception:
|
|
618
|
+
pass
|
|
619
|
+
|
|
620
|
+
return flags
|
|
621
|
+
|
|
622
|
+
def _check_anti_patterns(self, f: FileInfo, content: str) -> list[dict[str, Any]]:
|
|
623
|
+
"""Detect silent failure patterns that hide bugs."""
|
|
624
|
+
if not f.is_text or f.size > 800_000:
|
|
625
|
+
return []
|
|
626
|
+
flags = []
|
|
627
|
+
for i, pattern in enumerate(SILENT_FAILURE_PATTERNS):
|
|
628
|
+
matches = pattern.findall(content)
|
|
629
|
+
if matches:
|
|
630
|
+
flags.append({
|
|
631
|
+
"category": "silent_failure",
|
|
632
|
+
"confidence": 0.88,
|
|
633
|
+
"reason": f"Empty/silent except block that swallows errors ({len(matches)} occurrence{'s' if len(matches) > 1 else ''})",
|
|
634
|
+
"block": False,
|
|
635
|
+
"suggestion": "Remove the bare except or at minimum log the error. Bare except: pass is a leading cause of silent bugs in AI-generated code.",
|
|
636
|
+
})
|
|
637
|
+
break
|
|
638
|
+
|
|
639
|
+
# Bare except: pass (line-level detection for non-multiline)
|
|
640
|
+
for i, line in enumerate(content.splitlines()):
|
|
641
|
+
stripped = line.strip()
|
|
642
|
+
if stripped == "except:" or stripped == "except: pass":
|
|
643
|
+
flags.append({
|
|
644
|
+
"category": "silent_failure",
|
|
645
|
+
"confidence": 0.95,
|
|
646
|
+
"reason": f"Bare 'except:' on line {i+1} — catches everything silently",
|
|
647
|
+
"block": False,
|
|
648
|
+
"suggestion": "Specify the exception type and at minimum log it.",
|
|
649
|
+
})
|
|
650
|
+
break
|
|
651
|
+
|
|
652
|
+
return flags
|
|
653
|
+
|
|
654
|
+
# -------------------------------------------------------------------------
|
|
655
|
+
# Prompt Injection Detection
|
|
656
|
+
# -------------------------------------------------------------------------
|
|
657
|
+
# (regex, confidence, reason) triples. _check_prompt_injection tries them in
# this order and reports only the first one that matches, so earlier entries
# take precedence over later ones. Every regex is case-insensitive via (?i).
PROMPT_INJECTION_PATTERNS: list[tuple[str, float, str]] = [
    # Direct AI instruction patterns
    (r'(?i)AI[:;]\s*(?:ignore|skip|bypass|disable|don\'?t\s+(?:check|validate|scan))', 0.95, "AI override instruction — tells agents to skip or bypass safety checks"),
    (r'(?i)as\s+(?:an\s+)?AI\s*(?:assistant|agent|model|coding\s+agent)[,;]\s*(?:please|you\s+(?:should|must|will|need\s+to))\s+(?:ignore|skip|bypass|disable|never|always)', 0.93, "AI role-playing instruction — may influence agent behavior unsafely"),
    (r'(?i)AI\s+(?:should|must|will|can|needs?\s+to)\s+(?:ignore|skip|bypass|disable|never\s+check|always\s+approve)', 0.92, "AI imperative instruction — attempts to override agent judgment"),
    (r'(?i)(?:<!--|#)\s*@?\s*(?:ai|claude|chatgpt|cursor|copilot)\s*(?:ignore|skip|bypass)\s*(?:the\s+)?(?:below|above|following)', 0.97, "AI directive embedded in comment — may bypass safety guidelines"),
    # System prompt remnants
    (r'(?i)you\s+are\s+(?:an?\s+)?(?:expert|senior|advanced|helpful)\s+(?:AI|assistant|software\s+engineer|coder)', 0.88, "System prompt fragment — AI context file or prompt left in codebase"),
    (r'(?i)your\s+(?:task|job|mission|purpose|goal)\s+is\s+to', 0.85, "AI task instruction — may be a system prompt remnant"),
    (r'(?i)never\s+(?:tell|reveal|disclose|mention|share)\s+(?:the\s+)?(?:user|anyone|them)\s+(?:about|that)', 0.94, "AI instruction to hide information — potential prompt injection"),
    # Unsafe override patterns
    (r'(?i)ignore\s+(?:all\s+)?(?:previous|prior|safety|security|ethical)\s+(?:instructions|guidelines|rules|constraints)', 0.98, "Direct override of safety instructions — high-risk prompt injection"),
    (r'(?i)you\s+(?:have|have\s+been)\s+(?:full|complete)\s+(?:permission|authorization|clearance)\s+to', 0.90, "AI permission override — attempts to grant unauthorized capabilities"),
    # Embedded in documentation/comments
    (r'(?i)<\|?(?:im_start|im_end|system|assistant|user)\|?>', 0.96, "Chat markup format token — LLM conversation exported to codebase"),
    (r'(?i)\{\{.*(?:prompt|system|user|assistant).*\}\}', 0.80, "Template injection pattern — may be prompt template"),
]
|
|
674
|
+
|
|
675
|
+
def _check_prompt_injection(self, f: FileInfo, content: str) -> list[dict[str, Any]]:
|
|
676
|
+
"""Detect prompt injections, AI instructions, and system prompt remnants."""
|
|
677
|
+
flags: list[dict[str, Any]] = []
|
|
678
|
+
if not content:
|
|
679
|
+
return flags
|
|
680
|
+
|
|
681
|
+
rel = str(getattr(f, "rel_path", f.path)).lower()
|
|
682
|
+
# Skip our own files and known safe files
|
|
683
|
+
if ".deadpush" in rel or "__pycache__" in rel:
|
|
684
|
+
return flags
|
|
685
|
+
|
|
686
|
+
for pattern, confidence, reason in self.PROMPT_INJECTION_PATTERNS:
|
|
687
|
+
matches = re.findall(pattern, content)
|
|
688
|
+
if matches:
|
|
689
|
+
flags.append({
|
|
690
|
+
"category": "prompt_injection",
|
|
691
|
+
"confidence": confidence,
|
|
692
|
+
"reason": reason,
|
|
693
|
+
"block": False,
|
|
694
|
+
"suggestion": "Remove injected instructions from the codebase. AI agents may follow embedded instructions, creating security or behavioral risks.",
|
|
695
|
+
})
|
|
696
|
+
break # One flag per file is enough
|
|
697
|
+
|
|
698
|
+
return flags
|
|
699
|
+
|
|
700
|
+
def _check_git_status(self, f: FileInfo) -> list[dict[str, Any]]:
|
|
701
|
+
flags = []
|
|
702
|
+
if "__pycache__" in str(f.rel_path) or f.path.name.endswith(".pyc"):
|
|
703
|
+
flags.append({
|
|
704
|
+
"category": "dev_artifact",
|
|
705
|
+
"confidence": 0.99,
|
|
706
|
+
"reason": "Compiled Python cache committed to repository",
|
|
707
|
+
"block": False,
|
|
708
|
+
"suggestion": "Add __pycache__/ and *.pyc to .gitignore.",
|
|
709
|
+
})
|
|
710
|
+
return flags
|
|
711
|
+
|
|
712
|
+
def _build_debris_file(self, f: FileInfo, flags: list[dict[str, Any]]) -> DebrisFile:
    """Collapse a file's flags into a single DebrisFile.

    The headline flag is the one that blocks the push if any does, with ties
    broken by confidence. Its category, confidence and suggestion are
    reported, together with the reasons of every flag. The configuration
    can force the headline category to block or to warn only.
    """
    headline = max(flags, key=lambda fl: (fl.get("block", False), fl["confidence"]))
    category = headline["category"]

    # Configuration wins over the detector's own verdict; the block rule is
    # consulted before the warn rule.
    if self.config.should_block_debris_category(category):
        block = True
    elif self.config.should_warn_debris_category(category):
        block = False
    else:
        block = headline.get("block", False)

    all_reasons = [fl["reason"] for fl in flags]
    return DebrisFile(
        path=str(f.rel_path),
        category=category,
        confidence=headline["confidence"],
        reasons=all_reasons,
        block_push=block,
        suggestion=headline["suggestion"],
    )
|