deadpush 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deadpush/debris.py ADDED
@@ -0,0 +1,729 @@
1
+ """
2
+ Semantic Debris Detection - Production Level
3
+
4
+ Includes:
5
+ - LLM context files, vibe scratchpads, env files, chat exports
6
+ - Content-based + filename-based detection
7
+ - **Structural duplicate detection** using Python AST (detects AI-regenerated files)
8
+ - Content hash + name similarity fallback
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import ast
14
+ import hashlib
15
+ import re
16
+ from collections import defaultdict
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ from .config import Config
22
+ from .crawler import FileInfo
23
+ from .graph import DebrisFile, content_hash
24
+
25
# Regexes that recognise exception handlers which swallow the error without
# acting on it.  Each pattern is searched against the whole file text.
#
# `pass` is anchored with a word boundary so that a handler whose first
# statement merely *starts* with those letters (``password = None``,
# ``passthrough()``) is not reported as a silent failure.
SILENT_FAILURE_PATTERNS = [
    # try/except whose handler body begins with `pass`, a comment, or `...`.
    re.compile(r'try\s*:.*?except\s*[^:]*:\s*\n\s*(?:pass\b|#.*|\.{3})', re.DOTALL),
    # Same, but with a blank line before `pass` / `return None` / `return ""`.
    re.compile(r'try\s*:.*?except\s*[^:]*:\s*\n\s*\n\s*(?:pass\b|return\s+None|return\s+"")', re.DOTALL),
    # Bare or broad `except` directly followed by `pass` or a comment.
    re.compile(r'except\s*(?:Exception|BaseException|Error|RuntimeError|\(.*?\))?\s*:\s*(?:pass\b|#.*)'),
    # Bare `except:` annotated with a TODO-style marker.
    re.compile(r'except\s*:\s*#\s*(?:TODO|FIXME|HACK|ignore)', re.IGNORECASE),
]
31
+
32
+ import math
33
+ import string
34
+
35
+
36
+ # =============================================================================
37
+ # Category Definitions
38
+ # =============================================================================
39
# Lower-cased file names that belong to AI coding assistants.
LLM_CONTEXT_FILES = {
    ".claude_instructions",
    ".copilot-instructions.md",
    ".cursorignore",
    ".cursorrules",
    "agents.md",
    "ai_prompt.md",
    "claude.md",
    "claude_context.md",
    "cursor_rules.md",
    "llm_context.txt",
    "system_prompt.txt",
    "windsurf_rules.md",
}

# Name fragments that hint at a throwaway / scratch file.
VIBE_SCRATCHPAD_NAMES = {
    "backup",
    "copy_of",
    "debug",
    "final",
    "new",
    "old",
    "playground",
    "scratch",
    "temp",
    "tmp",
    "todo_delete",
    "untitled",
    "v2",
}

# Line-anchored regexes for speaker labels found in exported chat transcripts.
CHAT_PATTERNS = [
    r"^\s*(User|Assistant|Human|System):\s",
    r"^(Human|Assistant):",
]
55
+
56
+
57
+ @dataclass
58
+ class _FileSignature:
59
+ """Lightweight structural signature for duplicate detection."""
60
+ functions: frozenset[str]
61
+ classes: frozenset[str]
62
+ imports: frozenset[str]
63
+ total_nodes: int
64
+
65
+
66
+ class DebrisDetector:
67
+ def __init__(self, config: Config):
68
+ self.config = config
69
+ self._content_hashes: dict[str, list[Path]] = defaultdict(list)
70
+ self._signatures: dict[Path, _FileSignature] = {}
71
+
72
def scan(self, files: list[FileInfo]) -> list[DebrisFile]:
    """Scan *files* and return one ``DebrisFile`` per file that raised a flag.

    The result is ordered with push-blocking entries first, then by
    descending confidence, then by path.

    The hash and signature indexes are rebuilt from scratch on every call so
    that a detector instance can be reused without matching against paths
    left over from a previous scan.
    """
    self._content_hashes.clear()
    self._signatures.clear()

    # Pass 1: build the indexes the duplicate checks rely on.
    # NOTE(review): indentation was not recoverable from the diff view; the
    # signature extraction is assumed to sit under the same text/size guard
    # as the content hash -- confirm against the original file.
    for f in files:
        if not (f.is_text and f.size < 2 * 1024 * 1024):
            continue

        digest = content_hash(f.path)
        if digest:
            self._content_hashes[digest].append(f.path)

        if f.path.suffix == ".py":
            sig = self._extract_python_signature(f.path)
            if sig:
                self._signatures[f.path] = sig

    # Pass 2: run every check now that the indexes are complete.
    results: list[DebrisFile] = []
    for f in files:
        flags = self._check_file(f, files)
        if flags:
            results.append(self._build_debris_file(f, flags))

    return sorted(results, key=lambda d: (not d.block_push, -d.confidence, d.path))
93
+
94
+ # -------------------------------------------------------------------------
95
+ # Per-file checking
96
+ # -------------------------------------------------------------------------
97
def _check_file(self, f: FileInfo, all_files: list[FileInfo]) -> list[dict[str, Any]]:
    """Run every detector against one file and return the combined flag dicts.

    File name checks always run.  The text is read once (text files under
    2 MiB only) and shared by the content-based detectors; duplicate,
    structural and cache-artifact checks run regardless of readability.
    """
    findings: list[dict[str, Any]] = list(self._check_filename(f))

    text = None
    if f.is_text and f.size < 2 * 1024 * 1024:
        try:
            text = f.path.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            # Best effort: an unreadable file just skips the content checks.
            text = None

    # NOTE(review): indentation was not recoverable from the diff view; all
    # four content detectors are assumed to be guarded by this condition.
    if text is not None:
        findings.extend(self._check_content(f, text))
        findings.extend(self._detect_hardcoded_secrets(f, text))
        findings.extend(self._check_anti_patterns(f, text))
        findings.extend(self._check_prompt_injection(f, text))

    findings.extend(self._check_duplicates(f, all_files))
    findings.extend(self._check_structural_duplicates(f, all_files))
    findings.extend(self._check_git_status(f))
    return findings
115
+
116
def _check_filename(self, f: FileInfo) -> list[dict[str, Any]]:
    """Flag files whose *name* marks them as AI context, scratch, or copy files.

    Scratch and copy markers are matched on letter boundaries rather than as
    raw substrings: ``temp.py``, ``temp2.py`` and ``my_temp.py`` match
    ``temp``, while ``template.py``, ``folder.py`` or ``newsletter.py`` no
    longer do.  The trade-off is that run-together names such as
    ``tmpfile.py`` are not reported.

    Returns a list of flag dicts (possibly empty).
    """
    flags: list[dict[str, Any]] = []
    name_lower = f.path.name.lower()
    rel = str(f.rel_path).lower()

    if name_lower in LLM_CONTEXT_FILES or any(p in rel for p in LLM_CONTEXT_FILES):
        flags.append({
            "category": "llm_context_file",
            "confidence": 0.99,
            "reason": f"Known LLM/AI coding assistant context file: {f.path.name}",
            "block": True,
            "suggestion": "Add to .gitignore. These should never be committed.",
        })

    def has_marker(marker: str, *, leading: bool) -> bool:
        # A marker counts only when it is not glued to neighbouring letters.
        prefix = r"(?<![a-z])" if leading else ""
        pattern = prefix + re.escape(marker) + r"(?![a-z])"
        return re.search(pattern, name_lower) is not None

    if any(has_marker(bad, leading=True) for bad in VIBE_SCRATCHPAD_NAMES):
        flags.append({
            "category": "vibe_scratchpad",
            "confidence": 0.82,
            "reason": f"Looks like a temporary/AI scratch file: {f.path.name}",
            "block": False,
            "suggestion": "Delete or move to a gitignored location.",
        })

    # These suffixes already start with "_", so only the trailing edge needs
    # a boundary (``utils_new.py`` matches, ``brand_newsletter.py`` does not).
    copy_suffixes = ("_copy", "_backup", "_old", "_v2", "_final", "_new")
    if any(has_marker(x, leading=False) for x in copy_suffixes):
        flags.append({
            "category": "duplicate_file",
            "confidence": 0.75,
            "reason": f"Filename suggests copy/regenerated version: {f.path.name}",
            "block": False,
            "suggestion": "Compare with original and remove duplicate.",
        })

    return flags
151
+
152
def _check_content(self, f: FileInfo, content: str | None = None) -> list[dict[str, Any]]:
    """Inspect the first 55 lines of a file for AI-context and chat-export markers.

    Args:
        f: File being checked; only ``f.path`` is used here.
        content: Already-loaded file text.  When ``None`` the head of the
            file is read from ``f.path`` instead.

    Returns:
        A list of flag dicts (possibly empty).  An unreadable file yields
        an empty list.
    """
    from itertools import islice

    flags: list[dict[str, Any]] = []
    if content is not None:
        head = "\n".join(content.splitlines()[:55])
    else:
        try:
            with f.path.open("r", encoding="utf-8", errors="ignore") as fh:
                # islice stops cleanly at EOF; calling next() 55 times
                # raised StopIteration on shorter files, which was then
                # swallowed and caused those files to be skipped.
                head = "".join(islice(fh, 55))
        except (OSError, ValueError):
            return flags

    content_lower = head.lower()

    prompt_markers = (
        "you are a helpful assistant", "you are an expert software engineer",
        "as an ai coding assistant", "claude", "cursor rules",
    )
    if any(p in content_lower for p in prompt_markers):
        flags.append({
            "category": "llm_context_file",
            "confidence": 0.96,
            "reason": "Contains AI system prompt or context instructions",
            "block": True,
            "suggestion": "This appears to be an exported AI context/prompt file.",
        })

    # One chat_export flag at most, whichever pattern hits first.
    if any(re.search(pattern, head, re.IGNORECASE | re.MULTILINE) for pattern in CHAT_PATTERNS):
        flags.append({
            "category": "chat_export",
            "confidence": 0.88,
            "reason": "Matches exported LLM chat log format",
            "block": False,
            "suggestion": "Remove chat export files from the repository.",
        })

    if f.path.name.startswith(".env") and any(x in f.path.name.lower() for x in ["local", "dev", "development"]):
        flags.append({
            "category": "env_file",
            "confidence": 0.97,
            "reason": "Committed local/development environment file",
            "block": True,
            "suggestion": "Add to .gitignore and rotate any exposed secrets.",
        })

    return flags
199
+
200
+ # -------------------------------------------------------------------------
201
+ # Content + Name Duplicate Detection
202
+ # -------------------------------------------------------------------------
203
def _check_duplicates(self, f: FileInfo, all_files: list[FileInfo]) -> list[dict[str, Any]]:
    """Flag exact content duplicates and files with near-identical names.

    At most one flag of each kind is produced: the first other path sharing
    this file's content hash, and the first other file whose name scores
    above 0.80 in ``_name_similarity``.
    """
    findings: list[dict[str, Any]] = []

    digest = content_hash(f.path)
    if digest:
        twins = [p for p in self._content_hashes.get(digest, []) if p != f.path]
        if twins:
            findings.append({
                "category": "duplicate_file",
                "confidence": 0.99,
                "reason": f"Exact content duplicate of: {twins[0].name}",
                "block": False,
                "suggestion": "Delete all but one copy.",
            })

    # NOTE(review): indentation was not recoverable from the diff view; the
    # name comparison is assumed to run whether or not a hash was available.
    lookalike = next(
        (
            other
            for other in all_files
            if other.path != f.path
            and self._name_similarity(f.path.name, other.path.name) > 0.80
        ),
        None,
    )
    if lookalike is not None:
        findings.append({
            "category": "duplicate_file",
            "confidence": 0.70,
            "reason": f"Very similar filename to existing file: {lookalike.rel_path}",
            "block": False,
            "suggestion": "Review — this may be an AI-regenerated duplicate.",
        })

    return findings
231
+
232
+ def _name_similarity(self, a: str, b: str) -> float:
233
+ def bigrams(s): return {s[i:i+2] for i in range(len(s)-1)}
234
+ inter = len(bigrams(a) & bigrams(b))
235
+ union = len(bigrams(a) | bigrams(b))
236
+ return inter / union if union else 0.0
237
+
238
+ # -------------------------------------------------------------------------
239
+ # NEW: Structural / AST-based Duplicate Detection (The Wow Feature)
240
+ # -------------------------------------------------------------------------
241
def _extract_python_signature(self, path: Path) -> _FileSignature | None:
    """Parse *path* and summarise its structure for duplicate detection.

    Collects every function (sync and ``async``) as ``name(positional-arg
    names)``, every class name, every imported name, and the total number
    of AST nodes.

    Returns:
        The signature, or ``None`` when the file cannot be read or parsed.
    """
    try:
        source = path.read_text(encoding="utf-8", errors="ignore")
        tree = ast.parse(source, filename=str(path))
    except Exception:
        # Deliberately broad: unreadable or syntactically invalid files are
        # simply left out of the structural comparison.
        return None

    functions: set[str] = set()
    classes: set[str] = set()
    imports: set[str] = set()
    total_nodes = 0

    for node in ast.walk(tree):
        total_nodes += 1
        # `async def` is a distinct node type; without it coroutine-heavy
        # modules would produce a near-empty function set.
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            args = tuple(arg.arg for arg in node.args.args)
            functions.add(f"{node.name}{args}")
        elif isinstance(node, ast.ClassDef):
            classes.add(node.name)
        elif isinstance(node, (ast.Import, ast.ImportFrom)):
            imports.update(alias.name for alias in node.names)

    return _FileSignature(
        functions=frozenset(functions),
        classes=frozenset(classes),
        imports=frozenset(imports),
        total_nodes=total_nodes,
    )
271
+
272
+ def _check_structural_duplicates(self, f: FileInfo, all_files: list[FileInfo]) -> list[dict[str, Any]]:
273
+ """
274
+ Detect files that have very similar structure to existing files.
275
+ This catches cases where an LLM was asked to "rewrite" or "improve" a file
276
+ and created a near-duplicate instead of editing the original.
277
+ """
278
+ if f.path.suffix != ".py" or f.path not in self._signatures:
279
+ return []
280
+
281
+ flags = []
282
+ my_sig = self._signatures[f.path]
283
+
284
+ for other_path, other_sig in self._signatures.items():
285
+ if other_path == f.path:
286
+ continue
287
+
288
+ func_overlap = len(my_sig.functions & other_sig.functions)
289
+ class_overlap = len(my_sig.classes & other_sig.classes)
290
+ total_unique = len(my_sig.functions | other_sig.functions) + len(my_sig.classes | other_sig.classes)
291
+
292
+ if total_unique == 0:
293
+ continue
294
+
295
+ similarity = (func_overlap + class_overlap) / total_unique
296
+
297
+ if similarity > 0.75 and len(my_sig.functions) > 1:
298
+ flags.append({
299
+ "category": "ai_regenerated_duplicate",
300
+ "confidence": min(0.92, 0.65 + similarity * 0.3),
301
+ "reason": f"Structurally very similar to {other_path.name} (likely AI-regenerated copy)",
302
+ "block": False,
303
+ "suggestion": "Compare with original. Delete the regenerated version and edit the original instead.",
304
+ })
305
+ break
306
+
307
+ return flags
308
+
309
+ # -------------------------------------------------------------------------
310
+ # ADVANCED Hardcoded Secrets Detection (Production-Grade)
311
+ # -------------------------------------------------------------------------
312
+ def _detect_hardcoded_secrets(self, f: FileInfo, content: str | None = None) -> list[dict[str, Any]]:
313
+ """
314
+ Advanced, multi-layered secret detection engine.
315
+
316
+ Techniques used:
317
+ - High-order entropy analysis (bigram/trigram aware)
318
+ - Keyword proximity scoring (how close "secret"/"key" is to candidate)
319
+ - AST-aware context analysis (for Python)
320
+ - Known high-value secret formats with validation
321
+ - Obfuscation detection (concatenation, base64-ish strings)
322
+ - Strong false-positive filtering
323
+ """
324
+ if not f.is_text or f.size > 800_000:
325
+ return []
326
+
327
+ flags = []
328
+ if content is None:
329
+ try:
330
+ content = f.path.read_text(encoding="utf-8", errors="ignore")
331
+ except Exception:
332
+ return flags
333
+ lines = content.splitlines()
334
+
335
+ # === Layer 1: High-confidence known formats ===
336
+ known_formats = self._get_known_secret_formats()
337
+ for i, line in enumerate(lines):
338
+ for pattern, secret_type, confidence in known_formats:
339
+ if re.search(pattern, line):
340
+ flags.append({
341
+ "category": "hardcoded_secret",
342
+ "confidence": confidence,
343
+ "reason": f"High-confidence {secret_type} detected (line {i+1})",
344
+ "block": True,
345
+ "suggestion": f"Remove the hardcoded {secret_type}. Use a secrets manager or environment variables immediately.",
346
+ })
347
+
348
+ # === Layer 2: Advanced entropy + context scoring ===
349
+ candidates = self._extract_potential_secrets(content, lines)
350
+
351
+ for candidate in candidates:
352
+ score, reasons = self._score_secret_candidate(candidate, content)
353
+
354
+ if score >= 0.78: # High confidence threshold
355
+ flags.append({
356
+ "category": "hardcoded_secret",
357
+ "confidence": round(score, 3),
358
+ "reason": " | ".join(reasons),
359
+ "block": True,
360
+ "suggestion": "This appears to be a hardcoded secret. Move it to environment variables or a proper secrets manager (AWS Secrets Manager, Doppler, Infisical, etc.).",
361
+ })
362
+
363
+ # === Layer 3: Python AST-based deep analysis (most accurate) ===
364
+ if f.path.suffix == ".py":
365
+ ast_flags = self._detect_secrets_via_ast(content, str(f.path))
366
+ flags.extend(ast_flags)
367
+
368
+ # === Layer 4: Obfuscation & reconstruction ===
369
+ obfuscated_flags = self._detect_obfuscated_secrets(content, lines)
370
+ flags.extend(obfuscated_flags)
371
+
372
+ return flags
373
+
374
+ def _get_known_secret_formats(self):
375
+ """High-precision patterns for well-known secret types."""
376
+ return [
377
+ (r'AKIA[0-9A-Z]{16}', "AWS Access Key ID", 0.97),
378
+ (r'(?i)aws.*secret.*access.*key["\']?\s*[:=]\s*["\']?[A-Za-z0-9/+=]{40}', "AWS Secret Access Key", 0.95),
379
+ (r'ghp_[a-zA-Z0-9]{36}', "GitHub Personal Access Token", 0.98),
380
+ (r'gho_[a-zA-Z0-9]{36}', "GitHub OAuth Token", 0.96),
381
+ (r'ghs_[a-zA-Z0-9]{36}', "GitHub App Token", 0.96),
382
+ (r'sk-[a-zA-Z0-9]{48}', "OpenAI API Key", 0.97),
383
+ (r'sk-ant-[a-zA-Z0-9]{48}', "Anthropic API Key", 0.97),
384
+ (r'AIza[0-9A-Za-z\-_]{35}', "Google API Key", 0.94),
385
+ (r'-----BEGIN (RSA|EC|DSA|OPENSSH) PRIVATE KEY-----', "Private Key", 0.99),
386
+ (r'eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}', "JWT Token", 0.90),
387
+ ]
388
+
389
+ def _extract_potential_secrets(self, content: str, lines: list[str]) -> list[dict]:
390
+ """Extract candidate strings that could be secrets using multiple strategies."""
391
+ candidates = []
392
+
393
+ # Strategy 1: String literals assigned to sensitive-looking variables
394
+ sensitive_keywords = {
395
+ "key", "token", "secret", "password", "passwd", "credential", "auth",
396
+ "api", "private", "access", "bearer", "jwt", "oauth", "client_secret"
397
+ }
398
+
399
+ # Python/JS/TS style assignments
400
+ assignment_pattern = re.compile(
401
+ r'([a-zA-Z_][a-zA-Z0-9_]*)\s*[:=]\s*["\']([A-Za-z0-9_\-/.+=@#$%^&*!~|]{16,})["\']'
402
+ )
403
+
404
+ for i, line in enumerate(lines):
405
+ for match in assignment_pattern.finditer(line):
406
+ var_name = match.group(1).lower()
407
+ value = match.group(2)
408
+
409
+ # Check if variable name contains sensitive keywords
410
+ if any(kw in var_name for kw in sensitive_keywords):
411
+ candidates.append({
412
+ "value": value,
413
+ "line": i + 1,
414
+ "context": line.strip(),
415
+ "variable": match.group(1),
416
+ "type": "assignment"
417
+ })
418
+
419
+ # Strategy 2: High-entropy standalone strings (even without assignment)
420
+ string_literal_pattern = re.compile(r'["\']([A-Za-z0-9_\-/.+=@#$%^&*!~|]{24,})["\']')
421
+ for i, line in enumerate(lines):
422
+ for match in string_literal_pattern.finditer(line):
423
+ value = match.group(1)
424
+ if self._advanced_entropy(value) > 4.2:
425
+ candidates.append({
426
+ "value": value,
427
+ "line": i + 1,
428
+ "context": line.strip(),
429
+ "variable": None,
430
+ "type": "standalone"
431
+ })
432
+
433
+ return candidates
434
+
435
+ def _advanced_entropy(self, s: str) -> float:
436
+ """More sophisticated entropy calculation using character distribution + bigrams."""
437
+ if len(s) < 16:
438
+ return 0.0
439
+
440
+ # Character-level entropy
441
+ prob = [float(s.count(c)) / len(s) for c in set(s)]
442
+ char_entropy = -sum(p * math.log2(p) for p in prob if p > 0)
443
+
444
+ # Bigram entropy (detects structured vs random strings)
445
+ if len(s) > 2:
446
+ bigrams = [s[i:i+2] for i in range(len(s)-1)]
447
+ bigram_prob = [float(bigrams.count(b)) / len(bigrams) for b in set(bigrams)]
448
+ bigram_entropy = -sum(p * math.log2(p) for p in bigram_prob if p > 0)
449
+ return (char_entropy * 0.6) + (bigram_entropy * 0.4)
450
+
451
+ return char_entropy
452
+
453
def _score_secret_candidate(self, candidate: dict, full_content: str) -> tuple[float, list[str]]:
    """Score how likely *candidate* is a real secret.

    Sums weighted contributions from entropy, character diversity, keyword
    context, encoded appearance and length, then subtracts a penalty for
    known placeholder values.  *full_content* is not used.

    Returns:
        ``(score clamped to [0.0, 1.0], list of human-readable reasons)``.
    """
    value = candidate["value"]
    context = candidate.get("context", "")
    var_name = candidate.get("variable", "") or ""

    total = 0.0
    notes: list[str] = []

    # Factor 1: entropy tiers -- only the highest tier reached applies.
    entropy = self._advanced_entropy(value)
    for threshold, bonus, label in (
        (4.8, 0.35, "Very high entropy"),
        (4.2, 0.25, "High entropy"),
    ):
        if entropy > threshold:
            total += bonus
            notes.append(label)
            break

    # Factor 2: character diversity
    if self._character_diversity_score(value) >= 3.5:
        total += 0.20
        notes.append("High character diversity")

    # Factor 3: keyword proximity
    keyword_score = self._keyword_proximity_score(context, var_name)
    total += keyword_score * 0.25
    if keyword_score > 0.6:
        notes.append("Strong keyword context")

    # Factor 4: looks like base64 / hex
    if self._looks_like_encoded(value):
        total += 0.10
        notes.append("Looks like encoded/encrypted data")

    # Factor 5: length (adds to the score without a reason entry)
    if 32 <= len(value) <= 128:
        total += 0.10

    # Factor 6: penalise common placeholder values
    if self._is_common_false_positive(value):
        total -= 0.40
        notes.append("Common test/example value (penalized)")

    clamped = min(max(total, 0.0), 1.0)
    return clamped, notes
498
+
499
+ def _character_diversity_score(self, s: str) -> float:
500
+ has_upper = bool(re.search(r'[A-Z]', s))
501
+ has_lower = bool(re.search(r'[a-z]', s))
502
+ has_digit = bool(re.search(r'[0-9]', s))
503
+ has_special = bool(re.search(r'[^A-Za-z0-9]', s))
504
+ return sum([has_upper, has_lower, has_digit, has_special])
505
+
506
+ def _keyword_proximity_score(self, context: str, var_name: str) -> float:
507
+ """How strongly the surrounding context indicates a secret."""
508
+ keywords = ["secret", "key", "token", "password", "credential", "auth", "private", "api"]
509
+ context_lower = (context + " " + var_name).lower()
510
+
511
+ matches = sum(1 for kw in keywords if kw in context_lower)
512
+ return min(matches / 3.0, 1.0)
513
+
514
+ def _looks_like_encoded(self, s: str) -> bool:
515
+ """Detect base64-like or hex-like strings."""
516
+ if len(s) % 4 == 0 and re.match(r'^[A-Za-z0-9+/=]+$', s):
517
+ return True
518
+ if re.match(r'^[0-9a-fA-F]+$', s) and len(s) > 32:
519
+ return True
520
+ return False
521
+
522
+ def _is_common_false_positive(self, s: str) -> bool:
523
+ """Filter out common example/test values."""
524
+ false_positives = {
525
+ "your_api_key_here", "insert_token_here", "example", "test", "demo",
526
+ "placeholder", "changeme", "secret123", "password123", "AKIAIOSFODNN7EXAMPLE"
527
+ }
528
+ s_lower = s.lower()
529
+ return any(fp in s_lower for fp in false_positives) or s.startswith("EXAMPLE")
530
+
531
+ # -------------------------------------------------------------------------
532
+ # Python AST-based Secret Detection (Deep & Accurate)
533
+ # -------------------------------------------------------------------------
534
+ def _detect_secrets_via_ast(self, content: str, filepath: str) -> list[dict[str, Any]]:
535
+ """Use Python AST to find secrets assigned to sensitive variables more accurately."""
536
+ flags = []
537
+ try:
538
+ tree = ast.parse(content, filename=filepath)
539
+ except Exception:
540
+ return flags
541
+
542
+ sensitive_names = {
543
+ "key", "token", "secret", "password", "credential", "auth",
544
+ "apikey", "api_key", "private_key", "access_key", "bearer_token"
545
+ }
546
+
547
+ for node in ast.walk(tree):
548
+ if isinstance(node, ast.Assign):
549
+ for target in node.targets:
550
+ if isinstance(target, ast.Name):
551
+ var_name = target.id.lower()
552
+ if any(s in var_name for s in sensitive_names):
553
+ val = getattr(node, "value", None)
554
+ if isinstance(val, ast.Constant) and isinstance(val.value, str):
555
+ value = node.value.value
556
+ if len(value) > 16 and self._advanced_entropy(value) > 4.0:
557
+ if not self._is_common_false_positive(value):
558
+ flags.append({
559
+ "category": "hardcoded_secret",
560
+ "confidence": 0.91,
561
+ "reason": f"Secret-like value assigned to '{target.id}' via AST analysis",
562
+ "block": True,
563
+ "suggestion": "Move this secret out of source code into environment variables or a secrets manager.",
564
+ })
565
+
566
+ # Also detect os.environ / os.getenv usage with string literals (sometimes people do bad things)
567
+ if isinstance(node, ast.Call):
568
+ if isinstance(node.func, ast.Attribute) and node.func.attr in ("getenv", "environ"):
569
+ for arg in node.args:
570
+ if isinstance(arg, ast.Constant) and isinstance(getattr(arg, "value", None), str):
571
+ if any(s in arg.value.lower() for s in ["key", "token", "secret"]):
572
+ # This is actually good practice, so we don't flag it
573
+ pass
574
+
575
+ return flags
576
+
577
+ # -------------------------------------------------------------------------
578
+ # Obfuscation Detection (Concatenation, Base64, etc.)
579
+ # -------------------------------------------------------------------------
580
+ def _detect_obfuscated_secrets(self, content: str, lines: list[str] | None = None) -> list[dict[str, Any]]:
581
+ """Detect secrets that are split or encoded to evade simple detection."""
582
+ flags = []
583
+
584
+ # Detect string concatenation patterns (common obfuscation)
585
+ concat_pattern = re.compile(
586
+ r'["\']([A-Za-z0-9_\-/.+=]{8,})["\']\s*\+\s*["\']([A-Za-z0-9_\-/.+=]{8,})["\']'
587
+ )
588
+
589
+ for i, line in enumerate(lines):
590
+ matches = concat_pattern.findall(line)
591
+ for part1, part2 in matches:
592
+ combined = part1 + part2
593
+ if len(combined) >= 24 and self._advanced_entropy(combined) > 4.3:
594
+ flags.append({
595
+ "category": "hardcoded_secret",
596
+ "confidence": 0.85,
597
+ "reason": f"Secret reconstructed from string concatenation (line {i+1})",
598
+ "block": True,
599
+ "suggestion": "Secrets should never be constructed via string concatenation in source code.",
600
+ })
601
+
602
+ # Attempt base64 decoding on suspicious long strings
603
+ b64_pattern = re.compile(r'["\']([A-Za-z0-9+/=]{40,})["\']')
604
+ for i, line in enumerate(lines):
605
+ for match in b64_pattern.finditer(line):
606
+ try:
607
+ import base64
608
+ decoded = base64.b64decode(match.group(1) + "==").decode("utf-8", errors="ignore")
609
+ if self._advanced_entropy(decoded) > 3.8 or any(kw in decoded.lower() for kw in ["key", "token", "secret"]):
610
+ flags.append({
611
+ "category": "hardcoded_secret",
612
+ "confidence": 0.80,
613
+ "reason": f"Potential base64-encoded secret detected (line {i+1})",
614
+ "block": True,
615
+ "suggestion": "Avoid storing encoded secrets directly in source code.",
616
+ })
617
+ except Exception:
618
+ pass
619
+
620
+ return flags
621
+
622
+ def _check_anti_patterns(self, f: FileInfo, content: str) -> list[dict[str, Any]]:
623
+ """Detect silent failure patterns that hide bugs."""
624
+ if not f.is_text or f.size > 800_000:
625
+ return []
626
+ flags = []
627
+ for i, pattern in enumerate(SILENT_FAILURE_PATTERNS):
628
+ matches = pattern.findall(content)
629
+ if matches:
630
+ flags.append({
631
+ "category": "silent_failure",
632
+ "confidence": 0.88,
633
+ "reason": f"Empty/silent except block that swallows errors ({len(matches)} occurrence{'s' if len(matches) > 1 else ''})",
634
+ "block": False,
635
+ "suggestion": "Remove the bare except or at minimum log the error. Bare except: pass is a leading cause of silent bugs in AI-generated code.",
636
+ })
637
+ break
638
+
639
+ # Bare except: pass (line-level detection for non-multiline)
640
+ for i, line in enumerate(content.splitlines()):
641
+ stripped = line.strip()
642
+ if stripped == "except:" or stripped == "except: pass":
643
+ flags.append({
644
+ "category": "silent_failure",
645
+ "confidence": 0.95,
646
+ "reason": f"Bare 'except:' on line {i+1} — catches everything silently",
647
+ "block": False,
648
+ "suggestion": "Specify the exception type and at minimum log it.",
649
+ })
650
+ break
651
+
652
+ return flags
653
+
654
+ # -------------------------------------------------------------------------
655
+ # Prompt Injection Detection
656
+ # -------------------------------------------------------------------------
657
# (regex, confidence, reason) triples, evaluated in order; the first regex
# that matches a file determines the flag that is reported for it.
PROMPT_INJECTION_PATTERNS: list[tuple[str, float, str]] = [
    # --- Direct AI instruction patterns ---
    (
        r'(?i)AI[:;]\s*(?:ignore|skip|bypass|disable|don\'?t\s+(?:check|validate|scan))',
        0.95,
        "AI override instruction — tells agents to skip or bypass safety checks",
    ),
    (
        r'(?i)as\s+(?:an\s+)?AI\s*(?:assistant|agent|model|coding\s+agent)[,;]\s*(?:please|you\s+(?:should|must|will|need\s+to))\s+(?:ignore|skip|bypass|disable|never|always)',
        0.93,
        "AI role-playing instruction — may influence agent behavior unsafely",
    ),
    (
        r'(?i)AI\s+(?:should|must|will|can|needs?\s+to)\s+(?:ignore|skip|bypass|disable|never\s+check|always\s+approve)',
        0.92,
        "AI imperative instruction — attempts to override agent judgment",
    ),
    (
        r'(?i)(?:<!--|#)\s*@?\s*(?:ai|claude|chatgpt|cursor|copilot)\s*(?:ignore|skip|bypass)\s*(?:the\s+)?(?:below|above|following)',
        0.97,
        "AI directive embedded in comment — may bypass safety guidelines",
    ),
    # --- System prompt remnants ---
    (
        r'(?i)you\s+are\s+(?:an?\s+)?(?:expert|senior|advanced|helpful)\s+(?:AI|assistant|software\s+engineer|coder)',
        0.88,
        "System prompt fragment — AI context file or prompt left in codebase",
    ),
    (
        r'(?i)your\s+(?:task|job|mission|purpose|goal)\s+is\s+to',
        0.85,
        "AI task instruction — may be a system prompt remnant",
    ),
    (
        r'(?i)never\s+(?:tell|reveal|disclose|mention|share)\s+(?:the\s+)?(?:user|anyone|them)\s+(?:about|that)',
        0.94,
        "AI instruction to hide information — potential prompt injection",
    ),
    # --- Unsafe override patterns ---
    (
        r'(?i)ignore\s+(?:all\s+)?(?:previous|prior|safety|security|ethical)\s+(?:instructions|guidelines|rules|constraints)',
        0.98,
        "Direct override of safety instructions — high-risk prompt injection",
    ),
    (
        r'(?i)you\s+(?:have|have\s+been)\s+(?:full|complete)\s+(?:permission|authorization|clearance)\s+to',
        0.90,
        "AI permission override — attempts to grant unauthorized capabilities",
    ),
    # --- Embedded in documentation/comments ---
    (
        r'(?i)<\|?(?:im_start|im_end|system|assistant|user)\|?>',
        0.96,
        "Chat markup format token — LLM conversation exported to codebase",
    ),
    (
        r'(?i)\{\{.*(?:prompt|system|user|assistant).*\}\}',
        0.80,
        "Template injection pattern — may be prompt template",
    ),
]
674
+
675
+ def _check_prompt_injection(self, f: FileInfo, content: str) -> list[dict[str, Any]]:
676
+ """Detect prompt injections, AI instructions, and system prompt remnants."""
677
+ flags: list[dict[str, Any]] = []
678
+ if not content:
679
+ return flags
680
+
681
+ rel = str(getattr(f, "rel_path", f.path)).lower()
682
+ # Skip our own files and known safe files
683
+ if ".deadpush" in rel or "__pycache__" in rel:
684
+ return flags
685
+
686
+ for pattern, confidence, reason in self.PROMPT_INJECTION_PATTERNS:
687
+ matches = re.findall(pattern, content)
688
+ if matches:
689
+ flags.append({
690
+ "category": "prompt_injection",
691
+ "confidence": confidence,
692
+ "reason": reason,
693
+ "block": False,
694
+ "suggestion": "Remove injected instructions from the codebase. AI agents may follow embedded instructions, creating security or behavioral risks.",
695
+ })
696
+ break # One flag per file is enough
697
+
698
+ return flags
699
+
700
+ def _check_git_status(self, f: FileInfo) -> list[dict[str, Any]]:
701
+ flags = []
702
+ if "__pycache__" in str(f.rel_path) or f.path.name.endswith(".pyc"):
703
+ flags.append({
704
+ "category": "dev_artifact",
705
+ "confidence": 0.99,
706
+ "reason": "Compiled Python cache committed to repository",
707
+ "block": False,
708
+ "suggestion": "Add __pycache__/ and *.pyc to .gitignore.",
709
+ })
710
+ return flags
711
+
712
def _build_debris_file(self, f: FileInfo, flags: list[dict[str, Any]]) -> DebrisFile:
    """Collapse a file's flags into a single ``DebrisFile`` record.

    The headline flag is the one with the highest ``(block, confidence)``.
    Its category, confidence and suggestion are reported, together with the
    reasons of *all* flags.  The configuration may override the headline
    flag's blocking decision for its category.
    """
    headline = max(flags, key=lambda fl: (fl.get("block", False), fl["confidence"]))
    category = headline["category"]

    if self.config.should_block_debris_category(category):
        block = True
    elif self.config.should_warn_debris_category(category):
        block = False
    else:
        # No override configured: keep the detector's own decision.
        block = headline.get("block", False)

    all_reasons = [fl["reason"] for fl in flags]
    return DebrisFile(
        path=str(f.rel_path),
        category=category,
        confidence=headline["confidence"],
        reasons=all_reasons,
        block_push=block,
        suggestion=headline["suggestion"],
    )