codeguard-pro 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
learning_loop.py ADDED
@@ -0,0 +1,168 @@
1
+ """Reviewable learning loop for new threats and missed detections.
2
+
3
+ Stores suspicious samples as JSON artifacts, generates issue-ready markdown,
4
+ and summarizes the local learning corpus. This is intentionally review-first:
5
+ it does not auto-modify detection rules.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ import re
11
+ import time
12
+ from collections import Counter
13
+ from typing import Dict, List, Optional
14
+
15
+
16
# Default root directory (relative to the CWD) for learning-corpus artifacts.
DEFAULT_CORPUS_DIR = "learning"
17
+
18
+
19
def _safe_slug(text: str, fallback: str = "sample") -> str:
    """Turn *text* into a filesystem-friendly slug, or *fallback* if empty.

    Runs of non-alphanumeric characters collapse to single hyphens, edge
    hyphens are stripped, and the result is capped at 60 characters.
    """
    normalized = text.strip().lower()
    hyphenated = re.sub(r"[^a-zA-Z0-9]+", "-", normalized)
    trimmed = hyphenated.strip("-")[:60]
    return trimmed if trimmed else fallback
22
+
23
+
24
def _ensure_dirs(corpus_dir: str) -> Dict[str, str]:
    """Create (if missing) and return the corpus directory layout.

    Returns a mapping with keys "base", "candidates", and "issues",
    each an absolute path.
    """
    root = os.path.abspath(corpus_dir)
    layout = {
        "base": root,
        "candidates": os.path.join(root, "candidates"),
        "issues": os.path.join(root, "issues"),
    }
    for key in ("candidates", "issues"):
        os.makedirs(layout[key], exist_ok=True)
    return layout
31
+
32
+
33
def record_candidate(
    title: str,
    code: str,
    language: str = "python",
    source: str = "manual",
    expected_behavior: str = "",
    regex_findings: Optional[List[Dict]] = None,
    ai_findings: Optional[List[Dict]] = None,
    corpus_dir: str = DEFAULT_CORPUS_DIR,
) -> Dict:
    """Persist a suspicious sample for later review.

    Writes a JSON artifact under ``<corpus_dir>/candidates`` and returns a
    small receipt describing what was stored.

    Args:
        title: Human-readable name; slugified into the candidate id.
        code: The suspicious source sample itself.
        language: Language tag stored with the sample.
        source: Where the sample came from (e.g. "manual").
        expected_behavior: What detection *should* have happened.
        regex_findings: Findings from the regex layer, if any.
        ai_findings: Findings from the AI layer, if any.
        corpus_dir: Root of the learning corpus on disk.

    Returns:
        Dict with "status", "id", "path", and the two finding counts.
    """
    dirs = _ensure_dirs(corpus_dir)
    # NOTE(review): the id timestamp is local time while created_at below is
    # UTC — confirm whether ids should also be UTC for cross-machine sorting.
    ts = time.strftime("%Y%m%d-%H%M%S")
    slug = _safe_slug(title)
    candidate_id = f"{ts}-{slug}"
    path = os.path.join(dirs["candidates"], f"{candidate_id}.json")

    data = {
        "id": candidate_id,
        "title": title,
        "language": language,
        "source": source,
        "expected_behavior": expected_behavior,
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "regex_findings": regex_findings or [],
        "ai_findings": ai_findings or [],
        "code": code,
        "status": "triage",  # every new sample starts in triage
    }

    # Explicit UTF-8 so artifacts are portable across platforms/locales.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
        f.write("\n")

    return {
        "status": "RECORDED",
        "id": candidate_id,
        "path": path,
        "regex_findings": len(data["regex_findings"]),
        "ai_findings": len(data["ai_findings"]),
    }
74
+
75
+
76
def load_candidate(candidate_path: str) -> Dict:
    """Load a previously recorded candidate JSON artifact.

    Raises whatever ``open``/``json.load`` raise on missing or corrupt files;
    callers that tolerate bad artifacts (e.g. corpus_summary) catch these.
    """
    # Explicit UTF-8: artifacts are written as UTF-8 by record_candidate.
    with open(candidate_path, "r", encoding="utf-8") as f:
        return json.load(f)
79
+
80
+
81
def generate_issue_markdown(candidate_path: str, corpus_dir: str = DEFAULT_CORPUS_DIR) -> Dict:
    """Generate a GitHub-issue-ready markdown report from a candidate.

    Args:
        candidate_path: Path to a candidate JSON artifact (see record_candidate).
        corpus_dir: Corpus root; the draft is written to
            ``<corpus_dir>/issues/<candidate id>.md``.

    Returns:
        Dict with "status", the draft "path", and "candidate_id".
    """
    dirs = _ensure_dirs(corpus_dir)
    candidate = load_candidate(candidate_path)
    # Tolerate artifacts missing "id" (previously candidate["id"] below
    # raised KeyError even though the title line already falls back).
    candidate_id = candidate.get("id", "unknown")
    regex_count = len(candidate.get("regex_findings", []))
    ai_count = len(candidate.get("ai_findings", []))

    # Classify which detection layer(s) currently catch this sample.
    capability = "unknown"
    if ai_count and not regex_count:
        capability = "AI-only detection"
    elif regex_count and ai_count:
        capability = "Both layers detected"
    elif regex_count:
        capability = "Regex-only detection"

    md = "\n".join([
        f"# Detection Gap: {candidate.get('title', candidate.get('id', 'unknown'))}",
        "",
        "## Summary",
        f"- Candidate ID: `{candidate.get('id', 'unknown')}`",
        f"- Language: `{candidate.get('language', 'unknown')}`",
        f"- Source: `{candidate.get('source', 'unknown')}`",
        f"- Expected behavior: {candidate.get('expected_behavior', 'n/a') or 'n/a'}",
        f"- Current capability: {capability}",
        f"- Regex findings: {regex_count}",
        f"- AI findings: {ai_count}",
        "",
        "## Why This Matters",
        "This sample should be reviewed to determine whether it warrants a new deterministic detector, an AST rule, a red-team test, or a documentation-only limitation.",
        "",
        "## Reproduction Sample",
        f"```{candidate.get('language', 'text')}",
        candidate.get("code", ""),
        "```",
        "",
        "## Current Findings",
        "```json",
        json.dumps({
            "regex_findings": candidate.get("regex_findings", []),
            "ai_findings": candidate.get("ai_findings", []),
        }, indent=2),
        "```",
        "",
        "## Acceptance Criteria",
        "- Add or document the expected detector behavior.",
        "- Add a regression test covering this sample.",
        "- Avoid introducing false positives on clean code.",
    ])

    issue_path = os.path.join(dirs["issues"], f"{candidate_id}.md")
    # Explicit UTF-8 for portability of the markdown draft.
    with open(issue_path, "w", encoding="utf-8") as f:
        f.write(md)
        f.write("\n")

    return {"status": "ISSUE_READY", "path": issue_path, "candidate_id": candidate_id}
135
+
136
+
137
def corpus_summary(corpus_dir: str = DEFAULT_CORPUS_DIR) -> Dict:
    """Summarize all stored learning artifacts."""
    dirs = _ensure_dirs(corpus_dir)

    # Load every readable candidate artifact; corrupt files are skipped so a
    # single bad artifact cannot abort the whole report.
    loaded = []
    candidate_dir = dirs["candidates"]
    for entry in sorted(os.listdir(candidate_dir)):
        if not entry.endswith(".json"):
            continue
        try:
            loaded.append(load_candidate(os.path.join(candidate_dir, entry)))
        except Exception:
            continue

    # Single pass for the detection-coverage breakdown.
    ai_only = regex_only = both = 0
    for c in loaded:
        has_ai = bool(c.get("ai_findings"))
        has_regex = bool(c.get("regex_findings"))
        if has_ai and has_regex:
            both += 1
        elif has_ai:
            ai_only += 1
        elif has_regex:
            regex_only += 1

    draft_count = sum(1 for entry in os.listdir(dirs["issues"]) if entry.endswith(".md"))

    return {
        "corpus_dir": dirs["base"],
        "total_candidates": len(loaded),
        "issue_drafts": draft_count,
        "by_language": dict(Counter(c.get("language", "unknown") for c in loaded)),
        "by_source": dict(Counter(c.get("source", "unknown") for c in loaded)),
        "by_status": dict(Counter(c.get("status", "triage") for c in loaded)),
        "coverage": {
            "ai_only": ai_only,
            "regex_only": regex_only,
            "both": both,
        },
    }
secret_scanner.py ADDED
@@ -0,0 +1,273 @@
1
+ """Secrets detection engine — the core of CodeGuard Pro.
2
+
3
+ Catches hardcoded API keys, tokens, passwords, and credentials
4
+ BEFORE they hit git. Zero false-positive tolerance on high-confidence matches.
5
+ """
6
+
7
+ import re
8
+ from typing import List, Dict, Optional
9
+ from dataclasses import dataclass, field
10
+
11
+
12
@dataclass(frozen=True)
class SecretFinding:
    """Immutable finding from secrets scan.

    One instance per pattern match. The full secret is never stored:
    only the redacted form (``matched``) plus a truncated copy of the line.
    """
    line: int            # 1-based line number within the scanned text
    column: int          # 1-based column of the match start
    secret_type: str     # human-readable pattern name, e.g. "OpenAI API Key"
    severity: str        # CRITICAL, HIGH
    matched: str         # redacted version (middle characters replaced by '*')
    raw_line: str        # stripped source line, truncated to 120 chars
    fix: str             # exact fix suggestion
    confidence: float    # 0-1
23
+
24
+
25
# High-confidence patterns — these are REAL secrets, not variable names.
#
# Entry schema (consumed by scan_secrets, which matches with re.IGNORECASE):
#   "name"             — report label for the finding
#   "pattern"          — regex for the secret itself
#   "severity"         — "CRITICAL" or "HIGH"
#   "fix"              — remediation hint shown to the user
#   "context_required" — optional regex that must ALSO match the line before
#                        "pattern" is tried (used for otherwise-generic
#                        patterns like the Cloudflare token)
SECRET_PATTERNS: List[Dict] = [
    # OpenAI
    {"name": "OpenAI API Key", "pattern": r'sk-(?:proj-)?[a-zA-Z0-9_-]{20,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["OPENAI_API_KEY"]'},
    # Anthropic
    {"name": "Anthropic API Key", "pattern": r'sk-ant-[a-zA-Z0-9_-]{20,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["ANTHROPIC_API_KEY"]'},
    # GitHub
    {"name": "GitHub Token", "pattern": r'ghp_[a-zA-Z0-9]{36}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["GITHUB_TOKEN"]'},
    {"name": "GitHub OAuth", "pattern": r'gho_[a-zA-Z0-9]{36}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["GITHUB_OAUTH_TOKEN"]'},
    {"name": "GitHub App Token", "pattern": r'(?:ghu|ghs|ghr)_[a-zA-Z0-9]{36}', "severity": "CRITICAL",
     "fix": 'Use environment variable or GitHub App auth flow'},
    # AWS
    {"name": "AWS Access Key", "pattern": r'AKIA[0-9A-Z]{16}', "severity": "CRITICAL",
     "fix": 'Use AWS credentials file (~/.aws/credentials) or IAM roles'},
    {"name": "AWS Secret Key", "pattern": r'(?:aws_secret_access_key|AWS_SECRET)\s*[=:]\s*["\']?[A-Za-z0-9/+=]{40}["\']?',
     "severity": "CRITICAL", "fix": 'Use AWS credentials file or environment variables'},
    # Google
    {"name": "Google API Key", "pattern": r'AIza[0-9A-Za-z_-]{35}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["GOOGLE_API_KEY"]'},
    {"name": "Google OAuth Secret", "pattern": r'GOCSPX-[a-zA-Z0-9_-]{28}', "severity": "CRITICAL",
     "fix": 'Store in secrets manager, never in source'},
    # Stripe
    {"name": "Stripe Secret Key", "pattern": r'sk_(?:live|test)_[a-zA-Z0-9]{24,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["STRIPE_SECRET_KEY"]'},
    {"name": "Stripe Publishable Key", "pattern": r'pk_(?:live|test)_[a-zA-Z0-9]{24,}', "severity": "HIGH",
     "fix": 'Publishable keys are less sensitive but still use env vars'},
    # Slack
    {"name": "Slack Token", "pattern": r'xox[bpors]-[a-zA-Z0-9\-]{10,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["SLACK_TOKEN"]'},
    {"name": "Slack Webhook", "pattern": r'hooks\.slack\.com/services/T[a-zA-Z0-9_]+/B[a-zA-Z0-9_]+/[a-zA-Z0-9_]+',
     "severity": "HIGH", "fix": 'Store webhook URL in environment variable'},
    # Twilio / Telnyx
    {"name": "Twilio API Key", "pattern": r'SK[a-f0-9]{32}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["TWILIO_API_KEY"]'},
    # Discord
    {"name": "Discord Token", "pattern": r'(?:discord|bot).*?["\'][A-Za-z0-9_-]{24}\.[A-Za-z0-9_-]{6}\.[A-Za-z0-9_-]{27}["\']',
     "severity": "CRITICAL", "fix": 'Use environment variable: os.environ["DISCORD_TOKEN"]'},
    # Database URLs (credentials embedded as user:pass@host)
    {"name": "Database URL", "pattern": r'(?:postgres|mysql|mongodb|redis)://[^\s"\']+:[^\s"\']+@[^\s"\']+',
     "severity": "CRITICAL", "fix": 'Use environment variable: os.environ["DATABASE_URL"]'},
    # JWT / Generic tokens
    {"name": "JWT Token", "pattern": r'eyJ[a-zA-Z0-9_-]{10,}\.eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}',
     "severity": "HIGH", "fix": 'Never hardcode JWTs — generate at runtime'},
    # Private keys
    {"name": "Private Key", "pattern": r'-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----',
     "severity": "CRITICAL", "fix": 'Store private keys in secrets manager or ~/.ssh/, NEVER in source'},
    # Generic high-entropy secrets (password assignments)
    {"name": "Hardcoded Password", "pattern": r'(?:\w*(?:password|passwd|pwd|secret))\s*[=:]\s*["\'][^\s"\']{8,}["\']',
     "severity": "CRITICAL", "fix": 'Use environment variable or secrets manager for passwords'},
    # Generic API key assignment
    {"name": "Generic API Key", "pattern": r'(?:api[_-]?key|apikey|api[_-]?secret)\s*[=:]\s*["\'][a-zA-Z0-9_\-]{20,}["\']',
     "severity": "HIGH", "fix": 'Use environment variable for API keys'},
    # Supabase (fixed JWT header prefix)
    {"name": "Supabase Key", "pattern": r'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9\.[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+',
     "severity": "HIGH", "fix": 'Use environment variable: os.environ["SUPABASE_KEY"]'},
    # SendGrid
    {"name": "SendGrid Key", "pattern": r'SG\.[a-zA-Z0-9_-]{22}\.[a-zA-Z0-9_-]{43}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["SENDGRID_API_KEY"]'},
    # Cloudflare — pattern is generic 40-char token, so a context match is required
    {"name": "Cloudflare API Token", "pattern": r'[a-zA-Z0-9_-]{40}', "severity": "HIGH",
     "fix": 'Use environment variable: os.environ["CF_API_TOKEN"]',
     "context_required": r'(?:cloudflare|cf)[_-]?(?:api)?[_-]?(?:token|key)\s*[=:]'},
    # MiniMax
    {"name": "MiniMax API Key", "pattern": r'sk-cp-[a-zA-Z0-9_-]{20,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["MINIMAX_API_KEY"]'},
    # Vercel
    {"name": "Vercel Token", "pattern": r'vercel_[a-zA-Z0-9]{24,}', "severity": "CRITICAL",
     "fix": 'Use environment variable or Vercel CLI auth'},
]
98
+
99
# Files to ALWAYS skip
# NOTE(review): these skip sets are not consulted by scan_secrets in this
# module — confirm where the filename filtering is meant to be wired in.
SKIP_FILES = {
    '.env', '.env.local', '.env.example', '.gitignore',
    'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
    'poetry.lock', 'Pipfile.lock', 'requirements.txt',
}

SKIP_EXTENSIONS = {
    '.pyc', '.whl', '.egg', '.so', '.dll', '.exe',
    '.png', '.jpg', '.gif', '.ico', '.svg', '.woff',
    '.ttf', '.mp3', '.mp4', '.zip', '.tar', '.gz',
}

# Lines that are clearly not real secrets.
# All entries must be lowercase: scan_secrets tests them against a lowercased
# copy of the line, so the old mixed-case entries ('TODO', 'FIXME',
# 'INSERT_', 'REPLACE_') could never match anything.
FALSE_POSITIVE_INDICATORS = [
    'example', 'placeholder', 'your_', 'xxx', 'changeme',
    'todo', 'fixme', '<your', 'insert_', 'replace_',
    'test_key', 'fake_', 'dummy', 'sample',
]
118
+
119
+
120
def scan_secrets(code: str, filename: str = "") -> List[SecretFinding]:
    """Scan code for hardcoded secrets. Returns list of findings.

    Args:
        code: Source text, scanned line by line.
        filename: Name of the file being scanned; currently informational
            only — confirm whether SKIP_FILES/SKIP_EXTENSIONS should be
            honored here or by the caller.

    Returns:
        One SecretFinding per pattern match, with the secret redacted.
    """
    findings = []
    lines = code.split("\n")

    # Hoisted + lowercased once: the comparison below is against a lowercased
    # line, so indicators must be lowercase too. (Previously mixed-case
    # entries like 'TODO'/'FIXME' could never match.)
    fp_markers = [indicator.lower() for indicator in FALSE_POSITIVE_INDICATORS]

    for i, line in enumerate(lines, 1):
        stripped = line.strip()

        # Skip comment lines (Python '#', C-style '//', block-comment '*')
        if stripped.startswith(('#', '//', '*')):
            continue

        # Skip lines that are clearly examples/placeholders
        line_lower = line.lower()
        if any(marker in line_lower for marker in fp_markers):
            continue

        for pattern_def in SECRET_PATTERNS:
            # If pattern requires context (like Cloudflare), check context first
            if 'context_required' in pattern_def:
                if not re.search(pattern_def['context_required'], line, re.IGNORECASE):
                    continue

            for match in re.finditer(pattern_def['pattern'], line, re.IGNORECASE):
                secret_text = match.group()

                # Skip very short matches (likely false positives)
                if len(secret_text) < 10 and pattern_def['severity'] != 'CRITICAL':
                    continue

                # Redact for display: keep 4 chars at each end, star the rest
                if len(secret_text) > 8:
                    redacted = secret_text[:4] + '*' * (len(secret_text) - 8) + secret_text[-4:]
                else:
                    redacted = '*' * len(secret_text)

                findings.append(SecretFinding(
                    line=i,
                    column=match.start() + 1,
                    secret_type=pattern_def['name'],
                    severity=pattern_def['severity'],
                    matched=redacted,
                    raw_line=stripped[:120],
                    fix=pattern_def['fix'],
                    confidence=0.95 if pattern_def['severity'] == 'CRITICAL' else 0.80,
                ))

    return findings
169
+
170
+
171
def scan_diff(diff_text: str) -> List[SecretFinding]:
    """Scan a git diff for secrets in ADDED lines only.

    Hunk headers ("@@ -a,b +c,d @@") are parsed so each finding carries the
    line number in the post-image file; the previous version scanned each
    added line in isolation and therefore reported every finding as line 1.
    """
    from dataclasses import replace  # module header imports only dataclass/field

    findings = []
    current_file = ""
    new_lineno = 0  # current line number in the new (post-image) file

    for line in diff_text.split("\n"):
        if line.startswith("diff --git"):
            parts = line.split(" b/")
            current_file = parts[-1] if len(parts) > 1 else ""
            continue

        if line.startswith("@@"):
            # "+c" in the hunk header is the first new-file line of the hunk.
            header = re.search(r"\+(\d+)", line)
            if header:
                new_lineno = int(header.group(1)) - 1
            continue

        if line.startswith("+") and not line.startswith("+++"):
            new_lineno += 1
            added_line = line[1:]
            for finding in scan_secrets(added_line, current_file):
                # scan_secrets saw a single line (its line is always 1);
                # rewrite it to the position within the new file.
                findings.append(replace(finding, line=new_lineno))
        elif not line.startswith("-"):
            # Context and other non-removal lines advance the new-file counter.
            new_lineno += 1

    return findings
189
+
190
+
191
def format_findings(findings: List[SecretFinding], block_mode: bool = True) -> str:
    """Render findings as a human-readable, actionable report string.

    With block_mode and at least one CRITICAL finding the report is phrased
    as a commit block; otherwise it is a warning.
    """
    if not findings:
        return "PASS — No secrets detected."

    critical_hits = [item for item in findings if item.severity == "CRITICAL"]
    high_hits = [item for item in findings if item.severity == "HIGH"]
    blocking = block_mode and bool(critical_hits)

    report = []
    if blocking:
        report.append("BLOCKED — Secrets detected in staged changes.\n")
    else:
        report.append(f"WARNING — {len(findings)} potential secrets found.\n")

    report.append(f" CRITICAL: {len(critical_hits)}")
    report.append(f" HIGH: {len(high_hits)}")
    report.append("")

    for item in findings:
        report.extend([
            f" [{item.severity}] {item.secret_type} (line {item.line}, col {item.column})",
            f" Found: {item.matched}",
            f" Fix: {item.fix}",
            "",
        ])

    if blocking:
        report.append("Commit blocked. Fix the issues above and try again.")
        report.append("To bypass (NOT recommended): git commit --no-verify")

    return "\n".join(report)
220
+
221
+
222
def generate_fix_patch(code: str, findings: List[SecretFinding]) -> str:
    """Generate a diff-style patch that replaces secrets with env var lookups.

    Args:
        code: Original source text the findings were produced from.
        findings: Findings whose lines should be patched.

    Returns:
        A human-readable pseudo-diff; "" when there is nothing to patch.
    """
    if not findings:
        return ""

    lines = code.split("\n")
    patches = []

    for f in findings:
        if f.line > len(lines):
            # Finding points outside this text — nothing to patch.
            continue
        original = lines[f.line - 1]

        # Default env var name is derived from the secret type...
        env_name = f.secret_type.upper().replace(" ", "_").replace("-", "_")

        # ...but prefer the exact name suggested in the fix text, if present.
        # Uses the module-level `re`; the old in-loop `import re as _re`
        # was redundant.
        if 'os.environ' in f.fix:
            env_match = re.search(r'os\.environ\["([^"]+)"\]', f.fix)
            if env_match:
                env_name = env_match.group(1)

        patches.append(f"--- line {f.line}")
        patches.append(f"- {original.strip()}")

        # Try to generate a smart replacement
        new_line = _generate_replacement(original, f, env_name)
        patches.append(f"+ {new_line.strip()}")
        patches.append("")

    return "\n".join(patches)
254
+
255
+
256
def _generate_replacement(line: str, finding: SecretFinding, env_name: str) -> str:
    """Generate a replacement line using env vars.

    ``finding.matched`` is the REDACTED secret (middle characters replaced
    by '*'), so it is turned into a regex keeping the literal prefix/suffix
    while each '*' matches any single character. The previous code escaped
    AFTER substituting '.', so the wildcards were re-escaped into literal
    dots and the pattern could never match the real secret — every call
    fell through to the fallback.
    """
    # Escape first, then relax the escaped stars into single-char wildcards.
    matched_pattern = re.escape(finding.matched).replace(re.escape('*'), '.')

    # Look for an assignment of the (redacted) secret to a name.
    assign_match = re.search(
        r'(\w+)\s*([=:])\s*["\']' + matched_pattern + r'["\']',
        line
    )

    if assign_match:
        var_name = assign_match.group(1)
        operator = assign_match.group(2)
        if operator == '=':
            return f'{var_name} = os.environ["{env_name}"]'
        # "key: value" style (dict entry / YAML-ish) — suggest .get()
        return f'{var_name}: os.environ.get("{env_name}")'

    # Fallback: just suggest the env var
    return f'os.environ["{env_name}"] # was: {finding.matched}'