codeguard-pro 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
learning_loop.py ADDED
@@ -0,0 +1,168 @@
1
+ """Reviewable learning loop for new threats and missed detections.
2
+
3
+ Stores suspicious samples as JSON artifacts, generates issue-ready markdown,
4
+ and summarizes the local learning corpus. This is intentionally review-first:
5
+ it does not auto-modify detection rules.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ import re
11
+ import time
12
+ from collections import Counter
13
+ from typing import Dict, List, Optional
14
+
15
+
16
# Default root directory (relative to the CWD) for learning-corpus artifacts.
DEFAULT_CORPUS_DIR = "learning"
17
+
18
+
19
def _safe_slug(text: str, fallback: str = "sample") -> str:
    """Turn *text* into a filesystem-friendly slug, or *fallback* if empty.

    Runs of non-alphanumeric characters collapse to single hyphens, edge
    hyphens are stripped, and the result is capped at 60 characters.
    """
    normalized = text.strip().lower()
    hyphenated = re.sub(r"[^a-zA-Z0-9]+", "-", normalized)
    trimmed = hyphenated.strip("-")[:60]
    return trimmed if trimmed else fallback
22
+
23
+
24
def _ensure_dirs(corpus_dir: str) -> Dict[str, str]:
    """Create (if missing) and return the corpus directory layout.

    Returns a mapping with keys "base", "candidates", and "issues",
    each an absolute path.
    """
    root = os.path.abspath(corpus_dir)
    layout = {
        "base": root,
        "candidates": os.path.join(root, "candidates"),
        "issues": os.path.join(root, "issues"),
    }
    for key in ("candidates", "issues"):
        os.makedirs(layout[key], exist_ok=True)
    return layout
31
+
32
+
33
def record_candidate(
    title: str,
    code: str,
    language: str = "python",
    source: str = "manual",
    expected_behavior: str = "",
    regex_findings: Optional[List[Dict]] = None,
    ai_findings: Optional[List[Dict]] = None,
    corpus_dir: str = DEFAULT_CORPUS_DIR,
) -> Dict:
    """Persist a suspicious sample for later review.

    Writes a JSON artifact under ``<corpus_dir>/candidates`` and returns a
    small receipt describing what was stored.

    Args:
        title: Human-readable name; slugified into the candidate id.
        code: The suspicious source sample itself.
        language: Language tag stored with the sample.
        source: Where the sample came from (e.g. "manual").
        expected_behavior: What detection *should* have happened.
        regex_findings: Findings from the regex layer, if any.
        ai_findings: Findings from the AI layer, if any.
        corpus_dir: Root of the learning corpus on disk.

    Returns:
        Dict with "status", "id", "path", and the two finding counts.
    """
    dirs = _ensure_dirs(corpus_dir)
    # NOTE(review): the id timestamp is local time while created_at below is
    # UTC — confirm whether ids should also be UTC for cross-machine sorting.
    ts = time.strftime("%Y%m%d-%H%M%S")
    slug = _safe_slug(title)
    candidate_id = f"{ts}-{slug}"
    path = os.path.join(dirs["candidates"], f"{candidate_id}.json")

    data = {
        "id": candidate_id,
        "title": title,
        "language": language,
        "source": source,
        "expected_behavior": expected_behavior,
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "regex_findings": regex_findings or [],
        "ai_findings": ai_findings or [],
        "code": code,
        "status": "triage",  # every new sample starts in triage
    }

    # Explicit UTF-8 so artifacts are portable across platforms/locales.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
        f.write("\n")

    return {
        "status": "RECORDED",
        "id": candidate_id,
        "path": path,
        "regex_findings": len(data["regex_findings"]),
        "ai_findings": len(data["ai_findings"]),
    }
74
+
75
+
76
def load_candidate(candidate_path: str) -> Dict:
    """Load a previously recorded candidate JSON artifact.

    Raises whatever ``open``/``json.load`` raise on missing or corrupt files;
    callers that tolerate bad artifacts (e.g. corpus_summary) catch these.
    """
    # Explicit UTF-8: artifacts are written as UTF-8 by record_candidate.
    with open(candidate_path, "r", encoding="utf-8") as f:
        return json.load(f)
79
+
80
+
81
def generate_issue_markdown(candidate_path: str, corpus_dir: str = DEFAULT_CORPUS_DIR) -> Dict:
    """Generate a GitHub-issue-ready markdown report from a candidate.

    Args:
        candidate_path: Path to a candidate JSON artifact (see record_candidate).
        corpus_dir: Corpus root; the draft is written to
            ``<corpus_dir>/issues/<candidate id>.md``.

    Returns:
        Dict with "status", the draft "path", and "candidate_id".
    """
    dirs = _ensure_dirs(corpus_dir)
    candidate = load_candidate(candidate_path)
    # Tolerate artifacts missing "id" (previously candidate["id"] below
    # raised KeyError even though the title line already falls back).
    candidate_id = candidate.get("id", "unknown")
    regex_count = len(candidate.get("regex_findings", []))
    ai_count = len(candidate.get("ai_findings", []))

    # Classify which detection layer(s) currently catch this sample.
    capability = "unknown"
    if ai_count and not regex_count:
        capability = "AI-only detection"
    elif regex_count and ai_count:
        capability = "Both layers detected"
    elif regex_count:
        capability = "Regex-only detection"

    md = "\n".join([
        f"# Detection Gap: {candidate.get('title', candidate.get('id', 'unknown'))}",
        "",
        "## Summary",
        f"- Candidate ID: `{candidate.get('id', 'unknown')}`",
        f"- Language: `{candidate.get('language', 'unknown')}`",
        f"- Source: `{candidate.get('source', 'unknown')}`",
        f"- Expected behavior: {candidate.get('expected_behavior', 'n/a') or 'n/a'}",
        f"- Current capability: {capability}",
        f"- Regex findings: {regex_count}",
        f"- AI findings: {ai_count}",
        "",
        "## Why This Matters",
        "This sample should be reviewed to determine whether it warrants a new deterministic detector, an AST rule, a red-team test, or a documentation-only limitation.",
        "",
        "## Reproduction Sample",
        f"```{candidate.get('language', 'text')}",
        candidate.get("code", ""),
        "```",
        "",
        "## Current Findings",
        "```json",
        json.dumps({
            "regex_findings": candidate.get("regex_findings", []),
            "ai_findings": candidate.get("ai_findings", []),
        }, indent=2),
        "```",
        "",
        "## Acceptance Criteria",
        "- Add or document the expected detector behavior.",
        "- Add a regression test covering this sample.",
        "- Avoid introducing false positives on clean code.",
    ])

    issue_path = os.path.join(dirs["issues"], f"{candidate_id}.md")
    # Explicit UTF-8 for portability of the markdown draft.
    with open(issue_path, "w", encoding="utf-8") as f:
        f.write(md)
        f.write("\n")

    return {"status": "ISSUE_READY", "path": issue_path, "candidate_id": candidate_id}
135
+
136
+
137
def corpus_summary(corpus_dir: str = DEFAULT_CORPUS_DIR) -> Dict:
    """Summarize all stored learning artifacts."""
    dirs = _ensure_dirs(corpus_dir)

    # Load every readable candidate artifact; corrupt files are skipped so a
    # single bad artifact cannot abort the whole report.
    loaded = []
    candidate_dir = dirs["candidates"]
    for entry in sorted(os.listdir(candidate_dir)):
        if not entry.endswith(".json"):
            continue
        try:
            loaded.append(load_candidate(os.path.join(candidate_dir, entry)))
        except Exception:
            continue

    # Single pass for the detection-coverage breakdown.
    ai_only = regex_only = both = 0
    for c in loaded:
        has_ai = bool(c.get("ai_findings"))
        has_regex = bool(c.get("regex_findings"))
        if has_ai and has_regex:
            both += 1
        elif has_ai:
            ai_only += 1
        elif has_regex:
            regex_only += 1

    draft_count = sum(1 for entry in os.listdir(dirs["issues"]) if entry.endswith(".md"))

    return {
        "corpus_dir": dirs["base"],
        "total_candidates": len(loaded),
        "issue_drafts": draft_count,
        "by_language": dict(Counter(c.get("language", "unknown") for c in loaded)),
        "by_source": dict(Counter(c.get("source", "unknown") for c in loaded)),
        "by_status": dict(Counter(c.get("status", "triage") for c in loaded)),
        "coverage": {
            "ai_only": ai_only,
            "regex_only": regex_only,
            "both": both,
        },
    }
secret_scanner.py ADDED
@@ -0,0 +1,273 @@
1
+ """Secrets detection engine — the core of CodeGuard Pro.
2
+
3
+ Catches hardcoded API keys, tokens, passwords, and credentials
4
+ BEFORE they hit git. Zero false-positive tolerance on high-confidence matches.
5
+ """
6
+
7
+ import re
8
+ from typing import List, Dict, Optional
9
+ from dataclasses import dataclass, field
10
+
11
+
12
@dataclass(frozen=True)
class SecretFinding:
    """Immutable finding from secrets scan.

    One instance per pattern match. The full secret is never stored:
    only the redacted form (``matched``) plus a truncated copy of the line.
    """
    line: int            # 1-based line number within the scanned text
    column: int          # 1-based column of the match start
    secret_type: str     # human-readable pattern name, e.g. "OpenAI API Key"
    severity: str        # CRITICAL, HIGH
    matched: str         # redacted version (middle characters replaced by '*')
    raw_line: str        # stripped source line, truncated to 120 chars
    fix: str             # exact fix suggestion
    confidence: float    # 0-1
23
+
24
+
25
# High-confidence patterns — these are REAL secrets, not variable names.
#
# Entry schema (consumed by scan_secrets, which matches with re.IGNORECASE):
#   "name"             — report label for the finding
#   "pattern"          — regex for the secret itself
#   "severity"         — "CRITICAL" or "HIGH"
#   "fix"              — remediation hint shown to the user
#   "context_required" — optional regex that must ALSO match the line before
#                        "pattern" is tried (used for otherwise-generic
#                        patterns like the Cloudflare token)
SECRET_PATTERNS: List[Dict] = [
    # OpenAI
    {"name": "OpenAI API Key", "pattern": r'sk-(?:proj-)?[a-zA-Z0-9_-]{20,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["OPENAI_API_KEY"]'},
    # Anthropic
    {"name": "Anthropic API Key", "pattern": r'sk-ant-[a-zA-Z0-9_-]{20,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["ANTHROPIC_API_KEY"]'},
    # GitHub
    {"name": "GitHub Token", "pattern": r'ghp_[a-zA-Z0-9]{36}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["GITHUB_TOKEN"]'},
    {"name": "GitHub OAuth", "pattern": r'gho_[a-zA-Z0-9]{36}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["GITHUB_OAUTH_TOKEN"]'},
    {"name": "GitHub App Token", "pattern": r'(?:ghu|ghs|ghr)_[a-zA-Z0-9]{36}', "severity": "CRITICAL",
     "fix": 'Use environment variable or GitHub App auth flow'},
    # AWS
    {"name": "AWS Access Key", "pattern": r'AKIA[0-9A-Z]{16}', "severity": "CRITICAL",
     "fix": 'Use AWS credentials file (~/.aws/credentials) or IAM roles'},
    {"name": "AWS Secret Key", "pattern": r'(?:aws_secret_access_key|AWS_SECRET)\s*[=:]\s*["\']?[A-Za-z0-9/+=]{40}["\']?',
     "severity": "CRITICAL", "fix": 'Use AWS credentials file or environment variables'},
    # Google
    {"name": "Google API Key", "pattern": r'AIza[0-9A-Za-z_-]{35}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["GOOGLE_API_KEY"]'},
    {"name": "Google OAuth Secret", "pattern": r'GOCSPX-[a-zA-Z0-9_-]{28}', "severity": "CRITICAL",
     "fix": 'Store in secrets manager, never in source'},
    # Stripe
    {"name": "Stripe Secret Key", "pattern": r'sk_(?:live|test)_[a-zA-Z0-9]{24,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["STRIPE_SECRET_KEY"]'},
    {"name": "Stripe Publishable Key", "pattern": r'pk_(?:live|test)_[a-zA-Z0-9]{24,}', "severity": "HIGH",
     "fix": 'Publishable keys are less sensitive but still use env vars'},
    # Slack
    {"name": "Slack Token", "pattern": r'xox[bpors]-[a-zA-Z0-9\-]{10,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["SLACK_TOKEN"]'},
    {"name": "Slack Webhook", "pattern": r'hooks\.slack\.com/services/T[a-zA-Z0-9_]+/B[a-zA-Z0-9_]+/[a-zA-Z0-9_]+',
     "severity": "HIGH", "fix": 'Store webhook URL in environment variable'},
    # Twilio / Telnyx
    {"name": "Twilio API Key", "pattern": r'SK[a-f0-9]{32}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["TWILIO_API_KEY"]'},
    # Discord
    {"name": "Discord Token", "pattern": r'(?:discord|bot).*?["\'][A-Za-z0-9_-]{24}\.[A-Za-z0-9_-]{6}\.[A-Za-z0-9_-]{27}["\']',
     "severity": "CRITICAL", "fix": 'Use environment variable: os.environ["DISCORD_TOKEN"]'},
    # Database URLs (credentials embedded as user:pass@host)
    {"name": "Database URL", "pattern": r'(?:postgres|mysql|mongodb|redis)://[^\s"\']+:[^\s"\']+@[^\s"\']+',
     "severity": "CRITICAL", "fix": 'Use environment variable: os.environ["DATABASE_URL"]'},
    # JWT / Generic tokens
    {"name": "JWT Token", "pattern": r'eyJ[a-zA-Z0-9_-]{10,}\.eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{10,}',
     "severity": "HIGH", "fix": 'Never hardcode JWTs — generate at runtime'},
    # Private keys
    {"name": "Private Key", "pattern": r'-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----',
     "severity": "CRITICAL", "fix": 'Store private keys in secrets manager or ~/.ssh/, NEVER in source'},
    # Generic high-entropy secrets (password assignments)
    {"name": "Hardcoded Password", "pattern": r'(?:\w*(?:password|passwd|pwd|secret))\s*[=:]\s*["\'][^\s"\']{8,}["\']',
     "severity": "CRITICAL", "fix": 'Use environment variable or secrets manager for passwords'},
    # Generic API key assignment
    {"name": "Generic API Key", "pattern": r'(?:api[_-]?key|apikey|api[_-]?secret)\s*[=:]\s*["\'][a-zA-Z0-9_\-]{20,}["\']',
     "severity": "HIGH", "fix": 'Use environment variable for API keys'},
    # Supabase (fixed JWT header prefix)
    {"name": "Supabase Key", "pattern": r'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9\.[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+',
     "severity": "HIGH", "fix": 'Use environment variable: os.environ["SUPABASE_KEY"]'},
    # SendGrid
    {"name": "SendGrid Key", "pattern": r'SG\.[a-zA-Z0-9_-]{22}\.[a-zA-Z0-9_-]{43}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["SENDGRID_API_KEY"]'},
    # Cloudflare — pattern is generic 40-char token, so a context match is required
    {"name": "Cloudflare API Token", "pattern": r'[a-zA-Z0-9_-]{40}', "severity": "HIGH",
     "fix": 'Use environment variable: os.environ["CF_API_TOKEN"]',
     "context_required": r'(?:cloudflare|cf)[_-]?(?:api)?[_-]?(?:token|key)\s*[=:]'},
    # MiniMax
    {"name": "MiniMax API Key", "pattern": r'sk-cp-[a-zA-Z0-9_-]{20,}', "severity": "CRITICAL",
     "fix": 'Use environment variable: os.environ["MINIMAX_API_KEY"]'},
    # Vercel
    {"name": "Vercel Token", "pattern": r'vercel_[a-zA-Z0-9]{24,}', "severity": "CRITICAL",
     "fix": 'Use environment variable or Vercel CLI auth'},
]
98
+
99
# Files to ALWAYS skip
# NOTE(review): these skip sets are not consulted by scan_secrets in this
# module — confirm where the filename filtering is meant to be wired in.
SKIP_FILES = {
    '.env', '.env.local', '.env.example', '.gitignore',
    'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
    'poetry.lock', 'Pipfile.lock', 'requirements.txt',
}

SKIP_EXTENSIONS = {
    '.pyc', '.whl', '.egg', '.so', '.dll', '.exe',
    '.png', '.jpg', '.gif', '.ico', '.svg', '.woff',
    '.ttf', '.mp3', '.mp4', '.zip', '.tar', '.gz',
}

# Lines that are clearly not real secrets.
# All entries must be lowercase: scan_secrets tests them against a lowercased
# copy of the line, so the old mixed-case entries ('TODO', 'FIXME',
# 'INSERT_', 'REPLACE_') could never match anything.
FALSE_POSITIVE_INDICATORS = [
    'example', 'placeholder', 'your_', 'xxx', 'changeme',
    'todo', 'fixme', '<your', 'insert_', 'replace_',
    'test_key', 'fake_', 'dummy', 'sample',
]
118
+
119
+
120
def scan_secrets(code: str, filename: str = "") -> List[SecretFinding]:
    """Scan code for hardcoded secrets. Returns list of findings.

    Args:
        code: Source text, scanned line by line.
        filename: Name of the file being scanned; currently informational
            only — confirm whether SKIP_FILES/SKIP_EXTENSIONS should be
            honored here or by the caller.

    Returns:
        One SecretFinding per pattern match, with the secret redacted.
    """
    findings = []
    lines = code.split("\n")

    # Hoisted + lowercased once: the comparison below is against a lowercased
    # line, so indicators must be lowercase too. (Previously mixed-case
    # entries like 'TODO'/'FIXME' could never match.)
    fp_markers = [indicator.lower() for indicator in FALSE_POSITIVE_INDICATORS]

    for i, line in enumerate(lines, 1):
        stripped = line.strip()

        # Skip comment lines (Python '#', C-style '//', block-comment '*')
        if stripped.startswith(('#', '//', '*')):
            continue

        # Skip lines that are clearly examples/placeholders
        line_lower = line.lower()
        if any(marker in line_lower for marker in fp_markers):
            continue

        for pattern_def in SECRET_PATTERNS:
            # If pattern requires context (like Cloudflare), check context first
            if 'context_required' in pattern_def:
                if not re.search(pattern_def['context_required'], line, re.IGNORECASE):
                    continue

            for match in re.finditer(pattern_def['pattern'], line, re.IGNORECASE):
                secret_text = match.group()

                # Skip very short matches (likely false positives)
                if len(secret_text) < 10 and pattern_def['severity'] != 'CRITICAL':
                    continue

                # Redact for display: keep 4 chars at each end, star the rest
                if len(secret_text) > 8:
                    redacted = secret_text[:4] + '*' * (len(secret_text) - 8) + secret_text[-4:]
                else:
                    redacted = '*' * len(secret_text)

                findings.append(SecretFinding(
                    line=i,
                    column=match.start() + 1,
                    secret_type=pattern_def['name'],
                    severity=pattern_def['severity'],
                    matched=redacted,
                    raw_line=stripped[:120],
                    fix=pattern_def['fix'],
                    confidence=0.95 if pattern_def['severity'] == 'CRITICAL' else 0.80,
                ))

    return findings
169
+
170
+
171
def scan_diff(diff_text: str) -> List[SecretFinding]:
    """Scan a git diff for secrets in ADDED lines only.

    Hunk headers ("@@ -a,b +c,d @@") are parsed so each finding carries the
    line number in the post-image file; the previous version scanned each
    added line in isolation and therefore reported every finding as line 1.
    """
    from dataclasses import replace  # module header imports only dataclass/field

    findings = []
    current_file = ""
    new_lineno = 0  # current line number in the new (post-image) file

    for line in diff_text.split("\n"):
        if line.startswith("diff --git"):
            parts = line.split(" b/")
            current_file = parts[-1] if len(parts) > 1 else ""
            continue

        if line.startswith("@@"):
            # "+c" in the hunk header is the first new-file line of the hunk.
            header = re.search(r"\+(\d+)", line)
            if header:
                new_lineno = int(header.group(1)) - 1
            continue

        if line.startswith("+") and not line.startswith("+++"):
            new_lineno += 1
            added_line = line[1:]
            for finding in scan_secrets(added_line, current_file):
                # scan_secrets saw a single line (its line is always 1);
                # rewrite it to the position within the new file.
                findings.append(replace(finding, line=new_lineno))
        elif not line.startswith("-"):
            # Context and other non-removal lines advance the new-file counter.
            new_lineno += 1

    return findings
189
+
190
+
191
def format_findings(findings: List[SecretFinding], block_mode: bool = True) -> str:
    """Render findings as a human-readable, actionable report string.

    With block_mode and at least one CRITICAL finding the report is phrased
    as a commit block; otherwise it is a warning.
    """
    if not findings:
        return "PASS — No secrets detected."

    critical_hits = [item for item in findings if item.severity == "CRITICAL"]
    high_hits = [item for item in findings if item.severity == "HIGH"]
    blocking = block_mode and bool(critical_hits)

    report = []
    if blocking:
        report.append("BLOCKED — Secrets detected in staged changes.\n")
    else:
        report.append(f"WARNING — {len(findings)} potential secrets found.\n")

    report.append(f" CRITICAL: {len(critical_hits)}")
    report.append(f" HIGH: {len(high_hits)}")
    report.append("")

    for item in findings:
        report.extend([
            f" [{item.severity}] {item.secret_type} (line {item.line}, col {item.column})",
            f" Found: {item.matched}",
            f" Fix: {item.fix}",
            "",
        ])

    if blocking:
        report.append("Commit blocked. Fix the issues above and try again.")
        report.append("To bypass (NOT recommended): git commit --no-verify")

    return "\n".join(report)
220
+
221
+
222
def generate_fix_patch(code: str, findings: List[SecretFinding]) -> str:
    """Generate a diff-style patch that replaces secrets with env var lookups.

    Args:
        code: Original source text the findings were produced from.
        findings: Findings whose lines should be patched.

    Returns:
        A human-readable pseudo-diff; "" when there is nothing to patch.
    """
    if not findings:
        return ""

    lines = code.split("\n")
    patches = []

    for f in findings:
        if f.line > len(lines):
            # Finding points outside this text — nothing to patch.
            continue
        original = lines[f.line - 1]

        # Default env var name is derived from the secret type...
        env_name = f.secret_type.upper().replace(" ", "_").replace("-", "_")

        # ...but prefer the exact name suggested in the fix text, if present.
        # Uses the module-level `re`; the old in-loop `import re as _re`
        # was redundant.
        if 'os.environ' in f.fix:
            env_match = re.search(r'os\.environ\["([^"]+)"\]', f.fix)
            if env_match:
                env_name = env_match.group(1)

        patches.append(f"--- line {f.line}")
        patches.append(f"- {original.strip()}")

        # Try to generate a smart replacement
        new_line = _generate_replacement(original, f, env_name)
        patches.append(f"+ {new_line.strip()}")
        patches.append("")

    return "\n".join(patches)
254
+
255
+
256
def _generate_replacement(line: str, finding: SecretFinding, env_name: str) -> str:
    """Generate a replacement line using env vars.

    ``finding.matched`` is the REDACTED secret (middle characters replaced
    by '*'), so it is turned into a regex keeping the literal prefix/suffix
    while each '*' matches any single character. The previous code escaped
    AFTER substituting '.', so the wildcards were re-escaped into literal
    dots and the pattern could never match the real secret — every call
    fell through to the fallback.
    """
    # Escape first, then relax the escaped stars into single-char wildcards.
    matched_pattern = re.escape(finding.matched).replace(re.escape('*'), '.')

    # Look for an assignment of the (redacted) secret to a name.
    assign_match = re.search(
        r'(\w+)\s*([=:])\s*["\']' + matched_pattern + r'["\']',
        line
    )

    if assign_match:
        var_name = assign_match.group(1)
        operator = assign_match.group(2)
        if operator == '=':
            return f'{var_name} = os.environ["{env_name}"]'
        # "key: value" style (dict entry / YAML-ish) — suggest .get()
        return f'{var_name}: os.environ.get("{env_name}")'

    # Fallback: just suggest the env var
    return f'os.environ["{env_name}"] # was: {finding.matched}'