delimit-cli 3.15.11 → 3.15.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ """PII and secret redaction — sanitize before sending to external LLMs (STR-055).
2
+
3
+ Auto-detect and redact secrets, API keys, and PII before prompts
4
+ leave the local environment. Replacement tokens allow reconstruction
5
+ if needed.
6
+
7
+ Focus group (Security): "Auto-detect and redact secrets, API keys, PII
8
+ before sending prompt to external LLM. This is a massive security win."
9
+ """
10
+
11
+ import re
12
+ import json
13
+ import hashlib
14
+ from typing import Any, Dict, List, Optional, Tuple
15
+
16
# Patterns for sensitive data detection.
#
# Maps a category name to an ordered list of (regex, human-readable label)
# pairs. Order matters: the patterns of a category are applied one after the
# other to the progressively redacted text, so when two patterns can overlap
# the broader one must come first.
PATTERNS = {
    "api_key": [
        (r'\b(sk-[a-zA-Z0-9]{20,})\b', "OpenAI API key"),
        (r'\b(xai-[a-zA-Z0-9]{20,})\b', "xAI API key"),
        (r'\b(AIza[a-zA-Z0-9_-]{30,})\b', "Google API key"),
        (r'\b(ghp_[a-zA-Z0-9]{36,})\b', "GitHub PAT"),
        (r'\b(ghu_[a-zA-Z0-9]{36,})\b', "GitHub user token"),
        (r'\b(glpat-[a-zA-Z0-9_-]{20,})\b', "GitLab PAT"),
        (r'\b(npm_[a-zA-Z0-9]{36,})\b', "npm token"),
        (r'\b(pypi-[a-zA-Z0-9]{50,})\b', "PyPI token"),
    ],
    "secret": [
        (r'(?i)(password|passwd|pwd)\s*[=:]\s*["\']([^"\']{4,})["\']', "password"),
        (r'(?i)(secret|token|api_key|apikey)\s*[=:]\s*["\']([^"\']{8,})["\']', "secret/token"),
        (r'(?i)bearer\s+([a-zA-Z0-9._-]{20,})', "bearer token"),
    ],
    "pii": [
        # The TLD class is [A-Za-z]; a "|" inside a character class is a
        # literal pipe, not alternation, and must not be accepted here.
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', "email"),
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', "phone number"),
        (r'\b\d{3}-\d{2}-\d{4}\b', "SSN"),
        (r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', "credit card"),
    ],
    "infra": [
        # URL patterns run before the bare-IP pattern: if the IP inside a
        # connection string were tokenized first, the later whole-URL match
        # would contain a redaction token and the credentials around it could
        # be left in place.
        (r'(?i)(mongodb|postgres|mysql|redis)://[^\s]+', "database URL"),
        (r'(?i)https?://[^\s]*:(password|secret|token)[^\s]*', "URL with credentials"),
        (r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', "IP address"),
    ],
}
45
+
46
# Allowlist: regex fragments marking text that resembles sensitive data but
# is treated as harmless.
ALLOWLIST = [
    # Non-routable / loopback addresses and hosts
    r"0\.0\.0\.0",
    r"127\.0\.0\.1",
    r"localhost",
    # Documentation and test fixtures
    r"example\.com",
    r"test@test\.com",
    # Placeholder or already-sanitized values
    r"placeholder",
    r"REDACTED",
    r"<your-",
    # Template variables such as ${NAME}
    r"\$\{",
]
58
+
59
+
60
def _is_allowlisted(match: str) -> bool:
    """Report whether ``match`` contains any ALLOWLIST pattern.

    Each allowlist entry is searched for anywhere inside ``match``,
    ignoring case; a single hit is enough.
    """
    return any(
        re.search(allowed, match, re.IGNORECASE) is not None
        for allowed in ALLOWLIST
    )
65
+
66
+
67
+ def _make_token(category: str, index: int) -> str:
68
+ return f"[REDACTED_{category.upper()}_{index}]"
69
+
70
+
71
def redact(
    text: str,
    categories: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Redact sensitive data from text.

    Every pattern of every active category is applied in turn to the
    progressively redacted text. Each non-allowlisted match is replaced, at
    the exact span that matched, by a numbered token.

    Args:
        text: Text to scan and redact.
        categories: Which categories to scan (api_key, secret, pii, infra).
            None (or an empty list) = scan all. Unknown names are ignored.

    Returns:
        A dict with the same keys for every input, including empty text:
            redacted: the text with tokens substituted.
            findings: one dict per replacement (category, type, token,
                position, length, preview). ``position`` is the offset in the
                text as it stood when that pattern was applied, i.e. after
                earlier replacements.
            token_count: number of replacements made.
            token_map: token -> original text. Keep local; never send
                externally.
            categories_scanned: the category list that was used.
    """
    active_categories = categories or list(PATTERNS.keys())

    if not text:
        # Same shape as the normal return so callers can always read
        # "token_map" / "categories_scanned" without a KeyError.
        return {
            "redacted": "",
            "findings": [],
            "token_count": 0,
            "token_map": {},
            "categories_scanned": active_categories,
        }

    findings: List[Dict[str, Any]] = []
    token_map: Dict[str, str] = {}
    redacted = text

    for category in active_categories:
        if category not in PATTERNS:
            continue

        for pattern, label in PATTERNS[category]:

            def _substitute(match, category=category, label=label):
                """Record one match and return its replacement text."""
                matched_text = match.group(0)

                # NOTE(review): a match that embeds a token produced by an
                # earlier pattern contains "REDACTED"; while the allowlist
                # lists that literal, such a match is skipped as a whole, so
                # the order of PATTERNS decides which overlapping pattern wins.
                if _is_allowlisted(matched_text):
                    return matched_text

                token = _make_token(category, len(findings) + 1)
                if len(matched_text) > 12:
                    preview = matched_text[:4] + "..." + matched_text[-4:]
                else:
                    preview = "***"

                findings.append({
                    "category": category,
                    "type": label,
                    "token": token,
                    "position": match.start(),
                    "length": len(matched_text),
                    "preview": preview,
                })
                token_map[token] = matched_text
                return token

            # re.sub swaps the matched span itself. Replacing the first
            # textual occurrence instead could hit an identical substring
            # that the pattern never matched and leave the real one in place.
            redacted = re.sub(pattern, _substitute, redacted)

    return {
        "redacted": redacted,
        "findings": findings,
        "token_count": len(findings),
        "token_map": token_map,  # Keep local — never send externally
        "categories_scanned": active_categories,
    }
127
+
128
+
129
def scan(text: str) -> Dict[str, Any]:
    """Scan text for sensitive data WITHOUT redacting.

    Use this to preview what would be redacted. Runs ``redact`` over all
    categories and returns only a summary; the redacted text and the token
    map are discarded.

    Returns:
        A dict with ``findings`` (as produced by ``redact``), ``total``
        (number of findings), ``categories`` (distinct categories found,
        sorted so the output is stable between runs), ``safe`` (True when
        nothing was found) and a human-readable ``message``.
    """
    result = redact(text)
    findings = result["findings"]
    total = result["token_count"]

    if total > 0:
        message = f"Found {total} sensitive item(s)"
    else:
        message = "No sensitive data detected"

    return {
        "findings": findings,
        "total": total,
        # Sorted: iterating a set of strings has no fixed order across runs.
        "categories": sorted({f["category"] for f in findings}),
        "safe": total == 0,
        "message": message,
    }
142
+
143
+
144
def restore(redacted_text: str, token_map: Dict[str, str]) -> str:
    """Restore redacted text using the token map.

    Every occurrence of every token in ``redacted_text`` is replaced by its
    original value in a single pass, so the result does not depend on the
    order of ``token_map`` and restored content is never re-scanned for
    tokens.

    Args:
        redacted_text: Text containing redaction tokens.
        token_map: Token -> original text, as returned by ``redact``.

    Returns:
        The text with all known tokens substituted; the input is returned
        unchanged when the text or the map is empty.
    """
    if not redacted_text or not token_map:
        return redacted_text

    # Longest first so a token that is a prefix of another cannot shadow it;
    # empty keys are ignored because they would match at every position.
    tokens = sorted((t for t in token_map if t), key=len, reverse=True)
    if not tokens:
        return redacted_text

    token_re = re.compile("|".join(re.escape(t) for t in tokens))
    return token_re.sub(lambda m: token_map[m.group(0)], redacted_text)
@@ -0,0 +1,207 @@
1
+ """Prompt drift detection — same task behaves differently across models (STR-052).
2
+
3
+ Detects when the same prompt produces inconsistent results across
4
+ Claude, Codex, and Gemini. Flags divergence and suggests model-specific
5
+ adaptations.
6
+
7
+ Focus group (Indie): "Prompt drift — same task behaves differently
8
+ in Claude vs Codex vs Gemini."
9
+ """
10
+
11
+ import json
12
+ import time
13
+ import hashlib
14
+ from pathlib import Path
15
+ from typing import Any, Dict, List, Optional
16
+
17
# Per-user storage for prompt-drift data: ~/.delimit/prompt_drift/history.jsonl
DRIFT_DIR = Path.home().joinpath(".delimit", "prompt_drift")
HISTORY_FILE = DRIFT_DIR.joinpath("history.jsonl")
19
+
20
+
21
def _ensure_dir() -> None:
    """Create DRIFT_DIR and any missing parents; an existing directory is fine."""
    DRIFT_DIR.mkdir(exist_ok=True, parents=True)
23
+
24
+
25
+ def _hash_prompt(prompt: str) -> str:
26
+ return hashlib.sha256(prompt.encode()).hexdigest()[:12]
27
+
28
+
29
def record_result(
    prompt: str,
    model: str,
    result_summary: str,
    success: bool = True,
    task_type: str = "",
    duration_ms: int = 0,
) -> Dict[str, Any]:
    """Record a prompt execution result for drift analysis.

    Appends one JSON line to HISTORY_FILE, creating the directory first.

    Args:
        prompt: The prompt that was run; stored as a short hash plus a
            100-character preview.
        model: Model name; stored lower-cased and stripped.
        result_summary: Free-text outcome; truncated to 200 characters.
        success: Whether the run succeeded.
        task_type: Optional label used for filtering later.
        duration_ms: Optional run time in milliseconds.

    Returns:
        ``{"error": ...}`` when ``prompt`` is empty or ``model`` is empty or
        only whitespace; otherwise a dict with ``status``, ``prompt_hash``,
        ``model`` (as passed in) and ``message``.
    """
    normalized_model = model.lower().strip() if model else ""
    if not prompt or not normalized_model:
        # A whitespace-only model would otherwise be stored as "".
        return {"error": "prompt and model are required"}

    _ensure_dir()
    prompt_hash = _hash_prompt(prompt)

    entry = {
        "prompt_hash": prompt_hash,
        "prompt_preview": prompt[:100],
        "model": normalized_model,
        "result_summary": (result_summary or "")[:200],
        "success": success,
        "task_type": task_type,
        "duration_ms": duration_ms,
        # The trailing "Z" denotes UTC, so format the UTC time; strftime
        # without an explicit time tuple would use local time.
        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }

    with open(HISTORY_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")

    return {
        "status": "recorded",
        "prompt_hash": prompt_hash,
        "model": model,
        "message": f"Result recorded for {model}",
    }
64
+
65
+
66
def check_drift(
    prompt: str = "",
    task_type: str = "",
    threshold: float = 0.3,
) -> Dict[str, Any]:
    """Check for prompt drift — inconsistent results across models.

    Reads HISTORY_FILE, groups the recorded results by prompt hash and, for
    every prompt run on at least two models, compares per-model success
    rates. A finding is reported when the gap between the best and the worst
    rate exceeds ``threshold``.

    Args:
        prompt: Specific prompt to check (by hash). Empty = check every
            recorded prompt.
        task_type: Filter by task type.
        threshold: Drift threshold (0-1). Higher = more tolerant.

    Returns:
        A dict that always carries ``status`` and ``drift_detected``. On
        ``status == "ok"`` it also has ``findings``,
        ``total_prompts_analyzed``, ``total_entries`` and ``message``.
    """
    if not HISTORY_FILE.exists():
        return {
            "status": "no_data",
            "drift_detected": False,
            "message": "No prompt history. Use record_result() to start tracking.",
        }

    try:
        raw_lines = HISTORY_FILE.read_text().strip().split("\n")
    except (OSError, UnicodeDecodeError):
        return {
            "status": "error",
            "drift_detected": False,
            "message": "Could not read history",
        }

    entries: List[Dict] = []
    for line in raw_lines:
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # blank or corrupt line
        # Valid JSON is not necessarily a usable record (e.g. `null`, `{}`);
        # skip anything lacking the fields the analysis indexes below.
        if (
            isinstance(record, dict)
            and isinstance(record.get("prompt_hash"), str)
            and isinstance(record.get("model"), str)
            and "success" in record
        ):
            entries.append(record)

    # Filter
    if prompt:
        prompt_hash = _hash_prompt(prompt)
        entries = [e for e in entries if e["prompt_hash"] == prompt_hash]
    if task_type:
        entries = [e for e in entries if e.get("task_type") == task_type]

    if not entries:
        return {
            "status": "no_matches",
            "drift_detected": False,
            "message": "No matching prompt history found.",
        }

    # Group by prompt hash
    by_prompt: Dict[str, List[Dict]] = {}
    for e in entries:
        by_prompt.setdefault(e["prompt_hash"], []).append(e)

    drift_findings = []
    for ph, results in by_prompt.items():
        outcomes_by_model: Dict[str, List[bool]] = {}
        for r in results:
            outcomes_by_model.setdefault(r["model"], []).append(bool(r["success"]))

        if len(outcomes_by_model) < 2:
            continue  # Need at least 2 models to compare

        success_rates = {
            m: sum(outcomes) / len(outcomes)
            for m, outcomes in outcomes_by_model.items()
        }

        best = max(success_rates, key=success_rates.get)
        worst = min(success_rates, key=success_rates.get)
        divergence = success_rates[best] - success_rates[worst]

        if divergence > threshold:
            drift_findings.append({
                "prompt_hash": ph,
                "prompt_preview": results[0].get("prompt_preview", ""),
                # Sorted so the output does not depend on set/hash ordering.
                "models_compared": sorted(success_rates),
                "success_rates": success_rates,
                "best_model": best,
                "worst_model": worst,
                "divergence": round(divergence, 2),
                "recommendation": f"Use {best} for this task. {worst} has {round(success_rates[worst]*100)}% success rate.",
            })

    if drift_findings:
        message = f"{len(drift_findings)} drift(s) detected across {len(by_prompt)} prompt(s)"
    else:
        message = "No significant drift detected"

    return {
        "status": "ok",
        "drift_detected": len(drift_findings) > 0,
        "findings": drift_findings,
        "total_prompts_analyzed": len(by_prompt),
        "total_entries": len(entries),
        "message": message,
    }
154
+
155
+
156
def get_model_rankings(task_type: str = "") -> Dict[str, Any]:
    """Rank models by success rate and speed for a task type.

    Reads HISTORY_FILE and aggregates the records per model. Models are
    ordered by success rate (highest first), ties broken by lower average
    duration.

    Args:
        task_type: Only consider records with this task type. Empty = all.

    Returns:
        A dict with ``status`` ("ok", "no_data" or "error") and ``rankings``,
        a list of dicts with ``model``, ``success_rate`` (percent, one
        decimal), ``avg_duration_ms`` (0 when no duration was recorded) and
        ``total_tasks``. ``task_type`` and ``total_entries`` are included
        when records were found.
    """
    if not HISTORY_FILE.exists():
        return {"status": "no_data", "rankings": []}

    try:
        raw_lines = HISTORY_FILE.read_text().strip().split("\n")
    except (OSError, UnicodeDecodeError):
        return {"status": "error", "rankings": []}

    entries: List[Dict] = []
    for line in raw_lines:
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # blank or corrupt line
        # Valid JSON that is not a record with a model name (e.g. `null`,
        # `{}`) cannot be attributed to any model; skip it rather than crash.
        if isinstance(record, dict) and isinstance(record.get("model"), str):
            entries.append(record)

    if task_type:
        entries = [e for e in entries if e.get("task_type") == task_type]

    if not entries:
        return {"status": "no_data", "rankings": [], "task_type": task_type}

    # Aggregate per model
    model_stats: Dict[str, Dict] = {}
    for e in entries:
        stats = model_stats.setdefault(
            e["model"], {"successes": 0, "total": 0, "durations": []}
        )
        stats["total"] += 1
        if e.get("success"):
            stats["successes"] += 1
        duration = e.get("duration_ms")
        # Zero/missing durations are excluded from the average, as are
        # non-numeric values that could not be summed.
        if isinstance(duration, (int, float)) and duration:
            stats["durations"].append(duration)

    rankings = []
    for model, stats in model_stats.items():
        durations = stats["durations"]
        avg_duration = sum(durations) / len(durations) if durations else 0
        # total is at least 1 for every model that has an entry in model_stats.
        success_rate = stats["successes"] / stats["total"]
        rankings.append({
            "model": model,
            "success_rate": round(success_rate * 100, 1),
            "avg_duration_ms": round(avg_duration),
            "total_tasks": stats["total"],
        })

    rankings.sort(key=lambda r: (-r["success_rate"], r["avg_duration_ms"]))

    return {
        "status": "ok",
        "rankings": rankings,
        "task_type": task_type or "all",
        "total_entries": len(entries),
    }