delimit-cli 3.15.11 → 3.15.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/gateway/ai/activate_helpers.py +210 -0
- package/gateway/ai/collision_detect.py +141 -0
- package/gateway/ai/content_engine.py +2 -7
- package/gateway/ai/cross_model_audit.py +600 -0
- package/gateway/ai/github_scanner.py +622 -0
- package/gateway/ai/handoff_receipts.py +409 -0
- package/gateway/ai/key_resolver.py +2 -7
- package/gateway/ai/multi_review.py +154 -0
- package/gateway/ai/notify.py +4 -4
- package/gateway/ai/pii_redact.py +149 -0
- package/gateway/ai/prompt_drift.py +207 -0
- package/gateway/ai/reddit_scanner.py +562 -0
- package/gateway/ai/secrets_broker.py +232 -4
- package/gateway/ai/server.py +9 -1
- package/gateway/ai/session_phoenix.py +371 -0
- package/gateway/ai/supabase_sync.py +2 -7
- package/gateway/ai/swarm.py +106 -0
- package/gateway/ai/tool_metadata.py +34 -6
- package/gateway/ai/toolcard_cache.py +327 -0
- package/package.json +1 -1
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""PII and secret redaction — sanitize before sending to external LLMs (STR-055).
|
|
2
|
+
|
|
3
|
+
Auto-detect and redact secrets, API keys, and PII before prompts
|
|
4
|
+
leave the local environment. Replacement tokens allow reconstruction
|
|
5
|
+
if needed.
|
|
6
|
+
|
|
7
|
+
Focus group (Security): "Auto-detect and redact secrets, API keys, PII
|
|
8
|
+
before sending prompt to external LLM. This is a massive security win."
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
import json
|
|
13
|
+
import hashlib
|
|
14
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
15
|
+
|
|
16
|
+
# Patterns for sensitive data detection.
#
# Maps a category name to a list of ``(regex, human-readable label)`` pairs.
# Categories, and the patterns inside each category, are tried in the order
# written here.
PATTERNS = {
    "api_key": [
        # The character class includes "-" and "_" so that keys with embedded
        # separators (e.g. ``sk-proj-...``) are matched in full rather than
        # skipped.
        (r'\b(sk-[a-zA-Z0-9_-]{20,})\b', "OpenAI API key"),
        (r'\b(xai-[a-zA-Z0-9]{20,})\b', "xAI API key"),
        (r'\b(AIza[a-zA-Z0-9_-]{30,})\b', "Google API key"),
        (r'\b(ghp_[a-zA-Z0-9]{36,})\b', "GitHub PAT"),
        (r'\b(ghu_[a-zA-Z0-9]{36,})\b', "GitHub user token"),
        (r'\b(glpat-[a-zA-Z0-9_-]{20,})\b', "GitLab PAT"),
        (r'\b(npm_[a-zA-Z0-9]{36,})\b', "npm token"),
        # PyPI tokens are base64url text, so "-" and "_" can appear anywhere
        # in the body; without them only a prefix of the token would match.
        (r'\b(pypi-[a-zA-Z0-9_-]{50,})\b', "PyPI token"),
    ],
    "secret": [
        (r'(?i)(password|passwd|pwd)\s*[=:]\s*["\']([^"\']{4,})["\']', "password"),
        (r'(?i)(secret|token|api_key|apikey)\s*[=:]\s*["\']([^"\']{8,})["\']', "secret/token"),
        (r'(?i)bearer\s+([a-zA-Z0-9._-]{20,})', "bearer token"),
    ],
    "pii": [
        # TLD class is [A-Za-z]; a "|" inside a character class is a literal
        # pipe, not alternation, and must not be accepted as part of a TLD.
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', "email"),
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', "phone number"),
        (r'\b\d{3}-\d{2}-\d{4}\b', "SSN"),
        (r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', "credit card"),
    ],
    "infra": [
        (r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', "IP address"),
        (r'(?i)(mongodb|postgres|mysql|redis)://[^\s]+', "database URL"),
        (r'(?i)https?://[^\s]*:(password|secret|token)[^\s]*', "URL with credentials"),
    ],
}

# Allowlist — patterns that look like secrets but aren't.
# Each entry is a regex searched (case-insensitively) inside a candidate match.
ALLOWLIST = [
    r'0\.0\.0\.0',
    r'127\.0\.0\.1',
    r'localhost',
    r'example\.com',
    r'test@test\.com',
    r'placeholder',
    r'REDACTED',  # Never re-redact text that already contains a redaction token
    r'<your-',
    r'\$\{',  # Template variables
]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _is_allowlisted(match: str) -> bool:
    """Return True if any ALLOWLIST regex is found anywhere inside *match*.

    The search is case-insensitive and unanchored, so one allowlisted
    fragment exempts the whole candidate string.
    """
    # NOTE(review): because the search is a substring search, a credentialed
    # database URL that merely points at "localhost" is exempted as a whole —
    # confirm this is the intended trade-off.
    return any(
        re.search(allowed, match, re.IGNORECASE) is not None
        for allowed in ALLOWLIST
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _make_token(category: str, index: int) -> str:
|
|
68
|
+
return f"[REDACTED_{category.upper()}_{index}]"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def redact(
    text: str,
    categories: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Redact sensitive data from text.

    Every non-allowlisted match of an active pattern is replaced, at the
    exact span where it matched, by a numbered ``[REDACTED_<CATEGORY>_<n>]``
    token. Tokens are numbered from 1 across all categories.

    Args:
        text: Text to scan and redact.
        categories: Which categories to scan (api_key, secret, pii, infra).
                    None (or an empty list) = scan all. Names that are not
                    keys of PATTERNS are ignored.

    Returns:
        A dict with the same keys for every input, including empty text:
          redacted: the text with tokens substituted.
          findings: one dict per redaction (category, type, token, position,
                    length, preview). ``position`` is the offset in the text
                    as it stood when that pattern was applied, so it can be
                    shifted by tokens inserted for earlier patterns.
          token_count: number of redactions made.
          token_map: token -> original value. Keep local — never send
                     externally.
          categories_scanned: the category list that was requested/used.
    """
    active_categories = categories or list(PATTERNS.keys())
    findings: List[Dict[str, Any]] = []
    token_map: Dict[str, str] = {}

    if not text:
        # Same shape as the normal return so callers can index
        # "token_map" / "categories_scanned" unconditionally.
        return {
            "redacted": "",
            "findings": findings,
            "token_count": 0,
            "token_map": token_map,
            "categories_scanned": active_categories,
        }

    redacted = text
    token_index = 0

    for category in active_categories:
        if category not in PATTERNS:
            continue

        for pattern, label in PATTERNS[category]:
            # Rebuild the text from slices around each match. Replacing by
            # span (rather than str.replace on the matched text) guarantees
            # the token lands on the match itself and never on an earlier,
            # unrelated occurrence of the same characters.
            pieces: List[str] = []
            cursor = 0

            for match in re.finditer(pattern, redacted):
                matched_text = match.group(0)

                if _is_allowlisted(matched_text):
                    continue

                token_index += 1
                token = _make_token(category, token_index)

                # Only long values get a head/tail preview; short ones would
                # be revealed almost entirely.
                if len(matched_text) > 12:
                    preview = matched_text[:4] + "..." + matched_text[-4:]
                else:
                    preview = "***"

                findings.append({
                    "category": category,
                    "type": label,
                    "token": token,
                    "position": match.start(),
                    "length": len(matched_text),
                    "preview": preview,
                })
                token_map[token] = matched_text

                pieces.append(redacted[cursor:match.start()])
                pieces.append(token)
                cursor = match.end()

            if pieces:
                pieces.append(redacted[cursor:])
                redacted = "".join(pieces)

    return {
        "redacted": redacted,
        "findings": findings,
        "token_count": token_index,
        "token_map": token_map,  # Keep local — never send externally
        "categories_scanned": active_categories,
    }
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def scan(text: str) -> Dict[str, Any]:
    """Scan text for sensitive data WITHOUT redacting.

    Use this to preview what would be redacted. Internally this runs
    ``redact`` over all categories and reports only the findings; the
    redacted text and the token map are discarded.

    Returns:
        A dict with ``findings``, ``total`` (number of findings),
        ``categories`` (distinct categories hit, in first-seen order),
        ``safe`` (True when nothing was found) and a summary ``message``.
    """
    result = redact(text)
    findings = result["findings"]
    total = result["token_count"]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is stable from run to run (a set's order is not).
    categories = list(dict.fromkeys(f["category"] for f in findings))

    if total > 0:
        message = f"Found {total} sensitive item(s)"
    else:
        message = "No sensitive data detected"

    return {
        "findings": findings,
        "total": total,
        "categories": categories,
        "safe": total == 0,
        "message": message,
    }
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def restore(redacted_text: str, token_map: Dict[str, str]) -> str:
    """Restore redacted text using the token map.

    All tokens are substituted in a single pass over the text, so a restored
    value is inserted verbatim and is never itself scanned for tokens.

    Args:
        redacted_text: Text containing redaction tokens.
        token_map: Mapping of token -> original value.

    Returns:
        The text with every known token replaced by its original value.
        Empty text or an empty map returns the input unchanged.
    """
    # Empty-string keys are dropped: they would match between every character.
    # Longest tokens go first so that a token which is a prefix of another
    # cannot shadow it in the alternation.
    tokens = sorted((t for t in token_map if t), key=len, reverse=True)
    if not redacted_text or not tokens:
        return redacted_text

    token_rx = "|".join(re.escape(token) for token in tokens)
    # A callable replacement is used so backslashes in the original values
    # are inserted literally rather than treated as regex escapes.
    return re.sub(token_rx, lambda m: token_map[m.group(0)], redacted_text)
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""Prompt drift detection — same task behaves differently across models (STR-052).
|
|
2
|
+
|
|
3
|
+
Detects when the same prompt produces inconsistent results across
|
|
4
|
+
Claude, Codex, and Gemini. Flags divergence and suggests model-specific
|
|
5
|
+
adaptations.
|
|
6
|
+
|
|
7
|
+
Focus group (Indie): "Prompt drift — same task behaves differently
|
|
8
|
+
in Claude vs Codex vs Gemini."
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import time
|
|
13
|
+
import hashlib
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Dict, List, Optional
|
|
16
|
+
|
|
17
|
+
# Per-user storage for drift tracking; the history file holds one JSON
# record per line.
DRIFT_DIR = Path.home().joinpath(".delimit", "prompt_drift")
HISTORY_FILE = DRIFT_DIR.joinpath("history.jsonl")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _ensure_dir() -> None:
    """Create DRIFT_DIR (and any missing parents); a no-op if it already exists."""
    DRIFT_DIR.mkdir(exist_ok=True, parents=True)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _hash_prompt(prompt: str) -> str:
|
|
26
|
+
return hashlib.sha256(prompt.encode()).hexdigest()[:12]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def record_result(
    prompt: str,
    model: str,
    result_summary: str,
    success: bool = True,
    task_type: str = "",
    duration_ms: int = 0,
) -> Dict[str, Any]:
    """Record a prompt execution result for drift analysis.

    Appends one JSON line to HISTORY_FILE, creating the directory if needed.

    Args:
        prompt: The prompt that was run. Only its hash and first 100
            characters are stored.
        model: Model name; stored lower-cased and stripped.
        result_summary: Short description of the outcome; stored truncated
            to 200 characters. None is treated as an empty summary.
        success: Whether the run succeeded.
        task_type: Optional label used for filtering later.
        duration_ms: Run duration in milliseconds.

    Returns:
        ``{"error": ...}`` when prompt or model is empty, otherwise a dict
        with ``status``, ``prompt_hash``, ``model`` (as passed in) and
        ``message``.
    """
    if not prompt or not model:
        return {"error": "prompt and model are required"}

    _ensure_dir()
    prompt_hash = _hash_prompt(prompt)

    entry = {
        "prompt_hash": prompt_hash,
        "prompt_preview": prompt[:100],
        "model": model.lower().strip(),
        "result_summary": (result_summary or "")[:200],
        "success": success,
        "task_type": task_type,
        "duration_ms": duration_ms,
        # The trailing "Z" denotes UTC, so format from gmtime(); strftime's
        # default is local time, which would mislabel the timestamp.
        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }

    with open(HISTORY_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")

    return {
        "status": "recorded",
        "prompt_hash": prompt_hash,
        "model": model,
        "message": f"Result recorded for {model}",
    }
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def check_drift(
    prompt: str = "",
    task_type: str = "",
    threshold: float = 0.3,
) -> Dict[str, Any]:
    """Check for prompt drift — inconsistent results across models.

    For every recorded prompt that has results from at least two models,
    the per-model success rates are compared; a finding is reported when
    the gap between the best and worst rate exceeds *threshold*.

    Args:
        prompt: Specific prompt to check (by hash). Empty = check every
            recorded prompt.
        task_type: Filter by task type.
        threshold: Drift threshold (0-1). Higher = more tolerant.

    Returns:
        A dict that always carries ``status``, ``drift_detected`` and
        ``message``. With status "ok" it also carries ``findings``,
        ``total_prompts_analyzed`` and ``total_entries``.
    """
    if not HISTORY_FILE.exists():
        return {
            "status": "no_data",
            "drift_detected": False,
            "message": "No prompt history. Use record_result() to start tracking.",
        }

    try:
        # errors="replace" turns undecodable bytes into characters that make
        # the affected line fail JSON parsing, so it is skipped below.
        raw_text = HISTORY_FILE.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return {
            "status": "error",
            "drift_detected": False,
            "message": "Could not read history",
        }

    entries: List[Dict] = []
    for line in raw_text.strip().split("\n"):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # Blank or corrupt line — best-effort read
        # A line can be valid JSON without being a usable record (e.g. a bare
        # number, or an object missing its keys); skip those as well.
        if (
            isinstance(record, dict)
            and isinstance(record.get("prompt_hash"), str)
            and isinstance(record.get("model"), str)
        ):
            entries.append(record)

    # Filter
    if prompt:
        prompt_hash = _hash_prompt(prompt)
        entries = [e for e in entries if e["prompt_hash"] == prompt_hash]
    if task_type:
        entries = [e for e in entries if e.get("task_type") == task_type]

    if not entries:
        return {
            "status": "no_matches",
            "drift_detected": False,
            "message": "No matching prompt history found.",
        }

    # Group by prompt hash
    by_prompt: Dict[str, List[Dict]] = {}
    for e in entries:
        by_prompt.setdefault(e["prompt_hash"], []).append(e)

    drift_findings = []
    for ph, results in by_prompt.items():
        # Outcomes per model, in first-seen model order.
        model_success: Dict[str, List[bool]] = {}
        for r in results:
            model_success.setdefault(r["model"], []).append(bool(r.get("success")))

        if len(model_success) < 2:
            continue  # Need at least 2 models to compare

        success_rates = {
            m: sum(outcomes) / len(outcomes)
            for m, outcomes in model_success.items()
        }

        # Check for significant divergence
        spread = max(success_rates.values()) - min(success_rates.values())
        if spread > threshold:
            best = max(success_rates, key=success_rates.get)
            worst = min(success_rates, key=success_rates.get)
            drift_findings.append({
                "prompt_hash": ph,
                "prompt_preview": results[0].get("prompt_preview", ""),
                "models_compared": sorted(model_success),  # stable order
                "success_rates": success_rates,
                "best_model": best,
                "worst_model": worst,
                "divergence": round(spread, 2),
                "recommendation": f"Use {best} for this task. {worst} has {round(success_rates[worst]*100)}% success rate.",
            })

    return {
        "status": "ok",
        "drift_detected": len(drift_findings) > 0,
        "findings": drift_findings,
        "total_prompts_analyzed": len(by_prompt),
        "total_entries": len(entries),
        "message": f"{len(drift_findings)} drift(s) detected across {len(by_prompt)} prompt(s)" if drift_findings else "No significant drift detected",
    }
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_model_rankings(task_type: str = "") -> Dict[str, Any]:
    """Rank models by success rate and speed for a task type.

    Args:
        task_type: Only consider entries with this task type. Empty = all.

    Returns:
        A dict with ``status`` and ``rankings``. With status "ok" the
        rankings list holds one dict per model (``model``, ``success_rate``
        as a percentage, ``avg_duration_ms``, ``total_tasks``), sorted by
        highest success rate and then lowest average duration, alongside
        ``task_type`` and ``total_entries``.
    """
    if not HISTORY_FILE.exists():
        return {"status": "no_data", "rankings": []}

    try:
        # errors="replace" turns undecodable bytes into characters that make
        # the affected line fail JSON parsing, so it is skipped below.
        raw_text = HISTORY_FILE.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return {"status": "error", "rankings": []}

    entries: List[Dict] = []
    for line in raw_text.strip().split("\n"):
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # Blank or corrupt line — best-effort read
        # Valid JSON that is not a record with a model name cannot be ranked.
        if isinstance(record, dict) and isinstance(record.get("model"), str):
            entries.append(record)

    if task_type:
        entries = [e for e in entries if e.get("task_type") == task_type]

    if not entries:
        return {"status": "no_data", "rankings": [], "task_type": task_type}

    # Aggregate per model
    model_stats: Dict[str, Dict] = {}
    for e in entries:
        stats = model_stats.setdefault(
            e["model"], {"successes": 0, "total": 0, "durations": []}
        )
        stats["total"] += 1
        if e.get("success"):
            stats["successes"] += 1
        duration = e.get("duration_ms")
        # Zero/missing durations are left out of the average; non-numeric
        # values are ignored so they cannot break the sum.
        if isinstance(duration, (int, float)) and duration:
            stats["durations"].append(duration)

    rankings = []
    for model, stats in model_stats.items():
        durations = stats["durations"]
        avg_duration = sum(durations) / len(durations) if durations else 0
        # total is at least 1 for every model present in model_stats.
        success_rate = stats["successes"] / stats["total"]
        rankings.append({
            "model": model,
            "success_rate": round(success_rate * 100, 1),
            "avg_duration_ms": round(avg_duration),
            "total_tasks": stats["total"],
        })

    rankings.sort(key=lambda r: (-r["success_rate"], r["avg_duration_ms"]))

    return {
        "status": "ok",
        "rankings": rankings,
        "task_type": task_type or "all",
        "total_entries": len(entries),
    }
|