delimit-cli 3.15.11 → 3.15.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/gateway/ai/collision_detect.py +141 -0
- package/gateway/ai/multi_review.py +154 -0
- package/gateway/ai/pii_redact.py +149 -0
- package/gateway/ai/prompt_drift.py +207 -0
- package/package.json +1 -1
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Duplicate work detection — prevent two AI models from editing the same file (STR-051).
|
|
2
|
+
|
|
3
|
+
Tracks which model is working on which files. Alerts before collision.
|
|
4
|
+
Adjacent problem nobody else solves.
|
|
5
|
+
|
|
6
|
+
Storage: ~/.delimit/agents/file_locks.json
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
AGENTS_DIR = Path.home() / ".delimit" / "agents"
|
|
15
|
+
LOCKS_FILE = AGENTS_DIR / "file_locks.json"
|
|
16
|
+
|
|
17
|
+
# Lock expires after 30 minutes of inactivity
|
|
18
|
+
LOCK_TTL_SECONDS = 1800
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _ensure_dir():
    """Create the agents state directory, including parents, if it is missing."""
    AGENTS_DIR.mkdir(exist_ok=True, parents=True)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _load_locks() -> Dict[str, Any]:
    """Read the lock table from ``LOCKS_FILE``.

    Returns:
        The mapping of resolved file path -> lock record. An empty dict is
        returned when the file is missing, unreadable, not valid JSON, or
        holds valid JSON that is not an object (e.g. ``[]`` or ``null``),
        so callers can always treat the result as a dict.
    """
    # EAFP: reading directly avoids the exists()/read race of a pre-check;
    # a missing file surfaces as FileNotFoundError (an OSError).
    try:
        data = json.loads(LOCKS_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return data if isinstance(data, dict) else {}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _save_locks(locks: Dict[str, Any]):
    """Serialize *locks* as indented JSON and write it to ``LOCKS_FILE``."""
    _ensure_dir()
    payload = json.dumps(locks, indent=2)
    LOCKS_FILE.write_text(payload)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _cleanup_expired(locks: Dict[str, Any]) -> Dict[str, Any]:
|
|
40
|
+
now = time.time()
|
|
41
|
+
return {
|
|
42
|
+
path: lock for path, lock in locks.items()
|
|
43
|
+
if now - lock.get("ts", 0) < LOCK_TTL_SECONDS
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def claim_file(
    file_path: str,
    model: str,
    task_id: str = "",
) -> Dict[str, Any]:
    """Claim a file for editing on behalf of *model*.

    Args:
        file_path: Path of the file to claim; it is resolved to an absolute
            path before being used as the lock key.
        model: Name of the claiming model; compared case-insensitively with
            surrounding whitespace ignored.
        task_id: Optional task identifier stored with the lock.

    Returns:
        ``{"error": ...}`` when an argument is missing, a ``"collision"``
        report when a different model holds an unexpired lock on the file,
        otherwise a ``"claimed"`` confirmation. Re-claiming a file the same
        model already holds refreshes the lock.
    """
    # Normalize before validating so a whitespace-only model is rejected
    # instead of being stored as an empty holder name.
    model = model.lower().strip() if model else ""
    if not file_path or not model:
        return {"error": "file_path and model are required"}

    file_path = str(Path(file_path).resolve())
    locks = _cleanup_expired(_load_locks())

    existing = locks.get(file_path)
    # A record without a usable "model" cannot name a holder; treat it as
    # stale and let this claim overwrite it rather than raising KeyError.
    holder = existing.get("model") if isinstance(existing, dict) else None
    if holder and holder != model:
        return {
            "status": "collision",
            "file": file_path,
            "held_by": holder,
            "held_since": existing.get("claimed_at", "unknown"),
            "task_id": existing.get("task_id", ""),
            "your_model": model,
            "message": f"COLLISION: {holder} is already editing {Path(file_path).name}",
            "recommendation": "Coordinate with the other model or wait for them to finish.",
        }

    locks[file_path] = {
        "model": model,
        "task_id": task_id,
        "ts": time.time(),
        # The trailing "Z" denotes UTC, so format gmtime() rather than the
        # local time that strftime() uses by default.
        "claimed_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    _save_locks(locks)

    return {
        "status": "claimed",
        "file": file_path,
        "model": model,
        "message": f"{model} claimed {Path(file_path).name}",
    }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def release_file(file_path: str, model: str = "") -> Dict[str, Any]:
    """Release the lock on *file_path*.

    Args:
        file_path: Path of the file to release; resolved to an absolute path.
        model: When given, the lock is only released if this model holds it.
            The name is normalized the same way ``claim_file`` stores it
            (lower-cased, whitespace stripped). Empty means release
            regardless of holder.

    Returns:
        ``{"error": ...}`` when the path is empty or the lock belongs to a
        different model, ``"released"`` on success, or ``"ok"`` when the
        file was not locked.
    """
    if not file_path:
        # Path("").resolve() is the current directory, never a claimed file.
        return {"error": "file_path is required"}

    file_path = str(Path(file_path).resolve())
    requester = model.lower().strip() if model else ""
    locks = _load_locks()

    if file_path in locks:
        record = locks[file_path]
        holder = record.get("model") if isinstance(record, dict) else None
        if requester and holder != requester:
            return {"error": f"File held by {holder}, not {model}"}
        del locks[file_path]
        _save_locks(locks)
        return {"status": "released", "file": file_path}

    return {"status": "ok", "message": "File was not locked"}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def check_collisions(model: str = "") -> Dict[str, Any]:
    """Report active file locks and directories shared by several models.

    Expired locks are dropped and the pruned table is written back before
    the report is built.

    Args:
        model: Currently unused; the report always covers every model.
            NOTE(review): presumably intended as a filter — confirm with
            callers before changing the result shape.

    Returns:
        A dict with the lock count, one entry per lock, a per-model lock
        count, and ``hotspots``: directories in which more than one model
        holds a lock.
    """
    locks = _cleanup_expired(_load_locks())
    _save_locks(locks)

    active = []
    locks_per_model: Dict[str, int] = {}
    dir_models: Dict[str, set] = {}
    for path, lock in locks.items():
        # Tolerate records written without a "model" field.
        holder = lock.get("model", "unknown")
        active.append({
            "file": Path(path).name,
            "full_path": path,
            "model": holder,
            "claimed_at": lock.get("claimed_at", ""),
            "task_id": lock.get("task_id", ""),
        })
        locks_per_model[holder] = locks_per_model.get(holder, 0) + 1
        dir_models.setdefault(str(Path(path).parent), set()).add(holder)

    # Sets have no stable order; sort so the report is deterministic.
    hotspots = [
        {"directory": directory, "models": sorted(holders), "risk": "high"}
        for directory, holders in dir_models.items()
        if len(holders) > 1
    ]

    return {
        "status": "ok",
        "active_locks": len(active),
        "locks": active,
        "by_model": locks_per_model,
        "hotspots": hotspots,
        "message": f"{len(active)} active lock(s), {len(hotspots)} hotspot(s)" if active else "No active locks",
    }
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""Multi-model PR review — consolidated code review from multiple AI models (STR-053).
|
|
2
|
+
|
|
3
|
+
Takes a diff or file changes, sends them to multiple models for review,
|
|
4
|
+
and consolidates the feedback into a single structured report.
|
|
5
|
+
|
|
6
|
+
Focus group: "GitHub Action runs delimit review, posts consolidated PR
|
|
7
|
+
review combining feedback from multiple models. 10x over standard
|
|
8
|
+
Copilot review."
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
|
+
|
|
16
|
+
REVIEWS_DIR = Path.home() / ".delimit" / "reviews"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _ensure_dir():
    """Create the reviews directory, including parents, if it is missing."""
    REVIEWS_DIR.mkdir(exist_ok=True, parents=True)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def generate_review_prompt(diff: str, context: str = "") -> str:
    """Build the code-review prompt for *diff*.

    Only the first 8000 characters of the diff are embedded. When *context*
    is non-empty it is included on its own ``Context:`` line; otherwise that
    line is left blank.
    """
    context_line = f"Context: {context}" if context else ""
    truncated_diff = diff[:8000]
    sections = [
        "Review this code change. For each issue found, provide:",
        "- Line number or location",
        "- Severity (critical/warning/suggestion)",
        "- What's wrong and why",
        "- How to fix it",
        "",
        "Be concise. Only flag real issues, not style preferences.",
        "",
        context_line,
        "",
        "```diff",
        truncated_diff,
        "```",
    ]
    return "\n".join(sections)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def consolidate_reviews(reviews: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Collect per-model reviews into one report.

    Each input review contributes one entry to ``reviews`` (its text as
    written by the model) and one entry to ``model_summaries`` (response
    length and duration). No cross-model grouping or ranking is performed.

    Args:
        reviews: Dicts with optional ``model``, ``content`` and
            ``duration_ms`` keys. A missing model is reported as
            ``"unknown"``, missing content as ``""`` and a missing
            duration as ``0``.

    Returns:
        A report dict with ``models_used``, ``total_models``, ``reviews``,
        ``model_summaries`` and a UTC ``generated_at`` timestamp.
    """
    all_findings = []
    model_summaries = []
    models_used = []

    for review in reviews:
        # Use the same fallback everywhere so models_used never contains
        # None, which would break the later ', '.join in the PR comment.
        model = review.get("model") or "unknown"
        content = review.get("content") or ""
        duration = review.get("duration_ms", 0)

        models_used.append(model)
        model_summaries.append({
            "model": model,
            "response_length": len(content),
            "duration_ms": duration,
        })
        all_findings.append({
            "model": model,
            "review": content,
            "duration_ms": duration,
        })

    return {
        "models_used": models_used,
        "total_models": len(reviews),
        "reviews": all_findings,
        "model_summaries": model_summaries,
        # "Z" means UTC, so format gmtime() rather than local time.
        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def format_pr_comment(report: Dict[str, Any]) -> str:
    """Render a consolidated review report as Markdown for a PR comment.

    Args:
        report: Dict with ``models_used`` (list of model names) and
            ``reviews`` (list of dicts with ``model``, ``review`` and
            ``duration_ms``). Missing keys are treated as empty.

    Returns:
        The Markdown text: a header, the reviewer list, one section per
        review, and a footer.
    """
    # Model names may be None or non-string if the report was built from
    # incomplete review dicts; coerce so str.join cannot raise TypeError.
    models = [str(m) if m else "unknown" for m in (report.get("models_used") or [])]
    reviews = report.get("reviews") or []

    lines = [
        "## Delimit Multi-Model Review",
        "",
        f"Reviewed by: **{', '.join(models)}**",
        "",
    ]

    for review in reviews:
        model = review.get("model") or "unknown"
        content = review.get("review") or ""
        duration = review.get("duration_ms", 0)

        lines.append(f"### {model}")
        if duration:
            lines.append(f"*({duration}ms)*")
        lines.extend(["", str(content), ""])

    lines.append("---")
    lines.append("Powered by [Delimit](https://delimit.ai) multi-model review")

    return "\n".join(lines)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def save_review(
    diff: str,
    report: Dict[str, Any],
    pr_url: str = "",
) -> Dict[str, Any]:
    """Save a review report to disk as JSON.

    Args:
        diff: The reviewed diff; only its first 500 characters are stored.
        report: The consolidated report to persist.
        pr_url: Optional URL of the pull request the review belongs to.

    Returns:
        A dict with the review id, the path written, and the rendered PR
        comment for *report*.
    """
    _ensure_dir()

    # Ids have one-second resolution; add a numeric suffix when a review
    # with the same id already exists so it is not silently overwritten.
    base_id = f"review-{int(time.time())}"
    review_id = base_id
    review_file = REVIEWS_DIR / f"{review_id}.json"
    suffix = 1
    while review_file.exists():
        review_id = f"{base_id}-{suffix}"
        review_file = REVIEWS_DIR / f"{review_id}.json"
        suffix += 1

    data = {
        "id": review_id,
        "diff_preview": (diff or "")[:500],
        "report": report,
        "pr_url": pr_url,
        # "Z" means UTC, so format gmtime() rather than local time.
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }

    review_file.write_text(json.dumps(data, indent=2))

    return {
        "status": "saved",
        "review_id": review_id,
        "path": str(review_file),
        "pr_comment": format_pr_comment(report),
    }
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def list_reviews(limit: int = 10) -> Dict[str, Any]:
    """List the most recent saved reviews.

    Review files are visited in reverse filename order. Files that cannot
    be read or that lack the expected structure are skipped and do not
    count against *limit*.

    Args:
        limit: Maximum number of reviews to return.

    Returns:
        ``{"status": "ok", "reviews": [...], "total": n}`` where each entry
        carries the review id, models used, creation time and PR URL.
    """
    _ensure_dir()
    reviews = []

    for review_path in sorted(REVIEWS_DIR.glob("review-*.json"), reverse=True):
        if len(reviews) >= limit:
            break
        try:
            data = json.loads(review_path.read_text())
            summary = {
                "id": data["id"],
                "models": data["report"].get("models_used", []),
                "created_at": data.get("created_at", ""),
                "pr_url": data.get("pr_url", ""),
            }
        except (OSError, ValueError, KeyError, TypeError, AttributeError):
            # Unreadable, non-JSON, or wrongly shaped file: skip it, but
            # let unrelated errors (e.g. KeyboardInterrupt) propagate.
            continue
        reviews.append(summary)

    return {"status": "ok", "reviews": reviews, "total": len(reviews)}
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""PII and secret redaction — sanitize before sending to external LLMs (STR-055).
|
|
2
|
+
|
|
3
|
+
Auto-detect and redact secrets, API keys, and PII before prompts
|
|
4
|
+
leave the local environment. Replacement tokens allow reconstruction
|
|
5
|
+
if needed.
|
|
6
|
+
|
|
7
|
+
Focus group (Security): "Auto-detect and redact secrets, API keys, PII
|
|
8
|
+
before sending prompt to external LLM. This is a massive security win."
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
import json
|
|
13
|
+
import hashlib
|
|
14
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
15
|
+
|
|
16
|
+
# Patterns for sensitive data detection
|
|
17
|
+
# Maps a category name to a list of (regex, human-readable label) pairs.
PATTERNS = {
    "api_key": [
        (r'\b(sk-[a-zA-Z0-9]{20,})\b', "OpenAI API key"),
        (r'\b(xai-[a-zA-Z0-9]{20,})\b', "xAI API key"),
        (r'\b(AIza[a-zA-Z0-9_-]{30,})\b', "Google API key"),
        (r'\b(ghp_[a-zA-Z0-9]{36,})\b', "GitHub PAT"),
        (r'\b(ghu_[a-zA-Z0-9]{36,})\b', "GitHub user token"),
        (r'\b(glpat-[a-zA-Z0-9_-]{20,})\b', "GitLab PAT"),
        (r'\b(npm_[a-zA-Z0-9]{36,})\b', "npm token"),
        (r'\b(pypi-[a-zA-Z0-9]{50,})\b', "PyPI token"),
    ],
    "secret": [
        (r'(?i)(password|passwd|pwd)\s*[=:]\s*["\']([^"\']{4,})["\']', "password"),
        (r'(?i)(secret|token|api_key|apikey)\s*[=:]\s*["\']([^"\']{8,})["\']', "secret/token"),
        (r'(?i)bearer\s+([a-zA-Z0-9._-]{20,})', "bearer token"),
    ],
    "pii": [
        # The TLD class is letters only; the previous "[A-Z|a-z]" also
        # accepted a literal "|" because "|" is not alternation inside [].
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', "email"),
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', "phone number"),
        (r'\b\d{3}-\d{2}-\d{4}\b', "SSN"),
        (r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', "credit card"),
    ],
    "infra": [
        (r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', "IP address"),
        (r'(?i)(mongodb|postgres|mysql|redis)://[^\s]+', "database URL"),
        (r'(?i)https?://[^\s]*:(password|secret|token)[^\s]*', "URL with credentials"),
    ],
}
|
|
45
|
+
|
|
46
|
+
# Allowlist — patterns that look like secrets but aren't
|
|
47
|
+
ALLOWLIST = [
    # Non-routable / loopback addresses and hosts
    r'0\.0\.0\.0',
    r'127\.0\.0\.1',
    r'localhost',
    # Documentation and test fixtures
    r'example\.com',
    r'test@test\.com',
    # Obvious stand-ins rather than real values
    r'placeholder',
    r'REDACTED',
    r'<your-',
    # Template variables such as ${NAME}
    r'\$\{',
]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _is_allowlisted(match: str) -> bool:
    """Return True when any ALLOWLIST pattern occurs in *match*.

    The search is case-insensitive and unanchored, so a pattern found
    anywhere inside the matched text allowlists the whole match.
    """
    return any(
        re.search(allowed, match, re.IGNORECASE) for allowed in ALLOWLIST
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _make_token(category: str, index: int) -> str:
|
|
68
|
+
return f"[REDACTED_{category.upper()}_{index}]"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def redact(
    text: str,
    categories: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Redact sensitive data from text.

    Every non-allowlisted match of a pattern in the selected categories is
    replaced, at the position where it matched, by a numbered placeholder
    token. The token-to-original mapping is returned so the text can be
    restored locally; it must never be sent externally.

    Args:
        text: Text to scan and redact.
        categories: Which categories to scan (api_key, secret, pii, infra).
            None = scan all. Unknown category names are ignored.

    Returns:
        A dict with ``redacted`` (the sanitized text), ``findings`` (one
        record per replacement; ``position`` is the offset in the text as
        it stood when that pattern was applied), ``token_count``,
        ``token_map`` and ``categories_scanned``. The same keys are present
        for empty input, where nothing is scanned.
    """
    if not text:
        return {
            "redacted": "",
            "findings": [],
            "token_count": 0,
            "token_map": {},
            "categories_scanned": [],
        }

    active_categories = categories or list(PATTERNS.keys())
    findings = []
    token_map = {}
    redacted = text
    token_index = 0

    for category in active_categories:
        if category not in PATTERNS:
            continue

        for pattern, label in PATTERNS[category]:
            # Rebuild the text from the matched spans. Replacing by value
            # (str.replace) would hit the first textual occurrence, which
            # can be a different, e.g. allowlisted, location.
            pieces = []
            cursor = 0
            for match in re.finditer(pattern, redacted):
                matched_text = match.group(0)

                if _is_allowlisted(matched_text):
                    continue

                token_index += 1
                token = _make_token(category, token_index)

                findings.append({
                    "category": category,
                    "type": label,
                    "token": token,
                    "position": match.start(),
                    "length": len(matched_text),
                    "preview": matched_text[:4] + "..." + matched_text[-4:] if len(matched_text) > 12 else "***",
                })

                token_map[token] = matched_text
                pieces.append(redacted[cursor:match.start()])
                pieces.append(token)
                cursor = match.end()

            if pieces:
                pieces.append(redacted[cursor:])
                redacted = "".join(pieces)

    return {
        "redacted": redacted,
        "findings": findings,
        "token_count": token_index,
        "token_map": token_map,  # Keep local — never send externally
        "categories_scanned": active_categories,
    }
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def scan(text: str) -> Dict[str, Any]:
    """Report what ``redact`` would find in *text*, without the redacted text.

    Runs ``redact`` and returns only its findings, their count, the set of
    categories hit, a ``safe`` flag (no findings) and a summary message.
    """
    outcome = redact(text)
    found = outcome["findings"]
    count = outcome["token_count"]

    if count > 0:
        message = f"Found {count} sensitive item(s)"
    else:
        message = "No sensitive data detected"

    return {
        "findings": found,
        "total": count,
        "categories": list(set(item["category"] for item in found)),
        "safe": count == 0,
        "message": message,
    }
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def restore(redacted_text: str, token_map: Dict[str, str]) -> str:
    """Restore redacted text using the token map.

    All tokens are substituted in a single pass over *redacted_text*, so a
    restored value that itself contains token-like text is not expanded a
    second time.

    Args:
        redacted_text: Text containing placeholder tokens.
        token_map: Mapping of token -> original value, as returned by
            ``redact``. Empty-string keys are ignored.

    Returns:
        The text with every known token replaced by its original value.
    """
    tokens = [token for token in token_map if token]
    if not tokens:
        return redacted_text

    # Longest first so a token that is a prefix of another cannot shadow it.
    tokens.sort(key=len, reverse=True)
    matcher = re.compile("|".join(re.escape(token) for token in tokens))
    return matcher.sub(lambda found: token_map[found.group(0)], redacted_text)
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""Prompt drift detection — same task behaves differently across models (STR-052).
|
|
2
|
+
|
|
3
|
+
Detects when the same prompt produces inconsistent results across
|
|
4
|
+
Claude, Codex, and Gemini. Flags divergence and suggests model-specific
|
|
5
|
+
adaptations.
|
|
6
|
+
|
|
7
|
+
Focus group (Indie): "Prompt drift — same task behaves differently
|
|
8
|
+
in Claude vs Codex vs Gemini."
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import time
|
|
13
|
+
import hashlib
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Dict, List, Optional
|
|
16
|
+
|
|
17
|
+
DRIFT_DIR = Path.home() / ".delimit" / "prompt_drift"
|
|
18
|
+
HISTORY_FILE = DRIFT_DIR / "history.jsonl"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _ensure_dir():
    """Create the prompt-drift directory, including parents, if it is missing."""
    DRIFT_DIR.mkdir(exist_ok=True, parents=True)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _hash_prompt(prompt: str) -> str:
|
|
26
|
+
return hashlib.sha256(prompt.encode()).hexdigest()[:12]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def record_result(
    prompt: str,
    model: str,
    result_summary: str,
    success: bool = True,
    task_type: str = "",
    duration_ms: int = 0,
) -> Dict[str, Any]:
    """Record a prompt execution result for drift analysis.

    Appends one JSON line to the history file.

    Args:
        prompt: The prompt that was run; stored as a short hash plus a
            100-character preview.
        model: Name of the model; stored lower-cased and stripped.
        result_summary: Summary of the outcome; truncated to 200 characters.
        success: Whether the run succeeded.
        task_type: Optional task category used for filtering.
        duration_ms: How long the run took, in milliseconds.

    Returns:
        ``{"error": ...}`` when prompt or model is missing, otherwise a
        ``"recorded"`` confirmation carrying the prompt hash.
    """
    # Normalize first so a whitespace-only model is rejected rather than
    # being recorded under an empty model name.
    normalized_model = model.lower().strip() if model else ""
    if not prompt or not normalized_model:
        return {"error": "prompt and model are required"}

    _ensure_dir()
    prompt_hash = _hash_prompt(prompt)

    entry = {
        "prompt_hash": prompt_hash,
        "prompt_preview": prompt[:100],
        "model": normalized_model,
        "result_summary": (result_summary or "")[:200],
        "success": success,
        "task_type": task_type,
        "duration_ms": duration_ms,
        # "Z" means UTC, so format gmtime() rather than local time.
        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }

    with open(HISTORY_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")

    return {
        "status": "recorded",
        "prompt_hash": prompt_hash,
        "model": model,
        "message": f"Result recorded for {model}",
    }
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def check_drift(
    prompt: str = "",
    task_type: str = "",
    threshold: float = 0.3,
) -> Dict[str, Any]:
    """Check for prompt drift — inconsistent results across models.

    For each recorded prompt run by at least two models, the per-model
    success rates are compared; a spread above *threshold* is reported.

    Args:
        prompt: Specific prompt to check (by hash). Empty = check all.
        task_type: Filter by task type.
        threshold: Drift threshold (0-1). Higher = more tolerant.

    Returns:
        A dict that always carries ``status``, ``drift_detected`` and
        ``message``; with status ``"ok"`` it also has ``findings`` and the
        counts of prompts and entries analyzed.
    """
    if not HISTORY_FILE.exists():
        return {
            "status": "no_data",
            "drift_detected": False,
            "message": "No prompt history. Use record_result() to start tracking.",
        }

    try:
        raw_lines = HISTORY_FILE.read_text(encoding="utf-8").splitlines()
    except (OSError, UnicodeDecodeError):
        return {
            "status": "error",
            "drift_detected": False,
            "message": "Could not read history",
        }

    # Keep only well-formed records so one bad or hand-edited line cannot
    # raise KeyError/AttributeError further down.
    entries: List[Dict] = []
    for line in raw_lines:
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if (
            isinstance(record, dict)
            and isinstance(record.get("prompt_hash"), str)
            and isinstance(record.get("model"), str)
        ):
            entries.append(record)

    # Filter
    if prompt:
        prompt_hash = _hash_prompt(prompt)
        entries = [e for e in entries if e["prompt_hash"] == prompt_hash]
    if task_type:
        entries = [e for e in entries if e.get("task_type") == task_type]

    if not entries:
        return {
            "status": "no_matches",
            "drift_detected": False,
            "message": "No matching prompt history found.",
        }

    # Group by prompt hash
    by_prompt: Dict[str, List[Dict]] = {}
    for e in entries:
        by_prompt.setdefault(e["prompt_hash"], []).append(e)

    drift_findings = []
    for ph, results in by_prompt.items():
        # Collect success flags per model, preserving first-seen order.
        model_success: Dict[str, List[bool]] = {}
        for r in results:
            model_success.setdefault(r["model"], []).append(bool(r.get("success")))

        if len(model_success) < 2:
            continue  # Need at least 2 models to compare

        success_rates = {
            m: sum(flags) / len(flags) for m, flags in model_success.items()
        }

        rates = list(success_rates.values())
        divergence = max(rates) - min(rates)
        if divergence > threshold:
            best = max(success_rates, key=success_rates.get)
            worst = min(success_rates, key=success_rates.get)
            drift_findings.append({
                "prompt_hash": ph,
                "prompt_preview": results[0].get("prompt_preview", ""),
                # Sorted so the report does not depend on set ordering.
                "models_compared": sorted(model_success),
                "success_rates": success_rates,
                "best_model": best,
                "worst_model": worst,
                "divergence": round(divergence, 2),
                "recommendation": f"Use {best} for this task. {worst} has {round(success_rates[worst]*100)}% success rate.",
            })

    return {
        "status": "ok",
        "drift_detected": len(drift_findings) > 0,
        "findings": drift_findings,
        "total_prompts_analyzed": len(by_prompt),
        "total_entries": len(entries),
        "message": f"{len(drift_findings)} drift(s) detected across {len(by_prompt)} prompt(s)" if drift_findings else "No significant drift detected",
    }
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_model_rankings(task_type: str = "") -> Dict[str, Any]:
    """Rank models by success rate and speed for a task type.

    Args:
        task_type: Only consider history entries of this task type.
            Empty = all entries.

    Returns:
        A dict with ``status`` and ``rankings``. Each ranking has the
        model, its success rate in percent, its average duration in
        milliseconds (over entries with a non-zero numeric duration) and
        the number of tasks. Rankings are ordered by success rate,
        highest first, then by average duration, fastest first.
    """
    if not HISTORY_FILE.exists():
        return {"status": "no_data", "rankings": []}

    try:
        raw_lines = HISTORY_FILE.read_text(encoding="utf-8").splitlines()
    except (OSError, UnicodeDecodeError):
        return {"status": "error", "rankings": []}

    # Keep only well-formed records; a line that is valid JSON but not an
    # object with a string "model" would otherwise raise below.
    entries: List[Dict] = []
    for line in raw_lines:
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if isinstance(record, dict) and isinstance(record.get("model"), str):
            entries.append(record)

    if task_type:
        entries = [e for e in entries if e.get("task_type") == task_type]

    if not entries:
        return {"status": "no_data", "rankings": [], "task_type": task_type}

    # Aggregate per model
    model_stats: Dict[str, Dict] = {}
    for e in entries:
        stats = model_stats.setdefault(
            e["model"], {"successes": 0, "total": 0, "durations": []}
        )
        stats["total"] += 1
        if e.get("success"):
            stats["successes"] += 1
        duration = e.get("duration_ms")
        # bool is an int subclass; exclude it so True is not read as 1 ms.
        if duration and isinstance(duration, (int, float)) and not isinstance(duration, bool):
            stats["durations"].append(duration)

    rankings = []
    for model, stats in model_stats.items():
        durations = stats["durations"]
        avg_duration = sum(durations) / len(durations) if durations else 0
        success_rate = stats["successes"] / stats["total"]
        rankings.append({
            "model": model,
            "success_rate": round(success_rate * 100, 1),
            "avg_duration_ms": round(avg_duration),
            "total_tasks": stats["total"],
        })

    rankings.sort(key=lambda r: (-r["success_rate"], r["avg_duration_ms"]))

    return {
        "status": "ok",
        "rankings": rankings,
        "task_type": task_type or "all",
        "total_entries": len(entries),
    }
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "delimit-cli",
|
|
3
3
|
"mcpName": "io.github.delimit-ai/delimit-mcp-server",
|
|
4
|
-
"version": "3.15.11",
|
|
4
|
+
"version": "3.15.12",
|
|
5
5
|
"description": "Unify Claude Code, Codex, Cursor, and Gemini CLI with persistent context, governance, and multi-model debate.",
|
|
6
6
|
"main": "index.js",
|
|
7
7
|
"files": [
|