delimit-cli 3.15.13 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,600 +0,0 @@
1
- """
2
- Delimit Cross-Model Audit — Trust through triangulation.
3
-
4
- Run the same code review through 3 different AI models, each with a different
5
- review lens (security, correctness, governance). A synthesis step merges their
6
- findings: agreements become high-confidence, disagreements surface tradeoffs.
7
-
8
- This is different from `delimit_deliberate` (which debates a question).
9
- Cross-Model Audit reviews actual code/specs for specific issues.
10
-
11
- Models are configured via ~/.delimit/models.json or ~/.delimit/secrets/hosted-models.json.
12
- Uses the same infrastructure as deliberation.py.
13
- """
14
-
15
- import json
16
- import logging
17
- import os
18
- import re
19
- import threading
20
- import time
21
- from datetime import datetime, timezone
22
- from pathlib import Path
23
- from typing import Any, Dict, List, Optional, Tuple
24
-
25
# Module-level logger used by every helper in this file.
logger = logging.getLogger("delimit.cross_model_audit")

# Directory where finished audit reports are written (see _save_audit).
AUDIT_DIR = Path.home().joinpath(".delimit", "audits")

# ═══════════════════════════════════════════════════════════════════════
# Audit Lenses — each model gets a different review focus
# ═══════════════════════════════════════════════════════════════════════

# Maps a lens name to the instruction text injected into that model's prompt.
AUDIT_LENSES = {
    "security": (
        "Review for security vulnerabilities: injection, auth bypass, "
        "data exposure, privilege escalation, secret leaks. "
        "Focus on exploitable issues."
    ),
    "correctness": (
        "Review for logical errors, edge cases, off-by-one, "
        "race conditions, null handling, error propagation. "
        "Focus on bugs that cause wrong behavior."
    ),
    "governance": (
        "Review for breaking changes, API contract violations, "
        "backward compatibility, schema drift, missing validation. "
        "Focus on issues that affect consumers."
    ),
}

# Severity levels for structured findings; a lower rank sorts first.
SEVERITY_ORDER = {
    level: rank
    for rank, level in enumerate(("critical", "high", "medium", "low", "info"))
}

# Per-audit wait budget for model calls, in seconds.
MODEL_TIMEOUT = 60
52
-
53
-
54
def _build_lens_prompt(lens_name: str, lens_description: str, target_code: str, target_type: str) -> str:
    """Compose the audit prompt for the model assigned to ``lens_name``.

    The prompt states the lens, embeds ``lens_description``, asks for a bare
    JSON array of findings (severity / location / finding / recommendation)
    and wraps ``target_code`` between BEGIN/END markers. ``target_type`` only
    selects the human-readable noun used to describe the code.
    """
    if target_type == "file":
        type_label = "source file"
    elif target_type == "diff":
        type_label = "git diff"
    elif target_type == "snippet":
        type_label = "code snippet"
    else:
        # Any other value falls back to a generic label.
        type_label = "code"

    prompt_lines = [
        f"You are a code auditor focused on **{lens_name}**.",
        "",
        f"{lens_description}",
        "",
        f"Analyze the following {type_label} and return your findings as a JSON array.",
        "Each finding must be a JSON object with these fields:",
        '- "severity": one of "critical", "high", "medium", "low", "info"',
        '- "location": line number, function name, or description of where the issue is '
        '(e.g. "Line 42", "function validate_token", "JWT handling block")',
        '- "finding": clear description of the issue',
        '- "recommendation": what to do about it',
        "",
        "Return ONLY a JSON array. No markdown fences, no explanatory text before or after.",
        "If you find no issues, return an empty array: []",
        "",
        "--- BEGIN CODE ---",
        f"{target_code}",
        "--- END CODE ---",
    ]
    return "\n".join(prompt_lines)
79
-
80
-
81
def _resolve_target(target: str, target_type: str, max_chars: int = 50000) -> Tuple[str, Optional[str]]:
    """Resolve the audit target to the code text that will be reviewed.

    Args:
        target: A file path when ``target_type`` is "file"; otherwise the
            diff or snippet text itself.
        target_type: One of "file", "diff" or "snippet".
        max_chars: Maximum number of characters kept from a file. Longer
            files are cut and a truncation marker is appended. Expected to be
            a non-negative integer.

    Returns:
        ``(code_content, error_message)``. Exactly one of the two is
        meaningful: on success the error is ``None``; on failure the content
        is ``""`` and the error describes what went wrong.
    """
    if target_type == "file":
        path = Path(target).expanduser()
        if not path.exists():
            return "", f"File not found: {target}"
        if not path.is_file():
            return "", f"Not a file: {target}"
        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's locale encoding; undecodable bytes are replaced
            # rather than raising. Read one character past the cap so an
            # oversized file is detected without loading all of it.
            with path.open(encoding="utf-8", errors="replace") as handle:
                content = handle.read(max_chars + 1)
        except Exception as e:  # best-effort: report instead of raising
            return "", f"Failed to read file: {e}"
        if len(content) > max_chars:
            content = content[:max_chars] + f"\n\n[... truncated at {max_chars:,} characters ...]"
        return content, None

    if target_type in ("diff", "snippet"):
        if not target.strip():
            return "", "Empty target provided."
        return target, None

    return "", f"Unknown target_type: {target_type}. Use 'file', 'diff', or 'snippet'."
105
-
106
-
107
def _parse_model_findings(raw_response: str, model_name: str) -> List[Dict[str, str]]:
    """Turn a model's raw text reply into a list of normalized findings.

    Parsing is attempted on the whole reply (after removing markdown fence
    lines when the reply opens with one), then on the widest ``[...]`` span
    found inside it. When neither yields a JSON list, the reply is wrapped in
    a single "info" finding so nothing is silently dropped.
    """
    text = raw_response.strip()

    # Strip markdown code fences: drop the opening fence line and every
    # remaining line that itself starts with a fence.
    if text.startswith("```"):
        kept_lines = []
        for line in text.split("\n")[1:]:
            if line.strip().startswith("```"):
                continue
            kept_lines.append(line)
        text = "\n".join(kept_lines).strip()

    def _candidates():
        # First the full text, then the outermost bracketed span (if any).
        yield text
        bracketed = re.search(r'\[.*\]', text, re.DOTALL)
        if bracketed:
            yield bracketed.group()

    for candidate in _candidates():
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            return _validate_findings(parsed, model_name)

    # Could not parse — return the raw response as a single finding
    logger.warning("Could not parse structured findings from %s, wrapping raw response", model_name)
    fallback = {
        "severity": "info",
        "location": "general",
        "finding": f"[Unstructured response from {model_name}]: {raw_response[:500]}",
        "recommendation": "Review raw model output manually.",
    }
    return [fallback]
148
-
149
-
150
def _validate_findings(findings: List, model_name: str) -> List[Dict[str, str]]:
    """Validate and normalize finding objects returned by a model.

    Non-dict entries are skipped. Every kept finding has exactly the keys
    "severity", "location", "finding" and "recommendation", all strings.

    A missing key and an explicit JSON ``null`` are treated the same way and
    replaced by the field's default; without this, ``null`` would surface as
    the literal text "None". Severity is lower-cased and stripped, and an
    empty severity falls back to "info".

    Args:
        findings: Parsed JSON list from the model.
        model_name: Accepted for call-site symmetry; not used here.

    Returns:
        The list of normalized finding dicts, in input order.
    """
    defaults = (
        ("severity", "info"),
        ("location", "unknown"),
        ("finding", ""),
        ("recommendation", ""),
    )

    validated: List[Dict[str, str]] = []
    for item in findings:
        if not isinstance(item, dict):
            continue
        normalized: Dict[str, str] = {}
        for field, default in defaults:
            value = item.get(field)
            normalized[field] = default if value is None else str(value)
        # Severity is compared against fixed lowercase labels downstream, so
        # stray whitespace or case must not create a distinct value.
        normalized["severity"] = normalized["severity"].strip().lower() or "info"
        validated.append(normalized)
    return validated
163
-
164
-
165
def _looks_like_model_error(raw: str) -> bool:
    """Return True when ``raw`` is a bracketed error sentinel, not findings.

    A sentinel is text wrapped in ``[...]`` that mentions "unavailable" or
    "error:" and is NOT valid JSON. A well-formed JSON findings array is also
    wrapped in brackets and may legitimately contain those words, so it must
    not be mistaken for a failure.
    """
    text = raw.strip()
    if not (text.startswith("[") and text.endswith("]")):
        return False
    if "unavailable" not in text and "error:" not in text:
        return False
    try:
        json.loads(text)
    except ValueError:
        return True
    return False


def _call_model_with_lens(
    model_id: str,
    model_config: Dict,
    lens_name: str,
    target_code: str,
    target_type: str,
) -> Dict[str, Any]:
    """Call a single model with its lens prompt and return a result dict.

    The result always carries "model_id", "model_name", "lens", "status",
    "elapsed_seconds" and "findings". Failures use ``status == "error"`` and
    add "error"; successes use ``status == "ok"`` and add "raw_response".

    Raises:
        KeyError: if ``lens_name`` is not a key of ``AUDIT_LENSES``.
    """
    from ai.deliberation import _call_model

    model_name = model_config.get("name", model_id)

    def _error_result(message: str, elapsed: float) -> Dict[str, Any]:
        return {
            "model_id": model_id,
            "model_name": model_name,
            "lens": lens_name,
            "status": "error",
            "error": message,
            "elapsed_seconds": elapsed,
            "findings": [],
        }

    lens_description = AUDIT_LENSES[lens_name]
    prompt = _build_lens_prompt(lens_name, lens_description, target_code, target_type)
    system_prompt = (
        f"You are a senior code auditor performing a {lens_name} review. "
        "Return findings as a JSON array. Be thorough but precise."
    )

    start = time.time()
    try:
        raw = _call_model(model_id, model_config, prompt, system_prompt)
        elapsed = round(time.time() - start, 1)
    except Exception as e:
        elapsed = round(time.time() - start, 1)
        return _error_result(str(e), elapsed)

    # Guard against a backend handing back None or a non-string payload.
    if not isinstance(raw, str):
        raw = "" if raw is None else str(raw)
    if not raw.strip():
        return _error_result("Empty response from model", elapsed)

    # Model-level errors: presumably the backend reports failures in-band as
    # a bracketed message such as "[... unavailable ...]" or "[... error: ...]"
    # — TODO confirm against ai.deliberation._call_model.
    if _looks_like_model_error(raw):
        return _error_result(raw, elapsed)

    findings = _parse_model_findings(raw, model_name)

    return {
        "model_id": model_id,
        "model_name": model_name,
        "lens": lens_name,
        "status": "ok",
        "elapsed_seconds": elapsed,
        "findings": findings,
        "raw_response": raw,
    }
222
-
223
-
224
def _normalize_location(loc: str) -> str:
    """Normalize a location string into a key used to match findings.

    Resolution order:
      1. A line reference ("Line 42", "line: 42", "Lines 10-20") becomes
         ``line_<first number>``.
      2. A "function <name>" reference becomes ``func_<name>``.
      3. Otherwise the lower-cased text with every character outside
         ``[a-z0-9_]`` replaced by "_" and outer underscores trimmed.

    Keywords are matched on word boundaries so that words merely containing
    "line" or "function" (e.g. "pipeline 3") are not misread as references.
    ``None`` is treated as an empty location.
    """
    text = "" if loc is None else str(loc)
    text = text.lower().strip()

    # Extract line numbers
    line_match = re.search(r'\blines?\s*[:#]?\s*(\d+)', text)
    if line_match:
        return f"line_{line_match.group(1)}"

    # Extract function names
    func_match = re.search(r'\bfunction\s+(\w+)', text)
    if func_match:
        return f"func_{func_match.group(1)}"

    # Fall back to cleaned string
    return re.sub(r'[^a-z0-9_]', '_', text).strip('_')
237
-
238
-
239
def synthesize(audit_results: Dict[str, Any]) -> Dict[str, Any]:
    """Merge findings from multiple model audits.

    Findings from successful models are grouped by normalized location.
    A location flagged by two or more distinct models becomes an agreement
    (all severities equal) or a disagreement (severities differ). Everything
    else is reported as a unique finding.

    Placeholder locations ("general", "unknown" or empty) carry no positional
    information, so findings there are never grouped; each is listed as a
    unique finding. When one model reports several findings at the same
    location, all of them are kept.

    Args:
        audit_results: Dict whose "model_results" entry is a list of
            per-model result dicts ("status", "lens", "model_name",
            "findings").

    Returns:
        Dict with:
            agreements: findings flagged by 2+ models (high confidence)
            unique_findings: flagged by only 1 model (review needed)
            disagreements: models contradict each other on severity
            summary: one-paragraph synthesis
    """
    def _empty(summary: str) -> Dict[str, Any]:
        return {
            "agreements": [],
            "unique_findings": [],
            "disagreements": [],
            "summary": summary,
        }

    model_results = audit_results.get("model_results", [])
    if not model_results:
        return _empty("No model results to synthesize.")

    successful_models = [r for r in model_results if r.get("status") == "ok"]
    failed_models = [r for r in model_results if r.get("status") != "ok"]
    failed_names = ", ".join(str(r.get("model_name", "unknown")) for r in failed_models)

    # A run in which nothing succeeded must not be reported as "clean".
    if not successful_models:
        return _empty(
            f"All {len(failed_models)} model(s) failed ({failed_names}); no findings were produced."
        )

    # Collect all findings with their source lens
    all_findings: List[Dict[str, Any]] = []
    for result in successful_models:
        lens = result["lens"]
        model_name = result["model_name"]
        for f in result.get("findings", []):
            all_findings.append({
                **f,
                "lens": lens,
                "model": model_name,
                "norm_location": _normalize_location(f.get("location", "")),
            })

    if not all_findings:
        if failed_models:
            return _empty(
                f"No issues found by the {len(successful_models)} model(s) that completed. "
                f"({failed_names} failed — results are partial.)"
            )
        return _empty("All models returned clean results. No issues found.")

    def _as_unique(f: Dict[str, Any]) -> Dict[str, Any]:
        return {
            "location": f["location"],
            "severity": f["severity"],
            "lens": f["lens"],
            "model": f["model"],
            "finding": f["finding"],
            "recommendation": f["recommendation"],
        }

    agreements: List[Dict[str, Any]] = []
    unique_findings: List[Dict[str, Any]] = []
    disagreements: List[Dict[str, Any]] = []

    # Group by normalized location, except for placeholders that do not
    # identify a place in the code and therefore cannot indicate agreement.
    placeholder_locations = {"", "general", "unknown"}
    location_groups: Dict[str, List[Dict[str, Any]]] = {}
    for f in all_findings:
        key = f["norm_location"]
        if key in placeholder_locations:
            unique_findings.append(_as_unique(f))
        else:
            location_groups.setdefault(key, []).append(f)

    for findings in location_groups.values():
        models_involved = {f["model"] for f in findings}

        if len(models_involved) < 2:
            # Only one model flagged this location; keep every finding.
            unique_findings.extend(_as_unique(f) for f in findings)
            continue

        severities = {f["severity"] for f in findings}
        if len(severities) > 1:
            # Models agree on location but disagree on severity
            disagreements.append({
                "location": findings[0]["location"],
                "models": {f["model"]: {
                    "lens": f["lens"],
                    "severity": f["severity"],
                    "finding": f["finding"],
                    "recommendation": f["recommendation"],
                } for f in findings},
                "type": "severity_disagreement",
            })
        else:
            # Full agreement; sort the name lists so output is deterministic.
            agreements.append({
                "location": findings[0]["location"],
                "severity": findings[0]["severity"],
                "models_agreed": sorted(models_involved),
                "lenses": sorted({f["lens"] for f in findings}),
                "findings": {f["lens"]: {
                    "finding": f["finding"],
                    "recommendation": f["recommendation"],
                } for f in findings},
            })

    # Sort by severity (unknown labels sort last)
    agreements.sort(key=lambda x: SEVERITY_ORDER.get(x["severity"], 5))
    unique_findings.sort(key=lambda x: SEVERITY_ORDER.get(x["severity"], 5))

    # Build summary
    total = len(agreements) + len(unique_findings) + len(disagreements)
    summary_parts = [
        f"{len(agreements)} high-confidence finding(s)",
        f"{len(unique_findings)} unique catch(es)",
        f"{len(disagreements)} tradeoff(s)",
    ]
    summary = f"{total} total findings across {len(successful_models)} models: " + ", ".join(summary_parts) + "."

    if failed_models:
        summary += f" ({failed_names} failed — results are partial.)"

    critical_count = sum(1 for a in agreements if a["severity"] == "critical")
    if critical_count:
        summary += f" {critical_count} CRITICAL issue(s) confirmed by multiple models."

    return {
        "agreements": agreements,
        "unique_findings": unique_findings,
        "disagreements": disagreements,
        "summary": summary,
    }
364
-
365
-
366
def format_audit_output(audit_results: Dict[str, Any], synthesis: Dict[str, Any]) -> str:
    """Render an audit and its synthesis as a plain-text report.

    Sections, in order: header and target, the model/lens roster (failed
    models tagged), high-confidence agreements, single-model findings,
    severity disagreements, and the synthesis summary. Empty sections are
    printed with an explicit "None" line.
    """
    out: List[str] = ["=== CROSS-MODEL AUDIT ==="]

    shown_target = audit_results.get("target_display", audit_results.get("target", "unknown"))
    out.append(f"Target: {shown_target}")

    # Model assignments
    roster = []
    for entry in audit_results.get("model_results", []):
        label = entry.get("model_name", entry.get("model_id", "?"))
        focus = entry.get("lens", "?")
        suffix = " [FAILED]" if entry.get("status") != "ok" else ""
        roster.append(f"{label} ({focus}){suffix}")
    out.append("Models: " + " | ".join(roster))
    out.append("")

    # High confidence
    agreed = synthesis.get("agreements", [])
    if not agreed:
        out.append("HIGH CONFIDENCE: None (no multi-model agreement)")
    else:
        out.append("HIGH CONFIDENCE (2+ models agree):")
        for item in agreed:
            out.append(f" [{item['severity'].upper()}] {item['location']}")
            per_lens = item.get("findings", {})
            for lens_name, detail in per_lens.items():
                out.append(f" {lens_name.title()}: {detail['finding']}")
            # Only the first non-empty recommendation is shown.
            actions = [d["recommendation"] for d in per_lens.values() if d.get("recommendation")]
            if actions:
                out.append(f" Action: {actions[0]}")
    out.append("")

    # Unique findings
    singles = synthesis.get("unique_findings", [])
    if not singles:
        out.append("UNIQUE FINDINGS: None")
    else:
        out.append("UNIQUE FINDINGS (single model):")
        for item in singles:
            severity_tag = item["severity"].upper()
            place = item["location"]
            lens_title = item["lens"].title()
            out.append(f" [{severity_tag}] {place} ({lens_title})")
            out.append(f" {item['finding']}")
            if item.get("recommendation"):
                out.append(f" Recommendation: {item['recommendation']}")
    out.append("")

    # Disagreements
    conflicts = synthesis.get("disagreements", [])
    if not conflicts:
        out.append("DISAGREEMENTS: None")
    else:
        out.append("DISAGREEMENTS:")
        for conflict in conflicts:
            out.append(f" {conflict['location']}:")
            for who, detail in conflict.get("models", {}).items():
                out.append(
                    f" {detail['lens'].title()} ({who}): {detail['finding']} [severity: {detail['severity']}]"
                )
            out.append(" Tradeoff: review and decide based on your risk tolerance.")
    out.append("")

    out.append(f"Summary: {synthesis.get('summary', 'No summary available.')}")

    return "\n".join(out)
438
-
439
-
440
def _select_models_and_lenses(
    lenses: Optional[List[str]] = None,
    models: Optional[List[str]] = None,
) -> Tuple[List[Tuple[str, Dict, str]], Optional[str]]:
    """Decide which model reviews the code under which lens.

    Lenses default to every key of ``AUDIT_LENSES``. Models default to all
    enabled entries of the models config; when ``models`` is given, each
    requested name is matched exactly (lower-cased) against config keys,
    else by substring against the key or the configured display name, taking
    the first hit. Lenses are then dealt out to the chosen models
    round-robin.

    Returns:
        ``(assignments, error)`` where each assignment is
        ``(model_id, model_config, lens_name)``. On failure the list is empty
        and ``error`` explains why.
    """
    from ai.deliberation import get_models_config

    # Resolve lenses and reject the first unknown one.
    lens_names = list(lenses) if lenses else list(AUDIT_LENSES.keys())
    for candidate in lens_names:
        if candidate in AUDIT_LENSES:
            continue
        return [], f"Unknown lens: {candidate}. Available: {', '.join(AUDIT_LENSES.keys())}"

    # Keep only the models flagged as enabled in the config.
    available: Dict[str, Dict] = {}
    for key, cfg in get_models_config(allow_hosted_fallback=True).items():
        if cfg.get("enabled"):
            available[key] = cfg

    if not available:
        return [], (
            "No models available for audit. Configure API keys in ~/.delimit/models.json "
            "or ensure hosted models are available."
        )

    # Narrow to the explicitly requested models, if any.
    if models:
        chosen: Dict[str, Dict] = {}
        for requested in models:
            wanted = requested.lower()
            if wanted in available:
                chosen[wanted] = available[wanted]
                continue
            # No exact key: fall back to the first partial match.
            for key, cfg in available.items():
                if wanted in key.lower() or wanted in cfg.get("name", "").lower():
                    chosen[key] = cfg
                    break
        if not chosen:
            return [], f"None of the requested models ({', '.join(models)}) are available."
        available = chosen

    # Round-robin: lens i goes to model i modulo the number of models.
    ids = list(available.keys())
    assignments = []
    for position, lens_name in enumerate(lens_names):
        picked = ids[position % len(ids)]
        assignments.append((picked, available[picked], lens_name))

    return assignments, None
492
-
493
-
494
def audit(
    target: str,
    target_type: str = "file",
    lenses: Optional[List[str]] = None,
    models: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Run cross-model audit on a target.

    Every lens is sent to its assigned model on its own daemon thread. All
    threads share one overall deadline of ``MODEL_TIMEOUT`` seconds; a model
    that has not answered by then is recorded as timed out, and a worker that
    raises is recorded with its own error message.

    Args:
        target: File path, diff text, or code snippet.
        target_type: "file", "diff", or "snippet".
        lenses: Which lenses to apply (default: all 3).
        models: Which models to use (default: auto-detect).

    Returns:
        On success, the full audit results with "model_results",
        "synthesis" and "formatted" output (plus "saved_to" when the report
        could be written to disk). On a setup failure,
        ``{"status": "error", "error": <message>}``.
    """
    start_time = time.time()
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # Resolve target
    target_code, err = _resolve_target(target, target_type)
    if err:
        return {"status": "error", "error": err}

    # Select models and assign lenses
    assignments, err = _select_models_and_lenses(lenses, models)
    if err:
        return {"status": "error", "error": err}

    # Target display name
    if target_type == "file":
        target_display = target
    elif target_type == "diff":
        first_line = target.strip().split("\n")[0][:80]
        target_display = f"diff: {first_line}..."
    else:
        target_display = f"snippet ({len(target_code)} chars)"

    def _failure(model_id: str, config: Dict, lens: str, message: str, elapsed: float) -> Dict[str, Any]:
        return {
            "model_id": model_id,
            "model_name": config.get("name", model_id),
            "lens": lens,
            "status": "error",
            "error": message,
            "elapsed_seconds": elapsed,
            "findings": [],
        }

    # Workers write into ``slots``; the report is built from a snapshot of
    # it, so a thread finishing after the deadline cannot alter the results.
    slots: List[Optional[Dict[str, Any]]] = [None] * len(assignments)

    def _run(index: int, model_id: str, config: Dict, lens: str) -> None:
        try:
            slots[index] = _call_model_with_lens(model_id, config, lens, target_code, target_type)
        except Exception as exc:
            # Record the real failure instead of letting it look like a timeout.
            logger.exception("Audit worker for model %s (%s lens) crashed", model_id, lens)
            slots[index] = _failure(
                model_id, config, lens,
                f"{type(exc).__name__}: {exc}",
                round(time.time() - start_time, 1),
            )

    # Call models in parallel
    threads = []
    for i, (model_id, config, lens) in enumerate(assignments):
        t = threading.Thread(target=_run, args=(i, model_id, config, lens), daemon=True)
        threads.append(t)
        t.start()

    # Wait against one shared deadline so the total wait is bounded by
    # MODEL_TIMEOUT rather than MODEL_TIMEOUT per thread.
    deadline = time.monotonic() + MODEL_TIMEOUT
    for t in threads:
        t.join(timeout=max(0.0, deadline - time.monotonic()))

    # Snapshot, substituting a timeout record for anything still pending.
    results: List[Dict[str, Any]] = []
    for i, (model_id, config, lens) in enumerate(assignments):
        outcome = slots[i]
        if outcome is None:
            outcome = _failure(
                model_id, config, lens,
                f"Timed out after {MODEL_TIMEOUT} seconds",
                MODEL_TIMEOUT,
            )
        results.append(outcome)

    audit_results = {
        "status": "ok",
        "target": target if target_type != "file" else str(target),
        "target_type": target_type,
        "target_display": target_display,
        "timestamp": timestamp,
        "model_results": results,
        "elapsed_seconds": round(time.time() - start_time, 1),
    }

    # Synthesize
    synthesis_result = synthesize(audit_results)
    audit_results["synthesis"] = synthesis_result

    # Format output
    audit_results["formatted"] = format_audit_output(audit_results, synthesis_result)

    # Save to disk
    _save_audit(audit_results, timestamp)

    return audit_results
584
-
585
-
586
def _save_audit(audit_results: Dict[str, Any], timestamp: str) -> Optional[str]:
    """Persist an audit report as ``<timestamp>.json`` under ``AUDIT_DIR``.

    The report is written without each model's "raw_response" (it can be
    large). On success the path is stored in ``audit_results["saved_to"]``
    and returned. Any failure is logged as a warning and ``None`` is
    returned; saving is best-effort and never raises.
    """
    try:
        AUDIT_DIR.mkdir(parents=True, exist_ok=True)
        destination = AUDIT_DIR / f"{timestamp}.json"

        # The JSON round trip yields an independent, serializable copy, so
        # stripping raw responses leaves the caller's dict intact.
        serializable = json.loads(json.dumps(audit_results, default=str))
        for entry in serializable.get("model_results", []):
            entry.pop("raw_response", None)

        payload = json.dumps(serializable, indent=2, default=str)
        destination.write_text(payload)

        saved_path = str(destination)
        audit_results["saved_to"] = saved_path
        return saved_path
    except Exception as e:
        logger.warning("Failed to save audit results: %s", e)
        return None