gdmcode 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gdmcode-0.1.0.dist-info/METADATA +240 -0
- gdmcode-0.1.0.dist-info/RECORD +131 -0
- gdmcode-0.1.0.dist-info/WHEEL +4 -0
- gdmcode-0.1.0.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/_internal/__init__.py +0 -0
- src/_internal/constants.py +244 -0
- src/_internal/domain_skills.py +339 -0
- src/agent/__init__.py +0 -0
- src/agent/commit_classifier.py +91 -0
- src/agent/context_budget.py +391 -0
- src/agent/daemon.py +681 -0
- src/agent/dag_validator.py +153 -0
- src/agent/debug_loop.py +473 -0
- src/agent/impact_analyzer.py +149 -0
- src/agent/impact_graph.py +117 -0
- src/agent/loop.py +1410 -0
- src/agent/orchestrator.py +141 -0
- src/agent/regression_guard.py +251 -0
- src/agent/review_gate.py +648 -0
- src/agent/risk_scorer.py +169 -0
- src/agent/self_healing.py +145 -0
- src/agent/smart_test_selector.py +89 -0
- src/agent/system_prompt.py +226 -0
- src/agent/task_tracker.py +320 -0
- src/agent/test_validator.py +210 -0
- src/agent/tool_orchestrator.py +402 -0
- src/agent/transcript.py +230 -0
- src/agent/verification_loop.py +133 -0
- src/agent/work_director.py +136 -0
- src/agent/worktree_manager.py +53 -0
- src/artifacts/__init__.py +16 -0
- src/artifacts/artifact_store.py +456 -0
- src/artifacts/verification_graph.py +75 -0
- src/auth.py +411 -0
- src/cli.py +1290 -0
- src/commands.py +1398 -0
- src/config.py +762 -0
- src/cost_tracker.py +348 -0
- src/db/__init__.py +4 -0
- src/db/migrations.py +337 -0
- src/enterprise/__init__.py +3 -0
- src/enterprise/audit_log.py +182 -0
- src/enterprise/identity.py +90 -0
- src/enterprise/rbac.py +100 -0
- src/enterprise/team_config.py +125 -0
- src/enterprise/usage_analytics.py +261 -0
- src/exceptions.py +207 -0
- src/git_workflow.py +651 -0
- src/integrations/__init__.py +6 -0
- src/integrations/github_actions.py +106 -0
- src/integrations/mcp_server.py +333 -0
- src/integrations/sentry_integration.py +100 -0
- src/integrations/sentry_server.py +82 -0
- src/integrations/webhook_security.py +19 -0
- src/main.py +27 -0
- src/memory/__init__.py +0 -0
- src/memory/code_index.py +376 -0
- src/memory/compressor.py +378 -0
- src/memory/context_memory.py +135 -0
- src/memory/continuous_memory.py +234 -0
- src/memory/conventions.py +495 -0
- src/memory/db.py +1119 -0
- src/memory/document_index.py +205 -0
- src/memory/file_cache.py +128 -0
- src/memory/project_scanner.py +178 -0
- src/memory/session_store.py +201 -0
- src/models/__init__.py +0 -0
- src/models/client.py +715 -0
- src/models/definitions.py +459 -0
- src/models/router.py +418 -0
- src/models/schemas.py +389 -0
- src/permissions.py +294 -0
- src/remote/__init__.py +5 -0
- src/remote/command_filter.py +33 -0
- src/remote/models.py +31 -0
- src/remote/permission_handler.py +79 -0
- src/remote/phone_ui.py +48 -0
- src/remote/protocol.py +59 -0
- src/remote/qr.py +65 -0
- src/remote/server.py +586 -0
- src/remote/token_manager.py +61 -0
- src/remote/tunnel.py +212 -0
- src/repl.py +475 -0
- src/runtime/__init__.py +1 -0
- src/runtime/branch_farm.py +372 -0
- src/runtime/replay.py +351 -0
- src/sandbox/__init__.py +2 -0
- src/sandbox/hermetic.py +214 -0
- src/sandbox/policy.py +44 -0
- src/sdk/__init__.py +3 -0
- src/sdk/plugin_base.py +39 -0
- src/sdk/plugin_host.py +100 -0
- src/sdk/plugin_loader.py +101 -0
- src/security.py +409 -0
- src/server/__init__.py +7 -0
- src/server/bridge.py +427 -0
- src/server/bridge_cli.py +103 -0
- src/server/bridge_client.py +170 -0
- src/server/protocol_version.py +103 -0
- src/session/__init__.py +10 -0
- src/session/event_fanout.py +46 -0
- src/session/input_broker.py +38 -0
- src/session/permission_bridge.py +100 -0
- src/tools/__init__.py +160 -0
- src/tools/_atomic.py +72 -0
- src/tools/agent_tools.py +423 -0
- src/tools/ask_user_tool.py +83 -0
- src/tools/bash_tool.py +384 -0
- src/tools/browser_tool.py +352 -0
- src/tools/browser_tools.py +179 -0
- src/tools/dep_tools.py +210 -0
- src/tools/document_reader.py +167 -0
- src/tools/document_tool.py +240 -0
- src/tools/document_writer.py +171 -0
- src/tools/impact_tools.py +240 -0
- src/tools/playwright_tool.py +172 -0
- src/tools/quality_tools.py +366 -0
- src/tools/read_tools.py +318 -0
- src/tools/result_cache.py +157 -0
- src/tools/search_tools.py +310 -0
- src/tools/shell_tools.py +311 -0
- src/tools/write_tools.py +337 -0
- src/voice/__init__.py +25 -0
- src/voice/audio_capture.py +92 -0
- src/voice/audio_playback.py +68 -0
- src/voice/errors.py +14 -0
- src/voice/models.py +35 -0
- src/voice/providers.py +143 -0
- src/voice/vad.py +55 -0
- src/voice/voice_loop.py +156 -0
src/agent/review_gate.py
ADDED
|
@@ -0,0 +1,648 @@
|
|
|
1
|
+
"""Multi-agent review gate — uses Grok Debate model to review security-sensitive changes.
|
|
2
|
+
|
|
3
|
+
Triggered automatically by the agent loop in any of these situations:
|
|
4
|
+
- Multi-file changes touching auth/session/crypto/permissions/SQL/file I/O
|
|
5
|
+
- Architectural decisions (3+ files affected)
|
|
6
|
+
- After a debug loop that needed 3+ iterations
|
|
7
|
+
|
|
8
|
+
Uses the Debate model tier (Grok) or Reasoner tier (other providers) for review.
|
|
9
|
+
Falls back to a single-model review if the Debate model is unavailable.
|
|
10
|
+
|
|
11
|
+
Output: ReviewReport (Pydantic, from src.models.schemas).
|
|
12
|
+
- CRITICAL findings: agent loop BLOCKS until resolved
|
|
13
|
+
- HIGH findings: show to user with "Fix automatically? [Y/n]"
|
|
14
|
+
- LOW findings: logged silently
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import subprocess
|
|
20
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
21
|
+
from concurrent.futures import TimeoutError as FutureTimeoutError
|
|
22
|
+
from concurrent.futures import as_completed
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
import openai
|
|
28
|
+
|
|
29
|
+
from src.models.definitions import ModelTier, Provider, get_model
|
|
30
|
+
from src.models.schemas import ReviewReport, Severity
|
|
31
|
+
|
|
32
|
+
# Names exported by ``from <this module> import *``.
__all__ = ["ReviewGate", "ReviewTrigger", "ReviewGateResult", "DisagreementDetector",
           "ConfidenceSignals", "aggregate_confidence"]

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Passed as ``max_tokens`` to every structured-output model call in this module.
_REVIEW_MAX_TOKENS: int = 1_500
# Diffs longer than this many characters are cut by ``_truncate_diff`` before prompting.
_DIFF_MAX_CHARS: int = 6_000
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Confidence scoring (ide-003)
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
@dataclass
class ConfidenceSignals:
    """Raw boolean signals used to compute a confidence score for a code hunk.

    Each flag that is true adds a fixed bonus in ``aggregate_confidence``.
    """

    tests_pass: bool  # adds +30 in aggregate_confidence
    review_approved: bool  # adds +20 in aggregate_confidence
    lint_clean: bool  # adds +5 in aggregate_confidence
    no_security_findings: bool  # adds +5 in aggregate_confidence
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def aggregate_confidence(signals: ConfidenceSignals) -> int:
    """Compute a 0–100 confidence score from *signals*.

    Starts from a base of 40 and adds a fixed bonus for every signal that
    is true:

    - tests_pass:           +30
    - review_approved:      +20
    - lint_clean:           +5
    - no_security_findings: +5

    Args:
        signals: Object exposing the four boolean attributes listed above.

    Returns:
        The total, clamped to the inclusive range [0, 100].
    """
    weighted = (
        (signals.tests_pass, 30),
        (signals.review_approved, 20),
        (signals.lint_clean, 5),
        (signals.no_security_findings, 5),
    )
    total = 40 + sum(points for earned, points in weighted if earned)
    return min(100, max(0, total))
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
# Disagreement detector (cheap pre-filter — no LLM call)
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
class DisagreementDetector:
    """Cheap keyword + diff-size heuristic to decide whether a full 4-agent debate is needed.

    No model call is made; the decision is a substring scan plus a length check.
    """

    # Lower-case substrings that, when present in the question or the diff,
    # mark the change as contentious enough for a debate.
    _DEBATE_TRIGGERS: frozenset[str] = frozenset({
        "security", "auth", "crypto", "architecture", "migration",
        "trade-off", "tradeoff", "breaking change", "performance",
    })

    # Diffs longer than this many characters always warrant a debate.
    _LARGE_DIFF_CHARS: int = 2_000

    def needs_debate(self, question: str, diff_text: str) -> bool:
        """Return True if the change warrants a full multi-agent debate.

        Args:
            question: The question being put to the reviewers.
            diff_text: The diff (or other context) under review.

        Returns:
            True when *diff_text* exceeds ``_LARGE_DIFF_CHARS`` characters or
            when any trigger keyword occurs (case-insensitively) in either
            *question* or *diff_text*.
        """
        if len(diff_text) > self._LARGE_DIFF_CHARS:
            return True
        # Join with a newline so a keyword cannot be fabricated from the tail
        # of the question and the head of the diff ("...au" + "th...").
        haystack = f"{question}\n{diff_text}".lower()
        return any(keyword in haystack for keyword in self._DEBATE_TRIGGERS)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
# Trigger classification
|
|
100
|
+
# ---------------------------------------------------------------------------
|
|
101
|
+
|
|
102
|
+
class ReviewTrigger:
    """Determines whether a code change should trigger a review gate."""

    # Lower-case substrings; a changed file whose *file name* contains one of
    # them is treated as security-sensitive. Directory names are not inspected.
    _SECURITY_PATTERNS: frozenset[str] = frozenset({
        "auth", "session", "password", "token", "secret", "crypto",
        "permission", "sql", "query", "database", "file_io", "subprocess",
        "exec", "eval", "pickle",
    })

    # A change touching at least this many files counts as architectural.
    _ARCHITECTURAL_FILE_COUNT: int = 3

    def _matched_patterns(self, path: Path) -> list[str]:
        """Return the security patterns found in *path*'s file name, sorted.

        Sorting matters: iterating a frozenset of strings yields a different
        order from one interpreter run to the next (hash randomisation), which
        would make the reason string built by ``classify`` unstable.
        """
        name_lower = path.name.lower()
        return sorted(p for p in self._SECURITY_PATTERNS if p in name_lower)

    def should_review(
        self,
        files_changed: list[Path],
        trigger_reason: str = "",
    ) -> bool:
        """Return True if this change warrants multi-agent review.

        Args:
            files_changed: Paths touched by the change.
            trigger_reason: Optional caller-supplied reason; any non-empty
                value forces a review.

        Returns:
            True when three or more files changed, when any file name contains
            a security pattern, or when *trigger_reason* is non-empty.
        """
        if len(files_changed) >= self._ARCHITECTURAL_FILE_COUNT:
            return True
        if any(self._matched_patterns(path) for path in files_changed):
            return True
        return bool(trigger_reason)

    def classify(self, files_changed: list[Path]) -> str:
        """Return a short reason string explaining why review was triggered.

        The file-count rule is checked first, then the first file (in list
        order) whose name matches a security pattern; otherwise the review is
        reported as a manual trigger.
        """
        if len(files_changed) >= self._ARCHITECTURAL_FILE_COUNT:
            return f"architectural change ({len(files_changed)} files)"
        for path in files_changed:
            matched = self._matched_patterns(path)
            if matched:
                return f"security-sensitive file: {path.name} ({', '.join(matched)})"
        return "manual trigger"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ---------------------------------------------------------------------------
|
|
139
|
+
# Review result
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
@dataclass
class ReviewGateResult:
    """Outcome of one review-gate run: the report plus how it was produced."""

    report: ReviewReport
    trigger_reason: str
    model_used: str
    files_reviewed: list[Path]

    def _messages_at(self, level: Severity) -> list[str]:
        """Collect the messages of all findings whose severity equals *level*."""
        selected: list[str] = []
        for finding in self.report.findings:
            if finding.severity == level:
                selected.append(finding.message)
        return selected

    @property
    def blocks_merge(self) -> bool:
        """True if there are CRITICAL findings that must be resolved."""
        for finding in self.report.findings:
            if finding.severity == Severity.CRITICAL:
                return True
        return False

    def critical_findings(self) -> list[str]:
        """Messages of every CRITICAL finding, in report order."""
        return self._messages_at(Severity.CRITICAL)

    def high_findings(self) -> list[str]:
        """Messages of every HIGH finding, in report order."""
        return self._messages_at(Severity.HIGH)

    def format_for_user(self) -> str:
        """Rich-markup string for display in the REPL.

        The first line carries a status icon and the trigger reason; each
        following line shows one finding, coloured by severity.
        """
        if self.blocks_merge:
            icon = "❌"
        elif self.high_findings():
            icon = "⚠"
        else:
            icon = "✓"

        rendered: list[str] = [f"{icon} [bold]Review Gate:[/bold] {self.trigger_reason}"]
        for finding in self.report.findings:
            palette = {
                Severity.CRITICAL: "red",
                Severity.HIGH: "yellow",
                Severity.MEDIUM: "cyan",
                Severity.LOW: "dim",
                Severity.INFO: "dim",
            }
            color = palette.get(finding.severity, "white")
            # A location is shown only when both file and line are truthy.
            if finding.file and finding.line:
                loc = f" ({finding.file}:{finding.line})"
            else:
                loc = ""
            label = finding.severity.value.upper()
            rendered.append(f" [{color}]{label}[/{color}]{loc}: {finding.message}")
        return "\n".join(rendered)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# ---------------------------------------------------------------------------
|
|
187
|
+
# ReviewGate
|
|
188
|
+
# ---------------------------------------------------------------------------
|
|
189
|
+
|
|
190
|
+
class ReviewGate:
    """Runs the multi-agent review gate on a set of changes.

    Uses the Debate model tier when available (Grok only).
    Falls back to Reasoner for other providers.

    Usage::

        gate = ReviewGate(cfg)
        result = gate.review(
            files_changed=[Path("src/auth.py")],
            diff_text="...",
            trigger_reason="security-sensitive file: auth.py",
        )
        if result.blocks_merge:
            # Surface critical findings before proceeding
    """

    # Upper bound on concurrently running debate agents.
    _MAX_PARALLEL_AGENTS: int = 4

    def __init__(self, cfg: Any, vg: Any = None) -> None:  # cfg: GdmConfig
        """Store the config, pick the review model and build the API client.

        Args:
            cfg: Configuration object. ``api_key`` and ``provider`` are read
                here; ``debate_model`` is read by the ``debate_*`` methods.
            vg: Optional object kept as ``self._vg``. No method of this class
                reads it.
        """
        self._cfg = cfg
        self._vg = vg
        self._trigger = ReviewTrigger()
        self._model = self._pick_model()
        self._client = openai.OpenAI(
            api_key=cfg.api_key,
            base_url=_base_url(cfg.provider),
        )

    def review(
        self,
        files_changed: list[Path],
        diff_text: str,
        trigger_reason: str = "",
        node_id: "str | None" = None,
    ) -> ReviewGateResult:
        """Run the review gate. Never raises — returns a safe result on failure.

        Args:
            files_changed: Paths touched by the change.
            diff_text: Diff to review; truncated to ``_DIFF_MAX_CHARS``.
            trigger_reason: Why the review runs. When empty, a reason is
                derived from *files_changed*.
            node_id: Accepted but not used by this method.

        Returns:
            A ``ReviewGateResult`` wrapping the model's report, or an empty
            approved report when the model call fails.
        """
        if not trigger_reason:
            trigger_reason = self._trigger.classify(files_changed)

        truncated_diff = _truncate_diff(diff_text, _DIFF_MAX_CHARS)
        prompt = _build_review_prompt(files_changed, truncated_diff, trigger_reason)

        try:
            report = self._call_model(prompt)
        except Exception as exc:  # noqa: BLE001
            log.warning("Review gate failed: %s — returning empty report", exc)
            # NOTE(review): this fallback is fail-open (approved, no findings),
            # so an unreachable model never blocks a change. Confirm that is
            # the intended policy for security-sensitive triggers.
            report = ReviewReport(approved=True, confidence=0.5, findings=[], summary="")

        return ReviewGateResult(
            report=report,
            trigger_reason=trigger_reason,
            model_used=self._model.id,
            files_reviewed=files_changed,
        )

    # run is an alias for review (backward compat)
    run = review

    def debate_adr(
        self,
        task_description: str,
        planned_files: list[Path],
    ) -> Any:  # -> ADRReport
        """Generate an Architecture Decision Record for a large planned change.

        Never raises: on any failure an ``ADRReport`` with empty fields is
        returned and the error is logged.
        """
        from src.models.schemas import ADRReport
        try:
            model_id = self._cfg.debate_model or self._model.id
            file_list = ", ".join(str(p) for p in planned_files)
            messages = [
                {"role": "system", "content": _ADR_SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"Task: {task_description}\nFiles planned: {file_list}"
                )},
            ]
            return self._parse_structured(model_id, messages, ADRReport)
        except Exception as exc:  # noqa: BLE001
            log.warning("debate_adr failed: %s — returning empty report", exc)
            return ADRReport(decision="", rationale="", alternatives_rejected=[], risks=[])

    def debate_security(
        self,
        file_path: Path,
        diff_text: str,
    ) -> Any:  # -> ThreatModelReport
        """Run security threat modeling on a changed file and its diff.

        Never raises: on any failure an empty ``ThreatModelReport`` with
        severity ``"low"`` is returned and the error is logged.
        """
        from src.models.schemas import ThreatModelReport
        try:
            model_id = self._cfg.debate_model or self._model.id
            truncated_diff = _truncate_diff(diff_text, _DIFF_MAX_CHARS)
            messages = [
                {"role": "system", "content": _SECURITY_SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"File: {file_path}\n\nDiff:\n```\n{truncated_diff}\n```"
                )},
            ]
            return self._parse_structured(model_id, messages, ThreatModelReport)
        except Exception as exc:  # noqa: BLE001
            log.warning("debate_security failed: %s — returning empty report", exc)
            return ThreatModelReport(attack_surfaces=[], mitigations=[], severity="low")

    def debate_debug(
        self,
        error: str,
        attempts_history: list[str],
    ) -> Any:  # -> DebugHypotheses
        """Generate fresh debug hypotheses after the ensemble loop is exhausted.

        Never raises: on any failure an empty ``DebugHypotheses`` is returned
        and the error is logged.
        """
        from src.models.schemas import DebugHypotheses
        try:
            model_id = self._cfg.debate_model or self._model.id
            history_text = "\n".join(f"- {a}" for a in attempts_history)
            messages = [
                {"role": "system", "content": _DEBUG_SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"Error:\n{error}\n\nPrevious fix attempts:\n{history_text}"
                )},
            ]
            return self._parse_structured(model_id, messages, DebugHypotheses)
        except Exception as exc:  # noqa: BLE001
            log.warning("debate_debug failed: %s — returning empty report", exc)
            return DebugHypotheses(
                hypotheses=[], suggested_next_action="", dead_ends_identified=[]
            )

    def _parse_structured(
        self,
        model_id: str,
        messages: list[dict[str, str]],
        response_format: Any,
        *,
        source: str = "Model",
    ) -> Any:
        """Make one structured-output call and return the parsed object.

        Args:
            model_id: Identifier of the model to call.
            messages: Chat messages to send.
            response_format: Schema class passed as ``response_format``.
            source: Label used in the error message when nothing is parsed.

        Raises:
            ValueError: If the first choice carries no parsed output.
        """
        response = self._client.beta.chat.completions.parse(
            model=model_id,
            messages=messages,
            response_format=response_format,
            max_tokens=_REVIEW_MAX_TOKENS,
        )
        parsed = response.choices[0].message.parsed
        if parsed is None:
            raise ValueError(f"{source} returned no parsed output")
        return parsed

    def _call_model(self, prompt: str) -> ReviewReport:
        """Call the review model using structured output (parse).

        Raises:
            ValueError: If the model returns no parsed output. Errors raised
                by the client call itself propagate unchanged.
        """
        messages = [
            {"role": "system", "content": _REVIEW_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]
        return self._parse_structured(self._model.id, messages, ReviewReport)

    def _pick_model(self) -> Any:  # -> ModelDef
        """Pick review model: Debate for Grok, Reasoner for others."""
        tier = ModelTier.DEBATE if self._cfg.provider == Provider.GROK else ModelTier.REASONER
        return get_model(tier, self._cfg.provider)

    # ------------------------------------------------------------------
    # 4-agent debate
    # ------------------------------------------------------------------

    def run_debate(
        self,
        question: str,
        context: str,
        *,
        agent_count: int = 4,
        timeout_secs: float = 30.0,
    ) -> Any:  # -> DebateReport
        """Run a 4-agent parallel debate and return a synthesised DebateReport.

        Returns a ``debate_skipped=True`` report immediately when
        :class:`DisagreementDetector` decides no debate is needed.
        Falls back to a single-model review when fewer than 2 agents respond.

        Args:
            question: The question put to every agent.
            context: Diff or other context appended to the question.
            agent_count: How many of the four roles (architect, security,
                performance, devil_advocate — in that order) take part.
                Negative values are treated as 0.
            timeout_secs: Maximum time to wait for the agents as a group.
        """
        from src.models.schemas import DebateReport

        detector = DisagreementDetector()
        if not detector.needs_debate(question, context):
            return DebateReport(
                perspectives=[],
                consensus_points=[],
                disagreements=[],
                missing_evidence=[],
                recommendation="Debate skipped — no disagreement triggers detected",
                approved=True,
                confidence=0.5,
                debate_skipped=True,
            )

        from src.agent.system_prompt import (
            _ARCHITECT_PROMPT,
            _DEVIL_ADVOCATE_PROMPT,
            _PERFORMANCE_PROMPT,
            _SECURITY_PROMPT,
        )

        # Clamp so a negative count cannot slice from the end of the list.
        prompts: list[tuple[str, str]] = [
            ("architect", _ARCHITECT_PROMPT),
            ("security", _SECURITY_PROMPT),
            ("performance", _PERFORMANCE_PROMPT),
            ("devil_advocate", _DEVIL_ADVOCATE_PROMPT),
        ][:max(0, agent_count)]

        user_content = f"{question}\n\nContext/Diff:\n{context}"
        results = self._run_agents_parallel(prompts, user_content, timeout_secs=timeout_secs)

        active = [r for r in results if r is not None]
        if len(active) < 2:
            log.warning(
                "Only %d/%d debate agents responded — falling back to single-model review",
                len(active),
                len(prompts),
            )
            return self._single_model_review(question, context)

        return self._synthesize(active)

    def _run_agents_parallel(
        self,
        prompts: list[tuple[str, str]],
        user_content: str,
        timeout_secs: float = 30.0,
    ) -> list[Any]:
        """Submit all agent calls in parallel via ThreadPoolExecutor and collect results.

        Args:
            prompts: ``(role, system_prompt)`` pairs, one per agent.
            user_content: User message sent to every agent.
            timeout_secs: Maximum time to wait for the whole group.

        Returns:
            A list aligned with *prompts*; an entry is ``None`` when that
            agent failed or had not finished when the timeout expired.
        """
        results: list[Any] = [None] * len(prompts)
        if not prompts:
            return results

        # The pool is deliberately NOT used as a context manager: leaving a
        # ``with`` block calls shutdown(wait=True), which would block until
        # every agent finished and make ``timeout_secs`` ineffective.
        pool = ThreadPoolExecutor(
            max_workers=min(self._MAX_PARALLEL_AGENTS, len(prompts))
        )
        futures: dict[Any, int] = {}
        try:
            for i, (role, sys_prompt) in enumerate(prompts):
                future = pool.submit(self._call_single_agent, role, sys_prompt, user_content)
                futures[future] = i
            for future in as_completed(futures, timeout=timeout_secs):
                i = futures[future]
                try:
                    results[i] = future.result()
                except Exception as exc:  # noqa: BLE001
                    log.warning(
                        "Debate agent %d (%s) failed: %s",
                        i,
                        prompts[i][0],
                        exc,
                    )
        except FutureTimeoutError:
            log.warning(
                "Debate agent pool timed out after %.1fs; collecting partial results",
                timeout_secs,
            )
        finally:
            # Drop calls that have not started; calls already running cannot
            # be interrupted and are left to finish in the background.
            for future in futures:
                future.cancel()
            pool.shutdown(wait=False)
        return results

    def _call_single_agent(
        self,
        role: str,
        sys_prompt: str,
        user_content: str,
    ) -> Any:  # -> AgentPerspective
        """Call one debate agent and return its AgentPerspective.

        Raises:
            ValueError: If the agent returns no parsed output.
        """
        from src.models.schemas import AgentPerspective

        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_content},
        ]
        return self._parse_structured(
            self._model.id,
            messages,
            AgentPerspective,
            source=f"Debate agent '{role}'",
        )

    def _synthesize(self, perspectives: list[Any]) -> Any:  # -> DebateReport
        """Aggregate multiple AgentPerspective results into a DebateReport (rule-based, no LLM)."""
        from collections import Counter

        from src.models.schemas import DebateReport

        all_findings = [f for p in perspectives for f in p.findings]

        # Consensus: finding messages that occur more than once across all
        # collected findings.
        msg_counts: Counter[str] = Counter(f.message for f in all_findings)
        consensus_points = [msg for msg, cnt in msg_counts.items() if cnt > 1]

        # Disagreements: identify confidence-level splits between agents
        disagreements: list[str] = []
        confident = [p.role for p in perspectives if p.confidence > 0.6]
        skeptical = [p.role for p in perspectives if p.confidence <= 0.4]
        if confident and skeptical:
            disagreements.append(
                f"Confidence split — {', '.join(confident)} are confident vs "
                f"{', '.join(skeptical)} are skeptical"
            )

        # Missing evidence: roles absent from this debate
        represented = {p.role for p in perspectives}
        all_roles = {"architect", "security", "performance", "devil_advocate"}
        missing_evidence = [
            f"No perspective from: {r}" for r in sorted(all_roles - represented)
        ]

        # Approval: block on any CRITICAL finding, or 3+ HIGH findings
        critical = sum(1 for f in all_findings if f.severity == Severity.CRITICAL)
        high = sum(1 for f in all_findings if f.severity == Severity.HIGH)
        approved = critical == 0 and high < 3

        # Guard the average so an empty perspective list cannot divide by zero.
        if perspectives:
            avg_confidence = sum(p.confidence for p in perspectives) / len(perspectives)
            avg_confidence = min(1.0, max(0.0, avg_confidence))
        else:
            avg_confidence = 0.0

        role_summaries = "; ".join(f"{p.role}: {p.summary}" for p in perspectives)
        recommendation = (
            f"Multi-agent synthesis ({len(perspectives)} agents). {role_summaries}"
        )

        return DebateReport(
            perspectives=perspectives,
            consensus_points=consensus_points,
            disagreements=disagreements,
            missing_evidence=missing_evidence,
            recommendation=recommendation,
            approved=approved,
            confidence=avg_confidence,
        )

    def _single_model_review(self, question: str, context: str) -> Any:  # -> DebateReport
        """Fallback: run the existing single-model review and wrap it in a DebateReport.

        Never raises: if the single-model call also fails, a report with
        ``confidence=0.0`` is returned.
        """
        from src.models.schemas import DebateReport

        try:
            report = self._call_model(f"{question}\n\nContext:\n{context}")
            return DebateReport(
                perspectives=[],
                consensus_points=[],
                disagreements=[],
                missing_evidence=[],
                recommendation=report.summary,
                approved=report.approved,
                confidence=report.confidence,
                debate_skipped=True,
            )
        except Exception as exc:  # noqa: BLE001
            log.warning("Single-model review fallback also failed: %s", exc)
            # NOTE(review): fail-open — the change is reported as approved even
            # though no reviewer responded. Confirm this is intended.
            return DebateReport(
                perspectives=[],
                consensus_points=[],
                disagreements=[],
                missing_evidence=[],
                recommendation="Review failed — no agents responded",
                approved=True,
                confidence=0.0,
                debate_skipped=True,
            )
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
# ---------------------------------------------------------------------------
|
|
563
|
+
# Helpers
|
|
564
|
+
# ---------------------------------------------------------------------------
|
|
565
|
+
|
|
566
|
+
def _collect_diff(files: list[Path], project_root: Path) -> str:
|
|
567
|
+
"""Collect git diff output for the given files. Returns empty string on failure."""
|
|
568
|
+
parts: list[str] = []
|
|
569
|
+
for p in files[:5]:
|
|
570
|
+
try:
|
|
571
|
+
r = subprocess.run(
|
|
572
|
+
["git", "diff", "HEAD", "--", str(p)],
|
|
573
|
+
capture_output=True,
|
|
574
|
+
text=True,
|
|
575
|
+
timeout=10,
|
|
576
|
+
cwd=str(project_root),
|
|
577
|
+
)
|
|
578
|
+
if r.stdout.strip():
|
|
579
|
+
parts.append(r.stdout)
|
|
580
|
+
except Exception: # noqa: BLE001
|
|
581
|
+
pass
|
|
582
|
+
return "\n".join(parts)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def _base_url(provider: str) -> str:
    """Look up the API base URL for *provider*, defaulting to the Grok entry."""
    from src.models.definitions import PROVIDER_BASE_URLS

    grok_url = PROVIDER_BASE_URLS[Provider.GROK]
    return PROVIDER_BASE_URLS.get(provider, grok_url)
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def _truncate_diff(diff: str, max_chars: int) -> str:
|
|
591
|
+
if len(diff) <= max_chars:
|
|
592
|
+
return diff
|
|
593
|
+
return diff[:max_chars] + "\n...(diff truncated)"
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
def _build_review_prompt(
|
|
597
|
+
files: list[Path], diff: str, reason: str
|
|
598
|
+
) -> str:
|
|
599
|
+
file_list = ", ".join(f.name for f in files)
|
|
600
|
+
return (
|
|
601
|
+
f"Review trigger: {reason}\n"
|
|
602
|
+
f"Files changed: {file_list}\n\n"
|
|
603
|
+
f"Diff:\n```\n{diff}\n```\n\n"
|
|
604
|
+
"Identify all security vulnerabilities, logic errors, and architectural issues. "
|
|
605
|
+
"Be precise about file:line locations."
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
# System prompt for the single-model review call (ReviewGate._call_model).
_REVIEW_SYSTEM_PROMPT = """You are a senior security engineer reviewing code changes.
Your job is to find REAL bugs, security vulnerabilities, and architectural flaws.
Do NOT comment on style, formatting, or cosmetic issues.

CRITICAL: SQL injection, XSS, auth bypass, path traversal, insecure deserialization
HIGH: Race conditions, memory leaks, incorrect error handling, missing validation
MEDIUM: Missing tests, inconsistent naming, unclear logic
LOW: Minor improvements, alternative approaches

Return a JSON object matching the ReviewReport schema exactly.
approved=true means the code can proceed (findings may still exist at LOW/MEDIUM level).
approved=false means CRITICAL or multiple HIGH findings block the merge.
"""

# System prompt for ReviewGate.debate_adr (response parsed as ADRReport).
_ADR_SYSTEM_PROMPT = """You are an architecture review expert.
Analyze the planned task and the files that will be changed.
Provide an Architecture Decision Record with:
- A clear, concise decision statement
- Rationale for this approach
- Alternatives that were considered and rejected
- Known risks of this decision

Return a JSON object matching the ADRReport schema exactly.
"""

# System prompt for ReviewGate.debate_security (response parsed as ThreatModelReport).
_SECURITY_SYSTEM_PROMPT = """You are a security threat modeling expert.
Analyze this code diff for security vulnerabilities and attack surfaces.
Focus on concrete, exploitable threats — not hypothetical ones.
Identify practical mitigations for each attack surface found.

Return a JSON object matching the ThreatModelReport schema exactly.
"""

# System prompt for ReviewGate.debate_debug (response parsed as DebugHypotheses).
_DEBUG_SYSTEM_PROMPT = """You are a debugging expert with deep knowledge of Python.
Given this error and the previous failed fix attempts, generate ranked hypotheses
about the root cause. Focus on causes, not symptoms.
Identify dead ends from the previous attempts to avoid repeating them.

Return a JSON object matching the DebugHypotheses schema exactly.
"""
|