gdmcode 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. gdmcode-0.1.0.dist-info/METADATA +240 -0
  2. gdmcode-0.1.0.dist-info/RECORD +131 -0
  3. gdmcode-0.1.0.dist-info/WHEEL +4 -0
  4. gdmcode-0.1.0.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/_internal/__init__.py +0 -0
  7. src/_internal/constants.py +244 -0
  8. src/_internal/domain_skills.py +339 -0
  9. src/agent/__init__.py +0 -0
  10. src/agent/commit_classifier.py +91 -0
  11. src/agent/context_budget.py +391 -0
  12. src/agent/daemon.py +681 -0
  13. src/agent/dag_validator.py +153 -0
  14. src/agent/debug_loop.py +473 -0
  15. src/agent/impact_analyzer.py +149 -0
  16. src/agent/impact_graph.py +117 -0
  17. src/agent/loop.py +1410 -0
  18. src/agent/orchestrator.py +141 -0
  19. src/agent/regression_guard.py +251 -0
  20. src/agent/review_gate.py +648 -0
  21. src/agent/risk_scorer.py +169 -0
  22. src/agent/self_healing.py +145 -0
  23. src/agent/smart_test_selector.py +89 -0
  24. src/agent/system_prompt.py +226 -0
  25. src/agent/task_tracker.py +320 -0
  26. src/agent/test_validator.py +210 -0
  27. src/agent/tool_orchestrator.py +402 -0
  28. src/agent/transcript.py +230 -0
  29. src/agent/verification_loop.py +133 -0
  30. src/agent/work_director.py +136 -0
  31. src/agent/worktree_manager.py +53 -0
  32. src/artifacts/__init__.py +16 -0
  33. src/artifacts/artifact_store.py +456 -0
  34. src/artifacts/verification_graph.py +75 -0
  35. src/auth.py +411 -0
  36. src/cli.py +1290 -0
  37. src/commands.py +1398 -0
  38. src/config.py +762 -0
  39. src/cost_tracker.py +348 -0
  40. src/db/__init__.py +4 -0
  41. src/db/migrations.py +337 -0
  42. src/enterprise/__init__.py +3 -0
  43. src/enterprise/audit_log.py +182 -0
  44. src/enterprise/identity.py +90 -0
  45. src/enterprise/rbac.py +100 -0
  46. src/enterprise/team_config.py +125 -0
  47. src/enterprise/usage_analytics.py +261 -0
  48. src/exceptions.py +207 -0
  49. src/git_workflow.py +651 -0
  50. src/integrations/__init__.py +6 -0
  51. src/integrations/github_actions.py +106 -0
  52. src/integrations/mcp_server.py +333 -0
  53. src/integrations/sentry_integration.py +100 -0
  54. src/integrations/sentry_server.py +82 -0
  55. src/integrations/webhook_security.py +19 -0
  56. src/main.py +27 -0
  57. src/memory/__init__.py +0 -0
  58. src/memory/code_index.py +376 -0
  59. src/memory/compressor.py +378 -0
  60. src/memory/context_memory.py +135 -0
  61. src/memory/continuous_memory.py +234 -0
  62. src/memory/conventions.py +495 -0
  63. src/memory/db.py +1119 -0
  64. src/memory/document_index.py +205 -0
  65. src/memory/file_cache.py +128 -0
  66. src/memory/project_scanner.py +178 -0
  67. src/memory/session_store.py +201 -0
  68. src/models/__init__.py +0 -0
  69. src/models/client.py +715 -0
  70. src/models/definitions.py +459 -0
  71. src/models/router.py +418 -0
  72. src/models/schemas.py +389 -0
  73. src/permissions.py +294 -0
  74. src/remote/__init__.py +5 -0
  75. src/remote/command_filter.py +33 -0
  76. src/remote/models.py +31 -0
  77. src/remote/permission_handler.py +79 -0
  78. src/remote/phone_ui.py +48 -0
  79. src/remote/protocol.py +59 -0
  80. src/remote/qr.py +65 -0
  81. src/remote/server.py +586 -0
  82. src/remote/token_manager.py +61 -0
  83. src/remote/tunnel.py +212 -0
  84. src/repl.py +475 -0
  85. src/runtime/__init__.py +1 -0
  86. src/runtime/branch_farm.py +372 -0
  87. src/runtime/replay.py +351 -0
  88. src/sandbox/__init__.py +2 -0
  89. src/sandbox/hermetic.py +214 -0
  90. src/sandbox/policy.py +44 -0
  91. src/sdk/__init__.py +3 -0
  92. src/sdk/plugin_base.py +39 -0
  93. src/sdk/plugin_host.py +100 -0
  94. src/sdk/plugin_loader.py +101 -0
  95. src/security.py +409 -0
  96. src/server/__init__.py +7 -0
  97. src/server/bridge.py +427 -0
  98. src/server/bridge_cli.py +103 -0
  99. src/server/bridge_client.py +170 -0
  100. src/server/protocol_version.py +103 -0
  101. src/session/__init__.py +10 -0
  102. src/session/event_fanout.py +46 -0
  103. src/session/input_broker.py +38 -0
  104. src/session/permission_bridge.py +100 -0
  105. src/tools/__init__.py +160 -0
  106. src/tools/_atomic.py +72 -0
  107. src/tools/agent_tools.py +423 -0
  108. src/tools/ask_user_tool.py +83 -0
  109. src/tools/bash_tool.py +384 -0
  110. src/tools/browser_tool.py +352 -0
  111. src/tools/browser_tools.py +179 -0
  112. src/tools/dep_tools.py +210 -0
  113. src/tools/document_reader.py +167 -0
  114. src/tools/document_tool.py +240 -0
  115. src/tools/document_writer.py +171 -0
  116. src/tools/impact_tools.py +240 -0
  117. src/tools/playwright_tool.py +172 -0
  118. src/tools/quality_tools.py +366 -0
  119. src/tools/read_tools.py +318 -0
  120. src/tools/result_cache.py +157 -0
  121. src/tools/search_tools.py +310 -0
  122. src/tools/shell_tools.py +311 -0
  123. src/tools/write_tools.py +337 -0
  124. src/voice/__init__.py +25 -0
  125. src/voice/audio_capture.py +92 -0
  126. src/voice/audio_playback.py +68 -0
  127. src/voice/errors.py +14 -0
  128. src/voice/models.py +35 -0
  129. src/voice/providers.py +143 -0
  130. src/voice/vad.py +55 -0
  131. src/voice/voice_loop.py +156 -0
@@ -0,0 +1,648 @@
1
+ """Multi-agent review gate — uses Grok Debate model to review security-sensitive changes.
2
+
3
+ Triggered automatically by the agent loop before:
4
+ - Multi-file changes touching auth/session/crypto/permissions/SQL/file I/O
5
+ - Architectural decisions (3+ files affected)
6
+ - After a debug loop that needed 3+ iterations
7
+
8
+ Uses the Debate model tier (Grok) or Reasoner tier (other providers) for review.
9
+ Falls back to a single-model review if the Debate model is unavailable.
10
+
11
+ Output: ReviewReport (Pydantic, from src.models.schemas).
12
+ - CRITICAL findings: agent loop BLOCKS until resolved
13
+ - HIGH findings: show to user with "Fix automatically? [Y/n]"
14
+ - LOW findings: logged silently
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import subprocess
20
+ from concurrent.futures import ThreadPoolExecutor
21
+ from concurrent.futures import TimeoutError as FutureTimeoutError
22
+ from concurrent.futures import as_completed
23
+ from dataclasses import dataclass
24
+ from pathlib import Path
25
+ from typing import Any
26
+
27
+ import openai
28
+
29
+ from src.models.definitions import ModelTier, Provider, get_model
30
+ from src.models.schemas import ReviewReport, Severity
31
+
32
+ __all__ = ["ReviewGate", "ReviewTrigger", "ReviewGateResult", "DisagreementDetector",
33
+ "ConfidenceSignals", "aggregate_confidence"]
34
+
35
+ log = logging.getLogger(__name__)
36
+
37
+ _REVIEW_MAX_TOKENS: int = 1_500
38
+ _DIFF_MAX_CHARS: int = 6_000
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Confidence scoring (ide-003)
43
+ # ---------------------------------------------------------------------------
44
+
45
@dataclass
class ConfidenceSignals:
    """Boolean inputs consumed by :func:`aggregate_confidence` for one code hunk.

    Each field is a yes/no signal; the weighting applied to each one lives in
    :func:`aggregate_confidence`, not here.
    """

    # True when the test run for the hunk succeeded.
    tests_pass: bool
    # True when the review step approved the hunk.
    review_approved: bool
    # True when linting reported nothing.
    lint_clean: bool
    # True when no security findings were raised.
    no_security_findings: bool
53
+
54
+
55
def aggregate_confidence(signals: ConfidenceSignals) -> int:
    """Collapse *signals* into a single integer confidence score in [0, 100].

    The score starts at 40 and every signal that is set adds its weight:

    - tests_pass: +30
    - review_approved: +20
    - lint_clean: +5
    - no_security_findings: +5

    The total is clamped to the 0-100 range before it is returned.
    """
    base_score = 40
    weighted_flags = (
        (signals.tests_pass, 30),
        (signals.review_approved, 20),
        (signals.lint_clean, 5),
        (signals.no_security_findings, 5),
    )
    total = base_score + sum(points for flag, points in weighted_flags if flag)
    return max(0, min(100, total))
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Disagreement detector (cheap pre-filter — no LLM call)
80
+ # ---------------------------------------------------------------------------
81
+
82
class DisagreementDetector:
    """Pre-filter that decides, without any model call, whether a debate is worth running.

    A debate is requested when the diff is large or when the question/diff text
    mentions one of the trigger keywords.
    """

    _DEBATE_TRIGGERS: frozenset[str] = frozenset({
        "security", "auth", "crypto", "architecture", "migration",
        "trade-off", "tradeoff", "breaking change", "performance",
    })

    def needs_debate(self, question: str, diff_text: str) -> bool:
        """Return True when the change should go through the full multi-agent debate.

        Args:
            question: The question being put to the reviewers.
            diff_text: The diff (or other context) under discussion.
        """
        # Size alone is enough: anything over 2,000 characters is debated.
        if len(diff_text) > 2_000:
            return True

        # Case-insensitive substring search over question and diff together.
        haystack = (question + diff_text).lower()
        for keyword in self._DEBATE_TRIGGERS:
            if keyword in haystack:
                return True
        return False
96
+
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # Trigger classification
100
+ # ---------------------------------------------------------------------------
101
+
102
class ReviewTrigger:
    """Determines whether a code change should trigger a review gate.

    A change is reviewed when it touches three or more files, when any changed
    file's *name* contains one of the security-sensitive substrings below, or
    when the caller supplies an explicit trigger reason.
    """

    # Substrings that flag a file name as security-sensitive.
    _SECURITY_PATTERNS: frozenset[str] = frozenset({
        "auth", "session", "password", "token", "secret", "crypto",
        "permission", "sql", "query", "database", "file_io", "subprocess",
        "exec", "eval", "pickle",
    })

    def _matched_patterns(self, path: Path) -> list[str]:
        """Return the security patterns found in *path*'s file name, sorted.

        Only the final path component is inspected (case-insensitively).
        Sorting makes the result independent of frozenset iteration order,
        which varies between interpreter runs due to string hash randomisation.
        """
        name_lower = path.name.lower()
        return sorted(p for p in self._SECURITY_PATTERNS if p in name_lower)

    def should_review(
        self,
        files_changed: list[Path],
        trigger_reason: str = "",
    ) -> bool:
        """Return True if this change warrants multi-agent review.

        Args:
            files_changed: Paths touched by the change.
            trigger_reason: Optional explicit reason; any non-empty value
                forces a review even when no heuristic fires.
        """
        if len(files_changed) >= 3:
            return True
        if any(self._matched_patterns(path) for path in files_changed):
            return True
        return bool(trigger_reason)

    def classify(self, files_changed: list[Path]) -> str:
        """Return a short reason string explaining why review was triggered.

        The checks mirror :meth:`should_review`: file count first, then the
        first file whose name matches a security pattern. Falls back to
        ``"manual trigger"`` when neither heuristic applies.
        """
        if len(files_changed) >= 3:
            return f"architectural change ({len(files_changed)} files)"
        for path in files_changed:
            matched = self._matched_patterns(path)
            if matched:
                return f"security-sensitive file: {path.name} ({', '.join(matched)})"
        return "manual trigger"
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # Review result
140
+ # ---------------------------------------------------------------------------
141
+
142
+ @dataclass
143
+ class ReviewGateResult:
144
+ """Output of a review gate run."""
145
+
146
+ report: ReviewReport
147
+ trigger_reason: str
148
+ model_used: str
149
+ files_reviewed: list[Path]
150
+
151
+ @property
152
+ def blocks_merge(self) -> bool:
153
+ """True if there are CRITICAL findings that must be resolved."""
154
+ return any(f.severity == Severity.CRITICAL for f in self.report.findings)
155
+
156
+ def critical_findings(self) -> list[str]:
157
+ return [
158
+ f.message for f in self.report.findings
159
+ if f.severity == Severity.CRITICAL
160
+ ]
161
+
162
+ def high_findings(self) -> list[str]:
163
+ return [
164
+ f.message for f in self.report.findings
165
+ if f.severity == Severity.HIGH
166
+ ]
167
+
168
+ def format_for_user(self) -> str:
169
+ """Rich-markup string for display in the REPL."""
170
+ lines: list[str] = []
171
+ icon = "❌" if self.blocks_merge else "⚠" if self.high_findings() else "✓"
172
+ lines.append(f"{icon} [bold]Review Gate:[/bold] {self.trigger_reason}")
173
+ for f in self.report.findings:
174
+ color = {
175
+ Severity.CRITICAL: "red",
176
+ Severity.HIGH: "yellow",
177
+ Severity.MEDIUM: "cyan",
178
+ Severity.LOW: "dim",
179
+ Severity.INFO: "dim",
180
+ }.get(f.severity, "white")
181
+ loc = f" ({f.file}:{f.line})" if f.file and f.line else ""
182
+ lines.append(f" [{color}]{f.severity.value.upper()}[/{color}]{loc}: {f.message}")
183
+ return "\n".join(lines)
184
+
185
+
186
+ # ---------------------------------------------------------------------------
187
+ # ReviewGate
188
+ # ---------------------------------------------------------------------------
189
+
190
class ReviewGate:
    """Runs the multi-agent review gate on a set of changes.

    Uses the Debate model tier when available (Grok only).
    Falls back to Reasoner for other providers.

    Usage::

        gate = ReviewGate(cfg)
        result = gate.review(
            files_changed=[Path("src/auth.py")],
            diff_text="...",
            trigger_reason="security-sensitive file: auth.py",
        )
        if result.blocks_merge:
            # Surface critical findings before proceeding
    """

    def __init__(self, cfg: Any, vg: Any = None) -> None:  # cfg: GdmConfig
        """Store the config, pick the review model and build the API client.

        Args:
            cfg: Project configuration; ``provider`` and ``api_key`` are read
                here, and ``debate_model`` by the ``debate_*`` methods.
            vg: Optional object stored on the instance; not used by any
                method of this class.
        """
        self._cfg = cfg
        self._vg = vg
        self._trigger = ReviewTrigger()
        self._model = self._pick_model()
        self._client = openai.OpenAI(
            api_key=cfg.api_key,
            base_url=_base_url(cfg.provider),
        )

    def review(
        self,
        files_changed: list[Path],
        diff_text: str,
        trigger_reason: str = "",
        node_id: "str | None" = None,
    ) -> ReviewGateResult:
        """Run the review gate. Model failures are logged, not raised.

        Args:
            files_changed: Paths touched by the change.
            diff_text: Diff to review; truncated to ``_DIFF_MAX_CHARS``.
            trigger_reason: Why the review runs; derived from the file list
                when empty.
            node_id: Accepted for interface compatibility; currently unused.

        Returns:
            A ReviewGateResult. If the model call fails, the report is an
            empty, *approved* one with confidence 0.5.
        """
        if not trigger_reason:
            trigger_reason = self._trigger.classify(files_changed)

        truncated_diff = _truncate_diff(diff_text, _DIFF_MAX_CHARS)
        prompt = _build_review_prompt(files_changed, truncated_diff, trigger_reason)

        try:
            report = self._call_model(prompt)
        except Exception as exc:  # noqa: BLE001
            log.warning("Review gate failed: %s — returning empty report", exc)
            # NOTE(review): this fallback fails OPEN (approved=True), so an
            # outage lets changes through unreviewed; confirm that is intended.
            report = ReviewReport(approved=True, confidence=0.5, findings=[], summary="")

        return ReviewGateResult(
            report=report,
            trigger_reason=trigger_reason,
            model_used=self._model.id,
            files_reviewed=files_changed,
        )

    # run is an alias for review (backward compat)
    run = review

    def debate_adr(
        self,
        task_description: str,
        planned_files: list[Path],
    ) -> Any:  # -> ADRReport
        """Generate an Architecture Decision Record for a large planned change.

        Never raises: on any failure an ADRReport with empty fields is returned.
        """
        from src.models.schemas import ADRReport
        try:
            file_list = ", ".join(str(p) for p in planned_files)
            messages = [
                {"role": "system", "content": _ADR_SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"Task: {task_description}\nFiles planned: {file_list}"
                )},
            ]
            return self._parse_structured(
                model_id=self._cfg.debate_model or self._model.id,
                messages=messages,
                response_format=ADRReport,
            )
        except Exception as exc:  # noqa: BLE001
            log.warning("debate_adr failed: %s — returning empty report", exc)
            return ADRReport(decision="", rationale="", alternatives_rejected=[], risks=[])

    def debate_security(
        self,
        file_path: Path,
        diff_text: str,
    ) -> Any:  # -> ThreatModelReport
        """Run security threat modeling on a changed file and its diff.

        Never raises: on any failure an empty, low-severity ThreatModelReport
        is returned.
        """
        from src.models.schemas import ThreatModelReport
        try:
            truncated_diff = _truncate_diff(diff_text, _DIFF_MAX_CHARS)
            messages = [
                {"role": "system", "content": _SECURITY_SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"File: {file_path}\n\nDiff:\n```\n{truncated_diff}\n```"
                )},
            ]
            return self._parse_structured(
                model_id=self._cfg.debate_model or self._model.id,
                messages=messages,
                response_format=ThreatModelReport,
            )
        except Exception as exc:  # noqa: BLE001
            log.warning("debate_security failed: %s — returning empty report", exc)
            return ThreatModelReport(attack_surfaces=[], mitigations=[], severity="low")

    def debate_debug(
        self,
        error: str,
        attempts_history: list[str],
    ) -> Any:  # -> DebugHypotheses
        """Generate fresh debug hypotheses after the ensemble loop is exhausted.

        Never raises: on any failure an empty DebugHypotheses is returned.
        """
        from src.models.schemas import DebugHypotheses
        try:
            history_text = "\n".join(f"- {a}" for a in attempts_history)
            messages = [
                {"role": "system", "content": _DEBUG_SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"Error:\n{error}\n\nPrevious fix attempts:\n{history_text}"
                )},
            ]
            return self._parse_structured(
                model_id=self._cfg.debate_model or self._model.id,
                messages=messages,
                response_format=DebugHypotheses,
            )
        except Exception as exc:  # noqa: BLE001
            log.warning("debate_debug failed: %s — returning empty report", exc)
            return DebugHypotheses(
                hypotheses=[], suggested_next_action="", dead_ends_identified=[]
            )

    def _parse_structured(
        self,
        *,
        model_id: str,
        messages: list[Any],
        response_format: Any,
        empty_error: str = "Model returned no parsed output",
    ) -> Any:
        """Issue one structured-output completion and return the parsed object.

        Args:
            model_id: Model identifier to call.
            messages: Chat messages (system + user) to send.
            response_format: Schema class the response is parsed into.
            empty_error: Message for the ValueError raised on an empty parse.

        Raises:
            ValueError: If the response carries no parsed payload.
            Exception: Anything raised by the client call is propagated.
        """
        response = self._client.beta.chat.completions.parse(
            model=model_id,
            messages=messages,
            response_format=response_format,
            max_tokens=_REVIEW_MAX_TOKENS,
        )
        parsed = response.choices[0].message.parsed
        if parsed is None:
            raise ValueError(empty_error)
        return parsed

    def _call_model(self, prompt: str) -> ReviewReport:
        """Call the review model using structured output (parse).

        Raises:
            ValueError: If the model returns no parsed ReviewReport.
        """
        messages = [
            {"role": "system", "content": _REVIEW_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]
        return self._parse_structured(
            model_id=self._model.id,
            messages=messages,
            response_format=ReviewReport,
        )

    def _pick_model(self) -> Any:  # -> ModelDef
        """Pick review model: Debate for Grok, Reasoner for others."""
        tier = ModelTier.DEBATE if self._cfg.provider == Provider.GROK else ModelTier.REASONER
        return get_model(tier, self._cfg.provider)

    # ------------------------------------------------------------------
    # 4-agent debate
    # ------------------------------------------------------------------

    def run_debate(
        self,
        question: str,
        context: str,
        *,
        agent_count: int = 4,
        timeout_secs: float = 30.0,
    ) -> Any:  # -> DebateReport
        """Run a parallel multi-agent debate and return a synthesised DebateReport.

        Returns a ``debate_skipped=True`` report immediately when
        :class:`DisagreementDetector` decides no debate is needed.
        Falls back to a single-model review when fewer than 2 agents respond.

        Args:
            question: The question put to every agent.
            context: Diff or other context appended to the question.
            agent_count: How many of the four roles to run (at most 4).
            timeout_secs: Overall time budget for collecting agent answers.
        """
        from src.models.schemas import DebateReport

        detector = DisagreementDetector()
        if not detector.needs_debate(question, context):
            return DebateReport(
                perspectives=[],
                consensus_points=[],
                disagreements=[],
                missing_evidence=[],
                recommendation="Debate skipped — no disagreement triggers detected",
                approved=True,
                confidence=0.5,
                debate_skipped=True,
            )

        from src.agent.system_prompt import (
            _ARCHITECT_PROMPT,
            _DEVIL_ADVOCATE_PROMPT,
            _PERFORMANCE_PROMPT,
            _SECURITY_PROMPT,
        )

        # Clamp at zero: a negative count would otherwise slice from the end
        # and silently run a surprising subset of agents.
        prompts: list[tuple[str, str]] = [
            ("architect", _ARCHITECT_PROMPT),
            ("security", _SECURITY_PROMPT),
            ("performance", _PERFORMANCE_PROMPT),
            ("devil_advocate", _DEVIL_ADVOCATE_PROMPT),
        ][:max(0, agent_count)]

        user_content = f"{question}\n\nContext/Diff:\n{context}"
        results = self._run_agents_parallel(prompts, user_content, timeout_secs=timeout_secs)

        active = [r for r in results if r is not None]
        if len(active) < 2:
            log.warning(
                "Only %d/%d debate agents responded — falling back to single-model review",
                len(active),
                len(prompts),
            )
            return self._single_model_review(question, context)

        return self._synthesize(active)

    def _run_agents_parallel(
        self,
        prompts: list[tuple[str, str]],
        user_content: str,
        timeout_secs: float = 30.0,
    ) -> list[Any]:
        """Run every agent call on a thread pool and collect results within the time budget.

        Args:
            prompts: ``(role, system_prompt)`` pairs, one per agent.
            user_content: User message sent to every agent.
            timeout_secs: Maximum time to wait for agents to finish.

        Returns:
            A list aligned with *prompts*; entries are ``None`` for agents
            that failed or did not finish before the timeout.
        """
        results: list[Any] = [None] * len(prompts)
        if not prompts:
            return results

        # The pool is managed by hand: a ``with`` block would call
        # shutdown(wait=True) on exit and block until the slowest agent
        # returns, which defeats ``timeout_secs``.
        pool = ThreadPoolExecutor(max_workers=4)
        try:
            futures = {
                pool.submit(self._call_single_agent, role, sys_prompt, user_content): i
                for i, (role, sys_prompt) in enumerate(prompts)
            }
            try:
                for future in as_completed(futures, timeout=timeout_secs):
                    i = futures[future]
                    try:
                        results[i] = future.result()
                    except Exception as exc:  # noqa: BLE001
                        log.warning(
                            "Debate agent %d (%s) failed: %s",
                            i,
                            prompts[i][0],
                            exc,
                        )
            except FutureTimeoutError:
                log.warning(
                    "Debate agent pool timed out after %.1fs; collecting partial results",
                    timeout_secs,
                )
                # Drop calls that have not started; running ones cannot be
                # interrupted and are left to finish in the background.
                for future in futures:
                    future.cancel()
        finally:
            pool.shutdown(wait=False)
        return results

    def _call_single_agent(
        self,
        role: str,
        sys_prompt: str,
        user_content: str,
    ) -> Any:  # -> AgentPerspective
        """Call one debate agent and return its AgentPerspective.

        Raises:
            ValueError: If the agent returns no parsed output.
        """
        from src.models.schemas import AgentPerspective

        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_content},
        ]
        return self._parse_structured(
            model_id=self._model.id,
            messages=messages,
            response_format=AgentPerspective,
            empty_error=f"Debate agent '{role}' returned no parsed output",
        )

    def _synthesize(self, perspectives: list[Any]) -> Any:  # -> DebateReport
        """Aggregate multiple AgentPerspective results into a DebateReport (rule-based, no LLM)."""
        from collections import Counter

        from src.models.schemas import DebateReport, Severity

        all_findings = [f for p in perspectives for f in p.findings]

        # Consensus: findings whose message appears in 2+ perspectives.
        # Messages are de-duplicated per perspective first, so one agent
        # repeating itself does not count as agreement.
        msg_counts: Counter[str] = Counter(
            message
            for p in perspectives
            for message in dict.fromkeys(f.message for f in p.findings)
        )
        consensus_points = [msg for msg, cnt in msg_counts.items() if cnt > 1]

        # Disagreements: identify confidence-level splits between agents
        disagreements: list[str] = []
        confident = [p.role for p in perspectives if p.confidence > 0.6]
        skeptical = [p.role for p in perspectives if p.confidence <= 0.4]
        if confident and skeptical:
            disagreements.append(
                f"Confidence split — {', '.join(confident)} are confident vs "
                f"{', '.join(skeptical)} are skeptical"
            )

        # Missing evidence: roles absent from this debate
        represented = {p.role for p in perspectives}
        all_roles = {"architect", "security", "performance", "devil_advocate"}
        missing_evidence = [
            f"No perspective from: {r}" for r in sorted(all_roles - represented)
        ]

        # Approval: block on any CRITICAL finding, or 3+ HIGH findings
        critical = sum(1 for f in all_findings if f.severity == Severity.CRITICAL)
        high = sum(1 for f in all_findings if f.severity == Severity.HIGH)
        approved = critical == 0 and high < 3

        # Guard the mean against an empty list instead of dividing by zero.
        if perspectives:
            avg_confidence = sum(p.confidence for p in perspectives) / len(perspectives)
        else:
            avg_confidence = 0.0
        avg_confidence = min(1.0, max(0.0, avg_confidence))

        role_summaries = "; ".join(f"{p.role}: {p.summary}" for p in perspectives)
        recommendation = (
            f"Multi-agent synthesis ({len(perspectives)} agents). {role_summaries}"
        )

        return DebateReport(
            perspectives=perspectives,
            consensus_points=consensus_points,
            disagreements=disagreements,
            missing_evidence=missing_evidence,
            recommendation=recommendation,
            approved=approved,
            confidence=avg_confidence,
        )

    def _single_model_review(self, question: str, context: str) -> Any:  # -> DebateReport
        """Fallback: run the existing single-model review and wrap it in a DebateReport.

        Never raises: if the single-model call fails too, an empty report with
        confidence 0.0 is returned.
        """
        from src.models.schemas import DebateReport

        try:
            report = self._call_model(f"{question}\n\nContext:\n{context}")
            return DebateReport(
                perspectives=[],
                consensus_points=[],
                disagreements=[],
                missing_evidence=[],
                recommendation=report.summary,
                approved=report.approved,
                confidence=report.confidence,
                debate_skipped=True,
            )
        except Exception as exc:  # noqa: BLE001
            log.warning("Single-model review fallback also failed: %s", exc)
            # NOTE(review): fails OPEN (approved=True) when nothing could be
            # reviewed; callers should check confidence == 0.0.
            return DebateReport(
                perspectives=[],
                consensus_points=[],
                disagreements=[],
                missing_evidence=[],
                recommendation="Review failed — no agents responded",
                approved=True,
                confidence=0.0,
                debate_skipped=True,
            )
560
+
561
+
562
+ # ---------------------------------------------------------------------------
563
+ # Helpers
564
+ # ---------------------------------------------------------------------------
565
+
566
+ def _collect_diff(files: list[Path], project_root: Path) -> str:
567
+ """Collect git diff output for the given files. Returns empty string on failure."""
568
+ parts: list[str] = []
569
+ for p in files[:5]:
570
+ try:
571
+ r = subprocess.run(
572
+ ["git", "diff", "HEAD", "--", str(p)],
573
+ capture_output=True,
574
+ text=True,
575
+ timeout=10,
576
+ cwd=str(project_root),
577
+ )
578
+ if r.stdout.strip():
579
+ parts.append(r.stdout)
580
+ except Exception: # noqa: BLE001
581
+ pass
582
+ return "\n".join(parts)
583
+
584
+
585
def _base_url(provider: str) -> str:
    """Look up the API base URL for *provider*, defaulting to the Grok entry when it is not listed."""
    from src.models.definitions import PROVIDER_BASE_URLS

    fallback_url = PROVIDER_BASE_URLS[Provider.GROK]
    return PROVIDER_BASE_URLS.get(provider, fallback_url)
588
+
589
+
590
+ def _truncate_diff(diff: str, max_chars: int) -> str:
591
+ if len(diff) <= max_chars:
592
+ return diff
593
+ return diff[:max_chars] + "\n...(diff truncated)"
594
+
595
+
596
+ def _build_review_prompt(
597
+ files: list[Path], diff: str, reason: str
598
+ ) -> str:
599
+ file_list = ", ".join(f.name for f in files)
600
+ return (
601
+ f"Review trigger: {reason}\n"
602
+ f"Files changed: {file_list}\n\n"
603
+ f"Diff:\n```\n{diff}\n```\n\n"
604
+ "Identify all security vulnerabilities, logic errors, and architectural issues. "
605
+ "Be precise about file:line locations."
606
+ )
607
+
608
+
609
+ _REVIEW_SYSTEM_PROMPT = """You are a senior security engineer reviewing code changes.
610
+ Your job is to find REAL bugs, security vulnerabilities, and architectural flaws.
611
+ Do NOT comment on style, formatting, or cosmetic issues.
612
+
613
+ CRITICAL: SQL injection, XSS, auth bypass, path traversal, insecure deserialization
614
+ HIGH: Race conditions, memory leaks, incorrect error handling, missing validation
615
+ MEDIUM: Missing tests, inconsistent naming, unclear logic
616
+ LOW: Minor improvements, alternative approaches
617
+
618
+ Return a JSON object matching the ReviewReport schema exactly.
619
+ approved=true means the code can proceed (findings may still exist at LOW/MEDIUM level).
620
+ approved=false means CRITICAL or multiple HIGH findings block the merge.
621
+ """
622
+
623
+ _ADR_SYSTEM_PROMPT = """You are an architecture review expert.
624
+ Analyze the planned task and the files that will be changed.
625
+ Provide an Architecture Decision Record with:
626
+ - A clear, concise decision statement
627
+ - Rationale for this approach
628
+ - Alternatives that were considered and rejected
629
+ - Known risks of this decision
630
+
631
+ Return a JSON object matching the ADRReport schema exactly.
632
+ """
633
+
634
+ _SECURITY_SYSTEM_PROMPT = """You are a security threat modeling expert.
635
+ Analyze this code diff for security vulnerabilities and attack surfaces.
636
+ Focus on concrete, exploitable threats — not hypothetical ones.
637
+ Identify practical mitigations for each attack surface found.
638
+
639
+ Return a JSON object matching the ThreatModelReport schema exactly.
640
+ """
641
+
642
+ _DEBUG_SYSTEM_PROMPT = """You are a debugging expert with deep knowledge of Python.
643
+ Given this error and the previous failed fix attempts, generate ranked hypotheses
644
+ about the root cause. Focus on causes, not symptoms.
645
+ Identify dead ends from the previous attempts to avoid repeating them.
646
+
647
+ Return a JSON object matching the DebugHypotheses schema exactly.
648
+ """