gdmcode 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gdmcode-0.1.0.dist-info/METADATA +240 -0
- gdmcode-0.1.0.dist-info/RECORD +131 -0
- gdmcode-0.1.0.dist-info/WHEEL +4 -0
- gdmcode-0.1.0.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/_internal/__init__.py +0 -0
- src/_internal/constants.py +244 -0
- src/_internal/domain_skills.py +339 -0
- src/agent/__init__.py +0 -0
- src/agent/commit_classifier.py +91 -0
- src/agent/context_budget.py +391 -0
- src/agent/daemon.py +681 -0
- src/agent/dag_validator.py +153 -0
- src/agent/debug_loop.py +473 -0
- src/agent/impact_analyzer.py +149 -0
- src/agent/impact_graph.py +117 -0
- src/agent/loop.py +1410 -0
- src/agent/orchestrator.py +141 -0
- src/agent/regression_guard.py +251 -0
- src/agent/review_gate.py +648 -0
- src/agent/risk_scorer.py +169 -0
- src/agent/self_healing.py +145 -0
- src/agent/smart_test_selector.py +89 -0
- src/agent/system_prompt.py +226 -0
- src/agent/task_tracker.py +320 -0
- src/agent/test_validator.py +210 -0
- src/agent/tool_orchestrator.py +402 -0
- src/agent/transcript.py +230 -0
- src/agent/verification_loop.py +133 -0
- src/agent/work_director.py +136 -0
- src/agent/worktree_manager.py +53 -0
- src/artifacts/__init__.py +16 -0
- src/artifacts/artifact_store.py +456 -0
- src/artifacts/verification_graph.py +75 -0
- src/auth.py +411 -0
- src/cli.py +1290 -0
- src/commands.py +1398 -0
- src/config.py +762 -0
- src/cost_tracker.py +348 -0
- src/db/__init__.py +4 -0
- src/db/migrations.py +337 -0
- src/enterprise/__init__.py +3 -0
- src/enterprise/audit_log.py +182 -0
- src/enterprise/identity.py +90 -0
- src/enterprise/rbac.py +100 -0
- src/enterprise/team_config.py +125 -0
- src/enterprise/usage_analytics.py +261 -0
- src/exceptions.py +207 -0
- src/git_workflow.py +651 -0
- src/integrations/__init__.py +6 -0
- src/integrations/github_actions.py +106 -0
- src/integrations/mcp_server.py +333 -0
- src/integrations/sentry_integration.py +100 -0
- src/integrations/sentry_server.py +82 -0
- src/integrations/webhook_security.py +19 -0
- src/main.py +27 -0
- src/memory/__init__.py +0 -0
- src/memory/code_index.py +376 -0
- src/memory/compressor.py +378 -0
- src/memory/context_memory.py +135 -0
- src/memory/continuous_memory.py +234 -0
- src/memory/conventions.py +495 -0
- src/memory/db.py +1119 -0
- src/memory/document_index.py +205 -0
- src/memory/file_cache.py +128 -0
- src/memory/project_scanner.py +178 -0
- src/memory/session_store.py +201 -0
- src/models/__init__.py +0 -0
- src/models/client.py +715 -0
- src/models/definitions.py +459 -0
- src/models/router.py +418 -0
- src/models/schemas.py +389 -0
- src/permissions.py +294 -0
- src/remote/__init__.py +5 -0
- src/remote/command_filter.py +33 -0
- src/remote/models.py +31 -0
- src/remote/permission_handler.py +79 -0
- src/remote/phone_ui.py +48 -0
- src/remote/protocol.py +59 -0
- src/remote/qr.py +65 -0
- src/remote/server.py +586 -0
- src/remote/token_manager.py +61 -0
- src/remote/tunnel.py +212 -0
- src/repl.py +475 -0
- src/runtime/__init__.py +1 -0
- src/runtime/branch_farm.py +372 -0
- src/runtime/replay.py +351 -0
- src/sandbox/__init__.py +2 -0
- src/sandbox/hermetic.py +214 -0
- src/sandbox/policy.py +44 -0
- src/sdk/__init__.py +3 -0
- src/sdk/plugin_base.py +39 -0
- src/sdk/plugin_host.py +100 -0
- src/sdk/plugin_loader.py +101 -0
- src/security.py +409 -0
- src/server/__init__.py +7 -0
- src/server/bridge.py +427 -0
- src/server/bridge_cli.py +103 -0
- src/server/bridge_client.py +170 -0
- src/server/protocol_version.py +103 -0
- src/session/__init__.py +10 -0
- src/session/event_fanout.py +46 -0
- src/session/input_broker.py +38 -0
- src/session/permission_bridge.py +100 -0
- src/tools/__init__.py +160 -0
- src/tools/_atomic.py +72 -0
- src/tools/agent_tools.py +423 -0
- src/tools/ask_user_tool.py +83 -0
- src/tools/bash_tool.py +384 -0
- src/tools/browser_tool.py +352 -0
- src/tools/browser_tools.py +179 -0
- src/tools/dep_tools.py +210 -0
- src/tools/document_reader.py +167 -0
- src/tools/document_tool.py +240 -0
- src/tools/document_writer.py +171 -0
- src/tools/impact_tools.py +240 -0
- src/tools/playwright_tool.py +172 -0
- src/tools/quality_tools.py +366 -0
- src/tools/read_tools.py +318 -0
- src/tools/result_cache.py +157 -0
- src/tools/search_tools.py +310 -0
- src/tools/shell_tools.py +311 -0
- src/tools/write_tools.py +337 -0
- src/voice/__init__.py +25 -0
- src/voice/audio_capture.py +92 -0
- src/voice/audio_playback.py +68 -0
- src/voice/errors.py +14 -0
- src/voice/models.py +35 -0
- src/voice/providers.py +143 -0
- src/voice/vad.py +55 -0
- src/voice/voice_loop.py +156 -0
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import copy
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class DagNode:
    """A single node of the task DAG.

    Attributes:
        id: Identifier of the node; other nodes name it in ``depends_on``.
        description: Free-text description of the node.
        depends_on: IDs of the nodes this node depends on. Each instance
            gets its own empty list by default.
        model_tier: Model tier label for the node, ``"sonnet"`` by default.
        write_capable: Boolean flag, ``False`` by default.
            NOTE(review): presumably marks nodes allowed to modify files --
            confirm against the code that consumes the DAG.
    """

    # Required fields come first so positional construction keeps working.
    id: str
    description: str

    # Optional fields.
    depends_on: list[str] = field(default_factory=list)
    model_tier: str = "sonnet"
    write_capable: bool = False
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class ValidationResult:
    """Outcome of validating a task DAG.

    Attributes:
        is_valid: ``True`` when no problems were recorded.
        dag: The nodes that were parsed from the raw input (may be empty when
            parsing itself failed).
        errors: Human-readable problem descriptions; empty by default.
        repairable: Whether the producer of this result considers the
            problems fixable; ``False`` by default.
    """

    # Required fields.
    is_valid: bool
    dag: list[DagNode]

    # Optional fields.
    errors: list[str] = field(default_factory=list)
    repairable: bool = False
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DagValidator:
    """Validate, repair and linearise a task DAG given as raw dicts.

    Each raw dict describes one node. When the dicts are converted to
    ``DagNode`` objects, keys that are not ``DagNode`` fields are ignored.

    Limits enforced by :meth:`validate`:

    * ``MAX_NODES``   -- maximum number of nodes.
    * ``MAX_DEPTH``   -- maximum number of nodes on the longest dependency
      chain (a node without dependencies has depth 1).
    * ``MAX_FAN_OUT`` -- maximum number of nodes that depend on one node.
    """

    MAX_NODES = 12
    MAX_DEPTH = 4
    MAX_FAN_OUT = 3

    def __init__(self, raw: list[dict]):
        """Keep a reference to the raw node dicts; nothing is checked yet."""
        self._raw = raw

    def validate(self) -> "ValidationResult":
        """Check the raw DAG and report every problem found.

        Checks run in this order: node construction, unique IDs, node cap,
        missing dependencies, self-dependencies, cycles, depth cap, fan-out
        cap. Three conditions return early:

        * a node dict cannot be turned into a ``DagNode`` -> empty ``dag``,
          the exception text as the only error, not repairable;
        * more than ``MAX_NODES`` nodes -> not repairable;
        * a cycle -> repairable (see :meth:`auto_repair`).

        Returns:
            A ``ValidationResult``. When the end of the checks is reached,
            ``repairable`` is true exactly when at least one error was
            recorded.
        """
        errors: list[str] = []
        try:
            nodes = [
                DagNode(**{k: v for k, v in n.items() if k in DagNode.__dataclass_fields__})
                for n in self._raw
            ]
        except Exception as e:
            # Malformed input (non-dict entry, missing required field, ...).
            return ValidationResult(is_valid=False, dag=[], errors=[str(e)], repairable=False)

        ids = [n.id for n in nodes]
        id_set = set(ids)

        # Unique IDs
        if len(ids) != len(id_set):
            errors.append("Duplicate node IDs")

        # Node cap
        if len(nodes) > self.MAX_NODES:
            errors.append(f"Too many nodes: {len(nodes)} > {self.MAX_NODES}")
            return ValidationResult(is_valid=False, dag=nodes, errors=errors, repairable=False)

        # All deps exist
        for n in nodes:
            for dep in n.depends_on:
                if dep not in id_set:
                    errors.append(f"Missing dep '{dep}' in node '{n.id}'")

        # Self-deps (reported after all missing deps to keep message order stable)
        for n in nodes:
            if n.id in n.depends_on:
                errors.append(f"Self-dependency in '{n.id}'")

        # Cycle detection (Kahn's algorithm)
        if self._has_cycle(nodes):
            errors.append("Cycle detected in DAG")
            return ValidationResult(is_valid=False, dag=nodes, errors=errors, repairable=True)

        # Depth cap -- only safe to compute once the graph is known to be acyclic.
        depth = self._max_depth(nodes)
        if depth > self.MAX_DEPTH:
            errors.append(f"DAG depth {depth} exceeds max {self.MAX_DEPTH}")

        # Fan-out cap
        for n in nodes:
            fan_out = sum(1 for m in nodes if n.id in m.depends_on)
            if fan_out > self.MAX_FAN_OUT:
                errors.append(f"Node '{n.id}' fan-out {fan_out} exceeds max {self.MAX_FAN_OUT}")

        # A cycle always returns above, so every error that reaches this point
        # is a non-cycle error. Do not inspect the message text: it embeds
        # user-supplied node IDs, which may themselves contain "Cycle".
        return ValidationResult(
            is_valid=not errors,
            dag=nodes,
            errors=errors,
            repairable=bool(errors),
        )

    def auto_repair(self) -> list[dict]:
        """Remove back-edges that cause cycles. Returns raw dict list.

        Works on a deep copy, so the validator's own raw input is untouched.
        A depth-first walk follows ``depends_on`` edges; an edge that points
        at a node still on the current DFS path is a back-edge and is removed
        (this also removes self-dependencies).

        The result is built from an ID-keyed mapping, so when several raw
        dicts share an ID only the last one is returned.

        Raises:
            KeyError: If a raw dict has no ``"id"`` key.
        """
        raw = copy.deepcopy(self._raw)
        id_map = {n["id"]: n for n in raw}
        visited: set = set()
        on_path: set = set()
        back_edges: list[tuple] = []

        def dfs(node_id):
            visited.add(node_id)
            on_path.add(node_id)
            # Unknown IDs resolve to an empty dict and therefore have no deps.
            for dep in list(id_map.get(node_id, {}).get("depends_on", [])):
                if dep not in visited:
                    dfs(dep)
                elif dep in on_path:
                    back_edges.append((node_id, dep))
            on_path.discard(node_id)

        for nid in id_map:
            if nid not in visited:
                dfs(nid)

        # Edges are removed after the walk so iteration is never disturbed.
        for node_id, dep in back_edges:
            id_map[node_id]["depends_on"].remove(dep)
        return list(id_map.values())

    def fallback_linear(self) -> list[dict]:
        """Return a linear chain (no parallelism) over the first MAX_NODES nodes.

        Every node depends on exactly the node before it; the first node has
        no dependencies. Works on a deep copy of the raw input.

        NOTE(review): a chain of more than ``MAX_DEPTH`` nodes exceeds the
        depth cap applied by :meth:`validate`, and duplicate IDs are carried
        over unchanged, so the result is only certain to be cycle-free, not
        certain to pass ``validate()``. Confirm callers do not re-validate.

        Raises:
            KeyError: If a raw dict that precedes another has no ``"id"`` key.
        """
        chain = copy.deepcopy(self._raw[: self.MAX_NODES])
        previous_id = None
        for position, node in enumerate(chain):
            node["depends_on"] = [previous_id] if position > 0 else []
            previous_id = node["id"] if position + 1 < len(chain) else None
        return chain

    def _has_cycle(self, nodes: "list[DagNode]") -> bool:
        """Kahn's algorithm: cycle exists if topological sort can't include all nodes.

        Dependencies on unknown IDs are ignored. Nodes sharing an ID are
        treated as one graph vertex, so the processed count is compared with
        the number of distinct IDs -- comparing with ``len(nodes)`` would
        report a cycle whenever an ID is duplicated.
        """
        in_degree = {n.id: 0 for n in nodes}
        dependents = {n.id: [] for n in nodes}
        for n in nodes:
            for dep in n.depends_on:
                if dep in in_degree:
                    in_degree[n.id] += 1
                    dependents[dep].append(n.id)

        ready = [nid for nid, deg in in_degree.items() if deg == 0]
        processed = 0
        while ready:
            nid = ready.pop()
            processed += 1
            for child in dependents[nid]:
                in_degree[child] -= 1
                if in_degree[child] == 0:
                    ready.append(child)
        return processed != len(in_degree)

    def _max_depth(self, nodes: "list[DagNode]") -> int:
        """Return the node count of the longest dependency chain (0 if empty).

        Dependencies on unknown IDs are ignored. The computation is
        recursive and memoised; it must only be called on an acyclic graph,
        otherwise the recursion does not terminate.
        """
        id_map = {n.id: n for n in nodes}
        memo: dict = {}

        def depth(nid):
            if nid not in memo:
                node = id_map.get(nid)
                known_deps = [dep for dep in node.depends_on if dep in id_map] if node else []
                memo[nid] = 1 + max((depth(dep) for dep in known_deps), default=0)
            return memo[nid]

        return max((depth(n.id) for n in nodes), default=0)
|
src/agent/debug_loop.py
ADDED
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
"""Iterative debug loop — fix → test → fix cycle with model-tier escalation.
|
|
2
|
+
|
|
3
|
+
Drives an automated bug-fix workflow:
|
|
4
|
+
1. Run the test suite.
|
|
5
|
+
2. If tests fail, ask the agent to apply a fix.
|
|
6
|
+
3. Re-run tests.
|
|
7
|
+
4. Repeat up to *max_cycles* times, escalating to stronger model tiers on
|
|
8
|
+
repeated failures and optionally trying an ensemble patch strategy.
|
|
9
|
+
|
|
10
|
+
Phase 1: single-loop sequential fixes, ensemble stub (Phase 3 full impl).
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import shlex
|
|
16
|
+
import subprocess
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import TYPE_CHECKING, Generator
|
|
19
|
+
|
|
20
|
+
from src._internal.constants import _ENSEMBLE_PATCH_COST_CAP_USD, _MAX_DEBUG_CYCLES
|
|
21
|
+
from src.agent.loop import AgentEvent, AgentLoop, EventType
|
|
22
|
+
from src.models.definitions import ModelTier
|
|
23
|
+
from src.tools.shell_tools import (
|
|
24
|
+
_extract_error_for_search,
|
|
25
|
+
_format_search_injection,
|
|
26
|
+
_parse_search_results,
|
|
27
|
+
web_search_raw,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from src.config import GdmConfig
|
|
32
|
+
|
|
33
|
+
__all__ = ["DebugAttempt", "DebugLoop", "DebugResult"]

log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Timeout, in seconds, for one run of the test command.
_TEST_TIMEOUT_SECS: int = 120

# Escalation schedule: attempts up to and including _CODER_MAX_ATTEMPT use the
# coder tier, attempt _THINKER_ATTEMPT uses the thinker tier, later attempts
# use the reasoner tier.
_CODER_MAX_ATTEMPT: int = 2
_THINKER_ATTEMPT: int = 3

# Number of failed attempts after which the ensemble patch is tried.
_ENSEMBLE_TRIGGER_FAILURES: int = 2

# Truncation limits: test output embedded in a fix prompt, and the length of
# the patch description stored on an attempt record.
_MAX_FIX_PROMPT_ERROR_CHARS: int = 2_000
_MAX_PATCH_DESC_CHARS: int = 200

# Referenced by callers for budget enforcement.
_COST_CAP_USD = _ENSEMBLE_PATCH_COST_CAP_USD
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Data classes
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class DebugAttempt:
    """Record of one fix attempt in the debug loop.

    Attributes:
        attempt_num: Number of the attempt this record belongs to.
        patch_applied: Brief description of what was changed.
        test_result: Either ``"pass"`` or ``"fail"``.
        error_output: Test output on failure, empty on pass.
        model_tier_used: Model tier associated with the attempt.
    """

    # Field order is part of the interface: records are built positionally.
    attempt_num: int
    patch_applied: str
    test_result: str
    error_output: str
    model_tier_used: str
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class DebugResult:
    """Outcome of a complete DebugLoop.run() call.

    Attributes:
        success: Whether the test command ended up passing.
        attempts: The attempt records collected during the run, in order.
        final_model_tier: Tier of the last attempt that was started;
            ``ModelTier.CODER`` by default.
        was_rolled_back: Rollback flag reported by the loop.
        regression_detected: Regression flag; ``False`` by default.
    """

    # Field order is part of the interface: results are built positionally.
    success: bool
    attempts: list[DebugAttempt] = field(default_factory=list)
    final_model_tier: str = ModelTier.CODER
    was_rolled_back: bool = False
    regression_detected: bool = False
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
# Debug loop
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class DebugLoop:
    """Iterative bug fix loop: fix -> test -> fix -> test.

    Each cycle runs the test command, and if it fails asks the agent loop for
    a fix and runs the tests again. The number of cycles is bounded by the
    ``max_cycles`` argument of :meth:`run`.

    Tier schedule (see :meth:`_get_tier_for_attempt`):

    * Attempt 1-2: Coder tier
    * Attempt 3:   Thinker tier
    * Attempt 4+:  Reasoner tier

    NOTE(review): in the code visible here the tier is only written into the
    fix prompt and the attempt records; nothing switches the model of the
    underlying ``AgentLoop``. Confirm whether that happens elsewhere.

    After 2 failed attempts an ensemble patch is tried before each further
    regular fix: up to three differently-worded strategies run one after the
    other, each from a clean checkout, and the first one whose changes make
    the test command pass wins.

    Usage::

        debug = DebugLoop(loop, cfg)
        result = debug.run(
            task_description="Fix TypeError in auth.py line 45",
            test_command="pytest tests/test_auth.py",
            files_to_watch=["src/auth.py"],
        )
        if result.success:
            print(f"Fixed in {len(result.attempts)} attempt(s)")
        elif result.was_rolled_back:
            print("All attempts failed -- changes rolled back")
    """

    def __init__(self, base_loop: AgentLoop, cfg: GdmConfig) -> None:
        """Initialise the debug loop.

        Args:
            base_loop: A fully-configured AgentLoop used for fix turns.
            cfg: Session configuration (provider, cost limits, etc.).
        """
        self._loop = base_loop
        self._cfg = cfg
        self._test_command: str = ""  # stored at run() time; used by ensemble

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def run(
        self,
        task_description: str,
        test_command: str,
        files_to_watch: list[str] | None = None,
        *,
        max_cycles: int = _MAX_DEBUG_CYCLES,
    ) -> DebugResult:
        """Run the debug loop. Returns DebugResult (never raises).

        Any exception raised while looping is logged and turned into a
        failed ``DebugResult`` carrying the attempts collected so far.

        Args:
            task_description: Human-readable description of the bug to fix.
            test_command: Command whose exit code signals pass/fail. It is
                split with ``shlex`` and executed without a shell.
            files_to_watch: Optional list of files relevant to the fix.
                Accepted for interface compatibility; not used by the loop.
            max_cycles: Maximum fix attempts before giving up.

        Returns:
            ``DebugResult`` whose ``was_rolled_back`` is true only when every
            attempt failed and the git rollback was actually carried out.
        """
        attempts: list[DebugAttempt] = []
        final_tier = ModelTier.CODER
        self._test_command = test_command  # expose to ensemble
        search_context = ""  # injection block; handed to exactly one fix prompt
        auto_search_iter: int = getattr(self._cfg, "debug_auto_search_iteration", 3)
        try:
            for attempt_num in range(1, max_cycles + 1):
                tier = self._get_tier_for_attempt(attempt_num)
                final_tier = tier
                log.info("Debug attempt %d/%d tier=%s", attempt_num, max_cycles, tier)

                passed, output = self._run_tests(test_command)
                if passed:
                    attempts.append(DebugAttempt(attempt_num, "(pre-check)", "pass", "", tier))
                    return DebugResult(True, attempts, tier, False, False)

                # One-shot auto web search: attempt numbers are unique, so
                # this can fire at most once per run() call.
                if auto_search_iter > 0 and attempt_num == auto_search_iter:
                    search_context = self._auto_search(output, attempt_num)

                failed_so_far = sum(1 for a in attempts if a.test_result == "fail")
                if failed_so_far >= _ENSEMBLE_TRIGGER_FAILURES:
                    ok, desc = self._try_ensemble_patch(task_description, output)
                    if ok:
                        attempts.append(DebugAttempt(attempt_num, desc, "pass", "", tier))
                        return DebugResult(True, attempts, tier, False, False)

                patch_desc = self._drain_fix(
                    self._apply_fix(output, task_description, attempt_num, search_context)
                )
                search_context = ""  # consume search context — only used once
                passed_after, out_after = self._run_tests(test_command)
                test_result = "pass" if passed_after else "fail"
                err_out = "" if passed_after else out_after
                attempts.append(DebugAttempt(attempt_num, patch_desc, test_result, err_out, tier))

                if passed_after:
                    return DebugResult(True, attempts, tier, False, False)

            # Report a rollback only when it really took place.
            rolled_back = False
            if self._should_rollback(attempts, max_cycles):
                rolled_back = self._rollback(len(attempts))
            self._log_debate_hypotheses(attempts)
            return DebugResult(False, attempts, final_tier, rolled_back, False)
        except Exception as exc:  # noqa: BLE001
            log.exception("DebugLoop.run failed: %s", exc)
            return DebugResult(False, attempts, final_tier, False, False)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _auto_search(self, output: str, attempt_num: int) -> str:
        """Web-search the failing output; return the prompt injection block.

        Returns an empty string when the search or its post-processing fails;
        such a failure is logged and never aborts the debug loop.
        """
        error_query = _extract_error_for_search(output)
        try:
            raw = web_search_raw(error_query)
            results = _parse_search_results(raw)
            injection = _format_search_injection(error_query, results)
            log.info(
                "DebugLoop: auto web-search fired (iter=%d query=%r results=%d)",
                attempt_num, error_query[:60], len(results),
            )
            return injection
        except Exception as exc:  # noqa: BLE001
            log.warning("DebugLoop: auto web-search failed (continuing): %s", exc)
            return ""

    def _rollback(self, attempt_count: int) -> bool:
        """Soft-roll the repository back to its pre-task state.

        Returns:
            True when the rollback call completed; False when the project is
            not a git repository or the rollback raised (logged, swallowed).
        """
        try:
            from src.git_workflow import GitWorkflow  # noqa: PLC0415
            wf = GitWorkflow(self._cfg.project_root)
            if not wf.is_git_repo():
                return False
            wf.rollback_to_pre_task(hard=False)
        except Exception as exc:  # noqa: BLE001
            log.warning("DebugLoop: rollback skipped: %s", exc)
            return False
        log.warning(
            "DebugLoop: all %d attempts failed — soft rollback applied",
            attempt_count,
        )
        return True

    def _log_debate_hypotheses(self, attempts: list[DebugAttempt]) -> None:
        """Ask the review gate for fresh hypotheses and log them (best effort)."""
        try:
            from src.agent.review_gate import ReviewGate  # noqa: PLC0415
            last_error_text = attempts[-1].error_output if attempts else ""
            gate = ReviewGate(cfg=self._cfg)
            hypotheses = gate.debate_debug(
                error=last_error_text,
                attempts_history=[str(a.patch_applied) for a in attempts[-3:]],
            )
            log.info(
                "debate_debug hypotheses: %s",
                "; ".join(hypotheses.hypotheses[:3]),
            )
        except Exception as exc:  # noqa: BLE001
            log.debug("debate_debug skipped: %s", exc)

    def _run_tests(self, test_command: str) -> tuple[bool, str]:
        """Run the test command. Returns (passed, combined output).

        The command is split with ``shlex`` (POSIX rules except on Windows)
        and run without a shell, with a ``_TEST_TIMEOUT_SECS`` timeout.
        ``passed`` is true when the exit code is 0; the output is stdout
        followed by stderr.

        TimeoutExpired returns (False, "Test timed out") and OSError returns
        (False, "Failed to run tests: ...") rather than raising. A command
        that cannot be split (e.g. unbalanced quotes) still raises.
        """
        import sys  # noqa: PLC0415

        argv = shlex.split(test_command, posix=not sys.platform.startswith("win"))
        try:
            result = subprocess.run(
                argv,
                shell=False,
                capture_output=True,
                text=True,
                timeout=_TEST_TIMEOUT_SECS,
            )
        except subprocess.TimeoutExpired:
            log.warning("Test command timed out after %ds", _TEST_TIMEOUT_SECS)
            return False, "Test timed out"
        except OSError as exc:
            log.error("Failed to run test command %r: %s", test_command, exc)
            return False, f"Failed to run tests: {exc}"

        output = result.stdout + result.stderr
        log.debug("Test run: rc=%d output_chars=%d", result.returncode, len(output))
        return result.returncode == 0, output

    def _apply_fix(
        self,
        error_output: str,
        task_description: str,
        attempt_num: int,
        search_context: str = "",
    ) -> Generator[AgentEvent, None, None]:
        """Run one agent turn asking it to fix the failing tests.

        Yields AgentEvents from the underlying AgentLoop.run() call.

        Args:
            error_output: Combined test output from the failing run; only the
                first ``_MAX_FIX_PROMPT_ERROR_CHARS`` characters are sent.
            task_description: Original bug description for context.
            attempt_num: Current attempt number (used in the prompt).
            search_context: Optional web-search injection block (one-shot),
                appended after the error block when non-empty.
        """
        tier = self._get_tier_for_attempt(attempt_num)
        search_block = f"\n\n{search_context}" if search_context else ""
        fix_prompt = (
            f"Task: {task_description}\n\n"
            f"Attempt {attempt_num} -- model tier: {tier}.\n\n"
            "The following tests are failing. Analyse the error carefully "
            "and apply a targeted fix:\n\n"
            f"```\n{error_output[:_MAX_FIX_PROMPT_ERROR_CHARS]}\n```"
            f"{search_block}"
        )
        log.info("Applying fix (attempt=%d tier=%s)", attempt_num, tier)
        yield from self._loop.run(fix_prompt)

    def _drain_fix(self, events: Generator[AgentEvent, None, None]) -> str:
        """Drain an _apply_fix generator and return a brief patch description.

        Collects all RESPONSE event content, joins it with spaces and trims
        to _MAX_PATCH_DESC_CHARS; returns "(fix applied)" when that is empty.
        """
        parts: list[str] = [
            event.content for event in events if event.type == EventType.RESPONSE
        ]
        return (" ".join(parts))[:_MAX_PATCH_DESC_CHARS] or "(fix applied)"

    def _get_tier_for_attempt(self, attempt_num: int) -> str:
        """Return the appropriate ModelTier string for *attempt_num*.

        Escalation schedule:
          * 1-2  -> CODER
          * 3    -> THINKER
          * 4+   -> REASONER
        """
        if attempt_num <= _CODER_MAX_ATTEMPT:
            return ModelTier.CODER
        if attempt_num == _THINKER_ATTEMPT:
            return ModelTier.THINKER
        return ModelTier.REASONER

    def _try_ensemble_patch(
        self, task_description: str, error_output: str
    ) -> tuple[bool, str]:
        """Try up to 3 fix strategies; keep the first one that passes tests.

        Procedure:

        1. Locate the git repository containing the current working
           directory; give up if there is none or git cannot be run.
        2. Stash all local changes (including untracked files) so every
           strategy starts from the last commit. If stashing fails the
           ensemble is skipped -- resetting the tree without a stash would
           destroy uncommitted work.
        3. For each strategy: reset the tree, run one agent turn with the
           strategy hint, and run the test command if anything changed.
        4. On the first passing strategy, stop and leave the working tree
           exactly as the tests saw it (tracked and untracked changes). The
           pre-ensemble changes stay in the stash so they remain recoverable.
        5. If no strategy passes, reset the tree and pop the stash so the
           caller continues from its pre-ensemble state.

        Cost guard: strategies are tried sequentially and the loop stops at
        the first passing one to minimise API spend.

        Returns:
            ``(True, "ensemble/<strategy>")`` on success, otherwise
            ``(False, reason)``.
        """
        if not self._test_command:
            log.info("Ensemble patch skipped: no test_command stored")
            return False, "ensemble skipped (no test command)"

        # Verify git is available
        try:
            root_proc = subprocess.run(
                ["git", "rev-parse", "--show-toplevel"],
                capture_output=True, text=True, timeout=5,
            )
        except Exception as exc:  # noqa: BLE001
            log.debug("Ensemble: git check failed: %s", exc)
            return False, "ensemble skipped (git unavailable)"
        if root_proc.returncode != 0:
            return False, "ensemble skipped (not a git repo)"
        root = root_proc.stdout.strip()

        def _git(*args: str) -> subprocess.CompletedProcess[str]:
            return subprocess.run(
                ["git", *args], capture_output=True, text=True, cwd=root, timeout=60
            )

        def _stash_head() -> str:
            # Commit id of the newest stash entry, or "" when there is none.
            return _git("rev-parse", "-q", "--verify", "refs/stash").stdout.strip()

        def _reset_tree() -> None:
            _git("checkout", "--", ".")
            _git("clean", "-fdq")

        # Stash current state → all strategies start from the same clean base.
        # Whether a stash entry was created is detected by comparing the
        # stash ref before and after; parsing git's message would break under
        # a non-English locale and could later pop an unrelated stash.
        stash_tag = f"gdm-ensemble-{id(self)}"
        stash_before = _stash_head()
        stash_proc = _git("stash", "push", "-u", "-m", stash_tag)
        if stash_proc.returncode != 0:
            log.warning("Ensemble: git stash failed: %s", stash_proc.stderr[:200])
            return False, "ensemble skipped (git stash failed)"
        did_stash = _stash_head() != stash_before

        strategies = [
            (
                "minimal",
                "Apply the minimal possible fix. Change only what is strictly necessary.",
            ),
            (
                "refactor",
                "Refactor for correctness and clarity. Prioritise correctness over brevity.",
            ),
            (
                "alternative",
                "Re-think the root cause. Use a completely different algorithmic approach.",
            ),
        ]

        winner = ""
        try:
            for strategy_name, strategy_hint in strategies:
                # Reset working tree to the stashed base (last commit state)
                _reset_tree()

                # Fresh loop for this strategy to avoid transcript pollution
                try:
                    strategy_loop = AgentLoop(
                        session_id=(
                            f"{getattr(self._loop, '_session_id', '')}"
                            f"-ens-{strategy_name}"
                        ),
                        db=getattr(self._loop, "_db", None),
                        cost_tracker=getattr(self._loop, "_cost_tracker", None),
                        config=self._cfg,
                    )
                except Exception as exc:  # noqa: BLE001
                    log.debug("Ensemble: fresh loop unavailable, reusing main loop: %s", exc)
                    strategy_loop = self._loop  # fallback: reuse main loop

                prompt = (
                    f"Task: {task_description}\n\n"
                    f"Ensemble strategy: {strategy_hint}\n\n"
                    f"Failing tests:\n```\n"
                    f"{error_output[:_MAX_FIX_PROMPT_ERROR_CHARS]}\n```"
                )
                self._drain_fix(strategy_loop.run(prompt))

                # `status --porcelain` also lists untracked files, which a
                # plain `git diff HEAD` would miss.
                changed_paths = _git("status", "--porcelain").stdout.splitlines()
                if not changed_paths:
                    log.info("Ensemble strategy=%s: no changes produced", strategy_name)
                    continue

                passed, _output = self._run_tests(self._test_command)
                log.info(
                    "Ensemble strategy=%s passed=%s changed_paths=%d",
                    strategy_name,
                    passed,
                    len(changed_paths),
                )

                if passed:
                    winner = strategy_name
                    break  # found working strategy — stop early to save cost

        except Exception as exc:  # noqa: BLE001
            log.exception("Ensemble inner loop failed: %s", exc)

        finally:
            # No winner: drop whatever the strategies left behind and restore
            # the pre-ensemble state so the outer loop can continue from it.
            if not winner:
                _reset_tree()
                if did_stash:
                    pop_proc = _git("stash", "pop")
                    if pop_proc.returncode != 0:
                        log.warning(
                            "Ensemble: could not restore stash %r: %s",
                            stash_tag,
                            pop_proc.stderr[:200],
                        )

        if winner:
            if did_stash:
                log.info(
                    "Ensemble: pre-ensemble changes kept in git stash %r", stash_tag
                )
            return True, f"ensemble/{winner}"

        return False, "ensemble: no strategy passed tests"

    def _should_rollback(
        self, attempts: list[DebugAttempt], max_cycles: int = _MAX_DEBUG_CYCLES
    ) -> bool:
        """True if all attempts failed and changes should be rolled back.

        Rollback is triggered when at least one attempt failed and the
        failed-attempt count equals or exceeds *max_cycles*.

        Args:
            attempts: Attempt records collected by the run.
            max_cycles: Cycle budget the run was started with; defaults to
                the configured maximum debug cycles.
        """
        failed_attempts = sum(1 for a in attempts if a.test_result == "fail")
        return failed_attempts > 0 and failed_attempts >= max_cycles
|