@oswaldzsh/devhive 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -0
- package/__init__.py +0 -0
- package/agents/__init__.py +0 -0
- package/agents/base.py +118 -0
- package/agents/execute.py +150 -0
- package/agents/verifier_dynamic.py +164 -0
- package/agents/verifier_semantic.py +84 -0
- package/agents/verifier_static.py +153 -0
- package/bin/dh +77 -0
- package/config.yaml +71 -0
- package/control_plane/__init__.py +0 -0
- package/control_plane/cli.py +596 -0
- package/control_plane/dashboard.py +57 -0
- package/control_plane/notifications.py +54 -0
- package/control_plane/tui.py +352 -0
- package/install.sh +67 -0
- package/orchestrator/__init__.py +0 -0
- package/orchestrator/agent_pool.py +107 -0
- package/orchestrator/convergence_gate.py +133 -0
- package/orchestrator/engine.py +353 -0
- package/orchestrator/event_bus.py +58 -0
- package/orchestrator/task_queue.py +59 -0
- package/package.json +50 -0
- package/protocol/__init__.py +0 -0
- package/protocol/schemas.py +222 -0
- package/setup.py +44 -0
- package/signature/__init__.py +0 -0
- package/signature/engine.py +211 -0
- package/signature/extractor.py +156 -0
- package/signature/learner.py +75 -0
- package/signature/src/matcher.c +263 -0
- package/signature/src/matcher.h +135 -0
- package/signatures/seed_signatures.json +174 -0
- package/storage/__init__.py +0 -0
- package/storage/checkpoint.py +153 -0
- package/storage/signature_db.py +62 -0
- package/tools/__init__.py +0 -0
- package/tools/api_client.py +101 -0
- package/tools/git.py +75 -0
- package/tools/sandbox.py +79 -0
- package/verification/__init__.py +0 -0
- package/verification/diagnostic.py +124 -0
- package/verification/patterns/api_breaking.yaml +25 -0
- package/verification/patterns/code_quality.yaml +41 -0
- package/verification/patterns/security.yaml +41 -0
- package/verification/pipeline.py +61 -0
package/README.md
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# DevHive
|
|
2
|
+
|
|
3
|
+
Multi-Agent Software Development System — autonomous coding with specialized verification agents, failure signatures, and structured handoff protocols.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install -g @oswaldzsh/devhive
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Or via pip:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install devhive
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Requires Python 3.12+.
|
|
18
|
+
|
|
19
|
+
## Quick start
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# Start interactive REPL with live dashboard
|
|
23
|
+
dh
|
|
24
|
+
|
|
25
|
+
# Submit a task from natural language
|
|
26
|
+
dh do "fix the login timeout bug in auth/session.py"
|
|
27
|
+
|
|
28
|
+
# Check system status
|
|
29
|
+
dh status
|
|
30
|
+
|
|
31
|
+
# Review and resolve escalations
|
|
32
|
+
dh review
|
|
33
|
+
|
|
34
|
+
# View task execution timeline
|
|
35
|
+
dh log <task-id>
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Commands
|
|
39
|
+
|
|
40
|
+
| Command | Description |
|
|
41
|
+
|---------|-------------|
|
|
42
|
+
| `dh` | Interactive REPL with live dashboard |
|
|
43
|
+
| `dh do <desc>` | Submit task from natural language |
|
|
44
|
+
| `dh do -f <file>` | Submit task from JSON/YAML spec |
|
|
45
|
+
| `dh status` | Show system status |
|
|
46
|
+
| `dh log <id>` | Show task execution timeline |
|
|
47
|
+
| `dh review` | Interactive escalation review |
|
|
48
|
+
| `dh resolve <id>` | Resolve an escalation |
|
|
49
|
+
| `dh run` | Start orchestrator daemon |
|
|
50
|
+
|
|
51
|
+
## Configuration
|
|
52
|
+
|
|
53
|
+
Copy `config.yaml` to `config.local.yaml` and set your API credentials:
|
|
54
|
+
|
|
55
|
+
```yaml
|
|
56
|
+
api:
|
|
57
|
+
base_url: "https://your-api-endpoint"
|
|
58
|
+
auth_token: "${YOUR_TOKEN}"
|
|
59
|
+
default_model: "your-model"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Architecture
|
|
63
|
+
|
|
64
|
+
DevHive uses a multi-agent architecture with three verifiers arranged in two verification layers (L1: static + dynamic, L2: semantic):
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
Task → Execute Agent → L1 Verify (Static + Dynamic) → L2 Verify (Semantic) → Merge
|
|
68
|
+
                       ↓ fail                         ↓ fail
|
|
69
|
+
                       Signature Match                Human Escalation
|
|
70
|
+
                       ↓                              ↓
|
|
71
|
+
                       Auto-fix / Retry               Human resolves
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Agent Types
|
|
75
|
+
|
|
76
|
+
- **Execute Agent** — Produces code changes with structured Handoff
|
|
77
|
+
- **Static Verifier** — Detects code pattern issues (API breaks, security, quality)
|
|
78
|
+
- **Dynamic Verifier** — Runs tests, detects drift, matches failure signatures
|
|
79
|
+
- **Semantic Verifier** — Validates Spec alignment (on merge gate)
|
|
80
|
+
|
|
81
|
+
### Key Design Decisions
|
|
82
|
+
|
|
83
|
+
- Agents communicate via **structured JSON Handoff**, not natural language
|
|
84
|
+
- **No Agent arbitrates Agents** — Diagnostic Aggregator is a deterministic rule engine
|
|
85
|
+
- Failure diagnosis uses a **growing signature database** with weighted feature matching
|
|
86
|
+
- **Human intervenes**, not "human in the loop"
|
|
87
|
+
- All escalation thresholds are **quantified**, not fuzzy
|
|
88
|
+
|
|
89
|
+
## License
|
|
90
|
+
|
|
91
|
+
MIT
|
package/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
package/agents/base.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Agent base class — process lifecycle, tool registration, context management."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import multiprocessing
|
|
5
|
+
import os
|
|
6
|
+
import signal
|
|
7
|
+
import time
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
from tools.api_client import APIClient, extract_text_from_response, extract_tool_use
|
|
13
|
+
from protocol.schemas import (
|
|
14
|
+
Task, ExecutionHandoff, Verdict, DevHiveEvent,
|
|
15
|
+
Finding, Severity, SuggestedAction, VerdictOverall, VerifierType,
|
|
16
|
+
EscalationReport,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AgentStuckException(Exception):
    """Signal that an agent could not make progress on its task.

    ``AgentProcess.run`` catches this type separately from other exceptions
    and reports it as a ``"stuck"`` result instead of a generic ``"error"``.
    """
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class AgentProcess(multiprocessing.Process, ABC):
    """Base class for agents; each agent runs in an independent process.

    The process pulls task payloads from ``task_queue``, passes them to the
    subclass's ``_execute`` and pushes one result dict per task onto
    ``result_queue``. A ``None`` payload (poison pill) or ``stop()`` ends
    the loop.
    """

    def __init__(self, agent_id: str, agent_type: str,
                 task_queue: multiprocessing.Queue,
                 result_queue: multiprocessing.Queue,
                 config: Optional[dict] = None):
        """Store identity, queues and configuration.

        Args:
            agent_id: Identifier attached to every emitted result.
            agent_type: Type label attached to every emitted result.
            task_queue: Queue this agent reads task payloads from.
            result_queue: Queue this agent writes result dicts to.
            config: Optional settings; ``run`` reads ``base_url``,
                ``auth_token`` and ``default_model`` from it.
        """
        super().__init__()
        self.agent_id = agent_id
        self.agent_type = agent_type
        self._task_queue = task_queue
        self._result_queue = result_queue
        self.config = config or {}
        # The API client is built in run(), not here.
        self.api_client: Optional[APIClient] = None
        self._stop_event = multiprocessing.Event()

    def run(self):
        """Agent main loop — wait for tasks, execute, emit results."""
        import queue  # local import: only needed for queue.Empty below

        # Ignore SIGINT in this process; shutdown is requested through
        # stop() or a poison pill on the task queue.
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        self.api_client = APIClient(
            base_url=self.config.get("base_url"),
            auth_token=self.config.get("auth_token"),
            default_model=self.config.get("default_model"),
        )
        self._setup()
        self._emit_idle()

        while not self._stop_event.is_set():
            try:
                # Short timeout so the stop flag is re-checked regularly.
                task_data = self._task_queue.get(timeout=2)
            except queue.Empty:
                continue
            except (EOFError, OSError):
                # The queue's underlying connection is gone; no further
                # task can arrive, so retrying would only spin.
                break
            except Exception:
                # Best effort, as before: skip a payload that cannot be read.
                continue

            if task_data is None:  # poison pill
                break

            self._handle_task(task_data)

    def _handle_task(self, task_data) -> None:
        """Run one task payload and emit exactly one result dict for it.

        The result's ``type`` is ``"success"``, ``"stuck"`` (the agent raised
        ``AgentStuckException``) or ``"error"`` (any other exception, or a
        dict payload that cannot be turned into a ``Task``).
        """
        try:
            task = Task(**task_data) if isinstance(task_data, dict) else task_data
        except Exception as e:
            # A malformed payload must be reported, not kill the process.
            self._emit_result({"type": "error", "task_id": task_data.get("id"),
                               "error": f"Invalid task payload: {e}",
                               "duration_ms": 0})
            return

        start_time = time.monotonic()

        def elapsed_ms() -> int:
            return int((time.monotonic() - start_time) * 1000)

        try:
            result = self._execute(task)
            self._emit_result({"type": "success", "task_id": task.id,
                               "result": result, "duration_ms": elapsed_ms()})
        except AgentStuckException as e:
            self._emit_result({"type": "stuck", "task_id": task.id,
                               "error": str(e), "duration_ms": elapsed_ms()})
        except Exception as e:
            self._emit_result({"type": "error", "task_id": task.id,
                               "error": str(e), "duration_ms": elapsed_ms()})

    def stop(self):
        """Ask the main loop to exit after its current queue poll or task."""
        self._stop_event.set()

    def _setup(self):
        """Override for agent-specific initialization."""
        pass

    @abstractmethod
    def _execute(self, task: "Task") -> "dict | ExecutionHandoff | Verdict":
        """Execute the task. Must be implemented by each agent type."""
        ...

    def _emit_idle(self):
        """Announce on the result queue that this agent is ready for work."""
        self._result_queue.put({"type": "agent.idle", "agent_id": self.agent_id,
                                "agent_type": self.agent_type})

    def _emit_result(self, result: dict):
        """Tag *result* with this agent's id and type, then queue it.

        A copy is queued, so the caller's dict is left unmodified.
        """
        self._result_queue.put({**result,
                                "agent_id": self.agent_id,
                                "agent_type": self.agent_type})

    def _call_model(self, system_prompt: str, user_message: str,
                    tools: Optional[list[dict]] = None,
                    max_tokens: int = 4096,
                    model: Optional[str] = None) -> dict:
        """Convenience wrapper for API calls.

        Sends *user_message* as a single ``user`` turn and returns whatever
        ``api_client.create_message`` returns. NOTE(review): callers in this
        package pass the return value to ``run_until_complete``, so
        ``create_message`` is presumably a coroutine function — confirm.
        """
        return self.api_client.create_message(
            system=system_prompt,
            messages=[{"role": "user", "content": user_message}],
            tools=tools,
            max_tokens=max_tokens,
            model=model,
        )

    def _call_model_sync(self, *args, **kwargs):
        """Synchronous version for use in multiprocessing context.

        Runs the awaitable returned by ``_call_model`` to completion on a
        fresh event loop, which is closed afterwards even on failure.
        """
        import asyncio
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(self._call_model(*args, **kwargs))
        finally:
            loop.close()
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Execute Agent — receives Task, produces code changes + Handoff."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
from agents.base import AgentProcess, AgentStuckException
|
|
7
|
+
from protocol.schemas import (
|
|
8
|
+
Task, ExecutionHandoff, FileChange, VerificationFocus,
|
|
9
|
+
EnvChanges, RiskAssessment, ExecutionTrace, ChangeType, Priority, Severity,
|
|
10
|
+
)
|
|
11
|
+
from tools.git import GitOps, GitError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
EXECUTE_SYSTEM_PROMPT = """You are an Execute Agent in the DevHive system. Your job is to:
|
|
15
|
+
|
|
16
|
+
1. Read the task Spec carefully — understand the target state
|
|
17
|
+
2. Explore the codebase to understand the current state and relevant code
|
|
18
|
+
3. Produce code changes that move the system toward the target state
|
|
19
|
+
4. Run self-checks (compile, lint, basic tests) to verify your changes work
|
|
20
|
+
5. Output a structured Execution Handoff in JSON format
|
|
21
|
+
|
|
22
|
+
CRITICAL RULES:
|
|
23
|
+
- Never merge code. Your output goes to a Verifier, not directly to the main branch.
|
|
24
|
+
- If you are uncertain about any change, note it in risk_self_assessment.
|
|
25
|
+
- Always specify verification_focus — what the verifier should check most carefully.
|
|
26
|
+
- Report env_changes — new dependencies, config changes, migrations needed.
|
|
27
|
+
- If you cannot complete the task, escalate with a clear explanation of what's blocking you.
|
|
28
|
+
|
|
29
|
+
OUTPUT FORMAT:
|
|
30
|
+
You MUST end your response with a JSON block labeled HANDOFF:
|
|
31
|
+
```json
|
|
32
|
+
{
|
|
33
|
+
"intent": "one-line summary of what you changed and why",
|
|
34
|
+
"changes": [...],
|
|
35
|
+
"verification_focus": [...],
|
|
36
|
+
"env_changes": {...},
|
|
37
|
+
"execution_trace": {...}
|
|
38
|
+
}
|
|
39
|
+
```
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ExecuteAgent(AgentProcess):
    """Produces code changes based on Task Spec."""

    def __init__(self, agent_id: str, task_queue, result_queue, config: dict = None):
        super().__init__(agent_id, "execute", task_queue, result_queue, config)
        self.git = None  # GitOps instance, created in _setup()
        self.max_self_retries = 2

    def _setup(self):
        """Create the git helper for ``config["repo_path"]`` (default ".")."""
        self.git = GitOps(self.config.get("repo_path", "."))

    def _execute(self, task: Task) -> dict:
        """Produce a handoff for *task*, retrying when an attempt fails.

        Returns:
            ``{"handoff": <ExecutionHandoff dumped to a dict>}`` from the
            first attempt that parses and passes the self-check.

        Raises:
            AgentStuckException: every one of the ``max_self_retries + 1``
                attempts raised or failed the self-check.
        """
        last_error = None
        last_exc = None

        for _ in range(self.max_self_retries + 1):
            try:
                handoff = self._run_execute(task)
                # Self-check: run basic verification
                if self._self_check(handoff):
                    return {"handoff": handoff.model_dump()}
                last_error = "Self-check failed"
                last_exc = None
            except Exception as e:
                last_error = str(e)
                last_exc = e

        # Chain the last failure so its traceback is not lost.
        raise AgentStuckException(
            f"ExecuteAgent {self.agent_id} failed after {self.max_self_retries + 1} "
            f"attempts. Last error: {last_error}"
        ) from last_exc

    def _run_execute(self, task: Task) -> ExecutionHandoff:
        """Run the model to produce code changes."""
        spec_text = json.dumps(task.spec.model_dump(), indent=2)
        current_files = self.git.changed_files() or ["(clean working tree)"]

        user_message = f"""## Task Spec
{spec_text}

## Current State
Branch: {self.git.current_branch()}
Base commit: {task.base_commit}
Changed files: {', '.join(current_files)}

## Instructions
Read the task spec, explore the codebase, make the necessary changes.
Output the HANDOFF JSON when done.
"""

        # In production this would be an interactive conversation with tool use.
        # For the MVP, we do a single-turn call expecting the HANDOFF.
        response = self._call_model_sync(
            EXECUTE_SYSTEM_PROMPT, user_message,
            max_tokens=8192,
            model=self.config.get("execute_model"),
        )

        # The prompt does not ask the model for a task id, so supply the
        # real one instead of leaving the handoff's task_id empty.
        return self._parse_handoff(response, task_id=task.id)

    def _parse_handoff(self, response: dict, task_id: str = "") -> ExecutionHandoff:
        """Extract HANDOFF JSON from model response.

        Args:
            response: Model response whose ``content`` is a list of blocks;
                the ``text`` of every block of type ``"text"`` is used.
            task_id: Id to record on the handoff. When empty, a ``task_id``
                key in the model's JSON is used, else ``""``.

        Raises:
            json.JSONDecodeError: no parseable JSON was found.
            KeyError: the JSON has no ``intent`` key.
        """
        import re

        text = "".join(
            block["text"]
            for block in response.get("content", [])
            if block.get("type") == "text"
        )

        # The prompt requires the HANDOFF to be the final ```json block, so
        # take the last fenced block rather than the first.
        fenced = re.findall(r'```json\s*\n(.*?)\n```', text, re.DOTALL)
        # With no fenced block, try parsing the whole text as JSON.
        handoff_data = json.loads(fenced[-1] if fenced else text)

        return ExecutionHandoff(
            source=self.agent_id,
            task_id=task_id or handoff_data.get("task_id", ""),
            intent=handoff_data["intent"],
            changes=[FileChange(**c) for c in handoff_data.get("changes", [])],
            verification_focus=[
                VerificationFocus(**v) for v in handoff_data.get("verification_focus", [])
            ],
            env_changes=EnvChanges(**handoff_data.get("env_changes", {})),
            execution_trace=ExecutionTrace(
                **handoff_data.get("execution_trace", {"self_check_passed": False})
            ),
        )

    def _self_check(self, handoff: ExecutionHandoff) -> bool:
        """Run basic self-checks: compile/lint if applicable.

        Always returns True: failures of the check command are ignored.
        """
        # Run linter/type-checker on changed files
        changed_files = [c.file for c in handoff.changes]
        for f in changed_files:
            if f.endswith(".py"):
                try:
                    # NOTE(review): this goes through the git helper; if
                    # GitOps._run prefixes "git", the command is
                    # `git run mypy ...` and can never succeed — confirm.
                    self.git._run(["run", "mypy", f, "--ignore-missing-imports"])
                except Exception:
                    pass  # Non-blocking for now
        return True  # MVP: self-check is advisory, not blocking
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Dynamic Verifier — runs tests, detects drift, matches failure signatures."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import time
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from agents.base import AgentProcess, AgentStuckException
|
|
10
|
+
from protocol.schemas import (
|
|
11
|
+
Task, Verdict, Finding, FindingEvidence,
|
|
12
|
+
Severity, SuggestedAction, VerdictOverall, VerifierType,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
DYNAMIC_SYSTEM_PROMPT = """You are a Dynamic Verifier Agent. Your job is to:
|
|
17
|
+
|
|
18
|
+
1. Run tests affected by the code change (not full suite — use call graph)
|
|
19
|
+
2. Analyze test results beyond red/green:
|
|
20
|
+
- Match failures against known signatures
|
|
21
|
+
- Detect performance regressions (runtime, memory)
|
|
22
|
+
- Detect behavioral drifts (log volume, output changes)
|
|
23
|
+
3. Output a structured Verdict in JSON format
|
|
24
|
+
|
|
25
|
+
OUTPUT FORMAT:
|
|
26
|
+
```json
|
|
27
|
+
{
|
|
28
|
+
"overall": "PASS|WARN|FAIL",
|
|
29
|
+
"findings": [
|
|
30
|
+
{
|
|
31
|
+
"severity": "HIGH|MEDIUM|LOW",
|
|
32
|
+
"category": "test_failure|perf_regression|coverage_gap|drift|flaky",
|
|
33
|
+
"title": "short description",
|
|
34
|
+
"detail": "detailed explanation",
|
|
35
|
+
"matched_signature": "sig-XXXX or null",
|
|
36
|
+
"suggested_action": "RETEST|FIX|ESCALATE|IGNORE"
|
|
37
|
+
}
|
|
38
|
+
]
|
|
39
|
+
}
|
|
40
|
+
```
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DynamicVerifier(AgentProcess):
    """Runs the repository's tests and turns the outcome into a Verdict."""

    # pytest exits with 5 when it ran but collected no tests.
    _PYTEST_NO_TESTS_COLLECTED = 5
    # A file with one of these names marks repo_path as a Python project.
    _PYTHON_PROJECT_MARKERS = ("pyproject.toml", "setup.py", "setup.cfg")

    def __init__(self, agent_id: str, task_queue, result_queue, config: dict = None):
        super().__init__(agent_id, "dynamic_verifier", task_queue, result_queue, config)
        self.drift_thresholds = {}
        # Nothing in this class assigns an engine, so signature matching
        # is a no-op unless one is set from outside.
        self.signature_engine = None

    def _setup(self):
        """Load drift thresholds: built-in defaults overlaid with config."""
        self.drift_thresholds = {
            "duration_change_pct": 20,
            "memory_change_pct": 50,
            "log_volume_change_multiplier": 3.0,
        }
        # `or {}` tolerates an explicit null in the configuration.
        self.drift_thresholds.update(
            self.config.get("drift_thresholds") or {}
        )

    def _execute(self, task: Task) -> Verdict:
        """Collect findings for *task* and derive the overall verdict.

        Overall is FAIL when any finding is HIGH or CRITICAL, WARN when any
        is MEDIUM, and PASS otherwise.
        """
        blocking = (Severity.HIGH, Severity.CRITICAL)

        # 1. Run tests, 2. Check for drift
        findings = [*self._run_tests(task), *self._check_drift(task)]

        # 3. Match against signature DB
        for finding in findings:
            if finding.severity in blocking:
                self._apply_signature(finding)

        # Determine overall
        if any(f.severity in blocking for f in findings):
            overall = VerdictOverall.FAIL
        elif any(f.severity == Severity.MEDIUM for f in findings):
            overall = VerdictOverall.WARN
        else:
            overall = VerdictOverall.PASS

        return Verdict(
            verifier_type=VerifierType.DYNAMIC,
            task_id=task.id,
            overall=overall,
            findings=findings,
        )

    def _apply_signature(self, finding: Finding) -> None:
        """Annotate *finding* with its matching failure signature, if any."""
        matched = self._match_signature(finding)
        if not matched:
            return

        finding.matched_signature = matched["signature_id"]
        strategy = (matched.get("resolution") or {}).get("strategy", "ESCALATE")
        try:
            finding.suggested_action = SuggestedAction(strategy)
        except ValueError:
            # A strategy outside SuggestedAction must not abort the whole
            # verification; fall back to the same default as a missing one.
            finding.suggested_action = SuggestedAction.ESCALATE

    def _run_tests(self, task: Task) -> list[Finding]:
        """Run ``pytest -x`` in the repository and report what happened.

        Returns an empty list when ``repo_path`` has no Python project
        marker file, or when the tests pass within 60 seconds. Otherwise
        returns a single finding: test failure, no tests collected, slow
        suite, or a timeout after 300 seconds.
        """
        repo_path = self.config.get("repo_path", ".")

        # Run pytest if it's a Python project
        if not any(os.path.exists(os.path.join(repo_path, marker))
                   for marker in self._PYTHON_PROJECT_MARKERS):
            return []

        try:
            start = time.monotonic()
            result = subprocess.run(
                ["pytest", "-x", "--tb=short", "-q"],
                cwd=repo_path, capture_output=True, text=True, timeout=300
            )
            duration = time.monotonic() - start
        except subprocess.TimeoutExpired:
            return [Finding(
                severity=Severity.HIGH,
                category="test_failure",
                title="Test suite timed out after 300s",
                detail="Tests did not complete within the timeout.",
                evidence=FindingEvidence(type="log", data="TIMEOUT"),
                suggested_action=SuggestedAction.ESCALATE,
            )]

        if result.returncode == self._PYTEST_NO_TESTS_COLLECTED:
            # Nothing failed — there was nothing to run. Reporting this as a
            # test failure with action FIX would ask for a fix that cannot
            # exist, so flag it as a coverage gap instead.
            return [Finding(
                severity=Severity.MEDIUM,
                category="coverage_gap",
                title="No tests were collected",
                detail=result.stdout[-2000:] + "\n" + result.stderr[-1000:],
                evidence=FindingEvidence(type="log", data=result.stdout),
                suggested_action=SuggestedAction.ESCALATE,
            )]

        if result.returncode != 0:
            return [Finding(
                severity=Severity.HIGH,
                category="test_failure",
                title="Tests failed",
                # Keep only the tail of each stream in the summary.
                detail=result.stdout[-2000:] + "\n" + result.stderr[-1000:],
                evidence=FindingEvidence(type="log", data=result.stdout),
                suggested_action=SuggestedAction.FIX,
            )]

        if duration > 60:
            return [Finding(
                severity=Severity.LOW,
                category="perf_regression",
                title=f"Test suite took {duration:.1f}s (>60s threshold)",
                detail=f"Previous baseline: N/A. Current: {duration:.1f}s",
                evidence=FindingEvidence(type="metric", data=str(duration)),
                suggested_action=SuggestedAction.ESCALATE,
            )]

        return []

    def _check_drift(self, task: Task) -> list[Finding]:
        """Detect behavioral drift: performance, memory, output changes."""
        return []  # MVP: drift detection requires baseline data, skipped for now

    def _match_signature(self, finding: Finding) -> Optional[dict]:
        """Query signature DB for matching failure patterns.

        Returns the best match as a dict, or None when no engine is set,
        nothing matches, or the engine raises.
        """
        if not self.signature_engine:
            return None

        try:
            results = self.signature_engine.match({
                "error_type": finding.category,
                "error_location_pattern": finding.title,
            }, k=1, min_confidence=0.65)
            return results[0] if results else None
        except Exception:
            # Signature lookup is best effort; a failing engine means
            # "no match", not a failed verification.
            return None
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Semantic Verifier — checks if changes align with Spec intent."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from agents.base import AgentProcess
|
|
6
|
+
from protocol.schemas import (
|
|
7
|
+
Task, SemanticVerdict, Finding, FindingEvidence,
|
|
8
|
+
Severity, SuggestedAction, VerdictOverall, Alignment,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
SEMANTIC_SYSTEM_PROMPT = """You are a Semantic Verifier Agent. Your job is to:
|
|
13
|
+
|
|
14
|
+
1. Read the original Spec and the code changes (from the Execution Handoff)
|
|
15
|
+
2. Determine if the changes align with the Spec's intent — not just its text
|
|
16
|
+
3. Categorize the alignment:
|
|
17
|
+
- ALIGNED: Changes match Spec intent exactly
|
|
18
|
+
- ENHANCED: Changes add reasonable improvements beyond Spec (flag for human review)
|
|
19
|
+
- DEVIATED: Changes diverge from Spec intent (block)
|
|
20
|
+
- CONFLICT: Changes contradict Spec or are internally inconsistent (block)
|
|
21
|
+
|
|
22
|
+
4. Output your verdict in JSON format
|
|
23
|
+
|
|
24
|
+
OUTPUT FORMAT:
|
|
25
|
+
```json
|
|
26
|
+
{
|
|
27
|
+
"alignment": "ALIGNED|ENHANCED|DEVIATED|CONFLICT",
|
|
28
|
+
"reasoning": "detailed explanation of your judgment",
|
|
29
|
+
"concerns": ["list of specific concerns, empty if none"]
|
|
30
|
+
}
|
|
31
|
+
```
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SemanticVerifier(AgentProcess):
    """Validates semantic alignment between changes and Spec."""

    def __init__(self, agent_id: str, task_queue, result_queue, config: dict = None):
        super().__init__(agent_id, "semantic_verifier", task_queue, result_queue, config)

    def _execute(self, task: Task) -> SemanticVerdict:
        """Ask the model to judge *task* and parse its answer into a verdict."""
        # default=str stringifies any value json cannot encode natively.
        task_json = json.dumps(task.model_dump(), indent=2, default=str)

        response = self._call_model_sync(
            SEMANTIC_SYSTEM_PROMPT,
            f"## Task with Spec, Handoff, and Verdicts\n\n{task_json}",
            max_tokens=4096,
        )

        # The prompt does not ask the model for a task id, so supply the
        # real one instead of leaving the verdict's task_id empty.
        return self._parse_verdict(response, task_id=task.id)

    def _parse_verdict(self, response: dict, task_id: str = "") -> SemanticVerdict:
        """Build a SemanticVerdict from the model's fenced JSON answer.

        Args:
            response: Model response whose ``content`` is a list of blocks;
                the ``text`` of every block of type ``"text"`` is used.
            task_id: Id to record on the verdict. When empty, a ``task_id``
                key in the model's JSON is used, else ``""``.

        Returns:
            PASS for ALIGNED or ENHANCED, FAIL for any other alignment. An
            answer with no fenced JSON block, invalid JSON, a non-object
            body or an unknown alignment value yields a CONFLICT / FAIL
            verdict with reasoning "Failed to parse model output".
        """
        import re

        text = "".join(
            block["text"]
            for block in response.get("content", [])
            if block.get("type") == "text"
        )

        match = re.search(r'```json\s*\n(.*?)\n```', text, re.DOTALL)
        if not match:
            return self._unparseable_verdict(task_id)

        try:
            data = json.loads(match.group(1))
        except json.JSONDecodeError:
            return self._unparseable_verdict(task_id)
        if not isinstance(data, dict):
            return self._unparseable_verdict(task_id)

        try:
            # A missing alignment is treated as DEVIATED, which blocks.
            alignment = Alignment(data.get("alignment", "DEVIATED"))
        except ValueError:
            return self._unparseable_verdict(task_id)

        passed = alignment in (Alignment.ALIGNED, Alignment.ENHANCED)
        return SemanticVerdict(
            task_id=task_id or data.get("task_id", ""),
            alignment=alignment,
            reasoning=data.get("reasoning", ""),
            concerns=data.get("concerns", []),
            overall=VerdictOverall.PASS if passed else VerdictOverall.FAIL,
        )

    def _unparseable_verdict(self, task_id: str) -> SemanticVerdict:
        """Return the blocking verdict used when the model output is unusable."""
        return SemanticVerdict(
            task_id=task_id,
            alignment=Alignment.CONFLICT,
            reasoning="Failed to parse model output",
            overall=VerdictOverall.FAIL,
        )
|