@oswaldzsh/devhive 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +91 -0
  2. package/__init__.py +0 -0
  3. package/agents/__init__.py +0 -0
  4. package/agents/base.py +118 -0
  5. package/agents/execute.py +150 -0
  6. package/agents/verifier_dynamic.py +164 -0
  7. package/agents/verifier_semantic.py +84 -0
  8. package/agents/verifier_static.py +153 -0
  9. package/bin/dh +77 -0
  10. package/config.yaml +71 -0
  11. package/control_plane/__init__.py +0 -0
  12. package/control_plane/cli.py +596 -0
  13. package/control_plane/dashboard.py +57 -0
  14. package/control_plane/notifications.py +54 -0
  15. package/control_plane/tui.py +352 -0
  16. package/install.sh +67 -0
  17. package/orchestrator/__init__.py +0 -0
  18. package/orchestrator/agent_pool.py +107 -0
  19. package/orchestrator/convergence_gate.py +133 -0
  20. package/orchestrator/engine.py +353 -0
  21. package/orchestrator/event_bus.py +58 -0
  22. package/orchestrator/task_queue.py +59 -0
  23. package/package.json +50 -0
  24. package/protocol/__init__.py +0 -0
  25. package/protocol/schemas.py +222 -0
  26. package/setup.py +44 -0
  27. package/signature/__init__.py +0 -0
  28. package/signature/engine.py +211 -0
  29. package/signature/extractor.py +156 -0
  30. package/signature/learner.py +75 -0
  31. package/signature/src/matcher.c +263 -0
  32. package/signature/src/matcher.h +135 -0
  33. package/signatures/seed_signatures.json +174 -0
  34. package/storage/__init__.py +0 -0
  35. package/storage/checkpoint.py +153 -0
  36. package/storage/signature_db.py +62 -0
  37. package/tools/__init__.py +0 -0
  38. package/tools/api_client.py +101 -0
  39. package/tools/git.py +75 -0
  40. package/tools/sandbox.py +79 -0
  41. package/verification/__init__.py +0 -0
  42. package/verification/diagnostic.py +124 -0
  43. package/verification/patterns/api_breaking.yaml +25 -0
  44. package/verification/patterns/code_quality.yaml +41 -0
  45. package/verification/patterns/security.yaml +41 -0
  46. package/verification/pipeline.py +61 -0
package/README.md ADDED
@@ -0,0 +1,91 @@
1
+ # DevHive
2
+
3
+ Multi-Agent Software Development System — autonomous coding with verify-specialized agents, failure signatures, and structured handoff protocols.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm install -g @oswaldzsh/devhive
9
+ ```
10
+
11
+ Or via pip:
12
+
13
+ ```bash
14
+ pip install devhive
15
+ ```
16
+
17
+ Requires Python 3.12+.
18
+
19
+ ## Quick start
20
+
21
+ ```bash
22
+ # Start interactive REPL with live dashboard
23
+ dh
24
+
25
+ # Submit a task from natural language
26
+ dh do "fix the login timeout bug in auth/session.py"
27
+
28
+ # Check system status
29
+ dh status
30
+
31
+ # Review and resolve escalations
32
+ dh review
33
+
34
+ # View task execution timeline
35
+ dh log <task-id>
36
+ ```
37
+
38
+ ## Commands
39
+
40
+ | Command | Description |
41
+ |---------|-------------|
42
+ | `dh` | Interactive REPL with live dashboard |
43
+ | `dh do <desc>` | Submit task from natural language |
44
+ | `dh do -f <file>` | Submit task from JSON/YAML spec |
45
+ | `dh status` | Show system status |
46
+ | `dh log <id>` | Show task execution timeline |
47
+ | `dh review` | Interactive escalation review |
48
+ | `dh resolve <id>` | Resolve an escalation |
49
+ | `dh run` | Start orchestrator daemon |
50
+
51
+ ## Configuration
52
+
53
+ Copy `config.yaml` to `config.local.yaml` and set your API credentials:
54
+
55
+ ```yaml
56
+ api:
57
+ base_url: "https://your-api-endpoint"
58
+ auth_token: "${YOUR_TOKEN}"
59
+ default_model: "your-model"
60
+ ```
61
+
62
+ ## Architecture
63
+
64
+ DevHive uses a multi-agent architecture with three verifier agents arranged in two verification layers (L1: static + dynamic, L2: semantic):
65
+
66
+ ```
67
+ Task → Execute Agent → L1 Verify (Static + Dynamic) → L2 Verify (Semantic) → Merge
68
+ ↓ fail ↓ fail
69
+ Signature Match Human Escalation
70
+ ↓ ↓
71
+ Auto-fix / Retry Human resolves
72
+ ```
73
+
74
+ ### Agent Types
75
+
76
+ - **Execute Agent** — Produces code changes with structured Handoff
77
+ - **Static Verifier** — Detects code pattern issues (API breaks, security, quality)
78
+ - **Dynamic Verifier** — Runs tests, detects drift, matches failure signatures
79
+ - **Semantic Verifier** — Validates Spec alignment (on merge gate)
80
+
81
+ ### Key Design Decisions
82
+
83
+ - Agents communicate via **structured JSON Handoff**, not natural language
84
+ - **No Agent arbitrates Agents** — Diagnostic Aggregator is a deterministic rule engine
85
+ - Failure diagnosis uses a **growing signature database** with weighted feature matching
86
+ - **Human intervenes**, not "human in the loop"
87
+ - All escalation thresholds are **quantified**, not fuzzy
88
+
89
+ ## License
90
+
91
+ MIT
package/__init__.py ADDED
File without changes
File without changes
package/agents/base.py ADDED
@@ -0,0 +1,118 @@
1
+ """Agent base class — process lifecycle, tool registration, context management."""
2
+
3
+ import json
4
+ import multiprocessing
5
+ import os
6
+ import signal
7
+ import time
8
+ from abc import ABC, abstractmethod
9
+ from datetime import datetime, timezone
10
+ from typing import Optional
11
+
12
+ from tools.api_client import APIClient, extract_text_from_response, extract_tool_use
13
+ from protocol.schemas import (
14
+ Task, ExecutionHandoff, Verdict, DevHiveEvent,
15
+ Finding, Severity, SuggestedAction, VerdictOverall, VerifierType,
16
+ EscalationReport,
17
+ )
18
+
19
+
20
+ class AgentStuckException(Exception):
21
+ pass
22
+
23
+
24
+ class AgentProcess(multiprocessing.Process, ABC):
25
+ """Each Agent runs in an independent process, communicating via UDS/Queue."""
26
+
27
+ def __init__(self, agent_id: str, agent_type: str,
28
+ task_queue: multiprocessing.Queue,
29
+ result_queue: multiprocessing.Queue,
30
+ config: dict = None):
31
+ super().__init__()
32
+ self.agent_id = agent_id
33
+ self.agent_type = agent_type
34
+ self._task_queue = task_queue
35
+ self._result_queue = result_queue
36
+ self.config = config or {}
37
+ self.api_client: Optional[APIClient] = None
38
+ self._stop_event = multiprocessing.Event()
39
+
40
+ def run(self):
41
+ """Agent main loop — wait for tasks, execute, emit results."""
42
+ signal.signal(signal.SIGINT, signal.SIG_IGN)
43
+ self.api_client = APIClient(
44
+ base_url=self.config.get("base_url"),
45
+ auth_token=self.config.get("auth_token"),
46
+ default_model=self.config.get("default_model"),
47
+ )
48
+ self._setup()
49
+ self._emit_idle()
50
+
51
+ while not self._stop_event.is_set():
52
+ try:
53
+ task_data = self._task_queue.get(timeout=2)
54
+ except Exception:
55
+ continue
56
+
57
+ if task_data is None: # poison pill
58
+ break
59
+
60
+ task = Task(**task_data) if isinstance(task_data, dict) else task_data
61
+ start_time = time.monotonic()
62
+
63
+ try:
64
+ result = self._execute(task)
65
+ duration_ms = int((time.monotonic() - start_time) * 1000)
66
+ self._emit_result({"type": "success", "task_id": task.id,
67
+ "result": result, "duration_ms": duration_ms})
68
+ except AgentStuckException as e:
69
+ duration_ms = int((time.monotonic() - start_time) * 1000)
70
+ self._emit_result({"type": "stuck", "task_id": task.id,
71
+ "error": str(e), "duration_ms": duration_ms})
72
+ except Exception as e:
73
+ duration_ms = int((time.monotonic() - start_time) * 1000)
74
+ self._emit_result({"type": "error", "task_id": task.id,
75
+ "error": str(e), "duration_ms": duration_ms})
76
+
77
+ def stop(self):
78
+ self._stop_event.set()
79
+
80
+ def _setup(self):
81
+ """Override for agent-specific initialization."""
82
+ pass
83
+
84
+ @abstractmethod
85
+ def _execute(self, task: Task) -> dict | ExecutionHandoff | Verdict:
86
+ """Execute the task. Must be implemented by each agent type."""
87
+ ...
88
+
89
+ def _emit_idle(self):
90
+ self._result_queue.put({"type": "agent.idle", "agent_id": self.agent_id,
91
+ "agent_type": self.agent_type})
92
+
93
+ def _emit_result(self, result: dict):
94
+ result["agent_id"] = self.agent_id
95
+ result["agent_type"] = self.agent_type
96
+ self._result_queue.put(result)
97
+
98
+ def _call_model(self, system_prompt: str, user_message: str,
99
+ tools: list[dict] = None,
100
+ max_tokens: int = 4096,
101
+ model: str = None) -> dict:
102
+ """Convenience wrapper for API calls."""
103
+ return self.api_client.create_message(
104
+ system=system_prompt,
105
+ messages=[{"role": "user", "content": user_message}],
106
+ tools=tools,
107
+ max_tokens=max_tokens,
108
+ model=model,
109
+ )
110
+
111
+ def _call_model_sync(self, *args, **kwargs):
112
+ """Synchronous version for use in multiprocessing context."""
113
+ import asyncio
114
+ loop = asyncio.new_event_loop()
115
+ try:
116
+ return loop.run_until_complete(self._call_model(*args, **kwargs))
117
+ finally:
118
+ loop.close()
@@ -0,0 +1,150 @@
1
+ """Execute Agent — receives Task, produces code changes + Handoff."""
2
+
3
+ import json
4
+ from datetime import datetime, timezone
5
+
6
+ from agents.base import AgentProcess, AgentStuckException
7
+ from protocol.schemas import (
8
+ Task, ExecutionHandoff, FileChange, VerificationFocus,
9
+ EnvChanges, RiskAssessment, ExecutionTrace, ChangeType, Priority, Severity,
10
+ )
11
+ from tools.git import GitOps, GitError
12
+
13
+
14
+ EXECUTE_SYSTEM_PROMPT = """You are an Execute Agent in the DevHive system. Your job is to:
15
+
16
+ 1. Read the task Spec carefully — understand the target state
17
+ 2. Explore the codebase to understand the current state and relevant code
18
+ 3. Produce code changes that move the system toward the target state
19
+ 4. Run self-checks (compile, lint, basic tests) to verify your changes work
20
+ 5. Output a structured Execution Handoff in JSON format
21
+
22
+ CRITICAL RULES:
23
+ - Never merge code. Your output goes to a Verifier, not directly to the main branch.
24
+ - If you are uncertain about any change, note it in risk_self_assessment.
25
+ - Always specify verification_focus — what the verifier should check most carefully.
26
+ - Report env_changes — new dependencies, config changes, migrations needed.
27
+ - If you cannot complete the task, escalate with a clear explanation of what's blocking you.
28
+
29
+ OUTPUT FORMAT:
30
+ You MUST end your response with a JSON block labeled HANDOFF:
31
+ ```json
32
+ {
33
+ "intent": "one-line summary of what you changed and why",
34
+ "changes": [...],
35
+ "verification_focus": [...],
36
+ "env_changes": {...},
37
+ "execution_trace": {...}
38
+ }
39
+ ```
40
+ """
41
+
42
+
43
+ class ExecuteAgent(AgentProcess):
44
+ """Produces code changes based on Task Spec."""
45
+
46
+ def __init__(self, agent_id: str, task_queue, result_queue, config: dict = None):
47
+ super().__init__(agent_id, "execute", task_queue, result_queue, config)
48
+ self.git = None
49
+ self.max_self_retries = 2
50
+
51
+ def _setup(self):
52
+ self.git = GitOps(self.config.get("repo_path", "."))
53
+
54
+ def _execute(self, task: Task) -> dict:
55
+ attempt = 0
56
+ last_error = None
57
+
58
+ while attempt <= self.max_self_retries:
59
+ try:
60
+ handoff = self._run_execute(task)
61
+ # Self-check: run basic verification
62
+ if not self._self_check(handoff):
63
+ attempt += 1
64
+ last_error = "Self-check failed"
65
+ continue
66
+ return {"handoff": handoff.model_dump()}
67
+ except Exception as e:
68
+ attempt += 1
69
+ last_error = str(e)
70
+
71
+ raise AgentStuckException(
72
+ f"ExecuteAgent {self.agent_id} failed after {self.max_self_retries + 1} "
73
+ f"attempts. Last error: {last_error}"
74
+ )
75
+
76
+ def _run_execute(self, task: Task) -> ExecutionHandoff:
77
+ """Run the model to produce code changes."""
78
+ spec_text = json.dumps(task.spec.model_dump(), indent=2)
79
+ current_files = self.git.changed_files() or ["(clean working tree)"]
80
+
81
+ user_message = f"""## Task Spec
82
+ {spec_text}
83
+
84
+ ## Current State
85
+ Branch: {self.git.current_branch()}
86
+ Base commit: {task.base_commit}
87
+ Changed files: {', '.join(current_files)}
88
+
89
+ ## Instructions
90
+ Read the task spec, explore the codebase, make the necessary changes.
91
+ Output the HANDOFF JSON when done.
92
+ """
93
+
94
+ # In production this would be an interactive conversation with tool use.
95
+ # For the MVP, we do a single-turn call expecting the HANDOFF.
96
+ import asyncio
97
+ loop = asyncio.new_event_loop()
98
+ try:
99
+ response = loop.run_until_complete(
100
+ self._call_model(EXECUTE_SYSTEM_PROMPT, user_message,
101
+ max_tokens=8192,
102
+ model=self.config.get("execute_model"))
103
+ )
104
+ finally:
105
+ loop.close()
106
+
107
+ handoff = self._parse_handoff(response)
108
+ return handoff
109
+
110
+ def _parse_handoff(self, response: dict) -> ExecutionHandoff:
111
+ """Extract HANDOFF JSON from model response."""
112
+ text = ""
113
+ for block in response.get("content", []):
114
+ if block.get("type") == "text":
115
+ text += block["text"]
116
+
117
+ # Try to extract JSON between ```json ... ``` markers
118
+ import re
119
+ match = re.search(r'```json\s*\n(.*?)\n```', text, re.DOTALL)
120
+ if match:
121
+ handoff_data = json.loads(match.group(1))
122
+ else:
123
+ # Try parsing the whole text as JSON
124
+ handoff_data = json.loads(text)
125
+
126
+ return ExecutionHandoff(
127
+ source=self.agent_id,
128
+ task_id=handoff_data.get("task_id", ""),
129
+ intent=handoff_data["intent"],
130
+ changes=[FileChange(**c) for c in handoff_data.get("changes", [])],
131
+ verification_focus=[
132
+ VerificationFocus(**v) for v in handoff_data.get("verification_focus", [])
133
+ ],
134
+ env_changes=EnvChanges(**handoff_data.get("env_changes", {})),
135
+ execution_trace=ExecutionTrace(
136
+ **handoff_data.get("execution_trace", {"self_check_passed": False})
137
+ ),
138
+ )
139
+
140
+ def _self_check(self, handoff: ExecutionHandoff) -> bool:
141
+ """Run basic self-checks: compile/lint if applicable."""
142
+ # Run linter/type-checker on changed files
143
+ changed_files = [c.file for c in handoff.changes]
144
+ for f in changed_files:
145
+ if f.endswith(".py"):
146
+ try:
147
+ self.git._run(["run", "mypy", f, "--ignore-missing-imports"])
148
+ except Exception:
149
+ pass # Non-blocking for now
150
+ return True # MVP: self-check is advisory, not blocking
@@ -0,0 +1,164 @@
1
+ """Dynamic Verifier — runs tests, detects drift, matches failure signatures."""
2
+
3
+ import json
4
+ import os
5
+ import subprocess
6
+ import time
7
+ from typing import Optional
8
+
9
+ from agents.base import AgentProcess, AgentStuckException
10
+ from protocol.schemas import (
11
+ Task, Verdict, Finding, FindingEvidence,
12
+ Severity, SuggestedAction, VerdictOverall, VerifierType,
13
+ )
14
+
15
+
16
+ DYNAMIC_SYSTEM_PROMPT = """You are a Dynamic Verifier Agent. Your job is to:
17
+
18
+ 1. Run tests affected by the code change (not full suite — use call graph)
19
+ 2. Analyze test results beyond red/green:
20
+ - Match failures against known signatures
21
+ - Detect performance regressions (runtime, memory)
22
+ - Detect behavioral drifts (log volume, output changes)
23
+ 3. Output a structured Verdict in JSON format
24
+
25
+ OUTPUT FORMAT:
26
+ ```json
27
+ {
28
+ "overall": "PASS|WARN|FAIL",
29
+ "findings": [
30
+ {
31
+ "severity": "HIGH|MEDIUM|LOW",
32
+ "category": "test_failure|perf_regression|coverage_gap|drift|flaky",
33
+ "title": "short description",
34
+ "detail": "detailed explanation",
35
+ "matched_signature": "sig-XXXX or null",
36
+ "suggested_action": "RETEST|FIX|ESCALATE|IGNORE"
37
+ }
38
+ ]
39
+ }
40
+ ```
41
+ """
42
+
43
+
44
+ class DynamicVerifier(AgentProcess):
45
+ """Runs tests and analyzes results."""
46
+
47
+ def __init__(self, agent_id: str, task_queue, result_queue, config: dict = None):
48
+ super().__init__(agent_id, "dynamic_verifier", task_queue, result_queue, config)
49
+ self.drift_thresholds = {}
50
+ self.signature_engine = None
51
+
52
+ def _setup(self):
53
+ self.drift_thresholds = {
54
+ "duration_change_pct": 20,
55
+ "memory_change_pct": 50,
56
+ "log_volume_change_multiplier": 3.0,
57
+ }
58
+ self.drift_thresholds.update(
59
+ self.config.get("drift_thresholds", {})
60
+ )
61
+
62
+ def _execute(self, task: Task) -> Verdict:
63
+ findings = []
64
+
65
+ # 1. Run tests
66
+ test_findings = self._run_tests(task)
67
+ findings.extend(test_findings)
68
+
69
+ # 2. Check for drift
70
+ drift_findings = self._check_drift(task)
71
+ findings.extend(drift_findings)
72
+
73
+ # 3. Match against signature DB
74
+ for f in findings:
75
+ if f.severity in (Severity.HIGH, Severity.CRITICAL):
76
+ matched = self._match_signature(f)
77
+ if matched:
78
+ f.matched_signature = matched["signature_id"]
79
+ f.suggested_action = SuggestedAction(matched.get("resolution", {}).get(
80
+ "strategy", "ESCALATE"))
81
+
82
+ # Determine overall
83
+ has_fail = any(f.severity in (Severity.HIGH, Severity.CRITICAL) for f in findings)
84
+ has_warn = any(f.severity == Severity.MEDIUM for f in findings)
85
+
86
+ if has_fail:
87
+ overall = VerdictOverall.FAIL
88
+ elif has_warn:
89
+ overall = VerdictOverall.WARN
90
+ else:
91
+ overall = VerdictOverall.PASS
92
+
93
+ return Verdict(
94
+ verifier_type=VerifierType.DYNAMIC,
95
+ task_id=task.id,
96
+ overall=overall,
97
+ findings=findings,
98
+ )
99
+
100
+ def _run_tests(self, task: Task) -> list[Finding]:
101
+ """Run relevant tests based on call graph or Handoff verification_focus."""
102
+ findings = []
103
+ repo_path = self.config.get("repo_path", ".")
104
+
105
+ # Run pytest if it's a Python project
106
+ if os.path.exists(os.path.join(repo_path, "pyproject.toml")) or \
107
+ os.path.exists(os.path.join(repo_path, "setup.py")) or \
108
+ os.path.exists(os.path.join(repo_path, "setup.cfg")):
109
+ try:
110
+ start = time.monotonic()
111
+ result = subprocess.run(
112
+ ["pytest", "-x", "--tb=short", "-q"],
113
+ cwd=repo_path, capture_output=True, text=True, timeout=300
114
+ )
115
+ duration = time.monotonic() - start
116
+
117
+ if result.returncode != 0:
118
+ findings.append(Finding(
119
+ severity=Severity.HIGH,
120
+ category="test_failure",
121
+ title="Tests failed",
122
+ detail=result.stdout[-2000:] + "\n" + result.stderr[-1000:],
123
+ evidence=FindingEvidence(type="log", data=result.stdout),
124
+ suggested_action=SuggestedAction.FIX,
125
+ ))
126
+ elif duration > 60:
127
+ findings.append(Finding(
128
+ severity=Severity.LOW,
129
+ category="perf_regression",
130
+ title=f"Test suite took {duration:.1f}s (>60s threshold)",
131
+ detail=f"Previous baseline: N/A. Current: {duration:.1f}s",
132
+ evidence=FindingEvidence(type="metric", data=str(duration)),
133
+ suggested_action=SuggestedAction.ESCALATE,
134
+ ))
135
+
136
+ except subprocess.TimeoutExpired:
137
+ findings.append(Finding(
138
+ severity=Severity.HIGH,
139
+ category="test_failure",
140
+ title="Test suite timed out after 300s",
141
+ detail="Tests did not complete within the timeout.",
142
+ evidence=FindingEvidence(type="log", data="TIMEOUT"),
143
+ suggested_action=SuggestedAction.ESCALATE,
144
+ ))
145
+
146
+ return findings
147
+
148
+ def _check_drift(self, task: Task) -> list[Finding]:
149
+ """Detect behavioral drift: performance, memory, output changes."""
150
+ return [] # MVP: drift detection requires baseline data, skipped for now
151
+
152
+ def _match_signature(self, finding: Finding) -> Optional[dict]:
153
+ """Query signature DB for matching failure patterns."""
154
+ if not self.signature_engine:
155
+ return None
156
+
157
+ try:
158
+ results = self.signature_engine.match({
159
+ "error_type": finding.category,
160
+ "error_location_pattern": finding.title,
161
+ }, k=1, min_confidence=0.65)
162
+ return results[0] if results else None
163
+ except Exception:
164
+ return None
@@ -0,0 +1,84 @@
1
+ """Semantic Verifier — checks if changes align with Spec intent."""
2
+
3
+ import json
4
+
5
+ from agents.base import AgentProcess
6
+ from protocol.schemas import (
7
+ Task, SemanticVerdict, Finding, FindingEvidence,
8
+ Severity, SuggestedAction, VerdictOverall, Alignment,
9
+ )
10
+
11
+
12
+ SEMANTIC_SYSTEM_PROMPT = """You are a Semantic Verifier Agent. Your job is to:
13
+
14
+ 1. Read the original Spec and the code changes (from the Execution Handoff)
15
+ 2. Determine if the changes align with the Spec's intent — not just its text
16
+ 3. Categorize the alignment:
17
+ - ALIGNED: Changes match Spec intent exactly
18
+ - ENHANCED: Changes add reasonable improvements beyond Spec (flag for human review)
19
+ - DEVIATED: Changes diverge from Spec intent (block)
20
+ - CONFLICT: Changes contradict Spec or are internally inconsistent (block)
21
+
22
+ 4. Output your verdict in JSON format
23
+
24
+ OUTPUT FORMAT:
25
+ ```json
26
+ {
27
+ "alignment": "ALIGNED|ENHANCED|DEVIATED|CONFLICT",
28
+ "reasoning": "detailed explanation of your judgment",
29
+ "concerns": ["list of specific concerns, empty if none"]
30
+ }
31
+ ```
32
+ """
33
+
34
+
35
+ class SemanticVerifier(AgentProcess):
36
+ """Validates semantic alignment between changes and Spec."""
37
+
38
+ def __init__(self, agent_id: str, task_queue, result_queue, config: dict = None):
39
+ super().__init__(agent_id, "semantic_verifier", task_queue, result_queue, config)
40
+
41
+ def _execute(self, task: Task) -> SemanticVerdict:
42
+ import asyncio
43
+
44
+ task_json = json.dumps(task.model_dump(), indent=2, default=str)
45
+
46
+ loop = asyncio.new_event_loop()
47
+ try:
48
+ response = loop.run_until_complete(
49
+ self._call_model(SEMANTIC_SYSTEM_PROMPT,
50
+ f"## Task with Spec, Handoff, and Verdicts\n\n{task_json}",
51
+ max_tokens=4096)
52
+ )
53
+ finally:
54
+ loop.close()
55
+
56
+ return self._parse_verdict(response)
57
+
58
+ def _parse_verdict(self, response: dict) -> SemanticVerdict:
59
+ import re
60
+ text = ""
61
+ for block in response.get("content", []):
62
+ if block.get("type") == "text":
63
+ text += block["text"]
64
+
65
+ match = re.search(r'```json\s*\n(.*?)\n```', text, re.DOTALL)
66
+ if not match:
67
+ return SemanticVerdict(
68
+ task_id="",
69
+ alignment=Alignment.CONFLICT,
70
+ reasoning="Failed to parse model output",
71
+ overall=VerdictOverall.FAIL,
72
+ )
73
+
74
+ data = json.loads(match.group(1))
75
+ alignment = Alignment(data.get("alignment", "DEVIATED"))
76
+
77
+ return SemanticVerdict(
78
+ task_id=data.get("task_id", ""),
79
+ alignment=alignment,
80
+ reasoning=data.get("reasoning", ""),
81
+ concerns=data.get("concerns", []),
82
+ overall=VerdictOverall.PASS if alignment in (Alignment.ALIGNED, Alignment.ENHANCED)
83
+ else VerdictOverall.FAIL,
84
+ )