ouroboros-ai 0.1.0 (ouroboros_ai-0.1.0-py3-none-any.whl)
This diff shows the content of publicly available package versions as released to their public registries, and is provided for informational purposes only.
Note: this release of ouroboros-ai has been marked as potentially problematic.
- ouroboros/__init__.py +15 -0
- ouroboros/__main__.py +9 -0
- ouroboros/bigbang/__init__.py +39 -0
- ouroboros/bigbang/ambiguity.py +464 -0
- ouroboros/bigbang/interview.py +530 -0
- ouroboros/bigbang/seed_generator.py +610 -0
- ouroboros/cli/__init__.py +9 -0
- ouroboros/cli/commands/__init__.py +7 -0
- ouroboros/cli/commands/config.py +79 -0
- ouroboros/cli/commands/init.py +425 -0
- ouroboros/cli/commands/run.py +201 -0
- ouroboros/cli/commands/status.py +85 -0
- ouroboros/cli/formatters/__init__.py +31 -0
- ouroboros/cli/formatters/panels.py +157 -0
- ouroboros/cli/formatters/progress.py +112 -0
- ouroboros/cli/formatters/tables.py +166 -0
- ouroboros/cli/main.py +60 -0
- ouroboros/config/__init__.py +81 -0
- ouroboros/config/loader.py +292 -0
- ouroboros/config/models.py +332 -0
- ouroboros/core/__init__.py +62 -0
- ouroboros/core/ac_tree.py +401 -0
- ouroboros/core/context.py +472 -0
- ouroboros/core/errors.py +246 -0
- ouroboros/core/seed.py +212 -0
- ouroboros/core/types.py +205 -0
- ouroboros/evaluation/__init__.py +110 -0
- ouroboros/evaluation/consensus.py +350 -0
- ouroboros/evaluation/mechanical.py +351 -0
- ouroboros/evaluation/models.py +235 -0
- ouroboros/evaluation/pipeline.py +286 -0
- ouroboros/evaluation/semantic.py +302 -0
- ouroboros/evaluation/trigger.py +278 -0
- ouroboros/events/__init__.py +5 -0
- ouroboros/events/base.py +80 -0
- ouroboros/events/decomposition.py +153 -0
- ouroboros/events/evaluation.py +248 -0
- ouroboros/execution/__init__.py +44 -0
- ouroboros/execution/atomicity.py +451 -0
- ouroboros/execution/decomposition.py +481 -0
- ouroboros/execution/double_diamond.py +1386 -0
- ouroboros/execution/subagent.py +275 -0
- ouroboros/observability/__init__.py +63 -0
- ouroboros/observability/drift.py +383 -0
- ouroboros/observability/logging.py +504 -0
- ouroboros/observability/retrospective.py +338 -0
- ouroboros/orchestrator/__init__.py +78 -0
- ouroboros/orchestrator/adapter.py +391 -0
- ouroboros/orchestrator/events.py +278 -0
- ouroboros/orchestrator/runner.py +597 -0
- ouroboros/orchestrator/session.py +486 -0
- ouroboros/persistence/__init__.py +23 -0
- ouroboros/persistence/checkpoint.py +511 -0
- ouroboros/persistence/event_store.py +183 -0
- ouroboros/persistence/migrations/__init__.py +1 -0
- ouroboros/persistence/migrations/runner.py +100 -0
- ouroboros/persistence/migrations/scripts/001_initial.sql +20 -0
- ouroboros/persistence/schema.py +56 -0
- ouroboros/persistence/uow.py +230 -0
- ouroboros/providers/__init__.py +28 -0
- ouroboros/providers/base.py +133 -0
- ouroboros/providers/claude_code_adapter.py +212 -0
- ouroboros/providers/litellm_adapter.py +316 -0
- ouroboros/py.typed +0 -0
- ouroboros/resilience/__init__.py +67 -0
- ouroboros/resilience/lateral.py +595 -0
- ouroboros/resilience/stagnation.py +727 -0
- ouroboros/routing/__init__.py +60 -0
- ouroboros/routing/complexity.py +272 -0
- ouroboros/routing/downgrade.py +664 -0
- ouroboros/routing/escalation.py +340 -0
- ouroboros/routing/router.py +204 -0
- ouroboros/routing/tiers.py +247 -0
- ouroboros/secondary/__init__.py +40 -0
- ouroboros/secondary/scheduler.py +467 -0
- ouroboros/secondary/todo_registry.py +483 -0
- ouroboros_ai-0.1.0.dist-info/METADATA +607 -0
- ouroboros_ai-0.1.0.dist-info/RECORD +81 -0
- ouroboros_ai-0.1.0.dist-info/WHEEL +4 -0
- ouroboros_ai-0.1.0.dist-info/entry_points.txt +2 -0
- ouroboros_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
ouroboros/evaluation/mechanical.py
@@ -0,0 +1,351 @@
"""Stage 1: Mechanical Verification.

Zero-cost verification through automated checks:
- Lint: Code style and formatting
- Build: Compilation validation
- Test: Unit/integration test execution
- Static: Static analysis (type checking)
- Coverage: Test coverage threshold (NFR9 >= 0.7)

The MechanicalVerifier is stateless and produces immutable results.
"""

import asyncio
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from ouroboros.core.errors import ValidationError
from ouroboros.core.types import Result
from ouroboros.evaluation.models import CheckResult, CheckType, MechanicalResult
from ouroboros.events.base import BaseEvent
from ouroboros.events.evaluation import (
    create_stage1_completed_event,
    create_stage1_started_event,
)


@dataclass(frozen=True, slots=True)
class MechanicalConfig:
    """Configuration for mechanical verification.

    Attributes:
        coverage_threshold: Minimum coverage required (default 0.7 per NFR9)
        lint_command: Command to run linting
        build_command: Command to run build
        test_command: Command to run tests
        static_command: Command to run static analysis
        coverage_command: Command to run tests with coverage reporting
        timeout_seconds: Timeout for each command
        working_dir: Working directory for commands
    """

    coverage_threshold: float = 0.7
    lint_command: tuple[str, ...] = ("uv", "run", "ruff", "check", ".")
    build_command: tuple[str, ...] = ("uv", "run", "python", "-m", "py_compile")
    test_command: tuple[str, ...] = ("uv", "run", "pytest", "--tb=short", "-q")
    static_command: tuple[str, ...] = ("uv", "run", "mypy", "src/ouroboros", "--ignore-missing-imports")
    coverage_command: tuple[str, ...] = (
        "uv", "run", "pytest", "--cov=src/ouroboros", "--cov-report=term-missing", "-q"
    )
    timeout_seconds: int = 300
    working_dir: Path | None = None
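
# The defaults above assume a uv-managed checkout with sources under
# src/ouroboros. Overrides are just alternative frozen instances; a trimmed
# config might look like this (illustrative values, not shipped defaults):
#
#     MechanicalConfig(
#         lint_command=("ruff", "check", "."),
#         timeout_seconds=60,
#     )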


@dataclass(frozen=True, slots=True)
class CommandResult:
    """Result of running a shell command.

    Attributes:
        return_code: Exit code of the command
        stdout: Standard output
        stderr: Standard error
        timed_out: Whether the command timed out
    """

    return_code: int
    stdout: str
    stderr: str
    timed_out: bool = False


async def run_command(
    command: tuple[str, ...],
    timeout: int,
    working_dir: Path | None = None,
) -> CommandResult:
    """Run a shell command asynchronously.

    Args:
        command: Command and arguments to run
        timeout: Timeout in seconds
        working_dir: Working directory

    Returns:
        CommandResult with output and status
    """
    try:
        process = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=working_dir,
        )

        try:
            stdout, stderr = await asyncio.wait_for(
                process.communicate(),
                timeout=timeout,
            )
            return CommandResult(
                return_code=process.returncode or 0,
                stdout=stdout.decode("utf-8", errors="replace"),
                stderr=stderr.decode("utf-8", errors="replace"),
            )
        except TimeoutError:
            process.kill()
            await process.wait()
            return CommandResult(
                return_code=-1,
                stdout="",
                stderr="Command timed out",
                timed_out=True,
            )
    except FileNotFoundError as e:
        return CommandResult(
            return_code=-1,
            stdout="",
            stderr=f"Command not found: {e}",
        )
    except OSError as e:
        return CommandResult(
            return_code=-1,
            stdout="",
            stderr=f"OS error: {e}",
        )
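
# Sketch of the error contract (hypothetical command): the helper never
# raises for common failures; a missing binary, an OS error, or a timeout
# all surface as a CommandResult with return_code == -1.
#
#     result = await run_command(("ruff", "check", "."), timeout=60)
#     ok = result.return_code == 0 and not result.timed_out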


def parse_coverage_from_output(output: str) -> float | None:
    """Extract coverage percentage from pytest-cov output.

    Args:
        output: stdout from coverage command

    Returns:
        Coverage as float (0.0-1.0) or None if not found
    """
    # Look for "TOTAL ... XX%" pattern
    import re

    # Pattern matches lines like "TOTAL 1234 123 90%"
    pattern = r"TOTAL\s+\d+\s+\d+\s+(\d+)%"
    match = re.search(pattern, output)
    if match:
        return float(match.group(1)) / 100.0

    # Alternative pattern: "Coverage: XX%"
    alt_pattern = r"Coverage:\s*(\d+(?:\.\d+)?)%"
    alt_match = re.search(alt_pattern, output)
    if alt_match:
        return float(alt_match.group(1)) / 100.0

    return None
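
# Doctest-style sketch of the two accepted formats (illustrative values):
#
#     >>> parse_coverage_from_output("TOTAL    1234    123    90%")
#     0.9
#     >>> parse_coverage_from_output("Coverage: 87.5%")
#     0.875
#     >>> parse_coverage_from_output("no summary line") is None
#     True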


class MechanicalVerifier:
    """Stage 1 mechanical verification executor.

    Runs zero-cost automated checks on artifacts.
    Stateless - all state passed via parameters.

    Example:
        verifier = MechanicalVerifier(config)
        result = await verifier.verify(execution_id, checks=[CheckType.LINT, CheckType.TEST])
    """

    def __init__(self, config: MechanicalConfig | None = None) -> None:
        """Initialize verifier with configuration.

        Args:
            config: Verification configuration, uses defaults if None
        """
        self.config = config or MechanicalConfig()

    async def verify(
        self,
        execution_id: str,
        checks: list[CheckType] | None = None,
    ) -> Result[tuple[MechanicalResult, list[BaseEvent]], ValidationError]:
        """Run mechanical verification checks.

        Args:
            execution_id: Execution identifier for events
            checks: List of checks to run, defaults to all

        Returns:
            Result containing MechanicalResult and events, or error
        """
        if checks is None:
            checks = list(CheckType)

        events: list[BaseEvent] = []
        check_results: list[CheckResult] = []
        coverage_score: float | None = None

        # Emit start event
        events.append(
            create_stage1_started_event(
                execution_id=execution_id,
                checks_to_run=[c.value for c in checks],
            )
        )

        # Run each check
        for check_type in checks:
            result = await self._run_check(check_type)
            check_results.append(result)

            # Track coverage if it was a coverage check
            if check_type == CheckType.COVERAGE and result.passed:
                coverage_score = result.details.get("coverage_score")

        # Determine overall pass/fail
        all_passed = all(c.passed for c in check_results)

        # Verify coverage threshold if coverage was checked
        if coverage_score is not None and coverage_score < self.config.coverage_threshold:
            # Find and update coverage check to failed
            updated_results = []
            for cr in check_results:
                if cr.check_type == CheckType.COVERAGE:
                    updated_results.append(
                        CheckResult(
                            check_type=CheckType.COVERAGE,
                            passed=False,
                            message=f"Coverage {coverage_score:.1%} below threshold {self.config.coverage_threshold:.1%}",
                            details=cr.details,
                        )
                    )
                else:
                    updated_results.append(cr)
            check_results = updated_results
            all_passed = False

        mechanical_result = MechanicalResult(
            passed=all_passed,
            checks=tuple(check_results),
            coverage_score=coverage_score,
        )

        # Emit completion event
        events.append(
            create_stage1_completed_event(
                execution_id=execution_id,
                passed=all_passed,
                checks=[
                    {
                        "check_type": c.check_type.value,
                        "passed": c.passed,
                        "message": c.message,
                    }
                    for c in check_results
                ],
                coverage_score=coverage_score,
            )
        )

        return Result.ok((mechanical_result, events))
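
    # Worked example of the threshold override above (illustrative numbers):
    # a coverage run that exits 0 with "TOTAL ... 55%" passes as a raw check,
    # but 0.55 < 0.70 rewrites the COVERAGE entry to failed and forces
    # all_passed to False, so the NFR9 floor is enforced here, not in pytest.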

    async def _run_check(self, check_type: CheckType) -> CheckResult:
        """Run a single check.

        Args:
            check_type: Type of check to run

        Returns:
            CheckResult with pass/fail status
        """
        command = self._get_command_for_check(check_type)
        if command is None:
            return CheckResult(
                check_type=check_type,
                passed=True,
                message=f"Check {check_type.value} skipped (no command configured)",
                details={"skipped": True},
            )

        cmd_result = await run_command(
            command,
            timeout=self.config.timeout_seconds,
            working_dir=self.config.working_dir,
        )

        if cmd_result.timed_out:
            return CheckResult(
                check_type=check_type,
                passed=False,
                message=f"Check {check_type.value} timed out after {self.config.timeout_seconds}s",
                details={"timed_out": True},
            )

        passed = cmd_result.return_code == 0
        details: dict[str, Any] = {
            "return_code": cmd_result.return_code,
            "stdout_preview": cmd_result.stdout[:500] if cmd_result.stdout else "",
            "stderr_preview": cmd_result.stderr[:500] if cmd_result.stderr else "",
        }

        # Extract coverage if this was a coverage check
        if check_type == CheckType.COVERAGE and passed:
            coverage = parse_coverage_from_output(cmd_result.stdout)
            if coverage is not None:
                details["coverage_score"] = coverage

        message = (
            f"Check {check_type.value} passed"
            if passed
            else f"Check {check_type.value} failed (exit code {cmd_result.return_code})"
        )

        return CheckResult(
            check_type=check_type,
            passed=passed,
            message=message,
            details=details,
        )

    def _get_command_for_check(self, check_type: CheckType) -> tuple[str, ...] | None:
        """Get the command for a specific check type.

        Args:
            check_type: Type of check

        Returns:
            Command tuple or None if not configured
        """
        commands = {
            CheckType.LINT: self.config.lint_command,
            CheckType.BUILD: self.config.build_command,
            CheckType.TEST: self.config.test_command,
            CheckType.STATIC: self.config.static_command,
            CheckType.COVERAGE: self.config.coverage_command,
        }
        return commands.get(check_type)


async def run_mechanical_verification(
    execution_id: str,
    config: MechanicalConfig | None = None,
    checks: list[CheckType] | None = None,
) -> Result[tuple[MechanicalResult, list[BaseEvent]], ValidationError]:
    """Convenience function for running mechanical verification.

    Args:
        execution_id: Execution identifier
        config: Optional configuration
        checks: Optional list of checks to run

    Returns:
        Result with MechanicalResult and events
    """
    verifier = MechanicalVerifier(config)
    return await verifier.verify(execution_id, checks)
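
A minimal end-to-end sketch of this module (not part of the wheel; the execution id and entry point are placeholders, and Result accessors beyond Result.ok live in ouroboros.core.types rather than in this diff):

import asyncio

from ouroboros.evaluation.mechanical import run_mechanical_verification
from ouroboros.evaluation.models import CheckType

async def main() -> None:
    # Runs lint and tests with the default MechanicalConfig commands.
    result = await run_mechanical_verification(
        execution_id="exec-001",
        checks=[CheckType.LINT, CheckType.TEST],
    )
    print(result)

asyncio.run(main())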
ouroboros/evaluation/models.py
@@ -0,0 +1,235 @@
"""Data models for the evaluation pipeline.

This module defines immutable data structures for all three evaluation stages.
All models use frozen dataclasses with slots for immutability and performance.

Classes:
    CheckType: Enum of mechanical check types
    CheckResult: Single mechanical check result
    MechanicalResult: Aggregated Stage 1 results
    SemanticResult: Stage 2 LLM evaluation results
    Vote: Single model vote in consensus
    ConsensusResult: Aggregated Stage 3 results
    EvaluationContext: Input context for evaluation
    EvaluationResult: Complete pipeline output
"""

from dataclasses import dataclass, field
from enum import StrEnum
from typing import Any

from ouroboros.events.base import BaseEvent


class CheckType(StrEnum):
    """Types of mechanical checks in Stage 1.

    Attributes:
        LINT: Code style and formatting checks
        BUILD: Compilation and build validation
        TEST: Unit and integration test execution
        STATIC: Static analysis (type checking, etc.)
        COVERAGE: Test coverage threshold verification
    """

    LINT = "lint"
    BUILD = "build"
    TEST = "test"
    STATIC = "static"
    COVERAGE = "coverage"


@dataclass(frozen=True, slots=True)
class CheckResult:
    """Result of a single mechanical check.

    Attributes:
        check_type: Type of check performed
        passed: Whether the check passed
        message: Human-readable result message
        details: Additional check-specific details
    """

    check_type: CheckType
    passed: bool
    message: str
    details: dict[str, Any] = field(default_factory=dict)


@dataclass(frozen=True, slots=True)
class MechanicalResult:
    """Aggregated result of Stage 1 mechanical verification.

    All checks must pass for the overall result to pass.
    Coverage score is tracked separately for NFR9 compliance.

    Attributes:
        passed: True if all checks passed
        checks: Tuple of individual check results
        coverage_score: Test coverage as a fraction (0.0-1.0), None if not measured
    """

    passed: bool
    checks: tuple[CheckResult, ...]
    coverage_score: float | None = None

    @property
    def failed_checks(self) -> tuple[CheckResult, ...]:
        """Return only the checks that failed."""
        return tuple(c for c in self.checks if not c.passed)


@dataclass(frozen=True, slots=True)
class SemanticResult:
    """Result of Stage 2 semantic evaluation.

    Uses LLM to evaluate AC compliance, goal alignment, and drift.
    Uncertainty score determines if Stage 3 consensus is needed.

    Attributes:
        score: Overall evaluation score (0.0-1.0)
        ac_compliance: Whether acceptance criteria are met
        goal_alignment: Alignment with original goal (0.0-1.0)
        drift_score: Deviation from seed intent (0.0-1.0, lower is better)
        uncertainty: Model uncertainty about evaluation (0.0-1.0)
        reasoning: Explanation of the evaluation
    """

    score: float
    ac_compliance: bool
    goal_alignment: float
    drift_score: float
    uncertainty: float
    reasoning: str

    def __post_init__(self) -> None:
        """Validate score ranges."""
        for attr in ("score", "goal_alignment", "drift_score", "uncertainty"):
            value = getattr(self, attr)
            if not 0.0 <= value <= 1.0:
                msg = f"{attr} must be between 0.0 and 1.0, got {value}"
                raise ValueError(msg)
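
# Validation is eager (illustrative): SemanticResult(score=1.2, ...) raises
# ValueError at construction, so later stages never see an out-of-range score.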


@dataclass(frozen=True, slots=True)
class Vote:
    """Single model vote in Stage 3 consensus.

    Attributes:
        model: Model identifier that cast the vote
        approved: Whether the model approves the output
        confidence: Model's confidence in its decision (0.0-1.0)
        reasoning: Explanation of the vote
    """

    model: str
    approved: bool
    confidence: float
    reasoning: str

    def __post_init__(self) -> None:
        """Validate confidence range."""
        if not 0.0 <= self.confidence <= 1.0:
            msg = f"confidence must be between 0.0 and 1.0, got {self.confidence}"
            raise ValueError(msg)


@dataclass(frozen=True, slots=True)
class ConsensusResult:
    """Aggregated result of Stage 3 multi-model consensus.

    Requires 2/3 majority for approval with minimum 3 models.

    Attributes:
        approved: True if consensus reached approval
        votes: Tuple of individual model votes
        majority_ratio: Ratio of approving votes (0.0-1.0)
        disagreements: Tuple of reasoning strings from dissenting votes
    """

    approved: bool
    votes: tuple[Vote, ...]
    majority_ratio: float
    disagreements: tuple[str, ...] = ()

    @property
    def approving_votes(self) -> int:
        """Count of votes that approved."""
        return sum(1 for v in self.votes if v.approved)

    @property
    def total_votes(self) -> int:
        """Total number of votes cast."""
        return len(self.votes)
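
# Illustrative reading of the documented 2/3 rule: 2 approving votes of 3
# gives majority_ratio ~0.67 and approval; 1 of 3 gives ~0.33 and rejection.
# The rule itself is enforced by the consensus module, not by this dataclass.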


@dataclass(frozen=True, slots=True)
class EvaluationContext:
    """Input context for the evaluation pipeline.

    Attributes:
        execution_id: Unique identifier for the execution
        seed_id: Identifier of the seed being evaluated against
        current_ac: The acceptance criterion being evaluated
        artifact: The output artifact to evaluate
        artifact_type: Type of artifact (code, document, etc.)
        goal: Original goal from seed
        constraints: Constraints from seed
    """

    execution_id: str
    seed_id: str
    current_ac: str
    artifact: str
    artifact_type: str = "code"
    goal: str = ""
    constraints: tuple[str, ...] = ()


@dataclass(frozen=True, slots=True)
class EvaluationResult:
    """Complete evaluation pipeline result.

    Contains results from all stages that were executed,
    final approval status, and generated events for audit trail.

    Attributes:
        execution_id: Execution identifier for tracing
        stage1_result: Mechanical verification result (if executed)
        stage2_result: Semantic evaluation result (if executed)
        stage3_result: Consensus result (if triggered)
        final_approved: Overall approval status
        events: List of events generated during evaluation
    """

    execution_id: str
    stage1_result: MechanicalResult | None = None
    stage2_result: SemanticResult | None = None
    stage3_result: ConsensusResult | None = None
    final_approved: bool = False
    events: list[BaseEvent] = field(default_factory=list)

    @property
    def highest_stage_completed(self) -> int:
        """Return the highest stage number that completed."""
        if self.stage3_result is not None:
            return 3
        if self.stage2_result is not None:
            return 2
        if self.stage1_result is not None:
            return 1
        return 0

    @property
    def failure_reason(self) -> str | None:
        """Return the reason for failure, if any."""
        if self.final_approved:
            return None
        if self.stage1_result and not self.stage1_result.passed:
            failed = self.stage1_result.failed_checks
            return f"Stage 1 failed: {', '.join(c.check_type for c in failed)}"
        if self.stage2_result and not self.stage2_result.ac_compliance:
            return f"Stage 2 failed: AC non-compliance (score={self.stage2_result.score:.2f})"
        if self.stage3_result and not self.stage3_result.approved:
            return f"Stage 3 failed: Consensus not reached ({self.stage3_result.majority_ratio:.0%})"
        return "Unknown failure"