ouroboros-ai 0.1.0 (ouroboros_ai-0.1.0-py3-none-any.whl)
This diff shows the contents of a publicly available package version as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Note: the registry marks this release of ouroboros-ai as potentially problematic.
- ouroboros/__init__.py +15 -0
- ouroboros/__main__.py +9 -0
- ouroboros/bigbang/__init__.py +39 -0
- ouroboros/bigbang/ambiguity.py +464 -0
- ouroboros/bigbang/interview.py +530 -0
- ouroboros/bigbang/seed_generator.py +610 -0
- ouroboros/cli/__init__.py +9 -0
- ouroboros/cli/commands/__init__.py +7 -0
- ouroboros/cli/commands/config.py +79 -0
- ouroboros/cli/commands/init.py +425 -0
- ouroboros/cli/commands/run.py +201 -0
- ouroboros/cli/commands/status.py +85 -0
- ouroboros/cli/formatters/__init__.py +31 -0
- ouroboros/cli/formatters/panels.py +157 -0
- ouroboros/cli/formatters/progress.py +112 -0
- ouroboros/cli/formatters/tables.py +166 -0
- ouroboros/cli/main.py +60 -0
- ouroboros/config/__init__.py +81 -0
- ouroboros/config/loader.py +292 -0
- ouroboros/config/models.py +332 -0
- ouroboros/core/__init__.py +62 -0
- ouroboros/core/ac_tree.py +401 -0
- ouroboros/core/context.py +472 -0
- ouroboros/core/errors.py +246 -0
- ouroboros/core/seed.py +212 -0
- ouroboros/core/types.py +205 -0
- ouroboros/evaluation/__init__.py +110 -0
- ouroboros/evaluation/consensus.py +350 -0
- ouroboros/evaluation/mechanical.py +351 -0
- ouroboros/evaluation/models.py +235 -0
- ouroboros/evaluation/pipeline.py +286 -0
- ouroboros/evaluation/semantic.py +302 -0
- ouroboros/evaluation/trigger.py +278 -0
- ouroboros/events/__init__.py +5 -0
- ouroboros/events/base.py +80 -0
- ouroboros/events/decomposition.py +153 -0
- ouroboros/events/evaluation.py +248 -0
- ouroboros/execution/__init__.py +44 -0
- ouroboros/execution/atomicity.py +451 -0
- ouroboros/execution/decomposition.py +481 -0
- ouroboros/execution/double_diamond.py +1386 -0
- ouroboros/execution/subagent.py +275 -0
- ouroboros/observability/__init__.py +63 -0
- ouroboros/observability/drift.py +383 -0
- ouroboros/observability/logging.py +504 -0
- ouroboros/observability/retrospective.py +338 -0
- ouroboros/orchestrator/__init__.py +78 -0
- ouroboros/orchestrator/adapter.py +391 -0
- ouroboros/orchestrator/events.py +278 -0
- ouroboros/orchestrator/runner.py +597 -0
- ouroboros/orchestrator/session.py +486 -0
- ouroboros/persistence/__init__.py +23 -0
- ouroboros/persistence/checkpoint.py +511 -0
- ouroboros/persistence/event_store.py +183 -0
- ouroboros/persistence/migrations/__init__.py +1 -0
- ouroboros/persistence/migrations/runner.py +100 -0
- ouroboros/persistence/migrations/scripts/001_initial.sql +20 -0
- ouroboros/persistence/schema.py +56 -0
- ouroboros/persistence/uow.py +230 -0
- ouroboros/providers/__init__.py +28 -0
- ouroboros/providers/base.py +133 -0
- ouroboros/providers/claude_code_adapter.py +212 -0
- ouroboros/providers/litellm_adapter.py +316 -0
- ouroboros/py.typed +0 -0
- ouroboros/resilience/__init__.py +67 -0
- ouroboros/resilience/lateral.py +595 -0
- ouroboros/resilience/stagnation.py +727 -0
- ouroboros/routing/__init__.py +60 -0
- ouroboros/routing/complexity.py +272 -0
- ouroboros/routing/downgrade.py +664 -0
- ouroboros/routing/escalation.py +340 -0
- ouroboros/routing/router.py +204 -0
- ouroboros/routing/tiers.py +247 -0
- ouroboros/secondary/__init__.py +40 -0
- ouroboros/secondary/scheduler.py +467 -0
- ouroboros/secondary/todo_registry.py +483 -0
- ouroboros_ai-0.1.0.dist-info/METADATA +607 -0
- ouroboros_ai-0.1.0.dist-info/RECORD +81 -0
- ouroboros_ai-0.1.0.dist-info/WHEEL +4 -0
- ouroboros_ai-0.1.0.dist-info/entry_points.txt +2 -0
- ouroboros_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
ouroboros/evaluation/__init__.py
@@ -0,0 +1,110 @@
+"""Three-stage evaluation pipeline for Ouroboros.
+
+This module provides the evaluation infrastructure for verifying outputs
+through three progressive stages:
+
+1. Stage 1 - Mechanical Verification ($0): Lint, build, test, static analysis
+2. Stage 2 - Semantic Evaluation (Standard tier): AC compliance, goal alignment
+3. Stage 3 - Multi-Model Consensus (Frontier tier): 3-model voting
+
+Classes:
+    CheckResult: Result of a single mechanical check
+    CheckType: Types of mechanical checks
+    MechanicalResult: Aggregated Stage 1 results
+    SemanticResult: Stage 2 LLM evaluation results
+    Vote: Single model vote in consensus
+    ConsensusResult: Aggregated Stage 3 results
+    EvaluationResult: Complete pipeline result
+    EvaluationContext: Input context for evaluation
+    MechanicalVerifier: Stage 1 checker
+    MechanicalConfig: Stage 1 configuration
+    SemanticEvaluator: Stage 2 evaluator
+    SemanticConfig: Stage 2 configuration
+    ConsensusEvaluator: Stage 3 consensus builder
+    ConsensusConfig: Stage 3 configuration
+    ConsensusTrigger: Trigger matrix implementation
+    TriggerType: Types of consensus triggers
+    TriggerContext: Context for trigger evaluation
+    TriggerResult: Result of trigger evaluation
+    TriggerConfig: Trigger thresholds
+    EvaluationPipeline: Full pipeline orchestrator
+    PipelineConfig: Pipeline configuration
+"""
+
+from ouroboros.evaluation.consensus import (
+    DEFAULT_CONSENSUS_MODELS,
+    ConsensusConfig,
+    ConsensusEvaluator,
+    run_consensus_evaluation,
+)
+from ouroboros.evaluation.mechanical import (
+    MechanicalConfig,
+    MechanicalVerifier,
+    run_mechanical_verification,
+)
+from ouroboros.evaluation.models import (
+    CheckResult,
+    CheckType,
+    ConsensusResult,
+    EvaluationContext,
+    EvaluationResult,
+    MechanicalResult,
+    SemanticResult,
+    Vote,
+)
+from ouroboros.evaluation.pipeline import (
+    EvaluationPipeline,
+    PipelineConfig,
+    run_evaluation_pipeline,
+)
+from ouroboros.evaluation.semantic import (
+    DEFAULT_SEMANTIC_MODEL,
+    SemanticConfig,
+    SemanticEvaluator,
+    run_semantic_evaluation,
+)
+from ouroboros.evaluation.trigger import (
+    ConsensusTrigger,
+    TriggerConfig,
+    TriggerContext,
+    TriggerResult,
+    TriggerType,
+    check_consensus_trigger,
+)
+
+__all__ = [
+    # Models
+    "CheckResult",
+    "CheckType",
+    "ConsensusResult",
+    "EvaluationContext",
+    "EvaluationResult",
+    "MechanicalResult",
+    "SemanticResult",
+    "Vote",
+    # Stage 1
+    "MechanicalConfig",
+    "MechanicalVerifier",
+    "run_mechanical_verification",
+    # Stage 2
+    "DEFAULT_SEMANTIC_MODEL",
+    "SemanticConfig",
+    "SemanticEvaluator",
+    "run_semantic_evaluation",
+    # Stage 3
+    "DEFAULT_CONSENSUS_MODELS",
+    "ConsensusConfig",
+    "ConsensusEvaluator",
+    "run_consensus_evaluation",
+    # Trigger
+    "ConsensusTrigger",
+    "TriggerConfig",
+    "TriggerContext",
+    "TriggerResult",
+    "TriggerType",
+    "check_consensus_trigger",
+    # Pipeline
+    "EvaluationPipeline",
+    "PipelineConfig",
+    "run_evaluation_pipeline",
+]
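As a usage sketch only, not part of the packaged code: the Stage 3 exports above can be driven through run_consensus_evaluation, whose signature appears in consensus.py below. The EvaluationContext field names here are inferred from how build_consensus_prompt and ConsensusEvaluator.evaluate read the context (execution_id, current_ac, goal, constraints, artifact, artifact_type); the real constructor may take additional or differently typed fields, and the LiteLLMAdapter instance is assumed to be configured elsewhere.

from ouroboros.evaluation import ConsensusConfig, EvaluationContext, run_consensus_evaluation
from ouroboros.providers.litellm_adapter import LiteLLMAdapter


async def review_artifact(adapter: LiteLLMAdapter, diff_text: str) -> bool:
    # Field names below are inferred from build_consensus_prompt(); the actual
    # EvaluationContext model may require more fields than shown here.
    context = EvaluationContext(
        execution_id="exec-001",
        current_ac="CLI exits non-zero when the config file is missing",
        goal="Robust configuration loading",
        constraints=("No new dependencies",),
        artifact=diff_text,
        artifact_type="diff",
    )
    result = await run_consensus_evaluation(
        context,
        adapter,
        trigger_reason="manual",
        config=ConsensusConfig(max_tokens=512),
    )
    if result.is_err:
        # Result exposes .error on failure, mirroring its use in consensus.py.
        raise RuntimeError(str(result.error))
    consensus, events = result.value  # (ConsensusResult, list[BaseEvent])
    return consensus.approved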
ouroboros/evaluation/consensus.py
@@ -0,0 +1,350 @@
+"""Stage 3: Multi-Model Consensus.
+
+Multi-model voting using Frontier tier:
+- 3 different models evaluate independently
+- 2/3 majority required for approval
+- Disagreements are logged with reasoning
+
+The ConsensusEvaluator uses multiple LLM models for diverse verification.
+"""
+
+import asyncio
+from dataclasses import dataclass
+import json
+
+from ouroboros.core.errors import ProviderError, ValidationError
+from ouroboros.core.types import Result
+from ouroboros.evaluation.models import ConsensusResult, EvaluationContext, Vote
+from ouroboros.events.base import BaseEvent
+from ouroboros.events.evaluation import (
+    create_stage3_completed_event,
+    create_stage3_started_event,
+)
+from ouroboros.providers.base import CompletionConfig, Message, MessageRole
+from ouroboros.providers.litellm_adapter import LiteLLMAdapter
+
+# Default models for consensus voting (Frontier tier)
+# Can be overridden via ConsensusConfig.models
+DEFAULT_CONSENSUS_MODELS: tuple[str, ...] = (
+    "openrouter/openai/gpt-4o",
+    "openrouter/anthropic/claude-sonnet-4-20250514",
+    "openrouter/google/gemini-2.5-pro",
+)
+
+
+@dataclass(frozen=True, slots=True)
+class ConsensusConfig:
+    """Configuration for consensus evaluation.
+
+    Attributes:
+        models: List of models to use for voting (at least 3)
+        temperature: Sampling temperature
+        max_tokens: Maximum tokens per response
+        majority_threshold: Required majority ratio (default 2/3)
+        diversity_required: Require different providers
+    """
+
+    models: tuple[str, ...] = DEFAULT_CONSENSUS_MODELS
+    temperature: float = 0.3
+    max_tokens: int = 1024
+    majority_threshold: float = 0.66  # 2/3 = 0.6666...
+    diversity_required: bool = True
+
+
+CONSENSUS_SYSTEM_PROMPT = """You are a senior code reviewer participating in a consensus evaluation. Your vote will be combined with other reviewers to reach a decision.
+
+You must respond ONLY with a valid JSON object in the following exact format:
+{
+  "approved": <boolean>,
+  "confidence": <float between 0.0 and 1.0>,
+  "reasoning": "<string explaining your vote>"
+}
+
+Evaluation criteria for approval:
+- The artifact correctly implements the acceptance criterion
+- The implementation aligns with the stated goal
+- No significant issues or concerns
+- Code quality is acceptable
+
+Be honest and thorough. If you have concerns, vote against approval with clear reasoning.
+Confidence should reflect how certain you are about your decision."""
+
+
+def build_consensus_prompt(context: EvaluationContext) -> str:
+    """Build the user prompt for consensus voting.
+
+    Args:
+        context: Evaluation context
+
+    Returns:
+        Formatted prompt string
+    """
+    constraints_text = "\n".join(f"- {c}" for c in context.constraints) if context.constraints else "None"
+
+    return f"""Review the following artifact for consensus approval:
+
+## Acceptance Criterion
+{context.current_ac}
+
+## Original Goal
+{context.goal if context.goal else "Not specified"}
+
+## Constraints
+{constraints_text}
+
+## Artifact ({context.artifact_type})
+```
+{context.artifact}
+```
+
+Cast your vote as a JSON object with: approved (boolean), confidence (0-1), and reasoning."""
+
+
+def extract_json_payload(text: str) -> str | None:
+    """Extract JSON object from text using index-based approach.
+
+    More reliable than regex for handling nested braces in code snippets.
+
+    Args:
+        text: Raw text potentially containing JSON
+
+    Returns:
+        Extracted JSON string or None if not found
+    """
+    start = text.find("{")
+    end = text.rfind("}")
+    if start != -1 and end != -1 and end > start:
+        return text[start : end + 1]
+    return None
+
+
+def parse_vote_response(response_text: str, model: str) -> Result[Vote, ValidationError]:
+    """Parse LLM response into Vote.
+
+    Args:
+        response_text: Raw LLM response
+        model: Model that cast the vote
+
+    Returns:
+        Result containing Vote or ValidationError
+    """
+    # Extract JSON using index-based approach (handles nested braces)
+    json_str = extract_json_payload(response_text)
+
+    if not json_str:
+        return Result.err(
+            ValidationError(
+                f"Could not find JSON in vote from {model}",
+                field="response",
+                value=response_text[:100],
+            )
+        )
+
+    try:
+        data = json.loads(json_str)
+    except json.JSONDecodeError as e:
+        return Result.err(
+            ValidationError(
+                f"Invalid JSON in vote from {model}: {e}",
+                field="response",
+            )
+        )
+
+    # Validate required fields
+    if "approved" not in data:
+        return Result.err(
+            ValidationError(
+                f"Missing 'approved' field in vote from {model}",
+                field="approved",
+            )
+        )
+
+    try:
+        confidence = max(0.0, min(1.0, float(data.get("confidence", 0.5))))
+        return Result.ok(
+            Vote(
+                model=model,
+                approved=bool(data["approved"]),
+                confidence=confidence,
+                reasoning=str(data.get("reasoning", "No reasoning provided")),
+            )
+        )
+    except (TypeError, ValueError) as e:
+        return Result.err(
+            ValidationError(
+                f"Invalid field types in vote from {model}: {e}",
+                field="response",
+            )
+        )
+
+
+class ConsensusEvaluator:
+    """Stage 3 multi-model consensus evaluator.
+
+    Uses multiple Frontier tier models for diverse verification.
+    Requires 2/3 majority for approval.
+
+    Example:
+        evaluator = ConsensusEvaluator(llm_adapter)
+        result = await evaluator.evaluate(context, trigger_reason)
+    """
+
+    def __init__(
+        self,
+        llm_adapter: LiteLLMAdapter,
+        config: ConsensusConfig | None = None,
+    ) -> None:
+        """Initialize evaluator.
+
+        Args:
+            llm_adapter: LLM adapter for completions
+            config: Consensus configuration
+        """
+        self._llm = llm_adapter
+        self._config = config or ConsensusConfig()
+
+    async def evaluate(
+        self,
+        context: EvaluationContext,
+        trigger_reason: str = "manual",
+    ) -> Result[tuple[ConsensusResult, list[BaseEvent]], ProviderError | ValidationError]:
+        """Run consensus evaluation with multiple models.
+
+        Args:
+            context: Evaluation context
+            trigger_reason: Why consensus was triggered
+
+        Returns:
+            Result containing ConsensusResult and events, or error
+        """
+        events: list[BaseEvent] = []
+        models = list(self._config.models)
+
+        # Emit start event
+        events.append(
+            create_stage3_started_event(
+                execution_id=context.execution_id,
+                models=models,
+                trigger_reason=trigger_reason,
+            )
+        )
+
+        # Build messages
+        messages = [
+            Message(role=MessageRole.SYSTEM, content=CONSENSUS_SYSTEM_PROMPT),
+            Message(role=MessageRole.USER, content=build_consensus_prompt(context)),
+        ]
+
+        # Collect votes from all models concurrently
+        vote_tasks = [
+            self._get_vote(messages, model)
+            for model in models
+        ]
+        vote_results = await asyncio.gather(*vote_tasks, return_exceptions=True)
+
+        # Process results
+        votes: list[Vote] = []
+        errors: list[str] = []
+
+        for model, result in zip(models, vote_results, strict=True):
+            if isinstance(result, Exception):
+                errors.append(f"{model}: {result}")
+                continue
+            if result.is_err:
+                errors.append(f"{model}: {result.error.message}")
+                continue
+            votes.append(result.value)
+
+        # Need at least 2 votes to proceed
+        if len(votes) < 2:
+            return Result.err(
+                ValidationError(
+                    f"Not enough votes collected: {len(votes)}/3",
+                    details={"errors": errors},
+                )
+            )
+
+        # Calculate consensus
+        approving = sum(1 for v in votes if v.approved)
+        majority_ratio = approving / len(votes)
+        approved = majority_ratio >= self._config.majority_threshold
+
+        # Collect disagreements (reasoning from dissenting votes)
+        disagreements = tuple(
+            v.reasoning for v in votes if v.approved != approved
+        )
+
+        consensus_result = ConsensusResult(
+            approved=approved,
+            votes=tuple(votes),
+            majority_ratio=majority_ratio,
+            disagreements=disagreements,
+        )
+
+        # Emit completion event
+        events.append(
+            create_stage3_completed_event(
+                execution_id=context.execution_id,
+                approved=approved,
+                votes=[
+                    {
+                        "model": v.model,
+                        "approved": v.approved,
+                        "confidence": v.confidence,
+                        "reasoning": v.reasoning,
+                    }
+                    for v in votes
+                ],
+                majority_ratio=majority_ratio,
+                disagreements=list(disagreements),
+            )
+        )
+
+        return Result.ok((consensus_result, events))
+
+    async def _get_vote(
+        self,
+        messages: list[Message],
+        model: str,
+    ) -> Result[Vote, ProviderError | ValidationError]:
+        """Get a single vote from a model.
+
+        Args:
+            messages: Prompt messages
+            model: Model to query
+
+        Returns:
+            Result containing Vote or error
+        """
+        config = CompletionConfig(
+            model=model,
+            temperature=self._config.temperature,
+            max_tokens=self._config.max_tokens,
+        )
+
+        llm_result = await self._llm.complete(messages, config)
+        if llm_result.is_err:
+            return Result.err(llm_result.error)
+
+        return parse_vote_response(llm_result.value.content, model)
+
+
+async def run_consensus_evaluation(
+    context: EvaluationContext,
+    llm_adapter: LiteLLMAdapter,
+    trigger_reason: str = "manual",
+    config: ConsensusConfig | None = None,
+) -> Result[tuple[ConsensusResult, list[BaseEvent]], ProviderError | ValidationError]:
+    """Convenience function for running consensus evaluation.
+
+    Args:
+        context: Evaluation context
+        llm_adapter: LLM adapter
+        trigger_reason: Why consensus was triggered
+        config: Optional configuration
+
+    Returns:
+        Result with ConsensusResult and events
+    """
+    evaluator = ConsensusEvaluator(llm_adapter, config)
+    return await evaluator.evaluate(context, trigger_reason)
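A small offline illustration of the voting arithmetic above, again not part of the package: parse_vote_response and the 0.66 majority_threshold can be exercised with fabricated replies and no provider calls. The model names and reply strings are made up for the example; everything else uses the functions and fields exactly as they appear in consensus.py.

from ouroboros.evaluation.consensus import ConsensusConfig, parse_vote_response

# Fabricated model replies: two approvals, one rejection. The second reply has
# chatter around the JSON, which extract_json_payload tolerates.
raw_replies = {
    "model-a": '{"approved": true, "confidence": 0.9, "reasoning": "AC satisfied"}',
    "model-b": 'Sure! {"approved": true, "confidence": 0.7, "reasoning": "Looks correct"}',
    "model-c": '{"approved": false, "confidence": 0.6, "reasoning": "Edge case unhandled"}',
}

votes = []
for model, reply in raw_replies.items():
    parsed = parse_vote_response(reply, model)  # Result[Vote, ValidationError]
    if not parsed.is_err:
        votes.append(parsed.value)

config = ConsensusConfig()
majority_ratio = sum(1 for v in votes if v.approved) / len(votes)  # 2/3 ~= 0.667
approved = majority_ratio >= config.majority_threshold             # 0.667 >= 0.66 -> True
dissent = [v.reasoning for v in votes if v.approved != approved]   # ["Edge case unhandled"]
print(approved, round(majority_ratio, 3), dissent)

With only two votes collected (say one model errors out), a 1-1 split gives a ratio of 0.5 and fails the threshold, while 2-0 passes; the evaluator itself refuses to decide with fewer than two votes.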