ouroboros-ai 0.1.0 (ouroboros_ai-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ouroboros-ai might be problematic.
- ouroboros/__init__.py +15 -0
- ouroboros/__main__.py +9 -0
- ouroboros/bigbang/__init__.py +39 -0
- ouroboros/bigbang/ambiguity.py +464 -0
- ouroboros/bigbang/interview.py +530 -0
- ouroboros/bigbang/seed_generator.py +610 -0
- ouroboros/cli/__init__.py +9 -0
- ouroboros/cli/commands/__init__.py +7 -0
- ouroboros/cli/commands/config.py +79 -0
- ouroboros/cli/commands/init.py +425 -0
- ouroboros/cli/commands/run.py +201 -0
- ouroboros/cli/commands/status.py +85 -0
- ouroboros/cli/formatters/__init__.py +31 -0
- ouroboros/cli/formatters/panels.py +157 -0
- ouroboros/cli/formatters/progress.py +112 -0
- ouroboros/cli/formatters/tables.py +166 -0
- ouroboros/cli/main.py +60 -0
- ouroboros/config/__init__.py +81 -0
- ouroboros/config/loader.py +292 -0
- ouroboros/config/models.py +332 -0
- ouroboros/core/__init__.py +62 -0
- ouroboros/core/ac_tree.py +401 -0
- ouroboros/core/context.py +472 -0
- ouroboros/core/errors.py +246 -0
- ouroboros/core/seed.py +212 -0
- ouroboros/core/types.py +205 -0
- ouroboros/evaluation/__init__.py +110 -0
- ouroboros/evaluation/consensus.py +350 -0
- ouroboros/evaluation/mechanical.py +351 -0
- ouroboros/evaluation/models.py +235 -0
- ouroboros/evaluation/pipeline.py +286 -0
- ouroboros/evaluation/semantic.py +302 -0
- ouroboros/evaluation/trigger.py +278 -0
- ouroboros/events/__init__.py +5 -0
- ouroboros/events/base.py +80 -0
- ouroboros/events/decomposition.py +153 -0
- ouroboros/events/evaluation.py +248 -0
- ouroboros/execution/__init__.py +44 -0
- ouroboros/execution/atomicity.py +451 -0
- ouroboros/execution/decomposition.py +481 -0
- ouroboros/execution/double_diamond.py +1386 -0
- ouroboros/execution/subagent.py +275 -0
- ouroboros/observability/__init__.py +63 -0
- ouroboros/observability/drift.py +383 -0
- ouroboros/observability/logging.py +504 -0
- ouroboros/observability/retrospective.py +338 -0
- ouroboros/orchestrator/__init__.py +78 -0
- ouroboros/orchestrator/adapter.py +391 -0
- ouroboros/orchestrator/events.py +278 -0
- ouroboros/orchestrator/runner.py +597 -0
- ouroboros/orchestrator/session.py +486 -0
- ouroboros/persistence/__init__.py +23 -0
- ouroboros/persistence/checkpoint.py +511 -0
- ouroboros/persistence/event_store.py +183 -0
- ouroboros/persistence/migrations/__init__.py +1 -0
- ouroboros/persistence/migrations/runner.py +100 -0
- ouroboros/persistence/migrations/scripts/001_initial.sql +20 -0
- ouroboros/persistence/schema.py +56 -0
- ouroboros/persistence/uow.py +230 -0
- ouroboros/providers/__init__.py +28 -0
- ouroboros/providers/base.py +133 -0
- ouroboros/providers/claude_code_adapter.py +212 -0
- ouroboros/providers/litellm_adapter.py +316 -0
- ouroboros/py.typed +0 -0
- ouroboros/resilience/__init__.py +67 -0
- ouroboros/resilience/lateral.py +595 -0
- ouroboros/resilience/stagnation.py +727 -0
- ouroboros/routing/__init__.py +60 -0
- ouroboros/routing/complexity.py +272 -0
- ouroboros/routing/downgrade.py +664 -0
- ouroboros/routing/escalation.py +340 -0
- ouroboros/routing/router.py +204 -0
- ouroboros/routing/tiers.py +247 -0
- ouroboros/secondary/__init__.py +40 -0
- ouroboros/secondary/scheduler.py +467 -0
- ouroboros/secondary/todo_registry.py +483 -0
- ouroboros_ai-0.1.0.dist-info/METADATA +607 -0
- ouroboros_ai-0.1.0.dist-info/RECORD +81 -0
- ouroboros_ai-0.1.0.dist-info/WHEEL +4 -0
- ouroboros_ai-0.1.0.dist-info/entry_points.txt +2 -0
- ouroboros_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
ouroboros/evaluation/pipeline.py
@@ -0,0 +1,286 @@
"""Evaluation Pipeline Orchestrator.

Orchestrates the three-stage evaluation pipeline:
1. Stage 1: Mechanical Verification ($0)
2. Stage 2: Semantic Evaluation (Standard tier)
3. Stage 3: Multi-Model Consensus (Frontier tier, if triggered)

The pipeline respects configuration flags and trigger conditions.
"""

from dataclasses import dataclass

from ouroboros.core.errors import ProviderError, ValidationError
from ouroboros.core.types import Result
from ouroboros.evaluation.consensus import ConsensusConfig, ConsensusEvaluator
from ouroboros.evaluation.mechanical import (
    MechanicalConfig,
    MechanicalVerifier,
)
from ouroboros.evaluation.models import (
    CheckType,
    EvaluationContext,
    EvaluationResult,
)
from ouroboros.evaluation.semantic import SemanticConfig, SemanticEvaluator
from ouroboros.evaluation.trigger import (
    ConsensusTrigger,
    TriggerConfig,
    TriggerContext,
)
from ouroboros.events.base import BaseEvent
from ouroboros.events.evaluation import create_pipeline_completed_event
from ouroboros.providers.litellm_adapter import LiteLLMAdapter


@dataclass(frozen=True, slots=True)
class PipelineConfig:
    """Configuration for the evaluation pipeline.

    Attributes:
        stage1_enabled: Run mechanical verification
        stage2_enabled: Run semantic evaluation
        stage3_enabled: Allow consensus if triggered
        mechanical: Stage 1 configuration
        semantic: Stage 2 configuration
        consensus: Stage 3 configuration
        trigger: Trigger matrix configuration
    """

    stage1_enabled: bool = True
    stage2_enabled: bool = True
    stage3_enabled: bool = True
    mechanical: MechanicalConfig | None = None
    semantic: SemanticConfig | None = None
    consensus: ConsensusConfig | None = None
    trigger: TriggerConfig | None = None


class EvaluationPipeline:
    """Orchestrates the three-stage evaluation pipeline.

    Runs stages sequentially, respecting configuration and triggers.
    Stage 3 is only run if trigger conditions are met.

    Example:
        pipeline = EvaluationPipeline(llm_adapter, config)
        result = await pipeline.evaluate(context)
    """

    def __init__(
        self,
        llm_adapter: LiteLLMAdapter,
        config: PipelineConfig | None = None,
    ) -> None:
        """Initialize pipeline.

        Args:
            llm_adapter: LLM adapter for semantic and consensus
            config: Pipeline configuration
        """
        self._llm = llm_adapter
        self._config = config or PipelineConfig()

        # Initialize stage evaluators
        self._mechanical = MechanicalVerifier(self._config.mechanical)
        self._semantic = SemanticEvaluator(llm_adapter, self._config.semantic)
        self._consensus = ConsensusEvaluator(llm_adapter, self._config.consensus)
        self._trigger = ConsensusTrigger(self._config.trigger)

    async def evaluate(
        self,
        context: EvaluationContext,
        trigger_context: TriggerContext | None = None,
    ) -> Result[EvaluationResult, ProviderError | ValidationError]:
        """Run the evaluation pipeline.

        Args:
            context: Evaluation context with artifact
            trigger_context: Optional pre-populated trigger context

        Returns:
            Result containing EvaluationResult or error
        """
        events: list[BaseEvent] = []
        stage1_result = None
        stage2_result = None
        stage3_result = None

        # Stage 1: Mechanical Verification
        if self._config.stage1_enabled:
            result = await self._mechanical.verify(
                context.execution_id,
                checks=[CheckType.LINT, CheckType.BUILD, CheckType.TEST, CheckType.STATIC, CheckType.COVERAGE],
            )
            if result.is_err:
                return Result.err(result.error)

            stage1_result, stage1_events = result.value
            events.extend(stage1_events)

            # If Stage 1 fails, stop here
            if not stage1_result.passed:
                return self._build_result(
                    context.execution_id,
                    events,
                    stage1_result=stage1_result,
                    final_approved=False,
                )

        # Stage 2: Semantic Evaluation
        if self._config.stage2_enabled:
            result = await self._semantic.evaluate(context)
            if result.is_err:
                return Result.err(result.error)

            stage2_result, stage2_events = result.value
            events.extend(stage2_events)

            # Build trigger context if not provided
            if trigger_context is None:
                trigger_context = TriggerContext(
                    execution_id=context.execution_id,
                    semantic_result=stage2_result,
                )

            # Check if Stage 2 failed on compliance
            if not stage2_result.ac_compliance:
                return self._build_result(
                    context.execution_id,
                    events,
                    stage1_result=stage1_result,
                    stage2_result=stage2_result,
                    final_approved=False,
                )

        # Stage 3: Consensus (if triggered)
        if self._config.stage3_enabled and trigger_context:
            trigger_result = self._trigger.evaluate(trigger_context)
            if trigger_result.is_err:
                return Result.err(trigger_result.error)

            trigger_decision, trigger_events = trigger_result.value
            events.extend(trigger_events)

            if trigger_decision.should_trigger:
                trigger_reason = (
                    trigger_decision.trigger_type.value
                    if trigger_decision.trigger_type
                    else "manual"
                )
                result = await self._consensus.evaluate(context, trigger_reason)
                if result.is_err:
                    return Result.err(result.error)

                stage3_result, stage3_events = result.value
                events.extend(stage3_events)

                # Final approval based on consensus
                return self._build_result(
                    context.execution_id,
                    events,
                    stage1_result=stage1_result,
                    stage2_result=stage2_result,
                    stage3_result=stage3_result,
                    final_approved=stage3_result.approved,
                )

        # No consensus triggered - approve based on Stage 2
        final_approved = True
        if stage2_result:
            final_approved = stage2_result.ac_compliance and stage2_result.score >= 0.8

        return self._build_result(
            context.execution_id,
            events,
            stage1_result=stage1_result,
            stage2_result=stage2_result,
            final_approved=final_approved,
        )

    def _build_result(
        self,
        execution_id: str,
        events: list[BaseEvent],
        stage1_result=None,
        stage2_result=None,
        stage3_result=None,
        final_approved: bool = False,
    ) -> Result[EvaluationResult, ValidationError]:
        """Build the final evaluation result.

        Args:
            execution_id: Execution identifier
            events: Collected events
            stage1_result: Stage 1 result if completed
            stage2_result: Stage 2 result if completed
            stage3_result: Stage 3 result if triggered
            final_approved: Overall approval status

        Returns:
            Result containing EvaluationResult
        """
        # Calculate highest stage before creating immutable result
        highest_stage = 0
        if stage1_result is not None:
            highest_stage = 1
        if stage2_result is not None:
            highest_stage = 2
        if stage3_result is not None:
            highest_stage = 3

        # Calculate failure reason before creating immutable result
        failure_reason: str | None = None
        if not final_approved:
            if stage1_result and not stage1_result.passed:
                failed = stage1_result.failed_checks
                failure_reason = f"Stage 1 failed: {', '.join(c.check_type for c in failed)}"
            elif stage2_result and not stage2_result.ac_compliance:
                failure_reason = f"Stage 2 failed: AC non-compliance (score={stage2_result.score:.2f})"
            elif stage3_result and not stage3_result.approved:
                failure_reason = f"Stage 3 failed: Consensus not reached ({stage3_result.majority_ratio:.0%})"
            else:
                failure_reason = "Unknown failure"

        # Create completion event
        completion_event = create_pipeline_completed_event(
            execution_id=execution_id,
            final_approved=final_approved,
            highest_stage=highest_stage,
            failure_reason=failure_reason,
        )

        # Build complete event list before creating frozen result
        all_events = [*events, completion_event]

        result = EvaluationResult(
            execution_id=execution_id,
            stage1_result=stage1_result,
            stage2_result=stage2_result,
            stage3_result=stage3_result,
            final_approved=final_approved,
            events=all_events,
        )

        return Result.ok(result)


async def run_evaluation_pipeline(
    context: EvaluationContext,
    llm_adapter: LiteLLMAdapter,
    config: PipelineConfig | None = None,
    trigger_context: TriggerContext | None = None,
) -> Result[EvaluationResult, ProviderError | ValidationError]:
    """Convenience function for running the evaluation pipeline.

    Args:
        context: Evaluation context
        llm_adapter: LLM adapter
        config: Optional configuration
        trigger_context: Optional trigger context

    Returns:
        Result with EvaluationResult
    """
    pipeline = EvaluationPipeline(llm_adapter, config)
    return await pipeline.evaluate(context, trigger_context)
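For orientation, here is a minimal sketch (not part of the package) of how the run_evaluation_pipeline entry point above could be driven. The EvaluationContext keyword arguments are inferred from the attributes read elsewhere in this diff (execution_id, current_ac, goal, constraints, artifact, artifact_type); the exact constructor lives in ouroboros/evaluation/models.py, which is not reproduced here, and the bare LiteLLMAdapter() call is likewise an assumption about its defaults.

import asyncio

from ouroboros.evaluation.models import EvaluationContext
from ouroboros.evaluation.pipeline import PipelineConfig, run_evaluation_pipeline
from ouroboros.providers.litellm_adapter import LiteLLMAdapter


async def main() -> None:
    # Hypothetical context; field names mirror what build_evaluation_prompt() reads.
    context = EvaluationContext(
        execution_id="exec-001",
        current_ac="CLI prints the package version",
        goal="Ship a minimal command-line tool",
        constraints=["Python 3.12", "no network access at runtime"],
        artifact="print('0.1.0')",
        artifact_type="code",
    )
    adapter = LiteLLMAdapter()  # assumed default constructor; see providers/litellm_adapter.py
    config = PipelineConfig(stage3_enabled=False)  # skip consensus for a cheap dry run

    result = await run_evaluation_pipeline(context, adapter, config)
    if result.is_err:
        print("pipeline error:", result.error)
    else:
        print("approved:", result.value.final_approved)


asyncio.run(main())

Because Result carries either a value or an error, callers branch on is_err rather than catching exceptions; every stage in the pipeline follows the same convention.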
ouroboros/evaluation/semantic.py
@@ -0,0 +1,302 @@
"""Stage 2: Semantic Evaluation.

LLM-based semantic evaluation using Standard tier:
- AC Compliance: Whether acceptance criteria are met
- Goal Alignment: Alignment with original goal
- Drift Measurement: Deviation from seed intent

The SemanticEvaluator uses the LiteLLM adapter for LLM calls.
"""

from dataclasses import dataclass
import json

from ouroboros.core.errors import ProviderError, ValidationError
from ouroboros.core.types import Result
from ouroboros.evaluation.models import EvaluationContext, SemanticResult
from ouroboros.events.base import BaseEvent
from ouroboros.events.evaluation import (
    create_stage2_completed_event,
    create_stage2_started_event,
)
from ouroboros.providers.base import CompletionConfig, Message, MessageRole
from ouroboros.providers.litellm_adapter import LiteLLMAdapter

# Default model for semantic evaluation (Standard tier)
# Can be overridden via SemanticConfig.model
DEFAULT_SEMANTIC_MODEL = "openrouter/google/gemini-2.0-flash-001"


@dataclass(frozen=True, slots=True)
class SemanticConfig:
    """Configuration for semantic evaluation.

    Attributes:
        model: LLM model to use for evaluation
        temperature: Sampling temperature (lower for consistency)
        max_tokens: Maximum tokens for response
        satisfaction_threshold: Minimum score to pass (default 0.8)
    """

    model: str = DEFAULT_SEMANTIC_MODEL
    temperature: float = 0.2
    max_tokens: int = 2048
    satisfaction_threshold: float = 0.8


EVALUATION_SYSTEM_PROMPT = """You are a rigorous software evaluation assistant. Your task is to evaluate code artifacts against acceptance criteria, goal alignment, and semantic drift.

You must respond ONLY with a valid JSON object in the following exact format:
{
    "score": <float between 0.0 and 1.0>,
    "ac_compliance": <boolean>,
    "goal_alignment": <float between 0.0 and 1.0>,
    "drift_score": <float between 0.0 and 1.0>,
    "uncertainty": <float between 0.0 and 1.0>,
    "reasoning": "<string explaining your evaluation>"
}

Evaluation criteria:
- score: Overall quality score (0.0 = completely fails, 1.0 = perfect)
- ac_compliance: true if the artifact meets the acceptance criterion
- goal_alignment: How well the artifact aligns with the original goal
- drift_score: How much the implementation drifts from intent (0.0 = no drift, 1.0 = complete drift)
- uncertainty: Your confidence level in this evaluation (0.0 = certain, 1.0 = very uncertain)
- reasoning: Brief explanation of your evaluation

Be strict but fair. A passing artifact should have:
- ac_compliance = true
- score >= 0.8
- goal_alignment >= 0.7
- drift_score <= 0.3
- uncertainty <= 0.3"""


def build_evaluation_prompt(context: EvaluationContext) -> str:
    """Build the user prompt for evaluation.

    Args:
        context: Evaluation context with artifact and criteria

    Returns:
        Formatted prompt string
    """
    constraints_text = "\n".join(f"- {c}" for c in context.constraints) if context.constraints else "None specified"

    return f"""Evaluate the following artifact:

## Acceptance Criterion
{context.current_ac}

## Original Goal
{context.goal if context.goal else "Not specified"}

## Constraints
{constraints_text}

## Artifact Type
{context.artifact_type}

## Artifact Content
```
{context.artifact}
```

Provide your evaluation as a JSON object."""


def extract_json_payload(text: str) -> str | None:
    """Extract JSON object from text using index-based approach.

    More reliable than regex for handling nested braces in code snippets.

    Args:
        text: Raw text potentially containing JSON

    Returns:
        Extracted JSON string or None if not found
    """
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        return text[start : end + 1]
    return None


def parse_semantic_response(response_text: str) -> Result[SemanticResult, ValidationError]:
    """Parse LLM response into SemanticResult.

    Args:
        response_text: Raw LLM response text

    Returns:
        Result containing SemanticResult or ValidationError
    """
    # Extract JSON using index-based approach (handles nested braces)
    json_str = extract_json_payload(response_text)

    if not json_str:
        return Result.err(
            ValidationError(
                "Could not find JSON in response",
                field="response",
                value=response_text[:100],
            )
        )

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        return Result.err(
            ValidationError(
                f"Invalid JSON in response: {e}",
                field="response",
                value=json_str[:100],
            )
        )

    # Validate required fields
    required_fields = ["score", "ac_compliance", "goal_alignment", "drift_score", "uncertainty", "reasoning"]
    missing = [f for f in required_fields if f not in data]
    if missing:
        return Result.err(
            ValidationError(
                f"Missing required fields: {missing}",
                field="response",
                details={"missing_fields": missing},
            )
        )

    # Validate and clamp numeric ranges
    try:
        score = max(0.0, min(1.0, float(data["score"])))
        goal_alignment = max(0.0, min(1.0, float(data["goal_alignment"])))
        drift_score = max(0.0, min(1.0, float(data["drift_score"])))
        uncertainty = max(0.0, min(1.0, float(data["uncertainty"])))

        return Result.ok(
            SemanticResult(
                score=score,
                ac_compliance=bool(data["ac_compliance"]),
                goal_alignment=goal_alignment,
                drift_score=drift_score,
                uncertainty=uncertainty,
                reasoning=str(data["reasoning"]),
            )
        )
    except (TypeError, ValueError) as e:
        return Result.err(
            ValidationError(
                f"Invalid field types: {e}",
                field="response",
                details={"error": str(e)},
            )
        )


class SemanticEvaluator:
    """Stage 2 semantic evaluation using LLM.

    Evaluates artifacts for AC compliance, goal alignment, and drift.
    Uses Standard tier LLM for balanced cost/quality.

    Example:
        evaluator = SemanticEvaluator(llm_adapter)
        result = await evaluator.evaluate(context)
    """

    def __init__(
        self,
        llm_adapter: LiteLLMAdapter,
        config: SemanticConfig | None = None,
    ) -> None:
        """Initialize evaluator.

        Args:
            llm_adapter: LLM adapter for completions
            config: Evaluation configuration
        """
        self._llm = llm_adapter
        self._config = config or SemanticConfig()

    async def evaluate(
        self,
        context: EvaluationContext,
    ) -> Result[tuple[SemanticResult, list[BaseEvent]], ProviderError | ValidationError]:
        """Evaluate an artifact semantically.

        Args:
            context: Evaluation context

        Returns:
            Result containing SemanticResult and events, or error
        """
        events: list[BaseEvent] = []

        # Emit start event
        events.append(
            create_stage2_started_event(
                execution_id=context.execution_id,
                model=self._config.model,
                current_ac=context.current_ac,
            )
        )

        # Build messages
        messages = [
            Message(role=MessageRole.SYSTEM, content=EVALUATION_SYSTEM_PROMPT),
            Message(role=MessageRole.USER, content=build_evaluation_prompt(context)),
        ]

        # Call LLM
        completion_config = CompletionConfig(
            model=self._config.model,
            temperature=self._config.temperature,
            max_tokens=self._config.max_tokens,
        )

        llm_result = await self._llm.complete(messages, completion_config)
        if llm_result.is_err:
            return Result.err(llm_result.error)

        response = llm_result.value

        # Parse response
        parse_result = parse_semantic_response(response.content)
        if parse_result.is_err:
            return Result.err(parse_result.error)

        semantic_result = parse_result.value

        # Emit completion event
        events.append(
            create_stage2_completed_event(
                execution_id=context.execution_id,
                score=semantic_result.score,
                ac_compliance=semantic_result.ac_compliance,
                goal_alignment=semantic_result.goal_alignment,
                drift_score=semantic_result.drift_score,
                uncertainty=semantic_result.uncertainty,
            )
        )

        return Result.ok((semantic_result, events))


async def run_semantic_evaluation(
    context: EvaluationContext,
    llm_adapter: LiteLLMAdapter,
    config: SemanticConfig | None = None,
) -> Result[tuple[SemanticResult, list[BaseEvent]], ProviderError | ValidationError]:
    """Convenience function for running semantic evaluation.

    Args:
        context: Evaluation context
        llm_adapter: LLM adapter
        config: Optional configuration

    Returns:
        Result with SemanticResult and events
    """
    evaluator = SemanticEvaluator(llm_adapter, config)
    return await evaluator.evaluate(context)
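The parsing helpers above deliberately avoid regex: extract_json_payload slices from the first "{" to the last "}", and parse_semantic_response then validates required fields and clamps every numeric score into [0.0, 1.0]. A dependency-free sketch of that strategy, runnable on its own (the sample response string is invented for illustration):

import json

SAMPLE = 'Verdict: {"score": 0.92, "ac_compliance": true, "reasoning": "meets the AC"} -- end of report'


def extract_json_payload(text: str) -> str | None:
    # Outermost-brace slice: tolerant of prose before and after the JSON object,
    # and of nested braces inside the payload itself.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        return text[start : end + 1]
    return None


payload = extract_json_payload(SAMPLE)
data = json.loads(payload) if payload else {}
score = max(0.0, min(1.0, float(data.get("score", 0.0))))  # clamp into [0.0, 1.0]
print(score, bool(data.get("ac_compliance")))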