ouroboros_ai-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ouroboros-ai might be problematic.

Files changed (81)
  1. ouroboros/__init__.py +15 -0
  2. ouroboros/__main__.py +9 -0
  3. ouroboros/bigbang/__init__.py +39 -0
  4. ouroboros/bigbang/ambiguity.py +464 -0
  5. ouroboros/bigbang/interview.py +530 -0
  6. ouroboros/bigbang/seed_generator.py +610 -0
  7. ouroboros/cli/__init__.py +9 -0
  8. ouroboros/cli/commands/__init__.py +7 -0
  9. ouroboros/cli/commands/config.py +79 -0
  10. ouroboros/cli/commands/init.py +425 -0
  11. ouroboros/cli/commands/run.py +201 -0
  12. ouroboros/cli/commands/status.py +85 -0
  13. ouroboros/cli/formatters/__init__.py +31 -0
  14. ouroboros/cli/formatters/panels.py +157 -0
  15. ouroboros/cli/formatters/progress.py +112 -0
  16. ouroboros/cli/formatters/tables.py +166 -0
  17. ouroboros/cli/main.py +60 -0
  18. ouroboros/config/__init__.py +81 -0
  19. ouroboros/config/loader.py +292 -0
  20. ouroboros/config/models.py +332 -0
  21. ouroboros/core/__init__.py +62 -0
  22. ouroboros/core/ac_tree.py +401 -0
  23. ouroboros/core/context.py +472 -0
  24. ouroboros/core/errors.py +246 -0
  25. ouroboros/core/seed.py +212 -0
  26. ouroboros/core/types.py +205 -0
  27. ouroboros/evaluation/__init__.py +110 -0
  28. ouroboros/evaluation/consensus.py +350 -0
  29. ouroboros/evaluation/mechanical.py +351 -0
  30. ouroboros/evaluation/models.py +235 -0
  31. ouroboros/evaluation/pipeline.py +286 -0
  32. ouroboros/evaluation/semantic.py +302 -0
  33. ouroboros/evaluation/trigger.py +278 -0
  34. ouroboros/events/__init__.py +5 -0
  35. ouroboros/events/base.py +80 -0
  36. ouroboros/events/decomposition.py +153 -0
  37. ouroboros/events/evaluation.py +248 -0
  38. ouroboros/execution/__init__.py +44 -0
  39. ouroboros/execution/atomicity.py +451 -0
  40. ouroboros/execution/decomposition.py +481 -0
  41. ouroboros/execution/double_diamond.py +1386 -0
  42. ouroboros/execution/subagent.py +275 -0
  43. ouroboros/observability/__init__.py +63 -0
  44. ouroboros/observability/drift.py +383 -0
  45. ouroboros/observability/logging.py +504 -0
  46. ouroboros/observability/retrospective.py +338 -0
  47. ouroboros/orchestrator/__init__.py +78 -0
  48. ouroboros/orchestrator/adapter.py +391 -0
  49. ouroboros/orchestrator/events.py +278 -0
  50. ouroboros/orchestrator/runner.py +597 -0
  51. ouroboros/orchestrator/session.py +486 -0
  52. ouroboros/persistence/__init__.py +23 -0
  53. ouroboros/persistence/checkpoint.py +511 -0
  54. ouroboros/persistence/event_store.py +183 -0
  55. ouroboros/persistence/migrations/__init__.py +1 -0
  56. ouroboros/persistence/migrations/runner.py +100 -0
  57. ouroboros/persistence/migrations/scripts/001_initial.sql +20 -0
  58. ouroboros/persistence/schema.py +56 -0
  59. ouroboros/persistence/uow.py +230 -0
  60. ouroboros/providers/__init__.py +28 -0
  61. ouroboros/providers/base.py +133 -0
  62. ouroboros/providers/claude_code_adapter.py +212 -0
  63. ouroboros/providers/litellm_adapter.py +316 -0
  64. ouroboros/py.typed +0 -0
  65. ouroboros/resilience/__init__.py +67 -0
  66. ouroboros/resilience/lateral.py +595 -0
  67. ouroboros/resilience/stagnation.py +727 -0
  68. ouroboros/routing/__init__.py +60 -0
  69. ouroboros/routing/complexity.py +272 -0
  70. ouroboros/routing/downgrade.py +664 -0
  71. ouroboros/routing/escalation.py +340 -0
  72. ouroboros/routing/router.py +204 -0
  73. ouroboros/routing/tiers.py +247 -0
  74. ouroboros/secondary/__init__.py +40 -0
  75. ouroboros/secondary/scheduler.py +467 -0
  76. ouroboros/secondary/todo_registry.py +483 -0
  77. ouroboros_ai-0.1.0.dist-info/METADATA +607 -0
  78. ouroboros_ai-0.1.0.dist-info/RECORD +81 -0
  79. ouroboros_ai-0.1.0.dist-info/WHEEL +4 -0
  80. ouroboros_ai-0.1.0.dist-info/entry_points.txt +2 -0
  81. ouroboros_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,286 @@
+ """Evaluation Pipeline Orchestrator.
+
+ Orchestrates the three-stage evaluation pipeline:
+ 1. Stage 1: Mechanical Verification ($0)
+ 2. Stage 2: Semantic Evaluation (Standard tier)
+ 3. Stage 3: Multi-Model Consensus (Frontier tier, if triggered)
+
+ The pipeline respects configuration flags and trigger conditions.
+ """
+
+ from dataclasses import dataclass
+
+ from ouroboros.core.errors import ProviderError, ValidationError
+ from ouroboros.core.types import Result
+ from ouroboros.evaluation.consensus import ConsensusConfig, ConsensusEvaluator
+ from ouroboros.evaluation.mechanical import (
+     MechanicalConfig,
+     MechanicalVerifier,
+ )
+ from ouroboros.evaluation.models import (
+     CheckType,
+     EvaluationContext,
+     EvaluationResult,
+ )
+ from ouroboros.evaluation.semantic import SemanticConfig, SemanticEvaluator
+ from ouroboros.evaluation.trigger import (
+     ConsensusTrigger,
+     TriggerConfig,
+     TriggerContext,
+ )
+ from ouroboros.events.base import BaseEvent
+ from ouroboros.events.evaluation import create_pipeline_completed_event
+ from ouroboros.providers.litellm_adapter import LiteLLMAdapter
+
+
+ @dataclass(frozen=True, slots=True)
+ class PipelineConfig:
+     """Configuration for the evaluation pipeline.
+
+     Attributes:
+         stage1_enabled: Run mechanical verification
+         stage2_enabled: Run semantic evaluation
+         stage3_enabled: Allow consensus if triggered
+         mechanical: Stage 1 configuration
+         semantic: Stage 2 configuration
+         consensus: Stage 3 configuration
+         trigger: Trigger matrix configuration
+     """
+
+     stage1_enabled: bool = True
+     stage2_enabled: bool = True
+     stage3_enabled: bool = True
+     mechanical: MechanicalConfig | None = None
+     semantic: SemanticConfig | None = None
+     consensus: ConsensusConfig | None = None
+     trigger: TriggerConfig | None = None
+
+
+ class EvaluationPipeline:
+     """Orchestrates the three-stage evaluation pipeline.
+
+     Runs stages sequentially, respecting configuration and triggers.
+     Stage 3 is only run if trigger conditions are met.
+
+     Example:
+         pipeline = EvaluationPipeline(llm_adapter, config)
+         result = await pipeline.evaluate(context)
+     """
+
+     def __init__(
+         self,
+         llm_adapter: LiteLLMAdapter,
+         config: PipelineConfig | None = None,
+     ) -> None:
+         """Initialize pipeline.
+
+         Args:
+             llm_adapter: LLM adapter for semantic and consensus
+             config: Pipeline configuration
+         """
+         self._llm = llm_adapter
+         self._config = config or PipelineConfig()
+
+         # Initialize stage evaluators
+         self._mechanical = MechanicalVerifier(self._config.mechanical)
+         self._semantic = SemanticEvaluator(llm_adapter, self._config.semantic)
+         self._consensus = ConsensusEvaluator(llm_adapter, self._config.consensus)
+         self._trigger = ConsensusTrigger(self._config.trigger)
+
+     async def evaluate(
+         self,
+         context: EvaluationContext,
+         trigger_context: TriggerContext | None = None,
+     ) -> Result[EvaluationResult, ProviderError | ValidationError]:
+         """Run the evaluation pipeline.
+
+         Args:
+             context: Evaluation context with artifact
+             trigger_context: Optional pre-populated trigger context
+
+         Returns:
+             Result containing EvaluationResult or error
+         """
+         events: list[BaseEvent] = []
+         stage1_result = None
+         stage2_result = None
+         stage3_result = None
+
+         # Stage 1: Mechanical Verification
+         if self._config.stage1_enabled:
+             result = await self._mechanical.verify(
+                 context.execution_id,
+                 checks=[CheckType.LINT, CheckType.BUILD, CheckType.TEST, CheckType.STATIC, CheckType.COVERAGE],
+             )
+             if result.is_err:
+                 return Result.err(result.error)
+
+             stage1_result, stage1_events = result.value
+             events.extend(stage1_events)
+
+             # If Stage 1 fails, stop here
+             if not stage1_result.passed:
+                 return self._build_result(
+                     context.execution_id,
+                     events,
+                     stage1_result=stage1_result,
+                     final_approved=False,
+                 )
+
+         # Stage 2: Semantic Evaluation
+         if self._config.stage2_enabled:
+             result = await self._semantic.evaluate(context)
+             if result.is_err:
+                 return Result.err(result.error)
+
+             stage2_result, stage2_events = result.value
+             events.extend(stage2_events)
+
+             # Build trigger context if not provided
+             if trigger_context is None:
+                 trigger_context = TriggerContext(
+                     execution_id=context.execution_id,
+                     semantic_result=stage2_result,
+                 )
+
+             # Check if Stage 2 failed on compliance
+             if not stage2_result.ac_compliance:
+                 return self._build_result(
+                     context.execution_id,
+                     events,
+                     stage1_result=stage1_result,
+                     stage2_result=stage2_result,
+                     final_approved=False,
+                 )
+
+         # Stage 3: Consensus (if triggered)
+         if self._config.stage3_enabled and trigger_context:
+             trigger_result = self._trigger.evaluate(trigger_context)
+             if trigger_result.is_err:
+                 return Result.err(trigger_result.error)
+
+             trigger_decision, trigger_events = trigger_result.value
+             events.extend(trigger_events)
+
+             if trigger_decision.should_trigger:
+                 trigger_reason = (
+                     trigger_decision.trigger_type.value
+                     if trigger_decision.trigger_type
+                     else "manual"
+                 )
+                 result = await self._consensus.evaluate(context, trigger_reason)
+                 if result.is_err:
+                     return Result.err(result.error)
+
+                 stage3_result, stage3_events = result.value
+                 events.extend(stage3_events)
+
+                 # Final approval based on consensus
+                 return self._build_result(
+                     context.execution_id,
+                     events,
+                     stage1_result=stage1_result,
+                     stage2_result=stage2_result,
+                     stage3_result=stage3_result,
+                     final_approved=stage3_result.approved,
+                 )
+
+         # No consensus triggered - approve based on Stage 2
+         final_approved = True
+         if stage2_result:
+             final_approved = stage2_result.ac_compliance and stage2_result.score >= 0.8
+
+         return self._build_result(
+             context.execution_id,
+             events,
+             stage1_result=stage1_result,
+             stage2_result=stage2_result,
+             final_approved=final_approved,
+         )
+
+     def _build_result(
+         self,
+         execution_id: str,
+         events: list[BaseEvent],
+         stage1_result=None,
+         stage2_result=None,
+         stage3_result=None,
+         final_approved: bool = False,
+     ) -> Result[EvaluationResult, ValidationError]:
+         """Build the final evaluation result.
+
+         Args:
+             execution_id: Execution identifier
+             events: Collected events
+             stage1_result: Stage 1 result if completed
+             stage2_result: Stage 2 result if completed
+             stage3_result: Stage 3 result if triggered
+             final_approved: Overall approval status
+
+         Returns:
+             Result containing EvaluationResult
+         """
+         # Calculate highest stage before creating immutable result
+         highest_stage = 0
+         if stage1_result is not None:
+             highest_stage = 1
+         if stage2_result is not None:
+             highest_stage = 2
+         if stage3_result is not None:
+             highest_stage = 3
+
+         # Calculate failure reason before creating immutable result
+         failure_reason: str | None = None
+         if not final_approved:
+             if stage1_result and not stage1_result.passed:
+                 failed = stage1_result.failed_checks
+                 failure_reason = f"Stage 1 failed: {', '.join(c.check_type for c in failed)}"
+             elif stage2_result and not stage2_result.ac_compliance:
+                 failure_reason = f"Stage 2 failed: AC non-compliance (score={stage2_result.score:.2f})"
+             elif stage3_result and not stage3_result.approved:
+                 failure_reason = f"Stage 3 failed: Consensus not reached ({stage3_result.majority_ratio:.0%})"
+             else:
+                 failure_reason = "Unknown failure"
+
+         # Create completion event
+         completion_event = create_pipeline_completed_event(
+             execution_id=execution_id,
+             final_approved=final_approved,
+             highest_stage=highest_stage,
+             failure_reason=failure_reason,
+         )
+
+         # Build complete event list before creating frozen result
+         all_events = [*events, completion_event]
+
+         result = EvaluationResult(
+             execution_id=execution_id,
+             stage1_result=stage1_result,
+             stage2_result=stage2_result,
+             stage3_result=stage3_result,
+             final_approved=final_approved,
+             events=all_events,
+         )
+
+         return Result.ok(result)
+
+
+ async def run_evaluation_pipeline(
+     context: EvaluationContext,
+     llm_adapter: LiteLLMAdapter,
+     config: PipelineConfig | None = None,
+     trigger_context: TriggerContext | None = None,
+ ) -> Result[EvaluationResult, ProviderError | ValidationError]:
+     """Convenience function for running the evaluation pipeline.
+
+     Args:
+         context: Evaluation context
+         llm_adapter: LLM adapter
+         config: Optional configuration
+         trigger_context: Optional trigger context
+
+     Returns:
+         Result with EvaluationResult
+     """
+     pipeline = EvaluationPipeline(llm_adapter, config)
+     return await pipeline.evaluate(context, trigger_context)
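
The convenience wrapper run_evaluation_pipeline above can be exercised roughly as in the sketch below. This sketch is not part of the released wheel; the LiteLLMAdapter construction and the EvaluationContext field names are assumptions inferred from attribute access elsewhere in this diff.

import asyncio

from ouroboros.evaluation.models import EvaluationContext
from ouroboros.evaluation.pipeline import PipelineConfig, run_evaluation_pipeline
from ouroboros.providers.litellm_adapter import LiteLLMAdapter


async def main() -> None:
    # Assumed constructions: adapter arguments and context field names are
    # inferred from usage in pipeline.py/semantic.py, not confirmed APIs.
    adapter = LiteLLMAdapter()
    context = EvaluationContext(
        execution_id="exec-001",
        current_ac="CLI prints the package version",
        goal="Ship a minimal CLI",
        constraints=["Python 3.12+"],
        artifact_type="python_module",
        artifact="print('0.1.0')",
    )
    # Disable Stage 3 so no Frontier-tier consensus call is attempted.
    config = PipelineConfig(stage3_enabled=False)

    result = await run_evaluation_pipeline(context, adapter, config)
    if result.is_err:
        print(f"pipeline error: {result.error}")
    else:
        print(f"approved: {result.value.final_approved}")


if __name__ == "__main__":
    asyncio.run(main())
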
@@ -0,0 +1,302 @@
+ """Stage 2: Semantic Evaluation.
+
+ LLM-based semantic evaluation using Standard tier:
+ - AC Compliance: Whether acceptance criteria are met
+ - Goal Alignment: Alignment with original goal
+ - Drift Measurement: Deviation from seed intent
+
+ The SemanticEvaluator uses the LiteLLM adapter for LLM calls.
+ """
+
+ from dataclasses import dataclass
+ import json
+
+ from ouroboros.core.errors import ProviderError, ValidationError
+ from ouroboros.core.types import Result
+ from ouroboros.evaluation.models import EvaluationContext, SemanticResult
+ from ouroboros.events.base import BaseEvent
+ from ouroboros.events.evaluation import (
+     create_stage2_completed_event,
+     create_stage2_started_event,
+ )
+ from ouroboros.providers.base import CompletionConfig, Message, MessageRole
+ from ouroboros.providers.litellm_adapter import LiteLLMAdapter
+
+ # Default model for semantic evaluation (Standard tier)
+ # Can be overridden via SemanticConfig.model
+ DEFAULT_SEMANTIC_MODEL = "openrouter/google/gemini-2.0-flash-001"
+
+
+ @dataclass(frozen=True, slots=True)
+ class SemanticConfig:
+     """Configuration for semantic evaluation.
+
+     Attributes:
+         model: LLM model to use for evaluation
+         temperature: Sampling temperature (lower for consistency)
+         max_tokens: Maximum tokens for response
+         satisfaction_threshold: Minimum score to pass (default 0.8)
+     """
+
+     model: str = DEFAULT_SEMANTIC_MODEL
+     temperature: float = 0.2
+     max_tokens: int = 2048
+     satisfaction_threshold: float = 0.8
+
+
+ EVALUATION_SYSTEM_PROMPT = """You are a rigorous software evaluation assistant. Your task is to evaluate code artifacts against acceptance criteria, goal alignment, and semantic drift.
+
+ You must respond ONLY with a valid JSON object in the following exact format:
+ {
+ "score": <float between 0.0 and 1.0>,
+ "ac_compliance": <boolean>,
+ "goal_alignment": <float between 0.0 and 1.0>,
+ "drift_score": <float between 0.0 and 1.0>,
+ "uncertainty": <float between 0.0 and 1.0>,
+ "reasoning": "<string explaining your evaluation>"
+ }
+
+ Evaluation criteria:
+ - score: Overall quality score (0.0 = completely fails, 1.0 = perfect)
+ - ac_compliance: true if the artifact meets the acceptance criterion
+ - goal_alignment: How well the artifact aligns with the original goal
+ - drift_score: How much the implementation drifts from intent (0.0 = no drift, 1.0 = complete drift)
+ - uncertainty: Your confidence level in this evaluation (0.0 = certain, 1.0 = very uncertain)
+ - reasoning: Brief explanation of your evaluation
+
+ Be strict but fair. A passing artifact should have:
+ - ac_compliance = true
+ - score >= 0.8
+ - goal_alignment >= 0.7
+ - drift_score <= 0.3
+ - uncertainty <= 0.3"""
+
+
+ def build_evaluation_prompt(context: EvaluationContext) -> str:
+     """Build the user prompt for evaluation.
+
+     Args:
+         context: Evaluation context with artifact and criteria
+
+     Returns:
+         Formatted prompt string
+     """
+     constraints_text = "\n".join(f"- {c}" for c in context.constraints) if context.constraints else "None specified"
+
+     return f"""Evaluate the following artifact:
+
+ ## Acceptance Criterion
+ {context.current_ac}
+
+ ## Original Goal
+ {context.goal if context.goal else "Not specified"}
+
+ ## Constraints
+ {constraints_text}
+
+ ## Artifact Type
+ {context.artifact_type}
+
+ ## Artifact Content
+ ```
+ {context.artifact}
+ ```
+
+ Provide your evaluation as a JSON object."""
+
+
+ def extract_json_payload(text: str) -> str | None:
+     """Extract JSON object from text using index-based approach.
+
+     More reliable than regex for handling nested braces in code snippets.
+
+     Args:
+         text: Raw text potentially containing JSON
+
+     Returns:
+         Extracted JSON string or None if not found
+     """
+     start = text.find("{")
+     end = text.rfind("}")
+     if start != -1 and end != -1 and end > start:
+         return text[start : end + 1]
+     return None
+
+
+ def parse_semantic_response(response_text: str) -> Result[SemanticResult, ValidationError]:
+     """Parse LLM response into SemanticResult.
+
+     Args:
+         response_text: Raw LLM response text
+
+     Returns:
+         Result containing SemanticResult or ValidationError
+     """
+     # Extract JSON using index-based approach (handles nested braces)
+     json_str = extract_json_payload(response_text)
+
+     if not json_str:
+         return Result.err(
+             ValidationError(
+                 "Could not find JSON in response",
+                 field="response",
+                 value=response_text[:100],
+             )
+         )
+
+     try:
+         data = json.loads(json_str)
+     except json.JSONDecodeError as e:
+         return Result.err(
+             ValidationError(
+                 f"Invalid JSON in response: {e}",
+                 field="response",
+                 value=json_str[:100],
+             )
+         )
+
+     # Validate required fields
+     required_fields = ["score", "ac_compliance", "goal_alignment", "drift_score", "uncertainty", "reasoning"]
+     missing = [f for f in required_fields if f not in data]
+     if missing:
+         return Result.err(
+             ValidationError(
+                 f"Missing required fields: {missing}",
+                 field="response",
+                 details={"missing_fields": missing},
+             )
+         )
+
+     # Validate and clamp numeric ranges
+     try:
+         score = max(0.0, min(1.0, float(data["score"])))
+         goal_alignment = max(0.0, min(1.0, float(data["goal_alignment"])))
+         drift_score = max(0.0, min(1.0, float(data["drift_score"])))
+         uncertainty = max(0.0, min(1.0, float(data["uncertainty"])))
+
+         return Result.ok(
+             SemanticResult(
+                 score=score,
+                 ac_compliance=bool(data["ac_compliance"]),
+                 goal_alignment=goal_alignment,
+                 drift_score=drift_score,
+                 uncertainty=uncertainty,
+                 reasoning=str(data["reasoning"]),
+             )
+         )
+     except (TypeError, ValueError) as e:
+         return Result.err(
+             ValidationError(
+                 f"Invalid field types: {e}",
+                 field="response",
+                 details={"error": str(e)},
+             )
+         )
+
+
+ class SemanticEvaluator:
+     """Stage 2 semantic evaluation using LLM.
+
+     Evaluates artifacts for AC compliance, goal alignment, and drift.
+     Uses Standard tier LLM for balanced cost/quality.
+
+     Example:
+         evaluator = SemanticEvaluator(llm_adapter)
+         result = await evaluator.evaluate(context)
+     """
+
+     def __init__(
+         self,
+         llm_adapter: LiteLLMAdapter,
+         config: SemanticConfig | None = None,
+     ) -> None:
+         """Initialize evaluator.
+
+         Args:
+             llm_adapter: LLM adapter for completions
+             config: Evaluation configuration
+         """
+         self._llm = llm_adapter
+         self._config = config or SemanticConfig()
+
+     async def evaluate(
+         self,
+         context: EvaluationContext,
+     ) -> Result[tuple[SemanticResult, list[BaseEvent]], ProviderError | ValidationError]:
+         """Evaluate an artifact semantically.
+
+         Args:
+             context: Evaluation context
+
+         Returns:
+             Result containing SemanticResult and events, or error
+         """
+         events: list[BaseEvent] = []
+
+         # Emit start event
+         events.append(
+             create_stage2_started_event(
+                 execution_id=context.execution_id,
+                 model=self._config.model,
+                 current_ac=context.current_ac,
+             )
+         )
+
+         # Build messages
+         messages = [
+             Message(role=MessageRole.SYSTEM, content=EVALUATION_SYSTEM_PROMPT),
+             Message(role=MessageRole.USER, content=build_evaluation_prompt(context)),
+         ]
+
+         # Call LLM
+         completion_config = CompletionConfig(
+             model=self._config.model,
+             temperature=self._config.temperature,
+             max_tokens=self._config.max_tokens,
+         )
+
+         llm_result = await self._llm.complete(messages, completion_config)
+         if llm_result.is_err:
+             return Result.err(llm_result.error)
+
+         response = llm_result.value
+
+         # Parse response
+         parse_result = parse_semantic_response(response.content)
+         if parse_result.is_err:
+             return Result.err(parse_result.error)
+
+         semantic_result = parse_result.value
+
+         # Emit completion event
+         events.append(
+             create_stage2_completed_event(
+                 execution_id=context.execution_id,
+                 score=semantic_result.score,
+                 ac_compliance=semantic_result.ac_compliance,
+                 goal_alignment=semantic_result.goal_alignment,
+                 drift_score=semantic_result.drift_score,
+                 uncertainty=semantic_result.uncertainty,
+             )
+         )
+
+         return Result.ok((semantic_result, events))
+
+
+ async def run_semantic_evaluation(
+     context: EvaluationContext,
+     llm_adapter: LiteLLMAdapter,
+     config: SemanticConfig | None = None,
+ ) -> Result[tuple[SemanticResult, list[BaseEvent]], ProviderError | ValidationError]:
+     """Convenience function for running semantic evaluation.
+
+     Args:
+         context: Evaluation context
+         llm_adapter: LLM adapter
+         config: Optional configuration
+
+     Returns:
+         Result with SemanticResult and events
+     """
+     evaluator = SemanticEvaluator(llm_adapter, config)
+     return await evaluator.evaluate(context)
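
For a sense of the parsing path in semantic.py, parse_semantic_response can be called directly on a canned reply, as in the sketch below. The sketch is not part of the package, and the JSON payload is invented purely for illustration.

from ouroboros.evaluation.semantic import extract_json_payload, parse_semantic_response

# A fabricated model reply with prose wrapped around the JSON object.
raw_reply = """Here is my assessment:
{
  "score": 0.85,
  "ac_compliance": true,
  "goal_alignment": 0.9,
  "drift_score": 0.1,
  "uncertainty": 0.2,
  "reasoning": "Meets the acceptance criterion with minor style issues."
}"""

# extract_json_payload finds the outermost braces; parse_semantic_response
# then validates required fields and clamps numeric values to [0.0, 1.0].
assert extract_json_payload(raw_reply) is not None
parsed = parse_semantic_response(raw_reply)
if parsed.is_err:
    print(f"parse error: {parsed.error}")
else:
    print(parsed.value.score, parsed.value.ac_compliance)
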