ouroboros_ai-0.1.0-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.

Potentially problematic release: this version of ouroboros-ai might be problematic.

Files changed (81)
  1. ouroboros/__init__.py +15 -0
  2. ouroboros/__main__.py +9 -0
  3. ouroboros/bigbang/__init__.py +39 -0
  4. ouroboros/bigbang/ambiguity.py +464 -0
  5. ouroboros/bigbang/interview.py +530 -0
  6. ouroboros/bigbang/seed_generator.py +610 -0
  7. ouroboros/cli/__init__.py +9 -0
  8. ouroboros/cli/commands/__init__.py +7 -0
  9. ouroboros/cli/commands/config.py +79 -0
  10. ouroboros/cli/commands/init.py +425 -0
  11. ouroboros/cli/commands/run.py +201 -0
  12. ouroboros/cli/commands/status.py +85 -0
  13. ouroboros/cli/formatters/__init__.py +31 -0
  14. ouroboros/cli/formatters/panels.py +157 -0
  15. ouroboros/cli/formatters/progress.py +112 -0
  16. ouroboros/cli/formatters/tables.py +166 -0
  17. ouroboros/cli/main.py +60 -0
  18. ouroboros/config/__init__.py +81 -0
  19. ouroboros/config/loader.py +292 -0
  20. ouroboros/config/models.py +332 -0
  21. ouroboros/core/__init__.py +62 -0
  22. ouroboros/core/ac_tree.py +401 -0
  23. ouroboros/core/context.py +472 -0
  24. ouroboros/core/errors.py +246 -0
  25. ouroboros/core/seed.py +212 -0
  26. ouroboros/core/types.py +205 -0
  27. ouroboros/evaluation/__init__.py +110 -0
  28. ouroboros/evaluation/consensus.py +350 -0
  29. ouroboros/evaluation/mechanical.py +351 -0
  30. ouroboros/evaluation/models.py +235 -0
  31. ouroboros/evaluation/pipeline.py +286 -0
  32. ouroboros/evaluation/semantic.py +302 -0
  33. ouroboros/evaluation/trigger.py +278 -0
  34. ouroboros/events/__init__.py +5 -0
  35. ouroboros/events/base.py +80 -0
  36. ouroboros/events/decomposition.py +153 -0
  37. ouroboros/events/evaluation.py +248 -0
  38. ouroboros/execution/__init__.py +44 -0
  39. ouroboros/execution/atomicity.py +451 -0
  40. ouroboros/execution/decomposition.py +481 -0
  41. ouroboros/execution/double_diamond.py +1386 -0
  42. ouroboros/execution/subagent.py +275 -0
  43. ouroboros/observability/__init__.py +63 -0
  44. ouroboros/observability/drift.py +383 -0
  45. ouroboros/observability/logging.py +504 -0
  46. ouroboros/observability/retrospective.py +338 -0
  47. ouroboros/orchestrator/__init__.py +78 -0
  48. ouroboros/orchestrator/adapter.py +391 -0
  49. ouroboros/orchestrator/events.py +278 -0
  50. ouroboros/orchestrator/runner.py +597 -0
  51. ouroboros/orchestrator/session.py +486 -0
  52. ouroboros/persistence/__init__.py +23 -0
  53. ouroboros/persistence/checkpoint.py +511 -0
  54. ouroboros/persistence/event_store.py +183 -0
  55. ouroboros/persistence/migrations/__init__.py +1 -0
  56. ouroboros/persistence/migrations/runner.py +100 -0
  57. ouroboros/persistence/migrations/scripts/001_initial.sql +20 -0
  58. ouroboros/persistence/schema.py +56 -0
  59. ouroboros/persistence/uow.py +230 -0
  60. ouroboros/providers/__init__.py +28 -0
  61. ouroboros/providers/base.py +133 -0
  62. ouroboros/providers/claude_code_adapter.py +212 -0
  63. ouroboros/providers/litellm_adapter.py +316 -0
  64. ouroboros/py.typed +0 -0
  65. ouroboros/resilience/__init__.py +67 -0
  66. ouroboros/resilience/lateral.py +595 -0
  67. ouroboros/resilience/stagnation.py +727 -0
  68. ouroboros/routing/__init__.py +60 -0
  69. ouroboros/routing/complexity.py +272 -0
  70. ouroboros/routing/downgrade.py +664 -0
  71. ouroboros/routing/escalation.py +340 -0
  72. ouroboros/routing/router.py +204 -0
  73. ouroboros/routing/tiers.py +247 -0
  74. ouroboros/secondary/__init__.py +40 -0
  75. ouroboros/secondary/scheduler.py +467 -0
  76. ouroboros/secondary/todo_registry.py +483 -0
  77. ouroboros_ai-0.1.0.dist-info/METADATA +607 -0
  78. ouroboros_ai-0.1.0.dist-info/RECORD +81 -0
  79. ouroboros_ai-0.1.0.dist-info/WHEEL +4 -0
  80. ouroboros_ai-0.1.0.dist-info/entry_points.txt +2 -0
  81. ouroboros_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
ouroboros/evaluation/__init__.py
@@ -0,0 +1,110 @@
+ """Three-stage evaluation pipeline for Ouroboros.
+
+ This module provides the evaluation infrastructure for verifying outputs
+ through three progressive stages:
+
+ 1. Stage 1 - Mechanical Verification ($0): Lint, build, test, static analysis
+ 2. Stage 2 - Semantic Evaluation (Standard tier): AC compliance, goal alignment
+ 3. Stage 3 - Multi-Model Consensus (Frontier tier): 3-model voting
+
+ Classes:
+     CheckResult: Result of a single mechanical check
+     CheckType: Types of mechanical checks
+     MechanicalResult: Aggregated Stage 1 results
+     SemanticResult: Stage 2 LLM evaluation results
+     Vote: Single model vote in consensus
+     ConsensusResult: Aggregated Stage 3 results
+     EvaluationResult: Complete pipeline result
+     EvaluationContext: Input context for evaluation
+     MechanicalVerifier: Stage 1 checker
+     MechanicalConfig: Stage 1 configuration
+     SemanticEvaluator: Stage 2 evaluator
+     SemanticConfig: Stage 2 configuration
+     ConsensusEvaluator: Stage 3 consensus builder
+     ConsensusConfig: Stage 3 configuration
+     ConsensusTrigger: Trigger matrix implementation
+     TriggerType: Types of consensus triggers
+     TriggerContext: Context for trigger evaluation
+     TriggerResult: Result of trigger evaluation
+     TriggerConfig: Trigger thresholds
+     EvaluationPipeline: Full pipeline orchestrator
+     PipelineConfig: Pipeline configuration
+ """
+
+ from ouroboros.evaluation.consensus import (
+     DEFAULT_CONSENSUS_MODELS,
+     ConsensusConfig,
+     ConsensusEvaluator,
+     run_consensus_evaluation,
+ )
+ from ouroboros.evaluation.mechanical import (
+     MechanicalConfig,
+     MechanicalVerifier,
+     run_mechanical_verification,
+ )
+ from ouroboros.evaluation.models import (
+     CheckResult,
+     CheckType,
+     ConsensusResult,
+     EvaluationContext,
+     EvaluationResult,
+     MechanicalResult,
+     SemanticResult,
+     Vote,
+ )
+ from ouroboros.evaluation.pipeline import (
+     EvaluationPipeline,
+     PipelineConfig,
+     run_evaluation_pipeline,
+ )
+ from ouroboros.evaluation.semantic import (
+     DEFAULT_SEMANTIC_MODEL,
+     SemanticConfig,
+     SemanticEvaluator,
+     run_semantic_evaluation,
+ )
+ from ouroboros.evaluation.trigger import (
+     ConsensusTrigger,
+     TriggerConfig,
+     TriggerContext,
+     TriggerResult,
+     TriggerType,
+     check_consensus_trigger,
+ )
+
+ __all__ = [
+     # Models
+     "CheckResult",
+     "CheckType",
+     "ConsensusResult",
+     "EvaluationContext",
+     "EvaluationResult",
+     "MechanicalResult",
+     "SemanticResult",
+     "Vote",
+     # Stage 1
+     "MechanicalConfig",
+     "MechanicalVerifier",
+     "run_mechanical_verification",
+     # Stage 2
+     "DEFAULT_SEMANTIC_MODEL",
+     "SemanticConfig",
+     "SemanticEvaluator",
+     "run_semantic_evaluation",
+     # Stage 3
+     "DEFAULT_CONSENSUS_MODELS",
+     "ConsensusConfig",
+     "ConsensusEvaluator",
+     "run_consensus_evaluation",
+     # Trigger
+     "ConsensusTrigger",
+     "TriggerConfig",
+     "TriggerContext",
+     "TriggerResult",
+     "TriggerType",
+     "check_consensus_trigger",
+     # Pipeline
+     "EvaluationPipeline",
+     "PipelineConfig",
+     "run_evaluation_pipeline",
+ ]
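
The staged design described in the module docstring above (free mechanical checks first, a single-model semantic pass second, multi-model consensus only when escalation is warranted) can be illustrated with a small self-contained sketch. The functions and thresholds below are illustrative stand-ins, not the package's API; the real gating and trigger conditions live in ouroboros/evaluation/pipeline.py and ouroboros/evaluation/trigger.py, which are not shown in this diff.

# Illustrative sketch of the three-stage gating; names and thresholds here are
# hypothetical, not taken from ouroboros.evaluation.
def stage1_mechanical(artifact: str) -> bool:
    # Stage 1 ($0): deterministic checks such as lint/build/test.
    return bool(artifact.strip())

def stage2_semantic(artifact: str, acceptance_criterion: str) -> float:
    # Stage 2 (Standard tier): a single-model score for AC compliance.
    return 0.72  # placeholder score for the sketch

def stage3_needed(score: float, low: float = 0.5, high: float = 0.8) -> bool:
    # Stage 3 (Frontier tier): escalate only borderline outcomes.
    return low <= score < high

artifact = "def add(a: int, b: int) -> int:\n    return a + b"
ac = "add() returns the sum of two integers"

if not stage1_mechanical(artifact):
    print("rejected at Stage 1 (mechanical)")
elif not stage3_needed(stage2_semantic(artifact, ac)):
    print("decided at Stage 2 (semantic)")
else:
    print("escalated to Stage 3 (multi-model consensus)")

With the placeholder score of 0.72, the sketch escalates to Stage 3, which is the path the consensus module below implements.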
ouroboros/evaluation/consensus.py
@@ -0,0 +1,350 @@
+ """Stage 3: Multi-Model Consensus.
+
+ Multi-model voting using Frontier tier:
+ - 3 different models evaluate independently
+ - 2/3 majority required for approval
+ - Disagreements are logged with reasoning
+
+ The ConsensusEvaluator uses multiple LLM models for diverse verification.
+ """
+
+ import asyncio
+ from dataclasses import dataclass
+ import json
+
+ from ouroboros.core.errors import ProviderError, ValidationError
+ from ouroboros.core.types import Result
+ from ouroboros.evaluation.models import ConsensusResult, EvaluationContext, Vote
+ from ouroboros.events.base import BaseEvent
+ from ouroboros.events.evaluation import (
+     create_stage3_completed_event,
+     create_stage3_started_event,
+ )
+ from ouroboros.providers.base import CompletionConfig, Message, MessageRole
+ from ouroboros.providers.litellm_adapter import LiteLLMAdapter
+
+ # Default models for consensus voting (Frontier tier)
+ # Can be overridden via ConsensusConfig.models
+ DEFAULT_CONSENSUS_MODELS: tuple[str, ...] = (
+     "openrouter/openai/gpt-4o",
+     "openrouter/anthropic/claude-sonnet-4-20250514",
+     "openrouter/google/gemini-2.5-pro",
+ )
+
+
+ @dataclass(frozen=True, slots=True)
+ class ConsensusConfig:
+     """Configuration for consensus evaluation.
+
+     Attributes:
+         models: List of models to use for voting (at least 3)
+         temperature: Sampling temperature
+         max_tokens: Maximum tokens per response
+         majority_threshold: Required majority ratio (default 2/3)
+         diversity_required: Require different providers
+     """
+
+     models: tuple[str, ...] = DEFAULT_CONSENSUS_MODELS
+     temperature: float = 0.3
+     max_tokens: int = 1024
+     majority_threshold: float = 0.66  # 2/3 = 0.6666...
+     diversity_required: bool = True
+
+
+ CONSENSUS_SYSTEM_PROMPT = """You are a senior code reviewer participating in a consensus evaluation. Your vote will be combined with other reviewers to reach a decision.
+
+ You must respond ONLY with a valid JSON object in the following exact format:
+ {
+ "approved": <boolean>,
+ "confidence": <float between 0.0 and 1.0>,
+ "reasoning": "<string explaining your vote>"
+ }
+
+ Evaluation criteria for approval:
+ - The artifact correctly implements the acceptance criterion
+ - The implementation aligns with the stated goal
+ - No significant issues or concerns
+ - Code quality is acceptable
+
+ Be honest and thorough. If you have concerns, vote against approval with clear reasoning.
+ Confidence should reflect how certain you are about your decision."""
+
+
+ def build_consensus_prompt(context: EvaluationContext) -> str:
+     """Build the user prompt for consensus voting.
+
+     Args:
+         context: Evaluation context
+
+     Returns:
+         Formatted prompt string
+     """
+     constraints_text = "\n".join(f"- {c}" for c in context.constraints) if context.constraints else "None"
+
+     return f"""Review the following artifact for consensus approval:
+
+ ## Acceptance Criterion
+ {context.current_ac}
+
+ ## Original Goal
+ {context.goal if context.goal else "Not specified"}
+
+ ## Constraints
+ {constraints_text}
+
+ ## Artifact ({context.artifact_type})
+ ```
+ {context.artifact}
+ ```
+
+ Cast your vote as a JSON object with: approved (boolean), confidence (0-1), and reasoning."""
+
+
+ def extract_json_payload(text: str) -> str | None:
+     """Extract JSON object from text using index-based approach.
+
+     More reliable than regex for handling nested braces in code snippets.
+
+     Args:
+         text: Raw text potentially containing JSON
+
+     Returns:
+         Extracted JSON string or None if not found
+     """
+     start = text.find("{")
+     end = text.rfind("}")
+     if start != -1 and end != -1 and end > start:
+         return text[start : end + 1]
+     return None
+
+
+ def parse_vote_response(response_text: str, model: str) -> Result[Vote, ValidationError]:
+     """Parse LLM response into Vote.
+
+     Args:
+         response_text: Raw LLM response
+         model: Model that cast the vote
+
+     Returns:
+         Result containing Vote or ValidationError
+     """
+     # Extract JSON using index-based approach (handles nested braces)
+     json_str = extract_json_payload(response_text)
+
+     if not json_str:
+         return Result.err(
+             ValidationError(
+                 f"Could not find JSON in vote from {model}",
+                 field="response",
+                 value=response_text[:100],
+             )
+         )
+
+     try:
+         data = json.loads(json_str)
+     except json.JSONDecodeError as e:
+         return Result.err(
+             ValidationError(
+                 f"Invalid JSON in vote from {model}: {e}",
+                 field="response",
+             )
+         )
+
+     # Validate required fields
+     if "approved" not in data:
+         return Result.err(
+             ValidationError(
+                 f"Missing 'approved' field in vote from {model}",
+                 field="approved",
+             )
+         )
+
+     try:
+         confidence = max(0.0, min(1.0, float(data.get("confidence", 0.5))))
+         return Result.ok(
+             Vote(
+                 model=model,
+                 approved=bool(data["approved"]),
+                 confidence=confidence,
+                 reasoning=str(data.get("reasoning", "No reasoning provided")),
+             )
+         )
+     except (TypeError, ValueError) as e:
+         return Result.err(
+             ValidationError(
+                 f"Invalid field types in vote from {model}: {e}",
+                 field="response",
+             )
+         )
+
+
+ class ConsensusEvaluator:
+     """Stage 3 multi-model consensus evaluator.
+
+     Uses multiple Frontier tier models for diverse verification.
+     Requires 2/3 majority for approval.
+
+     Example:
+         evaluator = ConsensusEvaluator(llm_adapter)
+         result = await evaluator.evaluate(context, trigger_reason)
+     """
+
+     def __init__(
+         self,
+         llm_adapter: LiteLLMAdapter,
+         config: ConsensusConfig | None = None,
+     ) -> None:
+         """Initialize evaluator.
+
+         Args:
+             llm_adapter: LLM adapter for completions
+             config: Consensus configuration
+         """
+         self._llm = llm_adapter
+         self._config = config or ConsensusConfig()
+
+     async def evaluate(
+         self,
+         context: EvaluationContext,
+         trigger_reason: str = "manual",
+     ) -> Result[tuple[ConsensusResult, list[BaseEvent]], ProviderError | ValidationError]:
+         """Run consensus evaluation with multiple models.
+
+         Args:
+             context: Evaluation context
+             trigger_reason: Why consensus was triggered
+
+         Returns:
+             Result containing ConsensusResult and events, or error
+         """
+         events: list[BaseEvent] = []
+         models = list(self._config.models)
+
+         # Emit start event
+         events.append(
+             create_stage3_started_event(
+                 execution_id=context.execution_id,
+                 models=models,
+                 trigger_reason=trigger_reason,
+             )
+         )
+
+         # Build messages
+         messages = [
+             Message(role=MessageRole.SYSTEM, content=CONSENSUS_SYSTEM_PROMPT),
+             Message(role=MessageRole.USER, content=build_consensus_prompt(context)),
+         ]
+
+         # Collect votes from all models concurrently
+         vote_tasks = [
+             self._get_vote(messages, model)
+             for model in models
+         ]
+         vote_results = await asyncio.gather(*vote_tasks, return_exceptions=True)
+
+         # Process results
+         votes: list[Vote] = []
+         errors: list[str] = []
+
+         for model, result in zip(models, vote_results, strict=True):
+             if isinstance(result, Exception):
+                 errors.append(f"{model}: {result}")
+                 continue
+             if result.is_err:
+                 errors.append(f"{model}: {result.error.message}")
+                 continue
+             votes.append(result.value)
+
+         # Need at least 2 votes to proceed
+         if len(votes) < 2:
+             return Result.err(
+                 ValidationError(
+                     f"Not enough votes collected: {len(votes)}/3",
+                     details={"errors": errors},
+                 )
+             )
+
+         # Calculate consensus
+         approving = sum(1 for v in votes if v.approved)
+         majority_ratio = approving / len(votes)
+         approved = majority_ratio >= self._config.majority_threshold
+
+         # Collect disagreements (reasoning from dissenting votes)
+         disagreements = tuple(
+             v.reasoning for v in votes if v.approved != approved
+         )
+
+         consensus_result = ConsensusResult(
+             approved=approved,
+             votes=tuple(votes),
+             majority_ratio=majority_ratio,
+             disagreements=disagreements,
+         )
+
+         # Emit completion event
+         events.append(
+             create_stage3_completed_event(
+                 execution_id=context.execution_id,
+                 approved=approved,
+                 votes=[
+                     {
+                         "model": v.model,
+                         "approved": v.approved,
+                         "confidence": v.confidence,
+                         "reasoning": v.reasoning,
+                     }
+                     for v in votes
+                 ],
+                 majority_ratio=majority_ratio,
+                 disagreements=list(disagreements),
+             )
+         )
+
+         return Result.ok((consensus_result, events))
+
+     async def _get_vote(
+         self,
+         messages: list[Message],
+         model: str,
+     ) -> Result[Vote, ProviderError | ValidationError]:
+         """Get a single vote from a model.
+
+         Args:
+             messages: Prompt messages
+             model: Model to query
+
+         Returns:
+             Result containing Vote or error
+         """
+         config = CompletionConfig(
+             model=model,
+             temperature=self._config.temperature,
+             max_tokens=self._config.max_tokens,
+         )
+
+         llm_result = await self._llm.complete(messages, config)
+         if llm_result.is_err:
+             return Result.err(llm_result.error)
+
+         return parse_vote_response(llm_result.value.content, model)
+
+
+ async def run_consensus_evaluation(
+     context: EvaluationContext,
+     llm_adapter: LiteLLMAdapter,
+     trigger_reason: str = "manual",
+     config: ConsensusConfig | None = None,
+ ) -> Result[tuple[ConsensusResult, list[BaseEvent]], ProviderError | ValidationError]:
+     """Convenience function for running consensus evaluation.
+
+     Args:
+         context: Evaluation context
+         llm_adapter: LLM adapter
+         trigger_reason: Why consensus was triggered
+         config: Optional configuration
+
+     Returns:
+         Result with ConsensusResult and events
+     """
+     evaluator = ConsensusEvaluator(llm_adapter, config)
+     return await evaluator.evaluate(context, trigger_reason)
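
To make the aggregation in ConsensusEvaluator.evaluate concrete, here is a self-contained sketch that re-implements the same index-based JSON extraction and the default 2/3 majority check on hard-coded example replies. The replies and the inline threshold literal are illustrative; the extraction and counting mirror the code above.

# Standalone illustration of the vote aggregation (not part of the wheel).
import json

def extract_json_payload(text: str) -> str | None:
    # Same index-based extraction as above: first "{" to last "}".
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        return text[start : end + 1]
    return None

# Hypothetical raw model replies, one per consensus model.
raw_replies = [
    'Vote: {"approved": true, "confidence": 0.9, "reasoning": "AC satisfied"}',
    '{"approved": true, "confidence": 0.7, "reasoning": "Minor style nits only"}',
    '{"approved": false, "confidence": 0.6, "reasoning": "Edge case not covered"}',
]
votes = [json.loads(extract_json_payload(reply)) for reply in raw_replies]

approving = sum(1 for vote in votes if vote["approved"])
majority_ratio = approving / len(votes)   # 2/3 ~= 0.667
approved = majority_ratio >= 0.66         # ConsensusConfig.majority_threshold default
dissents = [vote["reasoning"] for vote in votes if vote["approved"] != approved]

print(approved)   # True
print(dissents)   # ['Edge case not covered']

One consequence of the 0.66 default worth noting: when a model call fails and only the minimum of two votes is collected, both must approve, since 1/2 = 0.5 falls below the threshold while 2/2 = 1.0 clears it.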