opik-optimizer 2.0.1__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +12 -0
- opik_optimizer/base_optimizer.py +33 -0
- opik_optimizer/hierarchical_reflective_optimizer/__init__.py +5 -0
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +718 -0
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +355 -0
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +91 -0
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +679 -0
- opik_optimizer/hierarchical_reflective_optimizer/types.py +49 -0
- opik_optimizer/optimization_result.py +227 -6
- opik_optimizer/parameter_optimizer/__init__.py +11 -0
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +382 -0
- opik_optimizer/parameter_optimizer/parameter_search_space.py +125 -0
- opik_optimizer/parameter_optimizer/parameter_spec.py +214 -0
- opik_optimizer/parameter_optimizer/search_space_types.py +24 -0
- opik_optimizer/parameter_optimizer/sensitivity_analysis.py +71 -0
- {opik_optimizer-2.0.1.dist-info → opik_optimizer-2.1.1.dist-info}/METADATA +4 -2
- {opik_optimizer-2.0.1.dist-info → opik_optimizer-2.1.1.dist-info}/RECORD +20 -8
- {opik_optimizer-2.0.1.dist-info → opik_optimizer-2.1.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.0.1.dist-info → opik_optimizer-2.1.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.0.1.dist-info → opik_optimizer-2.1.1.dist-info}/top_level.txt +0 -0
opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py
@@ -0,0 +1,355 @@
```python
import logging
import asyncio
from typing import Any
from tqdm import tqdm

from opik.evaluation.evaluation_result import EvaluationResult
from .types import (
    RootCauseAnalysis,
    BatchAnalysis,
    HierarchicalRootCauseAnalysis,
)
from . import reporting
from .prompts import BATCH_ANALYSIS_PROMPT, SYNTHESIS_PROMPT

logger = logging.getLogger(__name__)


class HierarchicalRootCauseAnalyzer:
    """
    Performs hierarchical root cause analysis on evaluation results.

    This analyzer splits large evaluation datasets into manageable batches,
    performs root cause analysis on each batch in parallel (up to 5 batches
    concurrently by default), then combines and summarizes the results to
    identify the most important failure patterns.

    Args:
        call_model_fn: Function to call the LLM (should match signature of ReflectiveOptimizer._call_model)
        reasoning_model: Name of the reasoning model to use
        seed: Random seed for reproducibility
        max_parallel_batches: Maximum number of batches to process concurrently (default: 5)
        batch_size: Number of test cases per batch for analysis (default: 25)
        verbose: Controls internal logging/progress bars (0=off, 1=on) (default: 1)
    """

    def __init__(
        self,
        call_model_fn: Any,
        reasoning_model: str,
        seed: int,
        max_parallel_batches: int,
        batch_size: int,
        verbose: int = 1,
    ) -> None:
        """
        Initialize the hierarchical root cause analyzer.

        Args:
            call_model_fn: Function to call the LLM (should match signature of ReflectiveOptimizer._call_model)
            reasoning_model: Name of the reasoning model to use
            seed: Random seed for reproducibility
            max_parallel_batches: Maximum number of batches to process concurrently (default: 5)
            batch_size: Number of test cases per batch for analysis (default: 25)
            verbose: Controls internal logging/progress bars (0=off, 1=on) (default: 1)
        """
        self.call_model_fn = call_model_fn
        self.reasoning_model = reasoning_model
        self.seed = seed
        self.max_parallel_batches = max_parallel_batches
        self.batch_size = batch_size
        self.verbose = verbose

    def _format_test_results_batch(
        self,
        test_results: list[Any],
        batch_start: int,
        batch_end: int,
    ) -> str:
        """
        Format a batch of test results for analysis.

        Args:
            test_results: Full list of test results
            batch_start: Starting index of the batch
            batch_end: Ending index of the batch (exclusive)

        Returns:
            Formatted string containing test result details
        """
        formatted_results = []

        for idx in range(batch_start, min(batch_end, len(test_results))):
            test_result = test_results[idx]
            test_case = test_result.test_case
            dataset_item_id = test_case.dataset_item_id

            # Extract scores
            scores_info = []
            for score in test_result.score_results:
                score_str = f"  - {score.name}: {score.value:.3f}"
                if score.reason:
                    score_str += f"\n    Reason: {score.reason}"
                if score.scoring_failed:
                    score_str += " (FAILED)"
                scores_info.append(score_str)

            # Format this test result
            result_text = f"""Test Case #{idx + 1} (ID: {dataset_item_id}, Trial: {test_result.trial_id})
Scores:
{chr(10).join(scores_info)}"""

            formatted_results.append(result_text)

        return "\n\n" + ("=" * 80 + "\n\n").join(formatted_results)

    async def _analyze_batch_async(
        self,
        evaluation_result: EvaluationResult,
        batch_number: int,
        batch_start: int,
        batch_end: int,
    ) -> BatchAnalysis:
        """
        Analyze a single batch of test results asynchronously.

        Args:
            evaluation_result: The full evaluation result
            batch_number: The batch number (1-indexed)
            batch_start: Starting index in test_results
            batch_end: Ending index in test_results (exclusive)

        Returns:
            BatchAnalysis containing failure modes for this batch
        """
        test_results = evaluation_result.test_results
        actual_end = min(batch_end, len(test_results))

        logger.debug(
            f"Analyzing batch {batch_number}: "
            f"test cases {batch_start + 1} to {actual_end}"
        )

        formatted_batch = self._format_test_results_batch(
            test_results, batch_start, batch_end
        )

        batch_analysis_prompt = BATCH_ANALYSIS_PROMPT.format(
            formatted_batch=formatted_batch,
        )

        root_cause_response = await self.call_model_fn(
            model=self.reasoning_model,
            messages=[{"role": "user", "content": batch_analysis_prompt}],
            seed=self.seed,
            model_kwargs={},
            response_model=RootCauseAnalysis,
        )

        return BatchAnalysis(
            batch_number=batch_number,
            start_index=batch_start,
            end_index=actual_end,
            failure_modes=root_cause_response.failure_modes,
        )

    async def _synthesize_batch_analyses_async(
        self,
        evaluation_result: EvaluationResult,
        batch_analyses: list[BatchAnalysis],
    ) -> HierarchicalRootCauseAnalysis:
        """
        Synthesize multiple batch analyses into a unified root cause analysis asynchronously.

        Args:
            evaluation_result: The full evaluation result
            batch_analyses: List of batch analysis results

        Returns:
            HierarchicalRootCauseAnalysis with unified failure modes
        """
        logger.debug(
            f"Synthesizing {len(batch_analyses)} batch analyses "
            f"from {len(evaluation_result.test_results)} total test cases"
        )

        # Format all batch analyses for synthesis
        batch_summaries = []
        for batch_analysis in batch_analyses:
            failure_list = []
            for fm in batch_analysis.failure_modes:
                failure_list.append(
                    f"  - {fm.name}\n"
                    f"    Description: {fm.description}\n"
                    f"    Root Cause: {fm.root_cause}"
                )

            summary = f"""Batch {batch_analysis.batch_number} (Test Cases {batch_analysis.start_index + 1}-{batch_analysis.end_index}):
{chr(10).join(failure_list)}"""
            batch_summaries.append(summary)

        synthesis_prompt = SYNTHESIS_PROMPT.format(
            batch_summaries=chr(10).join(batch_summaries),
        )

        synthesis_response = await self.call_model_fn(
            model=self.reasoning_model,
            messages=[{"role": "user", "content": synthesis_prompt}],
            seed=self.seed,
            model_kwargs={},
            response_model=HierarchicalRootCauseAnalysis,
        )

        return synthesis_response

    def _validate_reasons_present(self, test_results: list[Any]) -> None:
        """
        Validate that test results include reasons for scoring.

        Args:
            test_results: List of test results to validate

        Raises:
            ValueError: If no test results have reasons in their score_results
        """
        if not test_results:
            return

        has_reasons = False
        for test_result in test_results:
            for score in test_result.score_results:
                if score.reason and score.reason.strip():
                    has_reasons = True
                    break
            if has_reasons:
                break

        if not has_reasons:
            raise ValueError(
                "Test results must include 'reason' fields in score_results for hierarchical "
                "root cause analysis to work effectively. Reasons are critical for identifying "
                "failure patterns and root causes. Please ensure your scoring metrics provide "
                "detailed reasons for their scores."
            )

    async def analyze_async(
        self, evaluation_result: EvaluationResult
    ) -> HierarchicalRootCauseAnalysis:
        """
        Perform hierarchical root cause analysis on evaluation results asynchronously.

        This method:
        1. Validates that test results include reasons (critical for analysis)
        2. Splits test results into batches of batch_size
        3. Analyzes batches concurrently (up to max_parallel_batches at once)
        4. Synthesizes batch analyses into unified failure modes

        Args:
            evaluation_result: The evaluation result to analyze

        Returns:
            HierarchicalRootCauseAnalysis with unified failure modes and synthesis notes

        Raises:
            ValueError: If test results don't include reasons, which are critical for analysis
        """
        test_results = evaluation_result.test_results
        num_test_results = len(test_results)

        # Validate that reasons are present in test results
        self._validate_reasons_present(test_results)

        logger.info(
            f"Starting hierarchical root cause analysis on {num_test_results} test cases"
        )

        # Prepare batch tasks
        batch_tasks = []
        batch_number = 1
        for batch_start in range(0, num_test_results, self.batch_size):
            batch_end = min(batch_start + self.batch_size, num_test_results)
            task = self._analyze_batch_async(
                evaluation_result=evaluation_result,
                batch_number=batch_number,
                batch_start=batch_start,
                batch_end=batch_end,
            )
            batch_tasks.append((batch_number, task))
            batch_number += 1

        # Process batches with semaphore to limit concurrency
        logger.info(
            f"Processing {len(batch_tasks)} batches concurrently "
            f"(max {self.max_parallel_batches} at once)"
        )

        semaphore = asyncio.Semaphore(self.max_parallel_batches)

        # Create progress bar for batch processing
        pbar = tqdm(
            total=len(batch_tasks), desc="Processing batches", unit="batch", leave=False
        )

        async def run_with_semaphore(
            batch_num: int, task: Any
        ) -> tuple[int, BatchAnalysis]:
            async with semaphore:
                try:
                    result = await task
                    logger.debug(
                        f"Completed batch {batch_num}: "
                        f"identified {len(result.failure_modes)} failure modes"
                    )
                    pbar.update(1)  # Update progress bar
                    return batch_num, result
                except Exception as exc:
                    logger.error(f"Batch {batch_num} failed: {exc}")
                    pbar.update(1)  # Update progress bar even on error
                    raise

        # Run all tasks with semaphore control
        results = await asyncio.gather(
            *[run_with_semaphore(num, task) for num, task in batch_tasks]
        )

        pbar.close()  # Close progress bar

        # Sort by batch number to maintain order
        batch_analyses = [result for _, result in sorted(results)]

        logger.info(
            f"Stage 1 complete: Analyzed {len(batch_analyses)} batches, "
            f"total {sum(len(ba.failure_modes) for ba in batch_analyses)} failure modes"
        )

        # Stage 2: Synthesize batch analyses
        logger.info("Stage 2: Synthesizing batch analyses...")

        with reporting.display_batch_synthesis(
            num_batches=len(batch_analyses), verbose=self.verbose
        ):
            hierarchical_analysis = await self._synthesize_batch_analyses_async(
                evaluation_result=evaluation_result,
                batch_analyses=batch_analyses,
            )

        logger.info(
            f"Synthesis complete: "
            f"identified {len(hierarchical_analysis.unified_failure_modes)} unified failure modes"
        )

        return hierarchical_analysis

    def analyze(
        self, evaluation_result: EvaluationResult
    ) -> HierarchicalRootCauseAnalysis:
        """
        Synchronous wrapper for analyze_async() for backward compatibility.

        Args:
            evaluation_result: The evaluation result to analyze

        Returns:
            HierarchicalRootCauseAnalysis with unified failure modes and synthesis notes
        """
        return asyncio.run(self.analyze_async(evaluation_result))
```
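For orientation, a minimal usage sketch of the new analyzer follows. The import path, constructor arguments, and the `analyze()` entry point are taken from the diff above; the `call_model_fn` stub, the placeholder reasoning model name, and the commented-out evaluation call are illustrative assumptions, not part of the released package.

```python
from typing import Any

from opik_optimizer.hierarchical_reflective_optimizer.hierarchical_root_cause_analyzer import (
    HierarchicalRootCauseAnalyzer,
)


# Hypothetical LLM wrapper: in the package this is supplied by the optimizer's own
# _call_model; the stub below only illustrates the expected keyword signature.
async def call_model_fn(
    model: str,
    messages: list[dict[str, str]],
    seed: int,
    model_kwargs: dict[str, Any],
    response_model: Any,
) -> Any:
    raise NotImplementedError("Plug in a structured-output LLM call here")


analyzer = HierarchicalRootCauseAnalyzer(
    call_model_fn=call_model_fn,
    reasoning_model="openai/gpt-4o",  # placeholder model name
    seed=42,
    max_parallel_batches=5,
    batch_size=25,
    verbose=1,
)

# `evaluation_result` must be an opik EvaluationResult whose score_results include
# non-empty `reason` fields; otherwise analyze() raises ValueError.
# analysis = analyzer.analyze(evaluation_result)
# for fm in analysis.unified_failure_modes:
#     print(fm.name, "-", fm.root_cause)
```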
opik_optimizer/hierarchical_reflective_optimizer/prompts.py
@@ -0,0 +1,91 @@
````python
"""Prompt templates for the Hierarchical Reflective Optimizer.

This module contains all the prompt templates used by the optimizer for:
- Batch-level root cause analysis
- Synthesis of batch analyses
- Prompt improvement generation
"""

# Prompt template for analyzing a batch of test results
BATCH_ANALYSIS_PROMPT = """You are analyzing evaluation results to identify failure patterns.

TEST RESULTS:
```
{formatted_batch}
```

Think through the failures systematically:

1. IDENTIFY: List all distinct types of failures you observe in the test results
2. GROUP: Which failures share similar characteristics or root causes?
3. FREQUENCY: Which patterns appear multiple times across different test cases?
4. PRIORITIZE: Which failures are most critical to address?

Then, for each distinct failure pattern provide:
1. A clear, descriptive name that captures the essence of the failure
2. A comprehensive description of what is failing
3. The underlying root cause explaining why this failure occurs

Focus on patterns that appear multiple times. Be specific about what is failing and why.
Provide a list of failure modes, each with a name, description, and root cause."""


# Prompt template for synthesizing multiple batch analyses
SYNTHESIS_PROMPT = """You are synthesizing root cause analyses from multiple batches of evaluation results.

BATCH ANALYSES:
```
{batch_summaries}
```

Your task is to synthesize these batch-level analyses into a unified root cause analysis.

1. MERGE similar failure modes across batches:
   - If multiple batches identify the same or very similar failure pattern, combine them into one unified failure mode
   - Create a comprehensive description that captures the pattern across all relevant batches
   - Identify the core root cause

2. PRIORITIZE the most critical failure modes:
   - Focus on patterns that appear in multiple batches
   - Consider the severity and frequency of each failure
   - Eliminate one-off or minor issues unless they're particularly impactful

3. PROVIDE SYNTHESIS NOTES:
   - Briefly explain which batch-level patterns were merged and why
   - Note any cross-batch trends or patterns
   - Highlight the most critical areas for improvement

Provide:
1. A unified list of failure modes (name, description, root cause)
2. Synthesis notes explaining your analysis process and key findings"""


# Prompt template for improving prompts based on failure modes
IMPROVE_PROMPT_TEMPLATE = """You are an expert prompt engineer. You are given a prompt and a failure mode identified during evaluation.
Your task is to improve the prompt to address this failure mode.

CURRENT PROMPT:
```
{current_prompt}
```
FAILURE MODE TO ADDRESS:
- Name: {failure_mode_name}
- Description: {failure_mode_description}
- Root Cause: {failure_mode_root_cause}

INSTRUCTIONS FOR IMPROVING THE PROMPT:

1. **Analyze First**: Carefully review the current prompt to understand what instructions already exist.

2. **Choose the Right Approach**:
   - If relevant instructions already exist but are unclear or incomplete, UPDATE and CLARIFY them in place
   - If the prompt is missing critical instructions needed to address this failure mode, ADD new targeted instructions
   - If existing instructions contradict what's needed, REPLACE them with corrected versions

3. **Be Surgical**: Make targeted changes that directly address the root cause. Don't add unnecessary instructions or rewrite the entire prompt.

4. **Maintain Structure**: Keep the same message structure (role and content format). Only modify the content where necessary.

5. **Be Specific**: Ensure your changes provide concrete, actionable guidance that directly addresses the identified failure mode.

Provide your reasoning for the changes you made, explaining WHY each change addresses the failure mode, and then provide the improved prompt."""
````
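To show how the templates are filled, the sketch below formats IMPROVE_PROMPT_TEMPLATE with made-up values. The placeholder names match the template above; the failure-mode strings are purely illustrative, and in the optimizer these fields come from the failure modes returned by the hierarchical root cause analysis.

```python
from opik_optimizer.hierarchical_reflective_optimizer.prompts import (
    IMPROVE_PROMPT_TEMPLATE,
)

# Made-up failure mode for illustration; real values come from the analyzer's output.
improve_prompt = IMPROVE_PROMPT_TEMPLATE.format(
    current_prompt="You are a helpful assistant. Answer the user's question.",
    failure_mode_name="Unsupported claims",
    failure_mode_description="Answers assert facts that are not present in the provided context.",
    failure_mode_root_cause="The prompt never instructs the model to ground answers in the supplied context.",
)
print(improve_prompt)
```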