opik-optimizer 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,355 @@
1
+ import logging
2
+ import asyncio
3
+ from typing import Any
4
+ from tqdm import tqdm
5
+
6
+ from opik.evaluation.evaluation_result import EvaluationResult
7
+ from .types import (
8
+ RootCauseAnalysis,
9
+ BatchAnalysis,
10
+ HierarchicalRootCauseAnalysis,
11
+ )
12
+ from . import reporting
13
+ from .prompts import BATCH_ANALYSIS_PROMPT, SYNTHESIS_PROMPT
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class HierarchicalRootCauseAnalyzer:
    """
    Performs hierarchical root cause analysis on evaluation results.

    This analyzer splits large evaluation datasets into manageable batches,
    performs root cause analysis on each batch concurrently (at most
    ``max_parallel_batches`` at a time), then combines and summarizes the
    results to identify the most important failure patterns.

    Args:
        call_model_fn: Awaitable function used to call the LLM (should match
            the signature of ReflectiveOptimizer._call_model). It is invoked
            with ``model``, ``messages``, ``seed``, ``model_kwargs`` and
            ``response_model`` keyword arguments.
        reasoning_model: Name of the reasoning model to use
        seed: Random seed for reproducibility
        max_parallel_batches: Maximum number of batches to process concurrently (must be >= 1)
        batch_size: Number of test cases per batch for analysis (must be >= 1)
        verbose: Controls the batch progress bar (0=off, 1=on) and is forwarded
            to ``reporting.display_batch_synthesis`` (default: 1)
    """

    def __init__(
        self,
        call_model_fn: Any,
        reasoning_model: str,
        seed: int,
        max_parallel_batches: int,
        batch_size: int,
        verbose: int = 1,
    ) -> None:
        """
        Initialize the hierarchical root cause analyzer.

        Args:
            call_model_fn: Awaitable function used to call the LLM (should match
                the signature of ReflectiveOptimizer._call_model)
            reasoning_model: Name of the reasoning model to use
            seed: Random seed for reproducibility
            max_parallel_batches: Maximum number of batches to process concurrently (must be >= 1)
            batch_size: Number of test cases per batch for analysis (must be >= 1)
            verbose: Controls the batch progress bar (0=off, 1=on) (default: 1)

        Raises:
            ValueError: If ``max_parallel_batches`` or ``batch_size`` is less than 1
        """
        # Fail fast: a zero-permit semaphore would block every batch forever,
        # and a zero batch size makes range() raise an unrelated-looking error.
        if max_parallel_batches < 1:
            raise ValueError(
                f"max_parallel_batches must be >= 1, got {max_parallel_batches}"
            )
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self.call_model_fn = call_model_fn
        self.reasoning_model = reasoning_model
        self.seed = seed
        self.max_parallel_batches = max_parallel_batches
        self.batch_size = batch_size
        self.verbose = verbose

    def _format_test_results_batch(
        self,
        test_results: list[Any],
        batch_start: int,
        batch_end: int,
    ) -> str:
        """
        Format a batch of test results for analysis.

        Each result is rendered as a header line (1-based position, dataset
        item ID, trial ID) followed by one entry per score. Results are
        separated by a rule of 80 ``=`` characters on its own line.

        Args:
            test_results: Full list of test results
            batch_start: Starting index of the batch
            batch_end: Ending index of the batch (exclusive); clipped to the
                length of ``test_results``

        Returns:
            Formatted string containing test result details
        """
        formatted_results = []

        for idx in range(batch_start, min(batch_end, len(test_results))):
            test_result = test_results[idx]
            dataset_item_id = test_result.test_case.dataset_item_id

            # Extract scores
            scores_info = []
            for score in test_result.score_results:
                score_str = f" - {score.name}: {score.value:.3f}"
                if score.reason:
                    score_str += f"\n Reason: {score.reason}"
                if score.scoring_failed:
                    score_str += " (FAILED)"
                scores_info.append(score_str)

            # Format this test result
            scores_block = "\n".join(scores_info)
            formatted_results.append(
                f"Test Case #{idx + 1} (ID: {dataset_item_id}, Trial: {test_result.trial_id})\n"
                f"Scores:\n"
                f"{scores_block}"
            )

        # The rule needs blank lines on BOTH sides; joining with only
        # "====...\n\n" glued it onto the last score line of the previous result.
        separator = "\n\n" + "=" * 80 + "\n\n"
        return "\n\n" + separator.join(formatted_results)

    async def _analyze_batch_async(
        self,
        evaluation_result: EvaluationResult,
        batch_number: int,
        batch_start: int,
        batch_end: int,
    ) -> BatchAnalysis:
        """
        Analyze a single batch of test results asynchronously.

        Args:
            evaluation_result: The full evaluation result
            batch_number: The batch number (1-indexed)
            batch_start: Starting index in test_results
            batch_end: Ending index in test_results (exclusive)

        Returns:
            BatchAnalysis containing failure modes for this batch
        """
        test_results = evaluation_result.test_results
        actual_end = min(batch_end, len(test_results))

        logger.debug(
            "Analyzing batch %s: test cases %s to %s",
            batch_number,
            batch_start + 1,
            actual_end,
        )

        formatted_batch = self._format_test_results_batch(
            test_results, batch_start, batch_end
        )

        batch_analysis_prompt = BATCH_ANALYSIS_PROMPT.format(
            formatted_batch=formatted_batch,
        )

        root_cause_response = await self.call_model_fn(
            model=self.reasoning_model,
            messages=[{"role": "user", "content": batch_analysis_prompt}],
            seed=self.seed,
            model_kwargs={},
            response_model=RootCauseAnalysis,
        )

        return BatchAnalysis(
            batch_number=batch_number,
            start_index=batch_start,
            end_index=actual_end,
            failure_modes=root_cause_response.failure_modes,
        )

    async def _synthesize_batch_analyses_async(
        self,
        evaluation_result: EvaluationResult,
        batch_analyses: list[BatchAnalysis],
    ) -> HierarchicalRootCauseAnalysis:
        """
        Synthesize multiple batch analyses into a unified root cause analysis asynchronously.

        Args:
            evaluation_result: The full evaluation result (only used for the
                total test-case count in the debug log)
            batch_analyses: List of batch analysis results

        Returns:
            HierarchicalRootCauseAnalysis with unified failure modes
        """
        logger.debug(
            "Synthesizing %s batch analyses from %s total test cases",
            len(batch_analyses),
            len(evaluation_result.test_results),
        )

        # Format all batch analyses for synthesis
        batch_summaries = []
        for batch_analysis in batch_analyses:
            failures_text = "\n".join(
                f" - {fm.name}\n"
                f" Description: {fm.description}\n"
                f" Root Cause: {fm.root_cause}"
                for fm in batch_analysis.failure_modes
            )
            batch_summaries.append(
                f"Batch {batch_analysis.batch_number} "
                f"(Test Cases {batch_analysis.start_index + 1}-{batch_analysis.end_index}):\n"
                f"{failures_text}"
            )

        synthesis_prompt = SYNTHESIS_PROMPT.format(
            batch_summaries="\n".join(batch_summaries),
        )

        synthesis_response = await self.call_model_fn(
            model=self.reasoning_model,
            messages=[{"role": "user", "content": synthesis_prompt}],
            seed=self.seed,
            model_kwargs={},
            response_model=HierarchicalRootCauseAnalysis,
        )

        return synthesis_response

    def _validate_reasons_present(self, test_results: list[Any]) -> None:
        """
        Validate that test results include reasons for scoring.

        An empty ``test_results`` list is accepted without error. Otherwise at
        least one score across all results must carry a non-blank ``reason``.

        Args:
            test_results: List of test results to validate

        Raises:
            ValueError: If no test results have reasons in their score_results
        """
        if not test_results:
            return

        has_reasons = any(
            score.reason and score.reason.strip()
            for test_result in test_results
            for score in test_result.score_results
        )

        if not has_reasons:
            raise ValueError(
                "Test results must include 'reason' fields in score_results for hierarchical "
                "root cause analysis to work effectively. Reasons are critical for identifying "
                "failure patterns and root causes. Please ensure your scoring metrics provide "
                "detailed reasons for their scores."
            )

    async def analyze_async(
        self, evaluation_result: EvaluationResult
    ) -> HierarchicalRootCauseAnalysis:
        """
        Perform hierarchical root cause analysis on evaluation results asynchronously.

        This method:
        1. Validates that test results include reasons (critical for analysis)
        2. Splits test results into batches of ``batch_size``
        3. Analyzes batches concurrently (up to max_parallel_batches at once)
        4. Synthesizes batch analyses into unified failure modes

        If any batch fails, the error is logged and re-raised; no synthesis is
        attempted.

        Args:
            evaluation_result: The evaluation result to analyze

        Returns:
            HierarchicalRootCauseAnalysis with unified failure modes and synthesis notes

        Raises:
            ValueError: If test results don't include reasons, which are critical for analysis
        """
        test_results = evaluation_result.test_results
        num_test_results = len(test_results)

        # Validate that reasons are present in test results
        self._validate_reasons_present(test_results)

        logger.info(
            "Starting hierarchical root cause analysis on %s test cases",
            num_test_results,
        )

        # Only the batch boundaries are prepared here. The coroutines are
        # created inside the semaphore-guarded runner so none is left
        # un-awaited if setup below raises.
        batch_bounds = [
            (
                batch_number,
                batch_start,
                min(batch_start + self.batch_size, num_test_results),
            )
            for batch_number, batch_start in enumerate(
                range(0, num_test_results, self.batch_size), start=1
            )
        ]

        logger.info(
            "Processing %s batches concurrently (max %s at once)",
            len(batch_bounds),
            self.max_parallel_batches,
        )

        semaphore = asyncio.Semaphore(self.max_parallel_batches)

        # The context manager closes the bar even when a batch raises;
        # verbose=0 suppresses it, matching the documented meaning of `verbose`.
        with tqdm(
            total=len(batch_bounds),
            desc="Processing batches",
            unit="batch",
            leave=False,
            disable=self.verbose < 1,
        ) as pbar:

            async def run_with_semaphore(
                batch_num: int, batch_start: int, batch_end: int
            ) -> BatchAnalysis:
                async with semaphore:
                    try:
                        result = await self._analyze_batch_async(
                            evaluation_result=evaluation_result,
                            batch_number=batch_num,
                            batch_start=batch_start,
                            batch_end=batch_end,
                        )
                        logger.debug(
                            "Completed batch %s: identified %s failure modes",
                            batch_num,
                            len(result.failure_modes),
                        )
                        return result
                    except Exception as exc:
                        logger.error("Batch %s failed: %s", batch_num, exc)
                        raise
                    finally:
                        # Advance the bar on success and on failure alike.
                        pbar.update(1)

            # gather() returns results in the order the awaitables were passed,
            # so the analyses are already ordered by batch number.
            batch_analyses = list(
                await asyncio.gather(
                    *(run_with_semaphore(*bounds) for bounds in batch_bounds)
                )
            )

        logger.info(
            "Stage 1 complete: Analyzed %s batches, total %s failure modes",
            len(batch_analyses),
            sum(len(ba.failure_modes) for ba in batch_analyses),
        )

        # Stage 2: Synthesize batch analyses
        logger.info("Stage 2: Synthesizing batch analyses...")

        with reporting.display_batch_synthesis(
            num_batches=len(batch_analyses), verbose=self.verbose
        ):
            hierarchical_analysis = await self._synthesize_batch_analyses_async(
                evaluation_result=evaluation_result,
                batch_analyses=batch_analyses,
            )

        logger.info(
            "Synthesis complete: identified %s unified failure modes",
            len(hierarchical_analysis.unified_failure_modes),
        )

        return hierarchical_analysis

    def analyze(
        self, evaluation_result: EvaluationResult
    ) -> HierarchicalRootCauseAnalysis:
        """
        Synchronous wrapper for analyze_async() for backward compatibility.

        Runs the coroutine with ``asyncio.run``, so it must not be called from
        a thread that already has a running event loop; await
        ``analyze_async`` directly in that case.

        Args:
            evaluation_result: The evaluation result to analyze

        Returns:
            HierarchicalRootCauseAnalysis with unified failure modes and synthesis notes

        Raises:
            ValueError: If test results don't include reasons
            RuntimeError: If an event loop is already running in this thread
        """
        return asyncio.run(self.analyze_async(evaluation_result))
@@ -0,0 +1,91 @@
1
+ """Prompt templates for the Hierarchical Reflective Optimizer.
2
+
3
+ This module contains all the prompt templates used by the optimizer for:
4
+ - Batch-level root cause analysis
5
+ - Synthesis of batch analyses
6
+ - Prompt improvement generation
7
+ """
8
+
9
+ # Prompt template for analyzing one batch of test results; `{formatted_batch}` is its only str.format placeholder, so any literal brace added to this text must be doubled.
10
+ BATCH_ANALYSIS_PROMPT = """You are analyzing evaluation results to identify failure patterns.
11
+
12
+ TEST RESULTS:
13
+ ```
14
+ {formatted_batch}
15
+ ```
16
+
17
+ Think through the failures systematically:
18
+
19
+ 1. IDENTIFY: List all distinct types of failures you observe in the test results
20
+ 2. GROUP: Which failures share similar characteristics or root causes?
21
+ 3. FREQUENCY: Which patterns appear multiple times across different test cases?
22
+ 4. PRIORITIZE: Which failures are most critical to address?
23
+
24
+ Then, for each distinct failure pattern provide:
25
+ 1. A clear, descriptive name that captures the essence of the failure
26
+ 2. A comprehensive description of what is failing
27
+ 3. The underlying root cause explaining why this failure occurs
28
+
29
+ Focus on patterns that appear multiple times. Be specific about what is failing and why.
30
+ Provide a list of failure modes, each with a name, description, and root cause."""
31
+
32
+
33
+ # Prompt template for merging the per-batch analyses into one unified analysis; `{batch_summaries}` is its only str.format placeholder, so any literal brace added to this text must be doubled.
33
+ SYNTHESIS_PROMPT = """You are synthesizing root cause analyses from multiple batches of evaluation results.
34
+
35
+ BATCH ANALYSES:
36
+ ```
37
+ {batch_summaries}
38
+ ```
39
+
40
+ Your task is to synthesize these batch-level analyses into a unified root cause analysis.
41
+
42
+ 1. MERGE similar failure modes across batches:
43
+ - If multiple batches identify the same or very similar failure pattern, combine them into one unified failure mode
44
+ - Create a comprehensive description that captures the pattern across all relevant batches
45
+ - Identify the core root cause
46
+
47
+ 2. PRIORITIZE the most critical failure modes:
48
+ - Focus on patterns that appear in multiple batches
49
+ - Consider the severity and frequency of each failure
50
+ - Eliminate one-off or minor issues unless they're particularly impactful
51
+
52
+ 3. PROVIDE SYNTHESIS NOTES:
53
+ - Briefly explain which batch-level patterns were merged and why
54
+ - Note any cross-batch trends or patterns
55
+ - Highlight the most critical areas for improvement
56
+
57
+ Provide:
58
+ 1. A unified list of failure modes (name, description, root cause)
59
+ 2. Synthesis notes explaining your analysis process and key findings"""
61
+
62
+
63
+ # Prompt template for improving a prompt against a single failure mode; placeholders are `{current_prompt}`, `{failure_mode_name}`, `{failure_mode_description}` and `{failure_mode_root_cause}` (presumably filled with str.format at a call site not shown here - confirm).
64
+ IMPROVE_PROMPT_TEMPLATE = """You are an expert prompt engineer. You are given a prompt and a failure mode identified during evaluation.
65
+ Your task is to improve the prompt to address this failure mode.
66
+
67
+ CURRENT PROMPT:
68
+ ```
69
+ {current_prompt}
70
+ ```
71
+ FAILURE MODE TO ADDRESS:
72
+ - Name: {failure_mode_name}
73
+ - Description: {failure_mode_description}
74
+ - Root Cause: {failure_mode_root_cause}
75
+
76
+ INSTRUCTIONS FOR IMPROVING THE PROMPT:
77
+
78
+ 1. **Analyze First**: Carefully review the current prompt to understand what instructions already exist.
79
+
80
+ 2. **Choose the Right Approach**:
81
+ - If relevant instructions already exist but are unclear or incomplete, UPDATE and CLARIFY them in place
82
+ - If the prompt is missing critical instructions needed to address this failure mode, ADD new targeted instructions
83
+ - If existing instructions contradict what's needed, REPLACE them with corrected versions
84
+
85
+ 3. **Be Surgical**: Make targeted changes that directly address the root cause. Don't add unnecessary instructions or rewrite the entire prompt.
86
+
87
+ 4. **Maintain Structure**: Keep the same message structure (role and content format). Only modify the content where necessary.
88
+
89
+ 5. **Be Specific**: Ensure your changes provide concrete, actionable guidance that directly addresses the identified failure mode.
90
+
91
+ Provide your reasoning for the changes you made, explaining WHY each change addresses the failure mode, and then provide the improved prompt."""