opik-optimizer 2.1.2-py3-none-any.whl → 2.1.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
--- a/opik_optimizer/__init__.py
+++ b/opik_optimizer/__init__.py
@@ -19,6 +19,7 @@ from .mipro_optimizer import MiproOptimizer
 from .hierarchical_reflective_optimizer import HierarchicalReflectiveOptimizer
 from .optimization_config.configs import TaskConfig
 from .optimization_result import OptimizationResult
+from .multi_metric_objective import MultiMetricObjective
 from .parameter_optimizer import (
     ParameterOptimizer,
     ParameterSearchSpace,
@@ -48,6 +49,7 @@ __all__ = [
     "setup_logging",
     "datasets",
     "TaskConfig",
+    "MultiMetricObjective",
     "ParameterSearchSpace",
     "ParameterSpec",
     "ParameterType",

--- a/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py
+++ b/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py
@@ -1,4 +1,3 @@
-from opik.environment import get_tqdm_for_current_environment
 import os
 import logging
 
@@ -29,8 +28,6 @@ from .types import (
 )
 from .prompts import IMPROVE_PROMPT_TEMPLATE
 
-tqdm = get_tqdm_for_current_environment()
-
 # Using disk cache for LLM calls
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
@@ -61,10 +58,14 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         max_parallel_batches: Maximum number of batches to process concurrently during
             hierarchical root cause analysis (default: 5)
         batch_size: Number of test cases per batch for root cause analysis (default: 25)
+        max_iterations: Maximum number of optimization iterations (default: 5)
+        convergence_threshold: Stop if relative improvement is below this threshold (default: 0.01)
         **model_kwargs: Additional arguments passed to the LLM model
     """
 
     DEFAULT_ROUNDS = 10
+    DEFAULT_MAX_ITERATIONS = 5
+    DEFAULT_CONVERGENCE_THRESHOLD = 0.01  # Stop if improvement is less than 1%
 
     def __init__(
         self,
@@ -74,6 +75,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         seed: int = 42,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
+        max_iterations: int = DEFAULT_MAX_ITERATIONS,
+        convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
         **model_kwargs: Any,
     ):
         super().__init__(
@@ -83,6 +86,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         self.num_threads = num_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
+        self.max_iterations = max_iterations
+        self.convergence_threshold = convergence_threshold
 
         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
@@ -247,6 +252,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
            "reasoning_model": self.reasoning_model,
            "num_threads": self.num_threads,
            "max_parallel_batches": self.max_parallel_batches,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
            "seed": self.seed,
            "verbose": self.verbose,
        }
@@ -417,7 +424,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None,
         attempt: int,
         max_attempts: int,
-    ) -> tuple[chat_prompt.ChatPrompt, float]:
+    ) -> tuple[chat_prompt.ChatPrompt, float, EvaluationResult]:
         """
         Generate and evaluate a single improvement attempt for a failure mode.
 
@@ -434,7 +441,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             max_attempts: Total number of attempts
 
         Returns:
-            Tuple of (improved_prompt, improved_score)
+            Tuple of (improved_prompt, improved_score, improved_experiment_result)
         """
         # Generate improvement with progress indication
         with reporting.display_prompt_improvement(
@@ -485,7 +492,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         ) / len(improved_experiment_result.test_results)
         improved_reporter.set_score(improved_score)
 
-        return improved_chat_prompt, improved_score
+        return improved_chat_prompt, improved_score, improved_experiment_result
 
     def optimize_prompt(
         self,
@@ -528,6 +535,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                "n_samples": n_samples,
                "auto_continue": auto_continue,
                "max_retries": max_retries,
+                "max_iterations": self.max_iterations,
+                "convergence_threshold": self.convergence_threshold,
            },
            verbose=self.verbose,
            tools=getattr(prompt, "tools", None),
@@ -557,53 +566,62 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             prompt.get_messages()
         )  # Store copy of initial messages for diff
 
-        # Iteration 1: Analyze and improve (structure ready for future multi-iteration support)
-        with reporting.display_optimization_iteration(
-            iteration=1, verbose=self.verbose
-        ) as iteration_reporter:
-            # Perform hierarchical root cause analysis
-            with reporting.display_root_cause_analysis(
-                verbose=self.verbose
-            ) as analysis_reporter:
-                hierarchical_analysis = self._hierarchical_root_cause_analysis(
-                    experiment_result
-                )
-                analysis_reporter.set_completed(
-                    total_test_cases=hierarchical_analysis.total_test_cases,
-                    num_batches=hierarchical_analysis.num_batches,
-                )
-
-            # Display hierarchical synthesis and failure modes
-            if self.verbose:
-                reporting.display_hierarchical_synthesis(
-                    total_test_cases=hierarchical_analysis.total_test_cases,
-                    num_batches=hierarchical_analysis.num_batches,
-                    synthesis_notes=hierarchical_analysis.synthesis_notes,
-                    verbose=self.verbose,
-                )
+        # Multi-iteration optimization loop
+        iteration = 0
+        previous_iteration_score = initial_score
+
+        for iteration in range(1, self.max_iterations + 1):
+            logger.info(f"Starting iteration {iteration}/{self.max_iterations}")
+
+            with reporting.display_optimization_iteration(
+                iteration=iteration, verbose=self.verbose
+            ) as iteration_reporter:
+                # Perform hierarchical root cause analysis
+                with reporting.display_root_cause_analysis(
+                    verbose=self.verbose
+                ) as analysis_reporter:
+                    hierarchical_analysis = self._hierarchical_root_cause_analysis(
+                        experiment_result
+                    )
+                    analysis_reporter.set_completed(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                    )
 
-            reporting.display_failure_modes(
-                failure_modes=hierarchical_analysis.unified_failure_modes,
-                verbose=self.verbose,
-            )
+                # Display hierarchical synthesis and failure modes
+                if self.verbose:
+                    reporting.display_hierarchical_synthesis(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                        synthesis_notes=hierarchical_analysis.synthesis_notes,
+                        verbose=self.verbose,
+                    )
 
-            # Generate improved prompt for each failure mode
-            for idx, root_cause in enumerate(
-                hierarchical_analysis.unified_failure_modes, 1
-            ):
-                logger.debug(
-                    f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                reporting.display_failure_modes(
+                    failure_modes=hierarchical_analysis.unified_failure_modes,
+                    verbose=self.verbose,
                 )
 
-                # Try multiple attempts if needed
-                max_attempts = max_retries + 1
-                improved_chat_prompt = None
-                improved_score = None
+                # Generate improved prompt for each failure mode
+                for idx, root_cause in enumerate(
+                    hierarchical_analysis.unified_failure_modes, 1
+                ):
+                    logger.debug(
+                        f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    )
 
-                for attempt in range(1, max_attempts + 1):
-                    # Generate and evaluate improvement
-                    improved_chat_prompt, improved_score = (
-                        self._generate_and_evaluate_improvement(
+                    # Try multiple attempts if needed
+                    max_attempts = max_retries + 1
+                    improved_chat_prompt = None
+                    improved_score = None
+
+                    for attempt in range(1, max_attempts + 1):
+                        # Generate and evaluate improvement
+                        (
+                            improved_chat_prompt,
+                            improved_score,
+                            improved_experiment_result,
+                        ) = self._generate_and_evaluate_improvement(
                             root_cause=root_cause,
                             best_prompt=best_prompt,
                             best_score=best_score,
@@ -615,64 +633,86 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                             attempt=attempt,
                             max_attempts=max_attempts,
                         )
-                    )
 
-                    # Check if we got improvement
-                    if improved_score > best_score:
-                        logger.info(
-                            f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                        # Check if we got improvement
+                        if improved_score > best_score:
+                            logger.info(
+                                f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                            )
+                            break
+
+                        # No improvement - should we retry?
+                        if attempt < max_attempts:
+                            reporting.display_retry_attempt(
+                                attempt=attempt,
+                                max_attempts=max_attempts,
+                                failure_mode_name=root_cause.name,
+                                verbose=self.verbose,
+                            )
+                        else:
+                            logger.debug(
+                                f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            )
+
+                    # Check if final result is an improvement
+                    if (
+                        improved_score is not None
+                        and improved_chat_prompt is not None
+                        and improved_score > best_score
+                    ):
+                        improvement = self._calculate_improvement(
+                            improved_score, best_score
                         )
-                        break
 
-                    # No improvement - should we retry?
-                    if attempt < max_attempts:
-                        reporting.display_retry_attempt(
-                            attempt=attempt,
-                            max_attempts=max_attempts,
-                            failure_mode_name=root_cause.name,
+                        # Display improvement for this iteration
+                        reporting.display_iteration_improvement(
+                            improvement=improvement,
+                            current_score=improved_score,
+                            best_score=best_score,
                             verbose=self.verbose,
                         )
+
+                        # Update best
+                        best_score = improved_score
+                        best_prompt = improved_chat_prompt
+                        best_messages = improved_chat_prompt.get_messages()
+                        experiment_result = improved_experiment_result
+                        logger.info(
+                            f"Updated best prompt after addressing '{root_cause.name}'"
+                        )
                     else:
                         logger.debug(
-                            f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
                        )
 
-                # Check if final result is an improvement
-                if (
-                    improved_score is not None
-                    and improved_chat_prompt is not None
-                    and improved_score > best_score
-                ):
-                    improvement = self._calculate_improvement(
-                        improved_score, best_score
-                    )
-
-                    # Display improvement for this iteration
-                    reporting.display_iteration_improvement(
-                        improvement=improvement,
-                        current_score=improved_score,
-                        best_score=best_score,
-                        verbose=self.verbose,
-                    )
+                # Mark iteration complete
+                improved_since_start = best_score > initial_score
+                iteration_reporter.iteration_complete(
+                    best_score=best_score, improved=improved_since_start
+                )
 
-                    # Update best
-                    best_score = improved_score
-                    best_prompt = improved_chat_prompt
-                    best_messages = improved_chat_prompt.get_messages()
-                    logger.info(
-                        f"Updated best prompt after addressing '{root_cause.name}'"
-                    )
-                else:
-                    logger.debug(
-                        f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
-                    )
+            # Check for convergence after iteration
+            iteration_improvement = self._calculate_improvement(
+                best_score, previous_iteration_score
+            )
 
-            # Mark iteration complete
-            improved_since_start = best_score > initial_score
-            iteration_reporter.iteration_complete(
-                best_score=best_score, improved=improved_since_start
+            logger.info(
+                f"Iteration {iteration} complete. Score: {best_score:.4f}, "
+                f"Improvement: {iteration_improvement:.2%}"
             )
 
+            # Stop if improvement is below convergence threshold
+            if abs(iteration_improvement) < self.convergence_threshold:
+                logger.info(
+                    f"Convergence achieved: improvement ({iteration_improvement:.2%}) "
+                    f"below threshold ({self.convergence_threshold:.2%}). "
+                    f"Stopping after {iteration} iterations."
+                )
+                break
+
+            # Update previous score for next iteration
+            previous_iteration_score = best_score
+
         # Display final optimization result with diff
         reporting.display_optimized_prompt_diff(
             initial_messages=initial_messages,
@@ -682,14 +722,27 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             verbose=self.verbose,
         )
 
+        # Update optimization status to completed
+        try:
+            optimization.update(status="completed")
+            logger.info(f"Optimization {optimization.id} status updated to completed.")
+        except Exception as e:
+            logger.warning(f"Failed to update optimization status: {e}")
+
         # Prepare details for the result
         details = {
+            "model": best_prompt.model or self.model,
+            "temperature": (best_prompt.model_kwargs or {}).get("temperature")
+            or self.model_kwargs.get("temperature"),
            "reasoning_model": self.reasoning_model,
            "num_threads": self.num_threads,
            "max_parallel_batches": self.max_parallel_batches,
            "max_retries": max_retries,
            "n_samples": n_samples,
            "auto_continue": auto_continue,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
+            "iterations_completed": iteration,
        }
 
        # Extract tool prompts if tools exist
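
The two new constructor arguments drive the multi-iteration loop added above: the optimizer now repeats root-cause analysis and prompt improvement until max_iterations is reached, or stops early once the relative improvement between iterations falls below convergence_threshold. A hedged usage sketch; the model argument is an assumption (its parameter is not shown in this diff) and the metric/dataset wiring is omitted:

    from opik_optimizer import HierarchicalReflectiveOptimizer

    optimizer = HierarchicalReflectiveOptimizer(
        model="openai/gpt-4o",       # assumed parameter name, not shown in this diff
        max_iterations=5,            # new in 2.1.3: cap on optimization iterations
        convergence_threshold=0.01,  # new in 2.1.3: stop below 1% relative improvement
    )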

--- a/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py
+++ b/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py
@@ -1,8 +1,8 @@
 import logging
 import asyncio
 from typing import Any
-from tqdm import tqdm
 
+from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn
 from opik.evaluation.evaluation_result import EvaluationResult
 from .types import (
     RootCauseAnalysis,
@@ -11,6 +11,7 @@ from .types import (
 )
 from . import reporting
 from .prompts import BATCH_ANALYSIS_PROMPT, SYNTHESIS_PROMPT
+from ..reporting_utils import get_console
 
 logger = logging.getLogger(__name__)
 
@@ -285,13 +286,11 @@ Scores:
 
         semaphore = asyncio.Semaphore(self.max_parallel_batches)
 
-        # Create progress bar for batch processing
-        pbar = tqdm(
-            total=len(batch_tasks), desc="Processing batches", unit="batch", leave=False
-        )
+        # Create progress bar for batch processing using Rich
+        console = get_console()
 
         async def run_with_semaphore(
-            batch_num: int, task: Any
+            batch_num: int, task: Any, progress: Progress | None, task_id: Any | None
         ) -> tuple[int, BatchAnalysis]:
             async with semaphore:
                 try:
@@ -300,19 +299,44 @@ Scores:
                         f"Completed batch {batch_num}: "
                         f"identified {len(result.failure_modes)} failure modes"
                     )
-                    pbar.update(1)  # Update progress bar
+                    if progress and task_id is not None:
+                        progress.update(task_id, advance=1)  # Update progress bar
                     return batch_num, result
                 except Exception as exc:
                     logger.error(f"Batch {batch_num} failed: {exc}")
-                    pbar.update(1)  # Update progress bar even on error
+                    if progress and task_id is not None:
+                        progress.update(
+                            task_id, advance=1
+                        )  # Update progress bar even on error
                     raise
 
-        # Run all tasks with semaphore control
-        results = await asyncio.gather(
-            *[run_with_semaphore(num, task) for num, task in batch_tasks]
-        )
-
-        pbar.close()  # Close progress bar
+        # Run all tasks with semaphore control and rich progress bar
+        if self.verbose >= 1:
+            with Progress(
+                TextColumn("│ "),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console,
+                transient=True,
+            ) as progress:
+                task_id = progress.add_task(
+                    "Processing batches", total=len(batch_tasks)
+                )
+                results = await asyncio.gather(
+                    *[
+                        run_with_semaphore(num, task, progress, task_id)
+                        for num, task in batch_tasks
+                    ]
+                )
+        else:
+            # No progress bar in non-verbose mode
+            results = await asyncio.gather(
+                *[
+                    run_with_semaphore(num, task, None, None)
+                    for num, task in batch_tasks
+                ]
+            )
 
         # Sort by batch number to maintain order
         batch_analyses = [result for _, result in sorted(results)]
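
The tqdm bar is replaced by a rich Progress whose column layout matches the "│ " gutter used by the reporters. A standalone sketch of the same pattern using only rich's public API (the work loop is a stand-in):

    import time
    from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn

    with Progress(
        TextColumn("│ "),  # gutter prefix, as in the diff above
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        transient=True,    # the bar is cleared once the work finishes
    ) as progress:
        task_id = progress.add_task("Processing batches", total=10)
        for _ in range(10):
            time.sleep(0.1)                      # stand-in for one batch of work
            progress.update(task_id, advance=1)  # same call the analyzer makes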

--- a/opik_optimizer/hierarchical_reflective_optimizer/reporting.py
+++ b/opik_optimizer/hierarchical_reflective_optimizer/reporting.py
@@ -29,9 +29,11 @@ def display_retry_attempt(
     """Display retry attempt information."""
     if verbose >= 1:
         console.print(
-            Text(
-                f"│ Retry attempt {attempt + 1}/{max_attempts} for failure mode '{failure_mode_name}' (no improvement observed)",
-                style="yellow",
+            Text("│ ").append(
+                Text(
+                    f"Retry attempt {attempt + 1}/{max_attempts} for failure mode '{failure_mode_name}' (no improvement observed)",
+                    style="yellow",
+                )
             )
         )
 
@@ -132,14 +134,16 @@ def display_evaluation(
         def set_score(self, s: float) -> None:
             if verbose >= 1:
                 # Adjust score indentation based on indent style
-                score_indent = " " if indent == "> " else "│ "
+                score_indent = "" if indent == "> " else "│ "
 
                 if baseline_score is None:
                     # This is the baseline evaluation
                     console.print(
-                        Text(
-                            f"\r{score_indent}Baseline score was: {s:.4f}.",
-                            style="green",
+                        Text(score_indent).append(
+                            Text(
+                                f"Baseline score was: {s:.4f}.",
+                                style="green",
+                            )
                         )
                     )
                     console.print(Text("│"))
@@ -152,9 +156,11 @@ def display_evaluation(
                         else 0
                     )
                     console.print(
-                        Text(
-                            f"\r{score_indent}Score for updated prompt: {s:.4f} (+{improvement_pct:.1f}%)",
-                            style="green bold",
+                        Text(score_indent).append(
+                            Text(
+                                f"Score for updated prompt: {s:.4f} (+{improvement_pct:.1f}%)",
+                                style="green bold",
+                            )
                         )
                     )
                 elif s < baseline_score:
@@ -164,23 +170,27 @@ def display_evaluation(
                         else 0
                     )
                     console.print(
-                        Text(
-                            f"\r{score_indent}Score for updated prompt: {s:.4f} (-{decline_pct:.1f}%)",
-                            style="red",
+                        Text(score_indent).append(
+                            Text(
+                                f"Score for updated prompt: {s:.4f} (-{decline_pct:.1f}%)",
+                                style="red",
+                            )
                         )
                     )
                 else:
                     console.print(
-                        Text(
-                            f"\r{score_indent}Score for updated prompt: {s:.4f} (no change)",
-                            style="yellow",
+                        Text(score_indent).append(
+                            Text(
+                                f"Score for updated prompt: {s:.4f} (no change)",
+                                style="yellow",
+                            )
                         )
                     )
                 console.print(Text("│"))
 
     # Use our log suppression context manager and yield the reporter
     # Adjust progress bar indentation based on indent style
-    progress_indent = " Evaluation" if indent == "> " else "│ Evaluation"
+    progress_indent = "Evaluation" if indent == "> " else "│ Evaluation"
     with suppress_opik_logs():
         with convert_tqdm_to_rich(progress_indent, verbose=verbose):
             try:
@@ -306,25 +316,31 @@ def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
 def display_optimization_iteration(iteration: int, verbose: int = 1) -> Iterator[Any]:
     """Context manager to display progress for a single optimization iteration."""
     if verbose >= 1:
-        console.print()
         console.print(Text("│"))
-        console.print(Text(f"│ Iteration {iteration}", style="bold cyan"))
+        console.print(Text("│"))
+        console.print(
+            Text("│ ").append(Text(f"Iteration {iteration}", style="bold cyan"))
+        )
 
     class Reporter:
         def iteration_complete(self, best_score: float, improved: bool) -> None:
             if verbose >= 1:
                 if improved:
                     console.print(
-                        Text(
-                            f"│ Iteration {iteration} complete - New best score: {best_score:.4f}",
-                            style="green",
+                        Text("│ ").append(
+                            Text(
+                                f"Iteration {iteration} complete - New best score: {best_score:.4f}",
+                                style="green",
+                            )
                         )
                     )
                 else:
                     console.print(
-                        Text(
-                            f"│ Iteration {iteration} complete - No improvement (best: {best_score:.4f})",
-                            style="yellow",
+                        Text("│ ").append(
+                            Text(
+                                f"Iteration {iteration} complete - No improvement (best: {best_score:.4f})",
+                                style="yellow",
+                            )
                         )
                     )
                 console.print(Text("│"))
@@ -341,16 +357,20 @@ def display_root_cause_analysis(verbose: int = 1) -> Iterator[Any]:
     if verbose >= 1:
         console.print(Text("│ "))
         console.print(
-            Text("│ Analyzing root cause of failed evaluation items", style="cyan")
+            Text("│ ").append(
+                Text("Analyzing root cause of failed evaluation items", style="cyan")
+            )
         )
 
     class Reporter:
         def set_completed(self, total_test_cases: int, num_batches: int) -> None:
             if verbose >= 1:
                 console.print(
-                    Text(
-                        f"│ Analyzed {total_test_cases} test cases across {num_batches} batches",
-                        style="green",
+                    Text("│ ").append(
+                        Text(
+                            f"Analyzed {total_test_cases} test cases across {num_batches} batches",
+                            style="green",
+                        )
                    )
                )
                console.print(Text("│ "))
@@ -367,7 +387,9 @@ def display_root_cause_analysis(verbose: int = 1) -> Iterator[Any]:
 def display_batch_synthesis(num_batches: int, verbose: int = 1) -> Iterator[Any]:
     """Context manager to display message during batch synthesis."""
     if verbose >= 1:
-        console.print(Text("│ Synthesizing failure modes", style="cyan"))
+        console.print(
+            Text("│ ").append(Text("Synthesizing failure modes", style="cyan"))
+        )
 
     class Reporter:
         def set_completed(self, num_unified_modes: int) -> None:
@@ -406,10 +428,13 @@ def display_hierarchical_synthesis(
         console.print(panel)
 
     rendered_panel = capture.get()
-    for line in rendered_panel.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))
 
-    console.print()
+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_panel.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
+    console.print(Text("│"))
 
 
 def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
@@ -433,10 +458,13 @@ def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
         console.print(header_panel)
 
     rendered_header = capture.get()
-    for line in rendered_header.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))
 
-    console.print()
+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_header.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
+    console.print(Text("│"))
 
     for idx, failure_mode in enumerate(failure_modes, 1):
         # Create content for this failure mode
@@ -460,8 +488,14 @@ def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
            console.print(panel)
 
        rendered_panel = capture.get()
-        for line in rendered_panel.splitlines():
-            console.print(Text("") + Text.from_ansi(line))
+
+        # Prefix each line with '', preserving ANSI styles
+        prefixed_output = "\n".join(
+            f"│ {line}" for line in rendered_panel.splitlines()
+        )
+
+        # Print the prefixed output (will include colors)
+        console.print(prefixed_output, highlight=False)
 
        if idx < len(failure_modes):
            console.print("│")
@@ -473,9 +507,13 @@ def display_prompt_improvement(
 ) -> Iterator[Any]:
     """Context manager to display progress while generating improved prompt."""
     if verbose >= 1:
-        console.print()
+        console.print(Text("│"))
         console.print(Text("│ "))
-        console.print(Text(f"│ Addressing: {failure_mode_name}", style="bold cyan"))
+        console.print(
+            Text("│ ").append(
+                Text(f"Addressing: {failure_mode_name}", style="bold cyan")
+            )
+        )
 
     class Reporter:
         def set_reasoning(self, reasoning: str) -> None:
@@ -498,9 +536,14 @@ def display_prompt_improvement(
                console.print(panel)
 
            rendered_panel = capture.get()
-            for line in rendered_panel.splitlines():
-                console.print(Text("│ ") + Text.from_ansi(line))
 
+            # Prefix each line with '│ ', preserving ANSI styles
+            prefixed_output = "\n".join(
+                f"│ {line}" for line in rendered_panel.splitlines()
+            )
+
+            # Print the prefixed output (will include colors)
+            console.print(prefixed_output, highlight=False)
            console.print(Text("│ "))
 
    try:
@@ -520,9 +563,11 @@ def display_improvement_reasoning(
     if verbose < 1:
         return
 
-    console.print()
+    console.print(Text("│"))
     console.print(Text("│ "))
-    console.print(Text(f"│ Addressing: {failure_mode_name}", style="bold cyan"))
+    console.print(
+        Text("│ ").append(Text(f"Addressing: {failure_mode_name}", style="bold cyan"))
+    )
 
     reasoning_content = Text()
     reasoning_content.append("Improvement Strategy:\n", style="cyan")
@@ -542,9 +587,12 @@ def display_improvement_reasoning(
        console.print(panel)
 
    rendered_panel = capture.get()
-    for line in rendered_panel.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))
 
+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_panel.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
    console.print(Text("│ "))
 
@@ -557,16 +605,20 @@ def display_iteration_improvement(
 
     if improvement > 0:
         console.print(
-            Text(
-                f"│ ✓ Improvement: {improvement:.2%} (from {best_score:.4f} to {current_score:.4f})",
-                style="green bold",
+            Text("│ ").append(
+                Text(
+                    f"✓ Improvement: {improvement:.2%} (from {best_score:.4f} to {current_score:.4f})",
+                    style="green bold",
+                )
             )
         )
     else:
         console.print(
-            Text(
-                f"│ ✗ No improvement: {improvement:.2%} (score: {current_score:.4f}, best: {best_score:.4f})",
-                style="yellow",
+            Text("│ ").append(
+                Text(
+                    f"✗ No improvement: {improvement:.2%} (score: {current_score:.4f}, best: {best_score:.4f})",
+                    style="yellow",
+                )
             )
         )
 
@@ -584,27 +636,31 @@ def display_optimized_prompt_diff(
     if verbose < 1:
         return
 
-    console.print()
     console.print(Text("│"))
-    console.print(Text("│ > Optimization Results", style="bold green"))
+    console.print(Text("│"))
+    console.print(Text("│ ").append(Text("> Optimization Results", style="bold green")))
     console.print(Text("│"))
 
     # Show score improvement
     if best_score > initial_score:
         perc_change = (best_score - initial_score) / initial_score
         console.print(
-            Text(
-                f"│ Prompt improved from {initial_score:.4f} to {best_score:.4f} ({perc_change:.2%})",
-                style="green",
+            Text("│ ").append(
+                Text(
+                    f"Prompt improved from {initial_score:.4f} to {best_score:.4f} ({perc_change:.2%})",
+                    style="green",
+                )
            )
        )
    else:
        console.print(
-            Text(f"│ No improvement found (score: {best_score:.4f})", style="yellow")
+            Text("│ ").append(
+                Text(f"No improvement found (score: {best_score:.4f})", style="yellow")
+            )
        )
 
    console.print(Text("│"))
-    console.print(Text("│ Prompt Changes:", style="cyan"))
+    console.print(Text("│ ").append(Text("Prompt Changes:", style="cyan")))
    console.print(Text("│"))
 
    # Compare each message
@@ -626,24 +682,30 @@ def display_optimized_prompt_diff(
 
         # Handle added messages
         if not initial_msg:
-            console.print(Text(f"│ {role}: (added)", style="green bold"))
+            console.print(
+                Text("│ ").append(Text(f"{role}: (added)", style="green bold"))
+            )
             for line in optimized_content.splitlines():
-                console.print(Text(f"│ +{line}", style="green"))
+                console.print(Text("│ ").append(Text(f"+{line}", style="green")))
             console.print(Text("│"))
             continue
 
         # Handle removed messages
         if not optimized_msg:
-            console.print(Text(f"│ {role}: (removed)", style="red bold"))
+            console.print(
+                Text("│ ").append(Text(f"{role}: (removed)", style="red bold"))
+            )
             for line in initial_content.splitlines():
-                console.print(Text(f"│ -{line}", style="red"))
+                console.print(Text("│ ").append(Text(f"-{line}", style="red")))
             console.print(Text("│"))
             continue
 
         # Check if there are changes
         if initial_content == optimized_content:
             # No changes in this message
-            console.print(Text(f"│ {role}: (unchanged)", style="dim"))
+            console.print(
+                Text("│ ").append(Text(f"{role}: (unchanged)", style="dim"))
+            )
             continue
 
         # Generate unified diff
@@ -660,7 +722,7 @@ def display_optimized_prompt_diff(
             continue
 
         # Display message header
-        console.print(Text(f"│ {role}:", style="bold cyan"))
+        console.print(Text("│ ").append(Text(f"{role}:", style="bold cyan")))
 
         # Create diff content
         diff_content = Text()
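
The recurring change in this file swaps f-string prefixes such as f"│ ..." for Text("│ ").append(...), which keeps the gutter character unstyled while the message keeps its own style; rich's Text.append accepts another Text and returns the combined instance, so the expression can be passed straight to console.print. A minimal sketch:

    from rich.console import Console
    from rich.text import Text

    console = Console()
    # The "│ " gutter stays unstyled; only the message renders in yellow.
    console.print(Text("│ ").append(Text("Retry attempt 2/3", style="yellow")))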

--- /dev/null
+++ b/opik_optimizer/multi_metric_objective.py
@@ -0,0 +1,33 @@
+from typing import Any
+from collections.abc import Callable
+from opik.evaluation.metrics.score_result import ScoreResult
+
+
+class MultiMetricObjective:
+    def __init__(
+        self,
+        metrics: list[Callable[[dict[str, Any], str], ScoreResult]],
+        weights: list[float] | None = None,
+        name: str = "multi_metric_objective",
+    ):
+        self.metrics = metrics
+        self.weights = weights if weights else [1 / len(metrics)] * len(metrics)
+        self.__name__ = name
+
+    def __call__(self, dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
+        raw_score_results = []
+        weighted_score_value = 0
+
+        for metric, weight in zip(self.metrics, self.weights):
+            score_result = metric(dataset_item, llm_output)
+            raw_score_results.append(score_result)
+            weighted_score_value += score_result.value * weight
+
+        aggregated_score_result = ScoreResult(
+            name=self.__name__,
+            value=weighted_score_value,
+            metadata={"raw_score_results": raw_score_results},
+        )
+
+        # Important: we return the aggregated score result first
+        return aggregated_score_result
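
MultiMetricObjective folds several per-item metrics into one weighted scalar while keeping the raw results in metadata. A usage sketch; the two metric functions are illustrative stand-ins, not part of the package:

    from opik.evaluation.metrics.score_result import ScoreResult
    from opik_optimizer import MultiMetricObjective

    # Hypothetical metrics; any callable of shape
    # (dataset_item: dict, llm_output: str) -> ScoreResult works.
    def exact_match(dataset_item, llm_output):
        return ScoreResult(name="exact_match", value=float(llm_output == dataset_item["answer"]))

    def brevity(dataset_item, llm_output):
        return ScoreResult(name="brevity", value=1.0 if len(llm_output) < 200 else 0.0)

    objective = MultiMetricObjective(
        metrics=[exact_match, brevity],
        weights=[0.7, 0.3],  # omitted weights default to equal shares
    )
    combined = objective(dataset_item={"answer": "42"}, llm_output="42")
    # combined.value == 0.7 * 1.0 + 0.3 * 1.0 == 1.0
    # combined.metadata["raw_score_results"] holds the two per-metric ScoreResults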

--- a/opik_optimizer/task_evaluator.py
+++ b/opik_optimizer/task_evaluator.py
@@ -5,6 +5,7 @@ from collections.abc import Callable
 import opik
 from opik.evaluation import evaluator as opik_evaluator
 from opik.evaluation.metrics import base_metric, score_result
+from . import multi_metric_objective
 
 logger = logging.getLogger(__name__)
 
@@ -14,9 +15,20 @@ def _create_metric_class(metric: Callable) -> base_metric.BaseMetric:
         def __init__(self) -> None:
             self.name = metric.__name__
 
-        def score(self, llm_output: str, **kwargs: Any) -> score_result.ScoreResult:
+        def score(
+            self, llm_output: str, **kwargs: Any
+        ) -> score_result.ScoreResult | list[score_result.ScoreResult]:
             try:
                 metric_val = metric(dataset_item=kwargs, llm_output=llm_output)
+
+                if isinstance(metric, multi_metric_objective.MultiMetricObjective):
+                    if (
+                        hasattr(metric_val, "metadata")
+                        and "raw_score_results" in metric_val.metadata
+                    ):
+                        return [metric_val, *metric_val.metadata["raw_score_results"]]
+                    else:
+                        return [metric_val]
                 if isinstance(metric_val, score_result.ScoreResult):
                     return score_result.ScoreResult(
                         name=self.name,
@@ -107,15 +119,20 @@ def evaluate(
     if not result.test_results:
         return 0.0
 
-    # We may allow score aggregation customization.
-    score_results: list[score_result.ScoreResult] = [
-        test_result.score_results[0] for test_result in result.test_results
-    ]
-    if not score_results:
+    # Filter score results to only include the objective metric
+    objective_metric_name = metric.__name__
+    objective_score_results: list[score_result.ScoreResult] = []
+    for test_result in result.test_results:
+        for score_result_ in test_result.score_results:
+            if score_result_.name == objective_metric_name:
+                objective_score_results.append(score_result_)
+                break
+
+    if not objective_score_results:
         return 0.0
 
-    avg_score = sum([score_result_.value for score_result_ in score_results]) / len(
-        score_results
-    )
+    avg_score = sum(
+        [score_result_.value for score_result_ in objective_score_results]
+    ) / len(objective_score_results)
 
     return avg_score
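
With a MultiMetricObjective, each test result now carries the aggregated score followed by the raw per-metric scores, so evaluate() filters by the objective's name before averaging. A small sketch of that selection in isolation, using plain data rather than a real evaluation result:

    from opik.evaluation.metrics.score_result import ScoreResult

    # One test result's scores: the aggregate first, then the raw metrics.
    scores = [
        ScoreResult(name="multi_metric_objective", value=0.91),
        ScoreResult(name="exact_match", value=1.0),
        ScoreResult(name="brevity", value=0.7),
    ]
    # Keep only the objective metric, mirroring the filter in evaluate().
    objective_scores = [s for s in scores if s.name == "multi_metric_objective"]
    assert objective_scores[0].value == 0.91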

--- a/opik_optimizer-2.1.2.dist-info/METADATA
+++ b/opik_optimizer-2.1.3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opik_optimizer
-Version: 2.1.2
+Version: 2.1.3
 Summary: Agent optimization with Opik
 Home-page: https://github.com/comet-ml/opik
 Author: Comet ML

--- a/opik_optimizer-2.1.2.dist-info/RECORD
+++ b/opik_optimizer-2.1.3.dist-info/RECORD
@@ -1,13 +1,14 @@
-opik_optimizer/__init__.py,sha256=VwryQ5bSOmJSl4CiacCIv_UF_In8Zho54fQ3FUR8pyk,1573
+opik_optimizer/__init__.py,sha256=lA9cjEsNxrJwYJ68vCjeNZgrcxO_rNJaAHsdMwaq364,1658
 opik_optimizer/_throttle.py,sha256=1JXIhYlo0IaqCgwmNB0Hnh9CYhYPkwRFdVGIcE7pVNg,1362
 opik_optimizer/base_optimizer.py,sha256=XryBkUTs4FQmHcBtVm63EJIKWrTvwqduUZ6ArHzYQko,21520
 opik_optimizer/cache_config.py,sha256=Xd3NdUsL7bLQWoNe3pESqH4nHucU1iNTSGp-RqbwDog,599
 opik_optimizer/logging_config.py,sha256=TmxX0C1P20amxoXuiNQvlENOjdSNfWwvL8jFy206VWM,3837
+opik_optimizer/multi_metric_objective.py,sha256=y4jqirnhkfhB7SWonI4ldYg5fWG4JGfAxqu7ylRD1J4,1178
 opik_optimizer/optimizable_agent.py,sha256=R0_BdwdHyZGWTw3oSvTg8FULDOYM8XaTiPNR3qV8DkQ,6344
 opik_optimizer/optimization_result.py,sha256=sG-Yr-hOaH9zx_I5S6_W3v6j8nPUhwYdS333jVM4Gus,17218
 opik_optimizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opik_optimizer/reporting_utils.py,sha256=dcECFmzZ_J-DKoukMDEE_fm7X8sdQyl_ijTddvQtepE,8287
-opik_optimizer/task_evaluator.py,sha256=1hILYwJLtn7XpPX96JjubnlMasmudVTHMVK3pmd22bE,4312
+opik_optimizer/task_evaluator.py,sha256=7N254DU0UkWJ5saQ5AmYEsHHSrychAJtedmmjNsCOnI,5081
 opik_optimizer/data/context7_eval.jsonl,sha256=vPR3XRfI0UbZ1hgUGaOdpraFT99RDLU1YWuPFLLQz40,1757
 opik_optimizer/data/hotpot-500.json,sha256=YXxCtuvYvxSu5u0y4559a6b1qwgAYsWzT_SUKv_21ew,76862
 opik_optimizer/datasets/__init__.py,sha256=V4LVDOaRjwzaYvhdQ3V6CAwFaeKnxyTV1lp_ES9Z31E,691
@@ -46,10 +47,10 @@ opik_optimizer/gepa_optimizer/adapter.py,sha256=KzPa4koq7aJhALMAOKPxAO4yWuEy_YbW
 opik_optimizer/gepa_optimizer/gepa_optimizer.py,sha256=HBjikhce3K4VIaiIXs7eSagmRyFPdY8h4seoW9F3nQE,26481
 opik_optimizer/gepa_optimizer/reporting.py,sha256=F0cxYSjRuFAszgi3rgqwH1A-KH26kZOLtENP7x1xrQs,5154
 opik_optimizer/hierarchical_reflective_optimizer/__init__.py,sha256=9qM3kvfAaFy-Y6Tg19MXHJxpnF5DJQQwzr6oNsxaRBM,133
-opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py,sha256=Fs83ztOuPS8mkFvJAVmYok15DaXTk4Jqpoa9ImRl2t4,27256
-opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py,sha256=GSIXUBxoS9LFnCXopS6B6wLSpmCYXA8Cv6ytELgEBoc,12709
+opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py,sha256=j9Gr5z9j-evFhkbxkbiZ7RXt6Q89LshYYR4ac_UxwX0,30235
+opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py,sha256=0D5wgx04jZvTJ0Yjqm0jtQvkjrGBB73qgcsSwLBpnv0,13814
 opik_optimizer/hierarchical_reflective_optimizer/prompts.py,sha256=XcOEI9eeEbTgKFsFiRWxvHdaByQkiN02bH2gTl3HX-Y,3853
-opik_optimizer/hierarchical_reflective_optimizer/reporting.py,sha256=LpHv_WBZCg2a0RhZaGwUmCch_-Dfk_rpuMxTckJMWTU,23234
+opik_optimizer/hierarchical_reflective_optimizer/reporting.py,sha256=d1jQ3uZs0fTI2DeumvGmkxuMHtwA0wt_ROtl4E6UdIM,25461
 opik_optimizer/hierarchical_reflective_optimizer/types.py,sha256=bS-JAheX2FpJ4XAxoZi5PfjloG8L-B1LGQA1iLXZhW4,1031
 opik_optimizer/mcp_utils/__init__.py,sha256=BsWQT8nAa6JV6zcOD__OvPMepUS2IpJD4J2rnAXhpuU,710
 opik_optimizer/mcp_utils/mcp.py,sha256=UylgpTJsybszS433_kuTAgKH-PPde-VHjHVelMardFs,18466
@@ -79,8 +80,8 @@ opik_optimizer/utils/colbert.py,sha256=qSrzKUUGw7P92mLy4Ofug5pBGeTsHBLMJXlXSJSfK
 opik_optimizer/utils/core.py,sha256=5GT1vp6fW8ICO42LHMX14BjR-xEb6afAKjM7b1Evx5M,15298
 opik_optimizer/utils/dataset_utils.py,sha256=dqRUGOekjeNWL0J15R8xFwLyKJDJynJXzVyQmt8rhHA,1464
 opik_optimizer/utils/prompt_segments.py,sha256=1zUITSccJ82Njac1rmANzim4WWM6rVac61mfluS7lFE,5931
-opik_optimizer-2.1.2.dist-info/licenses/LICENSE,sha256=V-0VHJOBdcA_teT8VymvsBUQ1-CZU6yJRmMEjec_8tA,11372
-opik_optimizer-2.1.2.dist-info/METADATA,sha256=V2WXOFaF2tc7-xAHcdndLn8pcUp1JHxHEVONa0K8j78,12829
-opik_optimizer-2.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-opik_optimizer-2.1.2.dist-info/top_level.txt,sha256=ondOlpq6_yFckqpxoAHSfzZS2N-JfgmA-QQhOJfz7m0,15
-opik_optimizer-2.1.2.dist-info/RECORD,,
+opik_optimizer-2.1.3.dist-info/licenses/LICENSE,sha256=V-0VHJOBdcA_teT8VymvsBUQ1-CZU6yJRmMEjec_8tA,11372
+opik_optimizer-2.1.3.dist-info/METADATA,sha256=omnNZ2--FZxU-ex3SEKYF4ZaKRDTcQfkPoc2kxKLB7U,12829
+opik_optimizer-2.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+opik_optimizer-2.1.3.dist-info/top_level.txt,sha256=ondOlpq6_yFckqpxoAHSfzZS2N-JfgmA-QQhOJfz7m0,15
+opik_optimizer-2.1.3.dist-info/RECORD,,