opik-optimizer 2.1.2-py3-none-any.whl → 2.1.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
--- a/opik_optimizer/__init__.py
+++ b/opik_optimizer/__init__.py
@@ -19,6 +19,7 @@ from .mipro_optimizer import MiproOptimizer
 from .hierarchical_reflective_optimizer import HierarchicalReflectiveOptimizer
 from .optimization_config.configs import TaskConfig
 from .optimization_result import OptimizationResult
+from .multi_metric_objective import MultiMetricObjective
 from .parameter_optimizer import (
     ParameterOptimizer,
     ParameterSearchSpace,
@@ -48,6 +49,7 @@ __all__ = [
     "setup_logging",
     "datasets",
     "TaskConfig",
+    "MultiMetricObjective",
     "ParameterSearchSpace",
     "ParameterSpec",
     "ParameterType",

--- a/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py
+++ b/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py
@@ -1,4 +1,3 @@
-from opik.environment import get_tqdm_for_current_environment
 import os
 import logging
 
@@ -29,8 +28,6 @@ from .types import (
 )
 from .prompts import IMPROVE_PROMPT_TEMPLATE
 
-tqdm = get_tqdm_for_current_environment()
-
 # Using disk cache for LLM calls
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
@@ -61,10 +58,14 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         max_parallel_batches: Maximum number of batches to process concurrently during
             hierarchical root cause analysis (default: 5)
         batch_size: Number of test cases per batch for root cause analysis (default: 25)
+        max_iterations: Maximum number of optimization iterations (default: 5)
+        convergence_threshold: Stop if relative improvement is below this threshold (default: 0.01)
         **model_kwargs: Additional arguments passed to the LLM model
     """
 
     DEFAULT_ROUNDS = 10
+    DEFAULT_MAX_ITERATIONS = 5
+    DEFAULT_CONVERGENCE_THRESHOLD = 0.01  # Stop if improvement is less than 1%
 
     def __init__(
         self,
@@ -74,6 +75,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         seed: int = 42,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
+        max_iterations: int = DEFAULT_MAX_ITERATIONS,
+        convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
         **model_kwargs: Any,
     ):
         super().__init__(
@@ -83,6 +86,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         self.num_threads = num_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
+        self.max_iterations = max_iterations
+        self.convergence_threshold = convergence_threshold
 
         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
@@ -247,6 +252,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
            "reasoning_model": self.reasoning_model,
            "num_threads": self.num_threads,
            "max_parallel_batches": self.max_parallel_batches,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
            "seed": self.seed,
            "verbose": self.verbose,
        }
@@ -417,7 +424,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None,
         attempt: int,
         max_attempts: int,
-    ) -> tuple[chat_prompt.ChatPrompt, float]:
+    ) -> tuple[chat_prompt.ChatPrompt, float, EvaluationResult]:
         """
         Generate and evaluate a single improvement attempt for a failure mode.
 
@@ -434,7 +441,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             max_attempts: Total number of attempts
 
         Returns:
-            Tuple of (improved_prompt, improved_score)
+            Tuple of (improved_prompt, improved_score, improved_experiment_result)
         """
         # Generate improvement with progress indication
         with reporting.display_prompt_improvement(
@@ -485,7 +492,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         ) / len(improved_experiment_result.test_results)
         improved_reporter.set_score(improved_score)
 
-        return improved_chat_prompt, improved_score
+        return improved_chat_prompt, improved_score, improved_experiment_result
 
     def optimize_prompt(
         self,
@@ -528,6 +535,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                "n_samples": n_samples,
                "auto_continue": auto_continue,
                "max_retries": max_retries,
+                "max_iterations": self.max_iterations,
+                "convergence_threshold": self.convergence_threshold,
            },
            verbose=self.verbose,
            tools=getattr(prompt, "tools", None),
@@ -557,53 +566,62 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             prompt.get_messages()
         )  # Store copy of initial messages for diff
 
-        # Iteration 1: Analyze and improve (structure ready for future multi-iteration support)
-        with reporting.display_optimization_iteration(
-            iteration=1, verbose=self.verbose
-        ) as iteration_reporter:
-            # Perform hierarchical root cause analysis
-            with reporting.display_root_cause_analysis(
-                verbose=self.verbose
-            ) as analysis_reporter:
-                hierarchical_analysis = self._hierarchical_root_cause_analysis(
-                    experiment_result
-                )
-                analysis_reporter.set_completed(
-                    total_test_cases=hierarchical_analysis.total_test_cases,
-                    num_batches=hierarchical_analysis.num_batches,
-                )
-
-            # Display hierarchical synthesis and failure modes
-            if self.verbose:
-                reporting.display_hierarchical_synthesis(
-                    total_test_cases=hierarchical_analysis.total_test_cases,
-                    num_batches=hierarchical_analysis.num_batches,
-                    synthesis_notes=hierarchical_analysis.synthesis_notes,
-                    verbose=self.verbose,
-                )
+        # Multi-iteration optimization loop
+        iteration = 0
+        previous_iteration_score = initial_score
+
+        for iteration in range(1, self.max_iterations + 1):
+            logger.info(f"Starting iteration {iteration}/{self.max_iterations}")
+
+            with reporting.display_optimization_iteration(
+                iteration=iteration, verbose=self.verbose
+            ) as iteration_reporter:
+                # Perform hierarchical root cause analysis
+                with reporting.display_root_cause_analysis(
+                    verbose=self.verbose
+                ) as analysis_reporter:
+                    hierarchical_analysis = self._hierarchical_root_cause_analysis(
+                        experiment_result
+                    )
+                    analysis_reporter.set_completed(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                    )
 
-            reporting.display_failure_modes(
-                failure_modes=hierarchical_analysis.unified_failure_modes,
-                verbose=self.verbose,
-            )
+                # Display hierarchical synthesis and failure modes
+                if self.verbose:
+                    reporting.display_hierarchical_synthesis(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                        synthesis_notes=hierarchical_analysis.synthesis_notes,
+                        verbose=self.verbose,
+                    )
 
-            # Generate improved prompt for each failure mode
-            for idx, root_cause in enumerate(
-                hierarchical_analysis.unified_failure_modes, 1
-            ):
-                logger.debug(
-                    f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                reporting.display_failure_modes(
+                    failure_modes=hierarchical_analysis.unified_failure_modes,
+                    verbose=self.verbose,
                 )
 
-                # Try multiple attempts if needed
-                max_attempts = max_retries + 1
-                improved_chat_prompt = None
-                improved_score = None
+                # Generate improved prompt for each failure mode
+                for idx, root_cause in enumerate(
+                    hierarchical_analysis.unified_failure_modes, 1
+                ):
+                    logger.debug(
+                        f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    )
 
-                for attempt in range(1, max_attempts + 1):
-                    # Generate and evaluate improvement
-                    improved_chat_prompt, improved_score = (
-                        self._generate_and_evaluate_improvement(
+                    # Try multiple attempts if needed
+                    max_attempts = max_retries + 1
+                    improved_chat_prompt = None
+                    improved_score = None
+
+                    for attempt in range(1, max_attempts + 1):
+                        # Generate and evaluate improvement
+                        (
+                            improved_chat_prompt,
+                            improved_score,
+                            improved_experiment_result,
+                        ) = self._generate_and_evaluate_improvement(
                             root_cause=root_cause,
                             best_prompt=best_prompt,
                             best_score=best_score,
@@ -615,64 +633,86 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                             attempt=attempt,
                             max_attempts=max_attempts,
                         )
-                    )
 
-                    # Check if we got improvement
-                    if improved_score > best_score:
-                        logger.info(
-                            f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                        # Check if we got improvement
+                        if improved_score > best_score:
+                            logger.info(
+                                f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                            )
+                            break
+
+                        # No improvement - should we retry?
+                        if attempt < max_attempts:
+                            reporting.display_retry_attempt(
+                                attempt=attempt,
+                                max_attempts=max_attempts,
+                                failure_mode_name=root_cause.name,
+                                verbose=self.verbose,
+                            )
+                        else:
+                            logger.debug(
+                                f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            )
+
+                    # Check if final result is an improvement
+                    if (
+                        improved_score is not None
+                        and improved_chat_prompt is not None
+                        and improved_score > best_score
+                    ):
+                        improvement = self._calculate_improvement(
+                            improved_score, best_score
                         )
-                        break
 
-                    # No improvement - should we retry?
-                    if attempt < max_attempts:
-                        reporting.display_retry_attempt(
-                            attempt=attempt,
-                            max_attempts=max_attempts,
-                            failure_mode_name=root_cause.name,
+                        # Display improvement for this iteration
+                        reporting.display_iteration_improvement(
+                            improvement=improvement,
+                            current_score=improved_score,
+                            best_score=best_score,
                             verbose=self.verbose,
                         )
+
+                        # Update best
+                        best_score = improved_score
+                        best_prompt = improved_chat_prompt
+                        best_messages = improved_chat_prompt.get_messages()
+                        experiment_result = improved_experiment_result
+                        logger.info(
+                            f"Updated best prompt after addressing '{root_cause.name}'"
+                        )
                     else:
                         logger.debug(
-                            f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
                        )
 
-                # Check if final result is an improvement
-                if (
-                    improved_score is not None
-                    and improved_chat_prompt is not None
-                    and improved_score > best_score
-                ):
-                    improvement = self._calculate_improvement(
-                        improved_score, best_score
-                    )
-
-                    # Display improvement for this iteration
-                    reporting.display_iteration_improvement(
-                        improvement=improvement,
-                        current_score=improved_score,
-                        best_score=best_score,
-                        verbose=self.verbose,
-                    )
+                # Mark iteration complete
+                improved_since_start = best_score > initial_score
+                iteration_reporter.iteration_complete(
+                    best_score=best_score, improved=improved_since_start
+                )
 
-                    # Update best
-                    best_score = improved_score
-                    best_prompt = improved_chat_prompt
-                    best_messages = improved_chat_prompt.get_messages()
-                    logger.info(
-                        f"Updated best prompt after addressing '{root_cause.name}'"
-                    )
-                else:
-                    logger.debug(
-                        f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
-                    )
+            # Check for convergence after iteration
+            iteration_improvement = self._calculate_improvement(
+                best_score, previous_iteration_score
+            )
 
-            # Mark iteration complete
-            improved_since_start = best_score > initial_score
-            iteration_reporter.iteration_complete(
-                best_score=best_score, improved=improved_since_start
+            logger.info(
+                f"Iteration {iteration} complete. Score: {best_score:.4f}, "
+                f"Improvement: {iteration_improvement:.2%}"
             )
 
+            # Stop if improvement is below convergence threshold
+            if abs(iteration_improvement) < self.convergence_threshold:
+                logger.info(
+                    f"Convergence achieved: improvement ({iteration_improvement:.2%}) "
+                    f"below threshold ({self.convergence_threshold:.2%}). "
+                    f"Stopping after {iteration} iterations."
+                )
+                break
+
+            # Update previous score for next iteration
+            previous_iteration_score = best_score
+
         # Display final optimization result with diff
         reporting.display_optimized_prompt_diff(
             initial_messages=initial_messages,
@@ -682,14 +722,27 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             verbose=self.verbose,
         )
 
+        # Update optimization status to completed
+        try:
+            optimization.update(status="completed")
+            logger.info(f"Optimization {optimization.id} status updated to completed.")
+        except Exception as e:
+            logger.warning(f"Failed to update optimization status: {e}")
+
         # Prepare details for the result
         details = {
+            "model": best_prompt.model or self.model,
+            "temperature": (best_prompt.model_kwargs or {}).get("temperature")
+            or self.model_kwargs.get("temperature"),
            "reasoning_model": self.reasoning_model,
            "num_threads": self.num_threads,
            "max_parallel_batches": self.max_parallel_batches,
            "max_retries": max_retries,
            "n_samples": n_samples,
            "auto_continue": auto_continue,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
+            "iterations_completed": iteration,
        }
 
        # Extract tool prompts if tools exist
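
The two new constructor arguments drive the multi-iteration loop added above: the optimizer now repeats root-cause analysis and prompt improvement until max_iterations is reached, or stops early once the relative improvement between iterations falls below convergence_threshold. A hedged usage sketch; the model argument is an assumption (its parameter is not shown in this diff) and the metric/dataset wiring is omitted:

    from opik_optimizer import HierarchicalReflectiveOptimizer

    optimizer = HierarchicalReflectiveOptimizer(
        model="openai/gpt-4o",       # assumed parameter name, not shown in this diff
        max_iterations=5,            # new in 2.1.3: cap on optimization iterations
        convergence_threshold=0.01,  # new in 2.1.3: stop below 1% relative improvement
    )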

--- a/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py
+++ b/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py
@@ -1,8 +1,8 @@
 import logging
 import asyncio
 from typing import Any
-from tqdm import tqdm
 
+from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn
 from opik.evaluation.evaluation_result import EvaluationResult
 from .types import (
     RootCauseAnalysis,
@@ -11,6 +11,7 @@ from .types import (
 )
 from . import reporting
 from .prompts import BATCH_ANALYSIS_PROMPT, SYNTHESIS_PROMPT
+from ..reporting_utils import get_console
 
 logger = logging.getLogger(__name__)
 
@@ -285,13 +286,11 @@ Scores:
 
         semaphore = asyncio.Semaphore(self.max_parallel_batches)
 
-        # Create progress bar for batch processing
-        pbar = tqdm(
-            total=len(batch_tasks), desc="Processing batches", unit="batch", leave=False
-        )
+        # Create progress bar for batch processing using Rich
+        console = get_console()
 
         async def run_with_semaphore(
-            batch_num: int, task: Any
+            batch_num: int, task: Any, progress: Progress | None, task_id: Any | None
         ) -> tuple[int, BatchAnalysis]:
             async with semaphore:
                 try:
@@ -300,19 +299,44 @@ Scores:
                         f"Completed batch {batch_num}: "
                         f"identified {len(result.failure_modes)} failure modes"
                     )
-                    pbar.update(1)  # Update progress bar
+                    if progress and task_id is not None:
+                        progress.update(task_id, advance=1)  # Update progress bar
                     return batch_num, result
                 except Exception as exc:
                     logger.error(f"Batch {batch_num} failed: {exc}")
-                    pbar.update(1)  # Update progress bar even on error
+                    if progress and task_id is not None:
+                        progress.update(
+                            task_id, advance=1
+                        )  # Update progress bar even on error
                     raise
 
-        # Run all tasks with semaphore control
-        results = await asyncio.gather(
-            *[run_with_semaphore(num, task) for num, task in batch_tasks]
-        )
-
-        pbar.close()  # Close progress bar
+        # Run all tasks with semaphore control and rich progress bar
+        if self.verbose >= 1:
+            with Progress(
+                TextColumn("│ "),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console,
+                transient=True,
+            ) as progress:
+                task_id = progress.add_task(
+                    "Processing batches", total=len(batch_tasks)
+                )
+                results = await asyncio.gather(
+                    *[
+                        run_with_semaphore(num, task, progress, task_id)
+                        for num, task in batch_tasks
+                    ]
+                )
+        else:
+            # No progress bar in non-verbose mode
+            results = await asyncio.gather(
+                *[
+                    run_with_semaphore(num, task, None, None)
+                    for num, task in batch_tasks
+                ]
+            )
 
         # Sort by batch number to maintain order
         batch_analyses = [result for _, result in sorted(results)]
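
The tqdm bar is replaced by a rich Progress whose column layout matches the "│ " gutter used by the reporters. A standalone sketch of the same pattern using only rich's public API (the work loop is a stand-in):

    import time
    from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn

    with Progress(
        TextColumn("│ "),  # gutter prefix, as in the diff above
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        transient=True,    # the bar is cleared once the work finishes
    ) as progress:
        task_id = progress.add_task("Processing batches", total=10)
        for _ in range(10):
            time.sleep(0.1)                      # stand-in for one batch of work
            progress.update(task_id, advance=1)  # same call the analyzer makes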

--- a/opik_optimizer/hierarchical_reflective_optimizer/reporting.py
+++ b/opik_optimizer/hierarchical_reflective_optimizer/reporting.py
@@ -29,9 +29,11 @@ def display_retry_attempt(
     """Display retry attempt information."""
     if verbose >= 1:
         console.print(
-            Text(
-                f"│ Retry attempt {attempt + 1}/{max_attempts} for failure mode '{failure_mode_name}' (no improvement observed)",
-                style="yellow",
+            Text("│ ").append(
+                Text(
+                    f"Retry attempt {attempt + 1}/{max_attempts} for failure mode '{failure_mode_name}' (no improvement observed)",
+                    style="yellow",
+                )
             )
         )
 
@@ -132,14 +134,16 @@ def display_evaluation(
         def set_score(self, s: float) -> None:
             if verbose >= 1:
                 # Adjust score indentation based on indent style
-                score_indent = " " if indent == "> " else "│ "
+                score_indent = "" if indent == "> " else "│ "
 
                 if baseline_score is None:
                     # This is the baseline evaluation
                     console.print(
-                        Text(
-                            f"\r{score_indent}Baseline score was: {s:.4f}.",
-                            style="green",
+                        Text(score_indent).append(
+                            Text(
+                                f"Baseline score was: {s:.4f}.",
+                                style="green",
+                            )
                         )
                     )
                     console.print(Text("│"))
@@ -152,9 +156,11 @@ def display_evaluation(
                         else 0
                     )
                     console.print(
-                        Text(
-                            f"\r{score_indent}Score for updated prompt: {s:.4f} (+{improvement_pct:.1f}%)",
-                            style="green bold",
+                        Text(score_indent).append(
+                            Text(
+                                f"Score for updated prompt: {s:.4f} (+{improvement_pct:.1f}%)",
+                                style="green bold",
+                            )
                         )
                     )
                 elif s < baseline_score:
@@ -164,23 +170,27 @@ def display_evaluation(
                         else 0
                     )
                     console.print(
-                        Text(
-                            f"\r{score_indent}Score for updated prompt: {s:.4f} (-{decline_pct:.1f}%)",
-                            style="red",
+                        Text(score_indent).append(
+                            Text(
+                                f"Score for updated prompt: {s:.4f} (-{decline_pct:.1f}%)",
+                                style="red",
+                            )
                         )
                     )
                 else:
                     console.print(
-                        Text(
-                            f"\r{score_indent}Score for updated prompt: {s:.4f} (no change)",
-                            style="yellow",
+                        Text(score_indent).append(
+                            Text(
+                                f"Score for updated prompt: {s:.4f} (no change)",
+                                style="yellow",
+                            )
                         )
                     )
                 console.print(Text("│"))
 
     # Use our log suppression context manager and yield the reporter
     # Adjust progress bar indentation based on indent style
-    progress_indent = " Evaluation" if indent == "> " else "│ Evaluation"
+    progress_indent = "Evaluation" if indent == "> " else "│ Evaluation"
     with suppress_opik_logs():
         with convert_tqdm_to_rich(progress_indent, verbose=verbose):
             try:
@@ -306,25 +316,31 @@ def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
 def display_optimization_iteration(iteration: int, verbose: int = 1) -> Iterator[Any]:
     """Context manager to display progress for a single optimization iteration."""
     if verbose >= 1:
-        console.print()
         console.print(Text("│"))
-        console.print(Text(f"│ Iteration {iteration}", style="bold cyan"))
+        console.print(Text("│"))
+        console.print(
+            Text("│ ").append(Text(f"Iteration {iteration}", style="bold cyan"))
+        )
 
     class Reporter:
         def iteration_complete(self, best_score: float, improved: bool) -> None:
             if verbose >= 1:
                 if improved:
                     console.print(
-                        Text(
-                            f"│ Iteration {iteration} complete - New best score: {best_score:.4f}",
-                            style="green",
+                        Text("│ ").append(
+                            Text(
+                                f"Iteration {iteration} complete - New best score: {best_score:.4f}",
+                                style="green",
+                            )
                         )
                     )
                 else:
                     console.print(
-                        Text(
-                            f"│ Iteration {iteration} complete - No improvement (best: {best_score:.4f})",
-                            style="yellow",
+                        Text("│ ").append(
+                            Text(
+                                f"Iteration {iteration} complete - No improvement (best: {best_score:.4f})",
+                                style="yellow",
+                            )
                         )
                     )
                 console.print(Text("│"))
@@ -341,16 +357,20 @@ def display_root_cause_analysis(verbose: int = 1) -> Iterator[Any]:
     if verbose >= 1:
         console.print(Text("│ "))
         console.print(
-            Text("│ Analyzing root cause of failed evaluation items", style="cyan")
+            Text("│ ").append(
+                Text("Analyzing root cause of failed evaluation items", style="cyan")
+            )
         )
 
     class Reporter:
         def set_completed(self, total_test_cases: int, num_batches: int) -> None:
             if verbose >= 1:
                 console.print(
-                    Text(
-                        f"│ Analyzed {total_test_cases} test cases across {num_batches} batches",
-                        style="green",
+                    Text("│ ").append(
+                        Text(
+                            f"Analyzed {total_test_cases} test cases across {num_batches} batches",
+                            style="green",
+                        )
                    )
                )
                console.print(Text("│ "))
@@ -367,7 +387,9 @@ def display_root_cause_analysis(verbose: int = 1) -> Iterator[Any]:
 def display_batch_synthesis(num_batches: int, verbose: int = 1) -> Iterator[Any]:
     """Context manager to display message during batch synthesis."""
     if verbose >= 1:
-        console.print(Text("│ Synthesizing failure modes", style="cyan"))
+        console.print(
+            Text("│ ").append(Text("Synthesizing failure modes", style="cyan"))
+        )
 
     class Reporter:
         def set_completed(self, num_unified_modes: int) -> None:
@@ -406,10 +428,13 @@ def display_hierarchical_synthesis(
         console.print(panel)
 
     rendered_panel = capture.get()
-    for line in rendered_panel.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))
 
-    console.print()
+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_panel.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
+    console.print(Text("│"))
 
 
 def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
@@ -433,10 +458,13 @@ def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
         console.print(header_panel)
 
     rendered_header = capture.get()
-    for line in rendered_header.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))
 
-    console.print()
+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_header.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
+    console.print(Text("│"))
 
     for idx, failure_mode in enumerate(failure_modes, 1):
         # Create content for this failure mode
@@ -460,8 +488,14 @@ def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
            console.print(panel)
 
        rendered_panel = capture.get()
-        for line in rendered_panel.splitlines():
-            console.print(Text("") + Text.from_ansi(line))
+
+        # Prefix each line with '', preserving ANSI styles
+        prefixed_output = "\n".join(
+            f"│ {line}" for line in rendered_panel.splitlines()
+        )
+
+        # Print the prefixed output (will include colors)
+        console.print(prefixed_output, highlight=False)
 
        if idx < len(failure_modes):
            console.print("│")
@@ -473,9 +507,13 @@ def display_prompt_improvement(
 ) -> Iterator[Any]:
     """Context manager to display progress while generating improved prompt."""
     if verbose >= 1:
-        console.print()
+        console.print(Text("│"))
         console.print(Text("│ "))
-        console.print(Text(f"│ Addressing: {failure_mode_name}", style="bold cyan"))
+        console.print(
+            Text("│ ").append(
+                Text(f"Addressing: {failure_mode_name}", style="bold cyan")
+            )
+        )
 
     class Reporter:
         def set_reasoning(self, reasoning: str) -> None:
@@ -498,9 +536,14 @@ def display_prompt_improvement(
                console.print(panel)
 
            rendered_panel = capture.get()
-            for line in rendered_panel.splitlines():
-                console.print(Text("│ ") + Text.from_ansi(line))
 
+            # Prefix each line with '│ ', preserving ANSI styles
+            prefixed_output = "\n".join(
+                f"│ {line}" for line in rendered_panel.splitlines()
+            )
+
+            # Print the prefixed output (will include colors)
+            console.print(prefixed_output, highlight=False)
            console.print(Text("│ "))
 
    try:
@@ -520,9 +563,11 @@ def display_improvement_reasoning(
     if verbose < 1:
         return
 
-    console.print()
+    console.print(Text("│"))
     console.print(Text("│ "))
-    console.print(Text(f"│ Addressing: {failure_mode_name}", style="bold cyan"))
+    console.print(
+        Text("│ ").append(Text(f"Addressing: {failure_mode_name}", style="bold cyan"))
+    )
 
     reasoning_content = Text()
     reasoning_content.append("Improvement Strategy:\n", style="cyan")
@@ -542,9 +587,12 @@ def display_improvement_reasoning(
        console.print(panel)
 
    rendered_panel = capture.get()
-    for line in rendered_panel.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))
 
+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_panel.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
    console.print(Text("│ "))
 
@@ -557,16 +605,20 @@ def display_iteration_improvement(
 
     if improvement > 0:
         console.print(
-            Text(
-                f"│ ✓ Improvement: {improvement:.2%} (from {best_score:.4f} to {current_score:.4f})",
-                style="green bold",
+            Text("│ ").append(
+                Text(
+                    f"✓ Improvement: {improvement:.2%} (from {best_score:.4f} to {current_score:.4f})",
+                    style="green bold",
+                )
             )
         )
     else:
         console.print(
-            Text(
-                f"│ ✗ No improvement: {improvement:.2%} (score: {current_score:.4f}, best: {best_score:.4f})",
-                style="yellow",
+            Text("│ ").append(
+                Text(
+                    f"✗ No improvement: {improvement:.2%} (score: {current_score:.4f}, best: {best_score:.4f})",
+                    style="yellow",
+                )
             )
         )
 
@@ -584,27 +636,31 @@ def display_optimized_prompt_diff(
     if verbose < 1:
         return
 
-    console.print()
     console.print(Text("│"))
-    console.print(Text("│ > Optimization Results", style="bold green"))
+    console.print(Text("│"))
+    console.print(Text("│ ").append(Text("> Optimization Results", style="bold green")))
     console.print(Text("│"))
 
     # Show score improvement
     if best_score > initial_score:
         perc_change = (best_score - initial_score) / initial_score
         console.print(
-            Text(
-                f"│ Prompt improved from {initial_score:.4f} to {best_score:.4f} ({perc_change:.2%})",
-                style="green",
+            Text("│ ").append(
+                Text(
+                    f"Prompt improved from {initial_score:.4f} to {best_score:.4f} ({perc_change:.2%})",
+                    style="green",
+                )
            )
        )
    else:
        console.print(
-            Text(f"│ No improvement found (score: {best_score:.4f})", style="yellow")
+            Text("│ ").append(
+                Text(f"No improvement found (score: {best_score:.4f})", style="yellow")
+            )
        )
 
    console.print(Text("│"))
-    console.print(Text("│ Prompt Changes:", style="cyan"))
+    console.print(Text("│ ").append(Text("Prompt Changes:", style="cyan")))
    console.print(Text("│"))
 
    # Compare each message
@@ -626,24 +682,30 @@ def display_optimized_prompt_diff(
 
         # Handle added messages
         if not initial_msg:
-            console.print(Text(f"│ {role}: (added)", style="green bold"))
+            console.print(
+                Text("│ ").append(Text(f"{role}: (added)", style="green bold"))
+            )
             for line in optimized_content.splitlines():
-                console.print(Text(f"│ +{line}", style="green"))
+                console.print(Text("│ ").append(Text(f"+{line}", style="green")))
             console.print(Text("│"))
             continue
 
         # Handle removed messages
         if not optimized_msg:
-            console.print(Text(f"│ {role}: (removed)", style="red bold"))
+            console.print(
+                Text("│ ").append(Text(f"{role}: (removed)", style="red bold"))
+            )
             for line in initial_content.splitlines():
-                console.print(Text(f"│ -{line}", style="red"))
+                console.print(Text("│ ").append(Text(f"-{line}", style="red")))
             console.print(Text("│"))
             continue
 
         # Check if there are changes
         if initial_content == optimized_content:
             # No changes in this message
-            console.print(Text(f"│ {role}: (unchanged)", style="dim"))
+            console.print(
+                Text("│ ").append(Text(f"{role}: (unchanged)", style="dim"))
+            )
             continue
 
         # Generate unified diff
@@ -660,7 +722,7 @@ def display_optimized_prompt_diff(
             continue
 
         # Display message header
-        console.print(Text(f"│ {role}:", style="bold cyan"))
+        console.print(Text("│ ").append(Text(f"{role}:", style="bold cyan")))
 
         # Create diff content
         diff_content = Text()
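
The recurring change in this file swaps f-string prefixes such as f"│ ..." for Text("│ ").append(...), which keeps the gutter character unstyled while the message keeps its own style; rich's Text.append accepts another Text and returns the combined instance, so the expression can be passed straight to console.print. A minimal sketch:

    from rich.console import Console
    from rich.text import Text

    console = Console()
    # The "│ " gutter stays unstyled; only the message renders in yellow.
    console.print(Text("│ ").append(Text("Retry attempt 2/3", style="yellow")))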

--- /dev/null
+++ b/opik_optimizer/multi_metric_objective.py
@@ -0,0 +1,33 @@
+from typing import Any
+from collections.abc import Callable
+from opik.evaluation.metrics.score_result import ScoreResult
+
+
+class MultiMetricObjective:
+    def __init__(
+        self,
+        metrics: list[Callable[[dict[str, Any], str], ScoreResult]],
+        weights: list[float] | None = None,
+        name: str = "multi_metric_objective",
+    ):
+        self.metrics = metrics
+        self.weights = weights if weights else [1 / len(metrics)] * len(metrics)
+        self.__name__ = name
+
+    def __call__(self, dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
+        raw_score_results = []
+        weighted_score_value = 0
+
+        for metric, weight in zip(self.metrics, self.weights):
+            score_result = metric(dataset_item, llm_output)
+            raw_score_results.append(score_result)
+            weighted_score_value += score_result.value * weight
+
+        aggregated_score_result = ScoreResult(
+            name=self.__name__,
+            value=weighted_score_value,
+            metadata={"raw_score_results": raw_score_results},
+        )
+
+        # Important: we return the aggregated score result first
+        return aggregated_score_result
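
MultiMetricObjective folds several per-item metrics into one weighted scalar while keeping the raw results in metadata. A usage sketch; the two metric functions are illustrative stand-ins, not part of the package:

    from opik.evaluation.metrics.score_result import ScoreResult
    from opik_optimizer import MultiMetricObjective

    # Hypothetical metrics; any callable of shape
    # (dataset_item: dict, llm_output: str) -> ScoreResult works.
    def exact_match(dataset_item, llm_output):
        return ScoreResult(name="exact_match", value=float(llm_output == dataset_item["answer"]))

    def brevity(dataset_item, llm_output):
        return ScoreResult(name="brevity", value=1.0 if len(llm_output) < 200 else 0.0)

    objective = MultiMetricObjective(
        metrics=[exact_match, brevity],
        weights=[0.7, 0.3],  # omitted weights default to equal shares
    )
    combined = objective(dataset_item={"answer": "42"}, llm_output="42")
    # combined.value == 0.7 * 1.0 + 0.3 * 1.0 == 1.0
    # combined.metadata["raw_score_results"] holds the two per-metric ScoreResults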

--- a/opik_optimizer/task_evaluator.py
+++ b/opik_optimizer/task_evaluator.py
@@ -5,6 +5,7 @@ from collections.abc import Callable
 import opik
 from opik.evaluation import evaluator as opik_evaluator
 from opik.evaluation.metrics import base_metric, score_result
+from . import multi_metric_objective
 
 logger = logging.getLogger(__name__)
 
@@ -14,9 +15,20 @@ def _create_metric_class(metric: Callable) -> base_metric.BaseMetric:
         def __init__(self) -> None:
             self.name = metric.__name__
 
-        def score(self, llm_output: str, **kwargs: Any) -> score_result.ScoreResult:
+        def score(
+            self, llm_output: str, **kwargs: Any
+        ) -> score_result.ScoreResult | list[score_result.ScoreResult]:
             try:
                 metric_val = metric(dataset_item=kwargs, llm_output=llm_output)
+
+                if isinstance(metric, multi_metric_objective.MultiMetricObjective):
+                    if (
+                        hasattr(metric_val, "metadata")
+                        and "raw_score_results" in metric_val.metadata
+                    ):
+                        return [metric_val, *metric_val.metadata["raw_score_results"]]
+                    else:
+                        return [metric_val]
                 if isinstance(metric_val, score_result.ScoreResult):
                     return score_result.ScoreResult(
                         name=self.name,
@@ -107,15 +119,20 @@ def evaluate(
     if not result.test_results:
         return 0.0
 
-    # We may allow score aggregation customization.
-    score_results: list[score_result.ScoreResult] = [
-        test_result.score_results[0] for test_result in result.test_results
-    ]
-    if not score_results:
+    # Filter score results to only include the objective metric
+    objective_metric_name = metric.__name__
+    objective_score_results: list[score_result.ScoreResult] = []
+    for test_result in result.test_results:
+        for score_result_ in test_result.score_results:
+            if score_result_.name == objective_metric_name:
+                objective_score_results.append(score_result_)
+                break
+
+    if not objective_score_results:
         return 0.0
 
-    avg_score = sum([score_result_.value for score_result_ in score_results]) / len(
-        score_results
-    )
+    avg_score = sum(
+        [score_result_.value for score_result_ in objective_score_results]
+    ) / len(objective_score_results)
 
     return avg_score
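
With a MultiMetricObjective, each test result now carries the aggregated score followed by the raw per-metric scores, so evaluate() filters by the objective's name before averaging. A small sketch of that selection in isolation, using plain data rather than a real evaluation result:

    from opik.evaluation.metrics.score_result import ScoreResult

    # One test result's scores: the aggregate first, then the raw metrics.
    scores = [
        ScoreResult(name="multi_metric_objective", value=0.91),
        ScoreResult(name="exact_match", value=1.0),
        ScoreResult(name="brevity", value=0.7),
    ]
    # Keep only the objective metric, mirroring the filter in evaluate().
    objective_scores = [s for s in scores if s.name == "multi_metric_objective"]
    assert objective_scores[0].value == 0.91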

--- a/opik_optimizer-2.1.2.dist-info/METADATA
+++ b/opik_optimizer-2.1.3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opik_optimizer
-Version: 2.1.2
+Version: 2.1.3
 Summary: Agent optimization with Opik
 Home-page: https://github.com/comet-ml/opik
 Author: Comet ML

--- a/opik_optimizer-2.1.2.dist-info/RECORD
+++ b/opik_optimizer-2.1.3.dist-info/RECORD
@@ -1,13 +1,14 @@
-opik_optimizer/__init__.py,sha256=VwryQ5bSOmJSl4CiacCIv_UF_In8Zho54fQ3FUR8pyk,1573
+opik_optimizer/__init__.py,sha256=lA9cjEsNxrJwYJ68vCjeNZgrcxO_rNJaAHsdMwaq364,1658
 opik_optimizer/_throttle.py,sha256=1JXIhYlo0IaqCgwmNB0Hnh9CYhYPkwRFdVGIcE7pVNg,1362
 opik_optimizer/base_optimizer.py,sha256=XryBkUTs4FQmHcBtVm63EJIKWrTvwqduUZ6ArHzYQko,21520
 opik_optimizer/cache_config.py,sha256=Xd3NdUsL7bLQWoNe3pESqH4nHucU1iNTSGp-RqbwDog,599
 opik_optimizer/logging_config.py,sha256=TmxX0C1P20amxoXuiNQvlENOjdSNfWwvL8jFy206VWM,3837
+opik_optimizer/multi_metric_objective.py,sha256=y4jqirnhkfhB7SWonI4ldYg5fWG4JGfAxqu7ylRD1J4,1178
 opik_optimizer/optimizable_agent.py,sha256=R0_BdwdHyZGWTw3oSvTg8FULDOYM8XaTiPNR3qV8DkQ,6344
 opik_optimizer/optimization_result.py,sha256=sG-Yr-hOaH9zx_I5S6_W3v6j8nPUhwYdS333jVM4Gus,17218
 opik_optimizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opik_optimizer/reporting_utils.py,sha256=dcECFmzZ_J-DKoukMDEE_fm7X8sdQyl_ijTddvQtepE,8287
-opik_optimizer/task_evaluator.py,sha256=1hILYwJLtn7XpPX96JjubnlMasmudVTHMVK3pmd22bE,4312
+opik_optimizer/task_evaluator.py,sha256=7N254DU0UkWJ5saQ5AmYEsHHSrychAJtedmmjNsCOnI,5081
 opik_optimizer/data/context7_eval.jsonl,sha256=vPR3XRfI0UbZ1hgUGaOdpraFT99RDLU1YWuPFLLQz40,1757
 opik_optimizer/data/hotpot-500.json,sha256=YXxCtuvYvxSu5u0y4559a6b1qwgAYsWzT_SUKv_21ew,76862
 opik_optimizer/datasets/__init__.py,sha256=V4LVDOaRjwzaYvhdQ3V6CAwFaeKnxyTV1lp_ES9Z31E,691
@@ -46,10 +47,10 @@ opik_optimizer/gepa_optimizer/adapter.py,sha256=KzPa4koq7aJhALMAOKPxAO4yWuEy_YbW
 opik_optimizer/gepa_optimizer/gepa_optimizer.py,sha256=HBjikhce3K4VIaiIXs7eSagmRyFPdY8h4seoW9F3nQE,26481
 opik_optimizer/gepa_optimizer/reporting.py,sha256=F0cxYSjRuFAszgi3rgqwH1A-KH26kZOLtENP7x1xrQs,5154
 opik_optimizer/hierarchical_reflective_optimizer/__init__.py,sha256=9qM3kvfAaFy-Y6Tg19MXHJxpnF5DJQQwzr6oNsxaRBM,133
-opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py,sha256=Fs83ztOuPS8mkFvJAVmYok15DaXTk4Jqpoa9ImRl2t4,27256
-opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py,sha256=GSIXUBxoS9LFnCXopS6B6wLSpmCYXA8Cv6ytELgEBoc,12709
+opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py,sha256=j9Gr5z9j-evFhkbxkbiZ7RXt6Q89LshYYR4ac_UxwX0,30235
+opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py,sha256=0D5wgx04jZvTJ0Yjqm0jtQvkjrGBB73qgcsSwLBpnv0,13814
 opik_optimizer/hierarchical_reflective_optimizer/prompts.py,sha256=XcOEI9eeEbTgKFsFiRWxvHdaByQkiN02bH2gTl3HX-Y,3853
-opik_optimizer/hierarchical_reflective_optimizer/reporting.py,sha256=LpHv_WBZCg2a0RhZaGwUmCch_-Dfk_rpuMxTckJMWTU,23234
+opik_optimizer/hierarchical_reflective_optimizer/reporting.py,sha256=d1jQ3uZs0fTI2DeumvGmkxuMHtwA0wt_ROtl4E6UdIM,25461
 opik_optimizer/hierarchical_reflective_optimizer/types.py,sha256=bS-JAheX2FpJ4XAxoZi5PfjloG8L-B1LGQA1iLXZhW4,1031
 opik_optimizer/mcp_utils/__init__.py,sha256=BsWQT8nAa6JV6zcOD__OvPMepUS2IpJD4J2rnAXhpuU,710
 opik_optimizer/mcp_utils/mcp.py,sha256=UylgpTJsybszS433_kuTAgKH-PPde-VHjHVelMardFs,18466
@@ -79,8 +80,8 @@ opik_optimizer/utils/colbert.py,sha256=qSrzKUUGw7P92mLy4Ofug5pBGeTsHBLMJXlXSJSfK
 opik_optimizer/utils/core.py,sha256=5GT1vp6fW8ICO42LHMX14BjR-xEb6afAKjM7b1Evx5M,15298
 opik_optimizer/utils/dataset_utils.py,sha256=dqRUGOekjeNWL0J15R8xFwLyKJDJynJXzVyQmt8rhHA,1464
 opik_optimizer/utils/prompt_segments.py,sha256=1zUITSccJ82Njac1rmANzim4WWM6rVac61mfluS7lFE,5931
-opik_optimizer-2.1.2.dist-info/licenses/LICENSE,sha256=V-0VHJOBdcA_teT8VymvsBUQ1-CZU6yJRmMEjec_8tA,11372
-opik_optimizer-2.1.2.dist-info/METADATA,sha256=V2WXOFaF2tc7-xAHcdndLn8pcUp1JHxHEVONa0K8j78,12829
-opik_optimizer-2.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-opik_optimizer-2.1.2.dist-info/top_level.txt,sha256=ondOlpq6_yFckqpxoAHSfzZS2N-JfgmA-QQhOJfz7m0,15
-opik_optimizer-2.1.2.dist-info/RECORD,,
+opik_optimizer-2.1.3.dist-info/licenses/LICENSE,sha256=V-0VHJOBdcA_teT8VymvsBUQ1-CZU6yJRmMEjec_8tA,11372
+opik_optimizer-2.1.3.dist-info/METADATA,sha256=omnNZ2--FZxU-ex3SEKYF4ZaKRDTcQfkPoc2kxKLB7U,12829
+opik_optimizer-2.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+opik_optimizer-2.1.3.dist-info/top_level.txt,sha256=ondOlpq6_yFckqpxoAHSfzZS2N-JfgmA-QQhOJfz7m0,15
+opik_optimizer-2.1.3.dist-info/RECORD,,