opik-optimizer 2.1.1__py3-none-any.whl → 2.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/base_optimizer.py +2 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +3 -3
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +146 -93
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +38 -14
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +127 -65
- opik_optimizer/multi_metric_objective.py +33 -0
- opik_optimizer/task_evaluator.py +26 -9
- {opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/METADATA +1 -1
- {opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/RECORD +13 -12
- {opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/top_level.txt +0 -0
opik_optimizer/__init__.py
CHANGED
@@ -19,6 +19,7 @@ from .mipro_optimizer import MiproOptimizer
 from .hierarchical_reflective_optimizer import HierarchicalReflectiveOptimizer
 from .optimization_config.configs import TaskConfig
 from .optimization_result import OptimizationResult
+from .multi_metric_objective import MultiMetricObjective
 from .parameter_optimizer import (
     ParameterOptimizer,
     ParameterSearchSpace,
@@ -48,6 +49,7 @@ __all__ = [
     "setup_logging",
     "datasets",
     "TaskConfig",
+    "MultiMetricObjective",
     "ParameterSearchSpace",
     "ParameterSpec",
     "ParameterType",
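With this re-export, the multi-metric objective added in 2.1.3 is importable straight from the package root. A one-line sketch, assuming opik-optimizer 2.1.3 is installed:

    from opik_optimizer import MultiMetricObjective  # new top-level export in 2.1.3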
opik_optimizer/base_optimizer.py
CHANGED
@@ -280,6 +280,7 @@ class BaseOptimizer(ABC):
         agent_config["project_name"] = getattr(prompt, "project_name", None)
         agent_config["model"] = getattr(prompt, "model", None) or self.model
         agent_config["tools"] = self._serialize_tools(prompt)
+        agent_config["optimizer"] = self.__class__.__name__
         return self._drop_none(agent_config)

     def get_optimizer_metadata(self) -> dict[str, Any]:
@@ -341,6 +342,7 @@ class BaseOptimizer(ABC):
             "metric": getattr(metric, "__name__", str(metric)),
             "dataset": getattr(dataset, "name", None),
             "dataset_id": dataset_id,
+            "optimizer": self.__class__.__name__,
             "optimizer_metadata": self._build_optimizer_metadata(),
             "tool_signatures": self._summarize_tool_signatures(prompt),
             "configuration": {
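Both additions record `self.__class__.__name__`, so every subclass reports its own name with no per-optimizer code; the GEPA changes below use the same trick. A minimal sketch of the mechanism (`_tag` and the demo classes are illustrative, not part of the package):

    class Base:
        def _tag(self) -> str:
            # Resolves to the runtime subclass, not the defining class
            return self.__class__.__name__

    class DemoOptimizer(Base):
        pass

    print(DemoOptimizer()._tag())  # -> "DemoOptimizer"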
opik_optimizer/gepa_optimizer/gepa_optimizer.py
CHANGED
@@ -253,7 +253,7 @@ class GepaOptimizer(BaseOptimizer):
         opt_id = None

         gepa_reporting.display_header(
-            algorithm=
+            algorithm=self.__class__.__name__,
             optimization_id=opt_id,
             dataset_id=getattr(dataset, "id", None),
             verbose=self.verbose,
@@ -264,7 +264,7 @@ class GepaOptimizer(BaseOptimizer):
         _display_config(
             messages=prompt.get_messages(),
             optimizer_config={
-                "optimizer":
+                "optimizer": self.__class__.__name__,
                 "model": self.model,
                 "reflection_model": self.reflection_model,
                 "max_metric_calls": max_metric_calls,
@@ -422,7 +422,7 @@ class GepaOptimizer(BaseOptimizer):
                     "system_prompt": candidate_prompt,
                     "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
                     "opik_score": score,
-                    "source":
+                    "source": self.__class__.__name__,
                 }
             )
             history.append(
opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py
CHANGED
@@ -1,4 +1,3 @@
-from opik.environment import get_tqdm_for_current_environment
 import os
 import logging

@@ -29,8 +28,6 @@ from .types import (
 )
 from .prompts import IMPROVE_PROMPT_TEMPLATE

-tqdm = get_tqdm_for_current_environment()
-
 # Using disk cache for LLM calls
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
@@ -61,10 +58,14 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         max_parallel_batches: Maximum number of batches to process concurrently during
             hierarchical root cause analysis (default: 5)
         batch_size: Number of test cases per batch for root cause analysis (default: 25)
+        max_iterations: Maximum number of optimization iterations (default: 5)
+        convergence_threshold: Stop if relative improvement is below this threshold (default: 0.01)
         **model_kwargs: Additional arguments passed to the LLM model
     """

     DEFAULT_ROUNDS = 10
+    DEFAULT_MAX_ITERATIONS = 5
+    DEFAULT_CONVERGENCE_THRESHOLD = 0.01  # Stop if improvement is less than 1%

     def __init__(
         self,
@@ -74,6 +75,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         seed: int = 42,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
+        max_iterations: int = DEFAULT_MAX_ITERATIONS,
+        convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
         **model_kwargs: Any,
     ):
         super().__init__(
@@ -83,6 +86,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         self.num_threads = num_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
+        self.max_iterations = max_iterations
+        self.convergence_threshold = convergence_threshold

         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
@@ -247,6 +252,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             "reasoning_model": self.reasoning_model,
             "num_threads": self.num_threads,
             "max_parallel_batches": self.max_parallel_batches,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
             "seed": self.seed,
             "verbose": self.verbose,
         }
@@ -417,7 +424,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None,
         attempt: int,
         max_attempts: int,
-    ) -> tuple[chat_prompt.ChatPrompt, float]:
+    ) -> tuple[chat_prompt.ChatPrompt, float, EvaluationResult]:
         """
         Generate and evaluate a single improvement attempt for a failure mode.

@@ -434,7 +441,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             max_attempts: Total number of attempts

         Returns:
-            Tuple of (improved_prompt, improved_score)
+            Tuple of (improved_prompt, improved_score, improved_experiment_result)
         """
         # Generate improvement with progress indication
         with reporting.display_prompt_improvement(
@@ -485,7 +492,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         ) / len(improved_experiment_result.test_results)
         improved_reporter.set_score(improved_score)

-        return improved_chat_prompt, improved_score
+        return improved_chat_prompt, improved_score, improved_experiment_result

     def optimize_prompt(
         self,
@@ -528,6 +535,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 "n_samples": n_samples,
                 "auto_continue": auto_continue,
                 "max_retries": max_retries,
+                "max_iterations": self.max_iterations,
+                "convergence_threshold": self.convergence_threshold,
             },
             verbose=self.verbose,
             tools=getattr(prompt, "tools", None),
@@ -557,53 +566,62 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             prompt.get_messages()
         )  # Store copy of initial messages for diff

-        …
-            synthesis_notes=hierarchical_analysis.synthesis_notes,
-            verbose=self.verbose,
-        )
+        # Multi-iteration optimization loop
+        iteration = 0
+        previous_iteration_score = initial_score
+
+        for iteration in range(1, self.max_iterations + 1):
+            logger.info(f"Starting iteration {iteration}/{self.max_iterations}")
+
+            with reporting.display_optimization_iteration(
+                iteration=iteration, verbose=self.verbose
+            ) as iteration_reporter:
+                # Perform hierarchical root cause analysis
+                with reporting.display_root_cause_analysis(
+                    verbose=self.verbose
+                ) as analysis_reporter:
+                    hierarchical_analysis = self._hierarchical_root_cause_analysis(
+                        experiment_result
+                    )
+                    analysis_reporter.set_completed(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                    )

-        …
+                # Display hierarchical synthesis and failure modes
+                if self.verbose:
+                    reporting.display_hierarchical_synthesis(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                        synthesis_notes=hierarchical_analysis.synthesis_notes,
+                        verbose=self.verbose,
+                    )

-        …
-        ):
-            logger.debug(
-                f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                reporting.display_failure_modes(
+                    failure_modes=hierarchical_analysis.unified_failure_modes,
+                    verbose=self.verbose,
                 )

-            …
+                # Generate improved prompt for each failure mode
+                for idx, root_cause in enumerate(
+                    hierarchical_analysis.unified_failure_modes, 1
+                ):
+                    logger.debug(
+                        f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    )

-            …
-            improved_chat_prompt
+                    # Try multiple attempts if needed
+                    max_attempts = max_retries + 1
+                    improved_chat_prompt = None
+                    improved_score = None
+
+                    for attempt in range(1, max_attempts + 1):
+                        # Generate and evaluate improvement
+                        (
+                            improved_chat_prompt,
+                            improved_score,
+                            improved_experiment_result,
+                        ) = self._generate_and_evaluate_improvement(
                             root_cause=root_cause,
                             best_prompt=best_prompt,
                             best_score=best_score,
@@ -615,64 +633,86 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                             attempt=attempt,
                             max_attempts=max_attempts,
                         )
-        )

-        …
+                        # Check if we got improvement
+                        if improved_score > best_score:
+                            logger.info(
+                                f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                            )
+                            break
+
+                        # No improvement - should we retry?
+                        if attempt < max_attempts:
+                            reporting.display_retry_attempt(
+                                attempt=attempt,
+                                max_attempts=max_attempts,
+                                failure_mode_name=root_cause.name,
+                                verbose=self.verbose,
+                            )
+                        else:
+                            logger.debug(
+                                f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            )
+
+                    # Check if final result is an improvement
+                    if (
+                        improved_score is not None
+                        and improved_chat_prompt is not None
+                        and improved_score > best_score
+                    ):
+                        improvement = self._calculate_improvement(
+                            improved_score, best_score
                         )
-            break

-            …
-            failure_mode_name=root_cause.name,
+                        # Display improvement for this iteration
+                        reporting.display_iteration_improvement(
+                            improvement=improvement,
+                            current_score=improved_score,
+                            best_score=best_score,
                             verbose=self.verbose,
                         )
+
+                        # Update best
+                        best_score = improved_score
+                        best_prompt = improved_chat_prompt
+                        best_messages = improved_chat_prompt.get_messages()
+                        experiment_result = improved_experiment_result
+                        logger.info(
+                            f"Updated best prompt after addressing '{root_cause.name}'"
+                        )
                     else:
                         logger.debug(
-                            f"
+                            f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
                         )

-        …
-        ):
-            improvement = self._calculate_improvement(
-                improved_score, best_score
-            )
-
-            # Display improvement for this iteration
-            reporting.display_iteration_improvement(
-                improvement=improvement,
-                current_score=improved_score,
-                best_score=best_score,
-                verbose=self.verbose,
-            )
+                # Mark iteration complete
+                improved_since_start = best_score > initial_score
+                iteration_reporter.iteration_complete(
+                    best_score=best_score, improved=improved_since_start
+                )

-            …
-            logger.info(
-                f"Updated best prompt after addressing '{root_cause.name}'"
-            )
-        else:
-            logger.debug(
-                f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
-            )
+                # Check for convergence after iteration
+                iteration_improvement = self._calculate_improvement(
+                    best_score, previous_iteration_score
+                )

-            …
-            best_score=best_score, improved=improved_since_start
+                logger.info(
+                    f"Iteration {iteration} complete. Score: {best_score:.4f}, "
+                    f"Improvement: {iteration_improvement:.2%}"
                 )

+                # Stop if improvement is below convergence threshold
+                if abs(iteration_improvement) < self.convergence_threshold:
+                    logger.info(
+                        f"Convergence achieved: improvement ({iteration_improvement:.2%}) "
+                        f"below threshold ({self.convergence_threshold:.2%}). "
+                        f"Stopping after {iteration} iterations."
+                    )
+                    break
+
+                # Update previous score for next iteration
+                previous_iteration_score = best_score
+
         # Display final optimization result with diff
         reporting.display_optimized_prompt_diff(
             initial_messages=initial_messages,
@@ -682,14 +722,27 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             verbose=self.verbose,
         )

+        # Update optimization status to completed
+        try:
+            optimization.update(status="completed")
+            logger.info(f"Optimization {optimization.id} status updated to completed.")
+        except Exception as e:
+            logger.warning(f"Failed to update optimization status: {e}")
+
         # Prepare details for the result
         details = {
+            "model": best_prompt.model or self.model,
+            "temperature": (best_prompt.model_kwargs or {}).get("temperature")
+            or self.model_kwargs.get("temperature"),
             "reasoning_model": self.reasoning_model,
             "num_threads": self.num_threads,
             "max_parallel_batches": self.max_parallel_batches,
             "max_retries": max_retries,
             "n_samples": n_samples,
             "auto_continue": auto_continue,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
+            "iterations_completed": iteration,
         }

         # Extract tool prompts if tools exist
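The loop above stops once the relative gain between iterations falls below `convergence_threshold`. The body of `_calculate_improvement` is not shown in this diff; a minimal sketch consistent with how it is called (the relative-improvement formula and the zero-baseline guard are assumptions):

    def _calculate_improvement(current_score: float, previous_score: float) -> float:
        # Assumed implementation: relative improvement over the previous score
        if previous_score == 0:
            return 0.0
        return (current_score - previous_score) / previous_score

    # With convergence_threshold = 0.01 (1%), moving from 0.80 to 0.804
    # yields 0.005, so the optimizer would stop after that iteration.
    print(_calculate_improvement(0.804, 0.80))  # ≈ 0.005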
opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py
CHANGED
@@ -1,8 +1,8 @@
 import logging
 import asyncio
 from typing import Any
-from tqdm import tqdm

+from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn
 from opik.evaluation.evaluation_result import EvaluationResult
 from .types import (
     RootCauseAnalysis,
@@ -11,6 +11,7 @@ from .types import (
 )
 from . import reporting
 from .prompts import BATCH_ANALYSIS_PROMPT, SYNTHESIS_PROMPT
+from ..reporting_utils import get_console

 logger = logging.getLogger(__name__)

@@ -285,13 +286,11 @@ Scores:

         semaphore = asyncio.Semaphore(self.max_parallel_batches)

-        # Create progress bar for batch processing
-        …
-            total=len(batch_tasks), desc="Processing batches", unit="batch", leave=False
-        )
+        # Create progress bar for batch processing using Rich
+        console = get_console()

         async def run_with_semaphore(
-            batch_num: int, task: Any
+            batch_num: int, task: Any, progress: Progress | None, task_id: Any | None
         ) -> tuple[int, BatchAnalysis]:
             async with semaphore:
                 try:
@@ -300,19 +299,44 @@ Scores:
                         f"Completed batch {batch_num}: "
                         f"identified {len(result.failure_modes)} failure modes"
                     )
-                    …
+                    if progress and task_id is not None:
+                        progress.update(task_id, advance=1)  # Update progress bar
                     return batch_num, result
                 except Exception as exc:
                     logger.error(f"Batch {batch_num} failed: {exc}")
-                    …
+                    if progress and task_id is not None:
+                        progress.update(
+                            task_id, advance=1
+                        )  # Update progress bar even on error
                     raise

-        # Run all tasks with semaphore control
-        …
+        # Run all tasks with semaphore control and rich progress bar
+        if self.verbose >= 1:
+            with Progress(
+                TextColumn("│ "),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console,
+                transient=True,
+            ) as progress:
+                task_id = progress.add_task(
+                    "Processing batches", total=len(batch_tasks)
+                )
+                results = await asyncio.gather(
+                    *[
+                        run_with_semaphore(num, task, progress, task_id)
+                        for num, task in batch_tasks
+                    ]
+                )
+        else:
+            # No progress bar in non-verbose mode
+            results = await asyncio.gather(
+                *[
+                    run_with_semaphore(num, task, None, None)
+                    for num, task in batch_tasks
+                ]
+            )

         # Sort by batch number to maintain order
         batch_analyses = [result for _, result in sorted(results)]
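The tqdm bar is swapped for a Rich Progress that the semaphore-bounded workers advance as each batch finishes, while asyncio.gather keeps them concurrent. A self-contained sketch of the same pattern (the sleeping worker stands in for the real LLM batch analysis):

    import asyncio
    from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn

    async def main() -> None:
        semaphore = asyncio.Semaphore(5)  # mirrors max_parallel_batches

        async def analyze_batch(num: int, progress: Progress, task_id) -> int:
            async with semaphore:
                await asyncio.sleep(0.05)  # stand-in for one batch analysis
                progress.update(task_id, advance=1)  # advance the shared bar
                return num

        with Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            transient=True,  # bar disappears on completion, as in the diff
        ) as progress:
            task_id = progress.add_task("Processing batches", total=10)
            results = await asyncio.gather(
                *[analyze_batch(num, progress, task_id) for num in range(10)]
            )
        print(sorted(results))

    asyncio.run(main())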
opik_optimizer/hierarchical_reflective_optimizer/reporting.py
CHANGED
@@ -29,9 +29,11 @@ def display_retry_attempt(
     """Display retry attempt information."""
     if verbose >= 1:
         console.print(
-            Text(
-                …
+            Text("│ ").append(
+                Text(
+                    f"Retry attempt {attempt + 1}/{max_attempts} for failure mode '{failure_mode_name}' (no improvement observed)",
+                    style="yellow",
+                )
             )
         )

@@ -132,14 +134,16 @@ def display_evaluation(
     def set_score(self, s: float) -> None:
         if verbose >= 1:
             # Adjust score indentation based on indent style
-            score_indent = "
+            score_indent = "│ " if indent == "> " else "│ "

             if baseline_score is None:
                 # This is the baseline evaluation
                 console.print(
-                    Text(
-                        …
+                    Text(score_indent).append(
+                        Text(
+                            f"Baseline score was: {s:.4f}.",
+                            style="green",
+                        )
                     )
                 )
                 console.print(Text("│"))
@@ -152,9 +156,11 @@ def display_evaluation(
                     else 0
                 )
                 console.print(
-                    Text(
-                        …
+                    Text(score_indent).append(
+                        Text(
+                            f"Score for updated prompt: {s:.4f} (+{improvement_pct:.1f}%)",
+                            style="green bold",
+                        )
                     )
                 )
             elif s < baseline_score:
@@ -164,23 +170,27 @@ def display_evaluation(
                     else 0
                 )
                 console.print(
-                    Text(
-                        …
+                    Text(score_indent).append(
+                        Text(
+                            f"Score for updated prompt: {s:.4f} (-{decline_pct:.1f}%)",
+                            style="red",
+                        )
                     )
                 )
             else:
                 console.print(
-                    Text(
-                        …
+                    Text(score_indent).append(
+                        Text(
+                            f"Score for updated prompt: {s:.4f} (no change)",
+                            style="yellow",
+                        )
                     )
                 )
             console.print(Text("│"))

     # Use our log suppression context manager and yield the reporter
     # Adjust progress bar indentation based on indent style
-    progress_indent = "
+    progress_indent = "│ Evaluation" if indent == "> " else "│ Evaluation"
     with suppress_opik_logs():
         with convert_tqdm_to_rich(progress_indent, verbose=verbose):
             try:
@@ -306,25 +316,31 @@ def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
 def display_optimization_iteration(iteration: int, verbose: int = 1) -> Iterator[Any]:
     """Context manager to display progress for a single optimization iteration."""
     if verbose >= 1:
-        console.print()
         console.print(Text("│"))
-        console.print(Text(
+        console.print(Text("│"))
+        console.print(
+            Text("│ ").append(Text(f"Iteration {iteration}", style="bold cyan"))
+        )

     class Reporter:
         def iteration_complete(self, best_score: float, improved: bool) -> None:
             if verbose >= 1:
                 if improved:
                     console.print(
-                        Text(
-                            …
+                        Text("│ ").append(
+                            Text(
+                                f"Iteration {iteration} complete - New best score: {best_score:.4f}",
+                                style="green",
+                            )
                         )
                     )
                 else:
                     console.print(
-                        Text(
-                            …
+                        Text("│ ").append(
+                            Text(
+                                f"Iteration {iteration} complete - No improvement (best: {best_score:.4f})",
+                                style="yellow",
+                            )
                         )
                     )
                 console.print(Text("│"))
@@ -341,16 +357,20 @@ def display_root_cause_analysis(verbose: int = 1) -> Iterator[Any]:
     if verbose >= 1:
         console.print(Text("│ "))
         console.print(
-            Text("│
+            Text("│ ").append(
+                Text("Analyzing root cause of failed evaluation items", style="cyan")
+            )
         )

     class Reporter:
         def set_completed(self, total_test_cases: int, num_batches: int) -> None:
             if verbose >= 1:
                 console.print(
-                    Text(
-                        …
+                    Text("│ ").append(
+                        Text(
+                            f"Analyzed {total_test_cases} test cases across {num_batches} batches",
+                            style="green",
+                        )
                     )
                 )
                 console.print(Text("│ "))
@@ -367,7 +387,9 @@ def display_batch_synthesis(num_batches: int, verbose: int = 1) -> Iterator[Any]:
     """Context manager to display message during batch synthesis."""
     if verbose >= 1:
-        console.print(
+        console.print(
+            Text("│ ").append(Text("Synthesizing failure modes", style="cyan"))
+        )

     class Reporter:
         def set_completed(self, num_unified_modes: int) -> None:
@@ -406,10 +428,13 @@ def display_hierarchical_synthesis(
         console.print(panel)

     rendered_panel = capture.get()
-    for line in rendered_panel.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))

-    …
+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_panel.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
+    console.print(Text("│"))


 def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
@@ -433,10 +458,13 @@ def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
         console.print(header_panel)

     rendered_header = capture.get()
-    for line in rendered_header.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))

-    …
+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_header.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
+    console.print(Text("│"))

     for idx, failure_mode in enumerate(failure_modes, 1):
         # Create content for this failure mode
@@ -460,8 +488,14 @@ def display_failure_modes(failure_modes: list[Any], verbose: int = 1) -> None:
         console.print(panel)

         rendered_panel = capture.get()
-        …
-        …
+
+        # Prefix each line with '│ ', preserving ANSI styles
+        prefixed_output = "\n".join(
+            f"│ {line}" for line in rendered_panel.splitlines()
+        )
+
+        # Print the prefixed output (will include colors)
+        console.print(prefixed_output, highlight=False)

         if idx < len(failure_modes):
             console.print("│")
@@ -473,9 +507,13 @@ def display_prompt_improvement(
 ) -> Iterator[Any]:
     """Context manager to display progress while generating improved prompt."""
     if verbose >= 1:
-        console.print()
+        console.print(Text("│"))
         console.print(Text("│ "))
-        console.print(
+        console.print(
+            Text("│ ").append(
+                Text(f"Addressing: {failure_mode_name}", style="bold cyan")
+            )
+        )

     class Reporter:
         def set_reasoning(self, reasoning: str) -> None:
@@ -498,9 +536,14 @@ def display_prompt_improvement(
             console.print(panel)

             rendered_panel = capture.get()
-            for line in rendered_panel.splitlines():
-                console.print(Text("│ ") + Text.from_ansi(line))

+            # Prefix each line with '│ ', preserving ANSI styles
+            prefixed_output = "\n".join(
+                f"│ {line}" for line in rendered_panel.splitlines()
+            )
+
+            # Print the prefixed output (will include colors)
+            console.print(prefixed_output, highlight=False)
             console.print(Text("│ "))

     try:
@@ -520,9 +563,11 @@ def display_improvement_reasoning(
     if verbose < 1:
         return

-    console.print()
+    console.print(Text("│"))
     console.print(Text("│ "))
-    console.print(
+    console.print(
+        Text("│ ").append(Text(f"Addressing: {failure_mode_name}", style="bold cyan"))
+    )

     reasoning_content = Text()
     reasoning_content.append("Improvement Strategy:\n", style="cyan")
@@ -542,9 +587,12 @@ def display_improvement_reasoning(
     console.print(panel)

     rendered_panel = capture.get()
-    for line in rendered_panel.splitlines():
-        console.print(Text("│ ") + Text.from_ansi(line))

+    # Prefix each line with '│ ', preserving ANSI styles
+    prefixed_output = "\n".join(f"│ {line}" for line in rendered_panel.splitlines())
+
+    # Print the prefixed output (will include colors)
+    console.print(prefixed_output, highlight=False)
     console.print(Text("│ "))
@@ -557,16 +605,20 @@ def display_iteration_improvement(

     if improvement > 0:
         console.print(
-            Text(
-                …
+            Text("│ ").append(
+                Text(
+                    f"✓ Improvement: {improvement:.2%} (from {best_score:.4f} to {current_score:.4f})",
+                    style="green bold",
+                )
             )
         )
     else:
         console.print(
-            Text(
-                …
+            Text("│ ").append(
+                Text(
+                    f"✗ No improvement: {improvement:.2%} (score: {current_score:.4f}, best: {best_score:.4f})",
+                    style="yellow",
+                )
             )
         )

@@ -584,27 +636,31 @@ def display_optimized_prompt_diff(
     if verbose < 1:
         return

-    console.print()
     console.print(Text("│"))
-    console.print(Text("│
+    console.print(Text("│"))
+    console.print(Text("│ ").append(Text("> Optimization Results", style="bold green")))
     console.print(Text("│"))

     # Show score improvement
     if best_score > initial_score:
         perc_change = (best_score - initial_score) / initial_score
         console.print(
-            Text(
-                …
+            Text("│ ").append(
+                Text(
+                    f"Prompt improved from {initial_score:.4f} to {best_score:.4f} ({perc_change:.2%})",
+                    style="green",
+                )
             )
         )
     else:
         console.print(
-            Text(
+            Text("│ ").append(
+                Text(f"No improvement found (score: {best_score:.4f})", style="yellow")
+            )
         )

     console.print(Text("│"))
-    console.print(Text("│ Prompt Changes:", style="cyan"))
+    console.print(Text("│ ").append(Text("Prompt Changes:", style="cyan")))
     console.print(Text("│"))

     # Compare each message
@@ -626,24 +682,30 @@ def display_optimized_prompt_diff(

         # Handle added messages
         if not initial_msg:
-            console.print(
+            console.print(
+                Text("│ ").append(Text(f"{role}: (added)", style="green bold"))
+            )
             for line in optimized_content.splitlines():
-                console.print(Text(
+                console.print(Text("│ ").append(Text(f"+{line}", style="green")))
             console.print(Text("│"))
             continue

         # Handle removed messages
         if not optimized_msg:
-            console.print(
+            console.print(
+                Text("│ ").append(Text(f"{role}: (removed)", style="red bold"))
+            )
             for line in initial_content.splitlines():
-                console.print(Text(
+                console.print(Text("│ ").append(Text(f"-{line}", style="red")))
             console.print(Text("│"))
             continue

         # Check if there are changes
         if initial_content == optimized_content:
             # No changes in this message
-            console.print(
+            console.print(
+                Text("│ ").append(Text(f"{role}: (unchanged)", style="dim"))
+            )
             continue

         # Generate unified diff
@@ -660,7 +722,7 @@ def display_optimized_prompt_diff(
             continue

         # Display message header
-        console.print(Text(
+        console.print(Text("│ ").append(Text(f"{role}:", style="bold cyan")))

         # Create diff content
         diff_content = Text()
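Every panel-printing site in this file now follows one pattern: render the panel into a capture buffer, prefix each line with the `│ ` gutter, and print the joined string in a single call (`highlight=False` keeps Rich from re-highlighting the captured output). A minimal sketch of the pattern, using placeholder panel content:

    from rich.console import Console
    from rich.panel import Panel

    console = Console()

    # Render the panel to a string instead of straight to the terminal
    with console.capture() as capture:
        console.print(Panel("synthesis notes here", title="Root Cause Analysis"))
    rendered_panel = capture.get()

    # Prefix each line with the '│ ' gutter, then print once
    prefixed_output = "\n".join(f"│ {line}" for line in rendered_panel.splitlines())
    console.print(prefixed_output, highlight=False)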
opik_optimizer/multi_metric_objective.py
ADDED
@@ -0,0 +1,33 @@
+from typing import Any
+from collections.abc import Callable
+from opik.evaluation.metrics.score_result import ScoreResult
+
+
+class MultiMetricObjective:
+    def __init__(
+        self,
+        metrics: list[Callable[[dict[str, Any], str], ScoreResult]],
+        weights: list[float] | None = None,
+        name: str = "multi_metric_objective",
+    ):
+        self.metrics = metrics
+        self.weights = weights if weights else [1 / len(metrics)] * len(metrics)
+        self.__name__ = name
+
+    def __call__(self, dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
+        raw_score_results = []
+        weighted_score_value = 0
+
+        for metric, weight in zip(self.metrics, self.weights):
+            score_result = metric(dataset_item, llm_output)
+            raw_score_results.append(score_result)
+            weighted_score_value += score_result.value * weight
+
+        aggregated_score_result = ScoreResult(
+            name=self.__name__,
+            value=weighted_score_value,
+            metadata={"raw_score_results": raw_score_results},
+        )
+
+        # Important: we return the aggregated score result first
+        return aggregated_score_result
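A MultiMetricObjective wraps plain metric callables into one weighted objective; when `weights` is omitted, each metric gets `1 / len(metrics)`. A usage sketch with two toy metrics (the metric functions are illustrative, not from the package):

    from opik.evaluation.metrics.score_result import ScoreResult
    from opik_optimizer import MultiMetricObjective

    def exact_match(dataset_item, llm_output):
        # Toy metric: 1.0 when the output equals the expected answer
        expected = dataset_item.get("answer", "")
        return ScoreResult(name="exact_match", value=float(llm_output == expected))

    def brevity(dataset_item, llm_output):
        # Toy metric: shorter outputs score higher
        return ScoreResult(name="brevity", value=1.0 / (1 + len(llm_output)))

    objective = MultiMetricObjective(metrics=[exact_match, brevity], weights=[0.8, 0.2])

    result = objective({"answer": "Paris"}, "Paris")
    print(result.value)  # 0.8 * 1.0 + 0.2 * (1 / 6) ≈ 0.833
    print([r.name for r in result.metadata["raw_score_results"]])  # ['exact_match', 'brevity']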
opik_optimizer/task_evaluator.py
CHANGED
@@ -5,6 +5,7 @@ from collections.abc import Callable
 import opik
 from opik.evaluation import evaluator as opik_evaluator
 from opik.evaluation.metrics import base_metric, score_result
+from . import multi_metric_objective

 logger = logging.getLogger(__name__)

@@ -14,9 +15,20 @@ def _create_metric_class(metric: Callable) -> base_metric.BaseMetric:
         def __init__(self) -> None:
             self.name = metric.__name__

-        def score(
+        def score(
+            self, llm_output: str, **kwargs: Any
+        ) -> score_result.ScoreResult | list[score_result.ScoreResult]:
             try:
                 metric_val = metric(dataset_item=kwargs, llm_output=llm_output)
+
+                if isinstance(metric, multi_metric_objective.MultiMetricObjective):
+                    if (
+                        hasattr(metric_val, "metadata")
+                        and "raw_score_results" in metric_val.metadata
+                    ):
+                        return [metric_val, *metric_val.metadata["raw_score_results"]]
+                    else:
+                        return [metric_val]
                 if isinstance(metric_val, score_result.ScoreResult):
                     return score_result.ScoreResult(
                         name=self.name,
@@ -107,15 +119,20 @@ def evaluate(
     if not result.test_results:
         return 0.0

-    # …
+    # Filter score results to only include the objective metric
+    objective_metric_name = metric.__name__
+    objective_score_results: list[score_result.ScoreResult] = []
+    for test_result in result.test_results:
+        for score_result_ in test_result.score_results:
+            if score_result_.name == objective_metric_name:
+                objective_score_results.append(score_result_)
+                break
+
+    if not objective_score_results:
         return 0.0

-    avg_score = sum(
-        …
-    )
+    avg_score = sum(
+        [score_result_.value for score_result_ in objective_score_results]
+    ) / len(objective_score_results)

     return avg_score
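Because a multi-metric run now returns the aggregate score plus every raw sub-score per test case, `evaluate` keeps only the score whose name matches the objective metric before averaging. A toy sketch of that filtering step, with tuples standing in for ScoreResult objects:

    # Each inner list mimics one test result's score_results:
    # the aggregate first, then the raw sub-scores.
    test_results = [
        [("multi_metric_objective", 0.83), ("exact_match", 1.0), ("brevity", 0.17)],
        [("multi_metric_objective", 0.55), ("exact_match", 0.0), ("brevity", 0.90)],
    ]

    objective_scores = []
    for score_results in test_results:
        for name, value in score_results:
            if name == "multi_metric_objective":
                objective_scores.append(value)
                break  # first match per test result, as in the diff

    avg_score = sum(objective_scores) / len(objective_scores)
    print(avg_score)  # (0.83 + 0.55) / 2 ≈ 0.69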
{opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/RECORD
CHANGED
@@ -1,13 +1,14 @@
-opik_optimizer/__init__.py,sha256=
+opik_optimizer/__init__.py,sha256=lA9cjEsNxrJwYJ68vCjeNZgrcxO_rNJaAHsdMwaq364,1658
 opik_optimizer/_throttle.py,sha256=1JXIhYlo0IaqCgwmNB0Hnh9CYhYPkwRFdVGIcE7pVNg,1362
-opik_optimizer/base_optimizer.py,sha256=
+opik_optimizer/base_optimizer.py,sha256=XryBkUTs4FQmHcBtVm63EJIKWrTvwqduUZ6ArHzYQko,21520
 opik_optimizer/cache_config.py,sha256=Xd3NdUsL7bLQWoNe3pESqH4nHucU1iNTSGp-RqbwDog,599
 opik_optimizer/logging_config.py,sha256=TmxX0C1P20amxoXuiNQvlENOjdSNfWwvL8jFy206VWM,3837
+opik_optimizer/multi_metric_objective.py,sha256=y4jqirnhkfhB7SWonI4ldYg5fWG4JGfAxqu7ylRD1J4,1178
 opik_optimizer/optimizable_agent.py,sha256=R0_BdwdHyZGWTw3oSvTg8FULDOYM8XaTiPNR3qV8DkQ,6344
 opik_optimizer/optimization_result.py,sha256=sG-Yr-hOaH9zx_I5S6_W3v6j8nPUhwYdS333jVM4Gus,17218
 opik_optimizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opik_optimizer/reporting_utils.py,sha256=dcECFmzZ_J-DKoukMDEE_fm7X8sdQyl_ijTddvQtepE,8287
-opik_optimizer/task_evaluator.py,sha256=
+opik_optimizer/task_evaluator.py,sha256=7N254DU0UkWJ5saQ5AmYEsHHSrychAJtedmmjNsCOnI,5081
 opik_optimizer/data/context7_eval.jsonl,sha256=vPR3XRfI0UbZ1hgUGaOdpraFT99RDLU1YWuPFLLQz40,1757
 opik_optimizer/data/hotpot-500.json,sha256=YXxCtuvYvxSu5u0y4559a6b1qwgAYsWzT_SUKv_21ew,76862
 opik_optimizer/datasets/__init__.py,sha256=V4LVDOaRjwzaYvhdQ3V6CAwFaeKnxyTV1lp_ES9Z31E,691
@@ -43,13 +44,13 @@ opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py,sha256
 opik_optimizer/few_shot_bayesian_optimizer/reporting.py,sha256=OMpLG4xsM6K7oQcP_nbnky47NklVsowNDlK6WliZM10,6311
 opik_optimizer/gepa_optimizer/__init__.py,sha256=XcPah5t4mop7UCFo69E9l45Mem49-itqkQT7_J1aWOA,71
 opik_optimizer/gepa_optimizer/adapter.py,sha256=KzPa4koq7aJhALMAOKPxAO4yWuEy_YbW7tGnqny3Hfo,5139
-opik_optimizer/gepa_optimizer/gepa_optimizer.py,sha256=
+opik_optimizer/gepa_optimizer/gepa_optimizer.py,sha256=HBjikhce3K4VIaiIXs7eSagmRyFPdY8h4seoW9F3nQE,26481
 opik_optimizer/gepa_optimizer/reporting.py,sha256=F0cxYSjRuFAszgi3rgqwH1A-KH26kZOLtENP7x1xrQs,5154
 opik_optimizer/hierarchical_reflective_optimizer/__init__.py,sha256=9qM3kvfAaFy-Y6Tg19MXHJxpnF5DJQQwzr6oNsxaRBM,133
-opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py,sha256=
-opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py,sha256=
+opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py,sha256=j9Gr5z9j-evFhkbxkbiZ7RXt6Q89LshYYR4ac_UxwX0,30235
+opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py,sha256=0D5wgx04jZvTJ0Yjqm0jtQvkjrGBB73qgcsSwLBpnv0,13814
 opik_optimizer/hierarchical_reflective_optimizer/prompts.py,sha256=XcOEI9eeEbTgKFsFiRWxvHdaByQkiN02bH2gTl3HX-Y,3853
-opik_optimizer/hierarchical_reflective_optimizer/reporting.py,sha256=
+opik_optimizer/hierarchical_reflective_optimizer/reporting.py,sha256=d1jQ3uZs0fTI2DeumvGmkxuMHtwA0wt_ROtl4E6UdIM,25461
 opik_optimizer/hierarchical_reflective_optimizer/types.py,sha256=bS-JAheX2FpJ4XAxoZi5PfjloG8L-B1LGQA1iLXZhW4,1031
 opik_optimizer/mcp_utils/__init__.py,sha256=BsWQT8nAa6JV6zcOD__OvPMepUS2IpJD4J2rnAXhpuU,710
 opik_optimizer/mcp_utils/mcp.py,sha256=UylgpTJsybszS433_kuTAgKH-PPde-VHjHVelMardFs,18466
@@ -79,8 +80,8 @@ opik_optimizer/utils/colbert.py,sha256=qSrzKUUGw7P92mLy4Ofug5pBGeTsHBLMJXlXSJSfK
 opik_optimizer/utils/core.py,sha256=5GT1vp6fW8ICO42LHMX14BjR-xEb6afAKjM7b1Evx5M,15298
 opik_optimizer/utils/dataset_utils.py,sha256=dqRUGOekjeNWL0J15R8xFwLyKJDJynJXzVyQmt8rhHA,1464
 opik_optimizer/utils/prompt_segments.py,sha256=1zUITSccJ82Njac1rmANzim4WWM6rVac61mfluS7lFE,5931
-opik_optimizer-2.1.1.dist-info/licenses/LICENSE,sha256=
-opik_optimizer-2.1.1.dist-info/METADATA,sha256=
-opik_optimizer-2.1.1.dist-info/WHEEL,sha256=
-opik_optimizer-2.1.1.dist-info/top_level.txt,sha256=
-opik_optimizer-2.1.1.dist-info/RECORD,,
+opik_optimizer-2.1.3.dist-info/licenses/LICENSE,sha256=V-0VHJOBdcA_teT8VymvsBUQ1-CZU6yJRmMEjec_8tA,11372
+opik_optimizer-2.1.3.dist-info/METADATA,sha256=omnNZ2--FZxU-ex3SEKYF4ZaKRDTcQfkPoc2kxKLB7U,12829
+opik_optimizer-2.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+opik_optimizer-2.1.3.dist-info/top_level.txt,sha256=ondOlpq6_yFckqpxoAHSfzZS2N-JfgmA-QQhOJfz7m0,15
+opik_optimizer-2.1.3.dist-info/RECORD,,
{opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/WHEEL
File without changes
{opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/licenses/LICENSE
File without changes
{opik_optimizer-2.1.1.dist-info → opik_optimizer-2.1.3.dist-info}/top_level.txt
File without changes