opik-optimizer 2.1.2__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -2
- opik_optimizer/base_optimizer.py +314 -145
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
- opik_optimizer/gepa_optimizer/reporting.py +164 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +221 -245
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +38 -14
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +287 -132
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/multi_metric_objective.py +33 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +42 -15
- opik_optimizer/task_evaluator.py +26 -9
- opik_optimizer/utils/core.py +16 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +37 -37
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
opik_optimizer/mipro_optimizer/_lm.py

@@ -145,15 +145,15 @@ class LM(BaseLM):
     ):
         settings.usage_tracker.add_usage(self.model, dict(results.usage))

-        self.…
+        self._increment_llm_counter()
         return results

-    def …
+    def _increment_llm_counter(self) -> None:
         """Increment the LLM call counter."""
         self.llm_call_counter += 1
         parent = getattr(self, "parent_optimizer", None)
-        if parent is not None and hasattr(parent, "…
-            parent.…
+        if parent is not None and hasattr(parent, "_increment_llm_counter"):
+            parent._increment_llm_counter()

     def launch(self, launch_kwargs: dict[str, Any] | None = None):
         self.provider.launch(self, launch_kwargs)
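For context, this change makes the wrapped LM forward its call count to an owning optimizer when one is attached. The snippet below is a minimal standalone sketch of that forwarding pattern, not code from the package; the class names are illustrative.

```python
# Illustrative sketch of the counter-forwarding pattern used by _increment_llm_counter.
class ParentOptimizer:
    def __init__(self) -> None:
        self.llm_call_counter = 0

    def _increment_llm_counter(self) -> None:
        self.llm_call_counter += 1


class CountingLM:
    def __init__(self, parent_optimizer: ParentOptimizer | None = None) -> None:
        self.llm_call_counter = 0
        self.parent_optimizer = parent_optimizer

    def _increment_llm_counter(self) -> None:
        # Bump the local counter, then forward to the parent if it exposes the same hook.
        self.llm_call_counter += 1
        parent = getattr(self, "parent_optimizer", None)
        if parent is not None and hasattr(parent, "_increment_llm_counter"):
            parent._increment_llm_counter()


parent = ParentOptimizer()
lm = CountingLM(parent_optimizer=parent)
lm._increment_llm_counter()  # both counters are now 1
```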
opik_optimizer/mipro_optimizer/mipro_optimizer_v2.py

@@ -608,13 +608,7 @@ class MIPROv2(Teleprompter):

        # Define the objective function
        def objective(trial):
-            nonlocal \
-                program, \
-                best_program, \
-                best_score, \
-                trial_logs, \
-                total_eval_calls, \
-                score_data
+            nonlocal best_program, best_score, trial_logs, total_eval_calls, score_data  # noqa: F824

            trial_num = trial.number + 1
            if minibatch:
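The collapsed `nonlocal` statement carries `# noqa: F824`, the pyflakes/Ruff rule that flags `nonlocal` (or `global`) names that are never rebound in the enclosing function. A small illustration of when the rule fires, unrelated to the package code:

```python
# Why the "# noqa: F824" suppression exists: F824 fires for declared names
# that are only read or mutated in place, never reassigned.
def outer() -> None:
    counter = 0
    log: list[int] = []

    def inner() -> None:
        nonlocal counter  # needed: counter is reassigned below
        nonlocal log      # F824-style warning: log is mutated but never reassigned
        counter += 1
        log.append(counter)

    inner()
```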
opik_optimizer/multi_metric_objective.py (new file)

@@ -0,0 +1,33 @@
+from typing import Any
+from collections.abc import Callable
+from opik.evaluation.metrics.score_result import ScoreResult
+
+
+class MultiMetricObjective:
+    def __init__(
+        self,
+        metrics: list[Callable[[dict[str, Any], str], ScoreResult]],
+        weights: list[float] | None = None,
+        name: str = "multi_metric_objective",
+    ):
+        self.metrics = metrics
+        self.weights = weights if weights else [1 / len(metrics)] * len(metrics)
+        self.__name__ = name
+
+    def __call__(self, dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
+        raw_score_results = []
+        weighted_score_value = 0
+
+        for metric, weight in zip(self.metrics, self.weights):
+            score_result = metric(dataset_item, llm_output)
+            raw_score_results.append(score_result)
+            weighted_score_value += score_result.value * weight
+
+        aggregated_score_result = ScoreResult(
+            name=self.__name__,
+            value=weighted_score_value,
+            metadata={"raw_score_results": raw_score_results},
+        )
+
+        # Important: we return the aggregated score result first
+        return aggregated_score_result
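The new `MultiMetricObjective` behaves like a single metric callable that combines several metrics into one weighted score and keeps the per-metric results in `metadata`. A usage sketch, assuming the class is importable from the new module path shown above; the two example metrics and the dataset item keys are made up for illustration:

```python
from opik.evaluation.metrics.score_result import ScoreResult
from opik_optimizer.multi_metric_objective import MultiMetricObjective


def exact_match(dataset_item, llm_output):
    # Hypothetical metric: 1.0 when the output matches the reference answer.
    return ScoreResult(name="exact_match", value=float(llm_output.strip() == dataset_item["answer"]))


def brevity(dataset_item, llm_output):
    # Hypothetical metric: rewards shorter outputs.
    return ScoreResult(name="brevity", value=1.0 / (1.0 + len(llm_output) / 100))


objective = MultiMetricObjective(
    metrics=[exact_match, brevity],
    weights=[0.8, 0.2],
    name="accuracy_and_brevity",
)

# The combined objective is used like any single metric callable.
result = objective({"answer": "Paris"}, "Paris")
print(result.value)  # weighted sum of the two raw scores
```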
opik_optimizer/optimizable_agent.py

@@ -40,18 +40,19 @@ class OptimizableAgent:

     model: str | None = None
     model_kwargs: dict[str, Any] = {}
-    project_name: str | None = "Default Project"
     input_dataset_field: str | None = None
     prompts: dict[str, "ChatPrompt"]
     prompt: "ChatPrompt"

-    def __init__(self, prompt: "ChatPrompt") -> None:
+    def __init__(self, prompt: "ChatPrompt", project_name: str | None = None) -> None:
        """
        Initialize the OptimizableAgent.

        Args:
            prompt: a chat prompt
+            project_name: Optional project name for Opik tracking
        """
+        self.project_name = project_name or "Default Project"
        self.init_llm()
        self.init_agent(prompt)

@@ -83,6 +84,7 @@ class OptimizableAgent:
            metadata={
                "opik": {
                    "current_span_data": get_current_span_data(),
+                    "project_name": self.project_name,
                },
            },
            **self.model_kwargs,

@@ -131,6 +133,7 @@ class OptimizableAgent:
            for tool_call in msg["tool_calls"]:
                tool_name = tool_call["function"]["name"]
                arguments = json.loads(tool_call["function"]["arguments"])
+
                tool_func = self.prompt.function_map.get(tool_name)
                try:
                    tool_result = (

@@ -149,9 +152,9 @@ class OptimizableAgent:
                    )
                # Increment tool call counter if we have access to the optimizer
                if hasattr(self, "optimizer") and hasattr(
-                    self.optimizer, "…
+                    self.optimizer, "_increment_tool_counter"
                ):
-                    self.optimizer.…
+                    self.optimizer._increment_tool_counter()
            else:
                final_response = msg["content"]
                break
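With this change the Opik project name is supplied per agent instance rather than through the removed `project_name` class attribute, and it is forwarded into the call metadata. A brief sketch, assuming the top-level `ChatPrompt` export and the constructor shown in this diff; prompt contents are illustrative:

```python
from opik_optimizer import ChatPrompt
from opik_optimizer.optimizable_agent import OptimizableAgent

prompt = ChatPrompt(system="You are a helpful assistant.", user="{question}")

# project_name is now a constructor argument; omitting it falls back to "Default Project".
agent = OptimizableAgent(prompt, project_name="parameter-tuning-demo")
```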
opik_optimizer/optimization_config/chat_prompt.py

@@ -39,10 +39,9 @@ class ChatPrompt:
        messages: list[dict[str, str]] | None = None,
        tools: list[dict[str, Any]] | None = None,
        function_map: dict[str, Callable] | None = None,
-        model: str …
+        model: str = "gpt-4o-mini",
        invoke: Callable | None = None,
-        …
-        **model_kwargs: Any,
+        model_parameters: dict[str, Any] | None = None,
    ) -> None:
        if system is None and user is None and messages is None:
            raise ValueError(

@@ -92,9 +91,8 @@ class ChatPrompt:
            self.function_map = {}
        # These are used for the LiteLLMAgent class:
        self.model = model
-        self.model_kwargs = …
+        self.model_kwargs = model_parameters or {}
        self.invoke = invoke
-        self.project_name = project_name

    def get_messages(
        self,

@@ -149,8 +147,8 @@ class ChatPrompt:

        # TODO(opik-mcp): once we introduce a dedicated MCP prompt subclass,
        # migrate callers away from generic copies so optimizer metadata stays typed.
-        …
-            copy.deepcopy(self.model_kwargs) if self.model_kwargs …
+        model_parameters = (
+            copy.deepcopy(self.model_kwargs) if self.model_kwargs else None
        )
        return ChatPrompt(
            name=self.name,

@@ -161,8 +159,7 @@ class ChatPrompt:
            function_map=self.function_map,
            model=self.model,
            invoke=self.invoke,
-            …
-            **model_kwargs,
+            model_parameters=model_parameters,
        )

    def set_messages(self, messages: list[dict[str, Any]]) -> None:

@@ -192,6 +189,6 @@ class ChatPrompt:
        """Custom validation method to handle nested objects during deserialization."""
        return ChatPrompt(
            system=obj.get("system", None),
-            …
+            user=obj.get("user", None),
            messages=obj.get("messages", None),
        )
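The `ChatPrompt` constructor now takes LiteLLM call parameters as an explicit `model_parameters` dict instead of a catch-all `**model_kwargs`, and `project_name` is removed from the prompt (it now lives on the agent, as shown earlier). A before/after sketch, assuming the top-level `ChatPrompt` export; parameter values are illustrative:

```python
from opik_optimizer import ChatPrompt

# 2.1.x style (removed): extra LiteLLM kwargs were swallowed by **model_kwargs
# prompt = ChatPrompt(system="Answer briefly.", user="{question}", temperature=0.2)

# 2.2.0 style: LiteLLM parameters are grouped in an explicit dict
prompt = ChatPrompt(
    system="Answer briefly.",
    user="{question}",
    model="gpt-4o-mini",
    model_parameters={"temperature": 0.2, "max_tokens": 256},
)

copied = prompt.copy()  # the copy carries model_parameters through, as in the diff above
```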
opik_optimizer/parameter_optimizer/parameter_optimizer.py

@@ -20,26 +20,47 @@ from ..optimization_result import OptimizationResult
 from .parameter_search_space import ParameterSearchSpace
 from .search_space_types import ParameterType
 from .sensitivity_analysis import compute_sensitivity_from_trials
+from . import reporting

 logger = logging.getLogger(__name__)


 class ParameterOptimizer(BaseOptimizer):
-    """…
+    """
+    The Parameter Optimizer uses Bayesian optimization to tune model parameters like
+    temperature, top_p, and other LLM call parameters for optimal performance.
+
+    This optimizer is ideal when you have a good prompt but want to fine-tune the
+    model's behavior through parameter adjustments rather than prompt modifications.
+
+    Args:
+        model: LiteLLM model name (used for metadata, not for optimization calls)
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
+        default_n_trials: Default number of optimization trials to run
+        local_search_ratio: Ratio of trials to dedicate to local search refinement (0.0-1.0)
+        local_search_scale: Scale factor for narrowing search space during local search
+        n_threads: Number of parallel threads for evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
+    """

     def __init__(
         self,
-        model: str,
+        model: str = "gpt-4o",
         *,
+        model_parameters: dict[str, Any] | None = None,
         default_n_trials: int = 20,
-        n_threads: int = 4,
-        seed: int = 42,
-        verbose: int = 1,
         local_search_ratio: float = 0.3,
         local_search_scale: float = 0.2,
-        …
+        n_threads: int = 4,
+        verbose: int = 1,
+        seed: int = 42,
     ) -> None:
-        super().__init__(…
+        super().__init__(
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
+        )
         self.default_n_trials = default_n_trials
         self.n_threads = n_threads
         self.local_search_ratio = max(0.0, min(local_search_ratio, 1.0))
@@ -56,11 +77,13 @@ class ParameterOptimizer(BaseOptimizer):
         self,
         prompt: chat_prompt.ChatPrompt,
         dataset: Dataset,
-        metric: Callable…
+        metric: Callable,
         experiment_config: dict | None = None,
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        *args: Any,
         **kwargs: Any,
     ) -> OptimizationResult:
         raise NotImplementedError(
@@ -76,28 +99,47 @@ class ParameterOptimizer(BaseOptimizer):
         metric: Callable[[Any, Any], float],
         parameter_space: ParameterSearchSpace | Mapping[str, Any],
         experiment_config: dict | None = None,
-        …
+        max_trials: int | None = None,
         n_samples: int | None = None,
         agent_class: type[OptimizableAgent] | None = None,
-        …
+        sampler: optuna.samplers.BaseSampler | None = None,
+        callbacks: list[Callable[[optuna.study.Study, optuna.trial.FrozenTrial], None]]
+        | None = None,
+        timeout: float | None = None,
+        local_trials: int | None = None,
+        local_search_scale: float | None = None,
     ) -> OptimizationResult:
+        """
+        Optimize model parameters using Bayesian optimization.
+
+        Args:
+            prompt: The prompt to evaluate with tuned parameters
+            dataset: Dataset providing evaluation examples
+            metric: Objective function to maximize
+            parameter_space: Definition of the search space for tunable parameters
+            experiment_config: Optional experiment metadata
+            max_trials: Total number of trials (if None, uses default_n_trials)
+            n_samples: Number of dataset samples to evaluate per trial (None for all)
+            agent_class: Optional custom agent class to execute evaluations
+            sampler: Optuna sampler to use (default: TPESampler with seed)
+            callbacks: List of callback functions for Optuna study
+            timeout: Maximum time in seconds for optimization
+            local_trials: Number of trials for local search (overrides local_search_ratio)
+            local_search_scale: Scale factor for local search narrowing (0.0-1.0)
+
+        Returns:
+            OptimizationResult: Structured result describing the best parameters found
+        """
         if not isinstance(parameter_space, ParameterSearchSpace):
             parameter_space = ParameterSearchSpace.model_validate(parameter_space)

         # After validation, parameter_space is guaranteed to be ParameterSearchSpace
         assert isinstance(parameter_space, ParameterSearchSpace)  # for mypy

-        …
-        …
-        timeout = kwargs.pop("timeout", None)
-        local_trials_override = kwargs.pop("local_trials", None)
-        local_search_scale_override = kwargs.pop("local_search_scale", None)
-        if kwargs:
-            extra_keys = ", ".join(sorted(kwargs.keys()))
-            raise TypeError(f"Unsupported keyword arguments: {extra_keys}")
+        local_trials_override = local_trials
+        local_search_scale_override = local_search_scale

-        self.…
-        self.configure_prompt_model(prompt)
+        self._validate_optimization_inputs(prompt, dataset, metric)

         base_model_kwargs = copy.deepcopy(prompt.model_kwargs or {})
         base_prompt = prompt.copy()
@@ -105,18 +147,56 @@ class ParameterOptimizer(BaseOptimizer):

         metric_name = getattr(metric, "__name__", str(metric))

-        …
-        …
-        …
-        …
-        …
-        …
+        # Create optimization run
+        optimization = self.opik_client.create_optimization(
+            dataset_name=dataset.name,
+            objective_name=metric_name,
+            metadata={"optimizer": self.__class__.__name__},
+        )
+        self.current_optimization_id = optimization.id
+        logger.debug(f"Created optimization with ID: {optimization.id}")
+
+        # Display header with optimization link
+        reporting.display_header(
+            algorithm=self.__class__.__name__,
+            optimization_id=optimization.id,
+            dataset_id=dataset.id,
             verbose=self.verbose,
-            experiment_config=experiment_config,
-            n_samples=n_samples,
-            agent_class=self.agent_class,
         )

+        # Display configuration
+        reporting.display_configuration(
+            messages=prompt.get_messages(),
+            optimizer_config={
+                "optimizer": self.__class__.__name__,
+                "n_trials": max_trials
+                if max_trials is not None
+                else self.default_n_trials,
+                "n_samples": n_samples,
+                "n_threads": self.n_threads,
+                "local_search_ratio": self.local_search_ratio,
+                "local_search_scale": self.local_search_scale,
+            },
+            verbose=self.verbose,
+            tools=getattr(prompt, "tools", None),
+        )
+
+        self.agent_class = self._setup_agent_class(base_prompt, agent_class)
+
+        # Evaluate baseline with reporting
+        with reporting.display_evaluation(verbose=self.verbose) as baseline_reporter:
+            baseline_score = self.evaluate_prompt(
+                prompt=base_prompt,
+                dataset=dataset,
+                metric=metric,
+                n_threads=self.n_threads,
+                verbose=self.verbose,
+                experiment_config=experiment_config,
+                n_samples=n_samples,
+                agent_class=self.agent_class,
+            )
+            baseline_reporter.set_score(baseline_score)
+
         history: list[dict[str, Any]] = [
             {
                 "iteration": 0,
@@ -141,7 +221,7 @@ class ParameterOptimizer(BaseOptimizer):
         sampler = sampler or optuna.samplers.TPESampler(seed=self.seed)
         study = optuna.create_study(direction="maximize", sampler=sampler)

-        total_trials = self.default_n_trials if …
+        total_trials = self.default_n_trials if max_trials is None else max_trials
         if total_trials < 0:
             total_trials = 0

@@ -159,25 +239,45 @@ class ParameterOptimizer(BaseOptimizer):
         current_stage = "global"
         stage_records: list[dict[str, Any]] = []
         search_ranges: dict[str, dict[str, Any]] = {}
+        current_best_score = baseline_score

         def objective(trial: Trial) -> float:
+            nonlocal current_best_score
+
             sampled_values = current_space.suggest(trial)
             tuned_prompt = parameter_space.apply(
                 prompt,
                 sampled_values,
                 base_model_kwargs=base_model_kwargs,
             )
-            tuned_agent_class = self.…
-            …
-            …
-            …
-            …
-            …
+            tuned_agent_class = self._setup_agent_class(tuned_prompt, agent_class)
+
+            # Display trial evaluation with parameters
+            with reporting.display_trial_evaluation(
+                trial_number=trial.number,
+                total_trials=total_trials,
+                stage=current_stage,
+                parameters=sampled_values,
                 verbose=self.verbose,
-            …
-            …
-            …
-            …
+            ) as trial_reporter:
+                score = self.evaluate_prompt(
+                    prompt=tuned_prompt,
+                    dataset=dataset,
+                    metric=metric,
+                    n_threads=self.n_threads,
+                    verbose=self.verbose,
+                    experiment_config=experiment_config,
+                    n_samples=n_samples,
+                    agent_class=tuned_agent_class,
+                )
+
+                # Check if this is a new best
+                is_best = score > current_best_score
+                if is_best:
+                    current_best_score = score
+
+                trial_reporter.set_score(score, is_best=is_best)
+
             trial.set_user_attr("parameters", sampled_values)
             trial.set_user_attr(
                 "model_kwargs", copy.deepcopy(tuned_prompt.model_kwargs)
@@ -198,6 +298,20 @@ class ParameterOptimizer(BaseOptimizer):
         search_ranges["global"] = global_range

         if global_trials > 0:
+            if self.verbose >= 1:
+                from rich.text import Text
+                from rich.console import Console
+
+                console = Console()
+                console.print("")
+                console.print(Text("> Starting global search phase", style="bold cyan"))
+                console.print(
+                    Text(
+                        f"│ Exploring full parameter space with {global_trials} trials"
+                    )
+                )
+                console.print("")
+
             study.optimize(
                 objective,
                 n_trials=global_trials,
@@ -278,6 +392,22 @@ class ParameterOptimizer(BaseOptimizer):
             )
             search_ranges["local"] = local_range

+            if self.verbose >= 1:
+                from rich.text import Text
+                from rich.console import Console
+
+                console = Console()
+                console.print("")
+                console.print(
+                    Text("> Starting local search phase", style="bold cyan")
+                )
+                console.print(
+                    Text(
+                        f"│ Refining around best parameters with {local_trials} trials (scale: {local_scale})"
+                    )
+                )
+                console.print("")
+
             current_space = local_space
             study.optimize(
                 objective,
@@ -346,6 +476,22 @@ class ParameterOptimizer(BaseOptimizer):
             completed_trials, parameter_space.parameters
         )

+        # Display final results
+        reporting.display_result(
+            initial_score=baseline_score,
+            best_score=best_score,
+            best_prompt=prompt.get_messages(),
+            verbose=self.verbose,
+            tools=getattr(prompt, "tools", None),
+        )
+
+        # Update optimization status to completed
+        try:
+            optimization.update(status="completed")
+            logger.info(f"Optimization {optimization.id} status updated to completed.")
+        except Exception as e:
+            logger.warning(f"Failed to update optimization status: {e}")
+
         details = {
             "initial_score": baseline_score,
             "optimized_parameters": best_parameters,
@@ -379,4 +525,6 @@ class ParameterOptimizer(BaseOptimizer):
             history=history,
             llm_calls=self.llm_call_counter,
             tool_calls=self.tool_call_counter,
+            optimization_id=optimization.id,
+            dataset_id=dataset.id,
         )
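Taken together, the `ParameterOptimizer` changes replace the old `**kwargs`-based overrides (`timeout`, `local_trials`, `local_search_scale`) with explicit keyword arguments and add Opik optimization-run tracking plus rich reporting. A usage sketch under the signatures shown above; the dataset and the parameter-space mapping are placeholders (the mapping schema is whatever `ParameterSearchSpace.model_validate` accepts), and the import paths are assumed from the package layout:

```python
from opik.evaluation.metrics.score_result import ScoreResult
from opik_optimizer import ChatPrompt
from opik_optimizer.parameter_optimizer import ParameterOptimizer


def accuracy(dataset_item, llm_output):
    # Illustrative metric: 1.0 when the output matches the expected answer.
    return ScoreResult(name="accuracy", value=float(llm_output.strip() == dataset_item["answer"]))


optimizer = ParameterOptimizer(
    model="gpt-4o",
    model_parameters={"temperature": 0.0},  # parameters for the optimizer's own LLM calls
    default_n_trials=20,
    local_search_ratio=0.3,
    local_search_scale=0.2,
    n_threads=4,
    seed=42,
)

result = optimizer.optimize_parameter(
    prompt=ChatPrompt(system="Answer briefly.", user="{question}"),
    dataset=my_dataset,            # an Opik Dataset instance (placeholder)
    metric=accuracy,
    parameter_space=search_space,  # ParameterSearchSpace or a mapping it validates (placeholder)
    max_trials=30,                 # explicit argument, replacing the old kwargs-based override
    timeout=600,
    local_trials=10,
)
```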
opik_optimizer/parameter_optimizer/reporting.py (new file)

@@ -0,0 +1,148 @@
+"""Reporting utilities for ParameterOptimizer."""
+
+from contextlib import contextmanager
+from typing import Any
+from collections.abc import Iterator
+
+from rich.text import Text
+
+from ..reporting_utils import ( # noqa: F401
+    convert_tqdm_to_rich,
+    display_configuration,
+    display_header,
+    display_result,
+    get_console,
+    suppress_opik_logs,
+)
+
+console = get_console()
+PANEL_WIDTH = 70
+
+
+@contextmanager
+def display_evaluation(
+    message: str = "First we will establish the baseline performance:", verbose: int = 1
+) -> Iterator[Any]:
+    """Context manager to display messages during an evaluation phase."""
+
+    # Entry point
+    if verbose >= 1:
+        console.print(Text(f"> {message}"))
+
+    # Create a simple object with a method to set the score
+    class Reporter:
+        def set_score(self, s: float) -> None:
+            if verbose >= 1:
+                console.print(Text(f"│ Baseline score was: {s:.4f}.\n", style="green"))
+
+    # Use our log suppression context manager and yield the reporter
+    with suppress_opik_logs():
+        with convert_tqdm_to_rich("│ Evaluation", verbose=verbose):
+            try:
+                yield Reporter()
+            finally:
+                pass
+
+
+@contextmanager
+def display_trial_progress(
+    stage: str, n_trials: int, verbose: int = 1
+) -> Iterator[Any]:
+    """Context manager to display progress during Optuna trial optimization."""
+
+    if verbose >= 1:
+        console.print(Text(f"> Running {stage} search with {n_trials} trials"))
+
+    class Reporter:
+        def trial_complete(
+            self, trial_number: int, score: float, is_best: bool
+        ) -> None:
+            if verbose >= 1:
+                if is_best:
+                    console.print(
+                        Text(
+                            f"│ Trial {trial_number + 1}/{n_trials}: {score:.4f} (new best)",
+                            style="green",
+                        )
+                    )
+                else:
+                    console.print(
+                        Text(
+                            f"│ Trial {trial_number + 1}/{n_trials}: {score:.4f}",
+                            style="dim",
+                        )
+                    )
+
+    with suppress_opik_logs():
+        try:
+            yield Reporter()
+        finally:
+            if verbose >= 1:
+                console.print("")
+
+
+def display_search_stage_summary(
+    stage: str, best_score: float, best_params: dict[str, Any], verbose: int = 1
+) -> None:
+    """Display summary after a search stage completes."""
+    if verbose < 1:
+        return
+
+    console.print(Text(f"│ {stage.capitalize()} search complete", style="cyan"))
+    console.print(Text(f"│ Best score: {best_score:.4f}", style="green"))
+    if best_params:
+        console.print(Text("│ Best parameters:", style="dim"))
+        for key, value in best_params.items():
+            console.print(Text(f"│ {key}: {value}", style="dim cyan"))
+    console.print("")
+
+
+@contextmanager
+def display_trial_evaluation(
+    trial_number: int,
+    total_trials: int,
+    stage: str,
+    parameters: dict[str, Any],
+    verbose: int = 1,
+) -> Iterator[Any]:
+    """Context manager to display a single trial evaluation with parameters."""
+
+    if verbose >= 1:
+        console.print("")
+        console.print(
+            Text(
+                f"│ Trial {trial_number + 1}/{total_trials} ({stage} search)",
+                style="cyan bold",
+            )
+        )
+
+        # Display parameters being tested
+        if parameters:
+            param_text = Text()
+            param_text.append("│ Testing parameters:\n", style="dim")
+            for key, value in parameters.items():
+                # Format the value nicely
+                if isinstance(value, float):
+                    formatted_value = f"{value:.6f}"
+                else:
+                    formatted_value = str(value)
+                param_text.append(f"│ {key}: ", style="dim")
+                param_text.append(f"{formatted_value}\n", style="cyan")
+            console.print(param_text)
+
+    class Reporter:
+        def set_score(self, s: float, is_best: bool = False) -> None:
+            if verbose >= 1:
+                if is_best:
+                    console.print(
+                        Text(f"│ Score: {s:.4f} (new best)", style="green bold")
+                    )
+                else:
+                    console.print(Text(f"│ Score: {s:.4f}", style="dim"))
+
+    with suppress_opik_logs():
+        with convert_tqdm_to_rich("│ Evaluation", verbose=verbose):
+            try:
+                yield Reporter()
+            finally:
+                pass