opik-optimizer 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in their public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/base_optimizer.py +376 -19
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +80 -17
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +179 -39
- opik_optimizer/evolutionary_optimizer/llm_support.py +3 -1
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +17 -3
- opik_optimizer/evolutionary_optimizer/population_ops.py +5 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +47 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +12 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +65 -59
- opik_optimizer/gepa_optimizer/adapter.py +5 -3
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +163 -66
- opik_optimizer/mcp_utils/mcp_workflow.py +57 -3
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +75 -69
- opik_optimizer/mipro_optimizer/_lm.py +10 -3
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +1 -1
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +96 -21
- opik_optimizer/optimizable_agent.py +5 -0
- opik_optimizer/optimization_result.py +1 -0
- opik_optimizer/utils/core.py +56 -14
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/METADATA +96 -9
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/RECORD +27 -26
- /opik_optimizer/{colbert.py → utils/colbert.py} +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from .few_shot_bayesian_optimizer import FewShotBayesianOptimizer
 from .gepa_optimizer import GepaOptimizer
 from .logging_config import setup_logging
 from .meta_prompt_optimizer import MetaPromptOptimizer
+from .mipro_optimizer import MiproOptimizer
 from .optimization_config.configs import TaskConfig
 from .optimization_result import OptimizationResult
 
@@ -31,6 +32,7 @@ __all__ = [
     "FewShotBayesianOptimizer",
     "GepaOptimizer",
     "MetaPromptOptimizer",
+    "MiproOptimizer",
     "EvolutionaryOptimizer",
     "OptimizationResult",
     "OptimizableAgent",
opik_optimizer/base_optimizer.py
CHANGED
@@ -1,10 +1,13 @@
-from typing import Any
+from typing import Any, cast
 from collections.abc import Callable
 
+import copy
+import inspect
 import logging
 import time
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 import random
+import importlib.metadata
 
 
 import litellm
@@ -17,7 +20,7 @@ from . import _throttle, optimization_result
 from .cache_config import initialize_cache
 from .optimization_config import chat_prompt, mappers
 from .optimizable_agent import OptimizableAgent
-from .utils import create_litellm_agent_class
+from .utils import create_litellm_agent_class, optimization_context
 from . import task_evaluator
 
 _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -29,6 +32,12 @@ litellm.drop_params = True
 logger = logging.getLogger(__name__)
 
 
+try:
+    _OPTIMIZER_VERSION = importlib.metadata.version("opik_optimizer")
+except importlib.metadata.PackageNotFoundError:  # pragma: no cover - dev installs
+    _OPTIMIZER_VERSION = "unknown"
+
+
 class OptimizationRound(BaseModel):
     model_config = {"arbitrary_types_allowed": True}
 
@@ -41,11 +50,12 @@ class OptimizationRound(BaseModel):
     improvement: float
 
 
-class BaseOptimizer:
+class BaseOptimizer(ABC):
     def __init__(
         self,
         model: str,
         verbose: int = 1,
+        seed: int = 42,
         **model_kwargs: Any,
     ) -> None:
         """
@@ -54,19 +64,335 @@ class BaseOptimizer:
         Args:
             model: LiteLLM model name
             verbose: Controls internal logging/progress bars (0=off, 1=on).
+            seed: Random seed for reproducibility (default: 42)
             model_kwargs: additional args for model (eg, temperature)
         """
         self.model = model
         self.reasoning_model = model
         self.model_kwargs = model_kwargs
         self.verbose = verbose
+        self.seed = seed
         self._history: list[OptimizationRound] = []
         self.experiment_config = None
         self.llm_call_counter = 0
+        self.tool_call_counter = 0
+        self._opik_client = None  # Lazy initialization
 
         # Initialize shared cache
         initialize_cache()
 
+    def reset_counters(self) -> None:
+        """Reset all call counters for a new optimization run."""
+        self.llm_call_counter = 0
+        self.tool_call_counter = 0
+
+    def increment_llm_counter(self) -> None:
+        """Increment the LLM call counter."""
+        self.llm_call_counter += 1
+
+    def increment_tool_counter(self) -> None:
+        """Increment the tool call counter."""
+        self.tool_call_counter += 1
+
+    def cleanup(self) -> None:
+        """
+        Clean up resources and perform memory management.
+        Should be called when the optimizer is no longer needed.
+        """
+        # Reset counters
+        self.reset_counters()
+
+        # Clear history to free memory
+        self._history.clear()
+
+        # Clear Opik client if it exists
+        if self._opik_client is not None:
+            # Note: Opik client doesn't have explicit cleanup, but we can clear the reference
+            self._opik_client = None
+
+        logger.debug(f"Cleaned up resources for {self.__class__.__name__}")
+
+    def __del__(self) -> None:
+        """Destructor to ensure cleanup is called."""
+        try:
+            self.cleanup()
+        except Exception:
+            # Ignore exceptions during cleanup in destructor
+            pass
+
+    @property
+    def opik_client(self) -> Any:
+        """Lazy initialization of Opik client."""
+        if self._opik_client is None:
+            import opik
+
+            self._opik_client = opik.Opik()
+        return self._opik_client
+
+    def validate_optimization_inputs(
+        self, prompt: "chat_prompt.ChatPrompt", dataset: "Dataset", metric: Callable
+    ) -> None:
+        """
+        Validate common optimization inputs.
+
+        Args:
+            prompt: The chat prompt to validate
+            dataset: The dataset to validate
+            metric: The metric function to validate
+
+        Raises:
+            ValueError: If any input is invalid
+        """
+        if not isinstance(prompt, chat_prompt.ChatPrompt):
+            raise ValueError("Prompt must be a ChatPrompt object")
+
+        if not isinstance(dataset, Dataset):
+            raise ValueError("Dataset must be a Dataset object")
+
+        if not callable(metric):
+            raise ValueError(
+                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+            )
+
+    def setup_agent_class(
+        self, prompt: "chat_prompt.ChatPrompt", agent_class: Any = None
+    ) -> Any:
+        """
+        Setup agent class for optimization.
+
+        Args:
+            prompt: The chat prompt
+            agent_class: Optional custom agent class
+
+        Returns:
+            The agent class to use
+        """
+        if agent_class is None:
+            return create_litellm_agent_class(prompt, optimizer_ref=self)
+        else:
+            return agent_class
+
+    def configure_prompt_model(self, prompt: "chat_prompt.ChatPrompt") -> None:
+        """
+        Configure prompt model and model_kwargs if not set.
+
+        Args:
+            prompt: The chat prompt to configure
+        """
+        # Only configure if prompt is a valid ChatPrompt object
+        if hasattr(prompt, "model") and hasattr(prompt, "model_kwargs"):
+            if prompt.model is None:
+                prompt.model = self.model
+            if prompt.model_kwargs is None:
+                prompt.model_kwargs = self.model_kwargs
+
+    # ------------------------------------------------------------------
+    # Experiment metadata helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _drop_none(metadata: dict[str, Any]) -> dict[str, Any]:
+        return {k: v for k, v in metadata.items() if v is not None}
+
+    @staticmethod
+    def _deep_merge_dicts(
+        base: dict[str, Any], overrides: dict[str, Any]
+    ) -> dict[str, Any]:
+        result = copy.deepcopy(base)
+        for key, value in overrides.items():
+            if (
+                key in result
+                and isinstance(result[key], dict)
+                and isinstance(value, dict)
+            ):
+                result[key] = BaseOptimizer._deep_merge_dicts(result[key], value)
+            else:
+                result[key] = value
+        return result
+
+    @staticmethod
+    def _serialize_tools(prompt: "chat_prompt.ChatPrompt") -> list[dict[str, Any]]:
+        tools_obj = getattr(prompt, "tools", None)
+        if not isinstance(tools_obj, list):
+            return []
+
+        try:
+            return copy.deepcopy(cast(list[dict[str, Any]], tools_obj))
+        except Exception:  # pragma: no cover - defensive
+            serialized_tools: list[dict[str, Any]] = []
+            for tool in tools_obj:
+                if isinstance(tool, dict):
+                    serialized_tools.append({k: v for k, v in tool.items() if k})
+            return serialized_tools
+
+    @staticmethod
+    def _describe_annotation(annotation: Any) -> str | None:
+        if annotation is inspect._empty:
+            return None
+        if isinstance(annotation, type):
+            return annotation.__name__
+        return str(annotation)
+
+    def _summarize_tool_signatures(
+        self, prompt: "chat_prompt.ChatPrompt"
+    ) -> list[dict[str, Any]]:
+        signatures: list[dict[str, Any]] = []
+        for name, func in getattr(prompt, "function_map", {}).items():
+            callable_obj = getattr(func, "__wrapped__", func)
+            try:
+                sig = inspect.signature(callable_obj)
+            except (TypeError, ValueError):  # pragma: no cover - defensive
+                signatures.append({"name": name, "signature": "unavailable"})
+                continue
+
+            params: list[dict[str, Any]] = []
+            for parameter in sig.parameters.values():
+                params.append(
+                    self._drop_none(
+                        {
+                            "name": parameter.name,
+                            "kind": parameter.kind.name,
+                            "annotation": self._describe_annotation(
+                                parameter.annotation
+                            ),
+                            "default": (
+                                None
+                                if parameter.default is inspect._empty
+                                else parameter.default
+                            ),
+                        }
+                    )
+                )
+
+            signatures.append(
+                self._drop_none(
+                    {
+                        "name": name,
+                        "parameters": params,
+                        "docstring": inspect.getdoc(callable_obj),
+                    }
+                )
+            )
+        return signatures
+
+    def _build_agent_config(self, prompt: "chat_prompt.ChatPrompt") -> dict[str, Any]:
+        agent_config: dict[str, Any] = dict(prompt.to_dict())
+        agent_config["project_name"] = getattr(prompt, "project_name", None)
+        agent_config["model"] = getattr(prompt, "model", None) or self.model
+        agent_config["tools"] = self._serialize_tools(prompt)
+        return self._drop_none(agent_config)
+
+    def get_optimizer_metadata(self) -> dict[str, Any]:
+        """Override in subclasses to expose optimizer-specific parameters."""
+        return {}
+
+    def _build_optimizer_metadata(self) -> dict[str, Any]:
+        metadata = {
+            "name": self.__class__.__name__,
+            "version": _OPTIMIZER_VERSION,
+            "model": self.model,
+            "model_kwargs": self.model_kwargs or None,
+            "seed": getattr(self, "seed", None),
+            "num_threads": getattr(self, "num_threads", None),
+        }
+
+        # n_threads is used by some optimizers instead of num_threads
+        if metadata["num_threads"] is None and hasattr(self, "n_threads"):
+            metadata["num_threads"] = getattr(self, "n_threads")
+
+        if hasattr(self, "reasoning_model"):
+            metadata["reasoning_model"] = getattr(self, "reasoning_model")
+
+        extra_parameters = self.get_optimizer_metadata()
+        if extra_parameters:
+            metadata["parameters"] = extra_parameters
+
+        return self._drop_none(metadata)
+
+    def _prepare_experiment_config(
+        self,
+        *,
+        prompt: "chat_prompt.ChatPrompt",
+        dataset: Dataset,
+        metric: Callable,
+        experiment_config: dict[str, Any] | None = None,
+        configuration_updates: dict[str, Any] | None = None,
+        additional_metadata: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        dataset_id = getattr(dataset, "id", None)
+        project_name = (
+            getattr(self.agent_class, "project_name", None)
+            if hasattr(self, "agent_class")
+            else None
+        )
+        if not project_name:
+            project_name = getattr(prompt, "project_name", None)
+        if not project_name:
+            project_name = self.__class__.__name__
+
+        base_config: dict[str, Any] = {
+            "project_name": project_name,
+            "agent_class": (
+                getattr(self.agent_class, "__name__", None)
+                if hasattr(self, "agent_class")
+                else None
+            ),
+            "agent_config": self._build_agent_config(prompt),
+            "metric": getattr(metric, "__name__", str(metric)),
+            "dataset": getattr(dataset, "name", None),
+            "dataset_id": dataset_id,
+            "optimizer_metadata": self._build_optimizer_metadata(),
+            "tool_signatures": self._summarize_tool_signatures(prompt),
+            "configuration": {
+                "prompt": prompt.get_messages(),
+                "prompt_name": getattr(prompt, "name", None),
+                "tools": self._serialize_tools(prompt),
+                "prompt_project_name": getattr(prompt, "project_name", None),
+            },
+        }
+
+        if configuration_updates:
+            base_config["configuration"] = self._deep_merge_dicts(
+                base_config["configuration"], configuration_updates
+            )
+
+        if additional_metadata:
+            base_config = self._deep_merge_dicts(base_config, additional_metadata)
+
+        if experiment_config:
+            base_config = self._deep_merge_dicts(base_config, experiment_config)
+
+        return self._drop_none(base_config)
+
+    def create_optimization_context(
+        self, dataset: "Dataset", metric: Callable, metadata: dict | None = None
+    ) -> Any:
+        """
+        Create optimization context for tracking.
+
+        Args:
+            dataset: The dataset being optimized
+            metric: The metric function
+            metadata: Additional metadata
+
+        Returns:
+            Optimization context manager
+        """
+        context_metadata = {
+            "optimizer": self.__class__.__name__,
+            "model": self.model,
+            "seed": self.seed,
+        }
+        if metadata:
+            context_metadata.update(metadata)
+
+        return optimization_context(
+            client=self.opik_client,
+            dataset_name=dataset.name,
+            objective_name=metric.__name__,
+            metadata=context_metadata,
+        )
+
     @abstractmethod
     def optimize_prompt(
         self,
@@ -74,6 +400,9 @@
         dataset: Dataset,
         metric: Callable,
         experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
         **kwargs: Any,
     ) -> optimization_result.OptimizationResult:
         """
@@ -100,9 +429,43 @@
         tool_name: str,
         second_pass: Any,
         experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
+        fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
+        fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
+        allow_tool_use_on_second_pass: bool = False,
         **kwargs: Any,
     ) -> optimization_result.OptimizationResult:
-        """
+        """
+        Optimize prompts that rely on MCP (Model Context Protocol) tooling.
+
+        This method provides a standardized interface for optimizing prompts that use
+        external tools through the MCP protocol. It handles tool invocation, second-pass
+        coordination, and fallback mechanisms.
+
+        Args:
+            prompt: The chat prompt to optimize, must include tools
+            dataset: Opik dataset containing evaluation data
+            metric: Evaluation function that takes (dataset_item, llm_output) and returns a score
+            tool_name: Name of the MCP tool to use for optimization
+            second_pass: MCPSecondPassCoordinator for handling second-pass tool calls
+            experiment_config: Optional configuration for the experiment
+            n_samples: Number of samples to use for optimization (default: None)
+            auto_continue: Whether to auto-continue optimization (default: False)
+            agent_class: Custom agent class to use (default: None)
+            fallback_invoker: Fallback function for tool invocation (default: None)
+            fallback_arguments: Function to extract tool arguments (default: None)
+            allow_tool_use_on_second_pass: Whether to allow tool use on second pass (default: False)
+            **kwargs: Additional arguments for optimization
+
+        Returns:
+            OptimizationResult: The optimization result containing the optimized prompt and metrics
+
+        Raises:
+            NotImplementedError: If the optimizer doesn't implement MCP optimization
+            ValueError: If the prompt doesn't include required tools
+        """
         raise NotImplementedError(
             f"{self.__class__.__name__} does not implement optimize_mcp yet."
        )
@@ -166,7 +529,7 @@
         self.agent_class: type[OptimizableAgent]
 
         if agent_class is None:
-            self.agent_class = create_litellm_agent_class(prompt)
+            self.agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
         else:
             self.agent_class = agent_class
 
@@ -181,18 +544,12 @@
         }
         return result
 
-        experiment_config =
-
-
-
-
-
-            "agent_config": prompt.to_dict(),
-            "metric": metric.__name__,
-            "dataset": dataset.name,
-            "configuration": {"prompt": (prompt.get_messages() if prompt else [])},
-            },
-        }
+        experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+        )
 
         if n_samples is not None:
             if dataset_item_ids is not None:
@@ -207,7 +564,7 @@
             metric=metric,
             evaluated_task=llm_task,
             num_threads=n_threads,
-            project_name=
+            project_name=experiment_config.get("project_name"),
             experiment_config=experiment_config,
             optimization_id=None,
             verbose=verbose,
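The experiment-config helpers added above layer `configuration_updates`, `additional_metadata`, and any caller-supplied `experiment_config` over a base config via `_deep_merge_dicts`, which recursively merges nested dicts instead of overwriting them. A standalone sketch of that merge behavior, re-implemented here for illustration:

```python
import copy
from typing import Any

# Standalone re-implementation of the recursive merge used by
# BaseOptimizer._deep_merge_dicts, for illustration only.
def deep_merge_dicts(base: dict[str, Any], overrides: dict[str, Any]) -> dict[str, Any]:
    result = copy.deepcopy(base)
    for key, value in overrides.items():
        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
            result[key] = deep_merge_dicts(result[key], value)  # recurse into nested dicts
        else:
            result[key] = value  # scalars and lists are replaced wholesale
    return result

base = {"configuration": {"prompt": ["..."], "tools": []}, "dataset": "d1"}
overrides = {"configuration": {"n_samples_for_eval": 10}}
merged = deep_merge_dicts(base, overrides)
# merged["configuration"] keeps "prompt" and "tools" and gains "n_samples_for_eval"
assert merged["configuration"]["n_samples_for_eval"] == 10
assert merged["configuration"]["prompt"] == ["..."]
```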
opik_optimizer/evolutionary_optimizer/evaluation_ops.py
CHANGED
@@ -1,10 +1,15 @@
-from typing import Any, TYPE_CHECKING
+from typing import Any, TYPE_CHECKING, cast
 from collections.abc import Callable
 
 
 from .. import task_evaluator
 from ..optimization_config import mappers, chat_prompt
+from ..mcp_utils.mcp_workflow import MCPExecutionConfig
 import opik
+import copy
+
+if TYPE_CHECKING:  # pragma: no cover - typing only
+    from ..base_optimizer import BaseOptimizer
 
 
 class EvaluationOps:
@@ -30,33 +35,91 @@ class EvaluationOps:
 
         new_prompt = prompt.copy()
         new_prompt.set_messages(messages)
+        tools = getattr(messages, "tools", None)
+        if tools is not None:
+            new_prompt.tools = copy.deepcopy(tools)
+
+        optimizer = cast("BaseOptimizer", self)
 
-
-
-        experiment_config = {
-            **experiment_config,
-            "optimizer": self.__class__.__name__,
-            "agent_class": self.agent_class.__name__,
-            "agent_config": new_prompt.to_dict(),
-            "metric": metric.__name__,
-            "dataset": dataset.name,
-            "configuration": {
-                "prompt": new_prompt.get_messages(),
+        configuration_updates = optimizer._drop_none(
+            {
                 "n_samples_for_eval": (
                     len(dataset_item_ids) if dataset_item_ids is not None else n_samples
                 ),
                 "total_dataset_items": total_items,
-            }
-
+            }
+        )
+        evaluation_details = optimizer._drop_none(
+            {
+                "dataset_item_ids": dataset_item_ids,
+                "optimization_id": optimization_id,
+            }
+        )
+        additional_metadata = (
+            {"evaluation": evaluation_details} if evaluation_details else None
+        )
+
+        experiment_config = optimizer._prepare_experiment_config(
+            prompt=new_prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+            configuration_updates=configuration_updates,
+            additional_metadata=additional_metadata,
+        )
         try:
             agent = self.agent_class(new_prompt)
         except Exception:
             return 0.0
 
+        mcp_execution_config: MCPExecutionConfig | None = kwargs.get("mcp_config")
+
         def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
             messages = new_prompt.get_messages(dataset_item)
-
-
+
+            if mcp_execution_config is None:
+                model_output = agent.invoke(messages)
+                return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
+
+            coordinator = mcp_execution_config.coordinator
+            coordinator.reset()
+
+            raw_model_output = agent.llm_invoke(
+                messages=messages,
+                seed=getattr(self, "seed", None),
+                allow_tool_use=True,
+            )
+
+            second_pass_messages = coordinator.build_second_pass_messages(
+                base_messages=messages,
+                dataset_item=dataset_item,
+            )
+
+            if (
+                second_pass_messages is None
+                and mcp_execution_config.fallback_invoker is not None
+            ):
+                fallback_args = mcp_execution_config.fallback_arguments(dataset_item)
+                if fallback_args:
+                    summary_override = mcp_execution_config.fallback_invoker(
+                        fallback_args
+                    )
+                    second_pass_messages = coordinator.build_second_pass_messages(
+                        base_messages=messages,
+                        dataset_item=dataset_item,
+                        summary_override=summary_override,
+                    )
+
+            if second_pass_messages is not None:
+                final_response = agent.llm_invoke(
+                    messages=second_pass_messages,
+                    seed=getattr(self, "seed", None),
+                    allow_tool_use=mcp_execution_config.allow_tool_use_on_second_pass,
+                )
+            else:
+                final_response = raw_model_output
+
+            return {mappers.EVALUATED_LLM_TASK_OUTPUT: final_response.strip()}
 
         score = task_evaluator.evaluate(
             dataset=dataset,
@@ -64,7 +127,7 @@ class EvaluationOps:
             metric=metric,
             evaluated_task=llm_task,
             num_threads=self.num_threads,
-            project_name=experiment_config
+            project_name=experiment_config.get("project_name"),
             n_samples=n_samples if dataset_item_ids is None else None,
             experiment_config=experiment_config,
             optimization_id=optimization_id,
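When an `MCPExecutionConfig` reaches the evaluator as `kwargs["mcp_config"]`, evaluation becomes a two-pass flow: a tool-enabled first call, a second pass built by the coordinator from the tool output, and an optional direct fallback invocation when the model never called the tool. A sketch of that control flow, with `agent` and `coordinator` as stand-ins for the library's agent instance and `MCPSecondPassCoordinator` (only the method names used in the diff above are assumed):

```python
# Hedged sketch of the two-pass MCP evaluation flow added in evaluation_ops.py.
# "agent" and "coordinator" are placeholders for the library objects; only the
# control flow shown in the diff is reproduced here.
from collections.abc import Callable
from typing import Any

def run_two_pass(
    agent: Any,
    coordinator: Any,
    messages: list[dict[str, str]],
    dataset_item: dict[str, Any],
    fallback_arguments: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
    fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
    allow_tool_use_on_second_pass: bool = False,
    seed: int | None = None,
) -> str:
    coordinator.reset()
    # First pass: let the model call the MCP tool.
    first_response = agent.llm_invoke(messages=messages, seed=seed, allow_tool_use=True)
    second_pass_messages = coordinator.build_second_pass_messages(
        base_messages=messages, dataset_item=dataset_item
    )
    # Fallback: the model never called the tool, so invoke it directly
    # and hand the summary back to the coordinator.
    if second_pass_messages is None and fallback_invoker and fallback_arguments:
        args = fallback_arguments(dataset_item)
        if args:
            second_pass_messages = coordinator.build_second_pass_messages(
                base_messages=messages,
                dataset_item=dataset_item,
                summary_override=fallback_invoker(args),
            )
    if second_pass_messages is None:
        return first_response  # no tool output to fold back in
    # Second pass: answer again with the tool output injected.
    return agent.llm_invoke(
        messages=second_pass_messages,
        seed=seed,
        allow_tool_use=allow_tool_use_on_second_pass,
    )
```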