opik-optimizer 2.1.3__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
- opik_optimizer/__init__.py +0 -2
- opik_optimizer/base_optimizer.py +314 -145
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
- opik_optimizer/gepa_optimizer/reporting.py +164 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +42 -15
- opik_optimizer/utils/core.py +16 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +34 -35
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
opik_optimizer/__init__.py
CHANGED
@@ -15,7 +15,6 @@ from .few_shot_bayesian_optimizer import FewShotBayesianOptimizer
 from .gepa_optimizer import GepaOptimizer
 from .logging_config import setup_logging
 from .meta_prompt_optimizer import MetaPromptOptimizer
-from .mipro_optimizer import MiproOptimizer
 from .hierarchical_reflective_optimizer import HierarchicalReflectiveOptimizer
 from .optimization_config.configs import TaskConfig
 from .optimization_result import OptimizationResult
@@ -40,7 +39,6 @@ __all__ = [
     "FewShotBayesianOptimizer",
     "GepaOptimizer",
     "MetaPromptOptimizer",
-    "MiproOptimizer",
     "EvolutionaryOptimizer",
     "HierarchicalReflectiveOptimizer",
     "ParameterOptimizer",
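The two hunks above remove MiproOptimizer from the package root. A minimal sketch of how imports are affected in 2.2.0 (assuming the remaining top-level exports are otherwise unchanged):

    # Worked in 2.1.3; raises ImportError in 2.2.0 because the re-export (and the
    # mipro_optimizer.py module, removed below) are gone:
    # from opik_optimizer import MiproOptimizer

    # Remaining top-level exports continue to work:
    from opik_optimizer import MetaPromptOptimizer, EvolutionaryOptimizer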
opik_optimizer/base_optimizer.py
CHANGED
@@ -13,14 +13,14 @@ import importlib.metadata
 import litellm
 from opik.rest_api.core import ApiError
 from opik.api_objects import optimization
-from opik import Dataset
+from opik import Dataset, opik_context
 from pydantic import BaseModel

 from . import _throttle, optimization_result
 from .cache_config import initialize_cache
 from .optimization_config import chat_prompt, mappers
 from .optimizable_agent import OptimizableAgent
-from .utils import create_litellm_agent_class
+from .utils import create_litellm_agent_class
 from . import task_evaluator

 _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -56,20 +56,24 @@ class BaseOptimizer(ABC):
         model: str,
         verbose: int = 1,
         seed: int = 42,
-
+        model_parameters: dict[str, Any] | None = None,
     ) -> None:
         """
         Base class for optimizers.

         Args:
-            model: LiteLLM model name
-            verbose: Controls internal logging/progress bars (0=off, 1=on)
-            seed: Random seed for reproducibility
-
+            model: LiteLLM model name for optimizer's internal reasoning/generation calls
+            verbose: Controls internal logging/progress bars (0=off, 1=on)
+            seed: Random seed for reproducibility
+            model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+                Common params: temperature, max_tokens, max_completion_tokens, top_p,
+                presence_penalty, frequency_penalty.
+                See: https://docs.litellm.ai/docs/completion/input
+                Note: These params control the optimizer's reasoning model, NOT the prompt evaluation.
         """
         self.model = model
         self.reasoning_model = model
-        self.
+        self.model_parameters = model_parameters or {}
         self.verbose = verbose
         self.seed = seed
         self._history: list[OptimizationRound] = []
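The constructor change above introduces model_parameters. A short usage sketch, assuming a concrete subclass such as MetaPromptOptimizer forwards these keyword arguments to BaseOptimizer.__init__ unchanged:

    from opik_optimizer import MetaPromptOptimizer

    # model_parameters applies to the optimizer's own internal reasoning calls,
    # not to the evaluation of the prompt being optimized (see docstring above).
    optimizer = MetaPromptOptimizer(
        model="openai/gpt-4o-mini",  # illustrative model name
        model_parameters={"temperature": 0.2, "max_tokens": 1024},
        seed=42,
        verbose=1,
    )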
@@ -77,20 +81,22 @@ class BaseOptimizer(ABC):
         self.llm_call_counter = 0
         self.tool_call_counter = 0
         self._opik_client = None  # Lazy initialization
+        self.current_optimization_id: str | None = None  # Track current optimization
+        self.project_name: str = "Optimization"  # Default project name

         # Initialize shared cache
         initialize_cache()

-    def
+    def _reset_counters(self) -> None:
         """Reset all call counters for a new optimization run."""
         self.llm_call_counter = 0
         self.tool_call_counter = 0

-    def
+    def _increment_llm_counter(self) -> None:
         """Increment the LLM call counter."""
         self.llm_call_counter += 1

-    def
+    def _increment_tool_counter(self) -> None:
         """Increment the tool call counter."""
         self.tool_call_counter += 1

@@ -100,7 +106,7 @@ class BaseOptimizer(ABC):
         Should be called when the optimizer is no longer needed.
         """
         # Reset counters
-        self.
+        self._reset_counters()

         # Clear history to free memory
         self._history.clear()
@@ -129,7 +135,7 @@ class BaseOptimizer(ABC):
             self._opik_client = opik.Opik()
         return self._opik_client

-    def
+    def _validate_optimization_inputs(
         self, prompt: "chat_prompt.ChatPrompt", dataset: "Dataset", metric: Callable
     ) -> None:
         """
@@ -154,7 +160,7 @@ class BaseOptimizer(ABC):
                 "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
             )

-    def
+    def _setup_agent_class(
         self, prompt: "chat_prompt.ChatPrompt", agent_class: Any = None
     ) -> Any:
         """
@@ -172,19 +178,288 @@ class BaseOptimizer(ABC):
         else:
             return agent_class

-    def
+    def _extract_tool_prompts(
+        self, tools: list[dict[str, Any]] | None
+    ) -> dict[str, str] | None:
         """
-
+        Extract tool names and descriptions from tools list.

         Args:
-
+            tools: List of tool definitions in OpenAI/LiteLLM format
+
+        Returns:
+            Dictionary mapping tool names to descriptions, or None if no tools
         """
-
-
-
-
-
-
+        if not tools:
+            return None
+
+        return {
+            (tool.get("function", {}).get("name") or f"tool_{idx}"): tool.get(
+                "function", {}
+            ).get("description", "")
+            for idx, tool in enumerate(tools)
+        }
+
+    # ------------------------------------------------------------------
+    # LLM call methods
+    # ------------------------------------------------------------------
+
+    def _prepare_model_params(
+        self,
+        call_time_params: dict[str, Any],
+        response_model: type[BaseModel] | None = None,
+        is_reasoning: bool = False,
+    ) -> dict[str, Any]:
+        """
+        Prepare parameters for LiteLLM call by merging and adding monitoring.
+
+        Args:
+            call_time_params: Dict of LiteLLM params from call-time overrides
+            response_model: Optional Pydantic model for structured output
+            is_reasoning: Flag for metadata tagging
+
+        Returns:
+            Dictionary ready for litellm.completion/acompletion
+        """
+        from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
+
+        # Merge optimizer's model_parameters with call-time overrides
+        merged_params = {**self.model_parameters, **call_time_params}
+
+        # Add Opik monitoring wrapper
+        final_params = opik_litellm_monitor.try_add_opik_monitoring_to_params(
+            merged_params
+        )
+
+        # Add reasoning metadata if applicable
+        if is_reasoning and "metadata" in final_params:
+            if "opik_call_type" not in final_params["metadata"]:
+                final_params["metadata"]["opik_call_type"] = "reasoning"
+
+        # Configure project_name and tags for Opik tracing
+        if "metadata" not in final_params:
+            final_params["metadata"] = {}
+        if "opik" not in final_params["metadata"]:
+            final_params["metadata"]["opik"] = {}
+
+        # Set project name for optimizer reasoning calls
+        final_params["metadata"]["opik"]["project_name"] = self.project_name
+
+        # Add tags if optimization_id is available
+        if self.current_optimization_id:
+            final_params["metadata"]["opik"]["tags"] = [
+                self.current_optimization_id,
+                "Prompt Optimization",
+            ]
+
+        # Add structured output support
+        if response_model is not None:
+            final_params["response_format"] = response_model
+
+        return final_params
+
+    def _parse_response(
+        self,
+        response: Any,
+        response_model: type[BaseModel] | None = None,
+    ) -> BaseModel | str:
+        """
+        Parse LiteLLM response, with optional structured output parsing.
+
+        Args:
+            response: The response from litellm.completion/acompletion
+            response_model: Optional Pydantic model for structured output
+
+        Returns:
+            If response_model is provided, returns an instance of that model.
+            Otherwise, returns the raw string response.
+        """
+        content = response.choices[0].message.content
+
+        # When using structured outputs with Pydantic models, LiteLLM automatically
+        # parses the response. Parse the JSON string into the Pydantic model
+        if response_model is not None:
+            return response_model.model_validate_json(content)
+
+        return content
+
+    def _build_call_time_params(
+        self,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        max_completion_tokens: int | None = None,
+        top_p: float | None = None,
+        presence_penalty: float | None = None,
+        frequency_penalty: float | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        """
+        Build dictionary of call-time LiteLLM parameter overrides.
+
+        Args:
+            temperature: Sampling temperature (0-2)
+            max_tokens: Maximum tokens to generate
+            max_completion_tokens: Upper bound for generated tokens
+            top_p: Nucleus sampling probability mass
+            presence_penalty: Penalty for new tokens based on presence
+            frequency_penalty: Penalty for new tokens based on frequency
+            metadata: Optional metadata dict for monitoring
+
+        Returns:
+            Dictionary of non-None parameters for LiteLLM
+        """
+        call_time_params: dict[str, Any] = {}
+        if temperature is not None:
+            call_time_params["temperature"] = temperature
+        if max_tokens is not None:
+            call_time_params["max_tokens"] = max_tokens
+        if max_completion_tokens is not None:
+            call_time_params["max_completion_tokens"] = max_completion_tokens
+        if top_p is not None:
+            call_time_params["top_p"] = top_p
+        if presence_penalty is not None:
+            call_time_params["presence_penalty"] = presence_penalty
+        if frequency_penalty is not None:
+            call_time_params["frequency_penalty"] = frequency_penalty
+        if metadata is not None:
+            call_time_params["metadata"] = metadata
+        return call_time_params
+
+    @_throttle.rate_limited(_limiter)
+    def _call_model(
+        self,
+        messages: list[dict[str, str]],
+        model: str | None = None,
+        seed: int | None = None,
+        response_model: type[BaseModel] | None = None,
+        is_reasoning: bool = False,
+        # Explicit call-time overrides for LiteLLM params
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        max_completion_tokens: int | None = None,
+        top_p: float | None = None,
+        presence_penalty: float | None = None,
+        frequency_penalty: float | None = None,
+        # Optimizer-specific metadata (not passed to LiteLLM)
+        optimization_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> BaseModel | str:
+        """
+        Call the LLM model with optional structured output.
+
+        Args:
+            messages: List of message dictionaries with 'role' and 'content' keys
+            model: The model to use (defaults to self.model)
+            seed: Random seed for reproducibility (defaults to self.seed)
+            response_model: Optional Pydantic model for structured output
+            is_reasoning: Flag for metadata tagging (not passed to LiteLLM)
+            temperature: Sampling temperature (0-2)
+            max_tokens: Maximum tokens to generate
+            max_completion_tokens: Upper bound for generated tokens
+            top_p: Nucleus sampling probability mass
+            presence_penalty: Penalty for new tokens based on presence
+            frequency_penalty: Penalty for new tokens based on frequency
+            optimization_id: Optional ID for optimization tracking (metadata only)
+            metadata: Optional metadata dict for monitoring
+
+        Returns:
+            If response_model is provided, returns an instance of that model.
+            Otherwise, returns the raw string response.
+        """
+        self._increment_llm_counter()
+
+        # Build dict of call-time LiteLLM parameter overrides (non-None only)
+        call_time_params = self._build_call_time_params(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            max_completion_tokens=max_completion_tokens,
+            top_p=top_p,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            metadata=metadata,
+        )
+
+        final_params_for_litellm = self._prepare_model_params(
+            call_time_params, response_model, is_reasoning
+        )
+
+        response = litellm.completion(
+            model=model or self.model,
+            messages=messages,
+            seed=seed if seed is not None else self.seed,
+            num_retries=6,
+            **final_params_for_litellm,
+        )
+
+        return self._parse_response(response, response_model)
+
+    @_throttle.rate_limited(_limiter)
+    async def _call_model_async(
+        self,
+        messages: list[dict[str, str]],
+        model: str | None = None,
+        seed: int | None = None,
+        response_model: type[BaseModel] | None = None,
+        is_reasoning: bool = False,
+        # Explicit call-time overrides for LiteLLM params
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+        max_completion_tokens: int | None = None,
+        top_p: float | None = None,
+        presence_penalty: float | None = None,
+        frequency_penalty: float | None = None,
+        # Optimizer-specific metadata (not passed to LiteLLM)
+        optimization_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> BaseModel | str:
+        """
+        Async version of _call_model using litellm.acompletion.
+
+        Args:
+            messages: List of message dictionaries with 'role' and 'content' keys
+            model: The model to use (defaults to self.model)
+            seed: Random seed for reproducibility (defaults to self.seed)
+            response_model: Optional Pydantic model for structured output
+            is_reasoning: Flag for metadata tagging (not passed to LiteLLM)
+            temperature: Sampling temperature (0-2)
+            max_tokens: Maximum tokens to generate
+            max_completion_tokens: Upper bound for generated tokens
+            top_p: Nucleus sampling probability mass
+            presence_penalty: Penalty for new tokens based on presence
+            frequency_penalty: Penalty for new tokens based on frequency
+            optimization_id: Optional ID for optimization tracking (metadata only)
+            metadata: Optional metadata dict for monitoring
+
+        Returns:
+            If response_model is provided, returns an instance of that model.
+            Otherwise, returns the raw string response.
+        """
+        self._increment_llm_counter()
+
+        # Build dict of call-time LiteLLM parameter overrides (non-None only)
+        call_time_params = self._build_call_time_params(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            max_completion_tokens=max_completion_tokens,
+            top_p=top_p,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            metadata=metadata,
+        )
+
+        final_params_for_litellm = self._prepare_model_params(
+            call_time_params, response_model, is_reasoning
+        )
+
+        response = await litellm.acompletion(
+            model=model or self.model,
+            messages=messages,
+            seed=seed if seed is not None else self.seed,
+            num_retries=6,
+            **final_params_for_litellm,
+        )
+
+        return self._parse_response(response, response_model)

     # ------------------------------------------------------------------
     # Experiment metadata helpers
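The new _call_model and _call_model_async helpers merge the constructor-level model_parameters with per-call overrides via _build_call_time_params and _prepare_model_params. A self-contained sketch of the resulting precedence (call-time values win because they are unpacked last in the merge shown above; values below are illustrative):

    constructor_defaults = {"temperature": 0.0, "max_tokens": 512}  # plays the role of self.model_parameters
    call_time_overrides = {"temperature": 0.7}                      # non-None args collected by _build_call_time_params

    merged = {**constructor_defaults, **call_time_overrides}        # same merge as in _prepare_model_params
    assert merged == {"temperature": 0.7, "max_tokens": 512}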
@@ -292,7 +567,7 @@ class BaseOptimizer(ABC):
             "name": self.__class__.__name__,
             "version": _OPTIMIZER_VERSION,
             "model": self.model,
-            "
+            "model_parameters": self.model_parameters or None,
             "seed": getattr(self, "seed", None),
             "num_threads": getattr(self, "num_threads", None),
         }
@@ -362,39 +637,10 @@ class BaseOptimizer(ABC):
         base_config = self._deep_merge_dicts(base_config, additional_metadata)

         if experiment_config:
-            base_config = self._deep_merge_dicts(
+            base_config = self._deep_merge_dicts(experiment_config, base_config)

         return self._drop_none(base_config)

-    def create_optimization_context(
-        self, dataset: "Dataset", metric: Callable, metadata: dict | None = None
-    ) -> Any:
-        """
-        Create optimization context for tracking.
-
-        Args:
-            dataset: The dataset being optimized
-            metric: The metric function
-            metadata: Additional metadata
-
-        Returns:
-            Optimization context manager
-        """
-        context_metadata = {
-            "optimizer": self.__class__.__name__,
-            "model": self.model,
-            "seed": self.seed,
-        }
-        if metadata:
-            context_metadata.update(metadata)
-
-        return optimization_context(
-            client=self.opik_client,
-            dataset_name=dataset.name,
-            objective_name=metric.__name__,
-            metadata=context_metadata,
-        )
-
     @abstractmethod
     def optimize_prompt(
         self,
@@ -405,6 +651,8 @@ class BaseOptimizer(ABC):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        *args: Any,
         **kwargs: Any,
     ) -> optimization_result.OptimizationResult:
         """
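The abstract optimize_prompt signature now accepts project_name (and catch-all *args). An illustrative call sketch; the names my_prompt, my_dataset, and accuracy_metric are hypothetical stand-ins for objects created elsewhere, and only the keyword arguments mirror the signature shown above:

    def run_optimization(optimizer, my_prompt, my_dataset, accuracy_metric):
        # project_name is new in 2.2.0; traces for this run land in that Opik project
        return optimizer.optimize_prompt(
            prompt=my_prompt,
            dataset=my_dataset,
            metric=accuracy_metric,
            n_samples=50,
            project_name="prompt-tuning",
        )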
@@ -418,93 +666,11 @@ class BaseOptimizer(ABC):
             input_key: input field of dataset
             output_key: output field of dataset
             experiment_config: Optional configuration for the experiment
+            project_name: Opik project name for logging traces (default: "Optimization")
             **kwargs: Additional arguments for optimization
         """
         pass

-    def optimize_mcp(
-        self,
-        prompt: "chat_prompt.ChatPrompt",
-        dataset: Dataset,
-        metric: Callable,
-        *,
-        tool_name: str,
-        second_pass: Any,
-        experiment_config: dict | None = None,
-        n_samples: int | None = None,
-        auto_continue: bool = False,
-        agent_class: type[OptimizableAgent] | None = None,
-        fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
-        fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
-        allow_tool_use_on_second_pass: bool = False,
-        **kwargs: Any,
-    ) -> optimization_result.OptimizationResult:
-        """
-        Optimize prompts that rely on MCP (Model Context Protocol) tooling.
-
-        This method provides a standardized interface for optimizing prompts that use
-        external tools through the MCP protocol. It handles tool invocation, second-pass
-        coordination, and fallback mechanisms.
-
-        Args:
-            prompt: The chat prompt to optimize, must include tools
-            dataset: Opik dataset containing evaluation data
-            metric: Evaluation function that takes (dataset_item, llm_output) and returns a score
-            tool_name: Name of the MCP tool to use for optimization
-            second_pass: MCPSecondPassCoordinator for handling second-pass tool calls
-            experiment_config: Optional configuration for the experiment
-            n_samples: Number of samples to use for optimization (default: None)
-            auto_continue: Whether to auto-continue optimization (default: False)
-            agent_class: Custom agent class to use (default: None)
-            fallback_invoker: Fallback function for tool invocation (default: None)
-            fallback_arguments: Function to extract tool arguments (default: None)
-            allow_tool_use_on_second_pass: Whether to allow tool use on second pass (default: False)
-            **kwargs: Additional arguments for optimization
-
-        Returns:
-            OptimizationResult: The optimization result containing the optimized prompt and metrics
-
-        Raises:
-            NotImplementedError: If the optimizer doesn't implement MCP optimization
-            ValueError: If the prompt doesn't include required tools
-        """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} does not implement optimize_mcp yet."
-        )
-
-    def optimize_parameter(
-        self,
-        prompt: "chat_prompt.ChatPrompt",
-        dataset: Dataset,
-        metric: Callable,
-        parameter_space: Any,
-        experiment_config: dict | None = None,
-        n_trials: int | None = None,
-        n_samples: int | None = None,
-        agent_class: type[OptimizableAgent] | None = None,
-        **kwargs: Any,
-    ) -> optimization_result.OptimizationResult:
-        """
-        Optimize LLM call parameters such as temperature or top_k.
-
-        Args:
-            prompt: The chat prompt to evaluate with tuned parameters
-            dataset: Dataset providing evaluation examples
-            metric: Objective function to maximize
-            parameter_space: Definition of the search space for tunable parameters
-            experiment_config: Optional experiment metadata
-            n_trials: Number of trials to run (optimizer specific default if None)
-            n_samples: Number of dataset samples to evaluate per trial (None for all)
-            agent_class: Optional custom agent class to execute evaluations
-            **kwargs: Additional optimizer specific settings
-
-        Returns:
-            OptimizationResult: Structured result describing the best parameters found
-        """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} does not implement optimize_parameter yet."
-        )
-
     def get_history(self) -> list[OptimizationRound]:
         """
         Get the optimization history.
@@ -523,7 +689,7 @@ class BaseOptimizer(ABC):
         """
         self._history.append(round_data)

-    def
+    def _update_optimization(
         self, optimization: optimization.Optimization, status: str
     ) -> None:
         """
@@ -556,11 +722,6 @@ class BaseOptimizer(ABC):
     ) -> float:
         random.seed(seed)

-        if prompt.model is None:
-            prompt.model = self.model
-        if prompt.model_kwargs is None:
-            prompt.model_kwargs = self.model_kwargs
-
         self.agent_class: type[OptimizableAgent]

         if agent_class is None:
@@ -574,6 +735,13 @@ class BaseOptimizer(ABC):
             messages = prompt.get_messages(dataset_item)
             raw_model_output = agent.invoke(messages)
             cleaned_model_output = raw_model_output.strip()
+
+            # Add tags to trace for optimization tracking
+            if self.current_optimization_id:
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
             }
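The addition above tags each evaluation trace with the optimization id via opik_context.update_current_trace. A minimal sketch of the same pattern outside the optimizer, assuming the call happens inside a tracked context (here a function decorated with @opik.track, so a current trace exists); the tag value is a placeholder:

    import opik
    from opik import opik_context

    @opik.track
    def evaluate_item(item: dict) -> str:
        output = "model output goes here"  # placeholder for the agent call
        # Tag the active trace so runs can be filtered by optimization later
        opik_context.update_current_trace(tags=["<optimization-id>", "Evaluation"])
        return output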
@@ -591,6 +759,7 @@ class BaseOptimizer(ABC):
                 raise Exception("Can't use n_samples and dataset_item_ids")

             all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
+            n_samples = min(n_samples, len(all_ids))
             dataset_item_ids = random.sample(all_ids, n_samples)

         score = task_evaluator.evaluate(
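The added clamp guards the subsequent random.sample call: random.sample raises ValueError when the requested sample size exceeds the population, so capping n_samples at the dataset size makes oversized requests safe. A self-contained illustration:

    import random

    all_ids = ["a", "b", "c"]
    n_samples = 10                             # caller asked for more items than exist
    n_samples = min(n_samples, len(all_ids))   # the clamp added in this hunk
    sampled = random.sample(all_ids, n_samples)  # returns all 3 ids instead of raising ValueError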
@@ -599,9 +768,9 @@ class BaseOptimizer(ABC):
             metric=metric,
             evaluated_task=llm_task,
             num_threads=n_threads,
-            project_name=
+            project_name=self.project_name,
             experiment_config=experiment_config,
-            optimization_id=
+            optimization_id=self.current_optimization_id,
             verbose=verbose,
         )
         return score
opik_optimizer/evolutionary_optimizer/crossover_ops.py
CHANGED
@@ -89,7 +89,20 @@ class CrossoverOps:
         else:
             pass

-
+        child1 = creator.Individual(messages_1_orig)
+        child2 = creator.Individual(messages_2_orig)
+
+        # Preserve tools and function_map from parents
+        if hasattr(ind1, "tools"):
+            setattr(child1, "tools", getattr(ind1, "tools"))
+        if hasattr(ind1, "function_map"):
+            setattr(child1, "function_map", getattr(ind1, "function_map"))
+        if hasattr(ind2, "tools"):
+            setattr(child2, "tools", getattr(ind2, "tools"))
+        if hasattr(ind2, "function_map"):
+            setattr(child2, "function_map", getattr(ind2, "function_map"))
+
+        return child1, child2

     def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
         """Perform crossover by asking an LLM to blend two parent prompts."""
@@ -151,9 +164,23 @@ class CrossoverOps:
                 raise ValueError("LLM response did not include any valid child prompts")

             # We only need two children; if only one returned, duplicate pattern from DEAP
-
-
-
+            first_child_messages = children[0]
+            second_child_messages = children[1] if len(children) > 1 else children[0]
+
+            child1 = creator.Individual(first_child_messages)
+            child2 = creator.Individual(second_child_messages)
+
+            # Preserve tools and function_map from parents
+            if hasattr(ind1, "tools"):
+                setattr(child1, "tools", getattr(ind1, "tools"))
+            if hasattr(ind1, "function_map"):
+                setattr(child1, "function_map", getattr(ind1, "function_map"))
+            if hasattr(ind2, "tools"):
+                setattr(child2, "tools", getattr(ind2, "tools"))
+            if hasattr(ind2, "function_map"):
+                setattr(child2, "function_map", getattr(ind2, "function_map"))
+
+            return child1, child2
         except Exception as e:
             logger.warning(
                 f"LLM-driven crossover failed: {e}. Falling back to DEAP crossover."