opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +4 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +402 -28
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
- opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +154 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +30 -23
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +21 -16
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +22 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/utils/colbert.py +236 -0
- opik_optimizer/{utils.py → utils/core.py} +160 -33
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- opik_optimizer-2.0.0.dist-info/METADATA +345 -0
- opik_optimizer-2.0.0.dist-info/RECORD +74 -0
- opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.6.dist-info/METADATA +0 -181
- opik_optimizer-1.0.6.dist-info/RECORD +0 -50
- opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/__init__.py
CHANGED
@@ -12,8 +12,10 @@ from .optimizable_agent import OptimizableAgent
 from .optimization_config.chat_prompt import ChatPrompt
 from .base_optimizer import BaseOptimizer
 from .few_shot_bayesian_optimizer import FewShotBayesianOptimizer
+from .gepa_optimizer import GepaOptimizer
 from .logging_config import setup_logging
 from .meta_prompt_optimizer import MetaPromptOptimizer
+from .mipro_optimizer import MiproOptimizer
 from .optimization_config.configs import TaskConfig
 from .optimization_result import OptimizationResult

@@ -28,7 +30,9 @@ __all__ = [
     "BaseOptimizer",
     "ChatPrompt",
     "FewShotBayesianOptimizer",
+    "GepaOptimizer",
     "MetaPromptOptimizer",
+    "MiproOptimizer",
     "EvolutionaryOptimizer",
     "OptimizationResult",
     "OptimizableAgent",
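The two additions above mean GepaOptimizer and MiproOptimizer are importable from the package root in 2.0.0. A minimal sketch of that import surface, assuming an installed opik_optimizer 2.0.0 wheel (nothing beyond the exports listed in __all__ is taken from this diff):

# Hypothetical smoke test for the 2.0.0 top-level exports listed in __all__ above.
from opik_optimizer import (
    ChatPrompt,
    EvolutionaryOptimizer,
    FewShotBayesianOptimizer,
    GepaOptimizer,        # new export in 2.0.0
    MetaPromptOptimizer,
    MiproOptimizer,       # now re-exported from the package root
    OptimizableAgent,
    OptimizationResult,
)

# Each optimizer shares the BaseOptimizer constructor shown later in this diff:
# BaseOptimizer(model, verbose=1, seed=42, **model_kwargs).
for optimizer_cls in (GepaOptimizer, MiproOptimizer, MetaPromptOptimizer):
    print(optimizer_cls.__name__)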
opik_optimizer/_throttle.py
CHANGED
opik_optimizer/base_optimizer.py
CHANGED
@@ -1,9 +1,13 @@
-from typing import Any,
+from typing import Any, cast
+from collections.abc import Callable

+import copy
+import inspect
 import logging
 import time
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 import random
+import importlib.metadata


 import litellm
@@ -16,7 +20,7 @@ from . import _throttle, optimization_result
 from .cache_config import initialize_cache
 from .optimization_config import chat_prompt, mappers
 from .optimizable_agent import OptimizableAgent
-from .utils import create_litellm_agent_class
+from .utils import create_litellm_agent_class, optimization_context
 from . import task_evaluator

 _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -28,6 +32,12 @@ litellm.drop_params = True
 logger = logging.getLogger(__name__)


+try:
+    _OPTIMIZER_VERSION = importlib.metadata.version("opik_optimizer")
+except importlib.metadata.PackageNotFoundError:  # pragma: no cover - dev installs
+    _OPTIMIZER_VERSION = "unknown"
+
+
 class OptimizationRound(BaseModel):
     model_config = {"arbitrary_types_allowed": True}

@@ -40,11 +50,12 @@ class OptimizationRound(BaseModel):
     improvement: float


-class BaseOptimizer:
+class BaseOptimizer(ABC):
     def __init__(
         self,
         model: str,
         verbose: int = 1,
+        seed: int = 42,
         **model_kwargs: Any,
     ) -> None:
         """
@@ -53,26 +64,345 @@ class BaseOptimizer:
         Args:
             model: LiteLLM model name
             verbose: Controls internal logging/progress bars (0=off, 1=on).
+            seed: Random seed for reproducibility (default: 42)
             model_kwargs: additional args for model (eg, temperature)
         """
         self.model = model
         self.reasoning_model = model
         self.model_kwargs = model_kwargs
         self.verbose = verbose
-        self.
+        self.seed = seed
+        self._history: list[OptimizationRound] = []
         self.experiment_config = None
         self.llm_call_counter = 0
+        self.tool_call_counter = 0
+        self._opik_client = None  # Lazy initialization

         # Initialize shared cache
         initialize_cache()

+    def reset_counters(self) -> None:
+        """Reset all call counters for a new optimization run."""
+        self.llm_call_counter = 0
+        self.tool_call_counter = 0
+
+    def increment_llm_counter(self) -> None:
+        """Increment the LLM call counter."""
+        self.llm_call_counter += 1
+
+    def increment_tool_counter(self) -> None:
+        """Increment the tool call counter."""
+        self.tool_call_counter += 1
+
+    def cleanup(self) -> None:
+        """
+        Clean up resources and perform memory management.
+        Should be called when the optimizer is no longer needed.
+        """
+        # Reset counters
+        self.reset_counters()
+
+        # Clear history to free memory
+        self._history.clear()
+
+        # Clear Opik client if it exists
+        if self._opik_client is not None:
+            # Note: Opik client doesn't have explicit cleanup, but we can clear the reference
+            self._opik_client = None
+
+        logger.debug(f"Cleaned up resources for {self.__class__.__name__}")
+
+    def __del__(self) -> None:
+        """Destructor to ensure cleanup is called."""
+        try:
+            self.cleanup()
+        except Exception:
+            # Ignore exceptions during cleanup in destructor
+            pass
+
+    @property
+    def opik_client(self) -> Any:
+        """Lazy initialization of Opik client."""
+        if self._opik_client is None:
+            import opik
+
+            self._opik_client = opik.Opik()
+        return self._opik_client
+
+    def validate_optimization_inputs(
+        self, prompt: "chat_prompt.ChatPrompt", dataset: "Dataset", metric: Callable
+    ) -> None:
+        """
+        Validate common optimization inputs.
+
+        Args:
+            prompt: The chat prompt to validate
+            dataset: The dataset to validate
+            metric: The metric function to validate
+
+        Raises:
+            ValueError: If any input is invalid
+        """
+        if not isinstance(prompt, chat_prompt.ChatPrompt):
+            raise ValueError("Prompt must be a ChatPrompt object")
+
+        if not isinstance(dataset, Dataset):
+            raise ValueError("Dataset must be a Dataset object")
+
+        if not callable(metric):
+            raise ValueError(
+                "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+            )
+
+    def setup_agent_class(
+        self, prompt: "chat_prompt.ChatPrompt", agent_class: Any = None
+    ) -> Any:
+        """
+        Setup agent class for optimization.
+
+        Args:
+            prompt: The chat prompt
+            agent_class: Optional custom agent class
+
+        Returns:
+            The agent class to use
+        """
+        if agent_class is None:
+            return create_litellm_agent_class(prompt, optimizer_ref=self)
+        else:
+            return agent_class
+
+    def configure_prompt_model(self, prompt: "chat_prompt.ChatPrompt") -> None:
+        """
+        Configure prompt model and model_kwargs if not set.
+
+        Args:
+            prompt: The chat prompt to configure
+        """
+        # Only configure if prompt is a valid ChatPrompt object
+        if hasattr(prompt, "model") and hasattr(prompt, "model_kwargs"):
+            if prompt.model is None:
+                prompt.model = self.model
+            if prompt.model_kwargs is None:
+                prompt.model_kwargs = self.model_kwargs
+
+    # ------------------------------------------------------------------
+    # Experiment metadata helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _drop_none(metadata: dict[str, Any]) -> dict[str, Any]:
+        return {k: v for k, v in metadata.items() if v is not None}
+
+    @staticmethod
+    def _deep_merge_dicts(
+        base: dict[str, Any], overrides: dict[str, Any]
+    ) -> dict[str, Any]:
+        result = copy.deepcopy(base)
+        for key, value in overrides.items():
+            if (
+                key in result
+                and isinstance(result[key], dict)
+                and isinstance(value, dict)
+            ):
+                result[key] = BaseOptimizer._deep_merge_dicts(result[key], value)
+            else:
+                result[key] = value
+        return result
+
+    @staticmethod
+    def _serialize_tools(prompt: "chat_prompt.ChatPrompt") -> list[dict[str, Any]]:
+        tools_obj = getattr(prompt, "tools", None)
+        if not isinstance(tools_obj, list):
+            return []
+
+        try:
+            return copy.deepcopy(cast(list[dict[str, Any]], tools_obj))
+        except Exception:  # pragma: no cover - defensive
+            serialized_tools: list[dict[str, Any]] = []
+            for tool in tools_obj:
+                if isinstance(tool, dict):
+                    serialized_tools.append({k: v for k, v in tool.items() if k})
+            return serialized_tools
+
+    @staticmethod
+    def _describe_annotation(annotation: Any) -> str | None:
+        if annotation is inspect._empty:
+            return None
+        if isinstance(annotation, type):
+            return annotation.__name__
+        return str(annotation)
+
+    def _summarize_tool_signatures(
+        self, prompt: "chat_prompt.ChatPrompt"
+    ) -> list[dict[str, Any]]:
+        signatures: list[dict[str, Any]] = []
+        for name, func in getattr(prompt, "function_map", {}).items():
+            callable_obj = getattr(func, "__wrapped__", func)
+            try:
+                sig = inspect.signature(callable_obj)
+            except (TypeError, ValueError):  # pragma: no cover - defensive
+                signatures.append({"name": name, "signature": "unavailable"})
+                continue
+
+            params: list[dict[str, Any]] = []
+            for parameter in sig.parameters.values():
+                params.append(
+                    self._drop_none(
+                        {
+                            "name": parameter.name,
+                            "kind": parameter.kind.name,
+                            "annotation": self._describe_annotation(
+                                parameter.annotation
+                            ),
+                            "default": (
+                                None
+                                if parameter.default is inspect._empty
+                                else parameter.default
+                            ),
+                        }
+                    )
+                )
+
+            signatures.append(
+                self._drop_none(
+                    {
+                        "name": name,
+                        "parameters": params,
+                        "docstring": inspect.getdoc(callable_obj),
+                    }
+                )
+            )
+        return signatures
+
+    def _build_agent_config(self, prompt: "chat_prompt.ChatPrompt") -> dict[str, Any]:
+        agent_config: dict[str, Any] = dict(prompt.to_dict())
+        agent_config["project_name"] = getattr(prompt, "project_name", None)
+        agent_config["model"] = getattr(prompt, "model", None) or self.model
+        agent_config["tools"] = self._serialize_tools(prompt)
+        return self._drop_none(agent_config)
+
+    def get_optimizer_metadata(self) -> dict[str, Any]:
+        """Override in subclasses to expose optimizer-specific parameters."""
+        return {}
+
+    def _build_optimizer_metadata(self) -> dict[str, Any]:
+        metadata = {
+            "name": self.__class__.__name__,
+            "version": _OPTIMIZER_VERSION,
+            "model": self.model,
+            "model_kwargs": self.model_kwargs or None,
+            "seed": getattr(self, "seed", None),
+            "num_threads": getattr(self, "num_threads", None),
+        }
+
+        # n_threads is used by some optimizers instead of num_threads
+        if metadata["num_threads"] is None and hasattr(self, "n_threads"):
+            metadata["num_threads"] = getattr(self, "n_threads")
+
+        if hasattr(self, "reasoning_model"):
+            metadata["reasoning_model"] = getattr(self, "reasoning_model")
+
+        extra_parameters = self.get_optimizer_metadata()
+        if extra_parameters:
+            metadata["parameters"] = extra_parameters
+
+        return self._drop_none(metadata)
+
+    def _prepare_experiment_config(
+        self,
+        *,
+        prompt: "chat_prompt.ChatPrompt",
+        dataset: Dataset,
+        metric: Callable,
+        experiment_config: dict[str, Any] | None = None,
+        configuration_updates: dict[str, Any] | None = None,
+        additional_metadata: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        dataset_id = getattr(dataset, "id", None)
+        project_name = (
+            getattr(self.agent_class, "project_name", None)
+            if hasattr(self, "agent_class")
+            else None
+        )
+        if not project_name:
+            project_name = getattr(prompt, "project_name", None)
+        if not project_name:
+            project_name = self.__class__.__name__
+
+        base_config: dict[str, Any] = {
+            "project_name": project_name,
+            "agent_class": (
+                getattr(self.agent_class, "__name__", None)
+                if hasattr(self, "agent_class")
+                else None
+            ),
+            "agent_config": self._build_agent_config(prompt),
+            "metric": getattr(metric, "__name__", str(metric)),
+            "dataset": getattr(dataset, "name", None),
+            "dataset_id": dataset_id,
+            "optimizer_metadata": self._build_optimizer_metadata(),
+            "tool_signatures": self._summarize_tool_signatures(prompt),
+            "configuration": {
+                "prompt": prompt.get_messages(),
+                "prompt_name": getattr(prompt, "name", None),
+                "tools": self._serialize_tools(prompt),
+                "prompt_project_name": getattr(prompt, "project_name", None),
+            },
+        }
+
+        if configuration_updates:
+            base_config["configuration"] = self._deep_merge_dicts(
+                base_config["configuration"], configuration_updates
+            )
+
+        if additional_metadata:
+            base_config = self._deep_merge_dicts(base_config, additional_metadata)
+
+        if experiment_config:
+            base_config = self._deep_merge_dicts(base_config, experiment_config)
+
+        return self._drop_none(base_config)
+
+    def create_optimization_context(
+        self, dataset: "Dataset", metric: Callable, metadata: dict | None = None
+    ) -> Any:
+        """
+        Create optimization context for tracking.
+
+        Args:
+            dataset: The dataset being optimized
+            metric: The metric function
+            metadata: Additional metadata
+
+        Returns:
+            Optimization context manager
+        """
+        context_metadata = {
+            "optimizer": self.__class__.__name__,
+            "model": self.model,
+            "seed": self.seed,
+        }
+        if metadata:
+            context_metadata.update(metadata)
+
+        return optimization_context(
+            client=self.opik_client,
+            dataset_name=dataset.name,
+            objective_name=metric.__name__,
+            metadata=context_metadata,
+        )
+
     @abstractmethod
     def optimize_prompt(
         self,
         prompt: "chat_prompt.ChatPrompt",
         dataset: Dataset,
         metric: Callable,
-        experiment_config:
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
         **kwargs: Any,
     ) -> optimization_result.OptimizationResult:
         """
@@ -90,7 +420,57 @@ class BaseOptimizer:
         """
         pass

-    def
+    def optimize_mcp(
+        self,
+        prompt: "chat_prompt.ChatPrompt",
+        dataset: Dataset,
+        metric: Callable,
+        *,
+        tool_name: str,
+        second_pass: Any,
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        auto_continue: bool = False,
+        agent_class: type[OptimizableAgent] | None = None,
+        fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
+        fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
+        allow_tool_use_on_second_pass: bool = False,
+        **kwargs: Any,
+    ) -> optimization_result.OptimizationResult:
+        """
+        Optimize prompts that rely on MCP (Model Context Protocol) tooling.
+
+        This method provides a standardized interface for optimizing prompts that use
+        external tools through the MCP protocol. It handles tool invocation, second-pass
+        coordination, and fallback mechanisms.
+
+        Args:
+            prompt: The chat prompt to optimize, must include tools
+            dataset: Opik dataset containing evaluation data
+            metric: Evaluation function that takes (dataset_item, llm_output) and returns a score
+            tool_name: Name of the MCP tool to use for optimization
+            second_pass: MCPSecondPassCoordinator for handling second-pass tool calls
+            experiment_config: Optional configuration for the experiment
+            n_samples: Number of samples to use for optimization (default: None)
+            auto_continue: Whether to auto-continue optimization (default: False)
+            agent_class: Custom agent class to use (default: None)
+            fallback_invoker: Fallback function for tool invocation (default: None)
+            fallback_arguments: Function to extract tool arguments (default: None)
+            allow_tool_use_on_second_pass: Whether to allow tool use on second pass (default: False)
+            **kwargs: Additional arguments for optimization
+
+        Returns:
+            OptimizationResult: The optimization result containing the optimized prompt and metrics
+
+        Raises:
+            NotImplementedError: If the optimizer doesn't implement MCP optimization
+            ValueError: If the prompt doesn't include required tools
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not implement optimize_mcp yet."
+        )
+
     def get_history(self) -> list[OptimizationRound]:
         """
         Get the optimization history.
@@ -133,11 +513,11 @@ class BaseOptimizer:
         metric: Callable,
         n_threads: int,
         verbose: int = 1,
-        dataset_item_ids:
-        experiment_config:
-        n_samples:
-        seed:
-        agent_class:
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
+        n_samples: int | None = None,
+        seed: int | None = None,
+        agent_class: type[OptimizableAgent] | None = None,
     ) -> float:
         random.seed(seed)

@@ -146,16 +526,16 @@ class BaseOptimizer:
         if prompt.model_kwargs is None:
             prompt.model_kwargs = self.model_kwargs

-        self.agent_class:
+        self.agent_class: type[OptimizableAgent]

         if agent_class is None:
-            self.agent_class = create_litellm_agent_class(prompt)
+            self.agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
         else:
             self.agent_class = agent_class

         agent = self.agent_class(prompt)

-        def llm_task(dataset_item:
+        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
             messages = prompt.get_messages(dataset_item)
             raw_model_output = agent.invoke(messages)
             cleaned_model_output = raw_model_output.strip()
@@ -164,18 +544,12 @@ class BaseOptimizer:
             }
             return result

-        experiment_config =
-
-
-
-
-
-            "agent_config": prompt.to_dict(),
-            "metric": metric.__name__,
-            "dataset": dataset.name,
-            "configuration": {"prompt": (prompt.get_messages() if prompt else [])},
-            },
-        }
+        experiment_config = self._prepare_experiment_config(
+            prompt=prompt,
+            dataset=dataset,
+            metric=metric,
+            experiment_config=experiment_config,
+        )

         if n_samples is not None:
             if dataset_item_ids is not None:
@@ -190,7 +564,7 @@ class BaseOptimizer:
             metric=metric,
             evaluated_task=llm_task,
             num_threads=n_threads,
-            project_name=
+            project_name=experiment_config.get("project_name"),
             experiment_config=experiment_config,
             optimization_id=None,
             verbose=verbose,
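Taken together, the BaseOptimizer changes above (ABC base class, seed constructor argument, LLM/tool call counters, and the get_optimizer_metadata hook feeding _prepare_experiment_config) imply that a concrete optimizer in 2.0.0 looks roughly like the sketch below. This is an illustration assembled from the signatures in this diff, not code shipped in the package; MyOptimizer, its trivial behaviour, and the model name are hypothetical.

from typing import Any
from collections.abc import Callable

from opik_optimizer import BaseOptimizer, ChatPrompt, OptimizationResult


class MyOptimizer(BaseOptimizer):
    """Hypothetical subclass used only to illustrate the 2.0.0 BaseOptimizer surface."""

    def get_optimizer_metadata(self) -> dict[str, Any]:
        # Surfaces under optimizer_metadata["parameters"] in the experiment config.
        return {"strategy": "single-round"}

    def optimize_prompt(
        self,
        prompt: ChatPrompt,
        dataset: Any,                      # an opik Dataset in real use
        metric: Callable,
        experiment_config: dict | None = None,
        n_samples: int | None = None,
        auto_continue: bool = False,
        agent_class: type | None = None,
        **kwargs: Any,
    ) -> OptimizationResult:
        self.reset_counters()                        # counters are new in 2.0.0
        self.validate_optimization_inputs(prompt, dataset, metric)
        self.configure_prompt_model(prompt)          # copy model/model_kwargs onto the prompt
        self.increment_llm_counter()                 # would wrap each real LLM call
        raise NotImplementedError("sketch only - no real optimization loop here")


# seed is now a first-class constructor argument (default 42).
optimizer = MyOptimizer(model="openai/gpt-4o-mini", seed=7)
print(optimizer.seed, optimizer.llm_call_counter, optimizer.tool_call_counter)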
opik_optimizer/data/context7_eval.jsonl
ADDED
@@ -0,0 +1,3 @@
+{"id": "ctx-001", "user_query": "Using the Context7 library ID /vercel/next.js, how can I route users down different UI flows with the App Router?", "expected_tool": "get-library-docs", "arguments": {"context7CompatibleLibraryID": "/vercel/next.js", "topic": "routing", "tokens": 1500}, "reference_answer": "The App Router handles conditional experiences with parallel routes. Create directories that start with @ to declare each slot, provide a default.tsx so the route still renders when a branch is missing, and decide which slot to render inside your layout based on the user's state. This lets you show different UI branches without blocking navigation."}
+{"id": "ctx-002", "user_query": "With library ID /supabase/supabase, what do the docs recommend for keeping edge functions secure?", "expected_tool": "get-library-docs", "arguments": {"context7CompatibleLibraryID": "/supabase/supabase", "topic": "security", "tokens": 1200}, "reference_answer": "Supabase recommends enabling Row Level Security (RLS) on your Postgres tables so edge functions can only access data allowed by fine-grained policies. Run `alter table ... enable row level security;` (for example on the `todos` table) to enforce those policies and prevent unauthorized access."}
+{"id": "ctx-003", "user_query": "Given /mongodb/docs, remind me what makes up the basic aggregation pipeline.", "expected_tool": "get-library-docs", "arguments": {"context7CompatibleLibraryID": "/mongodb/docs", "topic": "aggregation", "tokens": 1000}, "reference_answer": "An aggregation pipeline runs ordered stages such as $match, $group, $project, $sort, and $limit. Each stage accepts the stream of documents from the previous stage so you can filter, reshape, and summarize the data step by step."}
opik_optimizer/datasets/context7_eval.py
ADDED
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from importlib import resources
+from typing import Any, Union
+
+try:  # pragma: no cover - optional dependency
+    import opik  # type: ignore
+except ImportError:  # pragma: no cover - fallback for tests
+    opik = None
+
+from opik_optimizer.utils.dataset_utils import attach_uuids, dataset_suffix
+
+OpikDataset = Any
+
+DATA_PACKAGE = "opik_optimizer.data"
+DATA_FILENAME = "context7_eval.jsonl"
+DATASET_NAME = "context7_eval"
+
+
+def _load_examples() -> list[dict[str, Any]]:
+    text = (
+        resources.files(DATA_PACKAGE)
+        .joinpath(DATA_FILENAME)
+        .read_text(encoding="utf-8")
+    )
+    return [json.loads(line) for line in text.splitlines() if line.strip()]
+
+
+def _dataset_name(test_mode: bool) -> str:
+    suffix = dataset_suffix(DATA_PACKAGE, DATA_FILENAME)
+    return f"{DATASET_NAME}_{suffix}{'_test' if test_mode else ''}"
+
+
+@dataclass
+class _ListDataset:
+    name: str
+    _items: list[dict[str, Any]]
+
+    def __post_init__(self) -> None:
+        for idx, item in enumerate(self._items):
+            item.setdefault("id", f"{self.name}-{idx}")
+        self.id = self.name
+
+    def copy(self) -> _ListDataset:
+        return _ListDataset(self.name, [dict(item) for item in self._items])
+
+    def get_items(self, nb_samples: int | None = None) -> list[dict[str, Any]]:
+        if nb_samples is None:
+            return [dict(item) for item in self._items]
+        return [dict(item) for item in self._items[:nb_samples]]
+
+
+DatasetResult = Union["_ListDataset", OpikDataset]
+
+
+def load_context7_dataset(test_mode: bool = False) -> DatasetResult:
+    """Return the context7 synthetic dataset as an Opik dataset when available."""
+
+    examples = _load_examples()
+    dataset_name = _dataset_name(test_mode)
+
+    if opik is None:
+        return _ListDataset(dataset_name, examples)
+
+    try:
+        client = opik.Opik()
+        dataset: OpikDataset = client.get_or_create_dataset(dataset_name)
+        items = dataset.get_items()
+        expected_len = len(examples) if not test_mode else min(len(examples), 2)
+
+        if len(items) == expected_len:
+            return dataset
+        if len(items) != 0:  # pragma: no cover - defensive path
+            raise ValueError(
+                f"Dataset {dataset_name} already exists with {len(items)} items. Delete it to regenerate."
+            )
+
+        if test_mode:
+            dataset.insert(attach_uuids(examples[:expected_len]))
+        else:
+            dataset.insert(attach_uuids(examples))
+        return dataset
+    except Exception:
+        # If Opik client fails (e.g., no API key configured), fall back to local dataset
+        return _ListDataset(dataset_name, examples)
+
+
+__all__ = ["load_context7_dataset"]
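For completeness, a short usage sketch of the new loader: load_context7_dataset falls back to the in-memory _ListDataset when the opik client is unavailable, so the snippet below should work with or without an Opik API key configured. The nb_samples argument mirrors the get_items signature shown above, and the printed fields come straight from context7_eval.jsonl.

from opik_optimizer.datasets.context7_eval import load_context7_dataset

# test_mode keeps the dataset small (at most 2 items when backed by Opik).
dataset = load_context7_dataset(test_mode=True)

for item in dataset.get_items(nb_samples=2):
    # Each record pairs a user query with the expected MCP tool, its arguments,
    # and a reference answer used for scoring.
    print(item["expected_tool"], "-", item["user_query"])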