opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. opik_optimizer/__init__.py +4 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +402 -28
  4. opik_optimizer/data/context7_eval.jsonl +3 -0
  5. opik_optimizer/datasets/context7_eval.py +90 -0
  6. opik_optimizer/datasets/tiny_test.py +33 -34
  7. opik_optimizer/datasets/truthful_qa.py +2 -2
  8. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  9. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
  10. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
  11. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  12. opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
  13. opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +154 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +30 -23
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +21 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +22 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/utils/colbert.py +236 -0
  46. opik_optimizer/{utils.py → utils/core.py} +160 -33
  47. opik_optimizer/utils/dataset_utils.py +49 -0
  48. opik_optimizer/utils/prompt_segments.py +186 -0
  49. opik_optimizer-2.0.0.dist-info/METADATA +345 -0
  50. opik_optimizer-2.0.0.dist-info/RECORD +74 -0
  51. opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
  52. opik_optimizer-1.0.6.dist-info/METADATA +0 -181
  53. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  54. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  55. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
  56. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
@@ -12,8 +12,10 @@ from .optimizable_agent import OptimizableAgent
12
12
  from .optimization_config.chat_prompt import ChatPrompt
13
13
  from .base_optimizer import BaseOptimizer
14
14
  from .few_shot_bayesian_optimizer import FewShotBayesianOptimizer
15
+ from .gepa_optimizer import GepaOptimizer
15
16
  from .logging_config import setup_logging
16
17
  from .meta_prompt_optimizer import MetaPromptOptimizer
18
+ from .mipro_optimizer import MiproOptimizer
17
19
  from .optimization_config.configs import TaskConfig
18
20
  from .optimization_result import OptimizationResult
19
21
 
@@ -28,7 +30,9 @@ __all__ = [
28
30
  "BaseOptimizer",
29
31
  "ChatPrompt",
30
32
  "FewShotBayesianOptimizer",
33
+ "GepaOptimizer",
31
34
  "MetaPromptOptimizer",
35
+ "MiproOptimizer",
32
36
  "EvolutionaryOptimizer",
33
37
  "OptimizationResult",
34
38
  "OptimizableAgent",
@@ -3,7 +3,8 @@ import pyrate_limiter
3
3
  import time
4
4
  import opik.config
5
5
 
6
- from typing import Callable, Any
6
+ from typing import Any
7
+ from collections.abc import Callable
7
8
 
8
9
 
9
10
  class RateLimiter:
@@ -1,9 +1,13 @@
1
- from typing import Any, Callable, Dict, List, Optional, Type
1
+ from typing import Any, cast
2
+ from collections.abc import Callable
2
3
 
4
+ import copy
5
+ import inspect
3
6
  import logging
4
7
  import time
5
- from abc import abstractmethod
8
+ from abc import ABC, abstractmethod
6
9
  import random
10
+ import importlib.metadata
7
11
 
8
12
 
9
13
  import litellm
@@ -16,7 +20,7 @@ from . import _throttle, optimization_result
16
20
  from .cache_config import initialize_cache
17
21
  from .optimization_config import chat_prompt, mappers
18
22
  from .optimizable_agent import OptimizableAgent
19
- from .utils import create_litellm_agent_class
23
+ from .utils import create_litellm_agent_class, optimization_context
20
24
  from . import task_evaluator
21
25
 
22
26
  _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -28,6 +32,12 @@ litellm.drop_params = True
28
32
  logger = logging.getLogger(__name__)
29
33
 
30
34
 
35
+ try:
36
+ _OPTIMIZER_VERSION = importlib.metadata.version("opik_optimizer")
37
+ except importlib.metadata.PackageNotFoundError: # pragma: no cover - dev installs
38
+ _OPTIMIZER_VERSION = "unknown"
39
+
40
+
31
41
  class OptimizationRound(BaseModel):
32
42
  model_config = {"arbitrary_types_allowed": True}
33
43
 
@@ -40,11 +50,12 @@ class OptimizationRound(BaseModel):
40
50
  improvement: float
41
51
 
42
52
 
43
- class BaseOptimizer:
53
+ class BaseOptimizer(ABC):
44
54
  def __init__(
45
55
  self,
46
56
  model: str,
47
57
  verbose: int = 1,
58
+ seed: int = 42,
48
59
  **model_kwargs: Any,
49
60
  ) -> None:
50
61
  """
@@ -53,26 +64,345 @@ class BaseOptimizer:
53
64
  Args:
54
65
  model: LiteLLM model name
55
66
  verbose: Controls internal logging/progress bars (0=off, 1=on).
67
+ seed: Random seed for reproducibility (default: 42)
56
68
  model_kwargs: additional args for model (eg, temperature)
57
69
  """
58
70
  self.model = model
59
71
  self.reasoning_model = model
60
72
  self.model_kwargs = model_kwargs
61
73
  self.verbose = verbose
62
- self._history: List[OptimizationRound] = []
74
+ self.seed = seed
75
+ self._history: list[OptimizationRound] = []
63
76
  self.experiment_config = None
64
77
  self.llm_call_counter = 0
78
+ self.tool_call_counter = 0
79
+ self._opik_client = None # Lazy initialization
65
80
 
66
81
  # Initialize shared cache
67
82
  initialize_cache()
68
83
 
84
+ def reset_counters(self) -> None:
85
+ """Reset all call counters for a new optimization run."""
86
+ self.llm_call_counter = 0
87
+ self.tool_call_counter = 0
88
+
89
+ def increment_llm_counter(self) -> None:
90
+ """Increment the LLM call counter."""
91
+ self.llm_call_counter += 1
92
+
93
+ def increment_tool_counter(self) -> None:
94
+ """Increment the tool call counter."""
95
+ self.tool_call_counter += 1
96
+
97
+ def cleanup(self) -> None:
98
+ """
99
+ Clean up resources and perform memory management.
100
+ Should be called when the optimizer is no longer needed.
101
+ """
102
+ # Reset counters
103
+ self.reset_counters()
104
+
105
+ # Clear history to free memory
106
+ self._history.clear()
107
+
108
+ # Clear Opik client if it exists
109
+ if self._opik_client is not None:
110
+ # Note: Opik client doesn't have explicit cleanup, but we can clear the reference
111
+ self._opik_client = None
112
+
113
+ logger.debug(f"Cleaned up resources for {self.__class__.__name__}")
114
+
115
+ def __del__(self) -> None:
116
+ """Destructor to ensure cleanup is called."""
117
+ try:
118
+ self.cleanup()
119
+ except Exception:
120
+ # Ignore exceptions during cleanup in destructor
121
+ pass
122
+
123
+ @property
124
+ def opik_client(self) -> Any:
125
+ """Lazy initialization of Opik client."""
126
+ if self._opik_client is None:
127
+ import opik
128
+
129
+ self._opik_client = opik.Opik()
130
+ return self._opik_client
131
+
132
+ def validate_optimization_inputs(
133
+ self, prompt: "chat_prompt.ChatPrompt", dataset: "Dataset", metric: Callable
134
+ ) -> None:
135
+ """
136
+ Validate common optimization inputs.
137
+
138
+ Args:
139
+ prompt: The chat prompt to validate
140
+ dataset: The dataset to validate
141
+ metric: The metric function to validate
142
+
143
+ Raises:
144
+ ValueError: If any input is invalid
145
+ """
146
+ if not isinstance(prompt, chat_prompt.ChatPrompt):
147
+ raise ValueError("Prompt must be a ChatPrompt object")
148
+
149
+ if not isinstance(dataset, Dataset):
150
+ raise ValueError("Dataset must be a Dataset object")
151
+
152
+ if not callable(metric):
153
+ raise ValueError(
154
+ "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
155
+ )
156
+
157
+ def setup_agent_class(
158
+ self, prompt: "chat_prompt.ChatPrompt", agent_class: Any = None
159
+ ) -> Any:
160
+ """
161
+ Setup agent class for optimization.
162
+
163
+ Args:
164
+ prompt: The chat prompt
165
+ agent_class: Optional custom agent class
166
+
167
+ Returns:
168
+ The agent class to use
169
+ """
170
+ if agent_class is None:
171
+ return create_litellm_agent_class(prompt, optimizer_ref=self)
172
+ else:
173
+ return agent_class
174
+
175
+ def configure_prompt_model(self, prompt: "chat_prompt.ChatPrompt") -> None:
176
+ """
177
+ Configure prompt model and model_kwargs if not set.
178
+
179
+ Args:
180
+ prompt: The chat prompt to configure
181
+ """
182
+ # Only configure if prompt is a valid ChatPrompt object
183
+ if hasattr(prompt, "model") and hasattr(prompt, "model_kwargs"):
184
+ if prompt.model is None:
185
+ prompt.model = self.model
186
+ if prompt.model_kwargs is None:
187
+ prompt.model_kwargs = self.model_kwargs
188
+
189
+ # ------------------------------------------------------------------
190
+ # Experiment metadata helpers
191
+ # ------------------------------------------------------------------
192
+
193
+ @staticmethod
194
+ def _drop_none(metadata: dict[str, Any]) -> dict[str, Any]:
195
+ return {k: v for k, v in metadata.items() if v is not None}
196
+
197
+ @staticmethod
198
+ def _deep_merge_dicts(
199
+ base: dict[str, Any], overrides: dict[str, Any]
200
+ ) -> dict[str, Any]:
201
+ result = copy.deepcopy(base)
202
+ for key, value in overrides.items():
203
+ if (
204
+ key in result
205
+ and isinstance(result[key], dict)
206
+ and isinstance(value, dict)
207
+ ):
208
+ result[key] = BaseOptimizer._deep_merge_dicts(result[key], value)
209
+ else:
210
+ result[key] = value
211
+ return result
212
+
213
+ @staticmethod
214
+ def _serialize_tools(prompt: "chat_prompt.ChatPrompt") -> list[dict[str, Any]]:
215
+ tools_obj = getattr(prompt, "tools", None)
216
+ if not isinstance(tools_obj, list):
217
+ return []
218
+
219
+ try:
220
+ return copy.deepcopy(cast(list[dict[str, Any]], tools_obj))
221
+ except Exception: # pragma: no cover - defensive
222
+ serialized_tools: list[dict[str, Any]] = []
223
+ for tool in tools_obj:
224
+ if isinstance(tool, dict):
225
+ serialized_tools.append({k: v for k, v in tool.items() if k})
226
+ return serialized_tools
227
+
228
+ @staticmethod
229
+ def _describe_annotation(annotation: Any) -> str | None:
230
+ if annotation is inspect._empty:
231
+ return None
232
+ if isinstance(annotation, type):
233
+ return annotation.__name__
234
+ return str(annotation)
235
+
236
+ def _summarize_tool_signatures(
237
+ self, prompt: "chat_prompt.ChatPrompt"
238
+ ) -> list[dict[str, Any]]:
239
+ signatures: list[dict[str, Any]] = []
240
+ for name, func in getattr(prompt, "function_map", {}).items():
241
+ callable_obj = getattr(func, "__wrapped__", func)
242
+ try:
243
+ sig = inspect.signature(callable_obj)
244
+ except (TypeError, ValueError): # pragma: no cover - defensive
245
+ signatures.append({"name": name, "signature": "unavailable"})
246
+ continue
247
+
248
+ params: list[dict[str, Any]] = []
249
+ for parameter in sig.parameters.values():
250
+ params.append(
251
+ self._drop_none(
252
+ {
253
+ "name": parameter.name,
254
+ "kind": parameter.kind.name,
255
+ "annotation": self._describe_annotation(
256
+ parameter.annotation
257
+ ),
258
+ "default": (
259
+ None
260
+ if parameter.default is inspect._empty
261
+ else parameter.default
262
+ ),
263
+ }
264
+ )
265
+ )
266
+
267
+ signatures.append(
268
+ self._drop_none(
269
+ {
270
+ "name": name,
271
+ "parameters": params,
272
+ "docstring": inspect.getdoc(callable_obj),
273
+ }
274
+ )
275
+ )
276
+ return signatures
277
+
278
+ def _build_agent_config(self, prompt: "chat_prompt.ChatPrompt") -> dict[str, Any]:
279
+ agent_config: dict[str, Any] = dict(prompt.to_dict())
280
+ agent_config["project_name"] = getattr(prompt, "project_name", None)
281
+ agent_config["model"] = getattr(prompt, "model", None) or self.model
282
+ agent_config["tools"] = self._serialize_tools(prompt)
283
+ return self._drop_none(agent_config)
284
+
285
+ def get_optimizer_metadata(self) -> dict[str, Any]:
286
+ """Override in subclasses to expose optimizer-specific parameters."""
287
+ return {}
288
+
289
+ def _build_optimizer_metadata(self) -> dict[str, Any]:
290
+ metadata = {
291
+ "name": self.__class__.__name__,
292
+ "version": _OPTIMIZER_VERSION,
293
+ "model": self.model,
294
+ "model_kwargs": self.model_kwargs or None,
295
+ "seed": getattr(self, "seed", None),
296
+ "num_threads": getattr(self, "num_threads", None),
297
+ }
298
+
299
+ # n_threads is used by some optimizers instead of num_threads
300
+ if metadata["num_threads"] is None and hasattr(self, "n_threads"):
301
+ metadata["num_threads"] = getattr(self, "n_threads")
302
+
303
+ if hasattr(self, "reasoning_model"):
304
+ metadata["reasoning_model"] = getattr(self, "reasoning_model")
305
+
306
+ extra_parameters = self.get_optimizer_metadata()
307
+ if extra_parameters:
308
+ metadata["parameters"] = extra_parameters
309
+
310
+ return self._drop_none(metadata)
311
+
312
+ def _prepare_experiment_config(
313
+ self,
314
+ *,
315
+ prompt: "chat_prompt.ChatPrompt",
316
+ dataset: Dataset,
317
+ metric: Callable,
318
+ experiment_config: dict[str, Any] | None = None,
319
+ configuration_updates: dict[str, Any] | None = None,
320
+ additional_metadata: dict[str, Any] | None = None,
321
+ ) -> dict[str, Any]:
322
+ dataset_id = getattr(dataset, "id", None)
323
+ project_name = (
324
+ getattr(self.agent_class, "project_name", None)
325
+ if hasattr(self, "agent_class")
326
+ else None
327
+ )
328
+ if not project_name:
329
+ project_name = getattr(prompt, "project_name", None)
330
+ if not project_name:
331
+ project_name = self.__class__.__name__
332
+
333
+ base_config: dict[str, Any] = {
334
+ "project_name": project_name,
335
+ "agent_class": (
336
+ getattr(self.agent_class, "__name__", None)
337
+ if hasattr(self, "agent_class")
338
+ else None
339
+ ),
340
+ "agent_config": self._build_agent_config(prompt),
341
+ "metric": getattr(metric, "__name__", str(metric)),
342
+ "dataset": getattr(dataset, "name", None),
343
+ "dataset_id": dataset_id,
344
+ "optimizer_metadata": self._build_optimizer_metadata(),
345
+ "tool_signatures": self._summarize_tool_signatures(prompt),
346
+ "configuration": {
347
+ "prompt": prompt.get_messages(),
348
+ "prompt_name": getattr(prompt, "name", None),
349
+ "tools": self._serialize_tools(prompt),
350
+ "prompt_project_name": getattr(prompt, "project_name", None),
351
+ },
352
+ }
353
+
354
+ if configuration_updates:
355
+ base_config["configuration"] = self._deep_merge_dicts(
356
+ base_config["configuration"], configuration_updates
357
+ )
358
+
359
+ if additional_metadata:
360
+ base_config = self._deep_merge_dicts(base_config, additional_metadata)
361
+
362
+ if experiment_config:
363
+ base_config = self._deep_merge_dicts(base_config, experiment_config)
364
+
365
+ return self._drop_none(base_config)
366
+
367
+ def create_optimization_context(
368
+ self, dataset: "Dataset", metric: Callable, metadata: dict | None = None
369
+ ) -> Any:
370
+ """
371
+ Create optimization context for tracking.
372
+
373
+ Args:
374
+ dataset: The dataset being optimized
375
+ metric: The metric function
376
+ metadata: Additional metadata
377
+
378
+ Returns:
379
+ Optimization context manager
380
+ """
381
+ context_metadata = {
382
+ "optimizer": self.__class__.__name__,
383
+ "model": self.model,
384
+ "seed": self.seed,
385
+ }
386
+ if metadata:
387
+ context_metadata.update(metadata)
388
+
389
+ return optimization_context(
390
+ client=self.opik_client,
391
+ dataset_name=dataset.name,
392
+ objective_name=metric.__name__,
393
+ metadata=context_metadata,
394
+ )
395
+
69
396
  @abstractmethod
70
397
  def optimize_prompt(
71
398
  self,
72
399
  prompt: "chat_prompt.ChatPrompt",
73
400
  dataset: Dataset,
74
401
  metric: Callable,
75
- experiment_config: Optional[Dict] = None,
402
+ experiment_config: dict | None = None,
403
+ n_samples: int | None = None,
404
+ auto_continue: bool = False,
405
+ agent_class: type[OptimizableAgent] | None = None,
76
406
  **kwargs: Any,
77
407
  ) -> optimization_result.OptimizationResult:
78
408
  """
@@ -90,7 +420,57 @@ class BaseOptimizer:
90
420
  """
91
421
  pass
92
422
 
93
- def get_history(self) -> List[OptimizationRound]:
423
+ def optimize_mcp(
424
+ self,
425
+ prompt: "chat_prompt.ChatPrompt",
426
+ dataset: Dataset,
427
+ metric: Callable,
428
+ *,
429
+ tool_name: str,
430
+ second_pass: Any,
431
+ experiment_config: dict | None = None,
432
+ n_samples: int | None = None,
433
+ auto_continue: bool = False,
434
+ agent_class: type[OptimizableAgent] | None = None,
435
+ fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
436
+ fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
437
+ allow_tool_use_on_second_pass: bool = False,
438
+ **kwargs: Any,
439
+ ) -> optimization_result.OptimizationResult:
440
+ """
441
+ Optimize prompts that rely on MCP (Model Context Protocol) tooling.
442
+
443
+ This method provides a standardized interface for optimizing prompts that use
444
+ external tools through the MCP protocol. It handles tool invocation, second-pass
445
+ coordination, and fallback mechanisms.
446
+
447
+ Args:
448
+ prompt: The chat prompt to optimize, must include tools
449
+ dataset: Opik dataset containing evaluation data
450
+ metric: Evaluation function that takes (dataset_item, llm_output) and returns a score
451
+ tool_name: Name of the MCP tool to use for optimization
452
+ second_pass: MCPSecondPassCoordinator for handling second-pass tool calls
453
+ experiment_config: Optional configuration for the experiment
454
+ n_samples: Number of samples to use for optimization (default: None)
455
+ auto_continue: Whether to auto-continue optimization (default: False)
456
+ agent_class: Custom agent class to use (default: None)
457
+ fallback_invoker: Fallback function for tool invocation (default: None)
458
+ fallback_arguments: Function to extract tool arguments (default: None)
459
+ allow_tool_use_on_second_pass: Whether to allow tool use on second pass (default: False)
460
+ **kwargs: Additional arguments for optimization
461
+
462
+ Returns:
463
+ OptimizationResult: The optimization result containing the optimized prompt and metrics
464
+
465
+ Raises:
466
+ NotImplementedError: If the optimizer doesn't implement MCP optimization
467
+ ValueError: If the prompt doesn't include required tools
468
+ """
469
+ raise NotImplementedError(
470
+ f"{self.__class__.__name__} does not implement optimize_mcp yet."
471
+ )
472
+
473
+ def get_history(self) -> list[OptimizationRound]:
94
474
  """
95
475
  Get the optimization history.
96
476
 
@@ -133,11 +513,11 @@ class BaseOptimizer:
133
513
  metric: Callable,
134
514
  n_threads: int,
135
515
  verbose: int = 1,
136
- dataset_item_ids: Optional[List[str]] = None,
137
- experiment_config: Optional[Dict] = None,
138
- n_samples: Optional[int] = None,
139
- seed: Optional[int] = None,
140
- agent_class: Optional[Type[OptimizableAgent]] = None,
516
+ dataset_item_ids: list[str] | None = None,
517
+ experiment_config: dict | None = None,
518
+ n_samples: int | None = None,
519
+ seed: int | None = None,
520
+ agent_class: type[OptimizableAgent] | None = None,
141
521
  ) -> float:
142
522
  random.seed(seed)
143
523
 
@@ -146,16 +526,16 @@ class BaseOptimizer:
146
526
  if prompt.model_kwargs is None:
147
527
  prompt.model_kwargs = self.model_kwargs
148
528
 
149
- self.agent_class: Type[OptimizableAgent]
529
+ self.agent_class: type[OptimizableAgent]
150
530
 
151
531
  if agent_class is None:
152
- self.agent_class = create_litellm_agent_class(prompt)
532
+ self.agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
153
533
  else:
154
534
  self.agent_class = agent_class
155
535
 
156
536
  agent = self.agent_class(prompt)
157
537
 
158
- def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, str]:
538
+ def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
159
539
  messages = prompt.get_messages(dataset_item)
160
540
  raw_model_output = agent.invoke(messages)
161
541
  cleaned_model_output = raw_model_output.strip()
@@ -164,18 +544,12 @@ class BaseOptimizer:
164
544
  }
165
545
  return result
166
546
 
167
- experiment_config = experiment_config or {}
168
- experiment_config["project_name"] = self.__class__.__name__
169
- experiment_config = {
170
- **experiment_config,
171
- **{
172
- "agent_class": self.agent_class.__name__,
173
- "agent_config": prompt.to_dict(),
174
- "metric": metric.__name__,
175
- "dataset": dataset.name,
176
- "configuration": {"prompt": (prompt.get_messages() if prompt else [])},
177
- },
178
- }
547
+ experiment_config = self._prepare_experiment_config(
548
+ prompt=prompt,
549
+ dataset=dataset,
550
+ metric=metric,
551
+ experiment_config=experiment_config,
552
+ )
179
553
 
180
554
  if n_samples is not None:
181
555
  if dataset_item_ids is not None:
@@ -190,7 +564,7 @@ class BaseOptimizer:
190
564
  metric=metric,
191
565
  evaluated_task=llm_task,
192
566
  num_threads=n_threads,
193
- project_name=self.agent_class.project_name,
567
+ project_name=experiment_config.get("project_name"),
194
568
  experiment_config=experiment_config,
195
569
  optimization_id=None,
196
570
  verbose=verbose,
@@ -0,0 +1,3 @@
1
+ {"id": "ctx-001", "user_query": "Using the Context7 library ID /vercel/next.js, how can I route users down different UI flows with the App Router?", "expected_tool": "get-library-docs", "arguments": {"context7CompatibleLibraryID": "/vercel/next.js", "topic": "routing", "tokens": 1500}, "reference_answer": "The App Router handles conditional experiences with parallel routes. Create directories that start with @ to declare each slot, provide a default.tsx so the route still renders when a branch is missing, and decide which slot to render inside your layout based on the user's state. This lets you show different UI branches without blocking navigation."}
2
+ {"id": "ctx-002", "user_query": "With library ID /supabase/supabase, what do the docs recommend for keeping edge functions secure?", "expected_tool": "get-library-docs", "arguments": {"context7CompatibleLibraryID": "/supabase/supabase", "topic": "security", "tokens": 1200}, "reference_answer": "Supabase recommends enabling Row Level Security (RLS) on your Postgres tables so edge functions can only access data allowed by fine-grained policies. Run `alter table ... enable row level security;` (for example on the `todos` table) to enforce those policies and prevent unauthorized access."}
3
+ {"id": "ctx-003", "user_query": "Given /mongodb/docs, remind me what makes up the basic aggregation pipeline.", "expected_tool": "get-library-docs", "arguments": {"context7CompatibleLibraryID": "/mongodb/docs", "topic": "aggregation", "tokens": 1000}, "reference_answer": "An aggregation pipeline runs ordered stages such as $match, $group, $project, $sort, and $limit. Each stage accepts the stream of documents from the previous stage so you can filter, reshape, and summarize the data step by step."}
@@ -0,0 +1,90 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass
5
+ from importlib import resources
6
+ from typing import Any, Union
7
+
8
+ try: # pragma: no cover - optional dependency
9
+ import opik # type: ignore
10
+ except ImportError: # pragma: no cover - fallback for tests
11
+ opik = None
12
+
13
+ from opik_optimizer.utils.dataset_utils import attach_uuids, dataset_suffix
14
+
15
+ OpikDataset = Any
16
+
17
+ DATA_PACKAGE = "opik_optimizer.data"
18
+ DATA_FILENAME = "context7_eval.jsonl"
19
+ DATASET_NAME = "context7_eval"
20
+
21
+
22
+ def _load_examples() -> list[dict[str, Any]]:
23
+ text = (
24
+ resources.files(DATA_PACKAGE)
25
+ .joinpath(DATA_FILENAME)
26
+ .read_text(encoding="utf-8")
27
+ )
28
+ return [json.loads(line) for line in text.splitlines() if line.strip()]
29
+
30
+
31
+ def _dataset_name(test_mode: bool) -> str:
32
+ suffix = dataset_suffix(DATA_PACKAGE, DATA_FILENAME)
33
+ return f"{DATASET_NAME}_{suffix}{'_test' if test_mode else ''}"
34
+
35
+
36
+ @dataclass
37
+ class _ListDataset:
38
+ name: str
39
+ _items: list[dict[str, Any]]
40
+
41
+ def __post_init__(self) -> None:
42
+ for idx, item in enumerate(self._items):
43
+ item.setdefault("id", f"{self.name}-{idx}")
44
+ self.id = self.name
45
+
46
+ def copy(self) -> _ListDataset:
47
+ return _ListDataset(self.name, [dict(item) for item in self._items])
48
+
49
+ def get_items(self, nb_samples: int | None = None) -> list[dict[str, Any]]:
50
+ if nb_samples is None:
51
+ return [dict(item) for item in self._items]
52
+ return [dict(item) for item in self._items[:nb_samples]]
53
+
54
+
55
+ DatasetResult = Union["_ListDataset", OpikDataset]
56
+
57
+
58
+ def load_context7_dataset(test_mode: bool = False) -> DatasetResult:
59
+ """Return the context7 synthetic dataset as an Opik dataset when available."""
60
+
61
+ examples = _load_examples()
62
+ dataset_name = _dataset_name(test_mode)
63
+
64
+ if opik is None:
65
+ return _ListDataset(dataset_name, examples)
66
+
67
+ try:
68
+ client = opik.Opik()
69
+ dataset: OpikDataset = client.get_or_create_dataset(dataset_name)
70
+ items = dataset.get_items()
71
+ expected_len = len(examples) if not test_mode else min(len(examples), 2)
72
+
73
+ if len(items) == expected_len:
74
+ return dataset
75
+ if len(items) != 0: # pragma: no cover - defensive path
76
+ raise ValueError(
77
+ f"Dataset {dataset_name} already exists with {len(items)} items. Delete it to regenerate."
78
+ )
79
+
80
+ if test_mode:
81
+ dataset.insert(attach_uuids(examples[:expected_len]))
82
+ else:
83
+ dataset.insert(attach_uuids(examples))
84
+ return dataset
85
+ except Exception:
86
+ # If Opik client fails (e.g., no API key configured), fall back to local dataset
87
+ return _ListDataset(dataset_name, examples)
88
+
89
+
90
+ __all__ = ["load_context7_dataset"]