opik-optimizer 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,395 @@
1
+ from typing import Any, Dict, List, Tuple, Union, Optional
2
+ import os
3
+ import random
4
+
5
+ import opik
6
+
7
+ from opik.integrations.dspy.callback import OpikCallback
8
+ from opik.opik_context import get_current_span_data
9
+ from opik.evaluation import evaluate
10
+ from opik import Dataset
11
+
12
+ import dspy
13
+
14
+ import litellm
15
+ from litellm.caching import Cache
16
+
17
+ from ..optimization_result import OptimizationResult
18
+ from ..base_optimizer import BaseOptimizer
19
+ from ._mipro_optimizer_v2 import MIPROv2
20
+ from ._lm import LM
21
+ from ..optimization_config.configs import MetricConfig, TaskConfig
22
+ from .utils import (
23
+ create_dspy_signature,
24
+ opik_metric_to_dspy,
25
+ create_dspy_training_set,
26
+ get_tool_prompts,
27
+ )
28
+
29
+ # Using disk cache for LLM calls
30
+ disk_cache_dir = os.path.expanduser("~/.litellm_cache")
31
+ litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
32
+
33
+ # Set up logging
34
+ import logging
35
+
36
+ logger = logging.getLogger(__name__) # Inherits config from setup_logging
37
+
38
+
39
+ class MiproOptimizer(BaseOptimizer):
40
+ def __init__(self, model, project_name: Optional[str] = None, **model_kwargs):
41
+ super().__init__(model, project_name, **model_kwargs)
42
+ self.tools = []
43
+ self.num_threads = self.model_kwargs.pop("num_threads", 6)
44
+ self.model_kwargs["model"] = self.model
45
+ lm = LM(**self.model_kwargs)
46
+ opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
47
+ dspy.configure(lm=lm, callbacks=[opik_callback])
48
+ logger.debug(f"Initialized MiproOptimizer with model: {model}")
49
+
50
+ def evaluate_prompt(
51
+ self,
52
+ dataset: Union[str, Dataset],
53
+ metric_config: MetricConfig,
54
+ task_config: TaskConfig,
55
+ prompt: Optional[Union[str, dspy.Module, OptimizationResult]] = None,
56
+ n_samples: int = 10,
57
+ dataset_item_ids: Optional[List[str]] = None,
58
+ experiment_config: Optional[Dict] = None,
59
+ **kwargs,
60
+ ) -> float:
61
+ """
62
+ Compute the score of a prompt on a dataset (or part thereof).
63
+
64
+ Args:
65
+ dataset: Opik dataset name or dataset
66
+ metric_config: A MetricConfig instance
67
+ task_config: A TaskConfig instance
68
+ prompt: The prompt to evaluate
69
+ n_samples: number of items to test in the dataset
70
+ dataset_item_ids: Optional list of dataset item IDs to evaluate
71
+ experiment_config: Optional configuration for the experiment
72
+ **kwargs: Additional arguments for evaluation
73
+
74
+ Returns:
75
+ Evaluation score
76
+ """
77
+ # FIXME: call super when it is ready
78
+ # FIXME: Intermediate values:
79
+ metric = metric_config.metric
80
+ input_key = task_config.input_dataset_fields[0] # FIXME: allow all inputs
81
+ output_key = task_config.output_dataset_field
82
+
83
+ if isinstance(dataset, str):
84
+ opik_client = opik.Opik(project_name=self.project_name)
85
+ dataset = opik_client.get_dataset(dataset)
86
+
87
+ def LLM(input: str) -> str:
88
+ if isinstance(prompt, str):
89
+ response = litellm.completion(
90
+ messages=[
91
+ {"role": "system", "content": prompt},
92
+ {"role": "user", "content": input},
93
+ ],
94
+ metadata={
95
+ "opik": {
96
+ "current_span_data": get_current_span_data(),
97
+ "tags": ["optimizer"],
98
+ },
99
+ },
100
+ **self.model_kwargs,
101
+ )
102
+ return response.choices[0].message.content
103
+ elif isinstance(prompt, OptimizationResult):
104
+ if prompt.optimizer == "MiproOptimizer" and getattr(prompt, "details", None):
105
+ program = prompt.details["program"]
106
+ result = program(**{input_key: input})
107
+ return getattr(result, output_key)
108
+ else:
109
+ response = litellm.completion(
110
+ messages=[
111
+ {"role": "system", "content": prompt.prompt},
112
+ # FIXME: insert demonstrations here
113
+ {"role": "user", "content": input},
114
+ ],
115
+ metadata={
116
+ "opik": {
117
+ "current_span_data": get_current_span_data(),
118
+ "tags": ["optimizer"],
119
+ },
120
+ },
121
+ **self.model_kwargs,
122
+ )
123
+ return response.choices[0].message.content
124
+ elif isinstance(prompt, dspy.Module):
125
+ result = prompt(**{input_key: input})
126
+ return getattr(result, output_key)
127
+ else:
128
+ raise Exception("I don't know how to evaluate this prompt: %r" % prompt)
129
+
130
+ def evaluation_task(dataset_item):
131
+ # Get the model output
132
+ model_output = LLM(dataset_item[input_key])
133
+
134
+ # Prepare the result with all required fields
135
+ result = {
136
+ "input": dataset_item[input_key],
137
+ "output": model_output,
138
+ "expected_output": dataset_item[output_key],
139
+ "reference": dataset_item[output_key],
140
+ }
141
+
142
+ # Add context if available, otherwise use input as context
143
+ result["context"] = dataset_item.get("context", dataset_item[input_key])
144
+
145
+ return result
146
+
147
+ if n_samples is not None:
148
+ if dataset_item_ids is not None:
149
+ raise Exception("Can't use n_samples and dataset_item_ids")
150
+
151
+ all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
152
+ dataset_item_ids = random.sample(all_ids, min(n_samples, len(all_ids)))
153
+
154
+ experiment_config = experiment_config or {}
155
+ experiment_config = {
156
+ **experiment_config,
157
+ **{
158
+ "optimizer": self.__class__.__name__,
159
+ "tools": (
160
+ [f.__name__ for f in task_config.tools] if task_config.tools else []
161
+ ),
162
+ "metric": metric_config.metric.name,
163
+ "dataset": dataset.name,
164
+ },
165
+ }
166
+ # Run evaluation with the configured metric
167
+ evaluation = evaluate(
168
+ dataset=dataset,
169
+ task=evaluation_task,
170
+ scoring_metrics=[metric],
171
+ # "reference" needs to match metric
172
+ scoring_key_mapping={"reference": output_key},
173
+ task_threads=self.num_threads,
174
+ dataset_item_ids=dataset_item_ids,
175
+ project_name=self.project_name,
176
+ experiment_config=experiment_config,
177
+ )
178
+
179
+ # Calculate the average score across all test results
180
+ total_score = 0
181
+ count = len(evaluation.test_results)
182
+ for i in range(count):
183
+ total_score += evaluation.test_results[i].score_results[0].value
184
+ score = total_score / count if count > 0 else 0.0
185
+
186
+ logger.debug(
187
+ f"Starting Mipro evaluation for prompt type: {type(prompt).__name__}"
188
+ )
189
+ logger.debug(f"Evaluation score: {score:.4f}")
190
+ return score
191
+
192
+ def optimize_prompt(
193
+ self,
194
+ dataset: Union[str, Dataset],
195
+ metric_config: MetricConfig,
196
+ task_config: TaskConfig,
197
+ num_candidates: int = 10,
198
+ experiment_config: Optional[Dict] = None,
199
+ **kwargs,
200
+ ) -> OptimizationResult:
201
+ self._opik_client = opik.Opik()
202
+ optimization = None
203
+ try:
204
+ optimization = self._opik_client.create_optimization(
205
+ dataset_name=dataset.name,
206
+ objective_name=metric_config.metric.name,
207
+ )
208
+ except Exception:
209
+ logger.warning(
210
+ "Opik server does not support optimizations. Please upgrade opik."
211
+ )
212
+ optimization = None
213
+
214
+ if not optimization:
215
+ logger.warning("Continuing without Opik optimization tracking.")
216
+
217
+ try:
218
+ result = self._optimize_prompt(
219
+ dataset=dataset,
220
+ metric_config=metric_config,
221
+ task_config=task_config,
222
+ num_candidates=num_candidates,
223
+ experiment_config=experiment_config,
224
+ optimization_id=optimization.id if optimization is not None else None,
225
+ **kwargs,
226
+ )
227
+ if optimization:
228
+ self.update_optimization(optimization, status="completed")
229
+ return result
230
+ except Exception as e:
231
+ logger.error(f"Mipro optimization failed: {e}", exc_info=True)
232
+ if optimization:
233
+ self.update_optimization(optimization, status="cancelled")
234
+ raise e
235
+
236
+ def _optimize_prompt(
237
+ self,
238
+ dataset: Union[str, Dataset],
239
+ metric_config: MetricConfig,
240
+ task_config: TaskConfig,
241
+ num_candidates: int = 10,
242
+ experiment_config: Optional[Dict] = None,
243
+ optimization_id: Optional[str] = None,
244
+ **kwargs,
245
+ ) -> OptimizationResult:
246
+ logger.info("Preparing MIPRO optimization...")
247
+ self.prepare_optimize_prompt(
248
+ dataset=dataset,
249
+ metric_config=metric_config,
250
+ task_config=task_config,
251
+ num_candidates=num_candidates,
252
+ experiment_config=experiment_config,
253
+ optimization_id=optimization_id,
254
+ **kwargs,
255
+ )
256
+ logger.info("Starting MIPRO compilation...")
257
+ result = self.continue_optimize_prompt()
258
+ logger.info("MIPRO optimization complete.")
259
+ return result
260
+
261
+ def prepare_optimize_prompt(
262
+ self,
263
+ dataset,
264
+ metric_config,
265
+ task_config,
266
+ num_candidates: int = 10,
267
+ experiment_config: Optional[Dict] = None,
268
+ optimization_id: Optional[str] = None,
269
+ **kwargs,
270
+ ) -> None:
271
+ # FIXME: Intermediate values:
272
+ metric = metric_config.metric
273
+ prompt = task_config.instruction_prompt
274
+ input_key = task_config.input_dataset_fields[0] # FIXME: allow all
275
+ output_key = task_config.output_dataset_field
276
+ self.tools = task_config.tools
277
+ self.num_candidates = num_candidates
278
+ self.seed = 9
279
+ self.input_key = input_key
280
+ self.output_key = output_key
281
+ self.prompt = prompt
282
+
283
+ # Convert to values for MIPRO:
284
+ if isinstance(dataset, str):
285
+ opik_client = opik.Opik(project_name=self.project_name)
286
+ dataset = opik_client.get_dataset(dataset)
287
+ # dataset is now always an opik Dataset; its name and items are used below
288
+ self.dataset = dataset.get_items()
289
+
290
+ # Validate dataset:
291
+ for row in self.dataset:
292
+ if self.input_key not in row:
293
+ raise Exception("row does not contain input_key: %r" % self.input_key)
294
+ if self.output_key not in row:
295
+ raise Exception("row does not contain output_key: %r" % self.output_key)
296
+
297
+ self.trainset = create_dspy_training_set(self.dataset, self.input_key)
298
+ self.data_signature = create_dspy_signature(
299
+ self.input_key, self.output_key, self.prompt
300
+ )
301
+
302
+ if self.tools:
303
+ self.module = dspy.ReAct(self.data_signature, tools=self.tools)
304
+ else:
305
+ self.module = dspy.Predict(self.data_signature)
306
+
307
+ # Convert the metric to a DSPy-compatible function
308
+ self.metric_function = opik_metric_to_dspy(metric, self.output_key)
309
+ self.opik_metric = metric
310
+ log_dir = os.path.expanduser("~/.opik-optimizer-checkpoints")
311
+ os.makedirs(log_dir, exist_ok=True)
312
+
313
+ experiment_config = experiment_config or {}
314
+ experiment_config = {
315
+ **experiment_config,
316
+ **{
317
+ "optimizer": self.__class__.__name__,
318
+ "tools": [f.__name__ for f in self.tools],
319
+ "metric": metric.name,
320
+ "num_threads": self.num_threads,
321
+ "num_candidates": self.num_candidates,
322
+ "dataset": dataset.name,
323
+ },
324
+ }
325
+
326
+ # Initialize the optimizer:
327
+ self.optimizer = MIPROv2(
328
+ metric=self.metric_function,
329
+ auto="light",
330
+ num_threads=self.num_threads,
331
+ verbose=False,
332
+ num_candidates=self.num_candidates,
333
+ seed=self.seed,
334
+ opik_prompt_task_config=task_config,
335
+ opik_dataset=dataset,
336
+ opik_project_name=self.project_name,
337
+ opik_metric_config=metric_config,
338
+ opik_optimization_id=optimization_id,
339
+ log_dir=log_dir,
340
+ experiment_config=experiment_config,
341
+ )
342
+
343
+ logger.debug("Created DSPy training set.")
344
+ logger.debug(f"Using DSPy module: {type(self.module).__name__}")
345
+ logger.debug(f"Using metric function: {self.metric_function.__name__}")
346
+
347
+ def load_from_checkpoint(self, filename):
348
+ """
349
+ Load the module from a checkpoint.
350
+ """
351
+ self.module.load(os.path.expanduser(filename))
352
+
353
+ def continue_optimize_prompt(self):
354
+ """
355
+ Run the MIPRO compilation prepared by prepare_optimize_prompt and return the best result.
356
+ """
357
+ self.results = self.optimizer.compile(
358
+ student=self.module,
359
+ trainset=self.trainset,
360
+ provide_traceback=True,
361
+ requires_permission_to_run=False,
362
+ num_trials=3,
363
+ )
364
+ self.best_programs = sorted(
365
+ self.results.candidate_programs,
366
+ key=lambda item: item["score"],
367
+ reverse=True,
368
+ )
369
+ self.module = self.get_best().details["program"]
370
+ return self.get_best()
371
+
372
+ def get_best(self, position: int = 0) -> OptimizationResult:
373
+ score = self.best_programs[position]["score"]
374
+ state = self.best_programs[position]["program"].dump_state()
375
+ if self.tools:
376
+ tool_names = [tool.__name__ for tool in self.tools]
377
+ tool_prompts = get_tool_prompts(
378
+ tool_names, state["react"]["signature"]["instructions"]
379
+ )
380
+ best_prompt = state["react"]["signature"]["instructions"]
381
+ demos = [x.toDict() for x in state["react"]["demos"]]
382
+ else:
383
+ tool_prompts = None
384
+ best_prompt = state["signature"]["instructions"]
385
+ demos = [x.toDict() for x in state["demos"]]
386
+
387
+ return OptimizationResult(
388
+ optimizer="MiproOptimizer",
389
+ prompt=best_prompt,
390
+ tool_prompts=tool_prompts,
391
+ score=score,
392
+ metric_name=self.opik_metric.name,
393
+ demonstrations=demos,
394
+ details={"program": self.best_programs[position]["program"]},
395
+ )
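
For orientation, here is a minimal usage sketch of MiproOptimizer as added above. This is a hedged sketch, not part of the release: the top-level import, the model string, the project and dataset names, and the question/answer field names are all assumptions.

    import opik
    from opik.evaluation.metrics import LevenshteinRatio  # any opik BaseMetric should work

    from opik_optimizer import MiproOptimizer                              # assumed top-level export
    from opik_optimizer.optimization_config.configs import MetricConfig, TaskConfig
    from opik_optimizer.optimization_config import mappers                 # assumed module path

    optimizer = MiproOptimizer(
        model="openai/gpt-4o-mini",   # any LiteLLM model identifier
        project_name="mipro-demo",    # hypothetical Opik project
        temperature=0.1,              # extra kwargs are forwarded to the LM via **model_kwargs
    )

    # Hypothetical Opik dataset whose items contain "question" and "answer" fields.
    dataset = opik.Opik().get_dataset("my-qa-dataset")

    metric_config = MetricConfig(
        metric=LevenshteinRatio(),
        inputs={
            "output": mappers.from_llm_response_text(),
            "reference": mappers.from_dataset_field(name="answer"),
        },
    )
    task_config = TaskConfig(
        instruction_prompt="Answer the question as concisely as possible.",
        input_dataset_fields=["question"],  # only the first field is used (see FIXME above)
        output_dataset_field="answer",
    )

    result = optimizer.optimize_prompt(
        dataset=dataset,
        metric_config=metric_config,
        task_config=task_config,
        num_candidates=10,
    )
    print(result.score, result.prompt)

    # Re-score the optimized program on a sample of the dataset.
    score = optimizer.evaluate_prompt(
        dataset=dataset,
        metric_config=metric_config,
        task_config=task_config,
        prompt=result,
        n_samples=10,
    )

Note that evaluate_prompt also accepts a raw system-prompt string or a dspy.Module in place of the OptimizationResult, as the inner LLM helper above shows.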
@@ -0,0 +1,107 @@
1
+ from typing import Any, Dict, List, Tuple, Union, Optional
2
+
3
+ import uuid
4
+ import dspy
5
+ import re
6
+
7
+ from dspy.signatures.signature import make_signature
8
+
9
+
10
+ class State(dict):
11
+ def __getattr__(self, key):
12
+ try:
13
+ return self[key]
14
+ except KeyError as e:
15
+ raise AttributeError(e)
16
+
17
+ def __setattr__(self, key, value):
18
+ self[key] = value
19
+
20
+ def __delattr__(self, key):
21
+ try:
22
+ del self[key]
23
+ except KeyError as e:
24
+ raise AttributeError(e)
25
+
26
+
27
+ def create_dspy_signature(
28
+ input: str,
29
+ output: str,
30
+ prompt: Optional[str] = None,
31
+ ):
32
+ """
33
+ Create a dspy Signature from an input field, an output field, and an optional instruction prompt
34
+ """
35
+ # FIXME: allow multiple inputs and input/output descriptions
36
+ return make_signature(
37
+ signature={input: (str, dspy.InputField()), output: (str, dspy.OutputField())},
38
+ instructions=prompt,
39
+ )
40
+
41
+
42
+ def opik_metric_to_dspy(metric, output):
43
+ answer_field = output
44
+
45
+ def opik_metric_score_wrapper(example, prediction, trace=None):
46
+ # Extract the input from the example
47
+ input_text = getattr(example, "input", "")
48
+ if isinstance(input_text, list):
49
+ input_text = input_text[0] if input_text else ""
50
+
51
+ # Extract the expected output
52
+ expected_output = getattr(example, answer_field, "")
53
+ if isinstance(expected_output, list):
54
+ expected_output = expected_output[0] if expected_output else ""
55
+
56
+ # Get the model output
57
+ model_output = getattr(prediction, answer_field, "")
58
+
59
+ # Create a result dictionary with all required fields
60
+ result = {
61
+ "input": input_text,
62
+ "output": model_output,
63
+ "expected_output": expected_output,
64
+ "reference": expected_output,
65
+ "context": getattr(example, "context", input_text),
66
+ }
67
+
68
+ try:
69
+ # Calculate the score using the metric
70
+ score_result = metric.score(**result)
71
+ return (
72
+ score_result.value if hasattr(score_result, "value") else score_result
73
+ )
74
+ except Exception as e:
75
+ print(f"Error calculating metric score: {e}")
76
+ return 0.0
77
+
78
+ return opik_metric_score_wrapper
79
+
80
+
81
+ def create_dspy_training_set(data: list[dict], input: str) -> list[dspy.Example]:
82
+ """
83
+ Turn a list of dicts into a list of dspy Examples
84
+ """
85
+ output = []
86
+ for example in data:
87
+ example_obj = dspy.Example(
88
+ **example, dspy_uuid=str(uuid.uuid4()), dspy_split="train"
89
+ )
90
+ example_obj = example_obj.with_inputs(input)
91
+ output.append(example_obj)
92
+ return output
93
+
94
+
95
+ def get_tool_prompts(tool_names, text: str) -> Dict[str, str]:
96
+ """
97
+ Extract the embedded tool prompts from a text.
98
+ """
99
+ tool_prompts = {}
100
+ for count, tool_name in enumerate(tool_names):
101
+ pattern = rf"\b{tool_name}\b[, \.]*([^{count + 2}]*)"  # capture text up to the next tool's list number (count + 2)
102
+ match = re.search(pattern, text)
103
+ if match:
104
+ description = match.groups()[0]
105
+ if description:
106
+ tool_prompts[tool_name] = description.strip()
107
+ return tool_prompts
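
To make the helpers in this file concrete, here is a small sketch of how they fit together. The module path, the example rows, and the choice of metric are assumptions.

    from opik.evaluation.metrics import Equals
    from opik_optimizer.mipro_optimizer.utils import (  # assumed module path
        create_dspy_signature,
        create_dspy_training_set,
        opik_metric_to_dspy,
    )

    rows = [
        {"id": "1", "question": "What is 2 + 2?", "answer": "4"},
        {"id": "2", "question": "What is the capital of France?", "answer": "Paris"},
    ]

    # question -> answer signature carrying the instruction prompt
    signature = create_dspy_signature("question", "answer", "Answer the question.")

    # dspy.Example objects with "question" marked as the input field
    trainset = create_dspy_training_set(rows, "question")

    # DSPy-compatible metric: fn(example, prediction, trace=None) -> float
    dspy_metric = opik_metric_to_dspy(Equals(), "answer")

This mirrors what MiproOptimizer.prepare_optimize_prompt does before handing the signature, training set, and metric function to MIPROv2.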
File without changes
@@ -0,0 +1,35 @@
1
+ """Module containing configuration classes for optimization."""
2
+
3
+ import pydantic
4
+ import opik
5
+ from typing import Dict, Callable, Union, List, Literal, Any, Optional
6
+ from opik.evaluation.metrics import BaseMetric
7
+
8
+
9
+ class MetricConfig(pydantic.BaseModel):
10
+ """Configuration for a metric used in optimization."""
11
+ metric: BaseMetric
12
+ inputs: Dict[str, Union[str, Callable[[Any], Any]]]
13
+
14
+ model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
15
+
16
+
17
+ class TaskConfig(pydantic.BaseModel):
18
+ """Configuration for a prompt task."""
19
+ model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
20
+
21
+ instruction_prompt: Union[str, List[Dict[Literal["role", "content"], str]]]
22
+ use_chat_prompt: bool = False
23
+ input_dataset_fields: List[str]
24
+ output_dataset_field: str
25
+ tools: List[Any] = []
26
+
27
+
28
+ class OptimizationConfig(pydantic.BaseModel):
29
+ """Configuration for optimization."""
30
+ model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
31
+
32
+ dataset: opik.Dataset
33
+ objective: MetricConfig
34
+ optimization_direction: Literal["maximize", "minimize"] = "maximize"
35
+ task: TaskConfig
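
For completeness, a compact sketch of how these three models compose; OptimizationConfig is defined here but not exercised elsewhere in this diff. The import paths, dataset name, metric, and field names are assumptions.

    import opik
    from opik.evaluation.metrics import Equals
    from opik_optimizer.optimization_config.configs import (  # assumed module path
        MetricConfig,
        OptimizationConfig,
        TaskConfig,
    )
    from opik_optimizer.optimization_config import mappers

    config = OptimizationConfig(
        dataset=opik.Opik().get_dataset("my-qa-dataset"),  # hypothetical dataset
        objective=MetricConfig(
            metric=Equals(),
            inputs={
                "output": mappers.from_llm_response_text(),
                "reference": mappers.from_dataset_field(name="answer"),
            },
        ),
        task=TaskConfig(
            instruction_prompt="Answer the question.",
            input_dataset_fields=["question"],
            output_dataset_field="answer",
        ),
        # optimization_direction defaults to "maximize"
    )

MiproOptimizer itself takes the metric and task configs as separate arguments rather than consuming OptimizationConfig.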
@@ -0,0 +1,49 @@
1
+ from typing import Dict, Callable, Optional, Any, Union
2
+
3
+ EVALUATED_LLM_TASK_OUTPUT = "_llm_task_output"
4
+
5
+ class Mapper:
6
+ """Base class for mapping functions that transform data between different formats."""
7
+
8
+ def __init__(self, name: Optional[str] = None, transform: Optional[Callable[[Any], Any]] = None):
9
+ if name is not None and transform is not None:
10
+ raise ValueError("Only one of name or transform can be provided")
11
+
12
+ self.name = name
13
+ self.transform = transform
14
+
15
+ def __call__(self, data: Any) -> Any:
16
+ if self.transform is not None:
17
+ return self.transform(data)
18
+ if self.name is not None:
19
+ return data[self.name]
20
+ return data
21
+
22
+ def from_dataset_field(*, name: Optional[str] = None, transform: Optional[Callable[[Dict[str, Any]], Any]] = None) -> Union[str, Callable[[Dict[str, Any]], Any]]:
23
+ if name is not None and transform is not None:
24
+ raise ValueError("Only one of name or transform can be provided")
25
+
26
+ if name is not None:
27
+ return name
28
+
29
+ if transform is not None:
30
+ return transform
31
+
32
+ raise ValueError("At least one of name or transform must be provided")
33
+
34
+
35
+ def from_llm_response_text() -> str:
36
+ return EVALUATED_LLM_TASK_OUTPUT
37
+
38
+
39
+ def from_agent_output(*, name: Optional[str] = None, transform: Optional[Callable[[Any], Any]] = None) -> Union[str, Callable[[Any], Any]]:
40
+ if name is not None and transform is not None:
41
+ raise ValueError("Only one of name or transform can be provided")
42
+
43
+ if name is not None:
44
+ return lambda agent_output: agent_output[name]
45
+
46
+ if transform is not None:
47
+ return transform
48
+
49
+ return EVALUATED_LLM_TASK_OUTPUT
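
A brief illustration of the call semantics defined in this file; the item, field names, and import path are assumptions.

    from opik_optimizer.optimization_config.mappers import (  # assumed module path
        EVALUATED_LLM_TASK_OUTPUT,
        Mapper,
        from_dataset_field,
        from_llm_response_text,
    )

    item = {"question": "What is the capital of France?", "answer": " Paris "}

    by_name = Mapper(name="answer")
    by_transform = Mapper(transform=lambda d: d["answer"].strip())
    passthrough = Mapper()

    assert by_name(item) == " Paris "      # dictionary lookup by field name
    assert by_transform(item) == "Paris"   # arbitrary callable applied to the item
    assert passthrough(item) is item       # no name and no transform: return the data unchanged

    # The module-level helpers return a field name, a callable, or the sentinel
    # standing in for the evaluated model's own response text.
    assert from_dataset_field(name="answer") == "answer"
    assert from_llm_response_text() == EVALUATED_LLM_TASK_OUTPUT

These return shapes match the Dict[str, Union[str, Callable[[Any], Any]]] type of MetricConfig.inputs above.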