opik-optimizer 2.1.3__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +0 -2
- opik_optimizer/base_optimizer.py +313 -144
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +345 -201
- opik_optimizer/gepa_optimizer/reporting.py +291 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +60 -15
- opik_optimizer/utils/__init__.py +3 -0
- opik_optimizer/utils/candidate_utils.py +52 -0
- opik_optimizer/utils/core.py +35 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/RECORD +36 -36
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/top_level.txt +0 -0
opik_optimizer/mipro_optimizer/mipro_optimizer.py

@@ -1,680 +0,0 @@
-import os
-import random
-from datetime import datetime
-from typing import Any, Literal
-from collections.abc import Callable
-import logging
-
-import dspy
-import litellm
-import opik
-from litellm.caching import Cache
-from opik import Dataset
-from opik.evaluation import evaluate
-from opik.integrations.dspy.callback import OpikCallback
-from opik.opik_context import get_current_span_data
-
-from ..optimization_result import OptimizationResult
-from ..base_optimizer import BaseOptimizer
-from ..optimization_config.configs import TaskConfig
-from ..optimization_config import chat_prompt
-from ._lm import LM
-from ._mipro_optimizer_v2 import MIPROv2
-from .utils import (
-    create_dspy_signature,
-    create_dspy_training_set,
-    get_tool_prompts,
-    opik_metric_to_dspy,
-)
-
-# Using disk cache for LLM calls
-disk_cache_dir = os.path.expanduser("~/.litellm_cache")
-litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
-
-logger = logging.getLogger(__name__)  # Inherits config from setup_logging
-
-
-class MiproOptimizer(BaseOptimizer):
-    def __init__(
-        self,
-        model,
-        project_name: str | None = None,
-        verbose: int = 1,
-        **model_kwargs,
-    ):
-        super().__init__(model=model, verbose=verbose, **model_kwargs)
-        self.tools = []
-        self.project_name = project_name
-        if "n_threads" in self.model_kwargs:
-            # To allow compatibility with other optimizers:
-            self.model_kwargs["num_threads"] = self.model_kwargs["n_threads"]
-        self.num_threads = self.model_kwargs.pop("num_threads", 6)
-        self.model_kwargs["model"] = self.model
-        # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
-        self.lm = LM(**self.model_kwargs)
-        setattr(self.lm, "parent_optimizer", self)
-        opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
-        dspy.configure(lm=self.lm, callbacks=[opik_callback])
-        logger.debug(f"Initialized MiproOptimizer with model: {model}")
-
-    def get_optimizer_metadata(self) -> dict[str, Any]:
-        return self._drop_none(
-            {
-                "project_name": self.project_name,
-                "num_threads": self.num_threads,
-            }
-        )
-
-    def evaluate_prompt(
-        self,
-        dataset: str | Dataset,
-        metric: Callable,
-        task_config: TaskConfig,
-        prompt: str | dspy.Module | OptimizationResult | None = None,
-        n_samples: int = 10,
-        dataset_item_ids: list[str] | None = None,
-        experiment_config: dict | None = None,
-        verbose: int = 1,
-        **kwargs,
-    ) -> float:
-        """
-        Compute the score of a prompt on dataset (or part thereof)
-
-        Args:
-            dataset: Opik dataset name or dataset
-            metric: Metric function to optimize
-            task_config: A TaskConfig instance
-            prompt: The prompt to evaluate
-            n_samples: number of items to test in the dataset
-            dataset_item_ids: Optional list of dataset item IDs to evaluate
-            experiment_config: Optional configuration for the experiment
-            verbose: Verbosity level
-            **kwargs: Additional arguments for evaluation
-
-        Returns:
-            Evaluation score
-        """
-        # FIMXE: call super when it is ready
-        # FIXME: Intermediate values:
-        self.increment_llm_counter()
-        input_key = task_config.input_dataset_fields[0]  # FIXME: allow all inputs
-        output_key = task_config.output_dataset_field
-
-        # Kwargs might contain n_samples, passed from run_benchmark.py
-        n_samples = kwargs.pop(
-            "n_samples", None
-        )  # Get n_samples from kwargs if present
-
-        if isinstance(dataset, str):
-            opik_client = opik.Opik(project_name=self.project_name)
-            dataset = opik_client.get_dataset(dataset)
-
-        def LLM(input: str) -> str:
-            if isinstance(prompt, str):
-                response = litellm.completion(
-                    messages=[
-                        {"role": "system", "content": prompt},
-                        {"role": "user", "content": input},
-                    ],
-                    metadata={
-                        "opik": {
-                            "current_span_data": get_current_span_data(),
-                            "tags": ["optimizer"],
-                        },
-                    },
-                    **self.model_kwargs,
-                )
-                return response.choices[0].message.content
-            elif isinstance(prompt, OptimizationResult):
-                if prompt.optimizer == "MiproOptimizer" and getattr(prompt, "details"):
-                    program = prompt.details["program"]
-                    result = program(**{input_key: input})
-                    return getattr(result, output_key)
-                else:
-                    response = litellm.completion(
-                        messages=[
-                            {"role": "system", "content": prompt.prompt},
-                            # FIXME: insert demonstrations here
-                            {"role": "user", "content": input},
-                        ],
-                        metadata={
-                            "opik": {
-                                "current_span_data": get_current_span_data(),
-                                "tags": ["optimizer"],
-                            },
-                        },
-                        **self.model_kwargs,
-                    )
-                    return response.choices[0].message.content
-            elif isinstance(prompt, dspy.Module):
-                result = prompt(**{input_key: input})
-                return getattr(result, output_key)
-            else:
-                raise Exception("I don't know how to evaluate this prompt: %r" % prompt)
-
-        def evaluation_task(dataset_item):
-            # Get the model output
-            model_output = LLM(dataset_item[input_key])
-
-            # Prepare the result with all required fields
-            result = {
-                "input": dataset_item[input_key],
-                "output": model_output,
-                "expected_output": dataset_item[output_key],
-                "reference": dataset_item[output_key],
-            }
-
-            # Add context if available, otherwise use input as context
-            result["context"] = dataset_item.get("context", dataset_item[input_key])
-
-            return result
-
-        # Robust n_samples handling for selecting dataset_item_ids
-        dataset_items_for_eval = dataset.get_items()
-        num_total_items = len(dataset_items_for_eval)
-        dataset_item_ids_to_use = dataset_item_ids  # Use provided IDs if any
-
-        if (
-            n_samples is not None
-        ):  # If n_samples is specified by the caller (run_benchmark.py)
-            if dataset_item_ids is not None:
-                # This case should ideally be an error or a clear precedence rule.
-                # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
-                logger.warning(
-                    "MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids."
-                )
-                # dataset_item_ids_to_use is already dataset_item_ids
-            elif n_samples > num_total_items:
-                logger.warning(
-                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items."
-                )
-                dataset_item_ids_to_use = (
-                    None  # opik.evaluation.evaluate handles None as all items
-                )
-            elif n_samples <= 0:
-                logger.warning(
-                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items."
-                )
-                dataset_item_ids_to_use = None
-            else:
-                # n_samples is valid and dataset_item_ids was not provided, so sample now.
-                all_ids = [item["id"] for item in dataset_items_for_eval]
-                dataset_item_ids_to_use = random.sample(all_ids, n_samples)
-                logger.info(
-                    f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation."
-                )
-        else:  # n_samples is None
-            if dataset_item_ids is None:
-                logger.info(
-                    f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items."
-                )
-            # dataset_item_ids_to_use is already dataset_item_ids (which could be None)
-
-        experiment_config = experiment_config or {}
-        experiment_config = {
-            **experiment_config,
-            **{
-                "optimizer": self.__class__.__name__,
-                "tools": (
-                    [f.__name__ for f in task_config.tools] if task_config.tools else []
-                ),
-                "metric": metric.__name__,
-                "dataset": dataset.name,
-            },
-        }
-        # Run evaluation with all metrics at once
-        evaluation = evaluate(
-            dataset=dataset,
-            task=evaluation_task,
-            scoring_metrics=[metric],
-            # "reference" needs to match metric
-            scoring_key_mapping={"reference": output_key},
-            task_threads=self.num_threads,
-            dataset_item_ids=dataset_item_ids_to_use,
-            project_name=self.project_name,
-            experiment_config=experiment_config,
-            verbose=verbose,
-        )
-
-        # Calculate average score across all metrics
-        total_score = 0
-        count = len(evaluation.test_results)
-        for i in range(count):
-            total_score += evaluation.test_results[i].score_results[0].value
-        score = total_score / count if count > 0 else 0.0
-
-        logger.debug(
-            f"Starting Mipro evaluation for prompt type: {type(prompt).__name__}"
-        )
-        logger.debug(f"Evaluation score: {score:.4f}")
-        return score
-
-    def optimize_prompt(
-        self,
-        prompt: chat_prompt.ChatPrompt,
-        dataset: str | Dataset,
-        metric: Callable,
-        experiment_config: dict | None = None,
-        n_samples: int | None = 10,
-        auto_continue: bool = False,
-        agent_class: str | None = None,
-        **kwargs,
-    ) -> OptimizationResult:
-        """
-        Optimize a prompt using MIPRO (Multi-Input Prompt Optimization).
-
-        Args:
-            prompt: The chat prompt to optimize
-            dataset: Opik dataset (or dataset name) containing evaluation data
-            metric: Evaluation function that takes (dataset_item, llm_output) and returns a score
-            experiment_config: Optional configuration for the experiment
-            n_samples: Number of samples to use for optimization (default: 10)
-            auto_continue: Whether to auto-continue optimization (default: False)
-            agent_class: Custom agent class to use (default: None)
-            **kwargs: Additional arguments including:
-                task_config: TaskConfig instance (required)
-                num_candidates: Number of candidates to generate (default: 10)
-                num_trials: Number of trials to run (default: 3)
-                auto: Optimization mode - "light", "medium", or "heavy" (default: "light")
-
-        Returns:
-            OptimizationResult: The optimization result containing the optimized prompt and metrics
-
-        Raises:
-            ValueError: If task_config is not provided
-        """
-        # Resolve dataset names to Dataset objects for validation compatibility
-        if isinstance(dataset, str):
-            dataset_name = dataset
-            client = opik.Opik(project_name=self.project_name)
-            dataset = client.get_dataset(dataset_name)
-
-        # Use base class validation and setup methods
-        self.validate_optimization_inputs(prompt, dataset, metric)
-
-        # Extract MIPRO-specific parameters from kwargs
-        task_config = kwargs.pop("task_config", None)
-        if task_config is None:
-            raise ValueError("task_config is required for MiproOptimizer")
-
-        num_candidates = kwargs.pop("num_candidates", 10)
-        num_trials = kwargs.pop("num_trials", 3)
-        auto = kwargs.pop("auto", "light")
-
-        with self.create_optimization_context(dataset, metric) as optimization:
-            result = self._optimize_prompt(
-                dataset=dataset,
-                metric=metric,
-                task_config=task_config,
-                num_candidates=num_candidates,
-                experiment_config=experiment_config,
-                optimization_id=optimization.id if optimization is not None else None,
-                num_trials=num_trials,
-                n_samples=n_samples,
-                auto=auto,
-                **kwargs,
-            )
-            return result
-
-    def _optimize_prompt(
-        self,
-        dataset: str | Dataset,
-        metric: Callable,
-        task_config: TaskConfig,
-        num_candidates: int = 10,
-        experiment_config: dict | None = None,
-        optimization_id: str | None = None,
-        num_trials: int | None = 3,
-        n_samples: int | None = 10,
-        auto: Literal["light", "medium", "heavy"] | None = "light",
-        **kwargs,
-    ) -> OptimizationResult:
-        logger.info("Preparing MIPRO optimization...")
-        self.prepare_optimize_prompt(
-            dataset=dataset,
-            metric=metric,
-            task_config=task_config,
-            num_candidates=num_candidates,
-            experiment_config=experiment_config,
-            optimization_id=optimization_id,
-            num_trials=num_trials,
-            n_samples=n_samples,
-            auto=auto,
-            **kwargs,
-        )
-        logger.info("Starting MIPRO compilation...")
-        result = self.continue_optimize_prompt()
-        logger.info("MIPRO optimization complete.")
-        return result
-
-    def prepare_optimize_prompt(
-        self,
-        dataset,
-        metric,
-        task_config,
-        num_candidates: int = 10,
-        experiment_config: dict | None = None,
-        optimization_id: str | None = None,
-        num_trials: int | None = 3,
-        n_samples: int | None = 10,
-        auto: Literal["light", "medium", "heavy"] | None = "light",
-        **kwargs,
-    ) -> None:
-        # FIXME: Intermediate values:
-        self.reset_counters()  # Reset counters for run
-        prompt = task_config.instruction_prompt
-        input_key = task_config.input_dataset_fields[0]  # FIXME: allow all
-        output_key = task_config.output_dataset_field
-        self.tools = task_config.tools
-        self.num_candidates = num_candidates
-        self.auto = auto
-        self.input_key = input_key
-        self.output_key = output_key
-        self.prompt = prompt
-        self.num_trials = num_trials
-        self.n_samples = n_samples
-
-        # Convert to values for MIPRO:
-        if isinstance(dataset, str):
-            opik_client = opik.Opik(project_name=self.project_name)
-            self.dataset = opik_client.get_dataset(dataset).get_items()
-        else:
-            self.dataset = dataset.get_items()
-
-        # Validate dataset:
-        for row in self.dataset:
-            if self.input_key not in row:
-                raise Exception("row does not contain input_key: %r" % self.input_key)
-            if self.output_key not in row:
-                raise Exception("row does not contain output_key: %r" % self.output_key)
-
-        self.trainset = create_dspy_training_set(
-            self.dataset, self.input_key, self.n_samples
-        )
-        self.data_signature = create_dspy_signature(
-            self.input_key, self.output_key, self.prompt
-        )
-
-        if self.tools:
-            self.module = dspy.ReAct(self.data_signature, tools=self.tools)
-        else:
-            self.module = dspy.Predict(self.data_signature)
-
-        # Convert the metric to a DSPy-compatible function
-        self.metric_function = opik_metric_to_dspy(metric, self.output_key)
-        self.opik_metric = metric
-        log_dir = os.path.expanduser("~/.opik-optimizer-checkpoints")
-        os.makedirs(log_dir, exist_ok=True)
-
-        experiment_config = experiment_config or {}
-        experiment_config = {
-            **experiment_config,
-            **{
-                "optimizer": self.__class__.__name__,
-                "tools": [f.__name__ for f in self.tools],
-                "metric": metric.__name__,
-                "num_threads": self.num_threads,
-                "num_candidates": self.num_candidates,
-                "num_trials": self.num_trials,
-                "dataset": dataset.name,
-            },
-        }
-
-        # Initialize the optimizer:
-        self.optimizer = MIPROv2(
-            metric=self.metric_function,
-            auto=self.auto,
-            num_threads=self.num_threads,
-            verbose=(self.verbose == 1),
-            num_candidates=self.num_candidates,
-            seed=self.seed,
-            opik_prompt_task_config=task_config,
-            opik_dataset=dataset,
-            opik_project_name=self.project_name,
-            opik_metric=metric,
-            opik_optimization_id=optimization_id,
-            log_dir=log_dir,
-            experiment_config=experiment_config,
-        )
-
-        logger.debug("Created DSPy training set.")
-        logger.debug(f"Using DSPy module: {type(self.module).__name__}")
-        logger.debug(f"Using metric function: {self.metric_function.__name__}")
-
-    def cleanup(self) -> None:
-        """
-        Clean up MIPRO-specific resources.
-        """
-        # Call parent cleanup
-        super().cleanup()
-
-        # Clear MIPRO-specific resources
-        self.tools = None
-        self.prompt = None
-
-        logger.debug("Cleaned up MIPRO-specific resources")
-
-    def load_from_checkpoint(self, filename):
-        """
-        Load the module from a checkpoint.
-        """
-        self.module.load(os.path.expanduser(filename))
-
-    def continue_optimize_prompt(self):
-        """
-        Continue to look for optimizations
-        """
-        if not hasattr(self, "optimizer") or not self.optimizer:
-            raise RuntimeError(
-                "MiproOptimizer not prepared. Call prepare_optimize_prompt first."
-            )
-
-        self.results = self.optimizer.compile(
-            student=self.module,
-            trainset=self.trainset,
-            provide_traceback=True,
-            requires_permission_to_run=False,
-            num_trials=self.num_trials,
-        )
-        self.best_programs = sorted(
-            self.results.candidate_programs,
-            key=lambda item: item["score"],
-            reverse=True,
-        )
-
-        mipro_history_processed = []
-        # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
-        # If self.num_candidates is 0 or None, this logic might break or be odd.
-        # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
-        num_candidates_per_round = (  # noqa
-            self.num_candidates
-            if hasattr(self, "num_candidates")
-            and self.num_candidates
-            and self.num_candidates > 0
-            else 1
-        )
-
-        for i, candidate_data in enumerate(self.results.candidate_programs):
-            program_module = candidate_data.get("program")
-            instruction = "N/A"
-            if hasattr(program_module, "signature") and hasattr(
-                program_module.signature, "instructions"
-            ):
-                instruction = program_module.signature.instructions
-            elif hasattr(program_module, "extended_signature") and hasattr(
-                program_module.extended_signature, "instructions"
-            ):
-                instruction = program_module.extended_signature.instructions
-            elif (
-                hasattr(program_module, "predictor")
-                and hasattr(program_module.predictor, "signature")
-                and hasattr(program_module.predictor.signature, "instructions")
-            ):
-                instruction = program_module.predictor.signature.instructions
-
-            # Remove R and C calculation for Mipro as its history is flat
-            # current_round_number = (i // num_candidates_per_round) + 1
-            # current_candidate_in_round = (i % num_candidates_per_round) + 1
-
-            iter_detail = {
-                "iteration": i + 1,
-                # "round_number": current_round_number,  # Remove round_number
-                # "candidate_in_round": current_candidate_in_round,  # Remove candidate_in_round
-                "timestamp": datetime.now().isoformat(),
-                "prompt_candidate": instruction,
-                "parameters_used": {"program_summary": str(program_module)[:500]},
-                "scores": [],  # Initialize scores list
-                "tokens_used": None,  # TODO: add tokens_used
-                "cost": None,  # TODO: add cost
-                "duration_seconds": None,  # TODO: add duration_seconds
-            }
-
-            current_score = candidate_data.get("score")
-            metric_name_for_history = self.opik_metric.__name__
-
-            # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
-            # For now, specifically targeting Levenshtein-like metrics
-            if isinstance(current_score, (float, int)) and (
-                "levenshtein" in metric_name_for_history.lower()
-                or "similarity" in metric_name_for_history.lower()
-            ):
-                # Assuming scores like 32.4 are 0-1 scores scaled by 100
-                if abs(current_score) > 1.0:  # A simple check to see if it looks scaled
-                    logger.debug(
-                        f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100."
-                    )
-                    current_score /= 100.0
-
-            iter_detail["scores"].append(
-                {
-                    "metric_name": metric_name_for_history,
-                    "score": current_score,
-                    "opik_evaluation_id": None,  # TODO: add opik_evaluation_id
-                }
-            )
-            mipro_history_processed.append(iter_detail)
-
-        if not self.best_programs:
-            logger.warning("MIPRO compile returned no candidate programs.")
-            return OptimizationResult(
-                optimizer="MiproOptimizer",
-                prompt=[
-                    {
-                        "role": "user",
-                        "content": getattr(
-                            self, "prompt", "Error: Initial prompt not found"
-                        ),
-                    }
-                ],
-                score=0.0,
-                metric_name=(
-                    self.opik_metric.__name__
-                    if hasattr(self, "opik_metric")
-                    else "unknown_metric"
-                ),
-                details={"error": "No candidate programs generated by MIPRO"},
-                history=mipro_history_processed,
-                llm_calls=self.llm_call_counter,
-                tool_calls=self.tool_call_counter,
-            )
-
-        self.module = self.get_best().details["program"]
-        best_program_details = self.get_best()
-
-        # Unscale the main score if necessary, similar to history scores
-        final_best_score = best_program_details.score
-        final_metric_name = best_program_details.metric_name
-        if (
-            isinstance(final_best_score, (float, int))
-            and final_metric_name
-            and (
-                "levenshtein" in final_metric_name.lower()
-                or "similarity" in final_metric_name.lower()
-            )
-        ):
-            if abs(final_best_score) > 1.0:  # A simple check to see if it looks scaled
-                logger.debug(
-                    f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100."
-                )
-                final_best_score /= 100.0
-
-        return OptimizationResult(
-            optimizer="MiproOptimizer",
-            prompt=best_program_details.prompt,
-            tool_prompts=best_program_details.tool_prompts,
-            score=final_best_score,  # Use the potentially unscaled score
-            metric_name=final_metric_name,
-            demonstrations=best_program_details.demonstrations,
-            details=best_program_details.details,
-            history=mipro_history_processed,
-            llm_calls=self.llm_call_counter,
-            tool_calls=self.tool_call_counter,
-        )
-
-    def get_best(self, position: int = 0) -> OptimizationResult:
-        if not hasattr(self, "best_programs") or not self.best_programs:
-            logger.error(
-                "get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results."
-            )
-            # Get LLM call count from the optimizer if available
-            dspy_llm_calls = (
-                getattr(self.optimizer, "total_calls", 0)
-                if hasattr(self, "optimizer") and self.optimizer
-                else 0
-            )
-            actual_llm_calls = max(self.llm_call_counter, dspy_llm_calls)
-
-            return OptimizationResult(
-                optimizer="MiproOptimizer",
-                prompt=[
-                    {
-                        "role": "user",
-                        "content": getattr(
-                            self, "prompt", "Error: Initial prompt not found"
-                        ),
-                    }
-                ],
-                score=0.0,
-                metric_name=(
-                    getattr(self, "opik_metric", None).name
-                    if hasattr(self, "opik_metric") and self.opik_metric
-                    else "unknown_metric"
-                ),
-                details={"error": "No programs generated or compile failed"},
-                history=[],
-                llm_calls=actual_llm_calls,
-                tool_calls=self.tool_call_counter,
-            )
-
-        score = self.best_programs[position]["score"]
-        program_module = self.best_programs[position]["program"]
-        state = program_module.dump_state()
-        if self.tools:
-            tool_names = [tool.__name__ for tool in self.tools]
-            tool_prompts = get_tool_prompts(
-                tool_names, state["react"]["signature"]["instructions"]
-            )
-            best_prompt = state["react"]["signature"]["instructions"]
-            demos = [x.toDict() for x in state["react"]["demos"]]
-        else:
-            tool_prompts = None
-            best_prompt = state["signature"]["instructions"]
-            demos = [x.toDict() for x in state["demos"]]
-
-        # Get LLM call count from the DSPy program module
-        dspy_llm_calls = getattr(program_module, "total_calls", 0)
-        # Use the higher of our counter or DSPy's counter
-        actual_llm_calls = max(self.llm_call_counter, dspy_llm_calls)
-
-        print(best_prompt)
-        return OptimizationResult(
-            optimizer="MiproOptimizer",
-            prompt=[{"role": "user", "content": best_prompt}],
-            tool_prompts=tool_prompts,
-            score=score,
-            metric_name=self.opik_metric.__name__,
-            demonstrations=demos,
-            details={"program": program_module},
-            llm_calls=actual_llm_calls,
-            tool_calls=self.tool_call_counter,
-        )