opik-optimizer 2.1.3__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. opik_optimizer/__init__.py +0 -2
  2. opik_optimizer/base_optimizer.py +314 -145
  3. opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
  4. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
  5. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
  6. opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
  9. opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
  10. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
  11. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
  12. opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
  13. opik_optimizer/gepa_optimizer/reporting.py +164 -22
  14. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
  15. opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
  16. opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
  17. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
  18. opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
  19. opik_optimizer/mipro_optimizer/__init__.py +2 -2
  20. opik_optimizer/mipro_optimizer/_lm.py +4 -4
  21. opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
  22. opik_optimizer/mipro_optimizer/utils.py +1 -0
  23. opik_optimizer/optimizable_agent.py +7 -4
  24. opik_optimizer/optimization_config/chat_prompt.py +7 -10
  25. opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
  26. opik_optimizer/parameter_optimizer/reporting.py +148 -0
  27. opik_optimizer/reporting_utils.py +42 -15
  28. opik_optimizer/utils/core.py +16 -2
  29. opik_optimizer/utils/prompt_segments.py +1 -2
  30. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
  31. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +34 -35
  32. opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
  33. opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
  34. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
  35. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
  36. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
@@ -1,680 +0,0 @@
-import os
-import random
-from datetime import datetime
-from typing import Any, Literal
-from collections.abc import Callable
-import logging
-
-import dspy
-import litellm
-import opik
-from litellm.caching import Cache
-from opik import Dataset
-from opik.evaluation import evaluate
-from opik.integrations.dspy.callback import OpikCallback
-from opik.opik_context import get_current_span_data
-
-from ..optimization_result import OptimizationResult
-from ..base_optimizer import BaseOptimizer
-from ..optimization_config.configs import TaskConfig
-from ..optimization_config import chat_prompt
-from ._lm import LM
-from ._mipro_optimizer_v2 import MIPROv2
-from .utils import (
-    create_dspy_signature,
-    create_dspy_training_set,
-    get_tool_prompts,
-    opik_metric_to_dspy,
-)
-
-# Using disk cache for LLM calls
-disk_cache_dir = os.path.expanduser("~/.litellm_cache")
-litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
-
-logger = logging.getLogger(__name__)  # Inherits config from setup_logging
-
-
-class MiproOptimizer(BaseOptimizer):
-    def __init__(
-        self,
-        model,
-        project_name: str | None = None,
-        verbose: int = 1,
-        **model_kwargs,
-    ):
-        super().__init__(model=model, verbose=verbose, **model_kwargs)
-        self.tools = []
-        self.project_name = project_name
-        if "n_threads" in self.model_kwargs:
-            # To allow compatibility with other optimizers:
-            self.model_kwargs["num_threads"] = self.model_kwargs["n_threads"]
-        self.num_threads = self.model_kwargs.pop("num_threads", 6)
-        self.model_kwargs["model"] = self.model
-        # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
-        self.lm = LM(**self.model_kwargs)
-        setattr(self.lm, "parent_optimizer", self)
-        opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
-        dspy.configure(lm=self.lm, callbacks=[opik_callback])
-        logger.debug(f"Initialized MiproOptimizer with model: {model}")
-
-    def get_optimizer_metadata(self) -> dict[str, Any]:
-        return self._drop_none(
-            {
-                "project_name": self.project_name,
-                "num_threads": self.num_threads,
-            }
-        )
-
-    def evaluate_prompt(
-        self,
-        dataset: str | Dataset,
-        metric: Callable,
-        task_config: TaskConfig,
-        prompt: str | dspy.Module | OptimizationResult | None = None,
-        n_samples: int = 10,
-        dataset_item_ids: list[str] | None = None,
-        experiment_config: dict | None = None,
-        verbose: int = 1,
-        **kwargs,
-    ) -> float:
-        """
-        Compute the score of a prompt on dataset (or part thereof)
-
-        Args:
-            dataset: Opik dataset name or dataset
-            metric: Metric function to optimize
-            task_config: A TaskConfig instance
-            prompt: The prompt to evaluate
-            n_samples: number of items to test in the dataset
-            dataset_item_ids: Optional list of dataset item IDs to evaluate
-            experiment_config: Optional configuration for the experiment
-            verbose: Verbosity level
-            **kwargs: Additional arguments for evaluation
-
-        Returns:
-            Evaluation score
-        """
-        # FIMXE: call super when it is ready
-        # FIXME: Intermediate values:
-        self.increment_llm_counter()
-        input_key = task_config.input_dataset_fields[0]  # FIXME: allow all inputs
-        output_key = task_config.output_dataset_field
-
-        # Kwargs might contain n_samples, passed from run_benchmark.py
-        n_samples = kwargs.pop(
-            "n_samples", None
-        )  # Get n_samples from kwargs if present
-
-        if isinstance(dataset, str):
-            opik_client = opik.Opik(project_name=self.project_name)
-            dataset = opik_client.get_dataset(dataset)
-
-        def LLM(input: str) -> str:
-            if isinstance(prompt, str):
-                response = litellm.completion(
-                    messages=[
-                        {"role": "system", "content": prompt},
-                        {"role": "user", "content": input},
-                    ],
-                    metadata={
-                        "opik": {
-                            "current_span_data": get_current_span_data(),
-                            "tags": ["optimizer"],
-                        },
-                    },
-                    **self.model_kwargs,
-                )
-                return response.choices[0].message.content
-            elif isinstance(prompt, OptimizationResult):
-                if prompt.optimizer == "MiproOptimizer" and getattr(prompt, "details"):
-                    program = prompt.details["program"]
-                    result = program(**{input_key: input})
-                    return getattr(result, output_key)
-                else:
-                    response = litellm.completion(
-                        messages=[
-                            {"role": "system", "content": prompt.prompt},
-                            # FIXME: insert demonstrations here
-                            {"role": "user", "content": input},
-                        ],
-                        metadata={
-                            "opik": {
-                                "current_span_data": get_current_span_data(),
-                                "tags": ["optimizer"],
-                            },
-                        },
-                        **self.model_kwargs,
-                    )
-                    return response.choices[0].message.content
-            elif isinstance(prompt, dspy.Module):
-                result = prompt(**{input_key: input})
-                return getattr(result, output_key)
-            else:
-                raise Exception("I don't know how to evaluate this prompt: %r" % prompt)
-
-        def evaluation_task(dataset_item):
-            # Get the model output
-            model_output = LLM(dataset_item[input_key])
-
-            # Prepare the result with all required fields
-            result = {
-                "input": dataset_item[input_key],
-                "output": model_output,
-                "expected_output": dataset_item[output_key],
-                "reference": dataset_item[output_key],
-            }
-
-            # Add context if available, otherwise use input as context
-            result["context"] = dataset_item.get("context", dataset_item[input_key])
-
-            return result
-
-        # Robust n_samples handling for selecting dataset_item_ids
-        dataset_items_for_eval = dataset.get_items()
-        num_total_items = len(dataset_items_for_eval)
-        dataset_item_ids_to_use = dataset_item_ids  # Use provided IDs if any
-
-        if (
-            n_samples is not None
-        ):  # If n_samples is specified by the caller (run_benchmark.py)
-            if dataset_item_ids is not None:
-                # This case should ideally be an error or a clear precedence rule.
-                # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
-                logger.warning(
-                    "MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids."
-                )
-                # dataset_item_ids_to_use is already dataset_item_ids
-            elif n_samples > num_total_items:
-                logger.warning(
-                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items."
-                )
-                dataset_item_ids_to_use = (
-                    None  # opik.evaluation.evaluate handles None as all items
-                )
-            elif n_samples <= 0:
-                logger.warning(
-                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items."
-                )
-                dataset_item_ids_to_use = None
-            else:
-                # n_samples is valid and dataset_item_ids was not provided, so sample now.
-                all_ids = [item["id"] for item in dataset_items_for_eval]
-                dataset_item_ids_to_use = random.sample(all_ids, n_samples)
-                logger.info(
-                    f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation."
-                )
-        else:  # n_samples is None
-            if dataset_item_ids is None:
-                logger.info(
-                    f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items."
-                )
-            # dataset_item_ids_to_use is already dataset_item_ids (which could be None)
-
-        experiment_config = experiment_config or {}
-        experiment_config = {
-            **experiment_config,
-            **{
-                "optimizer": self.__class__.__name__,
-                "tools": (
-                    [f.__name__ for f in task_config.tools] if task_config.tools else []
-                ),
-                "metric": metric.__name__,
-                "dataset": dataset.name,
-            },
-        }
-        # Run evaluation with all metrics at once
-        evaluation = evaluate(
-            dataset=dataset,
-            task=evaluation_task,
-            scoring_metrics=[metric],
-            # "reference" needs to match metric
-            scoring_key_mapping={"reference": output_key},
-            task_threads=self.num_threads,
-            dataset_item_ids=dataset_item_ids_to_use,
-            project_name=self.project_name,
-            experiment_config=experiment_config,
-            verbose=verbose,
-        )
-
-        # Calculate average score across all metrics
-        total_score = 0
-        count = len(evaluation.test_results)
-        for i in range(count):
-            total_score += evaluation.test_results[i].score_results[0].value
-        score = total_score / count if count > 0 else 0.0
-
-        logger.debug(
-            f"Starting Mipro evaluation for prompt type: {type(prompt).__name__}"
-        )
-        logger.debug(f"Evaluation score: {score:.4f}")
-        return score
-
-    def optimize_prompt(
-        self,
-        prompt: chat_prompt.ChatPrompt,
-        dataset: str | Dataset,
-        metric: Callable,
-        experiment_config: dict | None = None,
-        n_samples: int | None = 10,
-        auto_continue: bool = False,
-        agent_class: str | None = None,
-        **kwargs,
-    ) -> OptimizationResult:
-        """
-        Optimize a prompt using MIPRO (Multi-Input Prompt Optimization).
-
-        Args:
-            prompt: The chat prompt to optimize
-            dataset: Opik dataset (or dataset name) containing evaluation data
-            metric: Evaluation function that takes (dataset_item, llm_output) and returns a score
-            experiment_config: Optional configuration for the experiment
-            n_samples: Number of samples to use for optimization (default: 10)
-            auto_continue: Whether to auto-continue optimization (default: False)
-            agent_class: Custom agent class to use (default: None)
-            **kwargs: Additional arguments including:
-                task_config: TaskConfig instance (required)
-                num_candidates: Number of candidates to generate (default: 10)
-                num_trials: Number of trials to run (default: 3)
-                auto: Optimization mode - "light", "medium", or "heavy" (default: "light")
-
-        Returns:
-            OptimizationResult: The optimization result containing the optimized prompt and metrics
-
-        Raises:
-            ValueError: If task_config is not provided
-        """
-        # Resolve dataset names to Dataset objects for validation compatibility
-        if isinstance(dataset, str):
-            dataset_name = dataset
-            client = opik.Opik(project_name=self.project_name)
-            dataset = client.get_dataset(dataset_name)
-
-        # Use base class validation and setup methods
-        self.validate_optimization_inputs(prompt, dataset, metric)
-
-        # Extract MIPRO-specific parameters from kwargs
-        task_config = kwargs.pop("task_config", None)
-        if task_config is None:
-            raise ValueError("task_config is required for MiproOptimizer")
-
-        num_candidates = kwargs.pop("num_candidates", 10)
-        num_trials = kwargs.pop("num_trials", 3)
-        auto = kwargs.pop("auto", "light")
-
-        with self.create_optimization_context(dataset, metric) as optimization:
-            result = self._optimize_prompt(
-                dataset=dataset,
-                metric=metric,
-                task_config=task_config,
-                num_candidates=num_candidates,
-                experiment_config=experiment_config,
-                optimization_id=optimization.id if optimization is not None else None,
-                num_trials=num_trials,
-                n_samples=n_samples,
-                auto=auto,
-                **kwargs,
-            )
-            return result
-
-    def _optimize_prompt(
-        self,
-        dataset: str | Dataset,
-        metric: Callable,
-        task_config: TaskConfig,
-        num_candidates: int = 10,
-        experiment_config: dict | None = None,
-        optimization_id: str | None = None,
-        num_trials: int | None = 3,
-        n_samples: int | None = 10,
-        auto: Literal["light", "medium", "heavy"] | None = "light",
-        **kwargs,
-    ) -> OptimizationResult:
-        logger.info("Preparing MIPRO optimization...")
-        self.prepare_optimize_prompt(
-            dataset=dataset,
-            metric=metric,
-            task_config=task_config,
-            num_candidates=num_candidates,
-            experiment_config=experiment_config,
-            optimization_id=optimization_id,
-            num_trials=num_trials,
-            n_samples=n_samples,
-            auto=auto,
-            **kwargs,
-        )
-        logger.info("Starting MIPRO compilation...")
-        result = self.continue_optimize_prompt()
-        logger.info("MIPRO optimization complete.")
-        return result
-
-    def prepare_optimize_prompt(
-        self,
-        dataset,
-        metric,
-        task_config,
-        num_candidates: int = 10,
-        experiment_config: dict | None = None,
-        optimization_id: str | None = None,
-        num_trials: int | None = 3,
-        n_samples: int | None = 10,
-        auto: Literal["light", "medium", "heavy"] | None = "light",
-        **kwargs,
-    ) -> None:
-        # FIXME: Intermediate values:
-        self.reset_counters()  # Reset counters for run
-        prompt = task_config.instruction_prompt
-        input_key = task_config.input_dataset_fields[0]  # FIXME: allow all
-        output_key = task_config.output_dataset_field
-        self.tools = task_config.tools
-        self.num_candidates = num_candidates
-        self.auto = auto
-        self.input_key = input_key
-        self.output_key = output_key
-        self.prompt = prompt
-        self.num_trials = num_trials
-        self.n_samples = n_samples
-
-        # Convert to values for MIPRO:
-        if isinstance(dataset, str):
-            opik_client = opik.Opik(project_name=self.project_name)
-            self.dataset = opik_client.get_dataset(dataset).get_items()
-        else:
-            self.dataset = dataset.get_items()
-
-        # Validate dataset:
-        for row in self.dataset:
-            if self.input_key not in row:
-                raise Exception("row does not contain input_key: %r" % self.input_key)
-            if self.output_key not in row:
-                raise Exception("row does not contain output_key: %r" % self.output_key)
-
-        self.trainset = create_dspy_training_set(
-            self.dataset, self.input_key, self.n_samples
-        )
-        self.data_signature = create_dspy_signature(
-            self.input_key, self.output_key, self.prompt
-        )
-
-        if self.tools:
-            self.module = dspy.ReAct(self.data_signature, tools=self.tools)
-        else:
-            self.module = dspy.Predict(self.data_signature)
-
-        # Convert the metric to a DSPy-compatible function
-        self.metric_function = opik_metric_to_dspy(metric, self.output_key)
-        self.opik_metric = metric
-        log_dir = os.path.expanduser("~/.opik-optimizer-checkpoints")
-        os.makedirs(log_dir, exist_ok=True)
-
-        experiment_config = experiment_config or {}
-        experiment_config = {
-            **experiment_config,
-            **{
-                "optimizer": self.__class__.__name__,
-                "tools": [f.__name__ for f in self.tools],
-                "metric": metric.__name__,
-                "num_threads": self.num_threads,
-                "num_candidates": self.num_candidates,
-                "num_trials": self.num_trials,
-                "dataset": dataset.name,
-            },
-        }
-
-        # Initialize the optimizer:
-        self.optimizer = MIPROv2(
-            metric=self.metric_function,
-            auto=self.auto,
-            num_threads=self.num_threads,
-            verbose=(self.verbose == 1),
-            num_candidates=self.num_candidates,
-            seed=self.seed,
-            opik_prompt_task_config=task_config,
-            opik_dataset=dataset,
-            opik_project_name=self.project_name,
-            opik_metric=metric,
-            opik_optimization_id=optimization_id,
-            log_dir=log_dir,
-            experiment_config=experiment_config,
-        )
-
-        logger.debug("Created DSPy training set.")
-        logger.debug(f"Using DSPy module: {type(self.module).__name__}")
-        logger.debug(f"Using metric function: {self.metric_function.__name__}")
-
-    def cleanup(self) -> None:
-        """
-        Clean up MIPRO-specific resources.
-        """
-        # Call parent cleanup
-        super().cleanup()
-
-        # Clear MIPRO-specific resources
-        self.tools = None
-        self.prompt = None
-
-        logger.debug("Cleaned up MIPRO-specific resources")
-
-    def load_from_checkpoint(self, filename):
-        """
-        Load the module from a checkpoint.
-        """
-        self.module.load(os.path.expanduser(filename))
-
-    def continue_optimize_prompt(self):
-        """
-        Continue to look for optimizations
-        """
-        if not hasattr(self, "optimizer") or not self.optimizer:
-            raise RuntimeError(
-                "MiproOptimizer not prepared. Call prepare_optimize_prompt first."
-            )
-
-        self.results = self.optimizer.compile(
-            student=self.module,
-            trainset=self.trainset,
-            provide_traceback=True,
-            requires_permission_to_run=False,
-            num_trials=self.num_trials,
-        )
-        self.best_programs = sorted(
-            self.results.candidate_programs,
-            key=lambda item: item["score"],
-            reverse=True,
-        )
-
-        mipro_history_processed = []
-        # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
-        # If self.num_candidates is 0 or None, this logic might break or be odd.
-        # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
-        num_candidates_per_round = (  # noqa
-            self.num_candidates
-            if hasattr(self, "num_candidates")
-            and self.num_candidates
-            and self.num_candidates > 0
-            else 1
-        )
-
-        for i, candidate_data in enumerate(self.results.candidate_programs):
-            program_module = candidate_data.get("program")
-            instruction = "N/A"
-            if hasattr(program_module, "signature") and hasattr(
-                program_module.signature, "instructions"
-            ):
-                instruction = program_module.signature.instructions
-            elif hasattr(program_module, "extended_signature") and hasattr(
-                program_module.extended_signature, "instructions"
-            ):
-                instruction = program_module.extended_signature.instructions
-            elif (
-                hasattr(program_module, "predictor")
-                and hasattr(program_module.predictor, "signature")
-                and hasattr(program_module.predictor.signature, "instructions")
-            ):
-                instruction = program_module.predictor.signature.instructions
-
-            # Remove R and C calculation for Mipro as its history is flat
-            # current_round_number = (i // num_candidates_per_round) + 1
-            # current_candidate_in_round = (i % num_candidates_per_round) + 1
-
-            iter_detail = {
-                "iteration": i + 1,
-                # "round_number": current_round_number,  # Remove round_number
-                # "candidate_in_round": current_candidate_in_round,  # Remove candidate_in_round
-                "timestamp": datetime.now().isoformat(),
-                "prompt_candidate": instruction,
-                "parameters_used": {"program_summary": str(program_module)[:500]},
-                "scores": [],  # Initialize scores list
-                "tokens_used": None,  # TODO: add tokens_used
-                "cost": None,  # TODO: add cost
-                "duration_seconds": None,  # TODO: add duration_seconds
-            }
-
-            current_score = candidate_data.get("score")
-            metric_name_for_history = self.opik_metric.__name__
-
-            # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
-            # For now, specifically targeting Levenshtein-like metrics
-            if isinstance(current_score, (float, int)) and (
-                "levenshtein" in metric_name_for_history.lower()
-                or "similarity" in metric_name_for_history.lower()
-            ):
-                # Assuming scores like 32.4 are 0-1 scores scaled by 100
-                if abs(current_score) > 1.0:  # A simple check to see if it looks scaled
-                    logger.debug(
-                        f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100."
-                    )
-                    current_score /= 100.0
-
-            iter_detail["scores"].append(
-                {
-                    "metric_name": metric_name_for_history,
-                    "score": current_score,
-                    "opik_evaluation_id": None,  # TODO: add opik_evaluation_id
-                }
-            )
-            mipro_history_processed.append(iter_detail)
-
-        if not self.best_programs:
-            logger.warning("MIPRO compile returned no candidate programs.")
-            return OptimizationResult(
-                optimizer="MiproOptimizer",
-                prompt=[
-                    {
-                        "role": "user",
-                        "content": getattr(
-                            self, "prompt", "Error: Initial prompt not found"
-                        ),
-                    }
-                ],
-                score=0.0,
-                metric_name=(
-                    self.opik_metric.__name__
-                    if hasattr(self, "opik_metric")
-                    else "unknown_metric"
-                ),
-                details={"error": "No candidate programs generated by MIPRO"},
-                history=mipro_history_processed,
-                llm_calls=self.llm_call_counter,
-                tool_calls=self.tool_call_counter,
-            )
-
-        self.module = self.get_best().details["program"]
-        best_program_details = self.get_best()
-
-        # Unscale the main score if necessary, similar to history scores
-        final_best_score = best_program_details.score
-        final_metric_name = best_program_details.metric_name
-        if (
-            isinstance(final_best_score, (float, int))
-            and final_metric_name
-            and (
-                "levenshtein" in final_metric_name.lower()
-                or "similarity" in final_metric_name.lower()
-            )
-        ):
-            if abs(final_best_score) > 1.0:  # A simple check to see if it looks scaled
-                logger.debug(
-                    f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100."
-                )
-                final_best_score /= 100.0
-
-        return OptimizationResult(
-            optimizer="MiproOptimizer",
-            prompt=best_program_details.prompt,
-            tool_prompts=best_program_details.tool_prompts,
-            score=final_best_score,  # Use the potentially unscaled score
-            metric_name=final_metric_name,
-            demonstrations=best_program_details.demonstrations,
-            details=best_program_details.details,
-            history=mipro_history_processed,
-            llm_calls=self.llm_call_counter,
-            tool_calls=self.tool_call_counter,
-        )
-
-    def get_best(self, position: int = 0) -> OptimizationResult:
-        if not hasattr(self, "best_programs") or not self.best_programs:
-            logger.error(
-                "get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results."
-            )
-            # Get LLM call count from the optimizer if available
-            dspy_llm_calls = (
-                getattr(self.optimizer, "total_calls", 0)
-                if hasattr(self, "optimizer") and self.optimizer
-                else 0
-            )
-            actual_llm_calls = max(self.llm_call_counter, dspy_llm_calls)
-
-            return OptimizationResult(
-                optimizer="MiproOptimizer",
-                prompt=[
-                    {
-                        "role": "user",
-                        "content": getattr(
-                            self, "prompt", "Error: Initial prompt not found"
-                        ),
-                    }
-                ],
-                score=0.0,
-                metric_name=(
-                    getattr(self, "opik_metric", None).name
-                    if hasattr(self, "opik_metric") and self.opik_metric
-                    else "unknown_metric"
-                ),
-                details={"error": "No programs generated or compile failed"},
-                history=[],
-                llm_calls=actual_llm_calls,
-                tool_calls=self.tool_call_counter,
-            )
-
-        score = self.best_programs[position]["score"]
-        program_module = self.best_programs[position]["program"]
-        state = program_module.dump_state()
-        if self.tools:
-            tool_names = [tool.__name__ for tool in self.tools]
-            tool_prompts = get_tool_prompts(
-                tool_names, state["react"]["signature"]["instructions"]
-            )
-            best_prompt = state["react"]["signature"]["instructions"]
-            demos = [x.toDict() for x in state["react"]["demos"]]
-        else:
-            tool_prompts = None
-            best_prompt = state["signature"]["instructions"]
-            demos = [x.toDict() for x in state["demos"]]
-
-        # Get LLM call count from the DSPy program module
-        dspy_llm_calls = getattr(program_module, "total_calls", 0)
-        # Use the higher of our counter or DSPy's counter
-        actual_llm_calls = max(self.llm_call_counter, dspy_llm_calls)
-
-        print(best_prompt)
-        return OptimizationResult(
-            optimizer="MiproOptimizer",
-            prompt=[{"role": "user", "content": best_prompt}],
-            tool_prompts=tool_prompts,
-            score=score,
-            metric_name=self.opik_metric.__name__,
-            demonstrations=demos,
-            details={"program": program_module},
-            llm_calls=actual_llm_calls,
-            tool_calls=self.tool_call_counter,
-        )
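
For reference, the `MiproOptimizer` removed above was driven through `optimize_prompt`, which requires a `task_config` keyword argument and accepts `num_candidates`, `num_trials`, and `auto` ("light", "medium", or "heavy"). A minimal sketch of how a 2.1.3 caller might have invoked it is below; the import paths follow the module layout listed earlier, but the `ChatPrompt` and `TaskConfig` constructor arguments, the model and dataset names, and the metric are illustrative assumptions rather than values taken from this diff:

```python
# Hypothetical 2.1.3-era usage of the now-removed MiproOptimizer (sketch only).
from opik_optimizer.mipro_optimizer import MiproOptimizer
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt
from opik_optimizer.optimization_config.configs import TaskConfig


def exact_match(dataset_item: dict, llm_output: str) -> float:
    # Placeholder metric: (dataset_item, llm_output) -> score, per the docstring above.
    return float(llm_output.strip() == dataset_item["answer"].strip())


task_config = TaskConfig(
    instruction_prompt="Answer the question concisely.",  # attributes referenced by the deleted code
    input_dataset_fields=["question"],
    output_dataset_field="answer",
    tools=[],
)

optimizer = MiproOptimizer(model="openai/gpt-4o-mini", project_name="demo")  # assumed model name
result = optimizer.optimize_prompt(
    prompt=ChatPrompt(system="Answer the question concisely.", user="{question}"),  # assumed kwargs
    dataset="my-qa-dataset",  # a name is resolved via opik.Opik(...).get_dataset(...)
    metric=exact_match,
    task_config=task_config,  # required; ValueError without it
    num_trials=3,
    auto="light",
)
print(result.score, result.prompt)
```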