opik-optimizer 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries and reflects the changes between those versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (31)
  1. opik_optimizer/__init__.py +15 -26
  2. opik_optimizer/base_optimizer.py +28 -44
  3. opik_optimizer/datasets/__init__.py +6 -7
  4. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
  5. opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
  6. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
  7. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
  8. opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
  9. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
  10. opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
  11. opik_optimizer/mipro_optimizer/__init__.py +1 -1
  12. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
  13. opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
  14. opik_optimizer/mipro_optimizer/utils.py +1 -23
  15. opik_optimizer/optimization_config/chat_prompt.py +106 -0
  16. opik_optimizer/optimization_config/configs.py +2 -21
  17. opik_optimizer/optimization_config/mappers.py +1 -1
  18. opik_optimizer/optimization_result.py +57 -85
  19. opik_optimizer/reporting_utils.py +180 -0
  20. opik_optimizer/task_evaluator.py +41 -26
  21. opik_optimizer/utils.py +187 -3
  22. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
  23. opik_optimizer-0.9.0.dist-info/RECORD +48 -0
  24. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
  25. opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
  26. opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
  27. opik_optimizer/integrations/__init__.py +0 -0
  28. opik_optimizer/meta_prompt_optimizer.py +0 -1151
  29. opik_optimizer-0.8.1.dist-info/RECORD +0 -45
  30. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
  31. {opik_optimizer-0.8.1.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
opik_optimizer/meta_prompt_optimizer/reporting.py
@@ -0,0 +1,140 @@
+ from contextlib import contextmanager
+
+ import rich
+ from rich.text import Text
+
+ from ..reporting_utils import (
+     convert_tqdm_to_rich,
+     display_configuration, # noqa: F401
+     display_header, # noqa: F401
+     display_messages,
+     display_result, # noqa: F401
+     get_console,
+     suppress_opik_logs,
+ )
+
+ PANEL_WIDTH = 70
+ console = get_console()
+
+
+ @contextmanager
+ def display_round_progress(max_rounds: int, verbose: int = 1):
+     """Context manager to display messages during an evaluation phase."""
+
+     # Create a simple object with a method to set the score
+     class Reporter:
+         def failed_to_generate(self, num_prompts, error):
+             if verbose >= 1:
+                 console.print(Text(f"│ Failed to generate {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}: {error}", style="red"))
+                 console.print(Text("│"))
+
+         def round_start(self, round_number):
+             if verbose >= 1:
+                 console.print(Text(f"│ - Starting optimization round {round_number + 1} of {max_rounds}"))
+
+         def round_end(self, round_number, score, best_score, best_prompt):
+             if verbose >= 1:
+                 console.print(Text(f"│ Completed optimization round {round_number + 1} of {max_rounds}"))
+                 if best_score == 0 and score == 0:
+                     console.print(Text("│ No improvement in this optimization round - score is 0", style="yellow"))
+                 elif best_score == 0:
+                     console.print(Text(f"│ Found a new best performing prompt: {score:.4f}", style="green"))
+                 elif score > best_score:
+                     perc_change = (score - best_score) / best_score
+                     console.print(Text(f"│ Found a new best performing prompt: {score:.4f} ({perc_change:.2%})", style="green"))
+                 elif score <= best_score:
+                     console.print(Text("│ No improvement in this optimization round", style="red"))
+
+                 console.print(Text("│"))
+
+     # Use our log suppression context manager and yield the reporter
+     with suppress_opik_logs():
+         with convert_tqdm_to_rich(verbose=verbose):
+             try:
+                 yield Reporter()
+             finally:
+                 pass
+
+
+ @contextmanager
+ def display_evaluation(message: str = "First we will establish the baseline performance:", verbose: int = 1):
+     """Context manager to display messages during an evaluation phase."""
+     score = None
+
+     # Entry point
+     if verbose >= 1:
+         console.print(Text(f"> {message}"))
+
+     # Create a simple object with a method to set the score
+     class Reporter:
+         def set_score(self, s):
+             if verbose >= 1:
+                 console.print(Text(f"\r Baseline score was: {s:.4f}.\n", style="green"))
+
+     # Use our log suppression context manager and yield the reporter
+     with suppress_opik_logs():
+         with convert_tqdm_to_rich(" Evaluation", verbose=verbose):
+             try:
+                 yield Reporter()
+             finally:
+                 pass
+
+ def display_optimization_start_message(verbose: int = 1):
+     if verbose >= 1:
+         console.print(Text("> Starting the optimization run"))
+         console.print(Text("│"))
+
+
+ @contextmanager
+ def display_candidate_generation_report(num_prompts: int, verbose: int = 1):
+     """Context manager to display messages during an evaluation phase."""
+     # Entry point
+     if verbose >= 1:
+         console.print(Text(f"│ Generating candidate prompt{'' if num_prompts == 1 else 's'}:"))
+
+     # Create a simple object with a method to set the score
+     class Reporter:
+         def set_generated_prompts(self, prompts):
+             console.print(Text(f"│ Successfully generated {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}", style="dim"))
+             console.print(Text("│"))
+
+     try:
+         yield Reporter()
+     finally:
+         pass
+
+
+ @contextmanager
+ def display_prompt_candidate_scoring_report(candidate_count, prompt, verbose: int = 1):
+     """Context manager to display messages during an evaluation phase."""
+     # Create a simple object with a method to set the score
+     class Reporter:
+         def set_generated_prompts(self, candidate_count, prompt):
+             if verbose >= 1:
+                 console.print(Text(f"│ Evaluating candidate prompt {candidate_count+1}:"))
+                 display_messages(prompt, "│ ")
+
+         def set_final_score(self, best_score, score):
+             if verbose >= 1:
+                 if best_score == 0 and score > 0:
+                     console.print(Text(f"│ Evaluation score: {score:.4f}", style="green"))
+                 elif best_score == 0 and score == 0:
+                     console.print(Text(f"│ Evaluation score: {score:.4f}", style="dim yellow"))
+                 elif score > best_score:
+                     perc_change = (score - best_score) / best_score
+                     console.print(Text(f"│ Evaluation score: {score:.4f} ({perc_change:.2%})", style="green"))
+                 elif score < best_score:
+                     perc_change = (score - best_score) / best_score
+                     console.print(Text(f"│ Evaluation score: {score:.4f} ({perc_change:.2%})", style="red"))
+                 else:
+                     console.print(Text(f"│ Evaluation score: {score:.4f}", style="dim yellow"))
+
+                 console.print(Text("│"))
+                 console.print(Text("│"))
+     try:
+         with suppress_opik_logs():
+             with convert_tqdm_to_rich("│ Evaluation", verbose=verbose):
+                 yield Reporter()
+     finally:
+         pass
+
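The reporting module above exposes context managers that the new meta-prompt optimizer uses to render progress with rich while Opik logs and tqdm output are suppressed. The actual call sites inside meta_prompt_optimizer.py are not shown in this diff, so the wiring below is only an illustrative sketch; run_optimization, evaluate_fn, and generate_candidates are hypothetical names, not part of the package.

# Illustrative sketch only: how the reporter context managers above appear
# intended to be used. evaluate_fn and generate_candidates are hypothetical.
from opik_optimizer.meta_prompt_optimizer import reporting

def run_optimization(max_rounds, baseline_prompt, evaluate_fn, generate_candidates):
    # Baseline evaluation with suppressed Opik logs and a rich progress bar.
    with reporting.display_evaluation(verbose=1) as baseline:
        best_score = evaluate_fn(baseline_prompt)
        baseline.set_score(best_score)

    reporting.display_optimization_start_message(verbose=1)

    with reporting.display_round_progress(max_rounds, verbose=1) as rounds:
        for round_number in range(max_rounds):
            rounds.round_start(round_number)
            try:
                candidates = generate_candidates(baseline_prompt)
            except Exception as error:
                rounds.failed_to_generate(num_prompts=1, error=error)
                continue
            round_score = max(evaluate_fn(candidate) for candidate in candidates)
            rounds.round_end(round_number, round_score, best_score, baseline_prompt)
            best_score = max(best_score, round_score)

Each context manager yields a small Reporter object, so the optimizer only reports events (set_score, round_start, round_end, failed_to_generate) and never writes to the console directly.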
opik_optimizer/mipro_optimizer/__init__.py
@@ -1 +1 @@
- from .mipro_optimizer import MiproOptimizer
+ from .mipro_optimizer import MiproOptimizer, MIPROv2
opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py
@@ -1,15 +1,12 @@
- import logging
  import random
  import textwrap
  from collections import defaultdict
  from typing import Any, Callable, Dict, List, Literal, Optional, Tuple

+ import dspy
  import numpy as np
  import opik
  import optuna
- from optuna.distributions import CategoricalDistribution
-
- import dspy
  from dspy.evaluate.evaluate import Evaluate
  from dspy.propose import GroundedProposer
  from dspy.teleprompt.teleprompt import Teleprompter
@@ -23,6 +20,9 @@ from dspy.teleprompt.utils import (
  save_candidate_program,
  set_signature,
  )
+ from optuna.distributions import CategoricalDistribution
+
+ from ..optimization_config.configs import TaskConfig


  class Logger():
@@ -51,7 +51,7 @@ ENDC = "\033[0m" # Resets the color to default

  import opik
  from opik_optimizer import task_evaluator
- from opik_optimizer.optimization_config.configs import MetricConfig, TaskConfig
+ from opik_optimizer.optimization_config.configs import TaskConfig
  from opik_optimizer.optimization_config import mappers

  def get_prompt(program):
@@ -85,7 +85,7 @@ class MIPROv2(Teleprompter):
  log_dir: Optional[str] = None,
  metric_threshold: Optional[float] = None,
  opik_dataset: Optional[opik.Dataset] = None,
- opik_metric_config: Optional[MetricConfig] = None,
+ opik_metric: Optional[Callable] = None,
  opik_prompt_task_config: Optional[TaskConfig] = None,
  opik_project_name: Optional[str] = None,
  opik_optimization_id: Optional[str] = None,
@@ -119,7 +119,7 @@ class MIPROv2(Teleprompter):
  self.rng = None

  self.opik_dataset = opik_dataset
- self.opik_metric_config = opik_metric_config
+ self.opik_metric = opik_metric
  self.opik_prompt_task_config = opik_prompt_task_config
  self.opik_project_name = opik_project_name
  self.opik_optimization_id = opik_optimization_id
@@ -572,7 +572,7 @@ class MIPROv2(Teleprompter):
  opik_dataset=self.opik_dataset,
  trainset=valset,
  candidate_program=program,
- metric_config=self.opik_metric_config,
+ metric=self.opik_metric,
  prompt_task_config=self.opik_prompt_task_config,
  project_name=self.opik_project_name,
  num_threads=self.num_threads,
@@ -643,15 +643,7 @@ class MIPROv2(Teleprompter):
  score = eval_candidate_program(
  batch_size, valset, candidate_program, evaluate, self.rng
  )
- # score = eval_candidate_program_with_opik(
- # opik_dataset=self.opik_dataset,
- # trainset=valset,
- # candidate_program=candidate_program,
- # metric_config=self.opik_metric_config,
- # prompt_task_config=self.opik_prompt_task_config,
- # project_name=self.opik_project_name,
- # experiment_config=experiment_config,
- # )
+
  total_eval_calls += batch_size

  # Update best score and program
@@ -953,7 +945,7 @@ class MIPROv2(Teleprompter):
  opik_dataset=self.opik_dataset,
  trainset=valset,
  candidate_program=highest_mean_program,
- metric_config=self.opik_metric_config,
+ metric=self.opik_metric,
  prompt_task_config=self.opik_prompt_task_config,
  project_name=self.opik_project_name,
  num_threads=self.num_threads,
@@ -1029,7 +1021,7 @@ def eval_candidate_program_with_opik(
  trainset: List,
  candidate_program: Any,
  project_name: str,
- metric_config: MetricConfig,
+ metric: Callable,
  prompt_task_config: TaskConfig,
  num_threads: int,
  experiment_config: Optional[Dict[str, Any]] = None,
@@ -1057,7 +1049,7 @@ def eval_candidate_program_with_opik(
  score = task_evaluator.evaluate(
  dataset=opik_dataset,
  evaluated_task=program_task,
- metric_config=metric_config,
+ metric=metric,
  dataset_item_ids=dataset_item_ids,
  project_name=project_name,
  num_threads=num_threads,
opik_optimizer/mipro_optimizer/mipro_optimizer.py
@@ -1,30 +1,29 @@
- from typing import Any, Dict, List, Tuple, Union, Optional, Literal
  import os
  import random
  from datetime import datetime
-
- import opik
-
- from opik.integrations.dspy.callback import OpikCallback
- from opik.opik_context import get_current_span_data
- from opik.evaluation import evaluate
- from opik import Dataset
+ from typing import Callable, Dict, List, Literal, Optional, Union

  import dspy
-
  import litellm
+ import opik
  from litellm.caching import Cache
+ from opik import Dataset
+ from opik.evaluation import evaluate
+ from opik.integrations.dspy.callback import OpikCallback
+ from opik.opik_context import get_current_span_data

  from ..optimization_result import OptimizationResult
+ from ..utils import optimization_context
  from ..base_optimizer import BaseOptimizer
- from ._mipro_optimizer_v2 import MIPROv2
+ from ..optimization_config.configs import TaskConfig
+ from ..optimization_result import OptimizationResult
  from ._lm import LM
- from ..optimization_config.configs import MetricConfig, TaskConfig
+ from ._mipro_optimizer_v2 import MIPROv2
  from .utils import (
  create_dspy_signature,
- opik_metric_to_dspy,
  create_dspy_training_set,
  get_tool_prompts,
+ opik_metric_to_dspy,
  )

  # Using disk cache for LLM calls
@@ -53,7 +52,7 @@ class MiproOptimizer(BaseOptimizer):
  def evaluate_prompt(
  self,
  dataset: Union[str, Dataset],
- metric_config: MetricConfig,
+ metric: Callable,
  task_config: TaskConfig,
  prompt: Union[str, dspy.Module, OptimizationResult] = None,
  n_samples: int = 10,
@@ -67,7 +66,7 @@ class MiproOptimizer(BaseOptimizer):

  Args:
  dataset: Opik dataset name or dataset
- metric_config: A MetricConfig instance
+ metric: Metric function to optimize
  task_config: A TaskConfig instance
  prompt: The prompt to evaluate
  n_samples: number of items to test in the dataset
@@ -82,7 +81,6 @@ class MiproOptimizer(BaseOptimizer):
  # FIMXE: call super when it is ready
  # FIXME: Intermediate values:
  self.llm_call_counter += 1
- metric = metric_config.metric
  input_key = task_config.input_dataset_fields[0] # FIXME: allow all inputs
  output_key = task_config.output_dataset_field

@@ -188,7 +186,7 @@ class MiproOptimizer(BaseOptimizer):
  "tools": (
  [f.__name__ for f in task_config.tools] if task_config.tools else []
  ),
- "metric": metric_config.metric.name,
+ "metric": metric.__name__,
  "dataset": dataset.name,
  },
  }
@@ -222,7 +220,7 @@ class MiproOptimizer(BaseOptimizer):
  def optimize_prompt(
  self,
  dataset: Union[str, Dataset],
- metric_config: MetricConfig,
+ metric: Callable,
  task_config: TaskConfig,
  num_candidates: int = 10,
  experiment_config: Optional[Dict] = None,
@@ -232,26 +230,15 @@ class MiproOptimizer(BaseOptimizer):
  **kwargs,
  ) -> OptimizationResult:
  self._opik_client = opik.Opik()
- optimization = None
- try:
- optimization = self._opik_client.create_optimization(
+ with optimization_context(
+ client=self._opik_client,
  dataset_name=dataset.name,
- objective_name=metric_config.metric.name,
+ objective_name=metric.__name__,
  metadata={"optimizer": self.__class__.__name__},
- )
- except Exception:
- logger.warning(
- "Opik server does not support optimizations. Please upgrade opik."
- )
- optimization = None
-
- if not optimization:
- logger.warning("Continuing without Opik optimization tracking.")
-
- try:
+ ) as optimization:
  result = self._optimize_prompt(
  dataset=dataset,
- metric_config=metric_config,
+ metric=metric,
  task_config=task_config,
  num_candidates=num_candidates,
  experiment_config=experiment_config,
@@ -261,19 +248,12 @@ class MiproOptimizer(BaseOptimizer):
  auto=auto,
  **kwargs,
  )
- if optimization:
- self.update_optimization(optimization, status="completed")
  return result
- except Exception as e:
- logger.error(f"Mipro optimization failed: {e}", exc_info=True)
- if optimization:
- self.update_optimization(optimization, status="cancelled")
- raise e

  def _optimize_prompt(
  self,
  dataset: Union[str, Dataset],
- metric_config: MetricConfig,
+ metric: Callable,
  task_config: TaskConfig,
  num_candidates: int = 10,
  experiment_config: Optional[Dict] = None,
@@ -286,7 +266,7 @@ class MiproOptimizer(BaseOptimizer):
  logger.info("Preparing MIPRO optimization...")
  self.prepare_optimize_prompt(
  dataset=dataset,
- metric_config=metric_config,
+ metric=metric,
  task_config=task_config,
  num_candidates=num_candidates,
  experiment_config=experiment_config,
@@ -304,7 +284,7 @@ class MiproOptimizer(BaseOptimizer):
  def prepare_optimize_prompt(
  self,
  dataset,
- metric_config,
+ metric,
  task_config,
  num_candidates: int = 10,
  experiment_config: Optional[Dict] = None,
@@ -316,7 +296,6 @@ class MiproOptimizer(BaseOptimizer):
  ) -> None:
  # FIXME: Intermediate values:
  self.llm_call_counter = 0
- metric = metric_config.metric
  prompt = task_config.instruction_prompt
  input_key = task_config.input_dataset_fields[0] # FIXME: allow all
  output_key = task_config.output_dataset_field
@@ -366,7 +345,7 @@ class MiproOptimizer(BaseOptimizer):
  **{
  "optimizer": self.__class__.__name__,
  "tools": [f.__name__ for f in self.tools],
- "metric": metric.name,
+ "metric": metric.__name__,
  "num_threads": self.num_threads,
  "num_candidates": self.num_candidates,
  "num_trials": self.num_trials,
@@ -385,7 +364,7 @@ class MiproOptimizer(BaseOptimizer):
  opik_prompt_task_config=task_config,
  opik_dataset=dataset,
  opik_project_name=self.project_name,
- opik_metric_config=metric_config,
+ opik_metric=metric,
  opik_optimization_id=optimization_id,
  log_dir=log_dir,
  experiment_config=experiment_config,
@@ -457,7 +436,7 @@ class MiproOptimizer(BaseOptimizer):
  }

  current_score = candidate_data.get("score")
- metric_name_for_history = self.opik_metric.name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric"
+ metric_name_for_history = self.opik_metric.__name__

  # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
  # For now, specifically targeting Levenshtein-like metrics
@@ -479,9 +458,9 @@ class MiproOptimizer(BaseOptimizer):
  logger.warning("MIPRO compile returned no candidate programs.")
  return OptimizationResult(
  optimizer="MiproOptimizer",
- prompt=self.prompt,
+ prompt=[{"role": "user", "content": getattr(self, 'prompt', "Error: Initial prompt not found")}],
  score=0.0,
- metric_name=self.opik_metric.name if hasattr(self, 'opik_metric') else "unknown_metric",
+ metric_name=self.opik_metric.__name__ if hasattr(self, 'opik_metric') else "unknown_metric",
  details={"error": "No candidate programs generated by MIPRO"},
  history=mipro_history_processed,
  llm_calls=self.llm_call_counter
@@ -517,7 +496,7 @@ class MiproOptimizer(BaseOptimizer):
  logger.error("get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results.")
  return OptimizationResult(
  optimizer="MiproOptimizer",
- prompt=getattr(self, 'prompt', "Error: Initial prompt not found"),
+ prompt=[{"role": "user", "content": getattr(self, 'prompt', "Error: Initial prompt not found")}],
  score=0.0,
  metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
  details={"error": "No programs generated or compile failed"},
@@ -540,12 +519,13 @@ class MiproOptimizer(BaseOptimizer):
  best_prompt = state["signature"]["instructions"]
  demos = [x.toDict() for x in state["demos"]]

+ print(best_prompt)
  return OptimizationResult(
  optimizer="MiproOptimizer",
- prompt=best_prompt,
+ prompt=[{"role": "user", "content": best_prompt}],
  tool_prompts=tool_prompts,
  score=score,
- metric_name=self.opik_metric.name,
+ metric_name=self.opik_metric.__name__,
  demonstrations=demos,
  details={"program": program_module},
  llm_calls=self.llm_call_counter
opik_optimizer/mipro_optimizer/utils.py
@@ -44,31 +44,9 @@ def opik_metric_to_dspy(metric, output):
  answer_field = output

  def opik_metric_score_wrapper(example, prediction, trace=None):
- # Extract the input from the example
- input_text = getattr(example, "input", "")
- if isinstance(input_text, list):
- input_text = input_text[0] if input_text else ""
-
- # Extract the expected output
- expected_output = getattr(example, answer_field, "")
- if isinstance(expected_output, list):
- expected_output = expected_output[0] if expected_output else ""
-
- # Get the model output
- model_output = getattr(prediction, answer_field, "")
-
- # Create a result dictionary with all required fields
- result = {
- "input": input_text,
- "output": model_output,
- "expected_output": expected_output,
- "reference": expected_output,
- "context": getattr(example, "context", input_text),
- }
-
  try:
  # Calculate the score using the metric
- score_result = metric.score(**result)
+ score_result = metric(dataset_item=example.toDict(), llm_output=getattr(prediction, answer_field, ""))
  return (
  score_result.value if hasattr(score_result, "value") else score_result
  )
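The replaced call above is the new metric contract in 0.9.0: instead of a MetricConfig wrapping an Opik BaseMetric, the optimizers call a plain function with dataset_item and llm_output keyword arguments and accept either a bare number or an object with a .value attribute (such as an Opik ScoreResult). A minimal compatible metric might look like the sketch below; the "answer" dataset field is an assumption, not something this diff defines.

# Minimal sketch of a 0.9.0-style metric callable, matching the
# metric(dataset_item=..., llm_output=...) call shown above. The "answer"
# field name is an assumed dataset column.
from opik.evaluation.metrics import LevenshteinRatio

def levenshtein_ratio(dataset_item: dict, llm_output: str):
    # Returns an Opik ScoreResult; a bare float would also work, since the
    # wrapper above falls back to the raw value when there is no .value attribute.
    return LevenshteinRatio().score(
        reference=dataset_item["answer"],
        output=llm_output,
    )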
opik_optimizer/optimization_config/chat_prompt.py
@@ -0,0 +1,106 @@
+ from typing import Any, Dict, List, Literal, Optional
+
+ from pydantic import BaseModel, Field
+
+
+ class Tool(BaseModel):
+     name: str =Field(
+         ...,
+         description="Name of the tool"
+     )
+     description: str = Field(
+         ...,
+         description="Description of the tool"
+     )
+     parameters: Dict[str, Any] = Field(
+         ...,
+         description="JSON Schema defining the input parameters for the tool"
+     )
+
+ class ChatPrompt:
+     system: str
+     prompt: str
+     messages: List[Dict[Literal["role", "content"], str]]
+
+     def __init__(
+         self,
+         system: Optional[str] = None,
+         prompt: Optional[str] = None,
+         messages: Optional[List[Dict[Literal["role", "content"], str]]] = None,
+         tools: Optional[List[Tool]] = None
+     ):
+         self.system = system
+         self.prompt = prompt
+         self.messages = messages
+
+         self.formatted_messages = self._standardize_prompts()
+
+     def _standardize_prompts(
+         self, **kwargs: Any
+     ) -> List[Dict[Literal["role", "content"], str]]:
+         if (self.system is None and self.prompt is None and self.messages is None):
+             raise ValueError(
+                 "At least one of `system`, `prompt` or `messages` must be provided"
+             )
+
+         if (self.prompt is not None and self.messages is not None):
+             raise ValueError(
+                 "`prompt` and `messages` cannot be provided together"
+             )
+
+         if (self.system is not None and not isinstance(self.system, str)):
+             raise ValueError(
+                 "`system` must be a string"
+             )
+
+         if (self.prompt is not None and not isinstance(self.prompt, str)):
+             raise ValueError(
+                 "`prompt` must be a string"
+             )
+
+         if (self.messages is not None and not isinstance(self.messages, list)):
+             raise ValueError(
+                 "`messages` must be a list"
+             )
+
+         standardize_messages = []
+
+         if (self.system is not None):
+             standardize_messages.append({"role": "system", "content": self.system})
+
+         if (self.prompt is not None):
+             standardize_messages.append({"role": "user", "content": self.prompt})
+
+         if (self.messages is not None):
+             for message in self.messages:
+                 standardize_messages.append(message)
+
+         return standardize_messages
+
+     def format(self, **kwargs: Any) -> str:
+         return self.prompt.format(**kwargs)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert ChatPrompt to a dictionary for JSON serialization.
+
+         Returns:
+             Dict containing the serializable representation of this ChatPrompt
+         """
+         return {
+             "system": self.system,
+             "prompt": self.prompt,
+             "messages": self.messages,
+             "formatted_messages": self.formatted_messages
+         }
+
+     @classmethod
+     def model_validate(cls, obj: Any, *, strict: bool | None = None, from_attributes: bool | None = None,
+                        context: Any | None = None, by_alias: bool | None = None, by_name: bool | None = None) -> 'ChatPrompt':
+         """Custom validation method to handle nested objects during deserialization."""
+         return ChatPrompt(
+             system=obj.get('system', None),
+             prompt=obj.get('prompt', None),
+             messages=obj.get('messages', None),
+
+         )
+
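ChatPrompt, added above, normalizes the three accepted inputs (system, prompt, messages) into a single formatted_messages list and substitutes placeholders via format(). A short usage sketch follows, importing the class by the module path from the file list; the prompt text and the {question} placeholder are illustrative.

# Usage sketch for the ChatPrompt class added above. The prompt text and the
# {question} placeholder are illustrative, not taken from the package.
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt

prompt = ChatPrompt(
    system="You are a concise assistant.",
    prompt="Answer the question: {question}",
)

print(prompt.formatted_messages)
# [{'role': 'system', 'content': 'You are a concise assistant.'},
#  {'role': 'user', 'content': 'Answer the question: {question}'}]

print(prompt.format(question="What is the capital of France?"))
# Answer the question: What is the capital of France?

Note that, per the validation logic above, passing both prompt and messages raises a ValueError, and format() only applies to the prompt field.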
opik_optimizer/optimization_config/configs.py
@@ -1,17 +1,8 @@
  """Module containing configuration classes for optimization."""

- import pydantic
- import opik
- from typing import Dict, Callable, Union, List, Literal, Any, Optional
- from opik.evaluation.metrics import BaseMetric
-
-
- class MetricConfig(pydantic.BaseModel):
- """Configuration for a metric used in optimization."""
- metric: BaseMetric
- inputs: Dict[str, Union[str, Callable[[Any], Any]]]
+ from typing import Any, Dict, List, Literal, Union

- model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
+ import pydantic


  class TaskConfig(pydantic.BaseModel):
@@ -23,13 +14,3 @@ class TaskConfig(pydantic.BaseModel):
  input_dataset_fields: List[str]
  output_dataset_field: str
  tools: List[Any] = []
-
-
- class OptimizationConfig(pydantic.BaseModel):
- """Configuration for optimization."""
- model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
-
- dataset: opik.Dataset
- objective: MetricConfig
- optimization_direction: Literal["maximize", "minimize"] = "maximize"
- task: TaskConfig
opik_optimizer/optimization_config/mappers.py
@@ -1,6 +1,6 @@
  from typing import Dict, Callable, Optional, Any, Union

- EVALUATED_LLM_TASK_OUTPUT = "_llm_task_output"
+ EVALUATED_LLM_TASK_OUTPUT = "llm_output"

  class Mapper:
  """Base class for mapping functions that transform data between different formats."""