opik-optimizer 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +15 -26
- opik_optimizer/base_optimizer.py +28 -44
- opik_optimizer/data/hotpot-500.json +501 -1001
- opik_optimizer/datasets/__init__.py +6 -7
- opik_optimizer/datasets/hotpot_qa.py +2 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +742 -726
- opik_optimizer/evolutionary_optimizer/reporting.py +246 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +297 -193
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +119 -0
- opik_optimizer/meta_prompt_optimizer/__init__.py +5 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +816 -0
- opik_optimizer/meta_prompt_optimizer/reporting.py +140 -0
- opik_optimizer/mipro_optimizer/__init__.py +1 -1
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +12 -20
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +32 -52
- opik_optimizer/mipro_optimizer/utils.py +1 -23
- opik_optimizer/optimization_config/chat_prompt.py +106 -0
- opik_optimizer/optimization_config/configs.py +2 -21
- opik_optimizer/optimization_config/mappers.py +1 -1
- opik_optimizer/optimization_result.py +57 -85
- opik_optimizer/reporting_utils.py +180 -0
- opik_optimizer/task_evaluator.py +41 -26
- opik_optimizer/utils.py +187 -3
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/METADATA +15 -31
- opik_optimizer-0.9.0.dist-info/RECORD +48 -0
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/WHEEL +1 -1
- opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py +0 -91
- opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py +0 -80
- opik_optimizer/integrations/__init__.py +0 -0
- opik_optimizer/meta_prompt_optimizer.py +0 -1151
- opik_optimizer-0.8.0.dist-info/RECORD +0 -45
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.8.0.dist-info → opik_optimizer-0.9.0.dist-info}/top_level.txt +0 -0
opik_optimizer/meta_prompt_optimizer/reporting.py (new file)
@@ -0,0 +1,140 @@
+from contextlib import contextmanager
+
+import rich
+from rich.text import Text
+
+from ..reporting_utils import (
+    convert_tqdm_to_rich,
+    display_configuration,  # noqa: F401
+    display_header,  # noqa: F401
+    display_messages,
+    display_result,  # noqa: F401
+    get_console,
+    suppress_opik_logs,
+)
+
+PANEL_WIDTH = 70
+console = get_console()
+
+
+@contextmanager
+def display_round_progress(max_rounds: int, verbose: int = 1):
+    """Context manager to display messages during an evaluation phase."""
+
+    # Create a simple object with a method to set the score
+    class Reporter:
+        def failed_to_generate(self, num_prompts, error):
+            if verbose >= 1:
+                console.print(Text(f"│ Failed to generate {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}: {error}", style="red"))
+                console.print(Text("│"))
+
+        def round_start(self, round_number):
+            if verbose >= 1:
+                console.print(Text(f"│ - Starting optimization round {round_number + 1} of {max_rounds}"))
+
+        def round_end(self, round_number, score, best_score, best_prompt):
+            if verbose >= 1:
+                console.print(Text(f"│ Completed optimization round {round_number + 1} of {max_rounds}"))
+                if best_score == 0 and score == 0:
+                    console.print(Text("│ No improvement in this optimization round - score is 0", style="yellow"))
+                elif best_score == 0:
+                    console.print(Text(f"│ Found a new best performing prompt: {score:.4f}", style="green"))
+                elif score > best_score:
+                    perc_change = (score - best_score) / best_score
+                    console.print(Text(f"│ Found a new best performing prompt: {score:.4f} ({perc_change:.2%})", style="green"))
+                elif score <= best_score:
+                    console.print(Text("│ No improvement in this optimization round", style="red"))
+
+                console.print(Text("│"))
+
+    # Use our log suppression context manager and yield the reporter
+    with suppress_opik_logs():
+        with convert_tqdm_to_rich(verbose=verbose):
+            try:
+                yield Reporter()
+            finally:
+                pass
+
+
+@contextmanager
+def display_evaluation(message: str = "First we will establish the baseline performance:", verbose: int = 1):
+    """Context manager to display messages during an evaluation phase."""
+    score = None
+
+    # Entry point
+    if verbose >= 1:
+        console.print(Text(f"> {message}"))
+
+    # Create a simple object with a method to set the score
+    class Reporter:
+        def set_score(self, s):
+            if verbose >= 1:
+                console.print(Text(f"\r  Baseline score was: {s:.4f}.\n", style="green"))
+
+    # Use our log suppression context manager and yield the reporter
+    with suppress_opik_logs():
+        with convert_tqdm_to_rich("  Evaluation", verbose=verbose):
+            try:
+                yield Reporter()
+            finally:
+                pass
+
+def display_optimization_start_message(verbose: int = 1):
+    if verbose >= 1:
+        console.print(Text("> Starting the optimization run"))
+        console.print(Text("│"))
+
+
+@contextmanager
+def display_candidate_generation_report(num_prompts: int, verbose: int = 1):
+    """Context manager to display messages during an evaluation phase."""
+    # Entry point
+    if verbose >= 1:
+        console.print(Text(f"│ Generating candidate prompt{'' if num_prompts == 1 else 's'}:"))
+
+    # Create a simple object with a method to set the score
+    class Reporter:
+        def set_generated_prompts(self, prompts):
+            console.print(Text(f"│ Successfully generated {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}", style="dim"))
+            console.print(Text("│"))
+
+    try:
+        yield Reporter()
+    finally:
+        pass
+
+
+@contextmanager
+def display_prompt_candidate_scoring_report(candidate_count, prompt, verbose: int = 1):
+    """Context manager to display messages during an evaluation phase."""
+    # Create a simple object with a method to set the score
+    class Reporter:
+        def set_generated_prompts(self, candidate_count, prompt):
+            if verbose >= 1:
+                console.print(Text(f"│ Evaluating candidate prompt {candidate_count+1}:"))
+                display_messages(prompt, "│ ")
+
+        def set_final_score(self, best_score, score):
+            if verbose >= 1:
+                if best_score == 0 and score > 0:
+                    console.print(Text(f"│ Evaluation score: {score:.4f}", style="green"))
+                elif best_score == 0 and score == 0:
+                    console.print(Text(f"│ Evaluation score: {score:.4f}", style="dim yellow"))
+                elif score > best_score:
+                    perc_change = (score - best_score) / best_score
+                    console.print(Text(f"│ Evaluation score: {score:.4f} ({perc_change:.2%})", style="green"))
+                elif score < best_score:
+                    perc_change = (score - best_score) / best_score
+                    console.print(Text(f"│ Evaluation score: {score:.4f} ({perc_change:.2%})", style="red"))
+                else:
+                    console.print(Text(f"│ Evaluation score: {score:.4f}", style="dim yellow"))
+
+                console.print(Text("│"))
+                console.print(Text("│"))
+    try:
+        with suppress_opik_logs():
+            with convert_tqdm_to_rich("│ Evaluation", verbose=verbose):
+                yield Reporter()
+    finally:
+        pass
+
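The file above introduces Rich-based progress reporting for the meta-prompt optimizer. A minimal usage sketch of these context managers (the real call sites live inside the optimizer itself; the `evaluate_candidate` callback below is a hypothetical placeholder):

```python
# Sketch only: `evaluate_candidate` is a placeholder scoring callback, not part
# of the package; MetaPromptOptimizer drives these reporters internally.
from opik_optimizer.meta_prompt_optimizer import reporting

def report_run(evaluate_candidate, max_rounds: int = 3) -> float:
    best_score = 0.0

    # Establish and print the baseline score.
    with reporting.display_evaluation(verbose=1) as baseline:
        best_score = evaluate_candidate()
        baseline.set_score(best_score)

    reporting.display_optimization_start_message(verbose=1)

    # Print per-round progress, colouring improvements green.
    with reporting.display_round_progress(max_rounds, verbose=1) as rounds:
        for round_number in range(max_rounds):
            rounds.round_start(round_number)
            score = evaluate_candidate()
            rounds.round_end(round_number, score, best_score, best_prompt=None)
            best_score = max(best_score, score)
    return best_score
```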
opik_optimizer/mipro_optimizer/__init__.py
@@ -1 +1 @@
-from .mipro_optimizer import MiproOptimizer
+from .mipro_optimizer import MiproOptimizer, MIPROv2
opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py
@@ -1,15 +1,12 @@
-import logging
 import random
 import textwrap
 from collections import defaultdict
 from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
 
+import dspy
 import numpy as np
 import opik
 import optuna
-from optuna.distributions import CategoricalDistribution
-
-import dspy
 from dspy.evaluate.evaluate import Evaluate
 from dspy.propose import GroundedProposer
 from dspy.teleprompt.teleprompt import Teleprompter
@@ -23,6 +20,9 @@ from dspy.teleprompt.utils import (
     save_candidate_program,
     set_signature,
 )
+from optuna.distributions import CategoricalDistribution
+
+from ..optimization_config.configs import TaskConfig
 
 
 class Logger():
@@ -51,7 +51,7 @@ ENDC = "\033[0m" # Resets the color to default
 
 import opik
 from opik_optimizer import task_evaluator
-from opik_optimizer.optimization_config.configs import
+from opik_optimizer.optimization_config.configs import TaskConfig
 from opik_optimizer.optimization_config import mappers
 
 def get_prompt(program):
@@ -85,7 +85,7 @@ class MIPROv2(Teleprompter):
         log_dir: Optional[str] = None,
         metric_threshold: Optional[float] = None,
         opik_dataset: Optional[opik.Dataset] = None,
-
+        opik_metric: Optional[Callable] = None,
         opik_prompt_task_config: Optional[TaskConfig] = None,
         opik_project_name: Optional[str] = None,
         opik_optimization_id: Optional[str] = None,
@@ -119,7 +119,7 @@ class MIPROv2(Teleprompter):
         self.rng = None
 
         self.opik_dataset = opik_dataset
-        self.
+        self.opik_metric = opik_metric
         self.opik_prompt_task_config = opik_prompt_task_config
         self.opik_project_name = opik_project_name
         self.opik_optimization_id = opik_optimization_id
@@ -572,7 +572,7 @@ class MIPROv2(Teleprompter):
             opik_dataset=self.opik_dataset,
             trainset=valset,
             candidate_program=program,
-
+            metric=self.opik_metric,
             prompt_task_config=self.opik_prompt_task_config,
             project_name=self.opik_project_name,
             num_threads=self.num_threads,
@@ -643,15 +643,7 @@ class MIPROv2(Teleprompter):
             score = eval_candidate_program(
                 batch_size, valset, candidate_program, evaluate, self.rng
             )
-
-            # opik_dataset=self.opik_dataset,
-            # trainset=valset,
-            # candidate_program=candidate_program,
-            # metric_config=self.opik_metric_config,
-            # prompt_task_config=self.opik_prompt_task_config,
-            # project_name=self.opik_project_name,
-            # experiment_config=experiment_config,
-            # )
+
             total_eval_calls += batch_size
 
             # Update best score and program
@@ -953,7 +945,7 @@ class MIPROv2(Teleprompter):
             opik_dataset=self.opik_dataset,
             trainset=valset,
             candidate_program=highest_mean_program,
-
+            metric=self.opik_metric,
             prompt_task_config=self.opik_prompt_task_config,
             project_name=self.opik_project_name,
             num_threads=self.num_threads,
@@ -1029,7 +1021,7 @@ def eval_candidate_program_with_opik(
     trainset: List,
     candidate_program: Any,
     project_name: str,
-
+    metric: Callable,
     prompt_task_config: TaskConfig,
     num_threads: int,
    experiment_config: Optional[Dict[str, Any]] = None,
@@ -1057,7 +1049,7 @@ def eval_candidate_program_with_opik(
     score = task_evaluator.evaluate(
         dataset=opik_dataset,
         evaluated_task=program_task,
-
+        metric=metric,
         dataset_item_ids=dataset_item_ids,
         project_name=project_name,
         num_threads=num_threads,
opik_optimizer/mipro_optimizer/mipro_optimizer.py
@@ -1,30 +1,29 @@
-from typing import Any, Dict, List, Tuple, Union, Optional, Literal
 import os
 import random
 from datetime import datetime
-
-import opik
-
-from opik.integrations.dspy.callback import OpikCallback
-from opik.opik_context import get_current_span_data
-from opik.evaluation import evaluate
-from opik import Dataset
+from typing import Callable, Dict, List, Literal, Optional, Union
 
 import dspy
-
 import litellm
+import opik
 from litellm.caching import Cache
+from opik import Dataset
+from opik.evaluation import evaluate
+from opik.integrations.dspy.callback import OpikCallback
+from opik.opik_context import get_current_span_data
 
 from ..optimization_result import OptimizationResult
+from ..utils import optimization_context
 from ..base_optimizer import BaseOptimizer
-from .
+from ..optimization_config.configs import TaskConfig
+from ..optimization_result import OptimizationResult
 from ._lm import LM
-from
+from ._mipro_optimizer_v2 import MIPROv2
 from .utils import (
     create_dspy_signature,
-    opik_metric_to_dspy,
     create_dspy_training_set,
     get_tool_prompts,
+    opik_metric_to_dspy,
 )
 
 # Using disk cache for LLM calls
@@ -53,7 +52,7 @@ class MiproOptimizer(BaseOptimizer):
     def evaluate_prompt(
         self,
         dataset: Union[str, Dataset],
-
+        metric: Callable,
         task_config: TaskConfig,
         prompt: Union[str, dspy.Module, OptimizationResult] = None,
         n_samples: int = 10,
@@ -67,7 +66,7 @@ class MiproOptimizer(BaseOptimizer):
 
         Args:
             dataset: Opik dataset name or dataset
-
+            metric: Metric function to optimize
            task_config: A TaskConfig instance
            prompt: The prompt to evaluate
            n_samples: number of items to test in the dataset
@@ -82,7 +81,6 @@ class MiproOptimizer(BaseOptimizer):
         # FIMXE: call super when it is ready
         # FIXME: Intermediate values:
         self.llm_call_counter += 1
-        metric = metric_config.metric
         input_key = task_config.input_dataset_fields[0]  # FIXME: allow all inputs
         output_key = task_config.output_dataset_field
 
@@ -188,7 +186,7 @@ class MiproOptimizer(BaseOptimizer):
                 "tools": (
                     [f.__name__ for f in task_config.tools] if task_config.tools else []
                 ),
-                "metric":
+                "metric": metric.__name__,
                 "dataset": dataset.name,
             },
         }
@@ -222,7 +220,7 @@ class MiproOptimizer(BaseOptimizer):
     def optimize_prompt(
         self,
         dataset: Union[str, Dataset],
-
+        metric: Callable,
         task_config: TaskConfig,
         num_candidates: int = 10,
         experiment_config: Optional[Dict] = None,
@@ -232,26 +230,15 @@ class MiproOptimizer(BaseOptimizer):
         **kwargs,
     ) -> OptimizationResult:
         self._opik_client = opik.Opik()
-
-
-        optimization = self._opik_client.create_optimization(
+        with optimization_context(
+            client=self._opik_client,
             dataset_name=dataset.name,
-            objective_name=
+            objective_name=metric.__name__,
             metadata={"optimizer": self.__class__.__name__},
-
-        except Exception:
-            logger.warning(
-                "Opik server does not support optimizations. Please upgrade opik."
-            )
-            optimization = None
-
-        if not optimization:
-            logger.warning("Continuing without Opik optimization tracking.")
-
-        try:
+        ) as optimization:
             result = self._optimize_prompt(
                 dataset=dataset,
-
+                metric=metric,
                 task_config=task_config,
                 num_candidates=num_candidates,
                 experiment_config=experiment_config,
@@ -261,19 +248,12 @@ class MiproOptimizer(BaseOptimizer):
                 auto=auto,
                 **kwargs,
             )
-            if optimization:
-                self.update_optimization(optimization, status="completed")
             return result
-        except Exception as e:
-            logger.error(f"Mipro optimization failed: {e}", exc_info=True)
-            if optimization:
-                self.update_optimization(optimization, status="cancelled")
-            raise e
 
     def _optimize_prompt(
         self,
         dataset: Union[str, Dataset],
-
+        metric: Callable,
         task_config: TaskConfig,
         num_candidates: int = 10,
         experiment_config: Optional[Dict] = None,
@@ -286,7 +266,7 @@ class MiproOptimizer(BaseOptimizer):
         logger.info("Preparing MIPRO optimization...")
         self.prepare_optimize_prompt(
             dataset=dataset,
-
+            metric=metric,
             task_config=task_config,
             num_candidates=num_candidates,
             experiment_config=experiment_config,
@@ -304,7 +284,7 @@ class MiproOptimizer(BaseOptimizer):
     def prepare_optimize_prompt(
         self,
         dataset,
-
+        metric,
         task_config,
         num_candidates: int = 10,
         experiment_config: Optional[Dict] = None,
@@ -316,7 +296,6 @@ class MiproOptimizer(BaseOptimizer):
     ) -> None:
         # FIXME: Intermediate values:
         self.llm_call_counter = 0
-        metric = metric_config.metric
         prompt = task_config.instruction_prompt
         input_key = task_config.input_dataset_fields[0]  # FIXME: allow all
         output_key = task_config.output_dataset_field
@@ -366,7 +345,7 @@ class MiproOptimizer(BaseOptimizer):
             **{
                 "optimizer": self.__class__.__name__,
                 "tools": [f.__name__ for f in self.tools],
-                "metric": metric.
+                "metric": metric.__name__,
                 "num_threads": self.num_threads,
                 "num_candidates": self.num_candidates,
                 "num_trials": self.num_trials,
@@ -385,7 +364,7 @@ class MiproOptimizer(BaseOptimizer):
             opik_prompt_task_config=task_config,
             opik_dataset=dataset,
             opik_project_name=self.project_name,
-
+            opik_metric=metric,
             opik_optimization_id=optimization_id,
             log_dir=log_dir,
             experiment_config=experiment_config,
@@ -457,7 +436,7 @@ class MiproOptimizer(BaseOptimizer):
                 }
 
                 current_score = candidate_data.get("score")
-                metric_name_for_history = self.opik_metric.
+                metric_name_for_history = self.opik_metric.__name__
 
                 # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
                 # For now, specifically targeting Levenshtein-like metrics
@@ -479,9 +458,9 @@ class MiproOptimizer(BaseOptimizer):
             logger.warning("MIPRO compile returned no candidate programs.")
             return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=self
+                prompt=[{"role": "user", "content": getattr(self, 'prompt', "Error: Initial prompt not found")}],
                 score=0.0,
-                metric_name=self.opik_metric.
+                metric_name=self.opik_metric.__name__ if hasattr(self, 'opik_metric') else "unknown_metric",
                 details={"error": "No candidate programs generated by MIPRO"},
                 history=mipro_history_processed,
                 llm_calls=self.llm_call_counter
@@ -517,7 +496,7 @@ class MiproOptimizer(BaseOptimizer):
             logger.error("get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results.")
             return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=getattr(self, 'prompt', "Error: Initial prompt not found"),
+                prompt=[{"role": "user", "content": getattr(self, 'prompt', "Error: Initial prompt not found")}],
                 score=0.0,
                 metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
                 details={"error": "No programs generated or compile failed"},
@@ -540,12 +519,13 @@ class MiproOptimizer(BaseOptimizer):
         best_prompt = state["signature"]["instructions"]
         demos = [x.toDict() for x in state["demos"]]
 
+        print(best_prompt)
         return OptimizationResult(
             optimizer="MiproOptimizer",
-            prompt=best_prompt,
+            prompt=[{"role": "user", "content": best_prompt}],
             tool_prompts=tool_prompts,
             score=score,
-            metric_name=self.opik_metric.
+            metric_name=self.opik_metric.__name__,
             demonstrations=demos,
             details={"program": program_module},
             llm_calls=self.llm_call_counter
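Taken together, the `MiproOptimizer` changes drop the old `MetricConfig` plumbing in favour of a plain metric callable and wrap run tracking in `optimization_context`. A hedged sketch of the new calling convention (the constructor arguments, dataset name, and `question`/`answer` field names are illustrative assumptions, not taken from this diff):

```python
# Sketch under assumptions: only the metric/task_config shapes come from the
# diff above; everything marked "assumed" is a placeholder.
import opik
from opik_optimizer.mipro_optimizer import MiproOptimizer
from opik_optimizer.optimization_config.configs import TaskConfig

def exact_match(dataset_item: dict, llm_output: str) -> float:
    # Metrics are now plain callables taking (dataset_item, llm_output).
    return float(str(dataset_item.get("answer", "")).strip() == llm_output.strip())

optimizer = MiproOptimizer(model="openai/gpt-4o-mini", project_name="mipro-demo")  # assumed args
dataset = opik.Opik().get_dataset("hotpot-500")  # assumed dataset name

task_config = TaskConfig(
    instruction_prompt="Answer the question.",
    input_dataset_fields=["question"],
    output_dataset_field="answer",
)

result = optimizer.optimize_prompt(
    dataset=dataset,
    metric=exact_match,   # was wrapped in a MetricConfig in 0.8.0
    task_config=task_config,
)
print(result.score, result.metric_name)
```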
|
44
44
|
answer_field = output
|
45
45
|
|
46
46
|
def opik_metric_score_wrapper(example, prediction, trace=None):
|
47
|
-
# Extract the input from the example
|
48
|
-
input_text = getattr(example, "input", "")
|
49
|
-
if isinstance(input_text, list):
|
50
|
-
input_text = input_text[0] if input_text else ""
|
51
|
-
|
52
|
-
# Extract the expected output
|
53
|
-
expected_output = getattr(example, answer_field, "")
|
54
|
-
if isinstance(expected_output, list):
|
55
|
-
expected_output = expected_output[0] if expected_output else ""
|
56
|
-
|
57
|
-
# Get the model output
|
58
|
-
model_output = getattr(prediction, answer_field, "")
|
59
|
-
|
60
|
-
# Create a result dictionary with all required fields
|
61
|
-
result = {
|
62
|
-
"input": input_text,
|
63
|
-
"output": model_output,
|
64
|
-
"expected_output": expected_output,
|
65
|
-
"reference": expected_output,
|
66
|
-
"context": getattr(example, "context", input_text),
|
67
|
-
}
|
68
|
-
|
69
47
|
try:
|
70
48
|
# Calculate the score using the metric
|
71
|
-
score_result = metric.
|
49
|
+
score_result = metric(dataset_item=example.toDict(), llm_output=getattr(prediction, answer_field, ""))
|
72
50
|
return (
|
73
51
|
score_result.value if hasattr(score_result, "value") else score_result
|
74
52
|
)
|
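The rewritten wrapper now calls the metric directly as `metric(dataset_item=..., llm_output=...)` and accepts either a bare number or an object exposing a `.value` attribute. A small illustration of the `.value` branch (the `answer` field name and `SimpleScore` class are placeholders):

```python
# Illustrative only: SimpleScore stands in for any score object with a .value,
# such as an Opik ScoreResult; the "answer" dataset field is hypothetical.
from dataclasses import dataclass

@dataclass
class SimpleScore:
    value: float

def contains_answer(dataset_item: dict, llm_output: str) -> SimpleScore:
    # opik_metric_score_wrapper above unwraps `.value` when present.
    expected = str(dataset_item.get("answer", "")).lower()
    return SimpleScore(value=float(expected in llm_output.lower()))

print(contains_answer({"answer": "Paris"}, "The capital is Paris.").value)  # 1.0
```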
opik_optimizer/optimization_config/chat_prompt.py (new file)
@@ -0,0 +1,106 @@
+from typing import Any, Dict, List, Literal, Optional
+
+from pydantic import BaseModel, Field
+
+
+class Tool(BaseModel):
+    name: str =Field(
+        ...,
+        description="Name of the tool"
+    )
+    description: str = Field(
+        ...,
+        description="Description of the tool"
+    )
+    parameters: Dict[str, Any] = Field(
+        ...,
+        description="JSON Schema defining the input parameters for the tool"
+    )
+
+class ChatPrompt:
+    system: str
+    prompt: str
+    messages: List[Dict[Literal["role", "content"], str]]
+
+    def __init__(
+        self,
+        system: Optional[str] = None,
+        prompt: Optional[str] = None,
+        messages: Optional[List[Dict[Literal["role", "content"], str]]] = None,
+        tools: Optional[List[Tool]] = None
+    ):
+        self.system = system
+        self.prompt = prompt
+        self.messages = messages
+
+        self.formatted_messages = self._standardize_prompts()
+
+    def _standardize_prompts(
+        self, **kwargs: Any
+    ) -> List[Dict[Literal["role", "content"], str]]:
+        if (self.system is None and self.prompt is None and self.messages is None):
+            raise ValueError(
+                "At least one of `system`, `prompt` or `messages` must be provided"
+            )
+
+        if (self.prompt is not None and self.messages is not None):
+            raise ValueError(
+                "`prompt` and `messages` cannot be provided together"
+            )
+
+        if (self.system is not None and not isinstance(self.system, str)):
+            raise ValueError(
+                "`system` must be a string"
+            )
+
+        if (self.prompt is not None and not isinstance(self.prompt, str)):
+            raise ValueError(
+                "`prompt` must be a string"
+            )
+
+        if (self.messages is not None and not isinstance(self.messages, list)):
+            raise ValueError(
+                "`messages` must be a list"
+            )
+
+        standardize_messages = []
+
+        if (self.system is not None):
+            standardize_messages.append({"role": "system", "content": self.system})
+
+        if (self.prompt is not None):
+            standardize_messages.append({"role": "user", "content": self.prompt})
+
+        if (self.messages is not None):
+            for message in self.messages:
+                standardize_messages.append(message)
+
+        return standardize_messages
+
+    def format(self, **kwargs: Any) -> str:
+        return self.prompt.format(**kwargs)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert ChatPrompt to a dictionary for JSON serialization.
+
+        Returns:
+            Dict containing the serializable representation of this ChatPrompt
+        """
+        return {
+            "system": self.system,
+            "prompt": self.prompt,
+            "messages": self.messages,
+            "formatted_messages": self.formatted_messages
+        }
+
+    @classmethod
+    def model_validate(cls, obj: Any, *, strict: bool | None = None, from_attributes: bool | None = None,
+                       context: Any | None = None, by_alias: bool | None = None, by_name: bool | None = None) -> 'ChatPrompt':
+        """Custom validation method to handle nested objects during deserialization."""
+        return ChatPrompt(
+            system=obj.get('system', None),
+            prompt=obj.get('prompt', None),
+            messages=obj.get('messages', None),
+
+        )
+
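Since the whole class is shown above, a short usage sketch of the new `ChatPrompt` follows directly from it (the prompt text is just an example):

```python
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt

# `system` and `prompt` are normalised into a single chat message list.
prompt = ChatPrompt(
    system="You are a concise assistant.",
    prompt="Answer the question: {question}",
)
print(prompt.formatted_messages)
# [{'role': 'system', 'content': 'You are a concise assistant.'},
#  {'role': 'user', 'content': 'Answer the question: {question}'}]
print(prompt.format(question="What is the capital of France?"))

# `prompt` and `messages` are mutually exclusive.
try:
    ChatPrompt(prompt="hi", messages=[{"role": "user", "content": "hi"}])
except ValueError as exc:
    print(exc)  # "`prompt` and `messages` cannot be provided together"
```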
opik_optimizer/optimization_config/configs.py
@@ -1,17 +1,8 @@
 """Module containing configuration classes for optimization."""
 
-import
-import opik
-from typing import Dict, Callable, Union, List, Literal, Any, Optional
-from opik.evaluation.metrics import BaseMetric
-
-
-class MetricConfig(pydantic.BaseModel):
-    """Configuration for a metric used in optimization."""
-    metric: BaseMetric
-    inputs: Dict[str, Union[str, Callable[[Any], Any]]]
+from typing import Any, Dict, List, Literal, Union
 
-
+import pydantic
 
 
 class TaskConfig(pydantic.BaseModel):
@@ -23,13 +14,3 @@ class TaskConfig(pydantic.BaseModel):
     input_dataset_fields: List[str]
     output_dataset_field: str
     tools: List[Any] = []
-
-
-class OptimizationConfig(pydantic.BaseModel):
-    """Configuration for optimization."""
-    model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
-
-    dataset: opik.Dataset
-    objective: MetricConfig
-    optimization_direction: Literal["maximize", "minimize"] = "maximize"
-    task: TaskConfig