opik-optimizer 0.9.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +7 -5
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +38 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +22 -13
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +89 -58
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +117 -14
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.1.dist-info/RECORD +50 -0
- opik_optimizer-0.9.2.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/top_level.txt +0 -0
opik_optimizer/meta_prompt_optimizer/reporting.py

@@ -1,7 +1,9 @@
 from contextlib import contextmanager
+from typing import Any, Iterator

 from rich.text import Text

+from ..optimization_config import chat_prompt
 from ..reporting_utils import (
     convert_tqdm_to_rich,
     display_configuration, # noqa: F401
@@ -17,33 +19,66 @@ console = get_console()


 @contextmanager
-def display_round_progress(max_rounds: int, verbose: int = 1):
+def display_round_progress(max_rounds: int, verbose: int = 1) -> Any:
     """Context manager to display messages during an evaluation phase."""
-
+
     # Create a simple object with a method to set the score
     class Reporter:
-        def failed_to_generate(self, num_prompts, error):
+        def failed_to_generate(self, num_prompts: int, error: str) -> None:
             if verbose >= 1:
-                console.print(
+                console.print(
+                    Text(
+                        f"│ Failed to generate {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}: {error}",
+                        style="red",
+                    )
+                )
             console.print(Text("│"))
-
-        def round_start(self, round_number):
+
+        def round_start(self, round_number: int) -> None:
             if verbose >= 1:
-                console.print(
+                console.print(
+                    Text(
+                        f"│ - Starting optimization round {round_number + 1} of {max_rounds}"
+                    )
+                )

-        def round_end(self, round_number, score, best_score
+        def round_end(self, round_number: int, score: float, best_score: float) -> None:
             if verbose >= 1:
-                console.print(
+                console.print(
+                    Text(
+                        f"│ Completed optimization round {round_number + 1} of {max_rounds}"
+                    )
+                )
                 if best_score == 0 and score == 0:
-                    console.print(
+                    console.print(
+                        Text(
+                            "│ No improvement in this optimization round - score is 0",
+                            style="yellow",
+                        )
+                    )
                 elif best_score == 0:
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Found a new best performing prompt: {score:.4f}",
+                            style="green",
+                        )
+                    )
                 elif score > best_score:
                     perc_change = (score - best_score) / best_score
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Found a new best performing prompt: {score:.4f} ({perc_change:.2%})",
+                            style="green",
+                        )
+                    )
                 elif score <= best_score:
-                    console.print(
-
+                    console.print(
+                        Text(
+                            "│ No improvement in this optimization round",
+                            style="red",
+                        )
+                    )
+
             console.print(Text("│"))

     # Use our log suppression context manager and yield the reporter
@@ -56,20 +91,22 @@ def display_round_progress(max_rounds: int, verbose: int = 1):


 @contextmanager
-def display_evaluation(
+def display_evaluation(
+    message: str = "First we will establish the baseline performance:", verbose: int = 1
+) -> Any:
     """Context manager to display messages during an evaluation phase."""
-    score = None
-
     # Entry point
     if verbose >= 1:
         console.print(Text(f"> {message}"))
-
+
     # Create a simple object with a method to set the score
     class Reporter:
-        def set_score(self, s):
+        def set_score(self, s: float) -> None:
             if verbose >= 1:
-                console.print(
-
+                console.print(
+                    Text(f"\r Baseline score was: {s:.4f}.\n", style="green")
+                )
+
     # Use our log suppression context manager and yield the reporter
     with suppress_opik_logs():
         with convert_tqdm_to_rich(" Evaluation", verbose=verbose):
@@ -78,62 +115,100 @@ def display_evaluation(message: str = "First we will establish the baseline perf
     finally:
         pass

-
+
+def display_optimization_start_message(verbose: int = 1) -> None:
     if verbose >= 1:
         console.print(Text("> Starting the optimization run"))
         console.print(Text("│"))


+class CandidateGenerationReporter:
+    def __init__(self, num_prompts: int):
+        self.num_prompts = num_prompts
+
+    def set_generated_prompts(self) -> None:
+        console.print(
+            Text(
+                f"│ Successfully generated {self.num_prompts} candidate prompt{'' if self.num_prompts == 1 else 's'}",
+                style="dim",
+            )
+        )
+        console.print(Text("│"))
+
+
 @contextmanager
-def display_candidate_generation_report(
-
-
+def display_candidate_generation_report(
+    num_prompts: int, verbose: int = 1
+) -> Iterator[CandidateGenerationReporter]:
     if verbose >= 1:
-        console.print(
-
-
-    class Reporter:
-        def set_generated_prompts(self, prompts):
-            console.print(Text(f"│ Successfully generated {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}", style="dim"))
-            console.print(Text("│"))
+        console.print(
+            Text(f"│ Generating candidate prompt{'' if num_prompts == 1 else 's'}:")
+        )

     try:
-        yield
+        yield CandidateGenerationReporter(num_prompts)
     finally:
         pass


 @contextmanager
-def display_prompt_candidate_scoring_report(
+def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
     """Context manager to display messages during an evaluation phase."""
+
     # Create a simple object with a method to set the score
     class Reporter:
-        def set_generated_prompts(
+        def set_generated_prompts(
+            self, candidate_count: int, prompt: chat_prompt.ChatPrompt
+        ) -> None:
             if verbose >= 1:
-                console.print(
-
-
-
+                console.print(
+                    Text(f"│ Evaluating candidate prompt {candidate_count+1}:")
+                )
+                display_messages(prompt.get_messages(), "│ ")
+
+        def set_final_score(self, best_score: float, score: float) -> None:
             if verbose >= 1:
                 if best_score == 0 and score > 0:
-                    console.print(
+                    console.print(
+                        Text(f"│ Evaluation score: {score:.4f}", style="green")
+                    )
                 elif best_score == 0 and score == 0:
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Evaluation score: {score:.4f}",
+                            style="dim yellow",
+                        )
+                    )
                 elif score > best_score:
                     perc_change = (score - best_score) / best_score
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Evaluation score: {score:.4f} ({perc_change:.2%})",
+                            style="green",
+                        )
+                    )
                 elif score < best_score:
                     perc_change = (score - best_score) / best_score
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Evaluation score: {score:.4f} ({perc_change:.2%})",
+                            style="red",
+                        )
+                    )
                 else:
-                    console.print(
-
+                    console.print(
+                        Text(
+                            f"│ Evaluation score: {score:.4f}",
+                            style="dim yellow",
+                        )
+                    )
+
                 console.print(Text("│"))
             console.print(Text("│"))
+
     try:
         with suppress_opik_logs():
             with convert_tqdm_to_rich("│ Evaluation", verbose=verbose):
                 yield Reporter()
     finally:
         pass
-
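The main structural change in this reporting module is that `display_candidate_generation_report` now yields a typed `CandidateGenerationReporter` object instead of a bare `yield`, and the nested reporter methods gain type annotations. Below is a minimal, dependency-free sketch of that pattern; plain `print` stands in for rich's `Console`/`Text`, only the names mirror the diff, and the real module lives inside opik_optimizer's meta-prompt reporting code.

from contextlib import contextmanager
from typing import Iterator


class CandidateGenerationReporter:
    def __init__(self, num_prompts: int):
        self.num_prompts = num_prompts

    def set_generated_prompts(self) -> None:
        suffix = "" if self.num_prompts == 1 else "s"
        print(f"│ Successfully generated {self.num_prompts} candidate prompt{suffix}")


@contextmanager
def display_candidate_generation_report(
    num_prompts: int, verbose: int = 1
) -> Iterator[CandidateGenerationReporter]:
    # Announce the phase, then hand the caller a reporter it can update.
    if verbose >= 1:
        suffix = "" if num_prompts == 1 else "s"
        print(f"│ Generating candidate prompt{suffix}:")
    try:
        yield CandidateGenerationReporter(num_prompts)
    finally:
        pass


if __name__ == "__main__":
    with display_candidate_generation_report(3) as reporter:
        # ... candidate prompts would be generated here ...
        reporter.set_generated_prompts()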
opik_optimizer/mipro_optimizer/_lm.py

@@ -22,12 +22,13 @@ from dspy.dsp.utils.settings import settings
 from dspy.utils.callback import BaseCallback, with_callbacks
 from dspy.clients.base_lm import BaseLM

-from .._throttle import
+from .._throttle import rate_limited, get_rate_limiter_for_current_opik_installation

 logger = logging.getLogger(__name__)
 # Limit how fast an LLM can be called:
 limiter = get_rate_limiter_for_current_opik_installation()

+
 class LM(BaseLM):
     """
     A language model supporting chat or text completion requests for use with DSPy modules.
@@ -95,7 +96,9 @@ class LM(BaseLM):
             assert (
                 max_tokens >= 20_000 and temperature == 1.0
             ), "OpenAI's reasoning models require passing temperature=1.0 and max_tokens >= 20_000 to `dspy.LM(...)`"
-            self.kwargs = dict(
+            self.kwargs = dict(
+                temperature=temperature, max_completion_tokens=max_tokens, **kwargs
+            )
         else:
             self.kwargs = dict(temperature=temperature, max_tokens=max_tokens, **kwargs)

@@ -111,14 +114,22 @@ class LM(BaseLM):

         # Make the request and handle LRU & disk caching.
         if cache_in_memory:
-            completion =
+            completion = (
+                cached_litellm_completion
+                if self.model_type == "chat"
+                else cached_litellm_text_completion
+            )

             results = completion(
                 request=dict(model=self.model, messages=messages, **kwargs),
                 num_retries=self.num_retries,
             )
         else:
-            completion =
+            completion = (
+                litellm_completion
+                if self.model_type == "chat"
+                else litellm_text_completion
+            )

             results = completion(
                 request=dict(model=self.model, messages=messages, **kwargs),
@@ -127,7 +138,11 @@ class LM(BaseLM):
                 cache={"no-cache": not cache, "no-store": not cache},
             )

-        if
+        if (
+            not getattr(results, "cache_hit", False)
+            and dspy.settings.usage_tracker
+            and hasattr(results, "usage")
+        ):
             settings.usage_tracker.add_usage(self.model, dict(results.usage))

         self.llm_call_counter += 1
@@ -239,7 +254,11 @@ def request_cache(maxsize: Optional[int] = None):
             return value.model_json_schema()
         elif isinstance(value, pydantic.BaseModel):
             return value.model_dump()
-        elif
+        elif (
+            callable(value)
+            and hasattr(value, "__code__")
+            and hasattr(value.__code__, "co_code")
+        ):
             return value.__code__.co_code.decode("utf-8")
         else:
             # Note: We don't attempt to compute a hash of the value, since the default
@@ -292,7 +311,11 @@ def cached_litellm_completion(request: Dict[str, Any], num_retries: int):
     )


-def litellm_completion(
+def litellm_completion(
+    request: Dict[str, Any],
+    num_retries: int,
+    cache={"no-cache": True, "no-store": True},
+):
     retry_kwargs = dict(
         retry_policy=_get_litellm_retry_policy(num_retries),
         retry_strategy="exponential_backoff_retry",
@@ -347,7 +370,11 @@ def cached_litellm_text_completion(request: Dict[str, Any], num_retries: int):
     )


-def litellm_text_completion(
+def litellm_text_completion(
+    request: Dict[str, Any],
+    num_retries: int,
+    cache={"no-cache": True, "no-store": True},
+):
     # Extract the provider and model from the model string.
     # TODO: Not all the models are in the format of "provider/model"
     model = request.pop("model").split("/", 1)
@@ -358,7 +385,9 @@ def litellm_text_completion(request: Dict[str, Any], num_retries: int, cache={"n
     api_base = request.pop("api_base", None) or os.getenv(f"{provider}_API_BASE")

     # Build the prompt from the messages.
-    prompt = "\n\n".join(
+    prompt = "\n\n".join(
+        [x["content"] for x in request.pop("messages")] + ["BEGIN RESPONSE:"]
+    )

     return litellm.text_completion(
         cache=cache,
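Most of the `_lm.py` changes are Black-style reformatting, but two patterns recur: a parenthesized conditional expression to pick the completion callable, and a defensive `getattr`/`hasattr` guard before recording token usage. The following is a small self-contained sketch of both; the completion functions and the usage tracker are stand-ins and do not exist in the package.

from types import SimpleNamespace


def cached_chat_completion(request):
    # Stand-in for the cached chat-completion helper.
    return SimpleNamespace(cache_hit=False, usage={"total_tokens": 42})


def cached_text_completion(request):
    # Stand-in for the cached text-completion helper.
    return SimpleNamespace(cache_hit=True, usage={})


class UsageTracker:
    # Stand-in for a usage tracker such as dspy's.
    def __init__(self) -> None:
        self.records = []

    def add_usage(self, model: str, usage: dict) -> None:
        self.records.append((model, usage))


model_type = "chat"
completion = (
    cached_chat_completion
    if model_type == "chat"
    else cached_text_completion
)
results = completion({"model": "example/model", "messages": []})

usage_tracker = UsageTracker()
if (
    not getattr(results, "cache_hit", False)
    and usage_tracker
    and hasattr(results, "usage")
):
    # Only record usage for fresh (non-cached) responses that report it.
    usage_tracker.add_usage("example/model", dict(results.usage))

print(usage_tracker.records)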
opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py

@@ -23,12 +23,15 @@ from dspy.teleprompt.utils import (
 from optuna.distributions import CategoricalDistribution

 from ..optimization_config.configs import TaskConfig
+from opik_optimizer import task_evaluator
+from opik_optimizer.optimization_config import mappers


-class Logger
+class Logger:
     def info(self, *args, **kwargs):
         print(*args)

+
 logger = Logger()

 # Constants
@@ -49,10 +52,6 @@ BLUE = "\033[94m"
 BOLD = "\033[1m"
 ENDC = "\033[0m" # Resets the color to default

-import opik
-from opik_optimizer import task_evaluator
-from opik_optimizer.optimization_config.configs import TaskConfig
-from opik_optimizer.optimization_config import mappers

 def get_prompt(program):
     """
@@ -65,6 +64,7 @@ def get_prompt(program):

     return instructions

+
 class MIPROv2(Teleprompter):
     def __init__(
         self,
@@ -554,18 +554,19 @@ class MIPROv2(Teleprompter):

         examples = []
         for demo in demo_candidates.values():
-            for
-                for example in
+            for l_ in demo:
+                for example in l_:
                     examples.append(example.toDict())
         prompt = get_prompt(program)
         experiment_config = {
             **self.experiment_config,
-            **{
-                "
-
+            **{
+                "configuration": {
+                    "prompt": prompt,
+                    "examples": examples,
+                },
+                "evaluation": "initial",
             },
-            "evaluation": "initial",
-        }
         }

         default_score = eval_candidate_program_with_opik(
@@ -579,7 +580,7 @@ class MIPROv2(Teleprompter):
             experiment_config=experiment_config,
             optimization_id=self.opik_optimization_id,
         )
-
+
         logger.info(f"Default program score: {default_score}\n")

         trial_logs = {}
@@ -606,7 +607,13 @@ class MIPROv2(Teleprompter):

         # Define the objective function
         def objective(trial):
-            nonlocal
+            nonlocal \
+                program, \
+                best_program, \
+                best_score, \
+                trial_logs, \
+                total_eval_calls, \
+                score_data

             trial_num = trial.number + 1
             if minibatch:
@@ -927,18 +934,19 @@ class MIPROv2(Teleprompter):

         examples = []
         for demo in demo_candidates.values():
-            for
-                for example in
+            for l_ in demo:
+                for example in l_:
                     examples.append(example.toDict())
         prompt = get_prompt(highest_mean_program)
         experiment_config = {
             **self.experiment_config,
-            **{
-                "
-
-
-
-
+            **{
+                "configuration": {
+                    "prompt": prompt,
+                    "examples": examples,
+                },
+                "evaluation": "full",
+            },
         }

         full_eval_score = eval_candidate_program_with_opik(
@@ -988,7 +996,7 @@ class MIPROv2(Teleprompter):
         trial_logs[trial_num + 1]["full_eval_score"] = full_eval_score

         if full_eval_score == 1.0:
-            return self.early_stop(default_score, program)
+            return self.early_stop(default_score, program)  # noqa

         # Update best score and program if necessary
         if full_eval_score > best_score:
@@ -1042,9 +1050,12 @@ def eval_candidate_program_with_opik(
             candidate_program._assert_failures += dspy.settings.get("assert_failures")
         if hasattr(candidate_program, "_suggest_failures"):
             candidate_program._suggest_failures += dspy.settings.get("suggest_failures")
-
-        return {mappers.from_llm_response_text(): prediction[prompt_task_config.output_dataset_field]}

+        return {
+            mappers.from_llm_response_text(): prediction[
+                prompt_task_config.output_dataset_field
+            ]
+        }

     score = task_evaluator.evaluate(
         dataset=opik_dataset,
@@ -1056,5 +1067,5 @@ def eval_candidate_program_with_opik(
         experiment_config=experiment_config,
         optimization_id=optimization_id,
     )
-
+
     return score
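The recurring `experiment_config` change in this file nests the prompt and the collected demo examples under a `configuration` key, with an `evaluation` label at the same level ("initial" for the baseline pass, "full" for the final evaluation). A sketch of the resulting shape, with placeholder values standing in for `get_prompt(...)` and the gathered examples:

# Placeholder values; the optimizer merges this on top of self.experiment_config.
prompt = "Answer the question concisely."
examples = [{"question": "2 + 2?", "answer": "4"}]

experiment_config = {
    "configuration": {
        "prompt": prompt,
        "examples": examples,
    },
    "evaluation": "initial",  # "full" for the final full-dataset evaluation
}

assert set(experiment_config) == {"configuration", "evaluation"}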