opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- opik_optimizer/__init__.py +7 -3
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +41 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +28 -20
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +96 -46
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +122 -37
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.0.dist-info/RECORD +50 -0
- opik_optimizer-0.9.1.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,9 @@
 from contextlib import contextmanager
+from typing import Any, Iterator
 
-import rich
 from rich.text import Text
 
+from ..optimization_config import chat_prompt
 from ..reporting_utils import (
     convert_tqdm_to_rich,
     display_configuration, # noqa: F401
@@ -18,33 +19,66 @@ console = get_console()
 
 
 @contextmanager
-def display_round_progress(max_rounds: int, verbose: int = 1):
+def display_round_progress(max_rounds: int, verbose: int = 1) -> Any:
     """Context manager to display messages during an evaluation phase."""
-
+
     # Create a simple object with a method to set the score
     class Reporter:
-        def failed_to_generate(self, num_prompts, error):
+        def failed_to_generate(self, num_prompts: int, error: str) -> None:
             if verbose >= 1:
-                console.print(
+                console.print(
+                    Text(
+                        f"│ Failed to generate {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}: {error}",
+                        style="red",
+                    )
+                )
                 console.print(Text("│"))
-
-        def round_start(self, round_number):
+
+        def round_start(self, round_number: int) -> None:
             if verbose >= 1:
-                console.print(
+                console.print(
+                    Text(
+                        f"│ - Starting optimization round {round_number + 1} of {max_rounds}"
+                    )
+                )
 
-        def round_end(self, round_number, score, best_score
+        def round_end(self, round_number: int, score: float, best_score: float) -> None:
             if verbose >= 1:
-                console.print(
+                console.print(
+                    Text(
+                        f"│ Completed optimization round {round_number + 1} of {max_rounds}"
+                    )
+                )
                 if best_score == 0 and score == 0:
-                    console.print(
+                    console.print(
+                        Text(
+                            "│ No improvement in this optimization round - score is 0",
+                            style="yellow",
+                        )
+                    )
                 elif best_score == 0:
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Found a new best performing prompt: {score:.4f}",
+                            style="green",
+                        )
+                    )
                 elif score > best_score:
                     perc_change = (score - best_score) / best_score
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Found a new best performing prompt: {score:.4f} ({perc_change:.2%})",
+                            style="green",
+                        )
+                    )
                 elif score <= best_score:
-                    console.print(
-
+                    console.print(
+                        Text(
+                            "│ No improvement in this optimization round",
+                            style="red",
+                        )
+                    )
+
                 console.print(Text("│"))
 
     # Use our log suppression context manager and yield the reporter
@@ -57,20 +91,22 @@ def display_round_progress(max_rounds: int, verbose: int = 1):
 
 
 @contextmanager
-def display_evaluation(
+def display_evaluation(
+    message: str = "First we will establish the baseline performance:", verbose: int = 1
+) -> Any:
     """Context manager to display messages during an evaluation phase."""
-    score = None
-
     # Entry point
     if verbose >= 1:
         console.print(Text(f"> {message}"))
-
+
     # Create a simple object with a method to set the score
     class Reporter:
-        def set_score(self, s):
+        def set_score(self, s: float) -> None:
             if verbose >= 1:
-                console.print(
-
+                console.print(
+                    Text(f"\r Baseline score was: {s:.4f}.\n", style="green")
+                )
+
     # Use our log suppression context manager and yield the reporter
     with suppress_opik_logs():
         with convert_tqdm_to_rich(" Evaluation", verbose=verbose):
@@ -79,62 +115,100 @@ def display_evaluation(message: str = "First we will establish the baseline perf
     finally:
         pass
 
-
+
+def display_optimization_start_message(verbose: int = 1) -> None:
     if verbose >= 1:
         console.print(Text("> Starting the optimization run"))
         console.print(Text("│"))
 
 
+class CandidateGenerationReporter:
+    def __init__(self, num_prompts: int):
+        self.num_prompts = num_prompts
+
+    def set_generated_prompts(self) -> None:
+        console.print(
+            Text(
+                f"│ Successfully generated {self.num_prompts} candidate prompt{'' if self.num_prompts == 1 else 's'}",
+                style="dim",
+            )
+        )
+        console.print(Text("│"))
+
+
 @contextmanager
-def display_candidate_generation_report(
-
-
+def display_candidate_generation_report(
+    num_prompts: int, verbose: int = 1
+) -> Iterator[CandidateGenerationReporter]:
     if verbose >= 1:
-        console.print(
-
-
-    class Reporter:
-        def set_generated_prompts(self, prompts):
-            console.print(Text(f"│ Successfully generated {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}", style="dim"))
-            console.print(Text("│"))
+        console.print(
+            Text(f"│ Generating candidate prompt{'' if num_prompts == 1 else 's'}:")
+        )
 
     try:
-        yield
+        yield CandidateGenerationReporter(num_prompts)
     finally:
         pass
 
 
 @contextmanager
-def display_prompt_candidate_scoring_report(
+def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
     """Context manager to display messages during an evaluation phase."""
+
     # Create a simple object with a method to set the score
     class Reporter:
-        def set_generated_prompts(
+        def set_generated_prompts(
+            self, candidate_count: int, prompt: chat_prompt.ChatPrompt
+        ) -> None:
             if verbose >= 1:
-                console.print(
-
-
-
+                console.print(
+                    Text(f"│ Evaluating candidate prompt {candidate_count+1}:")
+                )
+                display_messages(prompt.get_messages(), "│ ")
+
+        def set_final_score(self, best_score: float, score: float) -> None:
             if verbose >= 1:
                 if best_score == 0 and score > 0:
-                    console.print(
+                    console.print(
+                        Text(f"│ Evaluation score: {score:.4f}", style="green")
+                    )
                 elif best_score == 0 and score == 0:
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Evaluation score: {score:.4f}",
+                            style="dim yellow",
+                        )
+                    )
                 elif score > best_score:
                     perc_change = (score - best_score) / best_score
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Evaluation score: {score:.4f} ({perc_change:.2%})",
+                            style="green",
+                        )
+                    )
                 elif score < best_score:
                     perc_change = (score - best_score) / best_score
-                    console.print(
+                    console.print(
+                        Text(
+                            f"│ Evaluation score: {score:.4f} ({perc_change:.2%})",
+                            style="red",
+                        )
+                    )
                 else:
-                    console.print(
-
+                    console.print(
+                        Text(
+                            f"│ Evaluation score: {score:.4f}",
+                            style="dim yellow",
+                        )
+                    )
+
                 console.print(Text("│"))
                 console.print(Text("│"))
+
     try:
         with suppress_opik_logs():
             with convert_tqdm_to_rich("│ Evaluation", verbose=verbose):
                 yield Reporter()
     finally:
         pass
-
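The four hunks above retype these reporting helpers (judging by the net +74 lines, this appears to be `opik_optimizer/meta_prompt_optimizer/reporting.py` from the file list) so that each context manager yields a small reporter object with typed methods. A minimal usage sketch follows; the module path is inferred, and the candidate-generation and evaluation steps are placeholders rather than the optimizer's real logic:

```python
# Sketch only: module path inferred from the file list above; the generation and
# evaluation steps are placeholders, not the optimizer's real logic.
from opik_optimizer.meta_prompt_optimizer import reporting


def generate_candidates() -> list:
    return ["candidate prompt 1", "candidate prompt 2"]  # placeholder


def evaluate_candidates(candidates: list) -> float:
    return 0.42  # placeholder score


max_rounds, best_score = 2, 0.0

with reporting.display_evaluation(verbose=1) as baseline:
    baseline.set_score(0.30)  # placeholder baseline score

reporting.display_optimization_start_message(verbose=1)

with reporting.display_round_progress(max_rounds, verbose=1) as progress:
    for round_number in range(max_rounds):
        progress.round_start(round_number)

        with reporting.display_candidate_generation_report(2, verbose=1) as generation:
            candidates = generate_candidates()
            generation.set_generated_prompts()

        score = evaluate_candidates(candidates)
        progress.round_end(round_number, score, best_score)
        best_score = max(best_score, score)
```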
@@ -22,12 +22,13 @@ from dspy.dsp.utils.settings import settings
 from dspy.utils.callback import BaseCallback, with_callbacks
 from dspy.clients.base_lm import BaseLM
 
-from .._throttle import
+from .._throttle import rate_limited, get_rate_limiter_for_current_opik_installation
 
 logger = logging.getLogger(__name__)
 # Limit how fast an LLM can be called:
 limiter = get_rate_limiter_for_current_opik_installation()
 
+
 class LM(BaseLM):
     """
     A language model supporting chat or text completion requests for use with DSPy modules.
@@ -82,6 +83,7 @@ class LM(BaseLM):
         self.finetuning_model = finetuning_model
         self.launch_kwargs = launch_kwargs or {}
         self.train_kwargs = train_kwargs or {}
+        self.llm_call_counter = 0
 
         # Handle model-specific configuration for different model families
         model_family = model.split("/")[-1].lower() if "/" in model else model.lower()
@@ -94,7 +96,9 @@ class LM(BaseLM):
             assert (
                 max_tokens >= 20_000 and temperature == 1.0
             ), "OpenAI's reasoning models require passing temperature=1.0 and max_tokens >= 20_000 to `dspy.LM(...)`"
-            self.kwargs = dict(
+            self.kwargs = dict(
+                temperature=temperature, max_completion_tokens=max_tokens, **kwargs
+            )
         else:
             self.kwargs = dict(temperature=temperature, max_tokens=max_tokens, **kwargs)
 
@@ -110,14 +114,22 @@ class LM(BaseLM):
 
         # Make the request and handle LRU & disk caching.
         if cache_in_memory:
-            completion =
+            completion = (
+                cached_litellm_completion
+                if self.model_type == "chat"
+                else cached_litellm_text_completion
+            )
 
             results = completion(
                 request=dict(model=self.model, messages=messages, **kwargs),
                 num_retries=self.num_retries,
             )
         else:
-            completion =
+            completion = (
+                litellm_completion
+                if self.model_type == "chat"
+                else litellm_text_completion
+            )
 
             results = completion(
                 request=dict(model=self.model, messages=messages, **kwargs),
@@ -126,9 +138,14 @@ class LM(BaseLM):
                 cache={"no-cache": not cache, "no-store": not cache},
             )
 
-        if
+        if (
+            not getattr(results, "cache_hit", False)
+            and dspy.settings.usage_tracker
+            and hasattr(results, "usage")
+        ):
             settings.usage_tracker.add_usage(self.model, dict(results.usage))
 
+        self.llm_call_counter += 1
         return results
 
     def launch(self, launch_kwargs: Optional[Dict[str, Any]] = None):
@@ -237,7 +254,11 @@ def request_cache(maxsize: Optional[int] = None):
             return value.model_json_schema()
         elif isinstance(value, pydantic.BaseModel):
             return value.model_dump()
-        elif
+        elif (
+            callable(value)
+            and hasattr(value, "__code__")
+            and hasattr(value.__code__, "co_code")
+        ):
             return value.__code__.co_code.decode("utf-8")
         else:
             # Note: We don't attempt to compute a hash of the value, since the default
@@ -290,7 +311,11 @@ def cached_litellm_completion(request: Dict[str, Any], num_retries: int):
     )
 
 
-def litellm_completion(
+def litellm_completion(
+    request: Dict[str, Any],
+    num_retries: int,
+    cache={"no-cache": True, "no-store": True},
+):
     retry_kwargs = dict(
         retry_policy=_get_litellm_retry_policy(num_retries),
         retry_strategy="exponential_backoff_retry",
@@ -323,6 +348,7 @@ def litellm_completion(request: Dict[str, Any], num_retries: int, cache={"no-cac
             **retry_kwargs,
             **request,
         )
+
         chunks = []
         async for chunk in response:
             if caller_predict_id:
@@ -344,7 +370,11 @@ def cached_litellm_text_completion(request: Dict[str, Any], num_retries: int):
     )
 
 
-def litellm_text_completion(
+def litellm_text_completion(
+    request: Dict[str, Any],
+    num_retries: int,
+    cache={"no-cache": True, "no-store": True},
+):
     # Extract the provider and model from the model string.
     # TODO: Not all the models are in the format of "provider/model"
     model = request.pop("model").split("/", 1)
@@ -355,7 +385,9 @@ def litellm_text_completion(request: Dict[str, Any], num_retries: int, cache={"n
     api_base = request.pop("api_base", None) or os.getenv(f"{provider}_API_BASE")
 
     # Build the prompt from the messages.
-    prompt = "\n\n".join(
+    prompt = "\n\n".join(
+        [x["content"] for x in request.pop("messages")] + ["BEGIN RESPONSE:"]
+    )
 
     return litellm.text_completion(
         cache=cache,
@@ -23,12 +23,15 @@ from dspy.teleprompt.utils import (
 from optuna.distributions import CategoricalDistribution
 
 from ..optimization_config.configs import TaskConfig
+from opik_optimizer import task_evaluator
+from opik_optimizer.optimization_config import mappers
 
 
-class Logger
+class Logger:
     def info(self, *args, **kwargs):
         print(*args)
 
+
 logger = Logger()
 
 # Constants
@@ -49,10 +52,6 @@ BLUE = "\033[94m"
 BOLD = "\033[1m"
 ENDC = "\033[0m" # Resets the color to default
 
-import opik
-from opik_optimizer import task_evaluator
-from opik_optimizer.optimization_config.configs import TaskConfig
-from opik_optimizer.optimization_config import mappers
 
 def get_prompt(program):
     """
@@ -65,6 +64,7 @@ def get_prompt(program):
 
     return instructions
 
+
 class MIPROv2(Teleprompter):
     def __init__(
         self,
@@ -554,18 +554,19 @@ class MIPROv2(Teleprompter):
 
         examples = []
         for demo in demo_candidates.values():
-            for
-            for example in
+            for l_ in demo:
+                for example in l_:
                     examples.append(example.toDict())
         prompt = get_prompt(program)
         experiment_config = {
             **self.experiment_config,
-            **{
-                "
-
+            **{
+                "configuration": {
+                    "prompt": prompt,
+                    "examples": examples,
+                },
+                "evaluation": "initial",
             },
-            "evaluation": "initial",
-            }
         }
 
         default_score = eval_candidate_program_with_opik(
@@ -579,7 +580,7 @@ class MIPROv2(Teleprompter):
             experiment_config=experiment_config,
             optimization_id=self.opik_optimization_id,
         )
-
+
         logger.info(f"Default program score: {default_score}\n")
 
         trial_logs = {}
@@ -606,7 +607,13 @@ class MIPROv2(Teleprompter):
 
         # Define the objective function
         def objective(trial):
-            nonlocal
+            nonlocal \
+                program, \
+                best_program, \
+                best_score, \
+                trial_logs, \
+                total_eval_calls, \
+                score_data
 
             trial_num = trial.number + 1
             if minibatch:
@@ -927,18 +934,19 @@ class MIPROv2(Teleprompter):
 
         examples = []
         for demo in demo_candidates.values():
-            for
-            for example in
+            for l_ in demo:
+                for example in l_:
                     examples.append(example.toDict())
         prompt = get_prompt(highest_mean_program)
         experiment_config = {
             **self.experiment_config,
-            **{
-                "
-
-
-
-
+            **{
+                "configuration": {
+                    "prompt": prompt,
+                    "examples": examples,
+                },
+                "evaluation": "full",
+            },
         }
 
         full_eval_score = eval_candidate_program_with_opik(
@@ -988,7 +996,7 @@ class MIPROv2(Teleprompter):
             trial_logs[trial_num + 1]["full_eval_score"] = full_eval_score
 
             if full_eval_score == 1.0:
-                return self.early_stop(default_score, program)
+                return self.early_stop(default_score, program)  # noqa
 
             # Update best score and program if necessary
             if full_eval_score > best_score:
@@ -1042,9 +1050,12 @@ def eval_candidate_program_with_opik(
            candidate_program._assert_failures += dspy.settings.get("assert_failures")
        if hasattr(candidate_program, "_suggest_failures"):
            candidate_program._suggest_failures += dspy.settings.get("suggest_failures")
-
-        return {mappers.from_llm_response_text(): prediction[prompt_task_config.output_dataset_field]}
 
+        return {
+            mappers.from_llm_response_text(): prediction[
+                prompt_task_config.output_dataset_field
+            ]
+        }
 
     score = task_evaluator.evaluate(
         dataset=opik_dataset,
@@ -1056,5 +1067,5 @@ def eval_candidate_program_with_opik(
         experiment_config=experiment_config,
         optimization_id=optimization_id,
     )
-
+
     return score