opik-optimizer 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in the public registry.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +28 -11
- opik_optimizer/colbert.py +236 -0
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +152 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +20 -20
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +18 -17
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +21 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/{utils.py → utils/core.py} +111 -26
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
- opik_optimizer-1.1.0.dist-info/RECORD +73 -0
- opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.5.dist-info/RECORD +0 -50
- opik_optimizer-1.0.5.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
opik_optimizer/meta_prompt_optimizer/reporting.py

@@ -1,6 +1,8 @@
 from contextlib import contextmanager
-from typing import Any
+from typing import Any
+from collections.abc import Iterator

+from rich.panel import Panel
 from rich.text import Text

 from ..optimization_config import chat_prompt
@@ -136,6 +138,18 @@ class CandidateGenerationReporter:
         console.print(Text("│"))


+def display_tool_description(description: str, label: str, color: str) -> None:
+    if not description.strip():
+        return
+    console.print(
+        Panel(
+            description.strip(),
+            title=label,
+            border_style=color,
+        )
+    )
+
+
 @contextmanager
 def display_candidate_generation_report(
     num_prompts: int, verbose: int = 1
@@ -162,7 +176,7 @@ def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
     ) -> None:
         if verbose >= 1:
             console.print(
-                Text(f"│ Evaluating candidate prompt {candidate_count+1}:")
+                Text(f"│ Evaluating candidate prompt {candidate_count + 1}:")
             )
             display_messages(prompt.get_messages(), "│ ")

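The new `display_tool_description` helper wraps a tool description in a `rich` panel before printing it. A minimal, self-contained sketch of the same rendering (the console object and the call-site values are illustrative, not taken from the package):

```python
from rich.console import Console
from rich.panel import Panel

console = Console()

def display_tool_description(description: str, label: str, color: str) -> None:
    # Skip empty descriptions, mirroring the guard in the diff above.
    if not description.strip():
        return
    console.print(Panel(description.strip(), title=label, border_style=color))

# Illustrative call; the label and color values are assumptions.
display_tool_description("Searches library docs via context7.", "Tool: search", "cyan")
```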
opik_optimizer/mipro_optimizer/_lm.py

@@ -4,7 +4,7 @@ import os
 import re
 import threading
 from hashlib import sha256
-from typing import Any,
+from typing import Any, Literal, cast

 import litellm
 import pydantic
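Most of the edits in this file (and in the other mipro modules below) swap `typing.Optional`/`Dict`/`List` annotations for PEP 604 unions and built-in generics. A before/after sketch of the pattern, with an illustrative function name:

```python
from typing import Any, Dict, Optional

# Old style (pre-1.1.0):
def launch_old(launch_kwargs: Optional[Dict[str, Any]] = None) -> None: ...

# New style (1.1.0), as seen throughout this diff; requires Python 3.10+:
def launch_new(launch_kwargs: dict[str, Any] | None = None) -> None: ...
```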
@@ -42,12 +42,12 @@ class LM(BaseLM):
         max_tokens: int = 1000,
         cache: bool = True,
         cache_in_memory: bool = True,
-        callbacks:
+        callbacks: list[BaseCallback] | None = None,
         num_retries: int = 8,
         provider=None,
-        finetuning_model:
-        launch_kwargs:
-        train_kwargs:
+        finetuning_model: str | None = None,
+        launch_kwargs: dict[str, Any] | None = None,
+        train_kwargs: dict[str, Any] | None = None,
         **kwargs,
     ):
         """
@@ -93,9 +93,9 @@ class LM(BaseLM):

         if model_pattern:
             # Handle OpenAI reasoning models (o1, o3)
-            assert (
-                max_tokens >= 20_000
-            )
+            assert max_tokens >= 20_000 and temperature == 1.0, (
+                "OpenAI's reasoning models require passing temperature=1.0 and max_tokens >= 20_000 to `dspy.LM(...)`"
+            )
             self.kwargs = dict(
                 temperature=temperature, max_completion_tokens=max_tokens, **kwargs
             )
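The tightened assertion means the vendored `LM` wrapper now rejects o-series configurations unless both conditions hold. A compliant construction might look like the sketch below, assuming the constructor mirrors `dspy.LM` with the model string as the first argument (the model name and import path are assumptions based on the file listing):

```python
from opik_optimizer.mipro_optimizer._lm import LM  # module path taken from the file listing

# Sketch: parameters that satisfy the reasoning-model check above.
lm = LM(
    "openai/o3-mini",    # illustrative o-series model name
    temperature=1.0,     # reasoning models require temperature == 1.0
    max_tokens=20_000,   # and max_tokens >= 20_000
)
```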
@@ -148,17 +148,17 @@ class LM(BaseLM):
         self.llm_call_counter += 1
         return results

-    def launch(self, launch_kwargs:
+    def launch(self, launch_kwargs: dict[str, Any] | None = None):
         self.provider.launch(self, launch_kwargs)

-    def kill(self, launch_kwargs:
+    def kill(self, launch_kwargs: dict[str, Any] | None = None):
         self.provider.kill(self, launch_kwargs)

     def finetune(
         self,
-        train_data:
-        train_data_format:
-        train_kwargs:
+        train_data: list[dict[str, Any]],
+        train_data_format: TrainDataFormat | None,
+        train_kwargs: dict[str, Any] | None = None,
     ) -> TrainingJob:
         from dspy import settings as settings

@@ -222,7 +222,7 @@ class LM(BaseLM):
         return {key: getattr(self, key) for key in state_keys} | self.kwargs


-def request_cache(maxsize:
+def request_cache(maxsize: int | None = None):
    """
    A threadsafe decorator to create an in-memory LRU cache for LM inference functions that accept
    a dictionary-like LM request. An in-memory cache for LM calls is critical for ensuring
@@ -235,7 +235,7 @@ def request_cache(maxsize: Optional[int] = None):
        A decorator that wraps the target function with caching.
    """

-    def cache_key(request:
+    def cache_key(request: dict[str, Any]) -> str:
        """
        Obtain a unique cache key for the given request dictionary by hashing its JSON
        representation. For request fields having types that are known to be JSON-incompatible,
@@ -278,7 +278,7 @@ def request_cache(maxsize: Optional[int] = None):
        # concurrently, e.g. during optimization and evaluation
        lock=threading.RLock(),
    )
-    def func_cached(key: str, request:
+    def func_cached(key: str, request: dict[str, Any], *args, **kwargs):
        return func(request, *args, **kwargs)

    @functools.wraps(func)
@@ -303,7 +303,7 @@ def request_cache(maxsize: Optional[int] = None):


 @request_cache(maxsize=None)
-def cached_litellm_completion(request:
+def cached_litellm_completion(request: dict[str, Any], num_retries: int):
    return litellm_completion(
        request,
        cache={"no-cache": False, "no-store": False},
@@ -312,7 +312,7 @@ def cached_litellm_completion(request: Dict[str, Any], num_retries: int):


 def litellm_completion(
-    request:
+    request: dict[str, Any],
    num_retries: int,
    cache={"no-cache": True, "no-store": True},
 ):
@@ -362,7 +362,7 @@ def litellm_completion(


 @request_cache(maxsize=None)
-def cached_litellm_text_completion(request:
+def cached_litellm_text_completion(request: dict[str, Any], num_retries: int):
    return litellm_text_completion(
        request,
        num_retries=num_retries,
@@ -371,7 +371,7 @@ def cached_litellm_text_completion(request: Dict[str, Any], num_retries: int):


 def litellm_text_completion(
-    request:
+    request: dict[str, Any],
    num_retries: int,
    cache={"no-cache": True, "no-store": True},
 ):
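`request_cache` memoizes LM calls on a hash of the request's JSON representation, guarded by an `RLock` so concurrent optimization and evaluation threads can share the cache safely. A simplified, self-contained sketch of that pattern (it drops the LRU eviction and the decorator-factory arguments for brevity; the names are illustrative, not the package's implementation):

```python
import json
import threading
from collections.abc import Callable
from hashlib import sha256
from typing import Any

def request_cache(func: Callable[[dict[str, Any]], Any]) -> Callable[[dict[str, Any]], Any]:
    cache: dict[str, Any] = {}  # unbounded here; the real code uses a bounded LRU cache
    lock = threading.RLock()

    def cache_key(request: dict[str, Any]) -> str:
        # Hash a deterministic JSON serialization; repr() covers non-JSON-serializable values.
        payload = json.dumps(request, sort_keys=True, default=repr)
        return sha256(payload.encode()).hexdigest()

    def wrapper(request: dict[str, Any]) -> Any:
        key = cache_key(request)
        with lock:
            if key in cache:
                return cache[key]
        result = func(request)
        with lock:
            cache[key] = result
        return result

    return wrapper

@request_cache
def fake_completion(request: dict[str, Any]) -> str:
    # Stand-in for a litellm call; repeated identical requests hit the cache.
    return f"response for {request['messages'][0]['content']}"
```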
opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py

@@ -1,7 +1,8 @@
 import random
 import textwrap
 from collections import defaultdict
-from typing import Any,
+from typing import Any, Literal
+from collections.abc import Callable

 import dspy
 import numpy as np
@@ -69,27 +70,27 @@ class MIPROv2(Teleprompter):
     def __init__(
         self,
         metric: Callable,
-        prompt_model:
-        task_model:
-        teacher_settings:
+        prompt_model: Any | None = None,
+        task_model: Any | None = None,
+        teacher_settings: dict = {},
         max_bootstrapped_demos: int = 4,
         max_labeled_demos: int = 4,
-        auto:
+        auto: Literal["light", "medium", "heavy"] | None = "medium",
         num_candidates: int = 10,
-        num_threads:
+        num_threads: int | None = None,
         max_errors: int = 10,
         seed: int = 9,
         init_temperature: float = 0.5,
         verbose: bool = False,
         track_stats: bool = True,
-        log_dir:
-        metric_threshold:
-        opik_dataset:
-        opik_metric:
-        opik_prompt_task_config:
-        opik_project_name:
-        opik_optimization_id:
-        experiment_config:
+        log_dir: str | None = None,
+        metric_threshold: float | None = None,
+        opik_dataset: opik.Dataset | None = None,
+        opik_metric: Callable | None = None,
+        opik_prompt_task_config: TaskConfig | None = None,
+        opik_project_name: str | None = None,
+        opik_optimization_id: str | None = None,
+        experiment_config: dict[str, Any] | None = None,
     ):
         # Validate 'auto' parameter
         allowed_modes = {None, "light", "medium", "heavy"}
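The vendored MIPROv2 teleprompter now types its `auto` budget as a `Literal` and exposes the Opik-specific wiring as explicit optional parameters. A construction sketch based only on the signature above (the import path is assumed from the file listing, the metric is a toy placeholder, and the Opik-specific arguments are left at their `None` defaults):

```python
from opik_optimizer.mipro_optimizer._mipro_optimizer_v2 import MIPROv2  # vendored copy, path assumed

def exact_match(example, prediction, trace=None) -> float:
    # Toy metric standing in for a real dspy metric.
    return float(example.answer == prediction.answer)

teleprompter = MIPROv2(
    metric=exact_match,
    auto="medium",       # typed as Literal["light", "medium", "heavy"] | None
    num_candidates=10,
    num_threads=4,
)
```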
@@ -129,13 +130,13 @@ class MIPROv2(Teleprompter):
         self,
         student: Any,
         *,
-        trainset:
+        trainset: list,
         teacher: Any = None,
-        valset:
+        valset: list | None = None,
         num_trials: int = 30,
-        max_bootstrapped_demos:
-        max_labeled_demos:
-        seed:
+        max_bootstrapped_demos: int | None = None,
+        max_labeled_demos: int | None = None,
+        seed: int | None = None,
         minibatch: bool = True,
         minibatch_size: int = 35,
         minibatch_full_eval_steps: int = 5,
@@ -145,7 +146,7 @@ class MIPROv2(Teleprompter):
         tip_aware_proposer: bool = True,
         fewshot_aware_proposer: bool = True,
         requires_permission_to_run: bool = True,
-        provide_traceback:
+        provide_traceback: bool | None = None,
     ) -> Any:
         # Set random seeds
         seed = seed or self.seed
@@ -252,8 +253,8 @@ class MIPROv2(Teleprompter):
         num_trials: int,
         minibatch: bool,
         zeroshot_opt: bool,
-        valset:
-    ) ->
+        valset: list,
+    ) -> tuple[int, list, bool]:
         if self.auto is None:
             return num_trials, valset, minibatch

@@ -273,7 +274,7 @@ class MIPROv2(Teleprompter):

         return num_trials, valset, minibatch

-    def _set_and_validate_datasets(self, trainset:
+    def _set_and_validate_datasets(self, trainset: list, valset: list | None):
         if not trainset:
             raise ValueError("Trainset cannot be empty.")

@@ -292,7 +293,7 @@ class MIPROv2(Teleprompter):

         return trainset, valset

-    def _print_auto_run_settings(self, num_trials: int, minibatch: bool, valset:
+    def _print_auto_run_settings(self, num_trials: int, minibatch: bool, valset: list):
         logger.info(
             f"\nRUNNING WITH THE FOLLOWING {self.auto.upper()} AUTO RUN SETTINGS:"
             f"\nnum_trials: {num_trials}"
@@ -308,9 +309,9 @@ class MIPROv2(Teleprompter):
         minibatch: bool,
         minibatch_size: int,
         minibatch_full_eval_steps: int,
-        valset:
+        valset: list,
         program_aware_proposer: bool,
-    ) ->
+    ) -> tuple[str, str]:
         num_predictors = len(program.predictors())

         # Estimate prompt model calls
@@ -359,7 +360,7 @@ class MIPROv2(Teleprompter):
         minibatch: bool,
         minibatch_size: int,
         minibatch_full_eval_steps: int,
-        valset:
+        valset: list,
         program_aware_proposer: bool,
     ) -> bool:
         prompt_model_line, task_model_line = self._estimate_lm_calls(
@@ -414,8 +415,8 @@ class MIPROv2(Teleprompter):
         return user_input == "y"

     def _bootstrap_fewshot_examples(
-        self, program: Any, trainset:
-    ) ->
+        self, program: Any, trainset: list, seed: int, teacher: Any
+    ) -> list | None:
         logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==")
         if self.max_bootstrapped_demos > 0:
             logger.info(
@@ -461,14 +462,14 @@ class MIPROv2(Teleprompter):
     def _propose_instructions(
         self,
         program: Any,
-        trainset:
-        demo_candidates:
+        trainset: list,
+        demo_candidates: list | None,
         view_data_batch_size: int,
         program_aware_proposer: bool,
         data_aware_proposer: bool,
         tip_aware_proposer: bool,
         fewshot_aware_proposer: bool,
-    ) ->
+    ) -> dict[int, list[str]]:
         logger.info("\n==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==")
         logger.info(
             "We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions."
@@ -513,16 +514,16 @@ class MIPROv2(Teleprompter):
     def _optimize_prompt_parameters(
         self,
         program: Any,
-        instruction_candidates:
-        demo_candidates:
+        instruction_candidates: dict[int, list[str]],
+        demo_candidates: list | None,
         evaluate: Evaluate,
-        valset:
+        valset: list,
         num_trials: int,
         minibatch: bool,
         minibatch_size: int,
         minibatch_full_eval_steps: int,
         seed: int,
-    ) ->
+    ) -> Any | None:
         # Run optimization
         optuna.logging.set_verbosity(optuna.logging.WARNING)
         logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==")
@@ -848,12 +849,12 @@ class MIPROv2(Teleprompter):
     def _select_and_insert_instructions_and_demos(
         self,
         candidate_program: Any,
-        instruction_candidates:
-        demo_candidates:
+        instruction_candidates: dict[int, list[str]],
+        demo_candidates: list | None,
         trial: optuna.trial.Trial,
-        trial_logs:
+        trial_logs: dict,
         trial_num: int,
-    ) ->
+    ) -> list[str]:
         chosen_params = []
         raw_chosen_params = {}

@@ -902,18 +903,18 @@ class MIPROv2(Teleprompter):
         self,
         trial_num: int,
         adjusted_num_trials: int,
-        param_score_dict:
-        fully_evaled_param_combos:
+        param_score_dict: dict,
+        fully_evaled_param_combos: dict,
         evaluate: Evaluate,
-        valset:
-        trial_logs:
+        valset: list,
+        trial_logs: dict,
         total_eval_calls: int,
         score_data,
         best_score: float,
         best_program: Any,
         study: optuna.Study,
-        instruction_candidates:
-        demo_candidates:
+        instruction_candidates: list,
+        demo_candidates: list,
     ):
         logger.info(
             f"===== Trial {trial_num + 1} / {adjusted_num_trials} - Full Evaluation ====="
@@ -1026,19 +1027,19 @@ class MIPROv2(Teleprompter):

 def eval_candidate_program_with_opik(
     opik_dataset: opik.Dataset,
-    trainset:
+    trainset: list,
     candidate_program: Any,
     project_name: str,
     metric: Callable,
     prompt_task_config: TaskConfig,
     num_threads: int,
-    experiment_config:
-    optimization_id:
+    experiment_config: dict[str, Any] | None = None,
+    optimization_id: str | None = None,
 ):
     """Evaluate a candidate program on the trainset, using the specified batch size."""
     dataset_item_ids = [example["id"] for example in trainset]

-    def program_task(dataset_item:
+    def program_task(dataset_item: dict[str, Any]) -> dict[str, Any]:
         program_inputs = {
             input_key: dataset_item[input_key]
             for input_key in prompt_task_config.input_dataset_fields
opik_optimizer/mipro_optimizer/mipro_optimizer.py

@@ -1,7 +1,8 @@
 import os
 import random
 from datetime import datetime
-from typing import
+from typing import Literal
+from collections.abc import Callable
 import logging

 import dspy
@@ -37,7 +38,7 @@ class MiproOptimizer(BaseOptimizer):
     def __init__(
         self,
         model,
-        project_name:
+        project_name: str | None = None,
         verbose: int = 1,
         **model_kwargs,
     ):
@@ -54,13 +55,13 @@ class MiproOptimizer(BaseOptimizer):

     def evaluate_prompt(
         self,
-        dataset:
+        dataset: str | Dataset,
         metric: Callable,
         task_config: TaskConfig,
-        prompt:
+        prompt: str | dspy.Module | OptimizationResult | None = None,
         n_samples: int = 10,
-        dataset_item_ids:
-        experiment_config:
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
         verbose: int = 1,
         **kwargs,
     ) -> float:
@@ -238,14 +239,14 @@ class MiproOptimizer(BaseOptimizer):

     def optimize_prompt(
         self,
-        dataset:
+        dataset: str | Dataset,
         metric: Callable,
         task_config: TaskConfig,
         num_candidates: int = 10,
-        experiment_config:
-        num_trials:
-        n_samples:
-        auto:
+        experiment_config: dict | None = None,
+        num_trials: int | None = 3,
+        n_samples: int | None = 10,
+        auto: Literal["light", "medium", "heavy"] | None = "light",
         **kwargs,
     ) -> OptimizationResult:
         self._opik_client = opik.Opik()
@@ -271,15 +272,15 @@ class MiproOptimizer(BaseOptimizer):

     def _optimize_prompt(
         self,
-        dataset:
+        dataset: str | Dataset,
         metric: Callable,
         task_config: TaskConfig,
         num_candidates: int = 10,
-        experiment_config:
-        optimization_id:
-        num_trials:
-        n_samples:
-        auto:
+        experiment_config: dict | None = None,
+        optimization_id: str | None = None,
+        num_trials: int | None = 3,
+        n_samples: int | None = 10,
+        auto: Literal["light", "medium", "heavy"] | None = "light",
         **kwargs,
     ) -> OptimizationResult:
         logger.info("Preparing MIPRO optimization...")
@@ -306,11 +307,11 @@ class MiproOptimizer(BaseOptimizer):
         metric,
         task_config,
         num_candidates: int = 10,
-        experiment_config:
-        optimization_id:
-        num_trials:
-        n_samples:
-        auto:
+        experiment_config: dict | None = None,
+        optimization_id: str | None = None,
+        num_trials: int | None = 3,
+        n_samples: int | None = 10,
+        auto: Literal["light", "medium", "heavy"] | None = "light",
         **kwargs,
     ) -> None:
         # FIXME: Intermediate values:
@@ -508,9 +509,11 @@ class MiproOptimizer(BaseOptimizer):
                 }
             ],
             score=0.0,
-            metric_name=
-
-
+            metric_name=(
+                self.opik_metric.__name__
+                if hasattr(self, "opik_metric")
+                else "unknown_metric"
+            ),
             details={"error": "No candidate programs generated by MIPRO"},
             history=mipro_history_processed,
             llm_calls=self.lm.llm_call_counter,
@@ -564,9 +567,11 @@ class MiproOptimizer(BaseOptimizer):
                 }
             ],
             score=0.0,
-            metric_name=
-
-
+            metric_name=(
+                getattr(self, "opik_metric", None).name
+                if hasattr(self, "opik_metric") and self.opik_metric
+                else "unknown_metric"
+            ),
             details={"error": "No programs generated or compile failed"},
             history=[],
             llm_calls=self.lm.llm_call_counter,
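`optimize_prompt` keeps its shape but now spells its defaults with modern unions, including the `auto` budget literal. A usage sketch based only on the signature above; the top-level import path is assumed, and the dataset, metric, and task-config values are placeholders rather than a tested recipe:

```python
from opik_optimizer import MiproOptimizer  # top-level export assumed

optimizer = MiproOptimizer(model="openai/gpt-4o-mini", project_name="my-project")

result = optimizer.optimize_prompt(
    dataset="my-dataset",        # accepts a dataset name (str) or an opik Dataset object
    metric=my_metric,            # placeholder: a Callable that scores dataset items
    task_config=my_task_config,  # placeholder: a TaskConfig describing input/output fields
    num_trials=3,
    n_samples=10,
    auto="light",                # Literal["light", "medium", "heavy"] | None
)
print(result.score)
```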
opik_optimizer/mipro_optimizer/utils.py

@@ -1,5 +1,3 @@
-from typing import Dict, Optional
-
 import uuid
 import dspy
 import re
@@ -61,7 +59,7 @@ def opik_metric_to_dspy(metric, output):


 def create_dspy_training_set(
-    data: list[dict], input: str, n_samples:
+    data: list[dict], input: str, n_samples: int | None = None
 ) -> list[dspy.Example]:
     """
     Turn a list of dicts into a list of dspy Examples
@@ -80,7 +78,7 @@ def create_dspy_training_set(
     return output


-def get_tool_prompts(tool_names, text: str) ->
+def get_tool_prompts(tool_names, text: str) -> dict[str, str]:
     """
     Extract the embedded tool prompts from a text.
     """
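`create_dspy_training_set` turns Opik dataset items (plain dicts) into `dspy.Example` objects, optionally sampling `n_samples` of them. A rough equivalent using the standard `dspy.Example(...).with_inputs(...)` pattern (a sketch, not the package's exact implementation):

```python
import random
import dspy

def dicts_to_examples(data: list[dict], input: str, n_samples: int | None = None) -> list[dspy.Example]:
    # Optionally subsample the raw items before conversion.
    items = random.sample(data, n_samples) if n_samples and n_samples < len(data) else data
    # Mark the given field as the example's input; the remaining fields become labels.
    return [dspy.Example(**item).with_inputs(input) for item in items]

examples = dicts_to_examples(
    [{"question": "2 + 2?", "answer": "4"}, {"question": "Capital of France?", "answer": "Paris"}],
    input="question",
)
```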
opik_optimizer/optimizable_agent.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Any, TYPE_CHECKING
 import json
 import os

@@ -16,7 +16,7 @@ if TYPE_CHECKING:
     from .optimization_config.chat_prompt import ChatPrompt


-def tools_to_dict(tools:
+def tools_to_dict(tools: dict[str, dict[str, Any]]) -> dict[str, Any]:
     retval = {}
     for name in tools:
         parts = {}
@@ -38,11 +38,11 @@ class OptimizableAgent:
         project_name (Optional[str]): The project name for tracking
     """

-    model:
-    model_kwargs:
-    project_name:
-    input_dataset_field:
-    prompts:
+    model: str | None = None
+    model_kwargs: dict[str, Any] = {}
+    project_name: str | None = "Default Project"
+    input_dataset_field: str | None = None
+    prompts: dict[str, "ChatPrompt"]
     prompt: "ChatPrompt"

     def __init__(self, prompt: "ChatPrompt") -> None:
@@ -58,7 +58,8 @@ class OptimizableAgent:
     def init_llm(self) -> None:
         """Initialize the LLM with the appropriate callbacks."""
         # Litellm bug requires this (maybe problematic if multi-threaded)
-
+        if "OPIK_PROJECT_NAME" not in os.environ:
+            os.environ["OPIK_PROJECT_NAME"] = str(self.project_name)
         self.opik_logger = OpikLogger()
         litellm.callbacks = [self.opik_logger]

@@ -70,8 +71,8 @@ class OptimizableAgent:
     @_throttle.rate_limited(_limiter)
     def _llm_complete(
         self,
-        messages:
-        tools:
+        messages: list[dict[str, str]],
+        tools: list[dict[str, str]] | None,
         seed: int,
     ) -> Any:
         response = litellm.completion(
@@ -90,10 +91,10 @@ class OptimizableAgent:

     def llm_invoke(
         self,
-        query:
-        messages:
-        seed:
-        allow_tool_use:
+        query: str | None = None,
+        messages: list[dict[str, str]] | None = None,
+        seed: int | None = None,
+        allow_tool_use: bool | None = False,
     ) -> str:
         """
         NOTE: this is the default LiteLLM API. It is used
@@ -155,14 +156,14 @@ class OptimizableAgent:
         result = response.choices[0].message.content
         return result

-    def invoke_dataset_item(self, dataset_item:
+    def invoke_dataset_item(self, dataset_item: dict[str, str]) -> str:
         messages = self.prompt.get_messages(dataset_item)
         return self.invoke(messages)

     def invoke(
         self,
-        messages:
-        seed:
+        messages: list[dict[str, str]],
+        seed: int | None = None,
     ) -> str:
         """
         Invoke the agent with a dataset item.
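`OptimizableAgent` now declares its configurable class attributes with concrete defaults and only sets `OPIK_PROJECT_NAME` when the variable is not already defined. A minimal subclass sketch using only the attributes and methods visible above; the import path, prompt construction, and model name are illustrative assumptions:

```python
from opik_optimizer import ChatPrompt, OptimizableAgent  # top-level exports assumed

class SupportAgent(OptimizableAgent):
    # Class-level configuration, matching the attributes shown in the diff.
    model = "openai/gpt-4o-mini"   # illustrative model name
    project_name = "support-agent"
    model_kwargs = {"temperature": 0.0}

prompt = ChatPrompt(
    system="Answer the user's question concisely.",
    user="{question}",
)

agent = SupportAgent(prompt)
# invoke_dataset_item() fills the prompt from a dataset item and calls the LLM.
answer = agent.invoke_dataset_item({"question": "What does opik-optimizer do?"})
```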