opik-optimizer 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +28 -11
- opik_optimizer/colbert.py +236 -0
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +152 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +20 -20
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +18 -17
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +21 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/{utils.py → utils/core.py} +111 -26
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
- opik_optimizer-1.1.0.dist-info/RECORD +73 -0
- opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.5.dist-info/RECORD +0 -50
- opik_optimizer-1.0.5.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
opik_optimizer/evolutionary_optimizer/crossover_ops.py
@@ -0,0 +1,194 @@
+from typing import Any, TYPE_CHECKING
+
+import logging
+import random
+import json
+
+from deap import creator as _creator
+
+from . import prompts as evo_prompts
+from . import reporting
+from .. import utils
+
+
+logger = logging.getLogger(__name__)
+creator = _creator  # backward compt.
+
+
+class CrossoverOps:
+    if TYPE_CHECKING:
+        verbose: int
+        output_style_guidance: str
+        _call_model: Any
+
+    def _deap_crossover_chunking_strategy(
+        self, messages_1_str: str, messages_2_str: str
+    ) -> tuple[str, str]:
+        chunks1 = [
+            chunk.strip() for chunk in messages_1_str.split(".") if chunk.strip()
+        ]
+        chunks2 = [
+            chunk.strip() for chunk in messages_2_str.split(".") if chunk.strip()
+        ]
+
+        if len(chunks1) >= 2 and len(chunks2) >= 2:
+            min_num_chunks = min(len(chunks1), len(chunks2))
+            point = random.randint(1, min_num_chunks - 1)
+            child1_chunks = chunks1[:point] + chunks2[point:]
+            child2_chunks = chunks2[:point] + chunks1[point:]
+            child1_str = ". ".join(child1_chunks) + ("." if child1_chunks else "")
+            child2_str = ". ".join(child2_chunks) + ("." if child2_chunks else "")
+            return child1_str, child2_str
+        else:
+            raise ValueError(
+                "Not enough chunks in either prompt for chunk-level crossover"
+            )
+
+    def _deap_crossover_word_level(
+        self, messages_1_str: str, messages_2_str: str
+    ) -> tuple[str, str]:
+        words1 = messages_1_str.split()
+        words2 = messages_2_str.split()
+        if not words1 or not words2:
+            return messages_1_str, messages_2_str
+        min_word_len = min(len(words1), len(words2))
+        if min_word_len < 2:
+            return messages_1_str, messages_2_str
+        point = random.randint(1, min_word_len - 1)
+        child1_words = words1[:point] + words2[point:]
+        child2_words = words2[:point] + words1[point:]
+        return " ".join(child1_words), " ".join(child2_words)
+
+    def _deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
+        """Crossover operation that preserves semantic meaning.
+        Attempts chunk-level crossover first, then falls back to word-level.
+        """
+        reporting.display_message(
+            " Recombining prompts by mixing and matching words and sentences.",
+            verbose=self.verbose,
+        )
+        messages_1_orig: list[dict[str, str]] = ind1
+        messages_2_orig: list[dict[str, str]] = ind2
+
+        for i, message_1 in enumerate(messages_1_orig):
+            role: str = message_1["role"]
+            message_1_str: str = message_1["content"]
+            if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]["role"] == role):
+                message_2 = messages_2_orig[i]
+                message_2_str: str = message_2["content"]
+                try:
+                    child1_str, child2_str = self._deap_crossover_chunking_strategy(
+                        message_1_str, message_2_str
+                    )
+                except ValueError:
+                    child1_str, child2_str = self._deap_crossover_word_level(
+                        message_1_str, message_2_str
+                    )
+                messages_1_orig[i]["content"] = child1_str
+                messages_2_orig[i]["content"] = child2_str
+            else:
+                pass
+
+        return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
+
+    def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
+        """Perform crossover by asking an LLM to blend two parent prompts."""
+        reporting.display_message(
+            " Recombining prompts using an LLM.", verbose=self.verbose
+        )
+
+        parent1_messages: list[dict[str, str]] = ind1
+        parent2_messages: list[dict[str, str]] = ind2
+        current_output_style_guidance = self.output_style_guidance
+
+        user_prompt_for_llm_crossover = evo_prompts.llm_crossover_user_prompt(
+            parent1_messages, parent2_messages, current_output_style_guidance
+        )
+        try:
+            logger.debug(
+                f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'"
+            )
+            response_content = self._call_model(
+                messages=[
+                    {
+                        "role": "system",
+                        "content": evo_prompts.llm_crossover_system_prompt(
+                            current_output_style_guidance
+                        ),
+                    },
+                    {"role": "user", "content": user_prompt_for_llm_crossover},
+                ],
+                is_reasoning=True,
+            )
+            logger.debug(f"Raw LLM response for crossover: {response_content}")
+
+            # First, try strict JSON parsing
+            json_response = None
+            try:
+                json_response = utils.json_to_dict(response_content)
+            except Exception:
+                # Continue with heuristic extraction below
+                json_response = None
+            children: list[list[dict[str, str]]] = []
+            if isinstance(json_response, list):
+                children = [c for c in json_response if isinstance(c, list)]
+
+            # If strict parse failed to yield children, try extracting arrays heuristically
+            if not children:
+                extracted = self._extract_json_arrays(response_content)
+                for arr in extracted:
+                    try:
+                        parsed = json.loads(arr)
+                        if isinstance(parsed, list) and all(
+                            isinstance(m, dict) and {"role", "content"} <= set(m.keys())
+                            for m in parsed
+                        ):
+                            children.append(parsed)
+                    except Exception:
+                        continue
+
+            if len(children) == 0:
+                raise ValueError("LLM response did not include any valid child prompts")
+
+            # We only need two children; if only one returned, duplicate pattern from DEAP
+            first_child = children[0]
+            second_child = children[1] if len(children) > 1 else children[0]
+            return creator.Individual(first_child), creator.Individual(second_child)
+        except Exception as e:
+            logger.warning(
+                f"LLM-driven crossover failed: {e}. Falling back to DEAP crossover."
+            )
+            return self._deap_crossover(ind1, ind2)
+
+    def _extract_json_arrays(self, text: str) -> list[str]:
+        """Extract top-level JSON array substrings from arbitrary text.
+        This helps when models return multiple arrays like `[...],\n[...]`.
+        """
+        arrays: list[str] = []
+        depth = 0
+        start: int | None = None
+        in_str = False
+        escape = False
+        for i, ch in enumerate(text):
+            if escape:
+                # current char is escaped; skip special handling
+                escape = False
+                continue
+            if ch == "\\":
+                escape = True
+                continue
+            if ch == '"':
+                in_str = not in_str
+                continue
+            if in_str:
+                continue
+            if ch == "[":
+                if depth == 0:
+                    start = i
+                depth += 1
+            elif ch == "]" and depth > 0:
+                depth -= 1
+                if depth == 0 and start is not None:
+                    arrays.append(text[start : i + 1])
+                    start = None
+        return arrays
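For orientation, here is a minimal standalone sketch of the chunk-level crossover logic the new `CrossoverOps._deap_crossover_chunking_strategy` implements (splitting prompts into sentence chunks at periods and swapping tails at a random point). The function below is an illustrative re-implementation, not an import from opik_optimizer.

```python
# Standalone sketch mirroring the chunk-level crossover above; illustrative only.
import random


def sentence_crossover(prompt_a: str, prompt_b: str) -> tuple[str, str]:
    # Split each prompt into sentence "chunks" on periods, dropping empties.
    chunks_a = [c.strip() for c in prompt_a.split(".") if c.strip()]
    chunks_b = [c.strip() for c in prompt_b.split(".") if c.strip()]
    if len(chunks_a) < 2 or len(chunks_b) < 2:
        raise ValueError("Not enough chunks for chunk-level crossover")
    # Pick a single crossover point and swap the tails of the two parents.
    point = random.randint(1, min(len(chunks_a), len(chunks_b)) - 1)
    child_a = ". ".join(chunks_a[:point] + chunks_b[point:]) + "."
    child_b = ". ".join(chunks_b[:point] + chunks_a[point:]) + "."
    return child_a, child_b


random.seed(0)
parent_1 = "You are a helpful assistant. Answer briefly. Cite your sources."
parent_2 = "You are a math tutor. Show your working. Keep answers short."
print(sentence_crossover(parent_1, parent_2))
```

When a prompt has fewer than two sentences, the real implementation falls back to the word-level variant, and `_llm_deap_crossover` falls back to this DEAP-style crossover whenever the LLM response cannot be parsed into valid child message lists.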
opik_optimizer/evolutionary_optimizer/evaluation_ops.py
@@ -0,0 +1,73 @@
+from typing import Any, TYPE_CHECKING
+from collections.abc import Callable
+
+
+from .. import task_evaluator
+from ..optimization_config import mappers, chat_prompt
+import opik
+
+
+class EvaluationOps:
+    if TYPE_CHECKING:
+        agent_class: type[Any]
+        num_threads: int
+
+    def _evaluate_prompt(
+        self,
+        prompt: chat_prompt.ChatPrompt,
+        messages: list[dict[str, str]],
+        dataset: opik.Dataset,
+        metric: Callable,
+        n_samples: int | None = None,
+        dataset_item_ids: list[str] | None = None,
+        experiment_config: dict | None = None,
+        optimization_id: str | None = None,
+        verbose: int = 0,
+        **kwargs: Any,
+    ) -> float:
+        """Evaluate a single prompt (individual) against the dataset and return the score."""
+        total_items = len(dataset.get_items())
+
+        new_prompt = prompt.copy()
+        new_prompt.set_messages(messages)
+
+        experiment_config = experiment_config or {}
+        experiment_config["project_name"] = self.agent_class.project_name
+        experiment_config = {
+            **experiment_config,
+            "optimizer": self.__class__.__name__,
+            "agent_class": self.agent_class.__name__,
+            "agent_config": new_prompt.to_dict(),
+            "metric": metric.__name__,
+            "dataset": dataset.name,
+            "configuration": {
+                "prompt": new_prompt.get_messages(),
+                "n_samples_for_eval": (
+                    len(dataset_item_ids) if dataset_item_ids is not None else n_samples
+                ),
+                "total_dataset_items": total_items,
+            },
+        }
+        try:
+            agent = self.agent_class(new_prompt)
+        except Exception:
+            return 0.0
+
+        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
+            messages = new_prompt.get_messages(dataset_item)
+            model_output = agent.invoke(messages)
+            return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
+
+        score = task_evaluator.evaluate(
+            dataset=dataset,
+            dataset_item_ids=dataset_item_ids,
+            metric=metric,
+            evaluated_task=llm_task,
+            num_threads=self.num_threads,
+            project_name=experiment_config["project_name"],
+            n_samples=n_samples if dataset_item_ids is None else None,
+            experiment_config=experiment_config,
+            optimization_id=optimization_id,
+            verbose=verbose,
+        )
+        return score
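Conceptually, `_evaluate_prompt` wraps the per-item flow of "render messages from the dataset item, invoke the agent, score the output with the metric" and averages the result over the evaluated items. The sketch below illustrates that shape with hypothetical stand-ins (`render_messages`, `exact_match`, `fake_agent`); it is not the opik_optimizer API.

```python
# Standalone sketch of the evaluation flow; all names here are illustrative stand-ins.
from statistics import mean


def render_messages(template: list[dict[str, str]], item: dict[str, str]) -> list[dict[str, str]]:
    # Fill {placeholders} in message content from the dataset item.
    return [{"role": m["role"], "content": m["content"].format(**item)} for m in template]


def exact_match(output: str, reference: str) -> float:
    return 1.0 if output.strip().lower() == reference.strip().lower() else 0.0


def fake_agent(messages: list[dict[str, str]]) -> str:
    # Stand-in for agent.invoke(messages); a real agent would call an LLM.
    return "paris"


template = [
    {"role": "system", "content": "Answer with a single word."},
    {"role": "user", "content": "{question}"},
]
dataset = [
    {"question": "Capital of France?", "answer": "Paris"},
    {"question": "Capital of Spain?", "answer": "Madrid"},
]

scores = []
for item in dataset:
    output = fake_agent(render_messages(template, item))
    scores.append(exact_match(output, item["answer"]))
print(mean(scores))  # average score over the evaluated dataset items
```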