opik-optimizer 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. opik_optimizer/__init__.py +2 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +28 -11
  4. opik_optimizer/colbert.py +236 -0
  5. opik_optimizer/data/context7_eval.jsonl +3 -0
  6. opik_optimizer/datasets/context7_eval.py +90 -0
  7. opik_optimizer/datasets/tiny_test.py +33 -34
  8. opik_optimizer/datasets/truthful_qa.py +2 -2
  9. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  10. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
  11. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
  12. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  13. opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +152 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +20 -20
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +16 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +21 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/{utils.py → utils/core.py} +111 -26
  46. opik_optimizer/utils/dataset_utils.py +49 -0
  47. opik_optimizer/utils/prompt_segments.py +186 -0
  48. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
  49. opik_optimizer-1.1.0.dist-info/RECORD +73 -0
  50. opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
  51. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  52. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  53. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
  54. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
opik_optimizer/evolutionary_optimizer/crossover_ops.py
@@ -0,0 +1,194 @@
+ from typing import Any, TYPE_CHECKING
+
+ import logging
+ import random
+ import json
+
+ from deap import creator as _creator
+
+ from . import prompts as evo_prompts
+ from . import reporting
+ from .. import utils
+
+
+ logger = logging.getLogger(__name__)
+ creator = _creator  # backward compat.
+
+
+ class CrossoverOps:
+     if TYPE_CHECKING:
+         verbose: int
+         output_style_guidance: str
+         _call_model: Any
+
+     def _deap_crossover_chunking_strategy(
+         self, messages_1_str: str, messages_2_str: str
+     ) -> tuple[str, str]:
+         chunks1 = [
+             chunk.strip() for chunk in messages_1_str.split(".") if chunk.strip()
+         ]
+         chunks2 = [
+             chunk.strip() for chunk in messages_2_str.split(".") if chunk.strip()
+         ]
+
+         if len(chunks1) >= 2 and len(chunks2) >= 2:
+             min_num_chunks = min(len(chunks1), len(chunks2))
+             point = random.randint(1, min_num_chunks - 1)
+             child1_chunks = chunks1[:point] + chunks2[point:]
+             child2_chunks = chunks2[:point] + chunks1[point:]
+             child1_str = ". ".join(child1_chunks) + ("." if child1_chunks else "")
+             child2_str = ". ".join(child2_chunks) + ("." if child2_chunks else "")
+             return child1_str, child2_str
+         else:
+             raise ValueError(
+                 "Not enough chunks in either prompt for chunk-level crossover"
+             )
+
+     def _deap_crossover_word_level(
+         self, messages_1_str: str, messages_2_str: str
+     ) -> tuple[str, str]:
+         words1 = messages_1_str.split()
+         words2 = messages_2_str.split()
+         if not words1 or not words2:
+             return messages_1_str, messages_2_str
+         min_word_len = min(len(words1), len(words2))
+         if min_word_len < 2:
+             return messages_1_str, messages_2_str
+         point = random.randint(1, min_word_len - 1)
+         child1_words = words1[:point] + words2[point:]
+         child2_words = words2[:point] + words1[point:]
+         return " ".join(child1_words), " ".join(child2_words)
+
+     def _deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
+         """Crossover operation that preserves semantic meaning.
+         Attempts chunk-level crossover first, then falls back to word-level.
+         """
+         reporting.display_message(
+             " Recombining prompts by mixing and matching words and sentences.",
+             verbose=self.verbose,
+         )
+         messages_1_orig: list[dict[str, str]] = ind1
+         messages_2_orig: list[dict[str, str]] = ind2
+
+         for i, message_1 in enumerate(messages_1_orig):
+             role: str = message_1["role"]
+             message_1_str: str = message_1["content"]
+             if (len(messages_2_orig) >= i + 1) and (messages_2_orig[i]["role"] == role):
+                 message_2 = messages_2_orig[i]
+                 message_2_str: str = message_2["content"]
+                 try:
+                     child1_str, child2_str = self._deap_crossover_chunking_strategy(
+                         message_1_str, message_2_str
+                     )
+                 except ValueError:
+                     child1_str, child2_str = self._deap_crossover_word_level(
+                         message_1_str, message_2_str
+                     )
+                 messages_1_orig[i]["content"] = child1_str
+                 messages_2_orig[i]["content"] = child2_str
+             else:
+                 pass
+
+         return creator.Individual(messages_1_orig), creator.Individual(messages_2_orig)
+
+     def _llm_deap_crossover(self, ind1: Any, ind2: Any) -> tuple[Any, Any]:
+         """Perform crossover by asking an LLM to blend two parent prompts."""
+         reporting.display_message(
+             " Recombining prompts using an LLM.", verbose=self.verbose
+         )
+
+         parent1_messages: list[dict[str, str]] = ind1
+         parent2_messages: list[dict[str, str]] = ind2
+         current_output_style_guidance = self.output_style_guidance
+
+         user_prompt_for_llm_crossover = evo_prompts.llm_crossover_user_prompt(
+             parent1_messages, parent2_messages, current_output_style_guidance
+         )
+         try:
+             logger.debug(
+                 f"Attempting LLM-driven crossover between: '{parent1_messages[:50]}...' and '{parent2_messages[:50]}...' aiming for style: '{current_output_style_guidance[:30]}...'"
+             )
+             response_content = self._call_model(
+                 messages=[
+                     {
+                         "role": "system",
+                         "content": evo_prompts.llm_crossover_system_prompt(
+                             current_output_style_guidance
+                         ),
+                     },
+                     {"role": "user", "content": user_prompt_for_llm_crossover},
+                 ],
+                 is_reasoning=True,
+             )
+             logger.debug(f"Raw LLM response for crossover: {response_content}")
+
+             # First, try strict JSON parsing
+             json_response = None
+             try:
+                 json_response = utils.json_to_dict(response_content)
+             except Exception:
+                 # Continue with heuristic extraction below
+                 json_response = None
+             children: list[list[dict[str, str]]] = []
+             if isinstance(json_response, list):
+                 children = [c for c in json_response if isinstance(c, list)]
+
+             # If strict parse failed to yield children, try extracting arrays heuristically
+             if not children:
+                 extracted = self._extract_json_arrays(response_content)
+                 for arr in extracted:
+                     try:
+                         parsed = json.loads(arr)
+                         if isinstance(parsed, list) and all(
+                             isinstance(m, dict) and {"role", "content"} <= set(m.keys())
+                             for m in parsed
+                         ):
+                             children.append(parsed)
+                     except Exception:
+                         continue
+
+             if len(children) == 0:
+                 raise ValueError("LLM response did not include any valid child prompts")
+
+             # We only need two children; if only one returned, duplicate pattern from DEAP
+             first_child = children[0]
+             second_child = children[1] if len(children) > 1 else children[0]
+             return creator.Individual(first_child), creator.Individual(second_child)
+         except Exception as e:
+             logger.warning(
+                 f"LLM-driven crossover failed: {e}. Falling back to DEAP crossover."
+             )
+             return self._deap_crossover(ind1, ind2)
+
+     def _extract_json_arrays(self, text: str) -> list[str]:
+         """Extract top-level JSON array substrings from arbitrary text.
+         This helps when models return multiple arrays like `[...],\n[...]`.
+         """
+         arrays: list[str] = []
+         depth = 0
+         start: int | None = None
+         in_str = False
+         escape = False
+         for i, ch in enumerate(text):
+             if escape:
+                 # current char is escaped; skip special handling
+                 escape = False
+                 continue
+             if ch == "\\":
+                 escape = True
+                 continue
+             if ch == '"':
+                 in_str = not in_str
+                 continue
+             if in_str:
+                 continue
+             if ch == "[":
+                 if depth == 0:
+                     start = i
+                 depth += 1
+             elif ch == "]" and depth > 0:
+                 depth -= 1
+                 if depth == 0 and start is not None:
+                     arrays.append(text[start : i + 1])
+                     start = None
+         return arrays
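
For orientation: the chunk-level strategy above splits each message on sentence boundaries ("."), picks a random cut point, and swaps the tails of the two parents; _deap_crossover falls back to the word-level splice when a message has fewer than two sentences. Below is a minimal, self-contained sketch of the sentence-level splice (an illustrative extraction, not package code; the class wiring, DEAP individuals, and reporting calls are omitted):

    import random

    def sentence_crossover(parent_a: str, parent_b: str) -> tuple[str, str]:
        # Split on sentence boundaries, as _deap_crossover_chunking_strategy does.
        chunks_a = [c.strip() for c in parent_a.split(".") if c.strip()]
        chunks_b = [c.strip() for c in parent_b.split(".") if c.strip()]
        if len(chunks_a) < 2 or len(chunks_b) < 2:
            raise ValueError("Not enough chunks in either prompt for chunk-level crossover")
        # Cut both parents at the same random point and swap the tails.
        point = random.randint(1, min(len(chunks_a), len(chunks_b)) - 1)
        child_a = ". ".join(chunks_a[:point] + chunks_b[point:]) + "."
        child_b = ". ".join(chunks_b[:point] + chunks_a[point:]) + "."
        return child_a, child_b

    child_a, child_b = sentence_crossover(
        "Answer concisely. Cite sources. Use bullet points.",
        "Reply formally. Give one example. Keep it short.",
    )
    # With a cut point of 1, child_a would read:
    # "Answer concisely. Give one example. Keep it short."
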
opik_optimizer/evolutionary_optimizer/evaluation_ops.py
@@ -0,0 +1,73 @@
+ from typing import Any, TYPE_CHECKING
+ from collections.abc import Callable
+
+
+ from .. import task_evaluator
+ from ..optimization_config import mappers, chat_prompt
+ import opik
+
+
+ class EvaluationOps:
+     if TYPE_CHECKING:
+         agent_class: type[Any]
+         num_threads: int
+
+     def _evaluate_prompt(
+         self,
+         prompt: chat_prompt.ChatPrompt,
+         messages: list[dict[str, str]],
+         dataset: opik.Dataset,
+         metric: Callable,
+         n_samples: int | None = None,
+         dataset_item_ids: list[str] | None = None,
+         experiment_config: dict | None = None,
+         optimization_id: str | None = None,
+         verbose: int = 0,
+         **kwargs: Any,
+     ) -> float:
+         """Evaluate a single prompt (individual) against the dataset and return the score."""
+         total_items = len(dataset.get_items())
+
+         new_prompt = prompt.copy()
+         new_prompt.set_messages(messages)
+
+         experiment_config = experiment_config or {}
+         experiment_config["project_name"] = self.agent_class.project_name
+         experiment_config = {
+             **experiment_config,
+             "optimizer": self.__class__.__name__,
+             "agent_class": self.agent_class.__name__,
+             "agent_config": new_prompt.to_dict(),
+             "metric": metric.__name__,
+             "dataset": dataset.name,
+             "configuration": {
+                 "prompt": new_prompt.get_messages(),
+                 "n_samples_for_eval": (
+                     len(dataset_item_ids) if dataset_item_ids is not None else n_samples
+                 ),
+                 "total_dataset_items": total_items,
+             },
+         }
+         try:
+             agent = self.agent_class(new_prompt)
+         except Exception:
+             return 0.0
+
+         def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
+             messages = new_prompt.get_messages(dataset_item)
+             model_output = agent.invoke(messages)
+             return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
+
+         score = task_evaluator.evaluate(
+             dataset=dataset,
+             dataset_item_ids=dataset_item_ids,
+             metric=metric,
+             evaluated_task=llm_task,
+             num_threads=self.num_threads,
+             project_name=experiment_config["project_name"],
+             n_samples=n_samples if dataset_item_ids is None else None,
+             experiment_config=experiment_config,
+             optimization_id=optimization_id,
+             verbose=verbose,
+         )
+         return score
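
The mixin above wires a candidate prompt into the evaluator: for each dataset item it renders the candidate messages, invokes the agent, and hands the output to task_evaluator.evaluate, which applies the metric across items (threading and experiment metadata are handled there). A rough, self-contained sketch of that per-item wiring, with a stub agent and an exact-match metric standing in for the real agent_class and Opik metric (all names below are illustrative):

    from typing import Any

    class StubAgent:
        # Stand-in for agent_class; the real agent calls an LLM.
        def invoke(self, messages: list[dict[str, str]]) -> str:
            return messages[-1]["content"].upper()

    def exact_match(dataset_item: dict[str, Any], llm_output: str) -> float:
        # Illustrative metric: 1.0 if the output equals the expected answer.
        return 1.0 if llm_output == dataset_item["answer"] else 0.0

    agent = StubAgent()

    def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
        # Mirrors the inner llm_task closure: render messages, invoke, wrap the output.
        messages = [{"role": "user", "content": dataset_item["question"]}]
        return {"llm_output": agent.invoke(messages)}

    items = [
        {"question": "ping", "answer": "PING"},
        {"question": "pong", "answer": "pong"},
    ]
    scores = [exact_match(item, llm_task(item)["llm_output"]) for item in items]
    print(sum(scores) / len(scores))  # 0.5, the mean metric over dataset items
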