opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. opik_optimizer/__init__.py +4 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +402 -28
  4. opik_optimizer/data/context7_eval.jsonl +3 -0
  5. opik_optimizer/datasets/context7_eval.py +90 -0
  6. opik_optimizer/datasets/tiny_test.py +33 -34
  7. opik_optimizer/datasets/truthful_qa.py +2 -2
  8. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  9. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
  10. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
  11. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  12. opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
  13. opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +154 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +30 -23
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +21 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +22 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/utils/colbert.py +236 -0
  46. opik_optimizer/{utils.py → utils/core.py} +160 -33
  47. opik_optimizer/utils/dataset_utils.py +49 -0
  48. opik_optimizer/utils/prompt_segments.py +186 -0
  49. opik_optimizer-2.0.0.dist-info/METADATA +345 -0
  50. opik_optimizer-2.0.0.dist-info/RECORD +74 -0
  51. opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
  52. opik_optimizer-1.0.6.dist-info/METADATA +0 -181
  53. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  54. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  55. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
  56. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
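
The most visible API change in this release, judging from the hunks below, is that per-prompt configuration moves onto ChatPrompt: passing project_name to an optimizer constructor now emits a DeprecationWarning, and optimize_prompt takes extras such as n_trials through **kwargs. A minimal sketch of the 2.0.0 calling convention, assuming ChatPrompt and FewShotBayesianOptimizer are exported from the package root and that ChatPrompt accepts a project_name keyword; model, dataset, and parameter names not shown in the diff are illustrative only:

import opik
from opik_optimizer import ChatPrompt, FewShotBayesianOptimizer

def exact_match(dataset_item, llm_output):
    # Metric signature per the diff: takes `dataset_item` and `llm_output`.
    return float(dataset_item["answer"].strip() == llm_output.strip())

prompt = ChatPrompt(
    project_name="my-project",   # 2.0.0: set on the prompt, not the optimizer constructor
    system="Answer the question concisely.",
    user="{question}",
)

optimizer = FewShotBayesianOptimizer(
    model="openai/gpt-4o-mini",  # illustrative model name
    min_examples=2,              # attributes surfaced by get_optimizer_metadata() below
    max_examples=6,
)

dataset = opik.Opik().get_dataset("my-dataset")  # any opik.Dataset works here

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=exact_match,
    n_samples=50,
    n_trials=10,                 # forwarded via **kwargs for backward compatibility
)
print(result)
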
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
@@ -1,5 +1,8 @@
- from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+ from typing import Any
+ from collections.abc import Callable
+ import warnings
 
+ import copy
  import json
  import logging
  import random
@@ -15,7 +18,6 @@ from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
  from pydantic import BaseModel
 
  from opik_optimizer import base_optimizer
- from ..utils import create_litellm_agent_class
  from ..optimization_config import chat_prompt, mappers
  from ..optimizable_agent import OptimizableAgent
  from .. import _throttle, optimization_result, task_evaluator, utils
@@ -56,7 +58,7 @@ Respond only with the JSON object. Do not include any explanation or extra text.
 
 
  class FewShotPromptTemplate(BaseModel):
- message_list_with_placeholder: List[Dict[str, str]]
+ message_list_with_placeholder: list[dict[str, str]]
  example_template: str
 
 
@@ -94,8 +96,11 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  **model_kwargs: Additional model parameters
  """
  if "project_name" in model_kwargs:
- print(
- "Removing `project_name` from constructor; it now belongs in the ChatPrompt()"
+ warnings.warn(
+ "The 'project_name' parameter in optimizer constructor is deprecated. "
+ "Set project_name in the ChatPrompt instead.",
+ DeprecationWarning,
+ stacklevel=2,
  )
  del model_kwargs["project_name"]
 
@@ -111,18 +116,22 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  elif self.verbose == 2:
  logger.setLevel(logging.DEBUG)
 
- self._opik_client = opik.Opik()
- self.llm_call_counter = 0
  logger.debug(f"Initialized FewShotBayesianOptimizer with model: {model}")
 
+ def get_optimizer_metadata(self) -> dict[str, Any]:
+ return {
+ "min_examples": self.min_examples,
+ "max_examples": self.max_examples,
+ }
+
  @_throttle.rate_limited(_limiter)
  def _call_model(
  self,
  model: str,
- messages: List[Dict[str, str]],
+ messages: list[dict[str, str]],
  seed: int,
- model_kwargs: Dict[str, Any],
- ) -> Dict[str, Any]:
+ model_kwargs: dict[str, Any],
+ ) -> dict[str, Any]:
  """
  Args:
  model: The model to use for the call
@@ -133,7 +142,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  Returns:
  Dict containing the model's response
  """
- self.llm_call_counter += 1
+ self.increment_llm_counter()
 
  current_model_kwargs = self.model_kwargs.copy()
  current_model_kwargs.update(model_kwargs)
@@ -159,8 +168,8 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  return response
 
  def _split_dataset(
- self, dataset: List[Dict[str, Any]], train_ratio: float
- ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ self, dataset: list[dict[str, Any]], train_ratio: float
+ ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
  """
  Split the dataset into training and validation sets.
 
@@ -194,7 +203,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  self,
  model: str,
  prompt: chat_prompt.ChatPrompt,
- few_shot_examples: List[Dict[str, Any]],
+ few_shot_examples: list[dict[str, Any]],
  ) -> FewShotPromptTemplate:
  """
  Generate a few-shot prompt template that can be used to insert examples into the prompt.
@@ -215,7 +224,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  "examples": few_shot_examples,
  }
 
- messages: List[Dict[str, str]] = [
+ messages: list[dict[str, str]] = [
  {"role": "system", "content": SYSTEM_PROMPT_TEMPLATE},
  {"role": "user", "content": json.dumps(user_message)},
  ]
@@ -244,9 +253,9 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  metric: Callable,
  baseline_score: float,
  n_trials: int = 10,
- optimization_id: Optional[str] = None,
- experiment_config: Optional[Dict] = None,
- n_samples: Optional[int] = None,
+ optimization_id: str | None = None,
+ experiment_config: dict | None = None,
+ n_samples: int | None = None,
  ) -> optimization_result.OptimizationResult:
  reporting.start_optimization_run(verbose=self.verbose)
 
@@ -259,19 +268,20 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  if n_samples is not None and n_samples < len(dataset_items):
  eval_dataset_item_ids = random.sample(all_dataset_item_ids, n_samples)
 
- # Define the experiment configuration
- experiment_config = experiment_config or {}
- base_experiment_config = { # Base config for reuse
- **experiment_config,
- **{
- "optimizer": self.__class__.__name__,
- "agent_class": self.agent_class.__name__,
- "agent_config": prompt.to_dict(),
- "metric": metric.__name__,
- "dataset": dataset.name,
- "configuration": {},
- },
- }
+ configuration_updates = self._drop_none(
+ {
+ "n_trials": n_trials,
+ "n_samples": n_samples,
+ "baseline_score": baseline_score,
+ }
+ )
+ base_experiment_config = self._prepare_experiment_config(
+ prompt=prompt,
+ dataset=dataset,
+ metric=metric,
+ experiment_config=experiment_config,
+ configuration_updates=configuration_updates,
+ )
 
  # Start Optuna Study
  def optimization_objective(trial: optuna.Trial) -> float:
@@ -326,7 +336,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  ]
 
  # Log trial config
- trial_config = base_experiment_config.copy()
+ trial_config = copy.deepcopy(base_experiment_config)
  trial_config["configuration"]["prompt"] = (
  messages_for_reporting # Base instruction
  )
@@ -450,6 +460,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  best_score=best_score,
  best_prompt=best_prompt,
  verbose=self.verbose,
+ tools=getattr(prompt, "tools", None),
  )
 
  return optimization_result.OptimizationResult(
@@ -479,6 +490,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  },
  history=optuna_history_processed,
  llm_calls=self.llm_call_counter,
+ tool_calls=self.tool_call_counter,
  dataset_id=dataset.id,
  optimization_id=optimization_id,
  )
@@ -488,47 +500,39 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  prompt: chat_prompt.ChatPrompt,
  dataset: Dataset,
  metric: Callable,
- n_trials: int = 10,
- agent_class: Optional[Type[OptimizableAgent]] = None,
- experiment_config: Optional[Dict] = None,
- n_samples: Optional[int] = None,
+ experiment_config: dict | None = None,
+ n_samples: int | None = None,
+ auto_continue: bool = False,
+ agent_class: type[OptimizableAgent] | None = None,
+ **kwargs: Any,
  ) -> optimization_result.OptimizationResult:
  """
  Args:
- prompt:
+ prompt: The prompt to optimize
  dataset: Opik Dataset to optimize on
  metric: Metric function to evaluate on
- n_trials: Number of trials for Bayesian Optimization
  experiment_config: Optional configuration for the experiment, useful to log additional metadata
  n_samples: Optional number of items to test in the dataset
+ auto_continue: Whether to auto-continue optimization
+ agent_class: Optional agent class to use
+ **kwargs: Additional parameters including:
+ n_trials (int): Number of trials for Bayesian Optimization (default: 10)
+ mcp_config (MCPExecutionConfig | None): MCP tool calling configuration (default: None)
 
  Returns:
  OptimizationResult: Result of the optimization
  """
- if not isinstance(prompt, chat_prompt.ChatPrompt):
- raise ValueError("Prompt must be a ChatPrompt object")
+ # Use base class validation and setup methods
+ self.validate_optimization_inputs(prompt, dataset, metric)
+ self.configure_prompt_model(prompt)
+ self.agent_class = self.setup_agent_class(prompt, agent_class)
 
- if not isinstance(dataset, Dataset):
- raise ValueError("Dataset must be a Dataset object")
-
- if not callable(metric):
- raise ValueError(
- "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
- )
-
- if prompt.model is None:
- prompt.model = self.model
- if prompt.model_kwargs is None:
- prompt.model_kwargs = self.model_kwargs
-
- if agent_class is None:
- self.agent_class = create_litellm_agent_class(prompt)
- else:
- self.agent_class = agent_class
+ # Extract n_trials from kwargs for backward compatibility
+ n_trials = kwargs.get("n_trials", 10)
 
  optimization = None
  try:
- optimization = self._opik_client.create_optimization(
+ optimization = self.opik_client.create_optimization(
  dataset_name=dataset.name,
  objective_name=metric.__name__,
  metadata={"optimizer": self.__class__.__name__},
@@ -557,6 +561,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  "n_samples": n_samples,
  },
  verbose=self.verbose,
+ tools=getattr(prompt, "tools", None),
  )
 
  utils.disable_experiment_reporting()
@@ -614,10 +619,10 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  prompt: chat_prompt.ChatPrompt,
  dataset: opik.Dataset,
  metric: Callable,
- n_samples: Optional[int] = None,
- dataset_item_ids: Optional[List[str]] = None,
- experiment_config: Optional[Dict] = None,
- optimization_id: Optional[str] = None,
+ n_samples: int | None = None,
+ dataset_item_ids: list[str] | None = None,
+ experiment_config: dict | None = None,
+ optimization_id: str | None = None,
  **kwargs: Any,
  ) -> float:
  """
@@ -633,20 +638,6 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  """
  llm_task = self._build_task_from_messages(prompt, prompt.get_messages())
 
- experiment_config = experiment_config or {}
- experiment_config["project_name"] = self.agent_class.__name__
- experiment_config = {
- **experiment_config,
- **{
- "optimizer": self.__class__.__name__,
- "agent_class": self.agent_class.__name__,
- "agent_config": prompt.to_dict(),
- "metric": metric.__name__,
- "dataset": dataset.name,
- "configuration": {"prompt": prompt.get_messages()},
- },
- }
-
  if n_samples is not None:
  if dataset_item_ids is not None:
  raise Exception("Can't use n_samples and dataset_item_ids")
@@ -654,6 +645,24 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
  dataset_item_ids = random.sample(all_ids, n_samples)
 
+ configuration_updates = self._drop_none(
+ {
+ "n_samples": n_samples,
+ "dataset_item_ids": dataset_item_ids,
+ }
+ )
+ additional_metadata = (
+ {"optimization_id": optimization_id} if optimization_id else None
+ )
+ experiment_config = self._prepare_experiment_config(
+ prompt=prompt,
+ dataset=dataset,
+ metric=metric,
+ experiment_config=experiment_config,
+ configuration_updates=configuration_updates,
+ additional_metadata=additional_metadata,
+ )
+
  logger.debug("Starting FewShotBayesian evaluation...")
  score = task_evaluator.evaluate(
  dataset=dataset,
@@ -661,7 +670,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  metric=metric,
  evaluated_task=llm_task,
  num_threads=self.n_threads,
- project_name=self.agent_class.project_name,
+ project_name=experiment_config.get("project_name"),
  experiment_config=experiment_config,
  optimization_id=optimization_id,
  verbose=self.verbose,
@@ -673,14 +682,14 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  def _build_task_from_messages(
  self,
  prompt: chat_prompt.ChatPrompt,
- messages: List[Dict[str, str]],
- few_shot_examples: Optional[str] = None,
- ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
+ messages: list[dict[str, str]],
+ few_shot_examples: str | None = None,
+ ) -> Callable[[dict[str, Any]], dict[str, Any]]:
  new_prompt = prompt.copy()
  new_prompt.set_messages(messages)
  agent = self.agent_class(new_prompt)
 
- def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
+ def llm_task(dataset_item: dict[str, Any]) -> dict[str, Any]:
  """
  Process a single dataset item through the LLM task.
 
opik_optimizer/few_shot_bayesian_optimizer/reporting.py
@@ -1,6 +1,6 @@
  from contextlib import contextmanager
  from io import StringIO
- from typing import Any, Dict, List, Optional, TYPE_CHECKING
+ from typing import Any, Optional, TYPE_CHECKING
 
  from rich.panel import Panel
  from rich.text import Text
@@ -46,9 +46,16 @@ def display_evaluation(
  yield Reporter()
  finally:
  if verbose >= 1:
- console.print(
- Text(f"\r Baseline score was: {score:.4f}.\n", style="green")
- )
+ if score is not None:
+ console.print(
+ Text(
+ f"\r Baseline score was: {score:.4f}.\n", style="green"
+ )
+ )
+ else:
+ console.print(
+ Text("\r Baseline score was: None\n", style="red")
+ )
 
 
  @contextmanager
@@ -121,7 +128,7 @@ def start_optimization_trial(
 
  # Create a simple object with a method to set the score
  class Reporter:
- def start_trial(self, messages: List[Dict[str, str]]) -> None:
+ def start_trial(self, messages: list[dict[str, str]]) -> None:
  if verbose >= 1:
  console.print(
  Text(
opik_optimizer/gepa_optimizer/__init__.py
@@ -0,0 +1,3 @@
+ from .gepa_optimizer import GepaOptimizer
+
+ __all__ = ["GepaOptimizer"]
opik_optimizer/gepa_optimizer/adapter.py
@@ -0,0 +1,154 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any
+ from collections.abc import Callable, Iterable
+
+ import logging
+
+ from gepa.core.adapter import EvaluationBatch, GEPAAdapter
+
+ from ..optimization_config import chat_prompt
+ from ..utils import create_litellm_agent_class
+
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class OpikDataInst:
+ """Data instance handed to GEPA.
+
+ We keep the original Opik dataset item so metrics and prompt formatting can use it
+ directly without duplicated bookkeeping.
+ """
+
+ input_text: str
+ answer: str
+ additional_context: dict[str, str]
+ opik_item: dict[str, Any]
+
+
+ def _extract_system_text(candidate: dict[str, str], fallback: str) -> str:
+ for key in ("system_prompt", "system", "prompt"):
+ value = candidate.get(key)
+ if isinstance(value, str) and value.strip():
+ return value
+ return fallback
+
+
+ def _apply_system_text(
+ prompt_obj: chat_prompt.ChatPrompt, system_text: str
+ ) -> chat_prompt.ChatPrompt:
+ updated = prompt_obj.copy()
+ if updated.messages is not None:
+ messages = updated.get_messages()
+ if messages and messages[0].get("role") == "system":
+ messages[0]["content"] = system_text
+ else:
+ messages.insert(0, {"role": "system", "content": system_text})
+ updated.set_messages(messages)
+ else:
+ updated.system = system_text
+ return updated
+
+
+ class OpikGEPAAdapter(GEPAAdapter[OpikDataInst, dict[str, Any], dict[str, Any]]):
+ """Minimal GEPA adapter that routes evaluation through Opik's metric."""
+
+ def __init__(
+ self,
+ base_prompt: chat_prompt.ChatPrompt,
+ optimizer: Any,
+ metric: Callable[[dict[str, Any], str], Any],
+ system_fallback: str,
+ ) -> None:
+ self._base_prompt = base_prompt
+ self._optimizer = optimizer
+ self._metric = metric
+ self._system_fallback = system_fallback
+
+ def evaluate(
+ self,
+ batch: list[OpikDataInst],
+ candidate: dict[str, str],
+ capture_traces: bool = False,
+ ) -> EvaluationBatch[dict[str, Any], dict[str, Any]]:
+ system_text = _extract_system_text(candidate, self._system_fallback)
+ prompt_variant = _apply_system_text(self._base_prompt, system_text)
+
+ agent_class = create_litellm_agent_class(
+ prompt_variant, optimizer_ref=self._optimizer
+ )
+ agent = agent_class(prompt_variant)
+
+ outputs: list[dict[str, Any]] = []
+ scores: list[float] = []
+ trajectories: list[dict[str, Any]] | None = [] if capture_traces else None
+
+ for inst in batch:
+ dataset_item = inst.opik_item
+ messages = prompt_variant.get_messages(dataset_item)
+ raw_output = agent.invoke(messages).strip()
+
+ metric_result = self._metric(dataset_item, raw_output)
+ if hasattr(metric_result, "value"):
+ score = float(metric_result.value)
+ elif hasattr(metric_result, "score"):
+ score = float(metric_result.score)
+ else:
+ score = float(metric_result)
+
+ outputs.append({"output": raw_output})
+ scores.append(score)
+ try:
+ self._optimizer._gepa_live_metric_calls += 1
+ except Exception:
+ pass
+
+ if trajectories is not None:
+ trajectories.append(
+ {
+ "input": dataset_item,
+ "output": raw_output,
+ "score": score,
+ }
+ )
+
+ return EvaluationBatch(
+ outputs=outputs, scores=scores, trajectories=trajectories
+ )
+
+ def make_reflective_dataset(
+ self,
+ candidate: dict[str, str],
+ eval_batch: EvaluationBatch[dict[str, Any], dict[str, Any]],
+ components_to_update: list[str],
+ ) -> dict[str, list[dict[str, Any]]]:
+ components = components_to_update or ["system_prompt"]
+ trajectories = eval_batch.trajectories or []
+
+ def _records() -> Iterable[dict[str, Any]]:
+ for traj in trajectories:
+ dataset_item = traj.get("input", {})
+ output_text = traj.get("output", "")
+ score = traj.get("score", 0.0)
+ feedback = f"Observed score={score:.4f}. Expected answer: {dataset_item.get('answer', '')}"
+ yield {
+ "Inputs": {
+ "text": dataset_item.get("input")
+ or dataset_item.get("question")
+ or "",
+ },
+ "Generated Outputs": output_text,
+ "Feedback": feedback,
+ }
+
+ reflective_records = list(_records())
+ if not reflective_records:
+ logger.debug(
+ "No trajectories captured for candidate; returning empty reflective dataset"
+ )
+ reflective_records = []
+
+ return {component: reflective_records for component in components}
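
One detail of the new adapter worth calling out: the metric handed to OpikGEPAAdapter may return an object exposing .value, an object exposing .score, or a bare number, and evaluate() coerces all three to float. A small self-contained sketch of that coercion rule in isolation (FakeScore is a hypothetical stand-in, not part of the package):

from dataclasses import dataclass
from typing import Any

@dataclass
class FakeScore:
    # Hypothetical stand-in for a ScoreResult-style metric return value.
    value: float

def coerce_score(metric_result: Any) -> float:
    # Mirrors the fallback chain used in OpikGEPAAdapter.evaluate():
    # prefer .value, then .score, then plain float().
    if hasattr(metric_result, "value"):
        return float(metric_result.value)
    if hasattr(metric_result, "score"):
        return float(metric_result.score)
    return float(metric_result)

assert coerce_score(FakeScore(0.75)) == 0.75
assert coerce_score(1) == 1.0
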