opik-optimizer 1.0.6__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. opik_optimizer/__init__.py +2 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +28 -11
  4. opik_optimizer/colbert.py +236 -0
  5. opik_optimizer/data/context7_eval.jsonl +3 -0
  6. opik_optimizer/datasets/context7_eval.py +90 -0
  7. opik_optimizer/datasets/tiny_test.py +33 -34
  8. opik_optimizer/datasets/truthful_qa.py +2 -2
  9. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  10. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
  11. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
  12. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  13. opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +152 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +20 -20
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +16 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +21 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/{utils.py → utils/core.py} +111 -26
  46. opik_optimizer/utils/dataset_utils.py +49 -0
  47. opik_optimizer/utils/prompt_segments.py +186 -0
  48. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
  49. opik_optimizer-1.1.0.dist-info/RECORD +73 -0
  50. opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
  51. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  52. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  53. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
  54. {opik_optimizer-1.0.6.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
opik_optimizer/gepa_optimizer/gepa_optimizer.py (new file)
@@ -0,0 +1,556 @@
from __future__ import annotations

import logging
from contextlib import nullcontext
from typing import Any, ContextManager
from collections.abc import Callable

import opik
from opik import Dataset
from opik.evaluation.metrics.score_result import ScoreResult

from ..base_optimizer import BaseOptimizer
from ..optimization_config import chat_prompt, mappers
from ..optimization_result import OptimizationResult
from ..utils import optimization_context, create_litellm_agent_class
from ..logging_config import setup_logging as _setup_logging
from .. import task_evaluator
from . import reporting as gepa_reporting
from .adapter import OpikDataInst, OpikGEPAAdapter


_setup_logging()
LOGGER = logging.getLogger("opik_optimizer.gepa.optimizer")


class GepaOptimizer(BaseOptimizer):
    """Minimal integration against the upstream GEPA engine."""

    def __init__(
        self,
        model: str,
        project_name: str | None = None,
        reflection_model: str | None = None,
        verbose: int = 1,
        **model_kwargs: Any,
    ) -> None:
        super().__init__(model=model, verbose=verbose, **model_kwargs)
        self.project_name = project_name
        self.reflection_model = reflection_model or model
        self.num_threads = self.model_kwargs.pop("num_threads", 6)
        self.seed = self.model_kwargs.pop("seed", 42)
        self._gepa_live_metric_calls = 0

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _build_data_insts(
        self,
        dataset_items: list[dict[str, Any]],
        input_key: str,
        output_key: str,
    ) -> list[OpikDataInst]:
        data_insts: list[OpikDataInst] = []
        for item in dataset_items:
            additional_context: dict[str, str] = {}
            metadata = item.get("metadata") or {}
            if isinstance(metadata, dict):
                context_value = metadata.get("context")
                if isinstance(context_value, str):
                    additional_context["context"] = context_value
            if "context" in item and isinstance(item["context"], str):
                additional_context.setdefault("context", item["context"])

            data_insts.append(
                OpikDataInst(
                    input_text=str(item.get(input_key, "")),
                    answer=str(item.get(output_key, "")),
                    additional_context=additional_context,
                    opik_item=item,
                )
            )
        return data_insts

    def _apply_system_text(
        self, prompt_obj: chat_prompt.ChatPrompt, system_text: str
    ) -> chat_prompt.ChatPrompt:
        updated = prompt_obj.copy()
        if updated.messages is not None:
            messages = updated.get_messages()
            if messages and messages[0].get("role") == "system":
                messages[0]["content"] = system_text
            else:
                messages.insert(0, {"role": "system", "content": system_text})
            updated.set_messages(messages)
        else:
            updated.system = system_text
        return updated

    def _infer_dataset_keys(self, dataset: Dataset) -> tuple[str, str]:
        items = dataset.get_items(1)
        if not items:
            return "text", "label"
        sample = items[0]
        output_candidates = ["label", "answer", "output", "expected_output"]
        output_key = next((k for k in output_candidates if k in sample), "label")
        excluded = {output_key, "id", "metadata"}
        input_key = next((k for k in sample.keys() if k not in excluded), "text")
        return input_key, output_key

    # ------------------------------------------------------------------
    # Base optimizer overrides
    # ------------------------------------------------------------------

    def optimize_prompt(
        self,
        prompt: chat_prompt.ChatPrompt,
        dataset: str | Dataset,
        metric: Callable[[dict[str, Any], str], ScoreResult],
        experiment_config: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> OptimizationResult:
        if isinstance(dataset, str):
            client = opik.Opik(project_name=self.project_name)
            dataset = client.get_dataset(dataset)

        max_metric_calls: int = int(kwargs.get("max_metric_calls", 30))
        reflection_minibatch_size: int = int(kwargs.get("reflection_minibatch_size", 3))
        candidate_selection_strategy: str = str(
            kwargs.get("candidate_selection_strategy", "pareto")
        )
        n_samples: int | None = kwargs.get("n_samples")

        prompt = prompt.copy()
        if self.project_name:
            prompt.project_name = self.project_name
        if prompt.model is None:
            prompt.model = self.model
        if not prompt.model_kwargs:
            prompt.model_kwargs = dict(self.model_kwargs)

        seed_prompt_text = self._extract_system_text(prompt)
        input_key, output_key = self._infer_dataset_keys(dataset)

        items = dataset.get_items()
        if n_samples and 0 < n_samples < len(items):
            items = items[:n_samples]

        data_insts = self._build_data_insts(items, input_key, output_key)

        self._gepa_live_metric_calls = 0

        base_prompt = prompt.copy()

        opt_id: str | None = None
        ds_id: str | None = getattr(dataset, "id", None)

        opik_client = opik.Opik(project_name=self.project_name)

        with optimization_context(
            client=opik_client,
            dataset_name=dataset.name,
            objective_name=metric.__name__,
            metadata={"optimizer": self.__class__.__name__},
        ) as optimization:
            try:
                opt_id = optimization.id if optimization is not None else None
            except Exception:
                opt_id = None

            gepa_reporting.display_header(
                algorithm="GEPA",
                optimization_id=opt_id,
                dataset_id=getattr(dataset, "id", None),
                verbose=self.verbose,
            )

            from ..reporting_utils import display_configuration as _display_config

            _display_config(
                messages=prompt.get_messages(),
                optimizer_config={
                    "optimizer": "GEPA",
                    "model": self.model,
                    "reflection_model": self.reflection_model,
                    "max_metric_calls": max_metric_calls,
                    "reflection_minibatch_size": reflection_minibatch_size,
                    "candidate_selection_strategy": candidate_selection_strategy,
                    "n_samples": n_samples or "all",
                },
                verbose=self.verbose,
            )

            # Baseline evaluation
            initial_prompt_messages = prompt.get_messages()
            initial_score = 0.0
            with gepa_reporting.baseline_evaluation(verbose=self.verbose) as baseline:
                try:
                    baseline_suppress: ContextManager[Any] = nullcontext()
                    try:
                        from ..reporting_utils import (
                            suppress_opik_logs as _suppress_logs,
                        )

                        baseline_suppress = _suppress_logs()
                    except Exception:
                        pass
                    eval_kwargs = dict(
                        prompt=prompt,
                        dataset=dataset,
                        metric=metric,
                        n_samples=n_samples,
                        optimization_id=opt_id,
                        extra_metadata={"phase": "baseline"},
                        verbose=0,
                    )
                    with baseline_suppress:
                        initial_score = float(
                            self._evaluate_prompt_logged(**eval_kwargs)
                        )
                    baseline.set_score(initial_score)
                except Exception:
                    LOGGER.exception("Baseline evaluation failed")

            adapter_prompt = self._apply_system_text(base_prompt, seed_prompt_text)
            adapter_prompt.project_name = self.project_name
            adapter_prompt.model = self.model
            # Filter out GEPA-specific parameters that shouldn't be passed to LLM
            filtered_model_kwargs = {
                k: v
                for k, v in self.model_kwargs.items()
                if k not in ["num_prompts_per_round", "rounds"]
            }
            adapter_prompt.model_kwargs = filtered_model_kwargs

            adapter = OpikGEPAAdapter(
                base_prompt=adapter_prompt,
                optimizer=self,
                metric=metric,
                system_fallback=seed_prompt_text,
            )

            try:
                import gepa
                import inspect
            except Exception as exc:  # pragma: no cover
                raise ImportError("gepa package is required for GepaOptimizer") from exc

            kwargs_gepa: dict[str, Any] = {
                "seed_candidate": {"system_prompt": seed_prompt_text},
                "trainset": data_insts,
                "valset": data_insts,
                "adapter": adapter,
                "task_lm": None,
                "reflection_lm": self.reflection_model,
                "candidate_selection_strategy": candidate_selection_strategy,
                "reflection_minibatch_size": reflection_minibatch_size,
                "max_metric_calls": max_metric_calls,
                "display_progress_bar": False,
                "track_best_outputs": False,
                "logger": gepa_reporting.RichGEPAOptimizerLogger(
                    self, verbose=self.verbose
                ),
            }

            optimize_sig = None
            try:
                optimize_sig = inspect.signature(gepa.optimize)
            except Exception:
                optimize_sig = None

            if optimize_sig and "stop_callbacks" not in optimize_sig.parameters:
                kwargs_gepa["max_metric_calls"] = max_metric_calls

            with gepa_reporting.start_gepa_optimization(verbose=self.verbose):
                gepa_result = gepa.optimize(**kwargs_gepa)

            try:
                opt_id = optimization.id if optimization is not None else None
            except Exception:
                opt_id = None

        # ------------------------------------------------------------------
        # Rescoring & result assembly
        # ------------------------------------------------------------------

        candidates: list[dict[str, str]] = getattr(gepa_result, "candidates", []) or []
        val_scores: list[float] = list(getattr(gepa_result, "val_aggregate_scores", []))

        rescored: list[float] = []
        candidate_rows: list[dict[str, Any]] = []
        history: list[dict[str, Any]] = []

        for idx, candidate in enumerate(candidates):
            candidate_prompt = self._extract_system_text_from_candidate(
                candidate, seed_prompt_text
            )
            prompt_variant = self._apply_system_text(prompt, candidate_prompt)
            prompt_variant.project_name = self.project_name
            prompt_variant.model = self.model
            # Filter out GEPA-specific parameters that shouldn't be passed to LLM
            filtered_model_kwargs = {
                k: v
                for k, v in self.model_kwargs.items()
                if k not in ["num_prompts_per_round", "rounds"]
            }
            prompt_variant.model_kwargs = filtered_model_kwargs

            eval_kwargs = dict(
                prompt=prompt_variant,
                dataset=dataset,
                metric=metric,
                n_samples=n_samples,
                optimization_id=opt_id,
                extra_metadata={"phase": "rescoring", "candidate_index": idx},
                verbose=0,
            )
            try:
                score = float(self._evaluate_prompt_logged(**eval_kwargs))
            except Exception:
                LOGGER.debug("Rescoring failed for candidate %s", idx, exc_info=True)
                score = 0.0

            rescored.append(score)
            candidate_rows.append(
                {
                    "iteration": idx + 1,
                    "system_prompt": candidate_prompt,
                    "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
                    "opik_score": score,
                    "source": "GEPA",
                }
            )
            history.append(
                {
                    "iteration": idx + 1,
                    "prompt_candidate": candidate_prompt,
                    "scores": [
                        {
                            "metric_name": f"GEPA-{metric.__name__}",
                            "score": val_scores[idx] if idx < len(val_scores) else None,
                        },
                        {"metric_name": metric.__name__, "score": score},
                    ],
                    "metadata": {},
                }
            )

        if rescored:
            best_idx = max(range(len(rescored)), key=lambda i: rescored[i])
            best_score = rescored[best_idx]
        else:
            best_idx = getattr(gepa_result, "best_idx", 0) or 0
            best_score = float(val_scores[best_idx]) if val_scores else 0.0

        best_candidate = (
            candidates[best_idx] if candidates else {"system_prompt": seed_prompt_text}
        )
        best_prompt_text = self._extract_system_text_from_candidate(
            best_candidate, seed_prompt_text
        )

        final_prompt = self._apply_system_text(prompt, best_prompt_text)
        final_prompt.project_name = self.project_name
        final_prompt.model = self.model
        # Filter out GEPA-specific parameters that shouldn't be passed to LLM
        filtered_model_kwargs = {
            k: v
            for k, v in self.model_kwargs.items()
            if k not in ["num_prompts_per_round", "rounds"]
        }
        final_prompt.model_kwargs = filtered_model_kwargs

        final_eval_kwargs = dict(
            prompt=final_prompt,
            dataset=dataset,
            metric=metric,
            n_samples=n_samples,
            optimization_id=opt_id,
            extra_metadata={"phase": "final", "selected": True},
            verbose=0,
        )
        suppress_logs: ContextManager[Any] = nullcontext()
        try:
            from ..reporting_utils import suppress_opik_logs as _suppress_logs

            suppress_logs = _suppress_logs()
        except Exception:
            pass

        with suppress_logs:
            try:
                self._evaluate_prompt_logged(**final_eval_kwargs)
            except Exception:
                LOGGER.debug("Final evaluation failed", exc_info=True)

        per_item_scores: list[dict[str, Any]] = []
        try:
            analysis_prompt = final_prompt.copy()
            agent_cls = create_litellm_agent_class(analysis_prompt)
            agent = agent_cls(analysis_prompt)
            for item in items:
                messages = analysis_prompt.get_messages(item)
                output_text = agent.invoke(messages).strip()
                metric_result = metric(item, output_text)
                if hasattr(metric_result, "value"):
                    score_val = float(metric_result.value)
                elif hasattr(metric_result, "score"):
                    score_val = float(metric_result.score)
                else:
                    score_val = float(metric_result)
                per_item_scores.append(
                    {
                        "dataset_item_id": item.get("id"),
                        "score": score_val,
                        "answer": item.get(output_key),
                        "output": output_text,
                    }
                )
        except Exception:
            LOGGER.debug("Per-item diagnostics failed", exc_info=True)

        details: dict[str, Any] = {
            "model": self.model,
            "temperature": self.model_kwargs.get("temperature"),
            "optimizer": self.__class__.__name__,
            "num_candidates": getattr(gepa_result, "num_candidates", None),
            "total_metric_calls": getattr(gepa_result, "total_metric_calls", None),
            "parents": getattr(gepa_result, "parents", None),
            "val_scores": val_scores,
            "opik_rescored_scores": rescored,
            "candidate_summary": candidate_rows,
            "best_candidate_iteration": candidate_rows[best_idx]["iteration"]
            if candidate_rows
            else 0,
            "selected_candidate_index": best_idx,
            "selected_candidate_gepa_score": val_scores[best_idx]
            if best_idx < len(val_scores)
            else None,
            "selected_candidate_opik_score": best_score,
            "gepa_live_metric_used": True,
            "gepa_live_metric_call_count": self._gepa_live_metric_calls,
            "selected_candidate_item_scores": per_item_scores,
            "dataset_item_ids": [item.get("id") for item in items],
        }
        if experiment_config:
            details["experiment"] = experiment_config

        final_messages = final_prompt.get_messages()

        if self.verbose >= 1:
            gepa_reporting.display_candidate_scores(
                candidate_rows, verbose=self.verbose
            )
            gepa_reporting.display_selected_candidate(
                best_prompt_text, best_score, verbose=self.verbose
            )

        if LOGGER.isEnabledFor(logging.DEBUG):
            for idx, row in enumerate(candidate_rows):
                LOGGER.debug(
                    "candidate=%s source=%s gepa=%s opik=%s",
                    idx,
                    row.get("source"),
                    row.get("gepa_score"),
                    row.get("opik_score"),
                )
            LOGGER.debug(
                "selected candidate idx=%s gepa=%s opik=%.4f",
                best_idx,
                details.get("selected_candidate_gepa_score"),
                best_score,
            )

        return OptimizationResult(
            optimizer=self.__class__.__name__,
            prompt=final_messages,
            score=best_score,
            metric_name=metric.__name__,
            optimization_id=opt_id,
            dataset_id=ds_id,
            initial_prompt=initial_prompt_messages,
            initial_score=initial_score,
            details=details,
            history=history,
            llm_calls=None,
        )

    # ------------------------------------------------------------------
    # Helpers used by BaseOptimizer.evaluate_prompt
    # ------------------------------------------------------------------

    def _extract_system_text(self, prompt: chat_prompt.ChatPrompt) -> str:
        messages = prompt.get_messages()
        for message in messages:
            if message.get("role") == "system":
                return str(message.get("content", "")).strip()
        for message in messages:
            if message.get("role") == "user":
                return f"You are a helpful assistant. Respond to: {message.get('content', '')}"
        return "You are a helpful assistant."

    def _extract_system_text_from_candidate(
        self, candidate: dict[str, Any], fallback: str
    ) -> str:
        for key in ("system_prompt", "system", "prompt"):
            value = candidate.get(key)
            if isinstance(value, str) and value.strip():
                return value
        return fallback

    def _evaluate_prompt_logged(
        self,
        prompt: chat_prompt.ChatPrompt,
        dataset: Dataset,
        metric: Callable[[dict[str, Any], str], ScoreResult],
        n_samples: int | None = None,
        dataset_item_ids: list[str] | None = None,
        experiment_config: dict[str, Any] | None = None,
        optimization_id: str | None = None,
        extra_metadata: dict[str, Any] | None = None,
        verbose: int = 1,
    ) -> float:
        if prompt.model is None:
            prompt.model = self.model
        if prompt.model_kwargs is None:
            prompt.model_kwargs = self.model_kwargs

        agent_class = create_litellm_agent_class(prompt)
        agent = agent_class(prompt)

        def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
            messages = prompt.get_messages(dataset_item)
            raw = agent.invoke(messages)
            return {mappers.EVALUATED_LLM_TASK_OUTPUT: raw.strip()}

        experiment_config = experiment_config or {}
        experiment_config["project_name"] = agent_class.__name__
        experiment_config = {
            **experiment_config,
            **{
                "optimizer": self.__class__.__name__,
                "agent_class": agent_class.__name__,
                "agent_config": prompt.to_dict(),
                "metric": metric.__name__,
                "dataset": dataset.name,
                "configuration": {
                    "prompt": prompt.get_messages(),
                    "gepa": (extra_metadata or {}),
                },
            },
        }

        score = task_evaluator.evaluate(
            dataset=dataset,
            dataset_item_ids=dataset_item_ids,
            metric=metric,
            evaluated_task=llm_task,
            num_threads=self.num_threads,
            project_name=agent_class.project_name,
            experiment_config=experiment_config,
            optimization_id=optimization_id,
            n_samples=n_samples,
            verbose=verbose,
        )
        return score
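
For orientation, a run of the new optimizer might be wired up as in the sketch below. This is not part of the released package: the import paths, dataset name, metric, field names ("text", "label"), and model ids are illustrative assumptions; only the constructor and optimize_prompt keyword arguments mirror the source shown above.

# Hypothetical usage sketch of GepaOptimizer (assumptions noted inline).
from typing import Any

from opik.evaluation.metrics.score_result import ScoreResult
from opik_optimizer import ChatPrompt  # assumed top-level export
from opik_optimizer.gepa_optimizer import GepaOptimizer


def exact_match(dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
    # Signature expected by optimize_prompt: (dataset_item, output) -> ScoreResult.
    expected = str(dataset_item.get("label", ""))
    return ScoreResult(name="exact_match", value=float(llm_output.strip() == expected))


prompt = ChatPrompt(
    messages=[
        {"role": "system", "content": "You are a concise classifier."},
        {"role": "user", "content": "{text}"},  # assumed placeholder filled per dataset item
    ]
)

optimizer = GepaOptimizer(
    model="openai/gpt-4o-mini",        # task model
    reflection_model="openai/gpt-4o",  # falls back to `model` when omitted
    project_name="gepa-demo",
)

# `dataset` may be an opik Dataset object or a dataset name resolved via opik.Opik.
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset="my-labelled-dataset",
    metric=exact_match,
    max_metric_calls=30,
    reflection_minibatch_size=3,
    candidate_selection_strategy="pareto",
    n_samples=50,
)
print(result.score)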