opik-optimizer 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. opik_optimizer/__init__.py +2 -0
  2. opik_optimizer/base_optimizer.py +376 -19
  3. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +80 -17
  4. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +179 -39
  5. opik_optimizer/evolutionary_optimizer/llm_support.py +3 -1
  6. opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +17 -3
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +5 -0
  9. opik_optimizer/evolutionary_optimizer/prompts.py +47 -0
  10. opik_optimizer/evolutionary_optimizer/reporting.py +12 -0
  11. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +65 -59
  12. opik_optimizer/gepa_optimizer/adapter.py +5 -3
  13. opik_optimizer/gepa_optimizer/gepa_optimizer.py +163 -66
  14. opik_optimizer/mcp_utils/mcp_workflow.py +57 -3
  15. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +75 -69
  16. opik_optimizer/mipro_optimizer/_lm.py +10 -3
  17. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +1 -1
  18. opik_optimizer/mipro_optimizer/mipro_optimizer.py +96 -21
  19. opik_optimizer/optimizable_agent.py +5 -0
  20. opik_optimizer/optimization_result.py +1 -0
  21. opik_optimizer/utils/core.py +56 -14
  22. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/METADATA +96 -9
  23. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/RECORD +27 -26
  24. /opik_optimizer/{colbert.py → utils/colbert.py} +0 -0
  25. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
  26. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/licenses/LICENSE +0 -0
  27. {opik_optimizer-1.1.0.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
opik_optimizer/__init__.py
@@ -15,6 +15,7 @@ from .few_shot_bayesian_optimizer import FewShotBayesianOptimizer
  from .gepa_optimizer import GepaOptimizer
  from .logging_config import setup_logging
  from .meta_prompt_optimizer import MetaPromptOptimizer
+ from .mipro_optimizer import MiproOptimizer
  from .optimization_config.configs import TaskConfig
  from .optimization_result import OptimizationResult

@@ -31,6 +32,7 @@ __all__ = [
      "FewShotBayesianOptimizer",
      "GepaOptimizer",
      "MetaPromptOptimizer",
+     "MiproOptimizer",
      "EvolutionaryOptimizer",
      "OptimizationResult",
      "OptimizableAgent",
opik_optimizer/base_optimizer.py
@@ -1,10 +1,13 @@
- from typing import Any
+ from typing import Any, cast
  from collections.abc import Callable

+ import copy
+ import inspect
  import logging
  import time
- from abc import abstractmethod
+ from abc import ABC, abstractmethod
  import random
+ import importlib.metadata


  import litellm
@@ -17,7 +20,7 @@ from . import _throttle, optimization_result
  from .cache_config import initialize_cache
  from .optimization_config import chat_prompt, mappers
  from .optimizable_agent import OptimizableAgent
- from .utils import create_litellm_agent_class
+ from .utils import create_litellm_agent_class, optimization_context
  from . import task_evaluator

  _limiter = _throttle.get_rate_limiter_for_current_opik_installation()
@@ -29,6 +32,12 @@ litellm.drop_params = True
  logger = logging.getLogger(__name__)


+ try:
+     _OPTIMIZER_VERSION = importlib.metadata.version("opik_optimizer")
+ except importlib.metadata.PackageNotFoundError:  # pragma: no cover - dev installs
+     _OPTIMIZER_VERSION = "unknown"
+
+
  class OptimizationRound(BaseModel):
      model_config = {"arbitrary_types_allowed": True}

@@ -41,11 +50,12 @@ class OptimizationRound(BaseModel):
      improvement: float


- class BaseOptimizer:
+ class BaseOptimizer(ABC):
      def __init__(
          self,
          model: str,
          verbose: int = 1,
+         seed: int = 42,
          **model_kwargs: Any,
      ) -> None:
          """
@@ -54,19 +64,335 @@ class BaseOptimizer:
          Args:
              model: LiteLLM model name
              verbose: Controls internal logging/progress bars (0=off, 1=on).
+             seed: Random seed for reproducibility (default: 42)
              model_kwargs: additional args for model (eg, temperature)
          """
          self.model = model
          self.reasoning_model = model
          self.model_kwargs = model_kwargs
          self.verbose = verbose
+         self.seed = seed
          self._history: list[OptimizationRound] = []
          self.experiment_config = None
          self.llm_call_counter = 0
+         self.tool_call_counter = 0
+         self._opik_client = None  # Lazy initialization

          # Initialize shared cache
          initialize_cache()

+     def reset_counters(self) -> None:
+         """Reset all call counters for a new optimization run."""
+         self.llm_call_counter = 0
+         self.tool_call_counter = 0
+
+     def increment_llm_counter(self) -> None:
+         """Increment the LLM call counter."""
+         self.llm_call_counter += 1
+
+     def increment_tool_counter(self) -> None:
+         """Increment the tool call counter."""
+         self.tool_call_counter += 1
+
+     def cleanup(self) -> None:
+         """
+         Clean up resources and perform memory management.
+         Should be called when the optimizer is no longer needed.
+         """
+         # Reset counters
+         self.reset_counters()
+
+         # Clear history to free memory
+         self._history.clear()
+
+         # Clear Opik client if it exists
+         if self._opik_client is not None:
+             # Note: Opik client doesn't have explicit cleanup, but we can clear the reference
+             self._opik_client = None
+
+         logger.debug(f"Cleaned up resources for {self.__class__.__name__}")
+
+     def __del__(self) -> None:
+         """Destructor to ensure cleanup is called."""
+         try:
+             self.cleanup()
+         except Exception:
+             # Ignore exceptions during cleanup in destructor
+             pass
+
+     @property
+     def opik_client(self) -> Any:
+         """Lazy initialization of Opik client."""
+         if self._opik_client is None:
+             import opik
+
+             self._opik_client = opik.Opik()
+         return self._opik_client
+
+     def validate_optimization_inputs(
+         self, prompt: "chat_prompt.ChatPrompt", dataset: "Dataset", metric: Callable
+     ) -> None:
+         """
+         Validate common optimization inputs.
+
+         Args:
+             prompt: The chat prompt to validate
+             dataset: The dataset to validate
+             metric: The metric function to validate
+
+         Raises:
+             ValueError: If any input is invalid
+         """
+         if not isinstance(prompt, chat_prompt.ChatPrompt):
+             raise ValueError("Prompt must be a ChatPrompt object")
+
+         if not isinstance(dataset, Dataset):
+             raise ValueError("Dataset must be a Dataset object")
+
+         if not callable(metric):
+             raise ValueError(
+                 "Metric must be a function that takes `dataset_item` and `llm_output` as arguments."
+             )
+
+     def setup_agent_class(
+         self, prompt: "chat_prompt.ChatPrompt", agent_class: Any = None
+     ) -> Any:
+         """
+         Setup agent class for optimization.
+
+         Args:
+             prompt: The chat prompt
+             agent_class: Optional custom agent class
+
+         Returns:
+             The agent class to use
+         """
+         if agent_class is None:
+             return create_litellm_agent_class(prompt, optimizer_ref=self)
+         else:
+             return agent_class
+
+     def configure_prompt_model(self, prompt: "chat_prompt.ChatPrompt") -> None:
+         """
+         Configure prompt model and model_kwargs if not set.
+
+         Args:
+             prompt: The chat prompt to configure
+         """
+         # Only configure if prompt is a valid ChatPrompt object
+         if hasattr(prompt, "model") and hasattr(prompt, "model_kwargs"):
+             if prompt.model is None:
+                 prompt.model = self.model
+             if prompt.model_kwargs is None:
+                 prompt.model_kwargs = self.model_kwargs
+
+     # ------------------------------------------------------------------
+     # Experiment metadata helpers
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def _drop_none(metadata: dict[str, Any]) -> dict[str, Any]:
+         return {k: v for k, v in metadata.items() if v is not None}
+
+     @staticmethod
+     def _deep_merge_dicts(
+         base: dict[str, Any], overrides: dict[str, Any]
+     ) -> dict[str, Any]:
+         result = copy.deepcopy(base)
+         for key, value in overrides.items():
+             if (
+                 key in result
+                 and isinstance(result[key], dict)
+                 and isinstance(value, dict)
+             ):
+                 result[key] = BaseOptimizer._deep_merge_dicts(result[key], value)
+             else:
+                 result[key] = value
+         return result
+
+     @staticmethod
+     def _serialize_tools(prompt: "chat_prompt.ChatPrompt") -> list[dict[str, Any]]:
+         tools_obj = getattr(prompt, "tools", None)
+         if not isinstance(tools_obj, list):
+             return []
+
+         try:
+             return copy.deepcopy(cast(list[dict[str, Any]], tools_obj))
+         except Exception:  # pragma: no cover - defensive
+             serialized_tools: list[dict[str, Any]] = []
+             for tool in tools_obj:
+                 if isinstance(tool, dict):
+                     serialized_tools.append({k: v for k, v in tool.items() if k})
+             return serialized_tools
+
+     @staticmethod
+     def _describe_annotation(annotation: Any) -> str | None:
+         if annotation is inspect._empty:
+             return None
+         if isinstance(annotation, type):
+             return annotation.__name__
+         return str(annotation)
+
+     def _summarize_tool_signatures(
+         self, prompt: "chat_prompt.ChatPrompt"
+     ) -> list[dict[str, Any]]:
+         signatures: list[dict[str, Any]] = []
+         for name, func in getattr(prompt, "function_map", {}).items():
+             callable_obj = getattr(func, "__wrapped__", func)
+             try:
+                 sig = inspect.signature(callable_obj)
+             except (TypeError, ValueError):  # pragma: no cover - defensive
+                 signatures.append({"name": name, "signature": "unavailable"})
+                 continue
+
+             params: list[dict[str, Any]] = []
+             for parameter in sig.parameters.values():
+                 params.append(
+                     self._drop_none(
+                         {
+                             "name": parameter.name,
+                             "kind": parameter.kind.name,
+                             "annotation": self._describe_annotation(
+                                 parameter.annotation
+                             ),
+                             "default": (
+                                 None
+                                 if parameter.default is inspect._empty
+                                 else parameter.default
+                             ),
+                         }
+                     )
+                 )
+
+             signatures.append(
+                 self._drop_none(
+                     {
+                         "name": name,
+                         "parameters": params,
+                         "docstring": inspect.getdoc(callable_obj),
+                     }
+                 )
+             )
+         return signatures
+
+     def _build_agent_config(self, prompt: "chat_prompt.ChatPrompt") -> dict[str, Any]:
+         agent_config: dict[str, Any] = dict(prompt.to_dict())
+         agent_config["project_name"] = getattr(prompt, "project_name", None)
+         agent_config["model"] = getattr(prompt, "model", None) or self.model
+         agent_config["tools"] = self._serialize_tools(prompt)
+         return self._drop_none(agent_config)
+
+     def get_optimizer_metadata(self) -> dict[str, Any]:
+         """Override in subclasses to expose optimizer-specific parameters."""
+         return {}
+
+     def _build_optimizer_metadata(self) -> dict[str, Any]:
+         metadata = {
+             "name": self.__class__.__name__,
+             "version": _OPTIMIZER_VERSION,
+             "model": self.model,
+             "model_kwargs": self.model_kwargs or None,
+             "seed": getattr(self, "seed", None),
+             "num_threads": getattr(self, "num_threads", None),
+         }
+
+         # n_threads is used by some optimizers instead of num_threads
+         if metadata["num_threads"] is None and hasattr(self, "n_threads"):
+             metadata["num_threads"] = getattr(self, "n_threads")
+
+         if hasattr(self, "reasoning_model"):
+             metadata["reasoning_model"] = getattr(self, "reasoning_model")
+
+         extra_parameters = self.get_optimizer_metadata()
+         if extra_parameters:
+             metadata["parameters"] = extra_parameters
+
+         return self._drop_none(metadata)
+
+     def _prepare_experiment_config(
+         self,
+         *,
+         prompt: "chat_prompt.ChatPrompt",
+         dataset: Dataset,
+         metric: Callable,
+         experiment_config: dict[str, Any] | None = None,
+         configuration_updates: dict[str, Any] | None = None,
+         additional_metadata: dict[str, Any] | None = None,
+     ) -> dict[str, Any]:
+         dataset_id = getattr(dataset, "id", None)
+         project_name = (
+             getattr(self.agent_class, "project_name", None)
+             if hasattr(self, "agent_class")
+             else None
+         )
+         if not project_name:
+             project_name = getattr(prompt, "project_name", None)
+         if not project_name:
+             project_name = self.__class__.__name__
+
+         base_config: dict[str, Any] = {
+             "project_name": project_name,
+             "agent_class": (
+                 getattr(self.agent_class, "__name__", None)
+                 if hasattr(self, "agent_class")
+                 else None
+             ),
+             "agent_config": self._build_agent_config(prompt),
+             "metric": getattr(metric, "__name__", str(metric)),
+             "dataset": getattr(dataset, "name", None),
+             "dataset_id": dataset_id,
+             "optimizer_metadata": self._build_optimizer_metadata(),
+             "tool_signatures": self._summarize_tool_signatures(prompt),
+             "configuration": {
+                 "prompt": prompt.get_messages(),
+                 "prompt_name": getattr(prompt, "name", None),
+                 "tools": self._serialize_tools(prompt),
+                 "prompt_project_name": getattr(prompt, "project_name", None),
+             },
+         }
+
+         if configuration_updates:
+             base_config["configuration"] = self._deep_merge_dicts(
+                 base_config["configuration"], configuration_updates
+             )
+
+         if additional_metadata:
+             base_config = self._deep_merge_dicts(base_config, additional_metadata)
+
+         if experiment_config:
+             base_config = self._deep_merge_dicts(base_config, experiment_config)
+
+         return self._drop_none(base_config)
+
+     def create_optimization_context(
+         self, dataset: "Dataset", metric: Callable, metadata: dict | None = None
+     ) -> Any:
+         """
+         Create optimization context for tracking.
+
+         Args:
+             dataset: The dataset being optimized
+             metric: The metric function
+             metadata: Additional metadata
+
+         Returns:
+             Optimization context manager
+         """
+         context_metadata = {
+             "optimizer": self.__class__.__name__,
+             "model": self.model,
+             "seed": self.seed,
+         }
+         if metadata:
+             context_metadata.update(metadata)
+
+         return optimization_context(
+             client=self.opik_client,
+             dataset_name=dataset.name,
+             objective_name=metric.__name__,
+             metadata=context_metadata,
+         )
+
      @abstractmethod
      def optimize_prompt(
          self,
@@ -74,6 +400,9 @@ class BaseOptimizer:
          dataset: Dataset,
          metric: Callable,
          experiment_config: dict | None = None,
+         n_samples: int | None = None,
+         auto_continue: bool = False,
+         agent_class: type[OptimizableAgent] | None = None,
          **kwargs: Any,
      ) -> optimization_result.OptimizationResult:
          """
@@ -100,9 +429,43 @@ class BaseOptimizer:
          tool_name: str,
          second_pass: Any,
          experiment_config: dict | None = None,
+         n_samples: int | None = None,
+         auto_continue: bool = False,
+         agent_class: type[OptimizableAgent] | None = None,
+         fallback_invoker: Callable[[dict[str, Any]], str] | None = None,
+         fallback_arguments: Callable[[Any], dict[str, Any]] | None = None,
+         allow_tool_use_on_second_pass: bool = False,
          **kwargs: Any,
      ) -> optimization_result.OptimizationResult:
-         """Optimize prompts that rely on MCP tooling."""
+         """
+         Optimize prompts that rely on MCP (Model Context Protocol) tooling.
+
+         This method provides a standardized interface for optimizing prompts that use
+         external tools through the MCP protocol. It handles tool invocation, second-pass
+         coordination, and fallback mechanisms.
+
+         Args:
+             prompt: The chat prompt to optimize, must include tools
+             dataset: Opik dataset containing evaluation data
+             metric: Evaluation function that takes (dataset_item, llm_output) and returns a score
+             tool_name: Name of the MCP tool to use for optimization
+             second_pass: MCPSecondPassCoordinator for handling second-pass tool calls
+             experiment_config: Optional configuration for the experiment
+             n_samples: Number of samples to use for optimization (default: None)
+             auto_continue: Whether to auto-continue optimization (default: False)
+             agent_class: Custom agent class to use (default: None)
+             fallback_invoker: Fallback function for tool invocation (default: None)
+             fallback_arguments: Function to extract tool arguments (default: None)
+             allow_tool_use_on_second_pass: Whether to allow tool use on second pass (default: False)
+             **kwargs: Additional arguments for optimization
+
+         Returns:
+             OptimizationResult: The optimization result containing the optimized prompt and metrics
+
+         Raises:
+             NotImplementedError: If the optimizer doesn't implement MCP optimization
+             ValueError: If the prompt doesn't include required tools
+         """
          raise NotImplementedError(
              f"{self.__class__.__name__} does not implement optimize_mcp yet."
          )
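
Taken together, the additions above give every optimizer a uniform lifecycle: counters reset per run, inputs validated once, and experiment metadata assembled through `_prepare_experiment_config`. A sketch of how a concrete subclass might wire these helpers (the class name, `rounds` parameter, and method body are illustrative, not from this diff):

```python
from typing import Any

class SketchOptimizer(BaseOptimizer):  # hypothetical subclass for illustration
    def get_optimizer_metadata(self) -> dict[str, Any]:
        # Surfaced under optimizer_metadata["parameters"] by _build_optimizer_metadata()
        return {"rounds": 3}

    def optimize_prompt(self, prompt, dataset, metric, experiment_config=None,
                        n_samples=None, auto_continue=False, agent_class=None,
                        **kwargs: Any):
        self.reset_counters()                # fresh llm/tool call counts per run
        self.validate_optimization_inputs(prompt, dataset, metric)
        self.configure_prompt_model(prompt)  # fill model/model_kwargs if unset
        self.agent_class = self.setup_agent_class(prompt, agent_class)
        experiment_config = self._prepare_experiment_config(
            prompt=prompt,
            dataset=dataset,
            metric=metric,
            experiment_config=experiment_config,
        )
        ...  # optimizer-specific search loop goes here
```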
@@ -166,7 +529,7 @@ class BaseOptimizer:
          self.agent_class: type[OptimizableAgent]

          if agent_class is None:
-             self.agent_class = create_litellm_agent_class(prompt)
+             self.agent_class = create_litellm_agent_class(prompt, optimizer_ref=self)
          else:
              self.agent_class = agent_class

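Note the new `optimizer_ref=self` argument: the generated LiteLLM agent class now carries a back-reference to the optimizer that created it, which lines up with the new `increment_llm_counter`/`increment_tool_counter` hooks. A sketch of that wiring under this assumption (the agent internals are illustrative; only the counter methods come from this diff):

```python
class SketchLiteLLMAgent:  # stand-in for what create_litellm_agent_class() builds
    def __init__(self, prompt, optimizer_ref=None):
        self.prompt = prompt
        self.optimizer_ref = optimizer_ref  # owning BaseOptimizer, if any

    def invoke(self, messages):
        if self.optimizer_ref is not None:
            self.optimizer_ref.increment_llm_counter()  # tallied in llm_call_counter
        # ... perform the actual LiteLLM completion call here ...
```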
@@ -181,18 +544,12 @@ class BaseOptimizer:
              }
              return result

-         experiment_config = experiment_config or {}
-         experiment_config["project_name"] = self.__class__.__name__
-         experiment_config = {
-             **experiment_config,
-             **{
-                 "agent_class": self.agent_class.__name__,
-                 "agent_config": prompt.to_dict(),
-                 "metric": metric.__name__,
-                 "dataset": dataset.name,
-                 "configuration": {"prompt": (prompt.get_messages() if prompt else [])},
-             },
-         }
+         experiment_config = self._prepare_experiment_config(
+             prompt=prompt,
+             dataset=dataset,
+             metric=metric,
+             experiment_config=experiment_config,
+         )

          if n_samples is not None:
              if dataset_item_ids is not None:
@@ -207,7 +564,7 @@ class BaseOptimizer:
              metric=metric,
              evaluated_task=llm_task,
              num_threads=n_threads,
-             project_name=self.agent_class.project_name,
+             project_name=experiment_config.get("project_name"),
              experiment_config=experiment_config,
              optimization_id=None,
              verbose=verbose,
opik_optimizer/evolutionary_optimizer/evaluation_ops.py
@@ -1,10 +1,15 @@
- from typing import Any, TYPE_CHECKING
+ from typing import Any, TYPE_CHECKING, cast
  from collections.abc import Callable


  from .. import task_evaluator
  from ..optimization_config import mappers, chat_prompt
+ from ..mcp_utils.mcp_workflow import MCPExecutionConfig
  import opik
+ import copy
+
+ if TYPE_CHECKING:  # pragma: no cover - typing only
+     from ..base_optimizer import BaseOptimizer


  class EvaluationOps:
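
The `TYPE_CHECKING` guard above keeps `base_optimizer` out of the runtime import graph, presumably to avoid a circular import, while still letting this module annotate `self` with the `cast("BaseOptimizer", self)` call used below. The pattern in isolation (the helper method is illustrative):

```python
from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:  # evaluated by type checkers only, never at runtime
    from ..base_optimizer import BaseOptimizer

class EvaluationOpsSketch:
    def _as_optimizer(self) -> "BaseOptimizer":  # hypothetical helper
        # cast() is a runtime no-op; it only narrows the type for checkers
        return cast("BaseOptimizer", self)
```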
@@ -30,33 +35,91 @@ class EvaluationOps:

          new_prompt = prompt.copy()
          new_prompt.set_messages(messages)
+         tools = getattr(messages, "tools", None)
+         if tools is not None:
+             new_prompt.tools = copy.deepcopy(tools)
+
+         optimizer = cast("BaseOptimizer", self)

-         experiment_config = experiment_config or {}
-         experiment_config["project_name"] = self.agent_class.project_name
-         experiment_config = {
-             **experiment_config,
-             "optimizer": self.__class__.__name__,
-             "agent_class": self.agent_class.__name__,
-             "agent_config": new_prompt.to_dict(),
-             "metric": metric.__name__,
-             "dataset": dataset.name,
-             "configuration": {
-                 "prompt": new_prompt.get_messages(),
+         configuration_updates = optimizer._drop_none(
+             {
                  "n_samples_for_eval": (
                      len(dataset_item_ids) if dataset_item_ids is not None else n_samples
                  ),
                  "total_dataset_items": total_items,
-             },
-         }
+             }
+         )
+         evaluation_details = optimizer._drop_none(
+             {
+                 "dataset_item_ids": dataset_item_ids,
+                 "optimization_id": optimization_id,
+             }
+         )
+         additional_metadata = (
+             {"evaluation": evaluation_details} if evaluation_details else None
+         )
+
+         experiment_config = optimizer._prepare_experiment_config(
+             prompt=new_prompt,
+             dataset=dataset,
+             metric=metric,
+             experiment_config=experiment_config,
+             configuration_updates=configuration_updates,
+             additional_metadata=additional_metadata,
+         )
          try:
              agent = self.agent_class(new_prompt)
          except Exception:
              return 0.0

+         mcp_execution_config: MCPExecutionConfig | None = kwargs.get("mcp_config")
+
          def llm_task(dataset_item: dict[str, Any]) -> dict[str, str]:
              messages = new_prompt.get_messages(dataset_item)
-             model_output = agent.invoke(messages)
-             return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
+
+             if mcp_execution_config is None:
+                 model_output = agent.invoke(messages)
+                 return {mappers.EVALUATED_LLM_TASK_OUTPUT: model_output}
+
+             coordinator = mcp_execution_config.coordinator
+             coordinator.reset()
+
+             raw_model_output = agent.llm_invoke(
+                 messages=messages,
+                 seed=getattr(self, "seed", None),
+                 allow_tool_use=True,
+             )
+
+             second_pass_messages = coordinator.build_second_pass_messages(
+                 base_messages=messages,
+                 dataset_item=dataset_item,
+             )
+
+             if (
+                 second_pass_messages is None
+                 and mcp_execution_config.fallback_invoker is not None
+             ):
+                 fallback_args = mcp_execution_config.fallback_arguments(dataset_item)
+                 if fallback_args:
+                     summary_override = mcp_execution_config.fallback_invoker(
+                         fallback_args
+                     )
+                     second_pass_messages = coordinator.build_second_pass_messages(
+                         base_messages=messages,
+                         dataset_item=dataset_item,
+                         summary_override=summary_override,
+                     )
+
+             if second_pass_messages is not None:
+                 final_response = agent.llm_invoke(
+                     messages=second_pass_messages,
+                     seed=getattr(self, "seed", None),
+                     allow_tool_use=mcp_execution_config.allow_tool_use_on_second_pass,
+                 )
+             else:
+                 final_response = raw_model_output
+
+             return {mappers.EVALUATED_LLM_TASK_OUTPUT: final_response.strip()}

          score = task_evaluator.evaluate(
              dataset=dataset,
@@ -64,7 +127,7 @@ class EvaluationOps:
              metric=metric,
              evaluated_task=llm_task,
              num_threads=self.num_threads,
-             project_name=experiment_config["project_name"],
+             project_name=experiment_config.get("project_name"),
              n_samples=n_samples if dataset_item_ids is None else None,
              experiment_config=experiment_config,
              optimization_id=optimization_id,
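
End to end, the evaluation path now supports a two-pass MCP workflow: pass one lets the agent call the MCP tool, the coordinator captures the tool output to build second-pass messages (with an optional direct-invocation fallback), and pass two produces the final answer, with tool use disabled unless `allow_tool_use_on_second_pass` is set. A hedged sketch of supplying that configuration (attribute names follow this diff; the constructor call, `coordinator` object, helper lambdas, and evaluation entry point are placeholders):

```python
from opik_optimizer.mcp_utils.mcp_workflow import MCPExecutionConfig

# Assumption: `coordinator` is an MCPSecondPassCoordinator-like object
# exposing reset() and build_second_pass_messages(...) as used above.
mcp_config = MCPExecutionConfig(
    coordinator=coordinator,
    fallback_arguments=lambda item: {"query": item["question"]},  # placeholder
    fallback_invoker=lambda args: call_tool_directly(args),       # placeholder
    allow_tool_use_on_second_pass=False,
)

# The evaluation method reads this via kwargs.get("mcp_config"); the method
# name below is illustrative, not confirmed by this diff.
score = optimizer._evaluate_prompt(prompt, dataset, metric, mcp_config=mcp_config)
```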