opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. opik_optimizer/__init__.py +4 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +402 -28
  4. opik_optimizer/data/context7_eval.jsonl +3 -0
  5. opik_optimizer/datasets/context7_eval.py +90 -0
  6. opik_optimizer/datasets/tiny_test.py +33 -34
  7. opik_optimizer/datasets/truthful_qa.py +2 -2
  8. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  9. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
  10. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
  11. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  12. opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
  13. opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +154 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +30 -23
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +21 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +22 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/utils/colbert.py +236 -0
  46. opik_optimizer/{utils.py → utils/core.py} +160 -33
  47. opik_optimizer/utils/dataset_utils.py +49 -0
  48. opik_optimizer/utils/prompt_segments.py +186 -0
  49. opik_optimizer-2.0.0.dist-info/METADATA +345 -0
  50. opik_optimizer-2.0.0.dist-info/RECORD +74 -0
  51. opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
  52. opik_optimizer-1.0.6.dist-info/METADATA +0 -181
  53. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  54. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  55. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
  56. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,116 @@
1
+ """Deterministic harness to score MCP tool usage during optimization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from difflib import SequenceMatcher
7
+ from typing import Any
8
+ from collections.abc import Callable, Mapping
9
+
10
+ from .mcp import ToolSignature, validate_tool_arguments
11
+
12
+
13
@dataclass
class ToolCallResult:
    """Record of one tool invocation, real or simulated."""

    # Name of the tool that ended up being called. ``simulate_session``
    # checks this against ``None``, so callers may leave it unset.
    tool_name: str
    # Arguments the tool was called with.
    arguments: Mapping[str, Any]
    # Whatever the tool returned; the shape is tool-specific.
    response: Any
18
+
19
+
20
@dataclass
class SimulationReport:
    """Scored outcome of simulating one dataset item."""

    # Identifier of the dataset item ("unknown" when the item has no id).
    dataset_id: str
    # Tool the dataset item expects to be invoked.
    expected_tool: str
    # Whether any tool was invoked at all.
    tool_called: bool
    # Name of the tool that was actually invoked, if any.
    called_tool: str | None
    # Result of validating the call arguments against the tool signature.
    arguments_valid: bool
    # Final score for the item; 0.0 on every failure path before scoring.
    score: float
    # Raw tool response (``None`` when no signature was available).
    response: Any
    # Short machine-readable failure tag, or ``None`` on success.
    failure_reason: str | None
30
+
31
+
32
# Hook used by ``simulate_session`` to run a real tool call:
# ``(signature, reference_arguments, dataset_item) -> ToolCallResult``.
InvokeFn = Callable[
    [ToolSignature, Mapping[str, Any], dict[str, Any]],
    ToolCallResult,
]
33
+
34
+
35
def simulate_session(
    signature_map: dict[str, ToolSignature],
    dataset_item: dict[str, Any],
    invoke_tool: InvokeFn | None = None,
) -> SimulationReport:
    """Score one tool invocation for ``dataset_item``.

    The expected tool is looked up in ``signature_map``. When
    ``invoke_tool`` is supplied it performs the call; otherwise the call is
    synthesised from the dataset item itself (expected tool, reference
    arguments, reference response) so that scoring stays deterministic.

    Args:
        signature_map: Tool signatures keyed by tool name.
        dataset_item: Record providing ``expected_tool`` (required) and
            optionally ``id``, ``arguments``, ``reference_answer`` /
            ``expected_answer_contains`` and ``reference_response``.
        invoke_tool: Optional callable that executes the tool.

    Returns:
        A ``SimulationReport``. The score is 0.0 for every failure before
        response comparison, 1.0 when there is no reference answer to
        compare against, and otherwise the ``SequenceMatcher`` ratio between
        the whitespace/case-normalised reference and response (tagged
        ``low_similarity`` below 0.6).

    Raises:
        KeyError: If ``dataset_item`` has no ``expected_tool`` entry.
    """

    dataset_id = dataset_item.get("id", "unknown")
    expected_tool = dataset_item["expected_tool"]
    reference_arguments = dataset_item.get("arguments", {})
    reference_answer = dataset_item.get("reference_answer")
    if not reference_answer:
        reference_answer = dataset_item.get("expected_answer_contains")

    signature = signature_map.get(expected_tool)
    if signature is None:
        # Without a signature nothing can be validated or invoked.
        return SimulationReport(
            dataset_id=dataset_id,
            expected_tool=expected_tool,
            tool_called=False,
            called_tool=None,
            arguments_valid=False,
            score=0.0,
            response=None,
            failure_reason="missing_tool_signature",
        )

    if invoke_tool is not None:
        tool_call = invoke_tool(signature, reference_arguments, dataset_item)
    else:
        # Deterministic path: pretend the expected tool was called with the
        # reference arguments and answered with the reference response.
        tool_call = ToolCallResult(
            tool_name=expected_tool,
            arguments=reference_arguments,
            response=dataset_item.get("reference_response", reference_answer),
        )

    arguments_valid, validation_message = validate_tool_arguments(
        signature, tool_call.arguments
    )
    called_tool = tool_call.tool_name
    tool_called = called_tool is not None

    def _grade() -> tuple[float, str | None]:
        """Return ``(score, failure_reason)`` for the recorded call."""
        if not tool_called:
            return 0.0, "tool_not_called"
        if called_tool != expected_tool:
            return 0.0, "wrong_tool"
        if not arguments_valid:
            return 0.0, f"invalid_arguments:{validation_message}"

        response_text = "" if tool_call.response is None else str(tool_call.response)
        if not reference_answer:
            # Nothing to compare against: a valid call is a full score.
            return 1.0, None

        similarity = SequenceMatcher(
            None,
            " ".join(reference_answer.lower().split()),
            " ".join(response_text.lower().split()),
        ).ratio()
        return similarity, ("low_similarity" if similarity < 0.6 else None)

    score, failure_reason = _grade()

    return SimulationReport(
        dataset_id=dataset_id,
        expected_tool=expected_tool,
        tool_called=tool_called,
        called_tool=called_tool,
        arguments_valid=arguments_valid,
        score=score,
        response=tool_call.response,
        failure_reason=failure_reason,
    )
@@ -0,0 +1,547 @@
1
+ """High-level helpers for MCP-powered demo scripts.
2
+
3
+ These utilities consolidate the boilerplate previously embedded in the
4
+ example scripts so that each demo can focus on the tooling configuration
5
+ instead of low-level orchestration details. The helpers are intentionally
6
+ generic so that any MCP tool can reuse them with minimal adjustment.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import contextlib
12
+ import copy
13
+ import io
14
+ import json
15
+ import logging
16
+ import os
17
+ import textwrap
18
+ import time
19
+ from contextvars import ContextVar
20
+ from dataclasses import dataclass, field
21
+ from pathlib import Path
22
+ from typing import Any
23
+ from collections.abc import Callable, Iterator, Mapping, Sequence
24
+
25
+ from opik import track
26
+ from opik.evaluation.metrics.score_result import ScoreResult
27
+
28
+ from .mcp import (
29
+ MCPManifest,
30
+ ToolSignature,
31
+ call_tool_from_manifest,
32
+ dump_mcp_signature,
33
+ list_tools_from_manifest,
34
+ load_tool_signature_from_manifest,
35
+ response_to_text,
36
+ )
37
+ from .mcp_second_pass import (
38
+ MCPSecondPassCoordinator,
39
+ FollowUpBuilder,
40
+ extract_user_query,
41
+ )
42
+
43
# Module-level logger; most helpers below accept an override via ``logger=``.
logger = logging.getLogger(__name__)


# ``(tool_name, payload) -> raw tool response``
ToolCall = Callable[[str, dict[str, Any]], Any]
# ``(arguments, call_tool) -> prepared arguments``
ArgumentAdapter = Callable[[dict[str, Any], ToolCall], dict[str, Any]]
# ``(tool_output_text, arguments) -> summary text``
SummaryBuilder = Callable[[str, Mapping[str, Any]], str]
# ``(dataset_item) -> tool arguments``
FallbackArgumentsProvider = Callable[[Any], dict[str, Any]]
# ``(arguments) -> tool output text``
FallbackInvoker = Callable[[dict[str, Any]], str]
51
+
52
+
53
def _default_rate_limit() -> float:
    """Return the default pause, in seconds, between MCP tool calls.

    The value comes from the ``MCP_RATELIMIT_SLEEP`` environment variable
    and defaults to ``0.1`` when the variable is unset. Anything that does
    not parse as a *finite* float is rejected with a warning and replaced
    by the default: ``float()`` happily accepts ``"inf"`` and ``"nan"``,
    but an infinite delay would be handed to ``time.sleep`` and a NaN delay
    would silently switch rate limiting off.
    """
    import math  # local import: only needed for the finiteness check

    fallback = 0.1
    value = os.getenv("MCP_RATELIMIT_SLEEP", "0.1")
    try:
        parsed = float(value)
    except ValueError:
        parsed = math.nan  # funnel into the shared "invalid" path below

    if math.isfinite(parsed):
        return parsed

    logger.warning(
        "Invalid MCP_RATELIMIT_SLEEP=%r, expected a numeric value, using default 0.1",
        value,
    )
    return fallback


# Evaluated once at import time; later changes to the environment variable
# do not affect this constant.
DEFAULT_MCP_RATELIMIT_SLEEP = _default_rate_limit()
66
+
67
+
68
@contextlib.contextmanager
def suppress_mcp_stdout(logger: logging.Logger = logger) -> Iterator[None]:
    """Capture stdout/stderr inside the ``with`` body and replay it as debug logs.

    ``sys.stdout`` and ``sys.stderr`` are redirected into an in-memory
    buffer for the duration of the block. Afterwards every non-blank
    captured line is forwarded to ``logger.debug``, except the
    "... MCP Server running on stdio" startup banner.

    The replay happens in a ``finally`` clause so that output produced
    before a failure is still logged when the wrapped call raises; the
    exception itself propagates unchanged.

    Args:
        logger: Logger that receives the captured lines.
    """
    buffer = io.StringIO()
    try:
        with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
            yield
    finally:
        for line in buffer.getvalue().splitlines():
            trimmed = line.strip()
            # This one substring also matches the longer
            # "Context7 Documentation MCP Server running on stdio" banner.
            if not trimmed or "MCP Server running on stdio" in trimmed:
                continue
            logger.debug("MCP stdout: %s", trimmed)
83
+
84
+
85
def ensure_argument_via_resolver(
    *,
    target_field: str,
    resolver_tool: str,
    query_fields: Sequence[str],
) -> ArgumentAdapter:
    """Build an adapter that fills ``target_field`` by calling a resolver tool.

    The returned adapter works on a copy of the arguments. If
    ``target_field`` already holds a truthy value the copy is returned
    untouched. Otherwise each field in ``query_fields`` is tried in order:
    a truthy value is sent to ``resolver_tool`` as ``{"query": value}`` and
    the first non-empty (stripped) text response is stored in
    ``target_field``.
    """

    def _adapter(arguments: dict[str, Any], call_tool: ToolCall) -> dict[str, Any]:
        result = dict(arguments)
        if result.get(target_field):
            return result

        # Lazily walk the candidate query values, skipping empty ones.
        candidate_queries = (result.get(field_name) for field_name in query_fields)
        for query in filter(None, candidate_queries):
            answer = response_to_text(call_tool(resolver_tool, {"query": query}))
            answer = answer.strip()
            if answer:
                result[target_field] = answer
                break
        return result

    return _adapter
109
+
110
+
111
def extract_tool_arguments(item: Any) -> dict[str, Any]:
    """Pull tool arguments out of a dataset record, returning ``{}`` if none.

    Locations are probed in this order and the first one holding a ``dict``
    wins (a shallow copy is returned):

    1. ``item["arguments"]`` when ``item`` is a dict;
    2. ``item["input"]["arguments"]`` when ``item`` is a dict;
    3. the ``"arguments"`` entry of the ``input_values``, ``input`` or
       ``data`` attribute of ``item``, when that attribute is a dict.
    """

    def _candidates() -> Any:
        # Generator so later locations are only touched when needed.
        if isinstance(item, dict):
            yield item.get("arguments")
            nested = item.get("input")
            if isinstance(nested, dict):
                yield nested.get("arguments")
        for attr_name in ("input_values", "input", "data"):
            holder = getattr(item, attr_name, None)
            if isinstance(holder, dict):
                yield holder.get("arguments")

    for candidate in _candidates():
        if isinstance(candidate, dict):
            return dict(candidate)
    return {}
135
+
136
+
137
def create_second_pass_coordinator(
    tool_name: str,
    follow_up_template: str,
    *,
    summary_var_name: str | None = None,
) -> MCPSecondPassCoordinator:
    """Assemble an ``MCPSecondPassCoordinator`` for ``tool_name``.

    The coordinator is given a fresh summary context variable (named
    ``summary_var_name`` or, by default, ``"<tool_name>_summary"``) and a
    follow-up builder that renders ``follow_up_template``.
    """
    var_name = summary_var_name or f"{tool_name}_summary"
    return MCPSecondPassCoordinator(
        tool_name=tool_name,
        summary_var=create_summary_var(var_name),
        follow_up_builder=make_follow_up_builder(follow_up_template),
    )
150
+
151
+
152
def make_follow_up_builder(template: str) -> FollowUpBuilder:
    """Return a ``FollowUpBuilder`` that renders ``template`` with ``str.format``.

    Two keyword fields are available to the template: ``summary`` and
    ``user_query``. A missing user query is rendered as an empty string,
    so simple templates such as ``"Use the summary: {summary}"`` work
    without special-casing. The builder returns ``None`` when the rendered
    text is empty after stripping.
    """

    def _builder(dataset_item: dict[str, Any], summary: str) -> str | None:
        query = extract_user_query(dataset_item)
        if not query:
            query = ""
        text = template.format(summary=summary, user_query=query).strip()
        if text:
            return text
        return None

    return _builder
166
+
167
+
168
def make_similarity_metric(name: str) -> Callable[[dict[str, Any], str], ScoreResult]:
    """Return a text-similarity metric closure named ``"<name>_similarity"``.

    The metric compares the dataset item's ``reference_answer`` with the
    model output after lower-casing and collapsing whitespace, using
    ``_sequence_match_ratio``. A missing or blank reference scores 0.0.
    """
    metric_name = f"{name}_similarity"

    def _squash(text: str) -> str:
        # Case-fold and collapse every whitespace run to a single space.
        return " ".join(text.lower().split())

    def _metric(dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
        reference = (dataset_item.get("reference_answer") or "").strip()
        if reference:
            ratio = _sequence_match_ratio(_squash(reference), _squash(llm_output))
            return ScoreResult(
                name=metric_name,
                value=ratio,
                reason=f"Levenshtein ratio {ratio:.2f} against reference.",
                metadata={"reference": reference},
            )
        return ScoreResult(
            name=metric_name, value=0.0, reason="Missing reference answer."
        )

    return _metric
191
+
192
+
193
+ def _sequence_match_ratio(a: str, b: str) -> float:
194
+ """Local wrapper to avoid importing difflib in several modules."""
195
+
196
+ from difflib import SequenceMatcher
197
+
198
+ return SequenceMatcher(None, a, b).ratio()
199
+
200
+
201
def list_manifest_tools(
    manifest: MCPManifest, *, logger: logging.Logger = logger
) -> tuple[list[Any], list[str]]:
    """List the tools exposed by ``manifest``.

    Returns:
        ``(tools, names)`` where ``tools`` is whatever
        ``list_tools_from_manifest`` returned and ``names`` holds the
        truthy ``name`` attribute of each tool (tools without one are
        skipped). The names are also logged at INFO level.
    """
    with suppress_mcp_stdout(logger):
        tools = list_tools_from_manifest(manifest)

    names = []
    for tool in tools:
        tool_name = getattr(tool, "name", None)
        if tool_name:
            names.append(tool_name)

    logger.info("MCP tools available: %s", names)
    return tools, names
209
+
210
+
211
def load_manifest_tool_signature(
    manifest: MCPManifest,
    tool_name: str,
    *,
    logger: logging.Logger = logger,
) -> ToolSignature:
    """Load the signature of ``tool_name`` from ``manifest`` and log the fact.

    This is a thin wrapper around ``load_tool_signature_from_manifest``
    that adds a DEBUG log line once loading has succeeded.
    """
    loaded = load_tool_signature_from_manifest(manifest, tool_name)
    logger.debug("Loaded signature for %s", tool_name)
    return loaded
220
+
221
+
222
def dump_signature_artifact(
    signature: ToolSignature,
    artifacts_dir: Path | str,
    filename: str,
    *,
    logger: logging.Logger = logger,
) -> Path:
    """Write ``signature`` to ``artifacts_dir / filename`` and return that path.

    The directory (including missing parents) is created on demand. The
    signature is written through ``dump_mcp_signature`` as a one-element
    list.
    """
    target_dir = Path(artifacts_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir.joinpath(filename)
    dump_mcp_signature([signature], destination)
    logger.info("Signature written to %s", destination)
    return destination
235
+
236
+
237
def update_signature_from_tool_entry(
    signature: ToolSignature, tool_entry: Mapping[str, Any]
) -> ToolSignature:
    """Copy fields from a tool entry onto ``signature`` in place.

    ``description``, ``parameters`` and ``examples`` are taken from
    ``tool_entry["function"]`` when present (the current value is kept
    otherwise). Every other top-level key of ``tool_entry`` is merged into
    ``signature.extra``, overriding existing keys of the same name.

    Returns:
        The same ``signature`` object, mutated.
    """
    function_block = tool_entry.get("function", {})
    for attr_name in ("description", "parameters", "examples"):
        current = getattr(signature, attr_name)
        setattr(signature, attr_name, function_block.get(attr_name, current))

    merged_extra = {**signature.extra}
    merged_extra.update(
        (key, value) for key, value in tool_entry.items() if key != "function"
    )
    signature.extra = merged_extra
    return signature
249
+
250
+
251
def apply_tool_entry_from_prompt(
    signature: ToolSignature,
    prompt: Any,
    default_entry: Mapping[str, Any],
) -> dict[str, Any]:
    """Pick the tool entry to use for ``prompt`` and sync ``signature`` to it.

    The first element of ``prompt.tools`` is used when the prompt has a
    non-empty ``tools`` attribute; otherwise ``default_entry`` is used.
    The chosen entry is deep-copied, applied to ``signature`` through
    ``update_signature_from_tool_entry`` and returned.
    """
    chosen: dict[str, Any] = copy.deepcopy(dict(default_entry))

    tools_on_prompt = getattr(prompt, "tools", None)
    if tools_on_prompt:
        # The prompt's own definition takes precedence over the default.
        chosen = copy.deepcopy(dict(tools_on_prompt[0]))

    update_signature_from_tool_entry(signature, chosen)
    return chosen
262
+
263
+
264
def preview_tool_output(
    manifest: MCPManifest,
    tool_name: str,
    arguments: Mapping[str, Any],
    *,
    logger: logging.Logger = logger,
    preview_chars: int = 200,
) -> str:
    """Call ``tool_name`` once and log a short one-line preview of its output.

    Returns:
        The full response text (not just the preview). Only the first
        ``preview_chars`` characters, with newlines replaced by spaces, are
        logged at INFO level.
    """
    with suppress_mcp_stdout(logger):
        raw_response = call_tool_from_manifest(manifest, tool_name, dict(arguments))

    full_text = response_to_text(raw_response)
    one_line = full_text[:preview_chars].replace("\n", " ")
    logger.info("Sample tool output preview: %s", one_line)
    return full_text
278
+
279
+
280
def preview_dataset_tool_invocation(
    *,
    manifest: MCPManifest,
    tool_name: str,
    dataset: Any,
    logger: logging.Logger = logger,
    argument_adapter: ArgumentAdapter | None = None,
    resolver_manifest: MCPManifest | None = None,
    preview_chars: int = 200,
) -> str | None:
    """Run one preview tool call using the first item of ``dataset``.

    A single sample is fetched with ``dataset.get_items(nb_samples=1)``,
    its tool arguments are extracted and, when ``argument_adapter`` is
    given, adapted (the adapter's tool calls go to ``resolver_manifest``,
    falling back to ``manifest``). The tool is then invoked through
    ``preview_tool_output``.

    Returns:
        The tool's response text, or ``None`` (after a warning) when the
        sample cannot be fetched, the dataset is empty, or the sample has
        no extractable arguments.
    """

    lookup_manifest = resolver_manifest or manifest

    try:
        items = dataset.get_items(nb_samples=1)
    except Exception as exc:  # pragma: no cover - defensive logging
        logger.warning("Failed to fetch dataset sample for preview: %s", exc)
        return None

    if not items:
        logger.warning("No dataset items available for preview.")
        return None

    sample_args = extract_tool_arguments(items[0])
    if not sample_args:
        logger.warning("No sample arguments available for preview.")
        return None

    def _resolver_call(name: str, payload: dict[str, Any]) -> Any:
        # Resolver calls are silenced the same way as the main tool call.
        with suppress_mcp_stdout(logger):
            return call_tool_from_manifest(lookup_manifest, name, payload)

    if argument_adapter:
        prepared_args: dict[str, Any] = argument_adapter(sample_args, _resolver_call)
    else:
        prepared_args = dict(sample_args)

    return preview_tool_output(
        manifest,
        tool_name,
        prepared_args,
        logger=logger,
        preview_chars=preview_chars,
    )
325
+
326
+
327
def create_summary_var(name: str) -> ContextVar[str | None]:
    """Create the ``ContextVar`` through which tool summaries are shared.

    The variable is named ``name`` and defaults to ``None``.
    """

    summary_var: ContextVar[str | None] = ContextVar(name, default=None)
    return summary_var
331
+
332
+
333
@dataclass
class MCPToolInvocation:
    """Callable helper for invoking MCP tools with consistent logging.

    A single instance can be registered in a ``ChatPrompt`` function map
    while keeping the script in charge of manifest, summary handling and
    optional argument adaptation.

    Results are cached per instance, keyed by the prepared arguments; see
    ``invoke`` for the exact flow.
    """

    # Manifest describing the MCP server to call.
    manifest: MCPManifest
    # Name of the tool invoked by ``invoke`` / ``__call__``.
    tool_name: str
    # Optional coordinator that is told about every produced summary.
    summary_handler: MCPSecondPassCoordinator | None = None
    # Optional ``(text, arguments) -> summary`` transformer.
    summary_builder: SummaryBuilder | None = None
    # Optional hook to rewrite arguments before the call.
    argument_adapter: ArgumentAdapter | None = None
    # Label used in log lines instead of ``tool_name`` when set.
    preview_label: str | None = None
    # Number of response characters included in the debug preview.
    preview_chars: int = 160
    # Seconds to sleep before each tool call; values <= 0 disable the pause.
    rate_limit_sleep: float = DEFAULT_MCP_RATELIMIT_SLEEP
    # Whether ``invoke`` uses the cache when ``use_cache`` is not given.
    cache_enabled: bool = True
    _logger: logging.Logger = field(default_factory=lambda: logger)
    # Maps cache key -> summary; never part of ``__init__``.
    _cache: dict[str, str] = field(default_factory=dict, init=False)

    def __call__(self, **arguments: Any) -> str:
        """Invoke the tool with keyword arguments as the payload."""
        return self.invoke(arguments)

    def clear_cache(self) -> None:
        """Forget every cached summary."""
        self._cache.clear()

    def invoke(
        self, arguments: Mapping[str, Any], *, use_cache: bool | None = None
    ) -> str:
        """Call the tool and return its (optionally summarised) text output.

        Steps: copy and optionally adapt the arguments, consult the cache,
        call the tool, convert the response to text, build the summary,
        notify ``summary_handler``, store the summary in the cache.

        Args:
            arguments: Payload for the tool.
            use_cache: Overrides ``cache_enabled`` for this call when not
                ``None``.

        Returns:
            The summary text (the raw response text when no
            ``summary_builder`` is configured).
        """

        def call_tool(name: str, payload: dict[str, Any]) -> Any:
            if self.rate_limit_sleep > 0:
                time.sleep(self.rate_limit_sleep)
            with suppress_mcp_stdout(self._logger):

                @track(name=f"mcp_tool::{name}")
                def _tracked() -> Any:
                    return call_tool_from_manifest(self.manifest, name, payload)

                return _tracked()

        prepared = dict(arguments)
        if self.argument_adapter:
            # The adapter may itself call tools (e.g. to resolve an id).
            prepared = self.argument_adapter(prepared, call_tool)

        effective_cache = self.cache_enabled if use_cache is None else use_cache
        cache_key: str | None = None
        if effective_cache:
            cache_key = self._make_cache_key(prepared)
            cached_summary = self._cache.get(cache_key)
            if cached_summary is not None:
                # Replay the summary so second-pass handling behaves the
                # same on a cache hit as on a real call.
                if self.summary_handler:
                    self.summary_handler.record_summary(cached_summary)
                self._logger.debug(
                    "MCP tool %s cache hit arguments=%s", self.tool_name, prepared
                )
                return cached_summary

        # TODO(opik-mcp): reuse a persistent MCP client so we avoid spawning a
        # new stdio subprocess for each call. This currently mirrors the
        # original blocking behaviour for stability.
        # NOTE(review): ``call_tool`` already applies ``track`` and
        # ``suppress_mcp_stdout``, so this call is wrapped twice under the
        # same span name. Kept as-is to leave tracing output unchanged.
        with suppress_mcp_stdout(self._logger):

            @track(name=f"mcp_tool::{self.tool_name}")
            def _invoke() -> Any:
                return call_tool(self.tool_name, prepared)

            response = _invoke()
        text = response_to_text(response)
        preview = text[: self.preview_chars].replace("\n", " ")
        label = self.preview_label or self.tool_name
        self._logger.debug(
            "MCP tool %s arguments=%s preview=%r", label, prepared, preview
        )

        summary = text
        if self.summary_builder is not None:
            summary = self.summary_builder(text, prepared)

        if self.summary_handler:
            self.summary_handler.record_summary(summary)

        if effective_cache and cache_key is not None:
            self._cache[cache_key] = summary

        if os.getenv("OPIK_DEBUG_MCP"):
            self._logger.info("MCP %s raw response:\n%s", label, text)

        return summary

    def _make_cache_key(self, payload: Mapping[str, Any]) -> str:
        """Return a deterministic string key for ``payload``.

        The payload is always normalised first. Serialising the raw payload
        is not reliable: ``json.dumps(sort_keys=True)`` raises ``TypeError``
        on mixed-type or non-scalar mapping keys (``default=`` is not
        applied to keys), and sets would be stringified in iteration
        order, which can differ between equal sets.
        """
        normalised = self._normalise_cache_payload(payload)
        # ``default=str`` is kept as a safety net; after normalisation
        # every value is already JSON-serialisable.
        return json.dumps(normalised, sort_keys=True, default=str)

    @staticmethod
    def _normalise_cache_payload(value: Any) -> Any:
        """Convert ``value`` into plain JSON-friendly data, recursively.

        Mappings become dicts with *string* keys ordered by that string,
        lists and tuples become lists, sets become lists ordered by
        ``repr``, JSON scalars are kept, anything else is stringified.
        """
        normalise = MCPToolInvocation._normalise_cache_payload
        if isinstance(value, Mapping):
            # Stringify keys so that sorting and JSON encoding cannot fail
            # on mixed or non-scalar key types.
            return {
                str(key): normalise(val)
                for key, val in sorted(value.items(), key=lambda item: str(item[0]))
            }
        if isinstance(value, (list, tuple)):
            return [normalise(item) for item in value]
        if isinstance(value, (set, frozenset)):
            return [normalise(item) for item in sorted(value, key=repr)]
        if isinstance(value, (str, int, float, bool)) or value is None:
            return value
        return str(value)
449
+
450
+
451
def summarise_with_template(template: str) -> SummaryBuilder:
    """Return a summary builder that renders ``template`` with ``str.format``.

    The template may reference two fields: ``response`` (the tool output
    text) and ``arguments`` (a plain ``dict`` copy of the call arguments).
    """

    def _builder(tool_output: str, arguments: Mapping[str, Any]) -> str:
        fields = {"response": tool_output, "arguments": dict(arguments)}
        return template.format(**fields)

    return _builder
458
+
459
+
460
def default_summary_builder(label: str, instructions: str) -> SummaryBuilder:
    """Convenience factory for the demos' structured summaries.

    The produced summary has this layout::

        <label>
        Arguments: <dict of call arguments>
        Instructions: <instructions>
        Response Preview:
        <tool output>

    ``label`` and ``instructions`` are inserted verbatim. They are
    deliberately *not* spliced into a ``str.format`` template that is
    formatted again per call: doing so would re-interpret any literal
    ``{`` / ``}`` they contain (for example a JSON sample inside the
    instructions) as placeholders and raise ``KeyError``/``ValueError``.
    """

    def _builder(tool_output: str, arguments: Mapping[str, Any]) -> str:
        return (
            f"{label}\n"
            f"Arguments: {dict(arguments)}\n"
            f"Instructions: {instructions}\n"
            "Response Preview:\n"
            f"{tool_output}"
        )

    return _builder
472
+
473
+
474
def make_argument_summary_builder(
    *,
    heading: str,
    instructions: str,
    argument_labels: Mapping[str, str],
    preview_chars: int = 800,
) -> SummaryBuilder:
    """Return a structured summary builder that highlights selected arguments.

    Args:
        heading: First line of every summary.
        instructions: Text shown after ``"Instructions: "``.
        argument_labels: Maps argument name -> display label. One
            ``"<label>: <value>"`` line is emitted per entry, in mapping
            order; arguments missing from the call show ``unknown``.
        preview_chars: Maximum number of tool-output characters included.

    The summary is assembled line by line. An indented triple-quoted
    template passed through ``textwrap.dedent`` is not suitable here: the
    interpolated values (several highlighted lines, a multi-line snippet)
    start at column 0, which removes the common indentation and leaves the
    template's own lines indented in the result.
    """

    def _builder(tool_output: str, arguments: Mapping[str, Any]) -> str:
        scoped_args = dict(arguments)
        highlighted = "\n".join(
            f"{label}: {scoped_args.get(key, 'unknown')}"
            for key, label in argument_labels.items()
        )
        snippet = tool_output[:preview_chars]
        summary = (
            f"{heading}\n"
            f"{highlighted}\n"
            f"Instructions: {instructions}\n"
            "Documentation Snippet:\n"
            f"{snippet}"
        )
        # Trim leading/trailing whitespace only; inner lines stay verbatim.
        return summary.strip()

    return _builder
501
+
502
+
503
@dataclass
class MCPExecutionConfig:
    """Settings bundle describing how an MCP-aware evaluation should run."""

    # Coordinator carrying tool summaries into the second pass.
    coordinator: MCPSecondPassCoordinator
    # Name of the MCP tool this configuration is about.
    tool_name: str
    # Derives tool arguments from a dataset item; defaults to the module's
    # best-effort ``extract_tool_arguments``.
    fallback_arguments: FallbackArgumentsProvider = extract_tool_arguments
    # Optional ``(arguments) -> text`` invoker; how and when it is used is
    # decided by the consumer of this config (not visible here).
    fallback_invoker: FallbackInvoker | None = None
    # Flag for consumers: whether tool use stays enabled on the second pass.
    allow_tool_use_on_second_pass: bool = False
512
+
513
+
514
def preview_second_pass(
    prompt: Any,
    dataset_item: dict[str, Any],
    coordinator: MCPSecondPassCoordinator,
    agent_factory: Callable[[Any], Any],
    seed: int = 42,
) -> None:
    """Run the two-pass flow once and log each stage at DEBUG level.

    The coordinator is reset, an agent is built from ``prompt`` and invoked
    on the prompt's messages for ``dataset_item``. If the coordinator then
    yields second-pass messages, the agent is invoked again on those;
    otherwise the first output is treated as final. Nothing is returned;
    the function exists purely for inspection through the module logger.
    """

    coordinator.reset()
    agent = agent_factory(prompt)
    base_messages = prompt.get_messages(dataset_item)

    first_output = agent.llm_invoke(
        messages=base_messages, seed=seed, allow_tool_use=True
    )
    logger.debug("Raw model output: %s", first_output)

    follow_up_messages = coordinator.build_second_pass_messages(
        base_messages=base_messages,
        dataset_item=dataset_item,
    )

    # Default to the first answer; replaced only if a second pass runs.
    final_output = first_output
    if follow_up_messages:
        logger.debug("Second-pass messages: %s", follow_up_messages)
        final_output = agent.llm_invoke(
            messages=follow_up_messages,
            seed=seed,
            allow_tool_use=True,
        )

    logger.debug("Coerced final output: %s", final_output)