opik-optimizer 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +28 -11
- opik_optimizer/colbert.py +236 -0
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +152 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +20 -20
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +18 -17
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +21 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/{utils.py → utils/core.py} +111 -26
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
- opik_optimizer-1.1.0.dist-info/RECORD +73 -0
- opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.5.dist-info/RECORD +0 -50
- opik_optimizer-1.0.5.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
opik_optimizer/mcp_utils/mcp_simulator.py (new file)
@@ -0,0 +1,116 @@
+"""Deterministic harness to score MCP tool usage during optimization."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from difflib import SequenceMatcher
+from typing import Any
+from collections.abc import Callable, Mapping
+
+from .mcp import ToolSignature, validate_tool_arguments
+
+
+@dataclass
+class ToolCallResult:
+    tool_name: str
+    arguments: Mapping[str, Any]
+    response: Any
+
+
+@dataclass
+class SimulationReport:
+    dataset_id: str
+    expected_tool: str
+    tool_called: bool
+    called_tool: str | None
+    arguments_valid: bool
+    score: float
+    response: Any
+    failure_reason: str | None
+
+
+InvokeFn = Callable[[ToolSignature, Mapping[str, Any], dict[str, Any]], ToolCallResult]
+
+
+def simulate_session(
+    signature_map: dict[str, ToolSignature],
+    dataset_item: dict[str, Any],
+    invoke_tool: InvokeFn | None = None,
+) -> SimulationReport:
+    """Simulate a tool invocation for ``dataset_item`` using ``signature_map``.
+
+    ``invoke_tool`` can run a real MCP client; when absent we assume the
+    expected tool is called with the reference arguments for deterministic
+    scoring.
+    """
+
+    dataset_id = dataset_item.get("id", "unknown")
+    expected_tool = dataset_item["expected_tool"]
+    reference_arguments = dataset_item.get("arguments", {})
+    reference_answer = dataset_item.get("reference_answer") or dataset_item.get(
+        "expected_answer_contains"
+    )
+
+    signature = signature_map.get(expected_tool)
+    if signature is None:
+        return SimulationReport(
+            dataset_id=dataset_id,
+            expected_tool=expected_tool,
+            tool_called=False,
+            called_tool=None,
+            arguments_valid=False,
+            score=0.0,
+            response=None,
+            failure_reason="missing_tool_signature",
+        )
+
+    if invoke_tool is None:
+        tool_call = ToolCallResult(
+            tool_name=expected_tool,
+            arguments=reference_arguments,
+            response=dataset_item.get("reference_response", reference_answer),
+        )
+    else:
+        tool_call = invoke_tool(signature, reference_arguments, dataset_item)
+
+    arguments_valid, validation_message = validate_tool_arguments(
+        signature, tool_call.arguments
+    )
+
+    tool_called = tool_call.tool_name is not None
+    called_tool = tool_call.tool_name
+    failure_reason = None
+    score = 0.0
+
+    if not tool_called:
+        failure_reason = "tool_not_called"
+    elif called_tool != expected_tool:
+        failure_reason = "wrong_tool"
+    elif not arguments_valid:
+        failure_reason = f"invalid_arguments:{validation_message}"
+    else:
+        response_text = (
+            str(tool_call.response) if tool_call.response is not None else ""
+        )
+        if reference_answer:
+            ratio = SequenceMatcher(
+                None,
+                " ".join(reference_answer.lower().split()),
+                " ".join(response_text.lower().split()),
+            ).ratio()
+            score = ratio
+            if ratio < 0.6:
+                failure_reason = "low_similarity"
+        else:
+            score = 1.0
+
+    return SimulationReport(
+        dataset_id=dataset_id,
+        expected_tool=expected_tool,
+        tool_called=tool_called,
+        called_tool=called_tool,
+        arguments_valid=arguments_valid,
+        score=score,
+        response=tool_call.response,
+        failure_reason=failure_reason,
+    )
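For orientation, here is a minimal sketch of scoring one dataset item with the new simulator. The `ToolSignature` keyword arguments shown (`name`, `description`, `parameters`) are assumptions inferred from attribute access elsewhere in this diff, and the dataset item is hypothetical; only `simulate_session` and its behaviour come from the code above.

from opik_optimizer.mcp_utils.mcp import ToolSignature
from opik_optimizer.mcp_utils.mcp_simulator import simulate_session

# Hypothetical signature map; the ToolSignature constructor fields are
# assumptions, not taken from this diff.
signature_map = {
    "get-library-docs": ToolSignature(
        name="get-library-docs",
        description="Fetch documentation for a library.",
        parameters={
            "type": "object",
            "properties": {"context7CompatibleLibraryID": {"type": "string"}},
            "required": ["context7CompatibleLibraryID"],
        },
    ),
}

# Hypothetical dataset item in the shape simulate_session reads:
# "expected_tool" is required, everything else falls back to defaults.
report = simulate_session(
    signature_map,
    {
        "id": "context7-001",
        "expected_tool": "get-library-docs",
        "arguments": {"context7CompatibleLibraryID": "/vercel/next.js"},
        "reference_answer": "Next.js supports React server components.",
        "reference_response": "Next.js supports React server components.",
    },
)

# With no invoke_tool callback, the expected tool is assumed to have been
# called with the reference arguments, so the score reduces to argument
# validation plus the SequenceMatcher ratio against reference_answer.
print(report.score, report.failure_reason)

Passing an `invoke_tool` callback instead routes the call through a real MCP client while keeping the same deterministic report shape.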
opik_optimizer/mcp_utils/mcp_workflow.py (new file)
@@ -0,0 +1,493 @@
+"""High-level helpers for MCP-powered demo scripts.
+
+These utilities consolidate the boilerplate previously embedded in the
+example scripts so that each demo can focus on the tooling configuration
+instead of low-level orchestration details. The helpers are intentionally
+generic so that any MCP tool can reuse them with minimal adjustment.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import copy
+import io
+import logging
+import os
+import textwrap
+import time
+from contextvars import ContextVar
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+from collections.abc import Callable, Iterator, Mapping, Sequence
+
+from opik import track
+from opik.evaluation.metrics.score_result import ScoreResult
+
+from .mcp import (
+    MCPManifest,
+    ToolSignature,
+    call_tool_from_manifest,
+    dump_mcp_signature,
+    list_tools_from_manifest,
+    load_tool_signature_from_manifest,
+    response_to_text,
+)
+from .mcp_second_pass import (
+    MCPSecondPassCoordinator,
+    FollowUpBuilder,
+    extract_user_query,
+)
+
+logger = logging.getLogger(__name__)
+
+
+ToolCall = Callable[[str, dict[str, Any]], Any]
+ArgumentAdapter = Callable[[dict[str, Any], ToolCall], dict[str, Any]]
+SummaryBuilder = Callable[[str, Mapping[str, Any]], str]
+FallbackArgumentsProvider = Callable[[Any], dict[str, Any]]
+FallbackInvoker = Callable[[dict[str, Any]], str]
+
+
+def _default_rate_limit() -> float:
+    value = os.getenv("MCP_RATELIMIT_SLEEP", "0.1")
+    try:
+        return float(value)
+    except ValueError:
+        logger.warning(
+            "Invalid MCP_RATELIMIT_SLEEP=%r, expected a numeric value, using default 0.1",
+            value,
+        )
+        return 0.1
+
+
+DEFAULT_MCP_RATELIMIT_SLEEP = _default_rate_limit()
+
+
+@contextlib.contextmanager
+def suppress_mcp_stdout(logger: logging.Logger = logger) -> Iterator[None]:
+    buffer = io.StringIO()
+    with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
+        yield
+    for line in buffer.getvalue().splitlines():
+        trimmed = line.strip()
+        if not trimmed:
+            continue
+        if (
+            "MCP Server running on stdio" in trimmed
+            or "Context7 Documentation MCP Server running on stdio" in trimmed
+        ):
+            continue
+        logger.debug("MCP stdout: %s", trimmed)
+
+
+def ensure_argument_via_resolver(
+    *,
+    target_field: str,
+    resolver_tool: str,
+    query_fields: Sequence[str],
+) -> ArgumentAdapter:
+    """Return an adapter that resolves ``target_field`` via an MCP tool."""
+
+    def _adapter(arguments: dict[str, Any], call_tool: ToolCall) -> dict[str, Any]:
+        prepared = dict(arguments)
+        if prepared.get(target_field):
+            return prepared
+        for key in query_fields:
+            query = prepared.get(key)
+            if not query:
+                continue
+            response = call_tool(resolver_tool, {"query": query})
+            resolved = response_to_text(response).strip()
+            if resolved:
+                prepared[target_field] = resolved
+                break
+        return prepared
+
+    return _adapter
+
+
+def extract_tool_arguments(item: Any) -> dict[str, Any]:
+    """Best-effort extraction of tool arguments from dataset records.
+
+    The helper understands the common structures we use in tests and
+    examples but stays permissive so it keeps working with future
+    dataset variants.
+    """
+
+    if isinstance(item, dict):
+        if "arguments" in item and isinstance(item["arguments"], dict):
+            return dict(item["arguments"])
+        if "input" in item and isinstance(item["input"], dict):
+            arguments = item["input"].get("arguments")
+            if isinstance(arguments, dict):
+                return dict(arguments)
+
+    for attr in ("input_values", "input", "data"):
+        value = getattr(item, attr, None)
+        if isinstance(value, dict):
+            arguments = value.get("arguments")
+            if isinstance(arguments, dict):
+                return dict(arguments)
+
+    return {}
+
+
+def create_second_pass_coordinator(
+    tool_name: str,
+    follow_up_template: str,
+    *,
+    summary_var_name: str | None = None,
+) -> MCPSecondPassCoordinator:
+    summary_var = create_summary_var(summary_var_name or f"{tool_name}_summary")
+    follow_up_builder = make_follow_up_builder(follow_up_template)
+    return MCPSecondPassCoordinator(
+        tool_name=tool_name,
+        summary_var=summary_var,
+        follow_up_builder=follow_up_builder,
+    )
+
+
+def make_follow_up_builder(template: str) -> FollowUpBuilder:
+    """Create a ``FollowUpBuilder`` that fills a string template.
+
+    The template receives ``summary`` and ``user_query`` keyword
+    arguments. Missing user queries collapse to an empty string so the
+    template can stay simple (e.g. ``"Use the summary: {summary}"``).
+    """
+
+    def _builder(dataset_item: dict[str, Any], summary: str) -> str | None:
+        user_query = extract_user_query(dataset_item) or ""
+        rendered = template.format(summary=summary, user_query=user_query).strip()
+        return rendered or None
+
+    return _builder
+
+
+def make_similarity_metric(name: str) -> Callable[[dict[str, Any], str], ScoreResult]:
+    """Return a Levenshtein-ratio style metric closure for demos."""
+
+    def _metric(dataset_item: dict[str, Any], llm_output: str) -> ScoreResult:
+        reference = (dataset_item.get("reference_answer") or "").strip()
+        if not reference:
+            return ScoreResult(
+                name=f"{name}_similarity", value=0.0, reason="Missing reference answer."
+            )
+
+        def _normalize(text: str) -> str:
+            return " ".join(text.lower().split())
+
+        ratio = _sequence_match_ratio(_normalize(reference), _normalize(llm_output))
+        reason = f"Levenshtein ratio {ratio:.2f} against reference."
+        return ScoreResult(
+            name=f"{name}_similarity",
+            value=ratio,
+            reason=reason,
+            metadata={"reference": reference},
+        )
+
+    return _metric
+
+
+def _sequence_match_ratio(a: str, b: str) -> float:
+    """Local wrapper to avoid importing difflib in several modules."""
+
+    from difflib import SequenceMatcher
+
+    return SequenceMatcher(None, a, b).ratio()
+
+
+def list_manifest_tools(
+    manifest: MCPManifest, *, logger: logging.Logger = logger
+) -> tuple[list[Any], list[str]]:
+    with suppress_mcp_stdout(logger):
+        tools = list_tools_from_manifest(manifest)
+    names = [getattr(tool, "name", "") for tool in tools if getattr(tool, "name", None)]
+    logger.info("MCP tools available: %s", names)
+    return tools, names
+
+
+def load_manifest_tool_signature(
+    manifest: MCPManifest,
+    tool_name: str,
+    *,
+    logger: logging.Logger = logger,
+) -> ToolSignature:
+    signature = load_tool_signature_from_manifest(manifest, tool_name)
+    logger.debug("Loaded signature for %s", tool_name)
+    return signature
+
+
+def dump_signature_artifact(
+    signature: ToolSignature,
+    artifacts_dir: Path | str,
+    filename: str,
+    *,
+    logger: logging.Logger = logger,
+) -> Path:
+    artifacts_path = Path(artifacts_dir)
+    artifacts_path.mkdir(parents=True, exist_ok=True)
+    destination = artifacts_path / filename
+    dump_mcp_signature([signature], destination)
+    logger.info("Signature written to %s", destination)
+    return destination
+
+
+def update_signature_from_tool_entry(
+    signature: ToolSignature, tool_entry: Mapping[str, Any]
+) -> ToolSignature:
+    function_block = tool_entry.get("function", {})
+    signature.description = function_block.get("description", signature.description)
+    signature.parameters = function_block.get("parameters", signature.parameters)
+    signature.examples = function_block.get("examples", signature.examples)
+    signature.extra = {
+        **signature.extra,
+        **{k: v for k, v in tool_entry.items() if k != "function"},
+    }
+    return signature
+
+
+def apply_tool_entry_from_prompt(
+    signature: ToolSignature,
+    prompt: Any,
+    default_entry: Mapping[str, Any],
+) -> dict[str, Any]:
+    tool_entry: dict[str, Any] = copy.deepcopy(dict(default_entry))
+    prompt_tools = getattr(prompt, "tools", None)
+    if prompt_tools:
+        tool_entry = copy.deepcopy(dict(prompt_tools[0]))
+    update_signature_from_tool_entry(signature, tool_entry)
+    return tool_entry
+
+
+def preview_tool_output(
+    manifest: MCPManifest,
+    tool_name: str,
+    arguments: Mapping[str, Any],
+    *,
+    logger: logging.Logger = logger,
+    preview_chars: int = 200,
+) -> str:
+    with suppress_mcp_stdout(logger):
+        response = call_tool_from_manifest(manifest, tool_name, dict(arguments))
+    text = response_to_text(response)
+    preview = text[:preview_chars].replace("\n", " ")
+    logger.info("Sample tool output preview: %s", preview)
+    return text
+
+
+def preview_dataset_tool_invocation(
+    *,
+    manifest: MCPManifest,
+    tool_name: str,
+    dataset: Any,
+    logger: logging.Logger = logger,
+    argument_adapter: ArgumentAdapter | None = None,
+    resolver_manifest: MCPManifest | None = None,
+    preview_chars: int = 200,
+) -> str | None:
+    """Execute a best-effort preview tool call using a dataset sample."""
+
+    resolver_manifest = resolver_manifest or manifest
+
+    try:
+        items = dataset.get_items(nb_samples=1)
+    except Exception as exc:  # pragma: no cover - defensive logging
+        logger.warning("Failed to fetch dataset sample for preview: %s", exc)
+        return None
+
+    if not items:
+        logger.warning("No dataset items available for preview.")
+        return None
+
+    sample_item = items[0]
+    sample_args = extract_tool_arguments(sample_item)
+    if not sample_args:
+        logger.warning("No sample arguments available for preview.")
+        return None
+
+    def _resolver_call(name: str, payload: dict[str, Any]) -> Any:
+        with suppress_mcp_stdout(logger):
+            return call_tool_from_manifest(resolver_manifest, name, payload)
+
+    prepared_args: dict[str, Any] = dict(sample_args)
+    if argument_adapter:
+        prepared_args = argument_adapter(sample_args, _resolver_call)
+
+    return preview_tool_output(
+        manifest,
+        tool_name,
+        prepared_args,
+        logger=logger,
+        preview_chars=preview_chars,
+    )
+
+
+def create_summary_var(name: str) -> ContextVar[str | None]:
+    """Return a ``ContextVar`` used to share tool summaries."""
+
+    return ContextVar(name, default=None)
+
+
+@dataclass
+class MCPToolInvocation:
+    """Callable helper for invoking MCP tools with consistent logging.
+
+    A single instance can be registered in a ``ChatPrompt`` function map
+    while keeping the script in charge of manifest, summary handling and
+    optional argument adaptation.
+    """
+
+    manifest: MCPManifest
+    tool_name: str
+    summary_handler: MCPSecondPassCoordinator | None = None
+    summary_builder: SummaryBuilder | None = None
+    argument_adapter: ArgumentAdapter | None = None
+    preview_label: str | None = None
+    preview_chars: int = 160
+    rate_limit_sleep: float = DEFAULT_MCP_RATELIMIT_SLEEP
+    _logger: logging.Logger = field(default_factory=lambda: logger)
+
+    def __call__(self, **arguments: Any) -> str:
+        return self.invoke(arguments)
+
+    def invoke(self, arguments: Mapping[str, Any]) -> str:
+        def call_tool(name: str, payload: dict[str, Any]) -> Any:
+            if self.rate_limit_sleep > 0:
+                time.sleep(self.rate_limit_sleep)
+            with suppress_mcp_stdout(self._logger):
+
+                @track(name=f"mcp_tool::{name}")
+                def _tracked() -> Any:
+                    return call_tool_from_manifest(self.manifest, name, payload)
+
+                return _tracked()
+
+        prepared = dict(arguments)
+        if self.argument_adapter:
+            prepared = self.argument_adapter(prepared, call_tool)
+
+        # TODO(opik-mcp): reuse a persistent MCP client so we avoid spawning a
+        # new stdio subprocess for each call. This currently mirrors the
+        # original blocking behaviour for stability.
+        with suppress_mcp_stdout(self._logger):

+            @track(name=f"mcp_tool::{self.tool_name}")
+            def _invoke() -> Any:
+                return call_tool(self.tool_name, prepared)
+
+            response = _invoke()
+        text = response_to_text(response)
+        preview = text[: self.preview_chars].replace("\n", " ")
+        label = self.preview_label or self.tool_name
+        self._logger.debug(
+            "MCP tool %s arguments=%s preview=%r", label, prepared, preview
+        )
+
+        summary = text
+        if self.summary_builder is not None:
+            summary = self.summary_builder(text, prepared)
+
+        if self.summary_handler:
+            self.summary_handler.record_summary(summary)
+
+        if os.getenv("OPIK_DEBUG_MCP"):
+            self._logger.info("MCP %s raw response:\n%s", label, text)
+
+        return summary
+
+
+def summarise_with_template(template: str) -> SummaryBuilder:
+    """Return a summary builder that fills the provided template."""
+
+    def _builder(tool_output: str, arguments: Mapping[str, Any]) -> str:
+        return template.format(response=tool_output, arguments=dict(arguments))
+
+    return _builder
+
+
+def default_summary_builder(label: str, instructions: str) -> SummaryBuilder:
+    """Convenience factory for the demos' structured summaries."""
+
+    template = (
+        "{label}\n"
+        "Arguments: {{arguments}}\n"
+        "Instructions: {instructions}\n"
+        "Response Preview:\n"
+        "{{response}}"
+    ).format(label=label, instructions=instructions)
+
+    return summarise_with_template(template)
+
+
+def make_argument_summary_builder(
+    *,
+    heading: str,
+    instructions: str,
+    argument_labels: Mapping[str, str],
+    preview_chars: int = 800,
+) -> SummaryBuilder:
+    """Return a structured summary builder that highlights selected arguments."""
+
+    def _builder(tool_output: str, arguments: Mapping[str, Any]) -> str:
+        scoped_args = dict(arguments)
+        highlighted = "\n".join(
+            f"{label}: {scoped_args.get(key, 'unknown')}"
+            for key, label in argument_labels.items()
+        )
+        snippet = tool_output[:preview_chars]
+        return textwrap.dedent(
+            f"""
+            {heading}
+            {highlighted}
+            Instructions: {instructions}
+            Documentation Snippet:
+            {snippet}
+            """
+        ).strip()
+
+    return _builder
+
+
+@dataclass
+class MCPExecutionConfig:
+    """Container describing how to run MCP-aware evaluations."""
+
+    coordinator: MCPSecondPassCoordinator
+    tool_name: str
+    fallback_arguments: FallbackArgumentsProvider = extract_tool_arguments
+    fallback_invoker: FallbackInvoker | None = None
+    allow_tool_use_on_second_pass: bool = False
+
+
+def preview_second_pass(
+    prompt: Any,
+    dataset_item: dict[str, Any],
+    coordinator: MCPSecondPassCoordinator,
+    agent_factory: Callable[[Any], Any],
+) -> None:
+    """Debug helper mirroring the old inline scripts."""
+
+    coordinator.reset()
+    agent = agent_factory(prompt)
+    base_messages = prompt.get_messages(dataset_item)
+
+    raw_output = agent.llm_invoke(messages=base_messages, seed=42, allow_tool_use=True)
+    logger.debug("Raw model output: %s", raw_output)
+
+    second_pass_messages = coordinator.build_second_pass_messages(
+        base_messages=base_messages,
+        dataset_item=dataset_item,
+    )
+
+    if second_pass_messages:
+        logger.debug("Second-pass messages: %s", second_pass_messages)
+        final_output = agent.llm_invoke(
+            messages=second_pass_messages,
+            seed=101,
+            allow_tool_use=True,
+        )
+    else:
+        final_output = raw_output
+
+    logger.debug("Coerced final output: %s", final_output)