opik-optimizer 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/_throttle.py +2 -1
- opik_optimizer/base_optimizer.py +28 -11
- opik_optimizer/colbert.py +236 -0
- opik_optimizer/data/context7_eval.jsonl +3 -0
- opik_optimizer/datasets/context7_eval.py +90 -0
- opik_optimizer/datasets/tiny_test.py +33 -34
- opik_optimizer/datasets/truthful_qa.py +2 -2
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +73 -0
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +124 -941
- opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
- opik_optimizer/evolutionary_optimizer/llm_support.py +134 -0
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +292 -0
- opik_optimizer/evolutionary_optimizer/population_ops.py +223 -0
- opik_optimizer/evolutionary_optimizer/prompts.py +305 -0
- opik_optimizer/evolutionary_optimizer/reporting.py +16 -4
- opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +26 -23
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
- opik_optimizer/gepa_optimizer/__init__.py +3 -0
- opik_optimizer/gepa_optimizer/adapter.py +152 -0
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +556 -0
- opik_optimizer/gepa_optimizer/reporting.py +181 -0
- opik_optimizer/logging_config.py +42 -7
- opik_optimizer/mcp_utils/__init__.py +22 -0
- opik_optimizer/mcp_utils/mcp.py +541 -0
- opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
- opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
- opik_optimizer/mcp_utils/mcp_workflow.py +493 -0
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +399 -69
- opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
- opik_optimizer/mipro_optimizer/_lm.py +20 -20
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +51 -50
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +33 -28
- opik_optimizer/mipro_optimizer/utils.py +2 -4
- opik_optimizer/optimizable_agent.py +18 -17
- opik_optimizer/optimization_config/chat_prompt.py +44 -23
- opik_optimizer/optimization_config/configs.py +3 -3
- opik_optimizer/optimization_config/mappers.py +9 -8
- opik_optimizer/optimization_result.py +21 -14
- opik_optimizer/reporting_utils.py +61 -10
- opik_optimizer/task_evaluator.py +9 -8
- opik_optimizer/utils/__init__.py +15 -0
- opik_optimizer/{utils.py → utils/core.py} +111 -26
- opik_optimizer/utils/dataset_utils.py +49 -0
- opik_optimizer/utils/prompt_segments.py +186 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/METADATA +93 -16
- opik_optimizer-1.1.0.dist-info/RECORD +73 -0
- opik_optimizer-1.1.0.dist-info/licenses/LICENSE +203 -0
- opik_optimizer-1.0.5.dist-info/RECORD +0 -50
- opik_optimizer-1.0.5.dist-info/licenses/LICENSE +0 -21
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-1.0.5.dist-info → opik_optimizer-1.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,541 @@
|
|
1
|
+
"""Helpers for working with MCP tool signatures in optimization flows."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import asyncio
|
6
|
+
import copy
|
7
|
+
import importlib
|
8
|
+
import json
|
9
|
+
import textwrap
|
10
|
+
from difflib import SequenceMatcher
|
11
|
+
from dataclasses import dataclass, field
|
12
|
+
from pathlib import Path
|
13
|
+
from types import TracebackType
|
14
|
+
from typing import (
|
15
|
+
Any,
|
16
|
+
TypeVar,
|
17
|
+
cast,
|
18
|
+
)
|
19
|
+
from collections.abc import Coroutine, Iterable, Mapping
|
20
|
+
|
21
|
+
# Placeholders for the MCP SDK entry points. They start out as ``None`` and
# are rebound by the SDK bootstrap that runs later in this module.
ClientSession: type[Any] | None = None
StdioClientFactory: type[Any] | None = None
StdioServerParameters: type[Any] | None = None
types_mod: Any | None = None

# Result type carried through the coroutine-running helper.
_T = TypeVar("_T")
|
27
|
+
|
28
|
+
|
29
|
+
# Key under which a tool entry stores its function definition.
TOOL_ENTRY_KEY = "function"


@dataclass
class ToolSignature:
    """Representation of an MCP tool definition used for tuning.

    A signature is built from (and serialised back to) a "tool entry": a
    mapping whose ``"function"`` block holds ``name``, ``description``,
    ``parameters`` and optionally ``examples``. Any other top-level keys of
    the entry are preserved in ``extra``.
    """

    name: str
    description: str
    parameters: Mapping[str, Any]
    examples: list[dict[str, Any]] | None = None
    extra: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_tool_entry(cls, entry: Mapping[str, Any]) -> ToolSignature:
        """Build a signature from a tool entry mapping.

        Raises:
            ValueError: If the ``"function"`` block is missing, is not a
                mapping, or has no (truthy) ``name``.
        """
        if TOOL_ENTRY_KEY not in entry:
            raise ValueError("Tool entry missing 'function' block")

        function_block = entry[TOOL_ENTRY_KEY]
        if not isinstance(function_block, Mapping):
            # Previously surfaced as an AttributeError on ``.get``.
            raise ValueError("Tool entry 'function' block must be a mapping")

        name = function_block.get("name")
        if not name:
            raise ValueError("Tool entry missing function name")

        return cls(
            name=name,
            # An explicit ``None`` is normalised so the description is always a string.
            description=function_block.get("description") or "",
            parameters=function_block.get("parameters") or {},
            examples=function_block.get("examples"),
            extra={
                key: value for key, value in entry.items() if key != TOOL_ENTRY_KEY
            },
        )

    def to_tool_entry(self) -> dict[str, Any]:
        """Serialise the signature back into a tool entry dictionary.

        The result shares no mutable state with the signature: ``extra``,
        ``parameters`` and ``examples`` are all deep-copied, so callers may
        freely mutate the returned entry.
        """
        entry = copy.deepcopy(self.extra)
        function_block: dict[str, Any] = {
            "name": self.name,
            "description": self.description,
            # ``dict(...)`` first so read-only mapping types can be copied too.
            "parameters": copy.deepcopy(dict(self.parameters or {})),
        }
        if self.examples is not None:
            function_block["examples"] = copy.deepcopy(self.examples)
        entry[TOOL_ENTRY_KEY] = function_block
        return entry

    def segment_update(self) -> tuple[str, str]:
        """Return the ``("tool:<name>", description)`` pair for this tool."""
        return (f"tool:{self.name}", self.description)
|
84
|
+
|
85
|
+
|
86
|
+
def load_mcp_signature(path: Path) -> list[ToolSignature]:
    """Load tool signatures from a JSON file.

    The file may contain either a list of tool entries or an object whose
    ``"tools"`` key holds that list.

    Args:
        path: Location of the JSON signature file.

    Returns:
        One ``ToolSignature`` per entry, in file order.

    Raises:
        ValueError: If the decoded payload is not a list of tools.
    """
    # Decode explicitly as UTF-8 rather than the platform's locale encoding.
    data = json.loads(Path(path).read_text(encoding="utf-8"))

    if isinstance(data, dict) and "tools" in data:
        data = data["tools"]

    if not isinstance(data, list):
        raise ValueError("MCP signature file must contain a list of tools")

    return [ToolSignature.from_tool_entry(entry) for entry in data]
|
96
|
+
|
97
|
+
|
98
|
+
def dump_mcp_signature(signatures: Iterable[ToolSignature], path: Path) -> None:
    """Write the tool entries of ``signatures`` to ``path`` as indented, key-sorted JSON."""
    entries = []
    for signature in signatures:
        entries.append(signature.to_tool_entry())
    serialized = json.dumps(entries, indent=2, sort_keys=True)
    Path(path).write_text(serialized)
|
101
|
+
|
102
|
+
|
103
|
+
def tools_from_signatures(signatures: Iterable[ToolSignature]) -> list[dict[str, Any]]:
    """Convert each signature into its tool entry dictionary, preserving order."""
    entries: list[dict[str, Any]] = []
    for signature in signatures:
        entries.append(signature.to_tool_entry())
    return entries
|
105
|
+
|
106
|
+
|
107
|
+
def signature_updates(signatures: Iterable[ToolSignature]) -> dict[str, str]:
    """Collect every signature's ``segment_update()`` pair into one dictionary.

    When two signatures yield the same key, the later one wins.
    """
    updates: dict[str, str] = {}
    for signature in signatures:
        segment_id, text = signature.segment_update()
        updates[segment_id] = text
    return updates
|
109
|
+
|
110
|
+
|
111
|
+
def validate_tool_arguments(
    signature: ToolSignature, arguments: Mapping[str, Any]
) -> tuple[bool, str]:
    """Validate ``arguments`` against required fields in the signature schema.

    Only two things are checked: that every name listed under ``required``
    is present, and that arguments whose property schema declares a ``type``
    of ``string``, ``number``, ``integer`` or ``boolean`` hold a matching
    Python value. Arguments without a property schema are accepted as-is.

    Returns:
        ``(True, "")`` when valid, otherwise ``(False, <reason>)`` for the
        first problem found.
    """
    parameters = signature.parameters or {}

    # ``or`` guards against schemas that spell these keys out as null.
    for required_field in parameters.get("required") or []:
        if required_field not in arguments:
            return False, f"Missing required argument '{required_field}'"

    properties = parameters.get("properties") or {}
    for key, value in arguments.items():
        prop_schema = properties.get(key)
        if not prop_schema or not isinstance(prop_schema, Mapping):
            continue
        expected_type = prop_schema.get("type")
        if not expected_type:
            continue

        # ``bool`` is a subclass of ``int`` in Python, so it has to be
        # excluded explicitly from the numeric checks.
        is_bool = isinstance(value, bool)
        if expected_type == "string" and not isinstance(value, str):
            return False, f"Argument '{key}' must be a string"
        if expected_type == "number" and (
            is_bool or not isinstance(value, (int, float))
        ):
            return False, f"Argument '{key}' must be a number"
        if expected_type == "integer" and (is_bool or not isinstance(value, int)):
            return False, f"Argument '{key}' must be an integer"
        if expected_type == "boolean" and not is_bool:
            return False, f"Argument '{key}' must be a boolean"

    return True, ""
|
138
|
+
|
139
|
+
|
140
|
+
# ---------------------------------------------------------------------------
|
141
|
+
# MCP runtime helpers using the official Python SDK
|
142
|
+
|
143
|
+
|
144
|
+
class MCPDependencyError(RuntimeError):
    """Signals that no usable Model Context Protocol SDK could be loaded."""
|
146
|
+
|
147
|
+
|
148
|
+
def _load_sdk() -> tuple[Any, Any, Any, Any]:
    """Import the MCP SDK and return its client entry points.

    Two package layouts are tried in order, ``mcp`` and then
    ``modelcontextprotocol``. A layout is accepted when its session, stdio
    and types modules all import and expose ``ClientSession``,
    ``stdio_client`` and ``StdioServerParameters``.

    Returns:
        ``(ClientSession, stdio_client, StdioServerParameters, types module)``.

    Raises:
        MCPDependencyError: If neither layout provides all three entry points.
    """
    for package in ("mcp", "modelcontextprotocol"):
        try:
            session_module = importlib.import_module(f"{package}.client.session")
            stdio_module = importlib.import_module(f"{package}.client.stdio")
            sdk_types = importlib.import_module(f"{package}.types")
        except ImportError:
            continue

        entry_points = (
            getattr(session_module, "ClientSession", None),
            getattr(stdio_module, "stdio_client", None),
            getattr(stdio_module, "StdioServerParameters", None),
        )
        if all(entry_points):
            return (*entry_points, sdk_types)

    raise MCPDependencyError(
        "modelcontextprotocol Python SDK not found. Install it with 'pip install mcp'."
    )
|
180
|
+
|
181
|
+
|
182
|
+
# Resolve the SDK once at import time. On failure the entry points stay
# ``None`` and the error is kept in ``_SDK_ERROR`` instead of being raised here.
_SDK_ERROR: Exception | None = None
try:
    ClientSession, StdioClientFactory, StdioServerParameters, types_mod = _load_sdk()
except MCPDependencyError as exc:  # pragma: no cover
    _SDK_ERROR = exc
    ClientSession = None  # type: ignore[assignment]
    StdioClientFactory = None  # type: ignore[assignment]
    StdioServerParameters = None  # type: ignore[assignment]
    types_mod = None  # type: ignore[assignment]
|
191
|
+
|
192
|
+
|
193
|
+
@dataclass
class MCPManifest:
    """Launch description for an MCP server: name, command, arguments, environment."""

    name: str
    command: str
    args: list[str]
    env: dict[str, str]

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> MCPManifest:
        """Build a manifest from a decoded ``mcp.json`` mapping.

        ``name`` defaults to ``"mcp-server"``. ``args`` and ``env`` default
        to empty containers, including when they are present but null, and
        are copied so the manifest does not alias the caller's data.

        Raises:
            ValueError: If ``command`` is missing or empty.
        """
        command = data.get("command")
        if not command:
            raise ValueError("mcp.json missing 'command'")

        raw_args = data.get("args") or []
        if isinstance(raw_args, str):
            # A bare string would otherwise be split into single characters.
            raw_args = [raw_args]

        return cls(
            name=data.get("name", "mcp-server"),
            command=command,
            args=list(raw_args),
            env=dict(data.get("env") or {}),
        )

    @classmethod
    def from_json(cls, path: Path) -> MCPManifest:
        """Read ``path`` as UTF-8 JSON and build a manifest from it."""
        return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))
|
215
|
+
|
216
|
+
|
217
|
+
class MCPClient:
    """Async context manager around an MCP client session over stdio.

    Entering opens the stdio transport described by the manifest, wraps its
    streams in a client session and initialises it. Exiting closes the
    session and then the transport.
    """

    def __init__(self, manifest: MCPManifest) -> None:
        """Store ``manifest``.

        Raises:
            MCPDependencyError: If the MCP SDK could not be loaded.
        """
        if _SDK_ERROR is not None:
            raise MCPDependencyError(str(_SDK_ERROR))
        if (
            ClientSession is None
            or StdioClientFactory is None
            or StdioServerParameters is None
        ):
            raise MCPDependencyError("MCP SDK is not available")
        self.manifest = manifest
        self._transport_cm: Any | None = None
        self._session: Any | None = None
        self._read_stream: Any | None = None
        self._write_stream: Any | None = None

    async def __aenter__(self) -> MCPClient:
        server_params = cast(type[Any], StdioServerParameters)(
            command=self.manifest.command,
            args=self.manifest.args,
            env=self.manifest.env or None,
        )

        transport_cm = cast(type[Any], StdioClientFactory)(server_params)
        self._read_stream, self._write_stream = await transport_cm.__aenter__()
        # Recorded only after a successful enter, so cleanup never exits a
        # transport that was not entered.
        self._transport_cm = transport_cm

        try:
            session = cast(type[Any], ClientSession)(
                self._read_stream, self._write_stream
            )
            if hasattr(session, "__aenter__"):
                await session.__aenter__()
            # Recorded only once entered, so a failed enter is not exited.
            self._session = session

            if hasattr(session, "initialize"):
                await session.initialize()
        except BaseException as error:
            # ``async with`` does not call ``__aexit__`` when ``__aenter__``
            # raises, so release whatever was opened before re-raising.
            await self.__aexit__(type(error), error, error.__traceback__)
            raise
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> bool | None:
        session, transport_cm = self._session, self._transport_cm
        # Clear state first so a closed client reports "not started" instead
        # of reusing a dead session.
        self._session = None
        self._transport_cm = None
        self._read_stream = None
        self._write_stream = None

        try:
            if session is not None and hasattr(session, "__aexit__"):
                await session.__aexit__(exc_type, exc, tb)
        finally:
            # The transport is closed even if closing the session raised.
            if transport_cm is not None:
                await transport_cm.__aexit__(exc_type, exc, tb)
        return None

    async def list_tools(self) -> Any:
        """Return the tools advertised by the session.

        Uses ``session.list_tools()`` (unwrapping a ``tools`` attribute when
        the response has one), falling back to ``session.tools()``.

        Raises:
            RuntimeError: If the session is not started or offers neither method.
        """
        if self._session is None:
            raise RuntimeError("MCP session not started")
        if hasattr(self._session, "list_tools"):
            response = await self._session.list_tools()
            return getattr(response, "tools", response)
        if hasattr(self._session, "tools"):
            return await self._session.tools()
        raise RuntimeError("MCP session missing list_tools")

    async def get_tool(self, tool_name: str) -> Any:
        """Return the listed tool whose ``name`` equals ``tool_name``.

        Raises:
            ValueError: If no listed tool has that name.
        """
        tools = await self.list_tools()
        for tool in tools:
            if tool.name == tool_name:
                return tool
        raise ValueError(f"Tool '{tool_name}' not found")

    async def call_tool(self, tool_name: str, arguments: Mapping[str, Any]) -> Any:
        """Invoke ``tool_name`` with ``arguments`` and return the session's response.

        Raises:
            RuntimeError: If the session is not started.
        """
        if self._session is None:
            raise RuntimeError("MCP session not started")
        return await self._session.call_tool(name=tool_name, arguments=arguments)
|
288
|
+
|
289
|
+
|
290
|
+
def run_sync(coro: Coroutine[Any, Any, _T]) -> _T:
    """Run ``coro`` to completion and return its result.

    With no event loop running in the current thread this is simply
    ``asyncio.run``. When a loop is already running (for example inside a
    notebook), ``asyncio.run`` would raise, so the coroutine is executed on
    its own event loop in a short-lived worker thread and the calling thread
    blocks until it finishes.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread: the plain path.
        return asyncio.run(coro)

    # Imported locally; only needed on the nested-loop path.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(asyncio.run, coro).result()
|
292
|
+
|
293
|
+
|
294
|
+
def list_tools_from_manifest(manifest: MCPManifest) -> Any:
    """Open a client for ``manifest``, list its tools, and return them synchronously."""

    async def _fetch_tools() -> Any:
        async with MCPClient(manifest) as connected:
            tools = await connected.list_tools()
        return tools

    return run_sync(_fetch_tools())
|
300
|
+
|
301
|
+
|
302
|
+
def call_tool_from_manifest(
    manifest: MCPManifest, tool_name: str, arguments: dict[str, Any]
) -> Any:
    """Open a client for ``manifest``, call ``tool_name`` once, and return the response.

    A fresh client is opened and closed for every call.
    """

    async def _invoke() -> Any:
        async with MCPClient(manifest) as connected:
            result = await connected.call_tool(tool_name, arguments)
        return result

    return run_sync(_invoke())
|
310
|
+
|
311
|
+
|
312
|
+
def response_to_text(response: object) -> str:
    """Flatten a tool response into plain text.

    Resolution order:
      * ``response.content`` as a list: the truthy ``text`` attributes of its
        items joined by newlines; if none are found, ``str(content)``.
      * ``response.content`` of any other type: ``str(content)``.
      * ``response.output``: ``str(output)``.
      * otherwise ``str(response)``.
    """
    if not hasattr(response, "content"):
        if hasattr(response, "output"):
            return str(getattr(response, "output"))
        return str(response)

    payload = getattr(response, "content")
    if isinstance(payload, list):
        fragments = [
            fragment
            for fragment in (getattr(part, "text", None) for part in payload)
            if fragment
        ]
        if fragments:
            return "\n".join(fragments)
    return str(payload)
|
327
|
+
|
328
|
+
|
329
|
+
# Markers that delimit the tool description inside a generated system prompt,
# so it can be located again by ``extract_description_from_system``.
PROMPT_TOOL_HEADER = "<<TOOL_DESCRIPTION>>"
PROMPT_TOOL_FOOTER = "<<END_TOOL_DESCRIPTION>>"

# System-prompt scaffolding below is inspired by the MCP section of Cline's
# system prompt (Apache-2.0). See https://github.com/cline/cline for details.
TOOL_USE_GUIDELINES = textwrap.dedent(
    """
    # Tool Use Guidelines

    1. In <thinking> tags, decide what you already know and what information you still need.
    2. Choose the best tool for the current step using the descriptions and schemas provided.
    3. Use one tool call per message, wait for its result, then decide the next step.
    4. Format tool calls exactly with the XML shown in the tool examples.
    5. After each tool call, read the result carefully before responding or calling another tool.
    6. Always incorporate the tool output into your final answer.
    """
).strip()

# Substrings that mark a command-line token as possibly carrying a credential.
_SENSITIVE_ARG_KEYWORDS = ("key", "token", "secret", "password")


def _format_json_block(data: Mapping[str, Any]) -> str:
    """Serialise ``data`` as single-line JSON with sorted keys."""
    return json.dumps(data, sort_keys=True)


def _sanitize_command_args(args: Iterable[Any]) -> list[str]:
    """Return ``args`` with anything that looks like a credential masked.

    A token is sensitive when it contains one of ``_SENSITIVE_ARG_KEYWORDS``
    (case-insensitive):

    * ``--flag=value`` becomes ``--flag=***``;
    * ``--flag`` becomes ``--flag ***`` and the token after it (its value)
      is dropped;
    * any other sensitive token becomes ``***``.
    """
    sanitized: list[str] = []
    skip_value = False
    for raw in args:
        if skip_value:
            # This token is the value of the sensitive flag just emitted.
            skip_value = False
            continue

        token = str(raw)
        lowered = token.lower()
        if not any(keyword in lowered for keyword in _SENSITIVE_ARG_KEYWORDS):
            sanitized.append(token)
        elif not token.startswith("-"):
            sanitized.append("***")
        elif "=" in token:
            flag = token.partition("=")[0]
            sanitized.append(f"{flag}=***")
        else:
            sanitized.append(f"{token} ***")
            skip_value = True
    return sanitized


def system_prompt_from_tool(
    signature: ToolSignature, manifest: MCPManifest | None = None
) -> str:
    """Build the system prompt that presents ``signature`` to the model.

    The prompt is made of up to three sections separated by blank lines: an
    "MCP SERVERS" header (only when ``manifest`` is given), the shared
    ``TOOL_USE_GUIDELINES``, and a body that embeds the tool description
    between ``PROMPT_TOOL_HEADER`` and ``PROMPT_TOOL_FOOTER``.

    The sections are assembled line by line rather than by dedenting an
    interpolated template: a multi-line description or parameter list would
    otherwise defeat ``textwrap.dedent`` and leave the template indented.

    Args:
        signature: Tool whose name, description and parameters are rendered.
        manifest: Optional server manifest; its command line is shown with
            credential-like arguments masked.
    """
    parameters = signature.parameters or {}
    properties = parameters.get("properties") or {}
    parameter_lines = []
    for name, schema in properties.items():
        if not isinstance(schema, Mapping):
            schema = {}
        type_hint = schema.get("type", "any")
        desc = schema.get("description", "")
        parameter_lines.append(f"- {name} ({type_hint}): {desc}")
    parameter_section = (
        "\n".join(parameter_lines) if parameter_lines else "- No structured parameters."
    )

    mcp_header = ""
    if manifest is not None:
        command_line = " ".join(
            [manifest.command, *_sanitize_command_args(manifest.args or [])]
        )
        schema_block = _format_json_block(parameters) if parameters else "{}"
        mcp_header = "\n".join(
            [
                "MCP SERVERS",
                "",
                "The Model Context Protocol (MCP) enables communication between the system and locally running MCP servers that provide additional tools and resources to extend your capabilities.",
                "",
                "# Connected MCP Servers",
                "",
                "When a server is connected, you can use the server's tools via the `use_mcp_tool` tool, and access the server's resources via the `access_mcp_resource` tool.",
                "",
                f"## {manifest.name} (`{command_line}`)",
                "",
                "### Available Tools",
                f"- {signature.name}: {signature.description}",
                "Input Schema:",
                schema_block,
            ]
        ).strip()

    body = "\n".join(
        [
            "You are an assistant that answers developer questions using the available MCP tool.",
            "Always decide whether the tool is required before answering.",
            "Always call the tool at least once before replying and incorporate the returned documentation into your answer (quote key terms, mention the library ID).",
            "",
            "Tool description:",
            PROMPT_TOOL_HEADER,
            f"{signature.description}",
            PROMPT_TOOL_FOOTER,
            "",
            "Tool parameters:",
            parameter_section,
            "When you call the tool, read its response carefully before replying.",
        ]
    ).strip()

    sections = [mcp_header, TOOL_USE_GUIDELINES, body]
    return "\n\n".join(section for section in sections if section).strip()


def extract_description_from_system(system_prompt: str) -> str | None:
    """Return the text between the tool-description markers, stripped.

    The footer is searched for only after the first header, so a stray
    footer earlier in the prompt does not produce an empty result.

    Returns:
        The description, or ``None`` when the header is absent or no footer
        follows it.
    """
    header_at = system_prompt.find(PROMPT_TOOL_HEADER)
    if header_at == -1:
        return None
    start = header_at + len(PROMPT_TOOL_HEADER)
    end = system_prompt.find(PROMPT_TOOL_FOOTER, start)
    if end == -1:
        return None
    return system_prompt[start:end].strip()
|
445
|
+
|
446
|
+
|
447
|
+
def load_tool_signature_from_manifest(
    manifest: MCPManifest, tool_name: str
) -> ToolSignature:
    """Fetch ``tool_name`` from the server in ``manifest`` as a ``ToolSignature``.

    The tool is dumped with ``model_dump(by_alias=True)`` and mapped onto a
    function-style tool entry. Fields that are dumped as ``None`` (for
    example a tool without a description) are normalised to empty values so
    the resulting signature always carries a string description and a
    mapping of parameters.

    Raises:
        ValueError: If the server lists no tool named ``tool_name``.
    """
    tools = list_tools_from_manifest(manifest)
    tool = next(
        (
            candidate
            for candidate in tools
            if getattr(candidate, "name", None) == tool_name
        ),
        None,
    )
    if tool is None:
        raise ValueError(f"Tool '{tool_name}' not found")

    entry = tool.model_dump(by_alias=True)
    annotations = entry.get("annotations") or {}
    return ToolSignature.from_tool_entry(
        {
            "type": "function",
            "function": {
                "name": entry.get("name") or tool_name,
                # ``.get(key, default)`` does not cover keys present as None.
                "description": entry.get("description") or "",
                "parameters": entry.get("inputSchema") or {},
                "examples": annotations.get("examples"),
            },
        }
    )
|
470
|
+
|
471
|
+
|
472
|
+
def score_query_tool(
    manifest: MCPManifest,
    tool_name: str,
    dataset: Iterable[Mapping[str, Any]],
    description: str,
    argument_key: str = "query",
) -> float:
    """Score a tool description against query-style dataset records.

    Records without a value under ``arguments[argument_key]`` are ignored.
    Every remaining record counts towards the total. The tool is called only
    when the query shares at least one lower-cased, whitespace-separated
    token with ``description``. A record succeeds when it has a reference
    (``reference_answer``, else ``expected_answer_contains``) whose
    whitespace-normalised, lower-cased form has a ``SequenceMatcher`` ratio
    of at least 0.6 with the tool's response text.

    Returns:
        successes / counted records, or ``0.0`` when nothing was counted.
    """
    vocabulary = set(description.lower().split())
    counted = 0
    matched = 0

    for record in dataset:
        arguments = record.get("arguments", {})
        query = arguments.get(argument_key, "")
        if not query:
            continue

        counted += 1
        if vocabulary.isdisjoint(set(query.lower().split())):
            continue

        answer = response_to_text(
            call_tool_from_manifest(manifest, tool_name, arguments)
        )
        reference = record.get("reference_answer") or record.get(
            "expected_answer_contains", ""
        )
        if not reference:
            continue

        # NOTE(review): "expected_answer_contains" is scored by similarity
        # ratio, not substring containment — confirm this is intended.
        normalized_reference = " ".join(reference.lower().split())
        normalized_answer = " ".join(answer.lower().split())
        similarity = SequenceMatcher(
            None, normalized_reference, normalized_answer
        ).ratio()
        if similarity >= 0.6:
            matched += 1

    if not counted:
        return 0.0
    return matched / counted
|
505
|
+
|
506
|
+
|
507
|
+
def score_url_tool(
    manifest: MCPManifest,
    tool_name: str,
    dataset: Iterable[Mapping[str, Any]],
    description: str,
    argument_key: str = "url",
) -> float:
    """Score a tool description against URL-style dataset records.

    Records without a value under ``arguments[argument_key]`` are ignored.
    Every remaining record counts towards the total. The tool is called only
    when one of the dot-separated, lower-cased parts of the URL's host also
    appears among the lower-cased tokens of ``description``. A record
    succeeds when it has a reference (``reference_answer``, else
    ``expected_answer_contains``) whose whitespace-normalised, lower-cased
    form has a ``SequenceMatcher`` ratio of at least 0.6 with the tool's
    response text.

    Returns:
        successes / counted records, or ``0.0`` when nothing was counted.
    """
    from urllib.parse import urlparse

    vocabulary = set(description.lower().split())
    counted = 0
    matched = 0

    for record in dataset:
        arguments = record.get("arguments", {})
        url = arguments.get(argument_key, "")
        if not url:
            continue

        counted += 1
        host_parts = set(urlparse(url).netloc.lower().split("."))
        if vocabulary.isdisjoint(host_parts):
            continue

        answer = response_to_text(
            call_tool_from_manifest(manifest, tool_name, arguments)
        )
        reference = record.get("reference_answer") or record.get(
            "expected_answer_contains", ""
        )
        if not reference:
            continue

        normalized_reference = " ".join(reference.lower().split())
        normalized_answer = " ".join(answer.lower().split())
        similarity = SequenceMatcher(
            None, normalized_reference, normalized_answer
        ).ratio()
        if similarity >= 0.6:
            matched += 1

    if not counted:
        return 0.0
    return matched / counted
|
@@ -0,0 +1,152 @@
|
|
1
|
+
"""Utilities for coordinating multi-pass MCP prompt evaluations."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from contextvars import ContextVar
|
7
|
+
from typing import Any, Optional
|
8
|
+
from collections.abc import Callable
|
9
|
+
|
10
|
+
|
11
|
+
# Callable taking (dataset item, tool summary) and returning the follow-up
# user message, or None when there is nothing to add.
FollowUpBuilder = Callable[[dict[str, Any], str], Optional[str]]
|
12
|
+
|
13
|
+
|
14
|
+
def _insert_tool_message(
    *,
    messages: list[dict[str, Any]],
    tool_name: str,
    tool_content: str,
) -> list[dict[str, Any]]:
    """Return a copy of ``messages`` with the tool result spliced in.

    The result is added as an extra message with role ``"assistant"``,
    placed directly after the first assistant message. When there is no
    assistant message it is appended at the end instead. The input list is
    not modified.
    """
    first_assistant = next(
        (
            position
            for position, message in enumerate(messages)
            if message.get("role") == "assistant"
        ),
        None,
    )

    if first_assistant is None:
        logger.debug("No assistant message found; appending summary for %s", tool_name)
        fallback = {
            "role": "assistant",
            "content": (f"Tool result from `{tool_name}`:\n\n{tool_content}"),
        }
        return [*messages, fallback]

    logger.debug("Inserting tool summary for %s after assistant message", tool_name)
    tool_message = {
        "role": "assistant",
        "content": f"Here is the result from tool `{tool_name}`:\n\n{tool_content}",
    }
    split_at = first_assistant + 1
    return [*messages[:split_at], tool_message, *messages[split_at:]]
|
50
|
+
|
51
|
+
|
52
|
+
def extract_user_query(dataset_item: dict[str, Any]) -> str | None:
    """Pull a user query out of a dataset item, if one can be found.

    A truthy top-level ``user_query`` is returned as-is. Otherwise, when
    ``input`` is a dict, the first non-blank string among its ``query``,
    ``user_query`` and ``prompt`` entries (in that order) is returned.
    ``None`` means no query was found.
    """
    direct = dataset_item.get("user_query")
    if direct:
        return direct

    nested = dataset_item.get("input")
    if not isinstance(nested, dict):
        return None

    candidates = (nested.get(key) for key in ("query", "user_query", "prompt"))
    return next(
        (
            candidate
            for candidate in candidates
            if isinstance(candidate, str) and candidate.strip()
        ),
        None,
    )
|
67
|
+
|
68
|
+
|
69
|
+
class MCPSecondPassCoordinator:
    """Keeps the latest MCP tool summary and assembles second-pass messages.

    The summary lives in a caller-supplied ``ContextVar``. The follow-up user
    message, if any, comes from the supplied ``follow_up_builder``.

    TODO(opik-mcp): Replace this shim once optimizers understand multi-pass flows
    natively and expose tool transcripts without direct ChatPrompt mutation.
    """

    def __init__(
        self,
        *,
        tool_name: str,
        summary_var: ContextVar[str | None],
        follow_up_builder: FollowUpBuilder,
    ) -> None:
        self._tool_name = tool_name
        self._summary_var = summary_var
        self._follow_up_builder = follow_up_builder
        # Values used by the most recent successful second-pass build.
        self._last_summary: str | None = None
        self._last_follow_up: str | None = None

    @property
    def tool_name(self) -> str:
        """Name of the tool this coordinator tracks."""
        return self._tool_name

    def reset(self) -> None:
        """Clear the pending summary in the context variable."""
        self._summary_var.set(None)

    def record_summary(self, summary: str) -> None:
        """Store ``summary`` as the pending summary."""
        logger.debug("Recording summary for %s", self.tool_name)
        self._summary_var.set(summary)

    def fetch_summary(self) -> str | None:
        """Return the pending summary without clearing it."""
        return self._summary_var.get()

    def get_last_summary(self) -> str | None:
        """Return the summary used by the last successful second-pass build."""
        return self._last_summary

    def build_second_pass_messages(
        self,
        *,
        base_messages: list[dict[str, Any]],
        dataset_item: dict[str, Any],
        summary_override: str | None = None,
    ) -> list[dict[str, Any]] | None:
        """Build the message list for a second pass that includes the tool summary.

        ``summary_override`` is used when it is not ``None``; otherwise the
        pending summary is used. With no usable summary the method returns
        ``None`` and leaves the pending summary untouched. Otherwise the
        summary is inserted into a copy of ``base_messages``, an optional
        follow-up user message is appended, the values are remembered, and
        the pending summary is cleared.
        """
        self._last_summary = None
        self._last_follow_up = None

        if summary_override is None:
            summary = self.fetch_summary()
        else:
            summary = summary_override

        if not summary:
            logger.debug(
                "No summary available for %s; skipping second pass", self.tool_name
            )
            return None

        logger.debug("Summary captured for %s", self.tool_name)
        preview = summary[:160].replace("\n", " ")
        logger.debug("Summary preview for %s: %s", self.tool_name, preview)

        second_pass = _insert_tool_message(
            messages=base_messages,
            tool_name=self.tool_name,
            tool_content=summary,
        )

        follow_up = self._follow_up_builder(dataset_item, summary)
        if follow_up:
            second_pass.append({"role": "user", "content": follow_up})
            logger.debug(
                "Follow-up appended for %s: %s", self.tool_name, follow_up[:120]
            )

        self._last_summary = summary
        self._last_follow_up = follow_up
        self.reset()
        return second_pass
|
150
|
+
|
151
|
+
|
152
|
+
# Module-level logger. The helpers above look it up at call time, so defining
# it at the end of the module works.
logger = logging.getLogger(__name__)
|