@dedesfr/prompter 0.8.1 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/dist/cli/index.js +1 -1
- package/package.json +1 -1
- package/skills/mcp-builder/LICENSE.txt +202 -0
- package/skills/mcp-builder/SKILL.md +236 -0
- package/skills/mcp-builder/reference/evaluation.md +602 -0
- package/skills/mcp-builder/reference/mcp_best_practices.md +249 -0
- package/skills/mcp-builder/reference/node_mcp_server.md +970 -0
- package/skills/mcp-builder/reference/python_mcp_server.md +719 -0
- package/skills/mcp-builder/scripts/connections.py +151 -0
- package/skills/mcp-builder/scripts/evaluation.py +373 -0
- package/skills/mcp-builder/scripts/example_evaluation.xml +22 -0
- package/skills/mcp-builder/scripts/requirements.txt +2 -0
- package/skills/project-orchestrator/SKILL.md +10 -2
- package/skills/project-orchestrator/assets/plan-summary-template.md +1 -0
- package/skills/skill-creator/LICENSE.txt +202 -0
- package/skills/skill-creator/SKILL.md +485 -0
- package/skills/skill-creator/agents/analyzer.md +274 -0
- package/skills/skill-creator/agents/comparator.md +202 -0
- package/skills/skill-creator/agents/grader.md +223 -0
- package/skills/skill-creator/assets/eval_review.html +146 -0
- package/skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/skill-creator/references/schemas.md +430 -0
- package/skills/skill-creator/scripts/__init__.py +0 -0
- package/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/skill-creator/scripts/generate_report.py +326 -0
- package/skills/skill-creator/scripts/improve_description.py +247 -0
- package/skills/skill-creator/scripts/package_skill.py +136 -0
- package/skills/skill-creator/scripts/quick_validate.py +103 -0
- package/skills/skill-creator/scripts/run_eval.py +310 -0
- package/skills/skill-creator/scripts/run_loop.py +328 -0
- package/skills/skill-creator/scripts/utils.py +47 -0
- package/src/cli/index.ts +1 -1
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Lightweight connection handling for MCP servers."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from contextlib import AsyncExitStack
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from mcp import ClientSession, StdioServerParameters
|
|
8
|
+
from mcp.client.sse import sse_client
|
|
9
|
+
from mcp.client.stdio import stdio_client
|
|
10
|
+
from mcp.client.streamable_http import streamablehttp_client
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MCPConnection(ABC):
|
|
14
|
+
"""Base class for MCP server connections."""
|
|
15
|
+
|
|
16
|
+
def __init__(self):
|
|
17
|
+
self.session = None
|
|
18
|
+
self._stack = None
|
|
19
|
+
|
|
20
|
+
@abstractmethod
|
|
21
|
+
def _create_context(self):
|
|
22
|
+
"""Create the connection context based on connection type."""
|
|
23
|
+
|
|
24
|
+
async def __aenter__(self):
|
|
25
|
+
"""Initialize MCP server connection."""
|
|
26
|
+
self._stack = AsyncExitStack()
|
|
27
|
+
await self._stack.__aenter__()
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
ctx = self._create_context()
|
|
31
|
+
result = await self._stack.enter_async_context(ctx)
|
|
32
|
+
|
|
33
|
+
if len(result) == 2:
|
|
34
|
+
read, write = result
|
|
35
|
+
elif len(result) == 3:
|
|
36
|
+
read, write, _ = result
|
|
37
|
+
else:
|
|
38
|
+
raise ValueError(f"Unexpected context result: {result}")
|
|
39
|
+
|
|
40
|
+
session_ctx = ClientSession(read, write)
|
|
41
|
+
self.session = await self._stack.enter_async_context(session_ctx)
|
|
42
|
+
await self.session.initialize()
|
|
43
|
+
return self
|
|
44
|
+
except BaseException:
|
|
45
|
+
await self._stack.__aexit__(None, None, None)
|
|
46
|
+
raise
|
|
47
|
+
|
|
48
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
49
|
+
"""Clean up MCP server connection resources."""
|
|
50
|
+
if self._stack:
|
|
51
|
+
await self._stack.__aexit__(exc_type, exc_val, exc_tb)
|
|
52
|
+
self.session = None
|
|
53
|
+
self._stack = None
|
|
54
|
+
|
|
55
|
+
async def list_tools(self) -> list[dict[str, Any]]:
|
|
56
|
+
"""Retrieve available tools from the MCP server."""
|
|
57
|
+
response = await self.session.list_tools()
|
|
58
|
+
return [
|
|
59
|
+
{
|
|
60
|
+
"name": tool.name,
|
|
61
|
+
"description": tool.description,
|
|
62
|
+
"input_schema": tool.inputSchema,
|
|
63
|
+
}
|
|
64
|
+
for tool in response.tools
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
|
|
68
|
+
"""Call a tool on the MCP server with provided arguments."""
|
|
69
|
+
result = await self.session.call_tool(tool_name, arguments=arguments)
|
|
70
|
+
return result.content
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class MCPConnectionStdio(MCPConnection):
    """MCP connection over the stdio client.

    Stores the command, its argument list (an empty list when none is given)
    and an optional environment mapping for the stdio server parameters.
    """

    def __init__(self, command: str, args: list[str] = None, env: dict[str, str] = None):
        super().__init__()
        self.command = command
        self.args = args if args else []
        self.env = env

    def _create_context(self):
        """Build the stdio client context from the stored parameters."""
        server_params = StdioServerParameters(
            command=self.command,
            args=self.args,
            env=self.env,
        )
        return stdio_client(server_params)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class MCPConnectionSSE(MCPConnection):
    """MCP connection over the SSE client, addressed by URL with optional headers."""

    def __init__(self, url: str, headers: dict[str, str] = None):
        super().__init__()
        self.url = url
        # A missing or empty header mapping is normalised to a fresh dict.
        self.headers = headers if headers else {}

    def _create_context(self):
        """Build the SSE client context for the stored URL and headers."""
        sse_kwargs = {"url": self.url, "headers": self.headers}
        return sse_client(**sse_kwargs)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class MCPConnectionHTTP(MCPConnection):
    """MCP connection over the streamable HTTP client, addressed by URL with optional headers."""

    def __init__(self, url: str, headers: dict[str, str] = None):
        super().__init__()
        self.url = url
        # A missing or empty header mapping is normalised to a fresh dict.
        self.headers = headers if headers else {}

    def _create_context(self):
        """Build the streamable HTTP client context for the stored URL and headers."""
        http_kwargs = {"url": self.url, "headers": self.headers}
        return streamablehttp_client(**http_kwargs)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def create_connection(
    transport: str,
    command: str = None,
    args: list[str] = None,
    env: dict[str, str] = None,
    url: str = None,
    headers: dict[str, str] = None,
) -> MCPConnection:
    """Build the MCP connection object matching ``transport``.

    The transport name is compared case-insensitively. ``"http"``,
    ``"streamable_http"`` and ``"streamable-http"`` all select the HTTP
    connection.

    Args:
        transport: Connection type ("stdio", "sse", or "http")
        command: Command to run (stdio only)
        args: Command arguments (stdio only)
        env: Environment variables (stdio only)
        url: Server URL (sse and http only)
        headers: HTTP headers (sse and http only)

    Returns:
        MCPConnection instance

    Raises:
        ValueError: If the transport is unsupported, or the command/URL that
            the chosen transport requires is missing or empty.
    """
    kind = transport.lower()

    if kind == "stdio":
        if command:
            return MCPConnectionStdio(command=command, args=args, env=env)
        raise ValueError("Command is required for stdio transport")

    if kind == "sse":
        if url:
            return MCPConnectionSSE(url=url, headers=headers)
        raise ValueError("URL is required for sse transport")

    if kind in ("http", "streamable_http", "streamable-http"):
        if url:
            return MCPConnectionHTTP(url=url, headers=headers)
        raise ValueError("URL is required for http transport")

    raise ValueError(f"Unsupported transport type: {kind}. Use 'stdio', 'sse', or 'http'")
|
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
"""MCP Server Evaluation Harness
|
|
2
|
+
|
|
3
|
+
This script evaluates MCP servers by running test questions against them using Claude.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import asyncio
|
|
8
|
+
import json
|
|
9
|
+
import re
|
|
10
|
+
import sys
|
|
11
|
+
import time
|
|
12
|
+
import traceback
|
|
13
|
+
import xml.etree.ElementTree as ET
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from anthropic import Anthropic
|
|
18
|
+
|
|
19
|
+
from connections import create_connection
|
|
20
|
+
|
|
21
|
+
EVALUATION_PROMPT = """You are an AI assistant with access to tools.
|
|
22
|
+
|
|
23
|
+
When given a task, you MUST:
|
|
24
|
+
1. Use the available tools to complete the task
|
|
25
|
+
2. Provide summary of each step in your approach, wrapped in <summary> tags
|
|
26
|
+
3. Provide feedback on the tools provided, wrapped in <feedback> tags
|
|
27
|
+
4. Provide your final response, wrapped in <response> tags
|
|
28
|
+
|
|
29
|
+
Summary Requirements:
|
|
30
|
+
- In your <summary> tags, you must explain:
|
|
31
|
+
- The steps you took to complete the task
|
|
32
|
+
- Which tools you used, in what order, and why
|
|
33
|
+
- The inputs you provided to each tool
|
|
34
|
+
- The outputs you received from each tool
|
|
35
|
+
- A summary for how you arrived at the response
|
|
36
|
+
|
|
37
|
+
Feedback Requirements:
|
|
38
|
+
- In your <feedback> tags, provide constructive feedback on the tools:
|
|
39
|
+
- Comment on tool names: Are they clear and descriptive?
|
|
40
|
+
- Comment on input parameters: Are they well-documented? Are required vs optional parameters clear?
|
|
41
|
+
- Comment on descriptions: Do they accurately describe what the tool does?
|
|
42
|
+
- Comment on any errors encountered during tool usage: Did the tool fail to execute? Did the tool return too many tokens?
|
|
43
|
+
- Identify specific areas for improvement and explain WHY they would help
|
|
44
|
+
- Be specific and actionable in your suggestions
|
|
45
|
+
|
|
46
|
+
Response Requirements:
|
|
47
|
+
- Your response should be concise and directly address what was asked
|
|
48
|
+
- Always wrap your final response in <response> tags
|
|
49
|
+
- If you cannot solve the task return <response>NOT_FOUND</response>
|
|
50
|
+
- For numeric responses, provide just the number
|
|
51
|
+
- For IDs, provide just the ID
|
|
52
|
+
- For names or text, provide the exact text requested
|
|
53
|
+
- Your response should go last"""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def parse_evaluation_file(file_path: Path) -> list[dict[str, Any]]:
    """Load question/answer pairs from an XML evaluation file.

    Every ``qa_pair`` element anywhere in the document that has both a
    ``question`` and an ``answer`` child contributes one entry; the text of
    each child is whitespace-stripped (missing text becomes ``""``). Pairs
    lacking either child are skipped.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts, or an empty list
        if the file cannot be read or parsed (the error is printed).
    """
    try:
        root = ET.parse(file_path).getroot()
        parsed_pairs = []

        for pair in root.iterfind(".//qa_pair"):
            question_node = pair.find("question")
            answer_node = pair.find("answer")
            if question_node is None or answer_node is None:
                continue

            question_text = (question_node.text or "").strip()
            answer_text = (answer_node.text or "").strip()
            parsed_pairs.append({"question": question_text, "answer": answer_text})
    except Exception as e:
        print(f"Error parsing evaluation file {file_path}: {e}")
        return []
    else:
        return parsed_pairs
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def extract_xml_content(text: str, tag: str) -> str | None:
|
|
80
|
+
"""Extract content from XML tags."""
|
|
81
|
+
pattern = rf"<{tag}>(.*?)</{tag}>"
|
|
82
|
+
matches = re.findall(pattern, text, re.DOTALL)
|
|
83
|
+
return matches[-1].strip() if matches else None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
async def agent_loop(
    client: Anthropic,
    model: str,
    question: str,
    tools: list[dict[str, Any]],
    connection: Any,
) -> tuple[str, dict[str, Any]]:
    """Run the agent loop with MCP tools.

    Sends ``question`` to the model and, for as long as the model stops with
    ``tool_use``, executes every requested tool call through ``connection``
    and returns all results to the model in a single user message.

    Args:
        client: Client whose ``messages.create`` is run in a worker thread
            via ``asyncio.to_thread``.
        model: Model identifier passed to the API.
        question: The user question that starts the conversation.
        tools: Tool definitions forwarded unchanged to the API.
        connection: Object exposing ``await call_tool(name, arguments)``.

    Returns:
        ``(response_text, tool_metrics)``. ``response_text`` is the text of
        the final assistant message (its text blocks joined by newlines), or
        ``None`` when that message has no text block. ``tool_metrics`` maps
        each tool name to ``{"count": int, "durations": [seconds, ...]}``.
    """
    messages = [{"role": "user", "content": question}]

    async def request_turn():
        # One model turn; the assistant reply is always recorded in the history.
        reply = await asyncio.to_thread(
            client.messages.create,
            model=model,
            max_tokens=4096,
            system=EVALUATION_PROMPT,
            messages=messages,
            tools=tools,
        )
        messages.append({"role": "assistant", "content": reply.content})
        return reply

    response = await request_turn()
    tool_metrics = {}

    while response.stop_reason == "tool_use":
        tool_results = []

        # A single assistant turn may request several tools; every tool_use
        # block needs a matching tool_result in the next user message.
        for block in response.content:
            if block.type != "tool_use":
                continue

            tool_name = block.name
            tool_start = time.perf_counter()
            try:
                tool_result = await connection.call_tool(tool_name, block.input)
                tool_response = _stringify_tool_result(tool_result)
            except Exception as e:
                # Errors are handed back to the model as the tool output so it
                # can react to them and comment on them in its feedback.
                tool_response = f"Error executing tool {tool_name}: {str(e)}\n"
                tool_response += traceback.format_exc()
            tool_duration = time.perf_counter() - tool_start

            metrics = tool_metrics.setdefault(tool_name, {"count": 0, "durations": []})
            metrics["count"] += 1
            metrics["durations"].append(tool_duration)

            tool_results.append({
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": tool_response,
            })

        if not tool_results:
            # stop_reason claimed tool_use but no tool_use block was present;
            # there is nothing to answer, so stop instead of looping.
            break

        messages.append({"role": "user", "content": tool_results})
        response = await request_turn()

    text_parts = [block.text for block in response.content if hasattr(block, "text")]
    response_text = "\n".join(text_parts) if text_parts else None
    return response_text, tool_metrics


def _json_fallback(obj: Any) -> Any:
    """``json.dumps`` fallback for objects it cannot encode natively."""
    # NOTE(review): presumably MCP content blocks are pydantic models exposing
    # ``model_dump`` — confirm against the installed SDK version.
    dump = getattr(obj, "model_dump", None)
    if callable(dump):
        return dump()
    return str(obj)


def _stringify_tool_result(tool_result: Any) -> str:
    """Render a tool result as the string sent back to the model."""
    if isinstance(tool_result, (dict, list)):
        # Containers may hold non-JSON objects; the fallback keeps a
        # successful call from being reported as a serialization error.
        return json.dumps(tool_result, default=_json_fallback)
    return str(tool_result)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
async def evaluate_single_task(
    client: Anthropic,
    model: str,
    qa_pair: dict[str, Any],
    tools: list[dict[str, Any]],
    connection: Any,
    task_index: int,
) -> dict[str, Any]:
    """Evaluate a single QA pair with the given tools.

    Runs the agent loop on ``qa_pair["question"]`` and extracts the
    ``<response>``, ``<summary>`` and ``<feedback>`` sections from the reply.

    Args:
        client: Client forwarded to ``agent_loop``.
        model: Model identifier forwarded to ``agent_loop``.
        qa_pair: Dict with ``"question"`` and ``"answer"`` keys.
        tools: Tool definitions forwarded to ``agent_loop``.
        connection: MCP connection forwarded to ``agent_loop``.
        task_index: Zero-based task index, used only for the progress message.

    Returns:
        A result dict with the keys ``question``, ``expected``, ``actual``,
        ``score`` (1 only when the extracted response equals the expected
        answer exactly, else 0), ``total_duration`` (seconds), ``tool_calls``,
        ``num_tool_calls``, ``summary`` and ``feedback``.
    """
    start_time = time.time()

    print(f"Task {task_index + 1}: Running task with question: {qa_pair['question']}")
    response, tool_metrics = await agent_loop(client, model, qa_pair["question"], tools, connection)

    # The agent loop yields None when the final message has no text block;
    # treat that as an empty reply so the task scores 0 instead of crashing.
    response_text = response or ""
    response_value = extract_xml_content(response_text, "response")
    summary = extract_xml_content(response_text, "summary")
    feedback = extract_xml_content(response_text, "feedback")

    duration_seconds = time.time() - start_time

    return {
        "question": qa_pair["question"],
        "expected": qa_pair["answer"],
        "actual": response_value,
        "score": int(response_value == qa_pair["answer"]) if response_value else 0,
        "total_duration": duration_seconds,
        "tool_calls": tool_metrics,
        "num_tool_calls": sum(len(metrics["durations"]) for metrics in tool_metrics.values()),
        "summary": summary,
        "feedback": feedback,
    }
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# Markdown header of the evaluation report. Filled with str.format using the
# fields: correct, total, accuracy, average_duration_s, average_tool_calls,
# total_tool_calls.
REPORT_HEADER = """
# Evaluation Report

## Summary

- **Accuracy**: {correct}/{total} ({accuracy:.1f}%)
- **Average Task Duration**: {average_duration_s:.2f}s
- **Average Tool Calls per Task**: {average_tool_calls:.2f}
- **Total Tool Calls**: {total_tool_calls}

---
"""

# Markdown section for one task. Filled with str.format using the fields:
# task_num, question, expected_answer, actual_answer, correct_indicator,
# total_duration, tool_calls, summary, feedback.
TASK_TEMPLATE = """
### Task {task_num}

**Question**: {question}
**Ground Truth Answer**: `{expected_answer}`
**Actual Answer**: `{actual_answer}`
**Correct**: {correct_indicator}
**Duration**: {total_duration:.2f}s
**Tool Calls**: {tool_calls}

**Summary**
{summary}

**Feedback**
{feedback}

---
"""
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
async def run_evaluation(
    eval_path: Path,
    connection: Any,
    model: str = "claude-3-7-sonnet-20250219",
) -> str:
    """Run evaluation with MCP server tools.

    Lists the server's tools through ``connection``, loads the QA pairs from
    ``eval_path``, evaluates them one after another and renders a Markdown
    report.

    Args:
        eval_path: Path to the XML evaluation file.
        connection: Open MCP connection exposing ``list_tools``.
        model: Model identifier used for every task.

    Returns:
        The report: ``REPORT_HEADER`` followed by one ``TASK_TEMPLATE``
        section per task. With no tasks all summary figures are 0.
    """
    # NOTE(review): the emoji in the messages below were reconstructed from
    # mis-encoded source text — confirm against the upstream file.
    print("🚀 Starting Evaluation")

    client = Anthropic()

    tools = await connection.list_tools()
    print(f"📋 Loaded {len(tools)} tools from MCP server")

    qa_pairs = parse_evaluation_file(eval_path)
    print(f"📋 Loaded {len(qa_pairs)} evaluation tasks")

    results = []
    for i, qa_pair in enumerate(qa_pairs):
        print(f"Processing task {i + 1}/{len(qa_pairs)}")
        result = await evaluate_single_task(client, model, qa_pair, tools, connection, i)
        results.append(result)

    total = len(results)
    correct = sum(r["score"] for r in results)
    total_tool_calls = sum(r["num_tool_calls"] for r in results)
    total_duration = sum(r["total_duration"] for r in results)

    # Guard the divisions so an empty evaluation file yields an all-zero report.
    accuracy = (correct / total) * 100 if total else 0
    average_duration_s = total_duration / total if total else 0
    average_tool_calls = total_tool_calls / total if total else 0

    report = REPORT_HEADER.format(
        correct=correct,
        total=total,
        accuracy=accuracy,
        average_duration_s=average_duration_s,
        average_tool_calls=average_tool_calls,
        total_tool_calls=total_tool_calls,
    )

    # Each result already carries its own question and expected answer.
    report += "".join(
        TASK_TEMPLATE.format(
            task_num=task_num,
            question=result["question"],
            expected_answer=result["expected"],
            actual_answer=result["actual"] or "N/A",
            correct_indicator="✅" if result["score"] else "❌",
            total_duration=result["total_duration"],
            tool_calls=json.dumps(result["tool_calls"], indent=2),
            summary=result["summary"] or "N/A",
            feedback=result["feedback"] or "N/A",
        )
        for task_num, result in enumerate(results, start=1)
    )

    return report
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def parse_headers(header_list: list[str]) -> dict[str, str]:
    """Turn ``'Key: Value'`` strings into a header dictionary.

    Each entry is split on its first colon; key and value are stripped of
    surrounding whitespace. Entries without a colon are skipped with a
    printed warning. A missing or empty list yields an empty dict.
    """
    parsed = {}

    for raw in header_list or []:
        name, colon, value = raw.partition(":")
        if not colon:
            print(f"Warning: Ignoring malformed header: {raw}")
            continue
        parsed[name.strip()] = value.strip()

    return parsed
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def parse_env_vars(env_list: list[str]) -> dict[str, str]:
    """Turn ``'KEY=VALUE'`` strings into an environment dictionary.

    Each entry is split on its first ``=``; key and value are stripped of
    surrounding whitespace. Entries without ``=`` are skipped with a printed
    warning. A missing or empty list yields an empty dict.
    """
    if not env_list:
        return {}

    collected = {}
    for item in env_list:
        pieces = item.split("=", 1)
        if len(pieces) != 2:
            print(f"Warning: Ignoring malformed environment variable: {item}")
            continue
        collected[pieces[0].strip()] = pieces[1].strip()
    return collected
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
async def main():
    """Command-line entry point for the evaluation harness.

    Parses the arguments, builds the MCP connection for the chosen transport,
    runs the evaluation while the connection is open, and either writes the
    report to ``--output`` or prints it. Exits with status 1 when the
    evaluation file does not exist or the connection arguments are invalid.
    """
    # The multi-value options (-a, -e, -H) consume every following bare
    # argument, so the examples put the evaluation file first.
    parser = argparse.ArgumentParser(
        description="Evaluate MCP servers using test questions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Evaluate a local stdio MCP server
  python evaluation.py eval.xml -t stdio -c python -a my_server.py

  # Evaluate an SSE MCP server
  python evaluation.py eval.xml -t sse -u https://example.com/mcp -H "Authorization: Bearer token"

  # Evaluate an HTTP MCP server with custom model
  python evaluation.py eval.xml -t http -u https://example.com/mcp -m claude-3-5-sonnet-20241022
""",
    )

    parser.add_argument("eval_file", type=Path, help="Path to evaluation XML file")
    parser.add_argument("-t", "--transport", choices=["stdio", "sse", "http"], default="stdio", help="Transport type (default: stdio)")
    parser.add_argument("-m", "--model", default="claude-3-7-sonnet-20250219", help="Claude model to use (default: claude-3-7-sonnet-20250219)")

    stdio_group = parser.add_argument_group("stdio options")
    stdio_group.add_argument("-c", "--command", help="Command to run MCP server (stdio only)")
    stdio_group.add_argument("-a", "--args", nargs="+", help="Arguments for the command (stdio only)")
    stdio_group.add_argument("-e", "--env", nargs="+", help="Environment variables in KEY=VALUE format (stdio only)")

    remote_group = parser.add_argument_group("sse/http options")
    remote_group.add_argument("-u", "--url", help="MCP server URL (sse/http only)")
    remote_group.add_argument("-H", "--header", nargs="+", dest="headers", help="HTTP headers in 'Key: Value' format (sse/http only)")

    parser.add_argument("-o", "--output", type=Path, help="Output file for evaluation report (default: stdout)")

    args = parser.parse_args()

    if not args.eval_file.exists():
        print(f"Error: Evaluation file not found: {args.eval_file}")
        sys.exit(1)

    headers = parse_headers(args.headers) if args.headers else None
    env_vars = parse_env_vars(args.env) if args.env else None

    try:
        connection = create_connection(
            transport=args.transport,
            command=args.command,
            args=args.args,
            env=env_vars,
            url=args.url,
            headers=headers,
        )
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)

    # NOTE(review): the emoji in the messages below were reconstructed from
    # mis-encoded source text — confirm against the upstream file.
    print(f"🔗 Connecting to MCP server via {args.transport}...")

    async with connection:
        print("✅ Connected successfully")
        report = await run_evaluation(args.eval_file, connection, args.model)

        if args.output:
            # The report contains non-ASCII characters; write it as UTF-8
            # instead of relying on the platform default encoding.
            args.output.write_text(report, encoding="utf-8")
            print(f"\n✅ Report saved to {args.output}")
        else:
            print("\n" + report)


if __name__ == "__main__":
    asyncio.run(main())
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
<evaluation>
|
|
2
|
+
<qa_pair>
|
|
3
|
+
<question>Calculate the compound interest on $10,000 invested at 5% annual interest rate, compounded monthly for 3 years. What is the final amount in dollars (rounded to 2 decimal places)?</question>
|
|
4
|
+
<answer>11614.72</answer>
|
|
5
|
+
</qa_pair>
|
|
6
|
+
<qa_pair>
|
|
7
|
+
<question>A projectile is launched at a 45-degree angle with an initial velocity of 50 m/s. Calculate the total distance (in meters) it has traveled from the launch point after 2 seconds, assuming g=9.8 m/s². Round to 2 decimal places.</question>
|
|
8
|
+
<answer>87.25</answer>
|
|
9
|
+
</qa_pair>
|
|
10
|
+
<qa_pair>
|
|
11
|
+
<question>A sphere has a volume of 500 cubic meters. Calculate its surface area in square meters. Round to 2 decimal places.</question>
|
|
12
|
+
<answer>304.65</answer>
|
|
13
|
+
</qa_pair>
|
|
14
|
+
<qa_pair>
|
|
15
|
+
<question>Calculate the population standard deviation of this dataset: [12, 15, 18, 22, 25, 30, 35]. Round to 2 decimal places.</question>
|
|
16
|
+
<answer>7.61</answer>
|
|
17
|
+
</qa_pair>
|
|
18
|
+
<qa_pair>
|
|
19
|
+
<question>Calculate the pH of a solution with a hydrogen ion concentration of 3.5 × 10^-5 M. Round to 2 decimal places.</question>
|
|
20
|
+
<answer>4.46</answer>
|
|
21
|
+
</qa_pair>
|
|
22
|
+
</evaluation>
|
|
@@ -50,6 +50,7 @@ Use the `AskUserQuestion` tool for **every question** in the interview. This ren
|
|
|
50
50
|
{ "label": "React + Convex", "description": "React (Vite or Next.js) + Convex (real-time backend + built-in DB)" },
|
|
51
51
|
{ "label": "Laravel Classic", "description": "Laravel + Blade + Tailwind + PostgreSQL/MySQL" },
|
|
52
52
|
{ "label": "Laravel + React", "description": "Laravel + Inertia.js (React) + PostgreSQL/MySQL" },
|
|
53
|
+
{ "label": "Laravel + Filament", "description": "Laravel + Filament (admin panel & CRUD) + Tailwind + PostgreSQL/MySQL" },
|
|
53
54
|
{ "label": "Unsure", "description": "I'll recommend based on your project needs" }
|
|
54
55
|
]
|
|
55
56
|
}
|
|
@@ -101,6 +102,7 @@ Always use these exact commands when scaffolding projects. Include the correct c
|
|
|
101
102
|
| Express | `npm install express --save` |
|
|
102
103
|
| NestJS | `npm i -g @nestjs/cli && nest new {project_name}` |
|
|
103
104
|
| Laravel 12 | `composer create-project laravel/laravel:^12.0 {project_name}` |
|
|
105
|
+
| Filament | `composer require filament/filament && php artisan filament:install --panels` |
|
|
104
106
|
| React + Convex | `npm create convex@latest` |
|
|
105
107
|
|
|
106
108
|
**Rules:**
|
|
@@ -108,6 +110,7 @@ Always use these exact commands when scaffolding projects. Include the correct c
|
|
|
108
110
|
- For Bundle 1 (JS/TS Full-Stack): include the frontend command (React via Vite or Next.js) AND the backend command (Express or NestJS).
|
|
109
111
|
- For Bundle 2 (React + Convex): include only `npm create convex@latest` -- it scaffolds both the React frontend and Convex backend in one step.
|
|
110
112
|
- For Bundles 3 and 4 (Laravel): include only the Laravel command -- Blade, Inertia, and Tailwind are configured within the Laravel project.
|
|
113
|
+
- For Bundle 5 (Laravel + Filament): include the Laravel command first, then the Filament install command (`composer require filament/filament && php artisan filament:install --panels`).
|
|
111
114
|
- Never invent or substitute alternative installation commands. Use these exactly as shown.
|
|
112
115
|
|
|
113
116
|
---
|
|
@@ -241,11 +244,12 @@ Let's pick your tech stack. Here are four proven bundles:
|
|
|
241
244
|
2. **React + Convex**: React (Vite or Next.js) + Convex (real-time backend + built-in document DB, no SQL setup needed)
|
|
242
245
|
3. **Laravel Classic**: Laravel + Blade + Tailwind CSS + MySQL or PostgreSQL
|
|
243
246
|
4. **Laravel + React**: Laravel + Inertia.js (React) + MySQL or PostgreSQL
|
|
247
|
+
5. **Laravel + Filament**: Laravel + Filament (admin panel & CRUD generator) + Tailwind CSS + MySQL or PostgreSQL
|
|
244
248
|
|
|
245
|
-
Which bundle fits your project best? (Pick 1-4, or say "unsure")
|
|
249
|
+
Which bundle fits your project best? (Pick 1-5, or say "unsure")
|
|
246
250
|
```
|
|
247
251
|
|
|
248
|
-
If unsure: Recommend based on what you've learned (e.g., "Since you need SEO and prefer a simpler setup, I'd go with Laravel Classic -- it's fast to build, great for server-rendered pages, and has excellent built-in tooling." Or "If you want real-time features out of the box with minimal backend setup, React + Convex is a great choice.").
|
|
252
|
+
If unsure: Recommend based on what you've learned (e.g., "Since you need SEO and prefer a simpler setup, I'd go with Laravel Classic -- it's fast to build, great for server-rendered pages, and has excellent built-in tooling." Or "If you want real-time features out of the box with minimal backend setup, React + Convex is a great choice." Or "If your app is primarily an admin panel, back-office tool, or data management system, Laravel + Filament gives you a complete CRUD interface with minimal custom frontend work.").
|
|
249
253
|
|
|
250
254
|
### Sub-Choices
|
|
251
255
|
|
|
@@ -266,6 +270,10 @@ After the user picks a bundle, ask ONLY the necessary sub-choices:
|
|
|
266
270
|
**Bundle 4 sub-choices:**
|
|
267
271
|
- MySQL vs PostgreSQL? (Same guidance as above)
|
|
268
272
|
|
|
273
|
+
**Bundle 5 sub-choices:**
|
|
274
|
+
- MySQL vs PostgreSQL? (Same guidance as above)
|
|
275
|
+
- Filament panels: Admin only, or also a user-facing app panel? (Recommend admin-only for MVP -- add a user-facing panel later if needed. If the user needs a public-facing frontend beyond Filament, suggest combining with Blade or consider Bundle 4 instead.)
|
|
276
|
+
|
|
269
277
|
Provide a brief recommendation for each sub-choice based on the project's stated needs.
|
|
270
278
|
|
|
271
279
|
---
|
|
@@ -112,6 +112,7 @@
|
|
|
112
112
|
# Express: npm install express --save
|
|
113
113
|
# NestJS: npm i -g @nestjs/cli && nest new {project_name}
|
|
114
114
|
# Laravel 12: composer create-project laravel/laravel:^12.0 {project_name}
|
|
115
|
+
# Filament: composer require filament/filament && php artisan filament:install --panels
|
|
115
116
|
```
|
|
116
117
|
> Replace the above with only the command(s) matching the selected stack.
|
|
117
118
|
|