@peerlm/mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -0
- package/dist/index.js +679 -0
- package/package.json +26 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,679 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/index.ts
|
|
4
|
+
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
5
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
6
|
+
import {
|
|
7
|
+
CallToolRequestSchema,
|
|
8
|
+
ListToolsRequestSchema
|
|
9
|
+
} from "@modelcontextprotocol/sdk/types.js";
|
|
10
|
+
|
|
11
|
+
// src/client.ts
|
|
12
|
+
// Error type for PeerLM API failures. `status` carries the HTTP status
// code of the failed request (0 when the host could not be reached at all).
var PeerLMError = class extends Error {
  constructor(status, message) {
    super(message);
    this.name = "PeerLMError";
    this.status = status;
  }
};
|
|
19
|
+
// Thin client for the PeerLM REST API (v1).
// Every method resolves with the `data` field of the API's JSON envelope and
// throws PeerLMError on transport failures or non-2xx responses.
var PeerLMClient = class {
  baseUrl;
  apiKey;
  /**
   * @param {string} apiKey2 - API key sent on every request as X-API-Key.
   * @param {string} baseUrl2 - PeerLM base URL; one trailing slash is stripped.
   */
  constructor(apiKey2, baseUrl2) {
    this.apiKey = apiKey2;
    this.baseUrl = baseUrl2.replace(/\/$/, "");
  }
  /**
   * Issue a request to `/api/v1${path}` and unwrap the response envelope.
   * @param {string} method - HTTP verb ("GET", "POST", ...).
   * @param {string} path - API path, starting with "/".
   * @param {object} [body] - Optional payload, JSON-encoded when present.
   * @returns {Promise<*>} the `data` field of the JSON response.
   * @throws {PeerLMError} status 0 when the host is unreachable; otherwise the
   *         HTTP status, with the API's `error` message when one was returned.
   */
  async request(method, path, body) {
    const url = `${this.baseUrl}/api/v1${path}`;
    const headers = {
      "X-API-Key": this.apiKey,
      "Content-Type": "application/json"
    };
    let res;
    try {
      res = await fetch(url, {
        method,
        headers,
        body: body ? JSON.stringify(body) : void 0
      });
    } catch (err) {
      throw new PeerLMError(
        0,
        `PeerLM API unreachable at ${this.baseUrl}. Check your PEERLM_BASE_URL and network connection.`
      );
    }
    // Fix: parse the body defensively. Proxies/gateways can answer with
    // non-JSON (HTML error pages, empty bodies); previously res.json() threw
    // a bare SyntaxError with no HTTP context. Those cases now surface as a
    // PeerLMError carrying the status code.
    let json = null;
    try {
      json = await res.json();
    } catch {
      json = null;
    }
    if (!res.ok) {
      const message = json?.error || `HTTP ${res.status}`;
      throw new PeerLMError(res.status, message);
    }
    if (json === null) {
      throw new PeerLMError(res.status, `Unexpected non-JSON response from ${url}`);
    }
    return json.data;
  }
  /** List all evaluation suites in the workspace. */
  async listSuites() {
    return this.request("GET", "/suites");
  }
  /** Fetch one suite by ID. */
  async getSuite(suiteId) {
    return this.request("GET", `/suites/${suiteId}`);
  }
  /** List models, optionally filtered by provider and/or tier. */
  async listModels(params) {
    const query = new URLSearchParams();
    if (params?.provider) query.set("provider", params.provider);
    if (params?.tier) query.set("tier", params.tier);
    const qs = query.toString();
    return this.request("GET", `/models${qs ? `?${qs}` : ""}`);
  }
  /** Fetch plan and credit usage for the current account. */
  async getUsage() {
    return this.request("GET", "/usage");
  }
  /** Create a system prompt (persona). */
  async createSystemPrompt(data) {
    return this.request("POST", "/system-prompts", data);
  }
  /** Create a test prompt (task). */
  async createTestPrompt(data) {
    return this.request("POST", "/test-prompts", data);
  }
  /** Create an evaluation suite. */
  async createSuite(data) {
    return this.request("POST", "/suites", data);
  }
  /** Start an asynchronous evaluation run for a suite. */
  async runEval(suiteId) {
    return this.request("POST", "/runs", { suite_id: suiteId });
  }
  /** Fetch status/results for a run. */
  async getResults(runId) {
    return this.request("GET", `/runs/${runId}`);
  }
};
|
|
84
|
+
|
|
85
|
+
// src/formatters.ts
|
|
86
|
+
// Render the suite collection as a markdown-ish bullet list: one bullet per
// suite with name + version, its ID, and the optional description.
// NOTE(review): the continuation lines of the template literals below come
// from a dump that strips leading whitespace; the original indentation of the
// "ID:" / description lines may have differed — confirm against the built
// artifact before relying on exact output formatting.
function formatSuiteList(suites) {
  // Empty workspace gets a plain sentence rather than an empty list.
  if (suites.length === 0) return "No evaluation suites found.";
  const lines = suites.map(
    (s) => `- **${s.name}** (v${s.version})
ID: ${s.id}${s.description ? `
${s.description}` : ""}`
  );
  // Bullets are separated by a blank line for readability.
  return `Found ${suites.length} suite(s):

${lines.join("\n\n")}`;
}
|
|
97
|
+
/**
 * Render a suite's full configuration as human-readable text: header lines,
 * then (when a configuration is present) model lists, prompt counts,
 * sampling settings, criteria, and thresholds.
 */
function formatSuiteDetail(suite) {
  const out = [`**${suite.name}** (v${suite.version})`, `ID: ${suite.id}`];
  if (suite.description) {
    out.push(`Description: ${suite.description}`);
  }
  const config = suite.configuration;
  if (config) {
    out.push(
      "",
      `Generator Models: ${config.generatorModels.length} model(s)`,
      ...config.generatorModels.map((id) => ` - ${id}`),
      `Evaluator Models: ${config.evaluatorModels.length} model(s)`,
      ...config.evaluatorModels.map((id) => ` - ${id}`),
      `System Prompts: ${config.systemPrompts.length}`,
      `Test Prompts: ${config.testPrompts.length}`,
      `Samples per Prompt: ${config.responsesPerTopicPerPersona || 1}`,
      `Deterministic Mode: ${config.deterministicMode ? "Yes" : "No"}`
    );
    if (config.criteria?.length) {
      out.push("", "Criteria:");
      for (const c of config.criteria) {
        out.push(` - ${c.label} (weight: ${c.weight}): ${c.description}`);
      }
    }
    const t = config.thresholds;
    if (t) {
      out.push("", "Thresholds:");
      if (t.min_overall_score != null) {
        out.push(` Min Overall Score: ${t.min_overall_score}`);
      }
      if (t.max_latency_ms != null) {
        out.push(` Max Latency: ${t.max_latency_ms}ms`);
      }
      if (t.min_criteria_scores) {
        for (const [criterion, min] of Object.entries(t.min_criteria_scores)) {
          out.push(` Min ${criterion}: ${min}`);
        }
      }
    }
  }
  return out.join("\n");
}
|
|
137
|
+
// Group models by provider (insertion order preserved by Map) and render a
// markdown-ish catalog: a bold provider heading followed by one entry per
// model with tier, optional pricing, and ID.
// NOTE(review): continuation lines of the multi-line template literals below
// were reconstructed from a whitespace-mangled dump; the original indentation
// of the "ID:" line may have differed — confirm against the built artifact.
function formatModelList(models) {
  // Filters can legitimately match nothing.
  if (models.length === 0) return "No models found matching your filters.";
  // provider -> models listed under that provider, in input order.
  const byProvider = /* @__PURE__ */ new Map();
  for (const m of models) {
    const list = byProvider.get(m.provider) || [];
    list.push(m);
    byProvider.set(m.provider, list);
  }
  const sections = [];
  for (const [provider, provModels] of byProvider) {
    const lines = provModels.map((m) => {
      // Pricing is appended only when both prompt and completion rates exist.
      const pricing = m.pricingPrompt && m.pricingCompletion ? ` | $${m.pricingPrompt}/$${m.pricingCompletion} per 1M tokens` : "";
      return ` - **${m.name}** [${m.tier}]${pricing}
ID: ${m.id}`;
    });
    sections.push(`**${provider}**
${lines.join("\n")}`);
  }
  // Provider sections are separated by blank lines.
  return `${models.length} model(s) available:

${sections.join("\n\n")}`;
}
|
|
159
|
+
/**
 * Render an evaluation run as readable text: progress details while it is
 * queued/running, the error message when it failed, and the verdict,
 * leaderboard, and decision insights once completed.
 */
function formatRunStatus(run) {
  const out = [
    `**${run.name || "Evaluation Run"}**`,
    `Run ID: ${run.id}`,
    `Status: ${run.status}`
  ];
  const inFlight = run.status === "queued" || run.status === "running";
  if (inFlight) {
    const p = run.progress;
    if (p) {
      out.push(`Phase: ${p.phase}`);
      switch (p.phase) {
        case "generating": {
          const pct = p.totalResponses > 0 ? Math.round(p.completedResponses / p.totalResponses * 100) : 0;
          out.push(`Progress: ${p.completedResponses}/${p.totalResponses} responses (${pct}%)`);
          if (p.cacheHits > 0) out.push(`Cache Hits: ${p.cacheHits}`);
          break;
        }
        case "evaluating": {
          const pct = p.totalEvaluations > 0 ? Math.round(p.completedEvaluations / p.totalEvaluations * 100) : 0;
          out.push(`Progress: ${p.completedEvaluations}/${p.totalEvaluations} evaluations (${pct}%)`);
          break;
        }
        case "aggregating":
          out.push("Aggregating results...");
          break;
      }
    }
    if (run.evalCreditsEstimated) {
      out.push(`Estimated Credits: ${run.evalCreditsEstimated}`);
    }
    return out.join("\n");
  }
  if (run.status === "failed") {
    out.push(`Error: ${run.errorMessage || "Unknown error"}`);
    return out.join("\n");
  }
  // Completed run: verdict, cost, leaderboard, and decision insights.
  if (run.passed != null) {
    out.push(`Result: ${run.passed ? "PASSED" : "FAILED"}`);
  }
  if (run.evalCreditsConsumed != null) {
    out.push(`Credits Used: ${run.evalCreditsConsumed}`);
  }
  if (run.resultsSummary?.leaderboard?.length) {
    out.push("", formatLeaderboard(run.resultsSummary.leaderboard));
  }
  const dm = run.resultsSummary?.decisionMeta;
  if (dm) {
    if (dm.recommendation) {
      out.push("", `Recommendation: ${dm.recommendation}`);
    }
    if (dm.insights?.length) {
      out.push("Insights:");
      for (const insight of dm.insights) {
        out.push(` - ${insight}`);
      }
    }
  }
  if (run.thresholdViolations) {
    out.push("", `Threshold Violations: ${JSON.stringify(run.thresholdViolations)}`);
  }
  return out.join("\n");
}
|
|
224
|
+
/**
 * Render leaderboard entries as a plain-text markdown-style table.
 * Latency is rounded to whole milliseconds, cost shown to 4 decimal places;
 * missing latency/cost values render as "-".
 */
function formatLeaderboard(entries) {
  const table = [
    "Leaderboard:",
    "Rank | Model | Score | Latency | Cost",
    "-----|-------|-------|---------|-----"
  ];
  for (const entry of entries) {
    const latency = entry.avgLatencyMs != null ? `${Math.round(entry.avgLatencyMs)}ms` : "-";
    const cost = entry.totalCostUsd != null ? `$${entry.totalCostUsd.toFixed(4)}` : "-";
    table.push(`#${entry.rank} | ${entry.modelName} | ${entry.overallScore.toFixed(2)} | ${latency} | ${cost}`);
  }
  return table.join("\n");
}
|
|
237
|
+
/**
 * Summarize plan and credit usage, one fact per line. The overage and
 * pay-as-you-go lines appear only when those billing options are enabled.
 */
function formatUsage(usage) {
  let text = `Plan: ${usage.plan}\n`;
  text += `Credits: ${usage.creditsRemaining} remaining / ${usage.creditsIncluded} included\n`;
  text += `Credits Used: ${usage.creditsUsed}`;
  if (usage.overageEnabled) text += "\nOverage Billing: Enabled";
  if (usage.paygEnabled) text += "\nPay-as-you-go: Enabled";
  return text;
}
|
|
247
|
+
|
|
248
|
+
// src/tools/shared.ts
|
|
249
|
+
/**
 * Convert any thrown value into an MCP tool error result
 * ({ content: [...], isError: true }). Known PeerLM HTTP statuses are mapped
 * to actionable guidance; everything else falls back to the error's own
 * message, or a generic sentence for non-Error values.
 */
function handleToolError(err) {
  const asResult = (text) => ({ content: [{ type: "text", text }], isError: true });
  if (err instanceof PeerLMError) {
    switch (err.status) {
      case 401:
        return asResult("Invalid or expired API key. Check your PEERLM_API_KEY. API access requires a Pro or Enterprise plan.");
      case 402:
        return asResult("Insufficient credits. Visit PeerLM to add credits or upgrade your plan.");
      case 403:
        return asResult(err.message || "This action requires a read-write API key.");
      default:
        return asResult(err.message);
    }
  }
  return asResult(err instanceof Error ? err.message : "An unexpected error occurred.");
}
|
|
270
|
+
|
|
271
|
+
// src/tools/list-suites.ts
|
|
272
|
+
// Tool: list_suites — enumerate the workspace's evaluation suites.
var listSuitesDescription = "List all evaluation suites in your PeerLM workspace. Returns suite names, IDs, versions, and descriptions.";
/** Fetch all suites via the API client and return them as formatted text. */
async function listSuitesHandler(client2) {
  try {
    const text = formatSuiteList(await client2.listSuites());
    return { content: [{ type: "text", text }] };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
281
|
+
|
|
282
|
+
// src/tools/get-suite.ts
|
|
283
|
+
// Tool: get_suite — full configuration of a single suite.
var getSuiteSchema = {
  suite_id: {
    type: "string",
    description: "The ID of the evaluation suite to retrieve"
  }
};
var getSuiteDescription = "Get full details of an evaluation suite including generator/evaluator models, prompts, criteria, and thresholds.";
/** Fetch one suite by ID and return its formatted detail view. */
async function getSuiteHandler(client2, args) {
  try {
    const text = formatSuiteDetail(await client2.getSuite(args.suite_id));
    return { content: [{ type: "text", text }] };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
298
|
+
|
|
299
|
+
// src/tools/list-models.ts
|
|
300
|
+
// Tool: list_models — browse the model catalog with optional filters.
var listModelsSchema = {
  provider: {
    type: "string",
    description: "Filter by provider (e.g. openai, anthropic, google, mistral)"
  },
  tier: {
    type: "string",
    description: "Filter by tier (standard, advanced, premium, frontier)"
  }
};
var listModelsDescription = "List available LLM models in PeerLM with their IDs, providers, tiers, and pricing. Use these IDs when creating suites.";
/** Fetch (optionally filtered) models and return them as formatted text. */
async function listModelsHandler(client2, args) {
  try {
    const text = formatModelList(await client2.listModels(args));
    return { content: [{ type: "text", text }] };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
319
|
+
|
|
320
|
+
// src/tools/get-usage.ts
|
|
321
|
+
// Tool: get_usage — account plan and credit balance.
var getUsageDescription = "Get your current PeerLM plan, credit balance, and usage information.";
/** Fetch usage via the API client and return it as formatted text. */
async function getUsageHandler(client2) {
  try {
    const text = formatUsage(await client2.getUsage());
    return { content: [{ type: "text", text }] };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
330
|
+
|
|
331
|
+
// src/tools/create-system-prompt.ts
|
|
332
|
+
// Tool: create_system_prompt — add a persona to the prompt library.
var createSystemPromptSchema = {
  name: {
    type: "string",
    description: "A descriptive name for the system prompt"
  },
  system_prompt: {
    type: "string",
    description: "The full system prompt text that defines the role/context given to models during evaluation"
  },
  description: {
    type: "string",
    description: "Optional description of what this system prompt tests"
  },
  tags: {
    type: "array",
    items: { type: "string" },
    description: "Optional tags for organization"
  }
};
var createSystemPromptDescription = "Create a system prompt (persona) in PeerLM's library. System prompts define the role/context given to models during evaluation. Returns the prompt ID needed for creating suites.";
/** Create the prompt and echo back its ID/name (plus description when set). */
async function createSystemPromptHandler(client2, args) {
  try {
    const created = await client2.createSystemPrompt(args);
    const parts = [
      "System prompt created successfully.",
      "",
      `ID: ${created.id}`,
      `Name: ${created.name}`
    ];
    if (created.description) parts.push(`Description: ${created.description}`);
    return { content: [{ type: "text", text: parts.join("\n") }] };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
371
|
+
|
|
372
|
+
// src/tools/create-test-prompt.ts
|
|
373
|
+
// Tool: create_test_prompt — add an evaluation task to the prompt library.
var createTestPromptSchema = {
  title: {
    type: "string",
    description: "A descriptive title for the test prompt"
  },
  prompt: {
    type: "string",
    description: "The actual question or task that models will be evaluated on"
  },
  description: {
    type: "string",
    description: "Optional description of what this prompt tests"
  },
  tags: {
    type: "array",
    items: { type: "string" },
    description: "Optional tags for organization"
  }
};
var createTestPromptDescription = "Create a test prompt (task) in PeerLM's library. Test prompts are the actual questions/tasks models will be evaluated on. Returns the prompt ID needed for creating suites.";
/** Create the prompt and echo back its ID/title (plus description when set). */
async function createTestPromptHandler(client2, args) {
  try {
    const created = await client2.createTestPrompt(args);
    const parts = [
      "Test prompt created successfully.",
      "",
      `ID: ${created.id}`,
      `Title: ${created.title}`
    ];
    if (created.description) parts.push(`Description: ${created.description}`);
    return { content: [{ type: "text", text: parts.join("\n") }] };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
412
|
+
|
|
413
|
+
// src/tools/create-suite.ts
|
|
414
|
+
// Tool: create_suite — define an evaluation: which models compete, which
// models judge, which prompts are used, and how responses are scored.
// JSON Schema property map for the tool's input (required keys are declared
// at registration time in the tools/list handler).
var createSuiteSchema = {
  name: {
    type: "string",
    description: "Name for the evaluation suite"
  },
  description: {
    type: "string",
    description: "Optional description of the evaluation purpose"
  },
  generator_models: {
    type: "array",
    items: { type: "string" },
    description: "Model IDs to compare (minimum 2). Use list_models to find IDs."
  },
  evaluator_models: {
    type: "array",
    items: { type: "string" },
    description: "Model IDs to use as evaluators (minimum 1). Use list_models to find IDs."
  },
  system_prompt_ids: {
    type: "array",
    items: { type: "string" },
    description: "IDs of system prompts to use. Create them first with create_system_prompt."
  },
  test_prompt_ids: {
    type: "array",
    items: { type: "string" },
    description: "IDs of test prompts to use. Create them first with create_test_prompt."
  },
  criteria: {
    type: "array",
    items: {
      type: "object",
      properties: {
        label: { type: "string", description: "Criterion name (e.g. Accuracy)" },
        description: { type: "string", description: "What the evaluator should assess" },
        weight: { type: "number", description: "Relative weight (default 1)" }
      },
      required: ["label", "description", "weight"]
    },
    description: "Evaluation criteria with labels, descriptions, and weights"
  },
  deterministic_mode: {
    type: "boolean",
    description: "If true, sets temperature=0 and fixed seed where supported (default false)"
  },
  samples_per_prompt: {
    type: "number",
    description: "Number of response samples per model per prompt (default 1)"
  },
  evaluation_method: {
    type: "string",
    enum: ["rubric", "comparative"],
    description: 'Evaluation method (default "rubric")'
  }
};
var createSuiteDescription = "Create an evaluation suite that defines which models to compare, which prompts to test, and which criteria to score on. Use list_models to find model IDs, and create_system_prompt/create_test_prompt to create prompts first.";
// POST the args straight through to the API and report the new suite's
// identity; errors become MCP error results via handleToolError.
// NOTE(review): the success-message template below is multi-line; its lines
// were reconstructed flush-left from a whitespace-stripped dump — confirm
// against the built artifact.
async function createSuiteHandler(client2, args) {
  try {
    const result = await client2.createSuite(args);
    return {
      content: [
        {
          type: "text",
          text: `Suite created successfully.

ID: ${result.id}
Name: ${result.name}
Version: ${result.version}

You can now run an evaluation with run_eval using this suite ID.`
        }
      ]
    };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
492
|
+
|
|
493
|
+
// src/tools/run-eval.ts
|
|
494
|
+
// Tool: run_eval — kick off an asynchronous evaluation of a suite.
var runEvalSchema = {
  suite_id: {
    type: "string",
    description: "The ID of the evaluation suite to run"
  }
};
var runEvalDescription = "Trigger an evaluation run for a suite. The evaluation runs asynchronously \u2014 use get_results to check progress and retrieve results.";
/** Start a run and report its ID, status, and credit estimate when present. */
async function runEvalHandler(client2, args) {
  try {
    const run = await client2.runEval(args.suite_id);
    const report = [
      "Evaluation run started.",
      "",
      `Run ID: ${run.id}`,
      `Status: ${run.status}`
    ];
    if (run.evalCreditsEstimated) {
      report.push(`Estimated Credits: ${run.evalCreditsEstimated}`);
    }
    report.push(
      "",
      "The evaluation is running asynchronously. Use get_results with this run ID to check progress and retrieve results."
    );
    return { content: [{ type: "text", text: report.join("\n") }] };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
522
|
+
|
|
523
|
+
// src/tools/get-results.ts
|
|
524
|
+
// Tool: get_results — poll a run for progress or final results.
var getResultsSchema = {
  run_id: {
    type: "string",
    description: "The ID of the evaluation run to check"
  }
};
var getResultsDescription = "Get the status and results of an evaluation run. Shows progress while running, and a full leaderboard with scores, latency, cost, and decision insights when completed.";
/** Fetch the run by ID and return its formatted status/results text. */
async function getResultsHandler(client2, args) {
  try {
    const text = formatRunStatus(await client2.getResults(args.run_id));
    return { content: [{ type: "text", text }] };
  } catch (err) {
    return handleToolError(err);
  }
}
|
|
539
|
+
|
|
540
|
+
// src/index.ts
|
|
541
|
+
// src/index.ts — process entry point: validate configuration, build the API
// client, register the MCP request handlers, and start serving over stdio.

// Fail fast (before any MCP traffic) when the mandatory API key is missing.
var apiKey = process.env.PEERLM_API_KEY;
if (!apiKey) {
  console.error(
    'Error: PEERLM_API_KEY environment variable is required.\n\nSet it in your MCP client config:\n "env": { "PEERLM_API_KEY": "plm_live_..." }\n\nGenerate an API key at https://app.peerlm.com/settings/api-keys'
  );
  process.exit(1);
}
// Base URL is overridable via env for self-hosted or staging deployments.
var baseUrl = process.env.PEERLM_BASE_URL || "https://app.peerlm.com";
var client = new PeerLMClient(apiKey, baseUrl);
// The MCP server advertises only the `tools` capability.
var server = new Server(
  { name: "peerlm", version: "0.1.0" },
  { capabilities: { tools: {} } }
);
// tools/list: static catalog of the nine PeerLM tools and their input schemas.
server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [
    {
      name: "list_suites",
      description: listSuitesDescription,
      inputSchema: { type: "object", properties: {} }
    },
    {
      name: "get_suite",
      description: getSuiteDescription,
      inputSchema: {
        type: "object",
        properties: getSuiteSchema,
        required: ["suite_id"]
      }
    },
    {
      name: "list_models",
      description: listModelsDescription,
      inputSchema: {
        type: "object",
        properties: listModelsSchema
      }
    },
    {
      name: "get_usage",
      description: getUsageDescription,
      inputSchema: { type: "object", properties: {} }
    },
    {
      name: "create_system_prompt",
      description: createSystemPromptDescription,
      inputSchema: {
        type: "object",
        properties: createSystemPromptSchema,
        required: ["name", "system_prompt"]
      }
    },
    {
      name: "create_test_prompt",
      description: createTestPromptDescription,
      inputSchema: {
        type: "object",
        properties: createTestPromptSchema,
        required: ["title", "prompt"]
      }
    },
    {
      name: "create_suite",
      description: createSuiteDescription,
      inputSchema: {
        type: "object",
        properties: createSuiteSchema,
        required: [
          "name",
          "generator_models",
          "evaluator_models",
          "system_prompt_ids",
          "test_prompt_ids",
          "criteria"
        ]
      }
    },
    {
      name: "run_eval",
      description: runEvalDescription,
      inputSchema: {
        type: "object",
        properties: runEvalSchema,
        required: ["suite_id"]
      }
    },
    {
      name: "get_results",
      description: getResultsDescription,
      inputSchema: {
        type: "object",
        properties: getResultsSchema,
        required: ["run_id"]
      }
    }
  ]
}));
// tools/call: dispatch by tool name to the matching handler. Handlers catch
// their own errors (via handleToolError), so no try/catch is needed here.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: args } = request.params;
  switch (name) {
    case "list_suites":
      return listSuitesHandler(client);
    case "get_suite":
      return getSuiteHandler(client, args);
    case "list_models":
      return listModelsHandler(client, args);
    case "get_usage":
      return getUsageHandler(client);
    case "create_system_prompt":
      return createSystemPromptHandler(
        client,
        args
      );
    case "create_test_prompt":
      return createTestPromptHandler(
        client,
        args
      );
    case "create_suite":
      return createSuiteHandler(client, args);
    case "run_eval":
      return runEvalHandler(client, args);
    case "get_results":
      return getResultsHandler(client, args);
    default:
      return {
        content: [{ type: "text", text: `Unknown tool: ${name}` }],
        isError: true
      };
  }
});
// Connect over stdio. Logs go to stderr so stdout stays a clean JSON-RPC
// channel for the MCP client.
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error("PeerLM MCP server running on stdio");
}
main().catch((err) => {
  console.error("Fatal error:", err);
  process.exit(1);
});
|
package/package.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@peerlm/mcp",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "MCP server for PeerLM — run evaluations from Claude Desktop, Cursor, or any MCP client",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"peerlm-mcp": "dist/index.js"
|
|
8
|
+
},
|
|
9
|
+
"main": "./dist/index.js",
|
|
10
|
+
"files": [
|
|
11
|
+
"dist"
|
|
12
|
+
],
|
|
13
|
+
"scripts": {
|
|
14
|
+
"build": "tsup src/index.ts --format esm --target node20 --dts --clean",
|
|
15
|
+
"type-check": "tsc --noEmit",
|
|
16
|
+
"dev": "tsup src/index.ts --format esm --target node20 --watch"
|
|
17
|
+
},
|
|
18
|
+
"dependencies": {
|
|
19
|
+
"@modelcontextprotocol/sdk": "^1.12.1"
|
|
20
|
+
},
|
|
21
|
+
"devDependencies": {
|
|
22
|
+
"@types/node": "^22.0.0",
|
|
23
|
+
"tsup": "^8.3.0",
|
|
24
|
+
"typescript": "^5.7.0"
|
|
25
|
+
}
|
|
26
|
+
}
|