@hasna/evals 0.1.20 → 0.1.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +2 -1
- package/README.md +13 -0
- package/dist/cli/commands/compare.d.ts +2 -0
- package/dist/cli/commands/compare.d.ts.map +1 -1
- package/dist/cli/commands/compare.test.d.ts +2 -0
- package/dist/cli/commands/compare.test.d.ts.map +1 -0
- package/dist/cli/commands/doctor.d.ts.map +1 -1
- package/dist/cli/commands/generate.d.ts.map +1 -1
- package/dist/cli/index.js +57 -14
- package/dist/mcp/http.d.ts +12 -0
- package/dist/mcp/http.d.ts.map +1 -0
- package/dist/mcp/http.test.d.ts +2 -0
- package/dist/mcp/http.test.d.ts.map +1 -0
- package/dist/mcp/index.js +441 -259
- package/dist/mcp/server.d.ts +5 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/package.json +1 -1
package/LICENSE
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
|
|
1
2
|
Apache License
|
|
2
3
|
Version 2.0, January 2004
|
|
3
4
|
http://www.apache.org/licenses/
|
|
@@ -175,7 +176,7 @@
|
|
|
175
176
|
|
|
176
177
|
END OF TERMS AND CONDITIONS
|
|
177
178
|
|
|
178
|
-
Copyright 2026
|
|
179
|
+
Copyright 2026 Hasna, Inc.
|
|
179
180
|
|
|
180
181
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
181
182
|
you may not use this file except in compliance with the License.
|
package/README.md
CHANGED
|
@@ -240,6 +240,19 @@ evals_run_single(
|
|
|
240
240
|
→ PASS — The response correctly identifies Paris.
|
|
241
241
|
```
|
|
242
242
|
|
|
243
|
+
## HTTP mode
|
|
244
|
+
|
|
245
|
+
Shared Streamable HTTP transport for multi-agent sessions (stdio remains the default):
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
evals-mcp --http # http://127.0.0.1:8817/mcp
|
|
249
|
+
MCP_HTTP=1 evals-mcp # same
|
|
250
|
+
evals-mcp --http --port 8817 # explicit port
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
- Health: `GET http://127.0.0.1:8817/health` → `{"status":"ok","name":"evals"}`
|
|
254
|
+
- Override port with `MCP_HTTP_PORT` or `--port`
|
|
255
|
+
|
|
243
256
|
---
|
|
244
257
|
|
|
245
258
|
## License
|
|
@@ -1,3 +1,5 @@
|
|
|
1
1
|
import { Command } from "commander";
|
|
2
|
+
import { compareRuns } from "../../core/reporter.js";
|
|
3
|
+
export declare function renderMarkdownDiff(diff: ReturnType<typeof compareRuns>): string;
|
|
2
4
|
export declare function compareCommand(): Command;
|
|
3
5
|
//# sourceMappingURL=compare.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"compare.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;
|
|
1
|
+
{"version":3,"file":"compare.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,WAAW,EAA+B,MAAM,wBAAwB,CAAC;AAGlF,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,GAAG,MAAM,CA2B/E;AAED,wBAAgB,cAAc,IAAI,OAAO,CAgCxC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compare.test.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.test.ts"],"names":[],"mappings":""}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"doctor.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/doctor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAqCpC,wBAAgB,aAAa,IAAI,OAAO,
|
|
1
|
+
{"version":3,"file":"doctor.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/doctor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAqCpC,wBAAgB,aAAa,IAAI,OAAO,CAqFvC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"generate.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/generate.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAiBpC,wBAAgB,eAAe,IAAI,OAAO,
|
|
1
|
+
{"version":3,"file":"generate.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/generate.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAiBpC,wBAAgB,eAAe,IAAI,OAAO,CAyDzC"}
|
package/dist/cli/index.js
CHANGED
|
@@ -30508,7 +30508,7 @@ function parseAdapterConfig(opts) {
|
|
|
30508
30508
|
|
|
30509
30509
|
// src/cli/commands/run.ts
|
|
30510
30510
|
function runCommand() {
|
|
30511
|
-
return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter, default: default)").option("--command <cmd>", "Shell command (for cli adapter, use {{input}} placeholder)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("--json", "Alias for --output json").action(async (dataset, opts) => {
|
|
30511
|
+
return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter, default: default)").option("--command <cmd>", "Shell command (for cli adapter, use {{input}} placeholder)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("-j, --json", "Alias for --output json").action(async (dataset, opts) => {
|
|
30512
30512
|
const { cases, warnings } = await loadDataset(dataset, {
|
|
30513
30513
|
tags: opts["tags"] ? opts["tags"].split(",") : undefined
|
|
30514
30514
|
});
|
|
@@ -30548,7 +30548,7 @@ init_loader();
|
|
|
30548
30548
|
init_store();
|
|
30549
30549
|
function ciCommand() {
|
|
30550
30550
|
const cmd = new Command("ci").description("Run evals in CI mode \u2014 exit non-zero on regression");
|
|
30551
|
-
cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter)").option("--command <cmd>", "Shell command (for cli adapter)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--no-judge", "Skip LLM judge, assertions only").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("--json", "Output JSON").action(async (dataset, opts) => {
|
|
30551
|
+
cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter)").option("--command <cmd>", "Shell command (for cli adapter)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--no-judge", "Skip LLM judge, assertions only").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("-j, --json", "Output JSON").action(async (dataset, opts) => {
|
|
30552
30552
|
const { cases } = await loadDataset(dataset);
|
|
30553
30553
|
const adapter = parseAdapterConfig(opts);
|
|
30554
30554
|
const run = await runEvals(cases, { dataset, adapter, skipJudge: opts["judge"] === false || opts["noJudge"] === true });
|
|
@@ -30596,7 +30596,7 @@ No baseline "${baselineName}" found \u2014 use "evals ci set-baseline" to create
|
|
|
30596
30596
|
|
|
30597
30597
|
// src/cli/commands/judge.ts
|
|
30598
30598
|
function judgeCommand() {
|
|
30599
|
-
return new Command("judge").description("Ad-hoc: judge a single input/output pair against a rubric").requiredOption("--input <text>", "The input that was given to the AI").requiredOption("--output <text>", "The AI's response").requiredOption("--rubric <text>", "Plain-English grading criteria").option("--expected <text>", "Expected behavior description").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--provider <provider>", "Judge provider: anthropic|openai", "anthropic").option("--json", "Output JSON").action(async (opts) => {
|
|
30599
|
+
return new Command("judge").description("Ad-hoc: judge a single input/output pair against a rubric").requiredOption("--input <text>", "The input that was given to the AI").requiredOption("--output <text>", "The AI's response").requiredOption("--rubric <text>", "Plain-English grading criteria").option("--expected <text>", "Expected behavior description").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--provider <provider>", "Judge provider: anthropic|openai", "anthropic").option("-j, --json", "Output JSON").action(async (opts) => {
|
|
30600
30600
|
const result = await judgeOnce({
|
|
30601
30601
|
input: opts["input"] ?? "",
|
|
30602
30602
|
output: opts["output"] ?? "",
|
|
@@ -30622,8 +30622,34 @@ ${icon} ${result.verdict}
|
|
|
30622
30622
|
|
|
30623
30623
|
// src/cli/commands/compare.ts
|
|
30624
30624
|
init_store();
|
|
30625
|
+
function renderMarkdownDiff(diff) {
|
|
30626
|
+
const lines = ["## Diff", ""];
|
|
30627
|
+
if (diff.regressions.length === 0 && diff.improvements.length === 0) {
|
|
30628
|
+
lines.push("- No changes between runs.");
|
|
30629
|
+
return lines.join(`
|
|
30630
|
+
`);
|
|
30631
|
+
}
|
|
30632
|
+
if (diff.regressions.length > 0) {
|
|
30633
|
+
lines.push("### Regressions");
|
|
30634
|
+
for (const r of diff.regressions) {
|
|
30635
|
+
lines.push(`- ${r.caseId}: ${r.before} -> ${r.after}`);
|
|
30636
|
+
}
|
|
30637
|
+
lines.push("");
|
|
30638
|
+
}
|
|
30639
|
+
if (diff.improvements.length > 0) {
|
|
30640
|
+
lines.push("### Improvements");
|
|
30641
|
+
for (const i of diff.improvements) {
|
|
30642
|
+
lines.push(`- ${i.caseId}: ${i.before} -> ${i.after}`);
|
|
30643
|
+
}
|
|
30644
|
+
lines.push("");
|
|
30645
|
+
}
|
|
30646
|
+
const delta = diff.passRateDelta * 100;
|
|
30647
|
+
lines.push(`- Score delta: ${delta >= 0 ? "+" : ""}${delta.toFixed(1)}%`);
|
|
30648
|
+
return lines.join(`
|
|
30649
|
+
`);
|
|
30650
|
+
}
|
|
30625
30651
|
function compareCommand() {
|
|
30626
|
-
return new Command("compare").description("Compare two eval runs side-by-side").argument("<before>", "Before run ID or baseline name").argument("<after>", "After run ID (or 'latest')").option("--json", "Output JSON diff").option("--markdown", "Output markdown diff").action(async (beforeArg, afterArg, opts) => {
|
|
30652
|
+
return new Command("compare").description("Compare two eval runs side-by-side").argument("<before>", "Before run ID or baseline name").argument("<after>", "After run ID (or 'latest')").option("-j, --json", "Output JSON diff").option("--markdown", "Output markdown diff").action(async (beforeArg, afterArg, opts) => {
|
|
30627
30653
|
const { listRuns: listRuns2 } = await Promise.resolve().then(() => (init_store(), exports_store));
|
|
30628
30654
|
const beforeRun = getRun(beforeArg) ?? getBaseline(beforeArg);
|
|
30629
30655
|
const afterRun = afterArg === "latest" ? listRuns2(1)[0] : getRun(afterArg) ?? getBaseline(afterArg);
|
|
@@ -30640,10 +30666,8 @@ function compareCommand() {
|
|
|
30640
30666
|
console.log(JSON.stringify(diff, null, 2));
|
|
30641
30667
|
} else if (opts["markdown"]) {
|
|
30642
30668
|
console.log(toMarkdown(afterRun));
|
|
30643
|
-
console.log(
|
|
30644
|
-
|
|
30645
|
-
`);
|
|
30646
|
-
printDiffReport(diff);
|
|
30669
|
+
console.log();
|
|
30670
|
+
console.log(renderMarkdownDiff(diff));
|
|
30647
30671
|
} else {
|
|
30648
30672
|
printDiffReport(diff);
|
|
30649
30673
|
}
|
|
@@ -30662,7 +30686,7 @@ var COST_PER_1K_INPUT = {
|
|
|
30662
30686
|
};
|
|
30663
30687
|
var AVG_JUDGE_TOKENS = 800;
|
|
30664
30688
|
function estimateCommand() {
|
|
30665
|
-
return new Command("estimate").description("Estimate cost before running evals (no API calls made)").argument("<dataset>", "Path to JSONL/JSON dataset").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--no-judge", "Assume no judge calls").option("--json", "Output JSON").action(async (dataset, opts) => {
|
|
30689
|
+
return new Command("estimate").description("Estimate cost before running evals (no API calls made)").argument("<dataset>", "Path to JSONL/JSON dataset").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--no-judge", "Assume no judge calls").option("-j, --json", "Output JSON").action(async (dataset, opts) => {
|
|
30666
30690
|
const { cases, warnings } = await loadDataset(dataset);
|
|
30667
30691
|
if (warnings.length > 0)
|
|
30668
30692
|
for (const w of warnings)
|
|
@@ -30713,7 +30737,7 @@ Each case must be a valid JSON object on a single line with these fields:
|
|
|
30713
30737
|
Generate varied cases that cover edge cases, typical usage, and boundary conditions.
|
|
30714
30738
|
Output ONLY valid JSONL \u2014 one JSON object per line, no markdown, no explanation.`;
|
|
30715
30739
|
function generateCommand() {
|
|
30716
|
-
return new Command("generate").description("Generate eval cases from a description using Claude").requiredOption("--description <text>", "What behavior to test (e.g. 'refund policy responses')").option("--seeds <path>", "Path to JSONL file with seed examples").option("--count <n>", "Number of cases to generate", "10").option("--output <path>", "Output JSONL file path", "generated.jsonl").option("--model <model>", "Model to use for generation", "claude-sonnet-4-6").action(async (opts) => {
|
|
30740
|
+
return new Command("generate").description("Generate eval cases from a description using Claude").requiredOption("--description <text>", "What behavior to test (e.g. 'refund policy responses')").option("--seeds <path>", "Path to JSONL file with seed examples").option("--count <n>", "Number of cases to generate", "10").option("--output <path>", "Output JSONL file path", "generated.jsonl").option("--model <model>", "Model to use for generation", "claude-sonnet-4-6").option("-j, --json", "Output JSON summary").action(async (opts) => {
|
|
30717
30741
|
const client = new Anthropic;
|
|
30718
30742
|
const count = parseInt(opts["count"] ?? "10");
|
|
30719
30743
|
let seedText = "";
|
|
@@ -30746,11 +30770,22 @@ Output ${count} JSONL lines starting with {"id":"gen-001",...}`;
|
|
|
30746
30770
|
valid.push(parsed);
|
|
30747
30771
|
} catch {}
|
|
30748
30772
|
}
|
|
30773
|
+
const outputPath = opts["output"] ?? "generated.jsonl";
|
|
30749
30774
|
const output = valid.map((c) => JSON.stringify(c)).join(`
|
|
30750
30775
|
`);
|
|
30751
|
-
writeFileSync(
|
|
30776
|
+
writeFileSync(outputPath, output + `
|
|
30752
30777
|
`);
|
|
30753
|
-
|
|
30778
|
+
if (opts["json"]) {
|
|
30779
|
+
console.log(JSON.stringify({
|
|
30780
|
+
generated: valid.length,
|
|
30781
|
+
requested: count,
|
|
30782
|
+
output: outputPath,
|
|
30783
|
+
model: opts["model"] ?? "claude-sonnet-4-6",
|
|
30784
|
+
description: opts["description"] ?? ""
|
|
30785
|
+
}, null, 2));
|
|
30786
|
+
return;
|
|
30787
|
+
}
|
|
30788
|
+
console.log(`\x1B[32m\u2713 Generated ${valid.length} cases \u2192 ${outputPath}\x1B[0m`);
|
|
30754
30789
|
});
|
|
30755
30790
|
}
|
|
30756
30791
|
|
|
@@ -30844,7 +30879,7 @@ function resolveApiKey(envVar, secretsPath, secretsKey) {
|
|
|
30844
30879
|
return;
|
|
30845
30880
|
}
|
|
30846
30881
|
function doctorCommand() {
|
|
30847
|
-
return new Command("doctor").description("Health check \u2014 verify API keys, DB, and config").action(async () => {
|
|
30882
|
+
return new Command("doctor").description("Health check \u2014 verify API keys, DB, and config").option("-j, --json", "Output JSON").action(async (opts) => {
|
|
30848
30883
|
const checks = [];
|
|
30849
30884
|
const anthropicKey = resolveApiKey("ANTHROPIC_API_KEY", "hasnaxyz/anthropic/live.env", "HASNAXYZ_ANTHROPIC_LIVE_API_KEY");
|
|
30850
30885
|
checks.push({
|
|
@@ -30888,6 +30923,15 @@ function doctorCommand() {
|
|
|
30888
30923
|
} catch {
|
|
30889
30924
|
checks.push({ name: "Example dataset (optional)", ok: false, hint: "datasets/examples/smoke.jsonl not found \u2014 install @hasna/evals globally to include examples" });
|
|
30890
30925
|
}
|
|
30926
|
+
const allOk = checks.every((c) => c.ok || c.name.toLowerCase().includes("optional"));
|
|
30927
|
+
if (opts.json) {
|
|
30928
|
+
console.log(JSON.stringify({
|
|
30929
|
+
ok: allOk,
|
|
30930
|
+
checks,
|
|
30931
|
+
summary: allOk ? "All checks passed." : "Some checks failed \u2014 see hints above."
|
|
30932
|
+
}, null, 2));
|
|
30933
|
+
process.exit(allOk ? 0 : 1);
|
|
30934
|
+
}
|
|
30891
30935
|
console.log(`
|
|
30892
30936
|
\x1B[1mevals doctor\x1B[0m
|
|
30893
30937
|
`);
|
|
@@ -30896,7 +30940,6 @@ function doctorCommand() {
|
|
|
30896
30940
|
console.log(` ${icon} ${c.name}${!c.ok && c.hint ? `
|
|
30897
30941
|
hint: ${c.hint}` : ""}`);
|
|
30898
30942
|
}
|
|
30899
|
-
const allOk = checks.every((c) => c.ok || c.name.toLowerCase().includes("optional"));
|
|
30900
30943
|
console.log(allOk ? `
|
|
30901
30944
|
\x1B[32m All checks passed.\x1B[0m
|
|
30902
30945
|
` : `
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { DEFAULT_MCP_HTTP_PORT, MCP_NAME } from "./server.js";
|
|
2
|
+
export { DEFAULT_MCP_HTTP_PORT, MCP_NAME };
|
|
3
|
+
export declare function isHttpMode(argv?: string[]): boolean;
|
|
4
|
+
export declare function resolveHttpPort(argv?: string[]): number;
|
|
5
|
+
export declare function handleMcpHttpRequest(req: Request): Promise<Response>;
|
|
6
|
+
export interface StartHttpServerOptions {
|
|
7
|
+
port?: number;
|
|
8
|
+
hostname?: string;
|
|
9
|
+
log?: (message: string) => void;
|
|
10
|
+
}
|
|
11
|
+
export declare function startHttpServer(options?: StartHttpServerOptions): ReturnType<typeof Bun.serve>;
|
|
12
|
+
//# sourceMappingURL=http.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"http.d.ts","sourceRoot":"","sources":["../../src/mcp/http.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,qBAAqB,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE3E,OAAO,EAAE,qBAAqB,EAAE,QAAQ,EAAE,CAAC;AAE3C,wBAAgB,UAAU,CAAC,IAAI,GAAE,MAAM,EAA0B,GAAG,OAAO,CAE1E;AAED,wBAAgB,eAAe,CAAC,IAAI,GAAE,MAAM,EAA0B,GAAG,MAAM,CAa9E;AAUD,wBAAsB,oBAAoB,CAAC,GAAG,EAAE,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,CAiB1E;AAED,MAAM,WAAW,sBAAsB;IACrC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;CACjC;AAED,wBAAgB,eAAe,CAAC,OAAO,GAAE,sBAA2B,GAAG,UAAU,CAAC,OAAO,GAAG,CAAC,KAAK,CAAC,CAclG"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"http.test.d.ts","sourceRoot":"","sources":["../../src/mcp/http.test.ts"],"names":[],"mappings":""}
|
package/dist/mcp/index.js
CHANGED
|
@@ -10587,8 +10587,91 @@ var _db = null;
|
|
|
10587
10587
|
var init_store = () => {};
|
|
10588
10588
|
|
|
10589
10589
|
// src/mcp/index.ts
|
|
10590
|
-
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
10591
10590
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
10591
|
+
// package.json
|
|
10592
|
+
var package_default = {
|
|
10593
|
+
name: "@hasna/evals",
|
|
10594
|
+
version: "0.1.22",
|
|
10595
|
+
description: "Open source AI evaluation framework \u2014 LLM-as-judge + assertion-based evals for any AI app. CLI + MCP server.",
|
|
10596
|
+
type: "module",
|
|
10597
|
+
main: "dist/index.js",
|
|
10598
|
+
types: "dist/index.d.ts",
|
|
10599
|
+
bin: {
|
|
10600
|
+
evals: "dist/cli/index.js",
|
|
10601
|
+
"evals-mcp": "dist/mcp/index.js",
|
|
10602
|
+
"evals-serve": "dist/server/index.js"
|
|
10603
|
+
},
|
|
10604
|
+
exports: {
|
|
10605
|
+
".": {
|
|
10606
|
+
types: "./dist/index.d.ts",
|
|
10607
|
+
import: "./dist/index.js"
|
|
10608
|
+
}
|
|
10609
|
+
},
|
|
10610
|
+
files: [
|
|
10611
|
+
"dist",
|
|
10612
|
+
"dashboard/dist",
|
|
10613
|
+
"datasets/examples",
|
|
10614
|
+
"LICENSE",
|
|
10615
|
+
"README.md"
|
|
10616
|
+
],
|
|
10617
|
+
scripts: {
|
|
10618
|
+
build: "cd dashboard && bun run build && cd .. && bun build src/cli/index.ts --outdir dist/cli --target bun --external ink --external react --external chalk --external @modelcontextprotocol/sdk && bun build src/mcp/index.ts --outdir dist/mcp --target bun --external @modelcontextprotocol/sdk && bun build src/server/index.ts --outdir dist/server --target bun && bun build src/index.ts --outdir dist --target bun && tsc --emitDeclarationOnly --outDir dist",
|
|
10619
|
+
"build:dashboard": "cd dashboard && bun run build",
|
|
10620
|
+
typecheck: "tsc --noEmit",
|
|
10621
|
+
test: "bun test",
|
|
10622
|
+
"dev:cli": "bun run src/cli/index.ts",
|
|
10623
|
+
"dev:mcp": "bun run src/mcp/index.ts",
|
|
10624
|
+
"dev:serve": "bun run src/server/index.ts",
|
|
10625
|
+
prepublishOnly: "bun run typecheck && bun test && bun run build",
|
|
10626
|
+
postinstall: "mkdir -p $HOME/.hasna/evals 2>/dev/null || true"
|
|
10627
|
+
},
|
|
10628
|
+
keywords: [
|
|
10629
|
+
"evals",
|
|
10630
|
+
"llm",
|
|
10631
|
+
"ai",
|
|
10632
|
+
"testing",
|
|
10633
|
+
"evaluation",
|
|
10634
|
+
"mcp",
|
|
10635
|
+
"claude",
|
|
10636
|
+
"llm-as-judge",
|
|
10637
|
+
"typescript",
|
|
10638
|
+
"cli"
|
|
10639
|
+
],
|
|
10640
|
+
publishConfig: {
|
|
10641
|
+
registry: "https://registry.npmjs.org",
|
|
10642
|
+
access: "public"
|
|
10643
|
+
},
|
|
10644
|
+
repository: {
|
|
10645
|
+
type: "git",
|
|
10646
|
+
url: "https://github.com/hasna/open-evals.git"
|
|
10647
|
+
},
|
|
10648
|
+
homepage: "https://github.com/hasna/open-evals",
|
|
10649
|
+
bugs: {
|
|
10650
|
+
url: "https://github.com/hasna/open-evals/issues"
|
|
10651
|
+
},
|
|
10652
|
+
engines: {
|
|
10653
|
+
bun: ">=1.0.0"
|
|
10654
|
+
},
|
|
10655
|
+
author: "Andrei Hasna <andrei@hasna.com>",
|
|
10656
|
+
license: "Apache-2.0",
|
|
10657
|
+
dependencies: {
|
|
10658
|
+
"@anthropic-ai/sdk": "^0.82.0",
|
|
10659
|
+
"@hasna/cloud": "^0.1.30",
|
|
10660
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
10661
|
+
ajv: "^8.18.0",
|
|
10662
|
+
chalk: "^5.4.1",
|
|
10663
|
+
commander: "^14.0.3",
|
|
10664
|
+
openai: "^6.33.0",
|
|
10665
|
+
zod: "^4.3.6"
|
|
10666
|
+
},
|
|
10667
|
+
devDependencies: {
|
|
10668
|
+
"@types/bun": "^1.2.4",
|
|
10669
|
+
typescript: "^6.0.2"
|
|
10670
|
+
}
|
|
10671
|
+
};
|
|
10672
|
+
|
|
10673
|
+
// src/mcp/server.ts
|
|
10674
|
+
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
10592
10675
|
import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
|
|
10593
10676
|
|
|
10594
10677
|
// node_modules/zod/v4/classic/external.js
|
|
@@ -30881,284 +30964,383 @@ function compareRuns(before, after) {
|
|
|
30881
30964
|
};
|
|
30882
30965
|
}
|
|
30883
30966
|
|
|
30884
|
-
// src/mcp/
|
|
30967
|
+
// src/mcp/server.ts
|
|
30885
30968
|
init_store();
|
|
30886
30969
|
import { writeFileSync, appendFileSync } from "fs";
|
|
30887
30970
|
var pkg = await Bun.file(new URL("../../package.json", import.meta.url)).json();
|
|
30888
|
-
var
|
|
30889
|
-
var
|
|
30890
|
-
|
|
30891
|
-
|
|
30892
|
-
|
|
30893
|
-
|
|
30894
|
-
|
|
30895
|
-
|
|
30896
|
-
|
|
30897
|
-
|
|
30898
|
-
|
|
30899
|
-
|
|
30900
|
-
|
|
30901
|
-
|
|
30902
|
-
|
|
30903
|
-
|
|
30904
|
-
|
|
30905
|
-
|
|
30906
|
-
|
|
30907
|
-
|
|
30908
|
-
|
|
30909
|
-
|
|
30910
|
-
|
|
30911
|
-
|
|
30912
|
-
|
|
30913
|
-
|
|
30914
|
-
|
|
30915
|
-
|
|
30916
|
-
|
|
30917
|
-
name: "evals_run_single",
|
|
30918
|
-
description: "Run a single eval case ad-hoc \u2014 useful for agents to verify their own output quality",
|
|
30919
|
-
inputSchema: {
|
|
30920
|
-
type: "object",
|
|
30921
|
-
properties: {
|
|
30922
|
-
input: { type: "string", description: "Input to the AI app" },
|
|
30923
|
-
output: { type: "string", description: "AI's response to evaluate" },
|
|
30924
|
-
rubric: { type: "string", description: "Plain-English grading criteria" },
|
|
30925
|
-
expected: { type: "string", description: "Expected behavior description" },
|
|
30926
|
-
assertions: { type: "array", description: "Optional deterministic assertions" },
|
|
30927
|
-
judge_model: { type: "string", description: "Judge model (default: claude-sonnet-4-6)" },
|
|
30928
|
-
judge_provider: { type: "string", enum: ["anthropic", "openai"] }
|
|
30929
|
-
},
|
|
30930
|
-
required: ["input", "output", "rubric"]
|
|
30931
|
-
}
|
|
30932
|
-
},
|
|
30933
|
-
{
|
|
30934
|
-
name: "evals_judge",
|
|
30935
|
-
description: "One-shot LLM judge \u2014 no dataset needed",
|
|
30936
|
-
inputSchema: {
|
|
30937
|
-
type: "object",
|
|
30938
|
-
properties: {
|
|
30939
|
-
input: { type: "string" },
|
|
30940
|
-
output: { type: "string" },
|
|
30941
|
-
rubric: { type: "string" },
|
|
30942
|
-
expected: { type: "string" },
|
|
30943
|
-
model: { type: "string" },
|
|
30944
|
-
provider: { type: "string", enum: ["anthropic", "openai"] }
|
|
30945
|
-
},
|
|
30946
|
-
required: ["input", "output", "rubric"]
|
|
30947
|
-
}
|
|
30948
|
-
},
|
|
30949
|
-
{
|
|
30950
|
-
name: "evals_list_datasets",
|
|
30951
|
-
description: "List available JSONL datasets in a directory",
|
|
30952
|
-
inputSchema: {
|
|
30953
|
-
type: "object",
|
|
30954
|
-
properties: {
|
|
30955
|
-
directory: { type: "string", description: "Directory to search (default: ./datasets)" }
|
|
30971
|
+
var MCP_NAME = "evals";
|
|
30972
|
+
var DEFAULT_MCP_HTTP_PORT = 8817;
|
|
30973
|
+
function buildServer() {
|
|
30974
|
+
const server = new Server({ name: MCP_NAME, version: pkg.version }, { capabilities: { tools: {} } });
|
|
30975
|
+
const AdapterSchema = exports_external.object({
|
|
30976
|
+
type: exports_external.enum(["http", "anthropic", "openai", "mcp", "function", "cli"]),
|
|
30977
|
+
url: exports_external.string().optional(),
|
|
30978
|
+
model: exports_external.string().optional(),
|
|
30979
|
+
systemPrompt: exports_external.string().optional(),
|
|
30980
|
+
command: exports_external.array(exports_external.string()).optional(),
|
|
30981
|
+
tool: exports_external.string().optional(),
|
|
30982
|
+
modulePath: exports_external.string().optional()
|
|
30983
|
+
}).passthrough();
|
|
30984
|
+
const tools = [
|
|
30985
|
+
{
|
|
30986
|
+
name: "evals_run",
|
|
30987
|
+
description: "Run a full eval dataset against an app and return results",
|
|
30988
|
+
inputSchema: {
|
|
30989
|
+
type: "object",
|
|
30990
|
+
properties: {
|
|
30991
|
+
dataset: { type: "string", description: "Path to JSONL/JSON dataset" },
|
|
30992
|
+
adapter: { type: "object", description: "Adapter config (type, url/model/command, etc.)" },
|
|
30993
|
+
concurrency: { type: "number", description: "Parallel execution limit (default: 5)" },
|
|
30994
|
+
skip_judge: { type: "boolean", description: "Skip LLM judge, run assertions only" },
|
|
30995
|
+
tags: { type: "array", items: { type: "string" }, description: "Filter cases by tags" },
|
|
30996
|
+
save: { type: "boolean", description: "Save run to database" },
|
|
30997
|
+
output_format: { type: "string", enum: ["json", "markdown", "summary"], description: "Output format" }
|
|
30998
|
+
},
|
|
30999
|
+
required: ["dataset", "adapter"]
|
|
30956
31000
|
}
|
|
30957
|
-
}
|
|
30958
|
-
|
|
30959
|
-
|
|
30960
|
-
|
|
30961
|
-
|
|
30962
|
-
|
|
30963
|
-
|
|
30964
|
-
|
|
30965
|
-
|
|
30966
|
-
|
|
30967
|
-
|
|
31001
|
+
},
|
|
31002
|
+
{
|
|
31003
|
+
name: "evals_run_single",
|
|
31004
|
+
description: "Run a single eval case ad-hoc \u2014 useful for agents to verify their own output quality",
|
|
31005
|
+
inputSchema: {
|
|
31006
|
+
type: "object",
|
|
31007
|
+
properties: {
|
|
31008
|
+
input: { type: "string", description: "Input to the AI app" },
|
|
31009
|
+
output: { type: "string", description: "AI's response to evaluate" },
|
|
31010
|
+
rubric: { type: "string", description: "Plain-English grading criteria" },
|
|
31011
|
+
expected: { type: "string", description: "Expected behavior description" },
|
|
31012
|
+
assertions: { type: "array", description: "Optional deterministic assertions" },
|
|
31013
|
+
judge_model: { type: "string", description: "Judge model (default: claude-sonnet-4-6)" },
|
|
31014
|
+
judge_provider: { type: "string", enum: ["anthropic", "openai"] }
|
|
31015
|
+
},
|
|
31016
|
+
required: ["input", "output", "rubric"]
|
|
31017
|
+
}
|
|
31018
|
+
},
|
|
31019
|
+
{
|
|
31020
|
+
name: "evals_judge",
|
|
31021
|
+
description: "One-shot LLM judge \u2014 no dataset needed",
|
|
31022
|
+
inputSchema: {
|
|
31023
|
+
type: "object",
|
|
31024
|
+
properties: {
|
|
31025
|
+
input: { type: "string" },
|
|
31026
|
+
output: { type: "string" },
|
|
31027
|
+
rubric: { type: "string" },
|
|
31028
|
+
expected: { type: "string" },
|
|
31029
|
+
model: { type: "string" },
|
|
31030
|
+
provider: { type: "string", enum: ["anthropic", "openai"] }
|
|
31031
|
+
},
|
|
31032
|
+
required: ["input", "output", "rubric"]
|
|
31033
|
+
}
|
|
31034
|
+
},
|
|
31035
|
+
{
|
|
31036
|
+
name: "evals_list_datasets",
|
|
31037
|
+
description: "List available JSONL datasets in a directory",
|
|
31038
|
+
inputSchema: {
|
|
31039
|
+
type: "object",
|
|
31040
|
+
properties: {
|
|
31041
|
+
directory: { type: "string", description: "Directory to search (default: ./datasets)" }
|
|
31042
|
+
}
|
|
31043
|
+
}
|
|
31044
|
+
},
|
|
31045
|
+
{
|
|
31046
|
+
name: "evals_get_results",
|
|
31047
|
+
description: "Get results for a past eval run",
|
|
31048
|
+
inputSchema: {
|
|
31049
|
+
type: "object",
|
|
31050
|
+
properties: {
|
|
31051
|
+
run_id: { type: "string", description: "Run ID or partial ID" },
|
|
31052
|
+
format: { type: "string", enum: ["json", "markdown", "summary"] },
|
|
31053
|
+
limit: { type: "number", description: "Max runs to list if no run_id given" }
|
|
31054
|
+
}
|
|
31055
|
+
}
|
|
31056
|
+
},
|
|
31057
|
+
{
|
|
31058
|
+
name: "evals_compare",
|
|
31059
|
+
description: "Compare two eval runs \u2014 show regressions and improvements",
|
|
31060
|
+
inputSchema: {
|
|
31061
|
+
type: "object",
|
|
31062
|
+
properties: {
|
|
31063
|
+
before: { type: "string", description: "Before run ID or baseline name" },
|
|
31064
|
+
after: { type: "string", description: "After run ID" }
|
|
31065
|
+
},
|
|
31066
|
+
required: ["before", "after"]
|
|
31067
|
+
}
|
|
31068
|
+
},
|
|
31069
|
+
{
|
|
31070
|
+
name: "evals_create_case",
|
|
31071
|
+
description: "Add a new eval case to a dataset file",
|
|
31072
|
+
inputSchema: {
|
|
31073
|
+
type: "object",
|
|
31074
|
+
properties: {
|
|
31075
|
+
dataset: { type: "string", description: "Path to JSONL file to append to" },
|
|
31076
|
+
id: { type: "string", description: "Unique case ID" },
|
|
31077
|
+
input: { type: "string" },
|
|
31078
|
+
expected: { type: "string" },
|
|
31079
|
+
rubric: { type: "string", description: "Judge rubric for this case" },
|
|
31080
|
+
assertions: { type: "array" },
|
|
31081
|
+
tags: { type: "array", items: { type: "string" } }
|
|
31082
|
+
},
|
|
31083
|
+
required: ["dataset", "id", "input"]
|
|
31084
|
+
}
|
|
31085
|
+
},
|
|
31086
|
+
{
|
|
31087
|
+
name: "evals_generate_cases",
|
|
31088
|
+
description: "Auto-generate eval cases from a description using Claude",
|
|
31089
|
+
inputSchema: {
|
|
31090
|
+
type: "object",
|
|
31091
|
+
properties: {
|
|
31092
|
+
description: { type: "string", description: "What behavior to test" },
|
|
31093
|
+
count: { type: "number", description: "Number of cases to generate (default: 10)" },
|
|
31094
|
+
output: { type: "string", description: "Output JSONL path" },
|
|
31095
|
+
seeds: { type: "string", description: "Path to seed examples JSONL" }
|
|
31096
|
+
},
|
|
31097
|
+
required: ["description"]
|
|
30968
31098
|
}
|
|
30969
31099
|
}
|
|
30970
|
-
|
|
30971
|
-
{
|
|
30972
|
-
|
|
30973
|
-
|
|
30974
|
-
|
|
30975
|
-
|
|
30976
|
-
|
|
30977
|
-
|
|
30978
|
-
|
|
30979
|
-
|
|
30980
|
-
|
|
30981
|
-
|
|
30982
|
-
|
|
30983
|
-
|
|
30984
|
-
|
|
30985
|
-
|
|
30986
|
-
|
|
30987
|
-
|
|
30988
|
-
|
|
30989
|
-
|
|
30990
|
-
|
|
30991
|
-
|
|
30992
|
-
|
|
30993
|
-
|
|
30994
|
-
|
|
30995
|
-
|
|
30996
|
-
|
|
30997
|
-
|
|
30998
|
-
|
|
30999
|
-
|
|
31000
|
-
|
|
31001
|
-
|
|
31002
|
-
|
|
31003
|
-
|
|
31004
|
-
|
|
31005
|
-
|
|
31006
|
-
|
|
31007
|
-
|
|
31008
|
-
|
|
31009
|
-
|
|
31010
|
-
},
|
|
31011
|
-
required: ["description"]
|
|
31012
|
-
}
|
|
31013
|
-
}
|
|
31014
|
-
];
|
|
31015
|
-
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools }));
|
|
31016
|
-
server.setRequestHandler(CallToolRequestSchema, async (req) => {
|
|
31017
|
-
const { name, arguments: args } = req.params;
|
|
31018
|
-
const a = args ?? {};
|
|
31019
|
-
try {
|
|
31020
|
-
switch (name) {
|
|
31021
|
-
case "evals_run": {
|
|
31022
|
-
const adapter = AdapterSchema.parse(a["adapter"]);
|
|
31023
|
-
const { cases } = await loadDataset(String(a["dataset"]), {
|
|
31024
|
-
tags: a["tags"]
|
|
31025
|
-
});
|
|
31026
|
-
const run = await runEvals(cases, {
|
|
31027
|
-
dataset: String(a["dataset"]),
|
|
31028
|
-
adapter,
|
|
31029
|
-
concurrency: Number(a["concurrency"] ?? 5),
|
|
31030
|
-
skipJudge: Boolean(a["skip_judge"])
|
|
31031
|
-
});
|
|
31032
|
-
if (a["save"])
|
|
31033
|
-
saveRun(run);
|
|
31034
|
-
const fmt = String(a["output_format"] ?? "summary");
|
|
31035
|
-
const output = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%) in ${run.stats.totalDurationMs}ms. Run ID: ${run.id.slice(0, 8)}`;
|
|
31036
|
-
return { content: [{ type: "text", text: output }] };
|
|
31037
|
-
}
|
|
31038
|
-
case "evals_run_single": {
|
|
31039
|
-
const evalCase = {
|
|
31040
|
-
id: "mcp-single",
|
|
31041
|
-
input: String(a["input"]),
|
|
31042
|
-
expected: a["expected"] ? String(a["expected"]) : undefined,
|
|
31043
|
-
assertions: a["assertions"] ?? [],
|
|
31044
|
-
judge: {
|
|
31100
|
+
];
|
|
31101
|
+
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools }));
|
|
31102
|
+
server.setRequestHandler(CallToolRequestSchema, async (req) => {
|
|
31103
|
+
const { name, arguments: args } = req.params;
|
|
31104
|
+
const a = args ?? {};
|
|
31105
|
+
try {
|
|
31106
|
+
switch (name) {
|
|
31107
|
+
case "evals_run": {
|
|
31108
|
+
const adapter = AdapterSchema.parse(a["adapter"]);
|
|
31109
|
+
const { cases } = await loadDataset(String(a["dataset"]), {
|
|
31110
|
+
tags: a["tags"]
|
|
31111
|
+
});
|
|
31112
|
+
const run = await runEvals(cases, {
|
|
31113
|
+
dataset: String(a["dataset"]),
|
|
31114
|
+
adapter,
|
|
31115
|
+
concurrency: Number(a["concurrency"] ?? 5),
|
|
31116
|
+
skipJudge: Boolean(a["skip_judge"])
|
|
31117
|
+
});
|
|
31118
|
+
if (a["save"])
|
|
31119
|
+
saveRun(run);
|
|
31120
|
+
const fmt = String(a["output_format"] ?? "summary");
|
|
31121
|
+
const output = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%) in ${run.stats.totalDurationMs}ms. Run ID: ${run.id.slice(0, 8)}`;
|
|
31122
|
+
return { content: [{ type: "text", text: output }] };
|
|
31123
|
+
}
|
|
31124
|
+
case "evals_run_single": {
|
|
31125
|
+
const evalCase = {
|
|
31126
|
+
id: "mcp-single",
|
|
31127
|
+
input: String(a["input"]),
|
|
31128
|
+
expected: a["expected"] ? String(a["expected"]) : undefined,
|
|
31129
|
+
assertions: a["assertions"] ?? [],
|
|
31130
|
+
judge: {
|
|
31131
|
+
rubric: String(a["rubric"]),
|
|
31132
|
+
model: a["judge_model"] ? String(a["judge_model"]) : undefined,
|
|
31133
|
+
provider: a["judge_provider"]
|
|
31134
|
+
}
|
|
31135
|
+
};
|
|
31136
|
+
const mockAdapter = { type: "function", modulePath: "__mock__" };
|
|
31137
|
+
const judgeResult = await judgeOnce({
|
|
31138
|
+
input: String(a["input"]),
|
|
31139
|
+
output: String(a["output"]),
|
|
31045
31140
|
rubric: String(a["rubric"]),
|
|
31141
|
+
expected: a["expected"] ? String(a["expected"]) : undefined,
|
|
31046
31142
|
model: a["judge_model"] ? String(a["judge_model"]) : undefined,
|
|
31047
31143
|
provider: a["judge_provider"]
|
|
31048
|
-
}
|
|
31049
|
-
|
|
31050
|
-
|
|
31051
|
-
|
|
31052
|
-
|
|
31053
|
-
output: String(a["output"]),
|
|
31054
|
-
rubric: String(a["rubric"]),
|
|
31055
|
-
expected: a["expected"] ? String(a["expected"]) : undefined,
|
|
31056
|
-
model: a["judge_model"] ? String(a["judge_model"]) : undefined,
|
|
31057
|
-
provider: a["judge_provider"]
|
|
31058
|
-
});
|
|
31059
|
-
return {
|
|
31060
|
-
content: [{
|
|
31061
|
-
type: "text",
|
|
31062
|
-
text: `VERDICT: ${judgeResult.verdict}
|
|
31144
|
+
});
|
|
31145
|
+
return {
|
|
31146
|
+
content: [{
|
|
31147
|
+
type: "text",
|
|
31148
|
+
text: `VERDICT: ${judgeResult.verdict}
|
|
31063
31149
|
REASONING: ${judgeResult.reasoning}`
|
|
31064
|
-
|
|
31065
|
-
|
|
31066
|
-
|
|
31067
|
-
|
|
31068
|
-
|
|
31069
|
-
|
|
31070
|
-
|
|
31071
|
-
|
|
31072
|
-
|
|
31073
|
-
|
|
31074
|
-
|
|
31075
|
-
|
|
31076
|
-
|
|
31150
|
+
}]
|
|
31151
|
+
};
|
|
31152
|
+
}
|
|
31153
|
+
case "evals_judge": {
|
|
31154
|
+
const r = await judgeOnce({
|
|
31155
|
+
input: String(a["input"]),
|
|
31156
|
+
output: String(a["output"]),
|
|
31157
|
+
rubric: String(a["rubric"]),
|
|
31158
|
+
expected: a["expected"] ? String(a["expected"]) : undefined,
|
|
31159
|
+
model: a["model"] ? String(a["model"]) : undefined,
|
|
31160
|
+
provider: a["provider"]
|
|
31161
|
+
});
|
|
31162
|
+
return { content: [{ type: "text", text: `${r.verdict}
|
|
31077
31163
|
${r.reasoning}` }] };
|
|
31078
|
-
|
|
31079
|
-
|
|
31080
|
-
|
|
31081
|
-
|
|
31082
|
-
|
|
31083
|
-
|
|
31084
|
-
|
|
31085
|
-
|
|
31086
|
-
|
|
31164
|
+
}
|
|
31165
|
+
case "evals_list_datasets": {
|
|
31166
|
+
const dir = String(a["directory"] ?? "./datasets");
|
|
31167
|
+
const files = [];
|
|
31168
|
+
for await (const f of new Bun.Glob(`${dir}/**/*.jsonl`).scan("."))
|
|
31169
|
+
files.push(f);
|
|
31170
|
+
for await (const f of new Bun.Glob(`${dir}/**/*.json`).scan("."))
|
|
31171
|
+
files.push(f);
|
|
31172
|
+
return { content: [{ type: "text", text: files.length > 0 ? files.join(`
|
|
31087
31173
|
`) : "No datasets found" }] };
|
|
31088
|
-
|
|
31089
|
-
|
|
31090
|
-
|
|
31091
|
-
|
|
31092
|
-
|
|
31093
|
-
|
|
31094
|
-
|
|
31095
|
-
|
|
31096
|
-
|
|
31097
|
-
|
|
31098
|
-
|
|
31099
|
-
|
|
31174
|
+
}
|
|
31175
|
+
case "evals_get_results": {
|
|
31176
|
+
if (a["run_id"]) {
|
|
31177
|
+
const run = getRun(String(a["run_id"]));
|
|
31178
|
+
if (!run)
|
|
31179
|
+
return { content: [{ type: "text", text: "Run not found" }] };
|
|
31180
|
+
const fmt = String(a["format"] ?? "summary");
|
|
31181
|
+
const text = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `Run ${run.id.slice(0, 8)}: ${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%)`;
|
|
31182
|
+
return { content: [{ type: "text", text }] };
|
|
31183
|
+
} else {
|
|
31184
|
+
const runs = listRuns(Number(a["limit"] ?? 10));
|
|
31185
|
+
const summary = runs.map((r) => `${r.id.slice(0, 8)} | ${r.createdAt.slice(0, 10)} | ${r.dataset} | ${r.stats.passed}/${r.stats.total} passed`).join(`
|
|
31100
31186
|
`);
|
|
31101
|
-
|
|
31102
|
-
|
|
31103
|
-
|
|
31104
|
-
|
|
31105
|
-
|
|
31106
|
-
|
|
31107
|
-
|
|
31108
|
-
|
|
31109
|
-
|
|
31110
|
-
|
|
31111
|
-
|
|
31112
|
-
|
|
31113
|
-
|
|
31114
|
-
|
|
31115
|
-
|
|
31116
|
-
|
|
31187
|
+
return { content: [{ type: "text", text: summary || "No runs found" }] };
|
|
31188
|
+
}
|
|
31189
|
+
}
|
|
31190
|
+
case "evals_compare": {
|
|
31191
|
+
const { getBaseline: getBaseline2 } = await Promise.resolve().then(() => (init_store(), exports_store));
|
|
31192
|
+
const before = getRun(String(a["before"])) ?? getBaseline2(String(a["before"]));
|
|
31193
|
+
const after = getRun(String(a["after"])) ?? getBaseline2(String(a["after"]));
|
|
31194
|
+
if (!before || !after)
|
|
31195
|
+
return { content: [{ type: "text", text: "Run(s) not found" }] };
|
|
31196
|
+
const diff = compareRuns(before, after);
|
|
31197
|
+
const lines = [
|
|
31198
|
+
`Score delta: ${diff.passRateDelta >= 0 ? "+" : ""}${(diff.passRateDelta * 100).toFixed(1)}%`,
|
|
31199
|
+
...diff.regressions.map((r) => `\u2193 REGRESSION: ${r.caseId} (${r.before} \u2192 ${r.after})`),
|
|
31200
|
+
...diff.improvements.map((i) => `\u2191 IMPROVEMENT: ${i.caseId} (${i.before} \u2192 ${i.after})`)
|
|
31201
|
+
];
|
|
31202
|
+
return { content: [{ type: "text", text: lines.join(`
|
|
31117
31203
|
`) }] };
|
|
31118
|
-
|
|
31119
|
-
|
|
31120
|
-
|
|
31121
|
-
|
|
31122
|
-
|
|
31123
|
-
|
|
31124
|
-
|
|
31125
|
-
|
|
31126
|
-
|
|
31127
|
-
|
|
31128
|
-
|
|
31204
|
+
}
|
|
31205
|
+
case "evals_create_case": {
|
|
31206
|
+
const evalCase = {
|
|
31207
|
+
id: String(a["id"]),
|
|
31208
|
+
input: String(a["input"]),
|
|
31209
|
+
expected: a["expected"] ? String(a["expected"]) : undefined,
|
|
31210
|
+
judge: a["rubric"] ? { rubric: String(a["rubric"]) } : undefined,
|
|
31211
|
+
assertions: a["assertions"] ?? undefined,
|
|
31212
|
+
tags: a["tags"] ?? undefined
|
|
31213
|
+
};
|
|
31214
|
+
appendFileSync(String(a["dataset"]), JSON.stringify(evalCase) + `
|
|
31129
31215
|
`);
|
|
31130
|
-
|
|
31131
|
-
|
|
31132
|
-
|
|
31133
|
-
|
|
31134
|
-
|
|
31135
|
-
|
|
31136
|
-
|
|
31137
|
-
|
|
31138
|
-
|
|
31139
|
-
|
|
31140
|
-
|
|
31141
|
-
|
|
31142
|
-
|
|
31143
|
-
|
|
31216
|
+
return { content: [{ type: "text", text: `Case "${evalCase.id}" appended to ${a["dataset"]}` }] };
|
|
31217
|
+
}
|
|
31218
|
+
case "evals_generate_cases": {
|
|
31219
|
+
const Anthropic2 = (await Promise.resolve().then(() => (init_sdk(), exports_sdk))).default;
|
|
31220
|
+
const client = new Anthropic2;
|
|
31221
|
+
const count = Number(a["count"] ?? 10);
|
|
31222
|
+
const res = await client.messages.create({
|
|
31223
|
+
model: "claude-sonnet-4-6",
|
|
31224
|
+
max_tokens: 4096,
|
|
31225
|
+
temperature: 1,
|
|
31226
|
+
system: "Generate eval cases as JSONL. Each line: {id, input, expected, judge: {rubric}, tags}. Output only JSONL lines.",
|
|
31227
|
+
messages: [{ role: "user", content: `Generate ${count} eval cases for: ${a["description"]}` }]
|
|
31228
|
+
});
|
|
31229
|
+
const text = res.content.filter((b) => b.type === "text").map((b) => b.text).join(`
|
|
31144
31230
|
`);
|
|
31145
|
-
|
|
31231
|
+
const lines = text.split(`
|
|
31146
31232
|
`).filter((l) => l.trim().startsWith("{"));
|
|
31147
|
-
|
|
31148
|
-
|
|
31233
|
+
const output = String(a["output"] ?? "generated.jsonl");
|
|
31234
|
+
writeFileSync(output, lines.join(`
|
|
31149
31235
|
`) + `
|
|
31150
31236
|
`);
|
|
31151
|
-
|
|
31237
|
+
return { content: [{ type: "text", text: `Generated ${lines.length} cases \u2192 ${output}` }] };
|
|
31238
|
+
}
|
|
31239
|
+
default:
|
|
31240
|
+
return { content: [{ type: "text", text: `Unknown tool: ${name}` }] };
|
|
31152
31241
|
}
|
|
31153
|
-
|
|
31154
|
-
|
|
31242
|
+
} catch (err) {
|
|
31243
|
+
return {
|
|
31244
|
+
content: [{ type: "text", text: `Error: ${err instanceof Error ? err.message : String(err)}` }],
|
|
31245
|
+
isError: true
|
|
31246
|
+
};
|
|
31155
31247
|
}
|
|
31156
|
-
}
|
|
31157
|
-
|
|
31158
|
-
|
|
31159
|
-
|
|
31160
|
-
|
|
31248
|
+
});
|
|
31249
|
+
return server;
|
|
31250
|
+
}
|
|
31251
|
+
|
|
31252
|
+
// src/mcp/http.ts
|
|
31253
|
+
import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
|
|
31254
|
+
/**
 * Decide whether the server should use the HTTP transport instead of stdio.
 *
 * HTTP mode is selected by the `--http` flag or by the environment variable
 * MCP_HTTP being exactly "1".
 *
 * @param argv CLI arguments to inspect (defaults to process.argv minus its first two entries).
 * @returns true when HTTP mode was requested.
 */
function isHttpMode(argv = process.argv.slice(2)) {
  // The flag wins outright; the environment is only consulted when it is absent.
  if (argv.includes("--http")) {
    return true;
  }
  return process.env["MCP_HTTP"] === "1";
}
|
|
31257
|
+
/**
 * Work out which TCP port the HTTP transport should listen on.
 *
 * Precedence:
 *   1. the first `--port <n>` / `-p <n>` (or `--port=<n>`) found in argv,
 *   2. the MCP_HTTP_PORT environment variable,
 *   3. DEFAULT_MCP_HTTP_PORT.
 *
 * @param argv CLI arguments to scan (defaults to process.argv minus its first two entries).
 * @returns The validated port number.
 * @throws Error when the port flag has no value, or when parsePort rejects the value.
 */
function resolveHttpPort(argv = process.argv.slice(2)) {
  const inlinePrefix = "--port=";
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === "--port" || arg === "-p") {
      const raw = argv[i + 1];
      // A trailing flag with nothing after it is an error, not a silent fallback.
      if (!raw)
        throw new Error(`Invalid port: ${raw ?? ""}`);
      return parsePort(raw, "port");
    }
    // Also honour the `--port=8817` spelling; previously it was ignored and the
    // server quietly bound the env/default port instead.
    if (typeof arg === "string" && arg.startsWith(inlinePrefix)) {
      return parsePort(arg.slice(inlinePrefix.length), "port");
    }
  }
  const fromEnv = process.env["MCP_HTTP_PORT"];
  if (fromEnv)
    return parsePort(fromEnv, "MCP_HTTP_PORT");
  return DEFAULT_MCP_HTTP_PORT;
}
|
|
31272
|
+
/**
 * Convert a textual port into a number, rejecting anything that is not a
 * plain decimal integer in the range 1-65535.
 *
 * @param raw   The text to parse (surrounding whitespace is tolerated).
 * @param label Name of the setting, used only in the error message.
 * @returns The port as a number.
 * @throws Error `Invalid <label>: <raw>` when the value is not a valid port.
 */
function parsePort(raw, label) {
  const text = String(raw).trim();
  // Number() on its own also accepts hex ("0x50"), exponent ("8e3") and
  // decimal-point ("80.0") spellings; require bare digits so only an
  // unambiguous decimal port gets through.
  const value = /^\d+$/.test(text) ? Number(text) : NaN;
  if (!Number.isInteger(value) || value < 1 || value > 65535) {
    throw new Error(`Invalid ${label}: ${raw}`);
  }
  return value;
}
|
|
31279
|
+
/**
 * Fetch-style request handler for the HTTP transport.
 *
 * Routes:
 *   - GET /health -> JSON `{ status: "ok", name }`
 *   - /mcp        -> handed to a Streamable HTTP transport connected to a
 *                    freshly built MCP server
 *   - anything else -> 404 "Not Found"
 *
 * @param req The incoming Request.
 * @returns The Response to send back.
 */
async function handleMcpHttpRequest(req) {
  const { pathname } = new URL(req.url);
  if (pathname === "/health" && req.method === "GET") {
    return Response.json({ status: "ok", name: MCP_NAME });
  }
  if (pathname !== "/mcp") {
    return new Response("Not Found", { status: 404 });
  }
  try {
    // A new server/transport pair is created for every /mcp request and no
    // session ID generator is supplied.
    // NOTE(review): neither object is closed afterwards — confirm the SDK
    // releases them once the response completes.
    const transport = new WebStandardStreamableHTTPServerTransport({
      sessionIdGenerator: undefined
    });
    const server = buildServer();
    await server.connect(transport);
    // `await` here so a rejected promise is handled by the catch below.
    return await transport.handleRequest(req);
  } catch (err) {
    // Without this, a setup failure surfaced as the runtime's generic 500
    // page; answer with a JSON-RPC internal error so MCP clients can parse it.
    console.error("MCP HTTP request failed:", err);
    return Response.json({
      jsonrpc: "2.0",
      error: { code: -32603, message: "Internal server error" },
      id: null
    }, { status: 500 });
  }
}
|
|
31294
|
+
/**
 * Start the HTTP listener that serves the MCP endpoint and the health check.
 *
 * @param options Optional overrides:
 *   - port:     port to bind (falls back to DEFAULT_MCP_HTTP_PORT)
 *   - hostname: interface to bind (falls back to "127.0.0.1")
 *   - log:      function receiving the startup message (falls back to console.error)
 * @returns The server object returned by Bun.serve.
 */
function startHttpServer(options = {}) {
  const listenPort = options.port ?? DEFAULT_MCP_HTTP_PORT;
  const host = options.hostname ?? "127.0.0.1";
  const report = options.log ?? console.error;
  const httpServer = Bun.serve({
    port: listenPort,
    hostname: host,
    fetch: handleMcpHttpRequest
  });
  // Report the port the server actually bound, read back from the server object.
  const baseUrl = `http://${host}:${httpServer.port}`;
  const mcpUrl = `${baseUrl}/mcp`;
  const healthUrl = `${baseUrl}/health`;
  report(`${MCP_NAME}-mcp HTTP listening on ${mcpUrl} (health: ${healthUrl})`);
  return httpServer;
}
|
|
31307
|
+
|
|
31308
|
+
// src/mcp/index.ts
|
|
31309
|
+
function printHelp() {
|
|
31310
|
+
console.log(`Usage: evals-mcp [options]
|
|
31311
|
+
|
|
31312
|
+
Runs the @hasna/evals MCP server (stdio by default).
|
|
31313
|
+
|
|
31314
|
+
Options:
|
|
31315
|
+
--http Serve MCP over Streamable HTTP on 127.0.0.1
|
|
31316
|
+
-p, --port <port> HTTP port (default: MCP_HTTP_PORT or 8817)
|
|
31317
|
+
-V, --version output the version number
|
|
31318
|
+
-h, --help display help for command
|
|
31319
|
+
|
|
31320
|
+
Environment:
|
|
31321
|
+
MCP_HTTP=1 Enable HTTP mode
|
|
31322
|
+
MCP_HTTP_PORT Override default HTTP port (8817)`);
|
|
31323
|
+
}
|
|
31324
|
+
// CLI arguments with the first two process.argv entries dropped; main() reads this as well.
var args = process.argv.slice(2);
// --help / -h: print usage and exit successfully before any server is started.
if (["--help", "-h"].some((flag) => args.includes(flag))) {
  printHelp();
  process.exit(0);
}
// --version / -V: print the package version and exit successfully.
if (["--version", "-V"].some((flag) => args.includes(flag))) {
  console.log(package_default.version);
  process.exit(0);
}
|
|
31333
|
+
/**
 * Entry point: serve MCP over HTTP when HTTP mode is requested, otherwise
 * connect a newly built server to a stdio transport.
 */
async function main() {
  if (!isHttpMode(args)) {
    const mcpServer = buildServer();
    const stdio = new StdioServerTransport();
    await mcpServer.connect(stdio);
    return;
  }
  const port = resolveHttpPort(args);
  startHttpServer({ port });
  // Never-resolving promise: main() stays pending for as long as the HTTP server runs.
  await new Promise(() => {});
}
// Any startup failure is logged to stderr and turned into a non-zero exit code.
main().catch((failure) => {
  console.error("MCP server error:", failure);
  process.exit(1);
});
|
|
31163
|
-
var transport = new StdioServerTransport;
|
|
31164
|
-
await server.connect(transport);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../../src/mcp/server.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,2CAA2C,CAAC;AAanE,eAAO,MAAM,QAAQ,UAAU,CAAC;AAChC,eAAO,MAAM,qBAAqB,OAAO,CAAC;AAE1C,wBAAgB,WAAW,IAAI,MAAM,CAmSpC"}
|
package/package.json
CHANGED