@hasna/evals 0.1.20 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE CHANGED
@@ -1,3 +1,4 @@
1
+
1
2
  Apache License
2
3
  Version 2.0, January 2004
3
4
  http://www.apache.org/licenses/
@@ -175,7 +176,7 @@
175
176
 
176
177
  END OF TERMS AND CONDITIONS
177
178
 
178
- Copyright 2026 hasna
179
+ Copyright 2026 Hasna, Inc.
179
180
 
180
181
  Licensed under the Apache License, Version 2.0 (the "License");
181
182
  you may not use this file except in compliance with the License.
package/README.md CHANGED
@@ -240,6 +240,19 @@ evals_run_single(
240
240
  → PASS — The response correctly identifies Paris.
241
241
  ```
242
242
 
243
+ ## HTTP mode
244
+
245
+ Shared Streamable HTTP transport for multi-agent sessions (stdio remains the default):
246
+
247
+ ```bash
248
+ evals-mcp --http # http://127.0.0.1:8817/mcp
249
+ MCP_HTTP=1 evals-mcp # same
250
+ evals-mcp --http --port 8817 # explicit port
251
+ ```
252
+
253
+ - Health: `GET http://127.0.0.1:8817/health` → `{"status":"ok","name":"evals"}`
254
+ - Override port with `MCP_HTTP_PORT` or `--port`
255
+
243
256
  ---
244
257
 
245
258
  ## License
@@ -1,3 +1,5 @@
1
1
  import { Command } from "commander";
2
+ import { compareRuns } from "../../core/reporter.js";
3
+ export declare function renderMarkdownDiff(diff: ReturnType<typeof compareRuns>): string;
2
4
  export declare function compareCommand(): Command;
3
5
  //# sourceMappingURL=compare.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"compare.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAIpC,wBAAgB,cAAc,IAAI,OAAO,CAgCxC"}
1
+ {"version":3,"file":"compare.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,WAAW,EAA+B,MAAM,wBAAwB,CAAC;AAGlF,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,GAAG,MAAM,CA2B/E;AAED,wBAAgB,cAAc,IAAI,OAAO,CAgCxC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=compare.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compare.test.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.test.ts"],"names":[],"mappings":""}
@@ -1 +1 @@
1
- {"version":3,"file":"doctor.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/doctor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAqCpC,wBAAgB,aAAa,IAAI,OAAO,CA4EvC"}
1
+ {"version":3,"file":"doctor.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/doctor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAqCpC,wBAAgB,aAAa,IAAI,OAAO,CAqFvC"}
@@ -1 +1 @@
1
- {"version":3,"file":"generate.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/generate.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAiBpC,wBAAgB,eAAe,IAAI,OAAO,CA2CzC"}
1
+ {"version":3,"file":"generate.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/generate.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAiBpC,wBAAgB,eAAe,IAAI,OAAO,CAyDzC"}
package/dist/cli/index.js CHANGED
@@ -30508,7 +30508,7 @@ function parseAdapterConfig(opts) {
30508
30508
 
30509
30509
  // src/cli/commands/run.ts
30510
30510
  function runCommand() {
30511
- return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter, default: default)").option("--command <cmd>", "Shell command (for cli adapter, use {{input}} placeholder)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("--json", "Alias for --output json").action(async (dataset, opts) => {
30511
+ return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter, default: default)").option("--command <cmd>", "Shell command (for cli adapter, use {{input}} placeholder)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("-j, --json", "Alias for --output json").action(async (dataset, opts) => {
30512
30512
  const { cases, warnings } = await loadDataset(dataset, {
30513
30513
  tags: opts["tags"] ? opts["tags"].split(",") : undefined
30514
30514
  });
@@ -30548,7 +30548,7 @@ init_loader();
30548
30548
  init_store();
30549
30549
  function ciCommand() {
30550
30550
  const cmd = new Command("ci").description("Run evals in CI mode \u2014 exit non-zero on regression");
30551
- cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter)").option("--command <cmd>", "Shell command (for cli adapter)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--no-judge", "Skip LLM judge, assertions only").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("--json", "Output JSON").action(async (dataset, opts) => {
30551
+ cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter)").option("--command <cmd>", "Shell command (for cli adapter)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--no-judge", "Skip LLM judge, assertions only").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("-j, --json", "Output JSON").action(async (dataset, opts) => {
30552
30552
  const { cases } = await loadDataset(dataset);
30553
30553
  const adapter = parseAdapterConfig(opts);
30554
30554
  const run = await runEvals(cases, { dataset, adapter, skipJudge: opts["judge"] === false || opts["noJudge"] === true });
@@ -30596,7 +30596,7 @@ No baseline "${baselineName}" found \u2014 use "evals ci set-baseline" to create
30596
30596
 
30597
30597
  // src/cli/commands/judge.ts
30598
30598
  function judgeCommand() {
30599
- return new Command("judge").description("Ad-hoc: judge a single input/output pair against a rubric").requiredOption("--input <text>", "The input that was given to the AI").requiredOption("--output <text>", "The AI's response").requiredOption("--rubric <text>", "Plain-English grading criteria").option("--expected <text>", "Expected behavior description").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--provider <provider>", "Judge provider: anthropic|openai", "anthropic").option("--json", "Output JSON").action(async (opts) => {
30599
+ return new Command("judge").description("Ad-hoc: judge a single input/output pair against a rubric").requiredOption("--input <text>", "The input that was given to the AI").requiredOption("--output <text>", "The AI's response").requiredOption("--rubric <text>", "Plain-English grading criteria").option("--expected <text>", "Expected behavior description").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--provider <provider>", "Judge provider: anthropic|openai", "anthropic").option("-j, --json", "Output JSON").action(async (opts) => {
30600
30600
  const result = await judgeOnce({
30601
30601
  input: opts["input"] ?? "",
30602
30602
  output: opts["output"] ?? "",
@@ -30622,8 +30622,34 @@ ${icon} ${result.verdict}
30622
30622
 
30623
30623
  // src/cli/commands/compare.ts
30624
30624
  init_store();
30625
+ function renderMarkdownDiff(diff) {
30626
+ const lines = ["## Diff", ""];
30627
+ if (diff.regressions.length === 0 && diff.improvements.length === 0) {
30628
+ lines.push("- No changes between runs.");
30629
+ return lines.join(`
30630
+ `);
30631
+ }
30632
+ if (diff.regressions.length > 0) {
30633
+ lines.push("### Regressions");
30634
+ for (const r of diff.regressions) {
30635
+ lines.push(`- ${r.caseId}: ${r.before} -> ${r.after}`);
30636
+ }
30637
+ lines.push("");
30638
+ }
30639
+ if (diff.improvements.length > 0) {
30640
+ lines.push("### Improvements");
30641
+ for (const i of diff.improvements) {
30642
+ lines.push(`- ${i.caseId}: ${i.before} -> ${i.after}`);
30643
+ }
30644
+ lines.push("");
30645
+ }
30646
+ const delta = diff.passRateDelta * 100;
30647
+ lines.push(`- Score delta: ${delta >= 0 ? "+" : ""}${delta.toFixed(1)}%`);
30648
+ return lines.join(`
30649
+ `);
30650
+ }
30625
30651
  function compareCommand() {
30626
- return new Command("compare").description("Compare two eval runs side-by-side").argument("<before>", "Before run ID or baseline name").argument("<after>", "After run ID (or 'latest')").option("--json", "Output JSON diff").option("--markdown", "Output markdown diff").action(async (beforeArg, afterArg, opts) => {
30652
+ return new Command("compare").description("Compare two eval runs side-by-side").argument("<before>", "Before run ID or baseline name").argument("<after>", "After run ID (or 'latest')").option("-j, --json", "Output JSON diff").option("--markdown", "Output markdown diff").action(async (beforeArg, afterArg, opts) => {
30627
30653
  const { listRuns: listRuns2 } = await Promise.resolve().then(() => (init_store(), exports_store));
30628
30654
  const beforeRun = getRun(beforeArg) ?? getBaseline(beforeArg);
30629
30655
  const afterRun = afterArg === "latest" ? listRuns2(1)[0] : getRun(afterArg) ?? getBaseline(afterArg);
@@ -30640,10 +30666,8 @@ function compareCommand() {
30640
30666
  console.log(JSON.stringify(diff, null, 2));
30641
30667
  } else if (opts["markdown"]) {
30642
30668
  console.log(toMarkdown(afterRun));
30643
- console.log(`
30644
- ## Diff
30645
- `);
30646
- printDiffReport(diff);
30669
+ console.log();
30670
+ console.log(renderMarkdownDiff(diff));
30647
30671
  } else {
30648
30672
  printDiffReport(diff);
30649
30673
  }
@@ -30662,7 +30686,7 @@ var COST_PER_1K_INPUT = {
30662
30686
  };
30663
30687
  var AVG_JUDGE_TOKENS = 800;
30664
30688
  function estimateCommand() {
30665
- return new Command("estimate").description("Estimate cost before running evals (no API calls made)").argument("<dataset>", "Path to JSONL/JSON dataset").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--no-judge", "Assume no judge calls").option("--json", "Output JSON").action(async (dataset, opts) => {
30689
+ return new Command("estimate").description("Estimate cost before running evals (no API calls made)").argument("<dataset>", "Path to JSONL/JSON dataset").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--no-judge", "Assume no judge calls").option("-j, --json", "Output JSON").action(async (dataset, opts) => {
30666
30690
  const { cases, warnings } = await loadDataset(dataset);
30667
30691
  if (warnings.length > 0)
30668
30692
  for (const w of warnings)
@@ -30713,7 +30737,7 @@ Each case must be a valid JSON object on a single line with these fields:
30713
30737
  Generate varied cases that cover edge cases, typical usage, and boundary conditions.
30714
30738
  Output ONLY valid JSONL \u2014 one JSON object per line, no markdown, no explanation.`;
30715
30739
  function generateCommand() {
30716
- return new Command("generate").description("Generate eval cases from a description using Claude").requiredOption("--description <text>", "What behavior to test (e.g. 'refund policy responses')").option("--seeds <path>", "Path to JSONL file with seed examples").option("--count <n>", "Number of cases to generate", "10").option("--output <path>", "Output JSONL file path", "generated.jsonl").option("--model <model>", "Model to use for generation", "claude-sonnet-4-6").action(async (opts) => {
30740
+ return new Command("generate").description("Generate eval cases from a description using Claude").requiredOption("--description <text>", "What behavior to test (e.g. 'refund policy responses')").option("--seeds <path>", "Path to JSONL file with seed examples").option("--count <n>", "Number of cases to generate", "10").option("--output <path>", "Output JSONL file path", "generated.jsonl").option("--model <model>", "Model to use for generation", "claude-sonnet-4-6").option("-j, --json", "Output JSON summary").action(async (opts) => {
30717
30741
  const client = new Anthropic;
30718
30742
  const count = parseInt(opts["count"] ?? "10");
30719
30743
  let seedText = "";
@@ -30746,11 +30770,22 @@ Output ${count} JSONL lines starting with {"id":"gen-001",...}`;
30746
30770
  valid.push(parsed);
30747
30771
  } catch {}
30748
30772
  }
30773
+ const outputPath = opts["output"] ?? "generated.jsonl";
30749
30774
  const output = valid.map((c) => JSON.stringify(c)).join(`
30750
30775
  `);
30751
- writeFileSync(opts["output"] ?? "generated.jsonl", output + `
30776
+ writeFileSync(outputPath, output + `
30752
30777
  `);
30753
- console.log(`\x1B[32m\u2713 Generated ${valid.length} cases \u2192 ${opts["output"]}\x1B[0m`);
30778
+ if (opts["json"]) {
30779
+ console.log(JSON.stringify({
30780
+ generated: valid.length,
30781
+ requested: count,
30782
+ output: outputPath,
30783
+ model: opts["model"] ?? "claude-sonnet-4-6",
30784
+ description: opts["description"] ?? ""
30785
+ }, null, 2));
30786
+ return;
30787
+ }
30788
+ console.log(`\x1B[32m\u2713 Generated ${valid.length} cases \u2192 ${outputPath}\x1B[0m`);
30754
30789
  });
30755
30790
  }
30756
30791
 
@@ -30844,7 +30879,7 @@ function resolveApiKey(envVar, secretsPath, secretsKey) {
30844
30879
  return;
30845
30880
  }
30846
30881
  function doctorCommand() {
30847
- return new Command("doctor").description("Health check \u2014 verify API keys, DB, and config").action(async () => {
30882
+ return new Command("doctor").description("Health check \u2014 verify API keys, DB, and config").option("-j, --json", "Output JSON").action(async (opts) => {
30848
30883
  const checks = [];
30849
30884
  const anthropicKey = resolveApiKey("ANTHROPIC_API_KEY", "hasnaxyz/anthropic/live.env", "HASNAXYZ_ANTHROPIC_LIVE_API_KEY");
30850
30885
  checks.push({
@@ -30888,6 +30923,15 @@ function doctorCommand() {
30888
30923
  } catch {
30889
30924
  checks.push({ name: "Example dataset (optional)", ok: false, hint: "datasets/examples/smoke.jsonl not found \u2014 install @hasna/evals globally to include examples" });
30890
30925
  }
30926
+ const allOk = checks.every((c) => c.ok || c.name.toLowerCase().includes("optional"));
30927
+ if (opts.json) {
30928
+ console.log(JSON.stringify({
30929
+ ok: allOk,
30930
+ checks,
30931
+ summary: allOk ? "All checks passed." : "Some checks failed \u2014 see hints above."
30932
+ }, null, 2));
30933
+ process.exit(allOk ? 0 : 1);
30934
+ }
30891
30935
  console.log(`
30892
30936
  \x1B[1mevals doctor\x1B[0m
30893
30937
  `);
@@ -30896,7 +30940,6 @@ function doctorCommand() {
30896
30940
  console.log(` ${icon} ${c.name}${!c.ok && c.hint ? `
30897
30941
  hint: ${c.hint}` : ""}`);
30898
30942
  }
30899
- const allOk = checks.every((c) => c.ok || c.name.toLowerCase().includes("optional"));
30900
30943
  console.log(allOk ? `
30901
30944
  \x1B[32m All checks passed.\x1B[0m
30902
30945
  ` : `
@@ -0,0 +1,12 @@
1
+ import { DEFAULT_MCP_HTTP_PORT, MCP_NAME } from "./server.js";
2
+ export { DEFAULT_MCP_HTTP_PORT, MCP_NAME };
3
+ export declare function isHttpMode(argv?: string[]): boolean;
4
+ export declare function resolveHttpPort(argv?: string[]): number;
5
+ export declare function handleMcpHttpRequest(req: Request): Promise<Response>;
6
+ export interface StartHttpServerOptions {
7
+ port?: number;
8
+ hostname?: string;
9
+ log?: (message: string) => void;
10
+ }
11
+ export declare function startHttpServer(options?: StartHttpServerOptions): ReturnType<typeof Bun.serve>;
12
+ //# sourceMappingURL=http.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"http.d.ts","sourceRoot":"","sources":["../../src/mcp/http.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,qBAAqB,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE3E,OAAO,EAAE,qBAAqB,EAAE,QAAQ,EAAE,CAAC;AAE3C,wBAAgB,UAAU,CAAC,IAAI,GAAE,MAAM,EAA0B,GAAG,OAAO,CAE1E;AAED,wBAAgB,eAAe,CAAC,IAAI,GAAE,MAAM,EAA0B,GAAG,MAAM,CAa9E;AAUD,wBAAsB,oBAAoB,CAAC,GAAG,EAAE,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,CAiB1E;AAED,MAAM,WAAW,sBAAsB;IACrC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;CACjC;AAED,wBAAgB,eAAe,CAAC,OAAO,GAAE,sBAA2B,GAAG,UAAU,CAAC,OAAO,GAAG,CAAC,KAAK,CAAC,CAclG"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=http.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"http.test.d.ts","sourceRoot":"","sources":["../../src/mcp/http.test.ts"],"names":[],"mappings":""}
package/dist/mcp/index.js CHANGED
@@ -10587,8 +10587,91 @@ var _db = null;
10587
10587
  var init_store = () => {};
10588
10588
 
10589
10589
  // src/mcp/index.ts
10590
- import { Server } from "@modelcontextprotocol/sdk/server/index.js";
10591
10590
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
10591
+ // package.json
10592
+ var package_default = {
10593
+ name: "@hasna/evals",
10594
+ version: "0.1.22",
10595
+ description: "Open source AI evaluation framework \u2014 LLM-as-judge + assertion-based evals for any AI app. CLI + MCP server.",
10596
+ type: "module",
10597
+ main: "dist/index.js",
10598
+ types: "dist/index.d.ts",
10599
+ bin: {
10600
+ evals: "dist/cli/index.js",
10601
+ "evals-mcp": "dist/mcp/index.js",
10602
+ "evals-serve": "dist/server/index.js"
10603
+ },
10604
+ exports: {
10605
+ ".": {
10606
+ types: "./dist/index.d.ts",
10607
+ import: "./dist/index.js"
10608
+ }
10609
+ },
10610
+ files: [
10611
+ "dist",
10612
+ "dashboard/dist",
10613
+ "datasets/examples",
10614
+ "LICENSE",
10615
+ "README.md"
10616
+ ],
10617
+ scripts: {
10618
+ build: "cd dashboard && bun run build && cd .. && bun build src/cli/index.ts --outdir dist/cli --target bun --external ink --external react --external chalk --external @modelcontextprotocol/sdk && bun build src/mcp/index.ts --outdir dist/mcp --target bun --external @modelcontextprotocol/sdk && bun build src/server/index.ts --outdir dist/server --target bun && bun build src/index.ts --outdir dist --target bun && tsc --emitDeclarationOnly --outDir dist",
10619
+ "build:dashboard": "cd dashboard && bun run build",
10620
+ typecheck: "tsc --noEmit",
10621
+ test: "bun test",
10622
+ "dev:cli": "bun run src/cli/index.ts",
10623
+ "dev:mcp": "bun run src/mcp/index.ts",
10624
+ "dev:serve": "bun run src/server/index.ts",
10625
+ prepublishOnly: "bun run typecheck && bun test && bun run build",
10626
+ postinstall: "mkdir -p $HOME/.hasna/evals 2>/dev/null || true"
10627
+ },
10628
+ keywords: [
10629
+ "evals",
10630
+ "llm",
10631
+ "ai",
10632
+ "testing",
10633
+ "evaluation",
10634
+ "mcp",
10635
+ "claude",
10636
+ "llm-as-judge",
10637
+ "typescript",
10638
+ "cli"
10639
+ ],
10640
+ publishConfig: {
10641
+ registry: "https://registry.npmjs.org",
10642
+ access: "public"
10643
+ },
10644
+ repository: {
10645
+ type: "git",
10646
+ url: "https://github.com/hasna/open-evals.git"
10647
+ },
10648
+ homepage: "https://github.com/hasna/open-evals",
10649
+ bugs: {
10650
+ url: "https://github.com/hasna/open-evals/issues"
10651
+ },
10652
+ engines: {
10653
+ bun: ">=1.0.0"
10654
+ },
10655
+ author: "Andrei Hasna <andrei@hasna.com>",
10656
+ license: "Apache-2.0",
10657
+ dependencies: {
10658
+ "@anthropic-ai/sdk": "^0.82.0",
10659
+ "@hasna/cloud": "^0.1.30",
10660
+ "@modelcontextprotocol/sdk": "^1.29.0",
10661
+ ajv: "^8.18.0",
10662
+ chalk: "^5.4.1",
10663
+ commander: "^14.0.3",
10664
+ openai: "^6.33.0",
10665
+ zod: "^4.3.6"
10666
+ },
10667
+ devDependencies: {
10668
+ "@types/bun": "^1.2.4",
10669
+ typescript: "^6.0.2"
10670
+ }
10671
+ };
10672
+
10673
+ // src/mcp/server.ts
10674
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
10592
10675
  import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
10593
10676
 
10594
10677
  // node_modules/zod/v4/classic/external.js
@@ -30881,284 +30964,383 @@ function compareRuns(before, after) {
30881
30964
  };
30882
30965
  }
30883
30966
 
30884
- // src/mcp/index.ts
30967
+ // src/mcp/server.ts
30885
30968
  init_store();
30886
30969
  import { writeFileSync, appendFileSync } from "fs";
30887
30970
  var pkg = await Bun.file(new URL("../../package.json", import.meta.url)).json();
30888
- var server = new Server({ name: "evals", version: pkg.version }, { capabilities: { tools: {} } });
30889
- var AdapterSchema = exports_external.object({
30890
- type: exports_external.enum(["http", "anthropic", "openai", "mcp", "function", "cli"]),
30891
- url: exports_external.string().optional(),
30892
- model: exports_external.string().optional(),
30893
- systemPrompt: exports_external.string().optional(),
30894
- command: exports_external.array(exports_external.string()).optional(),
30895
- tool: exports_external.string().optional(),
30896
- modulePath: exports_external.string().optional()
30897
- }).passthrough();
30898
- var tools = [
30899
- {
30900
- name: "evals_run",
30901
- description: "Run a full eval dataset against an app and return results",
30902
- inputSchema: {
30903
- type: "object",
30904
- properties: {
30905
- dataset: { type: "string", description: "Path to JSONL/JSON dataset" },
30906
- adapter: { type: "object", description: "Adapter config (type, url/model/command, etc.)" },
30907
- concurrency: { type: "number", description: "Parallel execution limit (default: 5)" },
30908
- skip_judge: { type: "boolean", description: "Skip LLM judge, run assertions only" },
30909
- tags: { type: "array", items: { type: "string" }, description: "Filter cases by tags" },
30910
- save: { type: "boolean", description: "Save run to database" },
30911
- output_format: { type: "string", enum: ["json", "markdown", "summary"], description: "Output format" }
30912
- },
30913
- required: ["dataset", "adapter"]
30914
- }
30915
- },
30916
- {
30917
- name: "evals_run_single",
30918
- description: "Run a single eval case ad-hoc \u2014 useful for agents to verify their own output quality",
30919
- inputSchema: {
30920
- type: "object",
30921
- properties: {
30922
- input: { type: "string", description: "Input to the AI app" },
30923
- output: { type: "string", description: "AI's response to evaluate" },
30924
- rubric: { type: "string", description: "Plain-English grading criteria" },
30925
- expected: { type: "string", description: "Expected behavior description" },
30926
- assertions: { type: "array", description: "Optional deterministic assertions" },
30927
- judge_model: { type: "string", description: "Judge model (default: claude-sonnet-4-6)" },
30928
- judge_provider: { type: "string", enum: ["anthropic", "openai"] }
30929
- },
30930
- required: ["input", "output", "rubric"]
30931
- }
30932
- },
30933
- {
30934
- name: "evals_judge",
30935
- description: "One-shot LLM judge \u2014 no dataset needed",
30936
- inputSchema: {
30937
- type: "object",
30938
- properties: {
30939
- input: { type: "string" },
30940
- output: { type: "string" },
30941
- rubric: { type: "string" },
30942
- expected: { type: "string" },
30943
- model: { type: "string" },
30944
- provider: { type: "string", enum: ["anthropic", "openai"] }
30945
- },
30946
- required: ["input", "output", "rubric"]
30947
- }
30948
- },
30949
- {
30950
- name: "evals_list_datasets",
30951
- description: "List available JSONL datasets in a directory",
30952
- inputSchema: {
30953
- type: "object",
30954
- properties: {
30955
- directory: { type: "string", description: "Directory to search (default: ./datasets)" }
30971
+ var MCP_NAME = "evals";
30972
+ var DEFAULT_MCP_HTTP_PORT = 8817;
30973
+ function buildServer() {
30974
+ const server = new Server({ name: MCP_NAME, version: pkg.version }, { capabilities: { tools: {} } });
30975
+ const AdapterSchema = exports_external.object({
30976
+ type: exports_external.enum(["http", "anthropic", "openai", "mcp", "function", "cli"]),
30977
+ url: exports_external.string().optional(),
30978
+ model: exports_external.string().optional(),
30979
+ systemPrompt: exports_external.string().optional(),
30980
+ command: exports_external.array(exports_external.string()).optional(),
30981
+ tool: exports_external.string().optional(),
30982
+ modulePath: exports_external.string().optional()
30983
+ }).passthrough();
30984
+ const tools = [
30985
+ {
30986
+ name: "evals_run",
30987
+ description: "Run a full eval dataset against an app and return results",
30988
+ inputSchema: {
30989
+ type: "object",
30990
+ properties: {
30991
+ dataset: { type: "string", description: "Path to JSONL/JSON dataset" },
30992
+ adapter: { type: "object", description: "Adapter config (type, url/model/command, etc.)" },
30993
+ concurrency: { type: "number", description: "Parallel execution limit (default: 5)" },
30994
+ skip_judge: { type: "boolean", description: "Skip LLM judge, run assertions only" },
30995
+ tags: { type: "array", items: { type: "string" }, description: "Filter cases by tags" },
30996
+ save: { type: "boolean", description: "Save run to database" },
30997
+ output_format: { type: "string", enum: ["json", "markdown", "summary"], description: "Output format" }
30998
+ },
30999
+ required: ["dataset", "adapter"]
30956
31000
  }
30957
- }
30958
- },
30959
- {
30960
- name: "evals_get_results",
30961
- description: "Get results for a past eval run",
30962
- inputSchema: {
30963
- type: "object",
30964
- properties: {
30965
- run_id: { type: "string", description: "Run ID or partial ID" },
30966
- format: { type: "string", enum: ["json", "markdown", "summary"] },
30967
- limit: { type: "number", description: "Max runs to list if no run_id given" }
31001
+ },
31002
+ {
31003
+ name: "evals_run_single",
31004
+ description: "Run a single eval case ad-hoc \u2014 useful for agents to verify their own output quality",
31005
+ inputSchema: {
31006
+ type: "object",
31007
+ properties: {
31008
+ input: { type: "string", description: "Input to the AI app" },
31009
+ output: { type: "string", description: "AI's response to evaluate" },
31010
+ rubric: { type: "string", description: "Plain-English grading criteria" },
31011
+ expected: { type: "string", description: "Expected behavior description" },
31012
+ assertions: { type: "array", description: "Optional deterministic assertions" },
31013
+ judge_model: { type: "string", description: "Judge model (default: claude-sonnet-4-6)" },
31014
+ judge_provider: { type: "string", enum: ["anthropic", "openai"] }
31015
+ },
31016
+ required: ["input", "output", "rubric"]
31017
+ }
31018
+ },
31019
+ {
31020
+ name: "evals_judge",
31021
+ description: "One-shot LLM judge \u2014 no dataset needed",
31022
+ inputSchema: {
31023
+ type: "object",
31024
+ properties: {
31025
+ input: { type: "string" },
31026
+ output: { type: "string" },
31027
+ rubric: { type: "string" },
31028
+ expected: { type: "string" },
31029
+ model: { type: "string" },
31030
+ provider: { type: "string", enum: ["anthropic", "openai"] }
31031
+ },
31032
+ required: ["input", "output", "rubric"]
31033
+ }
31034
+ },
31035
+ {
31036
+ name: "evals_list_datasets",
31037
+ description: "List available JSONL datasets in a directory",
31038
+ inputSchema: {
31039
+ type: "object",
31040
+ properties: {
31041
+ directory: { type: "string", description: "Directory to search (default: ./datasets)" }
31042
+ }
31043
+ }
31044
+ },
31045
+ {
31046
+ name: "evals_get_results",
31047
+ description: "Get results for a past eval run",
31048
+ inputSchema: {
31049
+ type: "object",
31050
+ properties: {
31051
+ run_id: { type: "string", description: "Run ID or partial ID" },
31052
+ format: { type: "string", enum: ["json", "markdown", "summary"] },
31053
+ limit: { type: "number", description: "Max runs to list if no run_id given" }
31054
+ }
31055
+ }
31056
+ },
31057
+ {
31058
+ name: "evals_compare",
31059
+ description: "Compare two eval runs \u2014 show regressions and improvements",
31060
+ inputSchema: {
31061
+ type: "object",
31062
+ properties: {
31063
+ before: { type: "string", description: "Before run ID or baseline name" },
31064
+ after: { type: "string", description: "After run ID" }
31065
+ },
31066
+ required: ["before", "after"]
31067
+ }
31068
+ },
31069
+ {
31070
+ name: "evals_create_case",
31071
+ description: "Add a new eval case to a dataset file",
31072
+ inputSchema: {
31073
+ type: "object",
31074
+ properties: {
31075
+ dataset: { type: "string", description: "Path to JSONL file to append to" },
31076
+ id: { type: "string", description: "Unique case ID" },
31077
+ input: { type: "string" },
31078
+ expected: { type: "string" },
31079
+ rubric: { type: "string", description: "Judge rubric for this case" },
31080
+ assertions: { type: "array" },
31081
+ tags: { type: "array", items: { type: "string" } }
31082
+ },
31083
+ required: ["dataset", "id", "input"]
31084
+ }
31085
+ },
31086
+ {
31087
+ name: "evals_generate_cases",
31088
+ description: "Auto-generate eval cases from a description using Claude",
31089
+ inputSchema: {
31090
+ type: "object",
31091
+ properties: {
31092
+ description: { type: "string", description: "What behavior to test" },
31093
+ count: { type: "number", description: "Number of cases to generate (default: 10)" },
31094
+ output: { type: "string", description: "Output JSONL path" },
31095
+ seeds: { type: "string", description: "Path to seed examples JSONL" }
31096
+ },
31097
+ required: ["description"]
30968
31098
  }
30969
31099
  }
30970
- },
30971
- {
30972
- name: "evals_compare",
30973
- description: "Compare two eval runs \u2014 show regressions and improvements",
30974
- inputSchema: {
30975
- type: "object",
30976
- properties: {
30977
- before: { type: "string", description: "Before run ID or baseline name" },
30978
- after: { type: "string", description: "After run ID" }
30979
- },
30980
- required: ["before", "after"]
30981
- }
30982
- },
30983
- {
30984
- name: "evals_create_case",
30985
- description: "Add a new eval case to a dataset file",
30986
- inputSchema: {
30987
- type: "object",
30988
- properties: {
30989
- dataset: { type: "string", description: "Path to JSONL file to append to" },
30990
- id: { type: "string", description: "Unique case ID" },
30991
- input: { type: "string" },
30992
- expected: { type: "string" },
30993
- rubric: { type: "string", description: "Judge rubric for this case" },
30994
- assertions: { type: "array" },
30995
- tags: { type: "array", items: { type: "string" } }
30996
- },
30997
- required: ["dataset", "id", "input"]
30998
- }
30999
- },
31000
- {
31001
- name: "evals_generate_cases",
31002
- description: "Auto-generate eval cases from a description using Claude",
31003
- inputSchema: {
31004
- type: "object",
31005
- properties: {
31006
- description: { type: "string", description: "What behavior to test" },
31007
- count: { type: "number", description: "Number of cases to generate (default: 10)" },
31008
- output: { type: "string", description: "Output JSONL path" },
31009
- seeds: { type: "string", description: "Path to seed examples JSONL" }
31010
- },
31011
- required: ["description"]
31012
- }
31013
- }
31014
- ];
31015
- server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools }));
31016
- server.setRequestHandler(CallToolRequestSchema, async (req) => {
31017
- const { name, arguments: args } = req.params;
31018
- const a = args ?? {};
31019
- try {
31020
- switch (name) {
31021
- case "evals_run": {
31022
- const adapter = AdapterSchema.parse(a["adapter"]);
31023
- const { cases } = await loadDataset(String(a["dataset"]), {
31024
- tags: a["tags"]
31025
- });
31026
- const run = await runEvals(cases, {
31027
- dataset: String(a["dataset"]),
31028
- adapter,
31029
- concurrency: Number(a["concurrency"] ?? 5),
31030
- skipJudge: Boolean(a["skip_judge"])
31031
- });
31032
- if (a["save"])
31033
- saveRun(run);
31034
- const fmt = String(a["output_format"] ?? "summary");
31035
- const output = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%) in ${run.stats.totalDurationMs}ms. Run ID: ${run.id.slice(0, 8)}`;
31036
- return { content: [{ type: "text", text: output }] };
31037
- }
31038
- case "evals_run_single": {
31039
- const evalCase = {
31040
- id: "mcp-single",
31041
- input: String(a["input"]),
31042
- expected: a["expected"] ? String(a["expected"]) : undefined,
31043
- assertions: a["assertions"] ?? [],
31044
- judge: {
31100
+ ];
31101
+ server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools }));
31102
+ server.setRequestHandler(CallToolRequestSchema, async (req) => {
31103
+ const { name, arguments: args } = req.params;
31104
+ const a = args ?? {};
31105
+ try {
31106
+ switch (name) {
31107
+ case "evals_run": {
31108
+ const adapter = AdapterSchema.parse(a["adapter"]);
31109
+ const { cases } = await loadDataset(String(a["dataset"]), {
31110
+ tags: a["tags"]
31111
+ });
31112
+ const run = await runEvals(cases, {
31113
+ dataset: String(a["dataset"]),
31114
+ adapter,
31115
+ concurrency: Number(a["concurrency"] ?? 5),
31116
+ skipJudge: Boolean(a["skip_judge"])
31117
+ });
31118
+ if (a["save"])
31119
+ saveRun(run);
31120
+ const fmt = String(a["output_format"] ?? "summary");
31121
+ const output = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%) in ${run.stats.totalDurationMs}ms. Run ID: ${run.id.slice(0, 8)}`;
31122
+ return { content: [{ type: "text", text: output }] };
31123
+ }
31124
+ case "evals_run_single": {
31125
+ const evalCase = {
31126
+ id: "mcp-single",
31127
+ input: String(a["input"]),
31128
+ expected: a["expected"] ? String(a["expected"]) : undefined,
31129
+ assertions: a["assertions"] ?? [],
31130
+ judge: {
31131
+ rubric: String(a["rubric"]),
31132
+ model: a["judge_model"] ? String(a["judge_model"]) : undefined,
31133
+ provider: a["judge_provider"]
31134
+ }
31135
+ };
31136
+ const mockAdapter = { type: "function", modulePath: "__mock__" };
31137
+ const judgeResult = await judgeOnce({
31138
+ input: String(a["input"]),
31139
+ output: String(a["output"]),
31045
31140
  rubric: String(a["rubric"]),
31141
+ expected: a["expected"] ? String(a["expected"]) : undefined,
31046
31142
  model: a["judge_model"] ? String(a["judge_model"]) : undefined,
31047
31143
  provider: a["judge_provider"]
31048
- }
31049
- };
31050
- const mockAdapter = { type: "function", modulePath: "__mock__" };
31051
- const judgeResult = await judgeOnce({
31052
- input: String(a["input"]),
31053
- output: String(a["output"]),
31054
- rubric: String(a["rubric"]),
31055
- expected: a["expected"] ? String(a["expected"]) : undefined,
31056
- model: a["judge_model"] ? String(a["judge_model"]) : undefined,
31057
- provider: a["judge_provider"]
31058
- });
31059
- return {
31060
- content: [{
31061
- type: "text",
31062
- text: `VERDICT: ${judgeResult.verdict}
31144
+ });
31145
+ return {
31146
+ content: [{
31147
+ type: "text",
31148
+ text: `VERDICT: ${judgeResult.verdict}
31063
31149
  REASONING: ${judgeResult.reasoning}`
31064
- }]
31065
- };
31066
- }
31067
- case "evals_judge": {
31068
- const r = await judgeOnce({
31069
- input: String(a["input"]),
31070
- output: String(a["output"]),
31071
- rubric: String(a["rubric"]),
31072
- expected: a["expected"] ? String(a["expected"]) : undefined,
31073
- model: a["model"] ? String(a["model"]) : undefined,
31074
- provider: a["provider"]
31075
- });
31076
- return { content: [{ type: "text", text: `${r.verdict}
31150
+ }]
31151
+ };
31152
+ }
31153
+ case "evals_judge": {
31154
+ const r = await judgeOnce({
31155
+ input: String(a["input"]),
31156
+ output: String(a["output"]),
31157
+ rubric: String(a["rubric"]),
31158
+ expected: a["expected"] ? String(a["expected"]) : undefined,
31159
+ model: a["model"] ? String(a["model"]) : undefined,
31160
+ provider: a["provider"]
31161
+ });
31162
+ return { content: [{ type: "text", text: `${r.verdict}
31077
31163
  ${r.reasoning}` }] };
31078
- }
31079
- case "evals_list_datasets": {
31080
- const dir = String(a["directory"] ?? "./datasets");
31081
- const files = [];
31082
- for await (const f of new Bun.Glob(`${dir}/**/*.jsonl`).scan("."))
31083
- files.push(f);
31084
- for await (const f of new Bun.Glob(`${dir}/**/*.json`).scan("."))
31085
- files.push(f);
31086
- return { content: [{ type: "text", text: files.length > 0 ? files.join(`
31164
+ }
31165
+ case "evals_list_datasets": {
31166
+ const dir = String(a["directory"] ?? "./datasets");
31167
+ const files = [];
31168
+ for await (const f of new Bun.Glob(`${dir}/**/*.jsonl`).scan("."))
31169
+ files.push(f);
31170
+ for await (const f of new Bun.Glob(`${dir}/**/*.json`).scan("."))
31171
+ files.push(f);
31172
+ return { content: [{ type: "text", text: files.length > 0 ? files.join(`
31087
31173
  `) : "No datasets found" }] };
31088
- }
31089
- case "evals_get_results": {
31090
- if (a["run_id"]) {
31091
- const run = getRun(String(a["run_id"]));
31092
- if (!run)
31093
- return { content: [{ type: "text", text: "Run not found" }] };
31094
- const fmt = String(a["format"] ?? "summary");
31095
- const text = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `Run ${run.id.slice(0, 8)}: ${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%)`;
31096
- return { content: [{ type: "text", text }] };
31097
- } else {
31098
- const runs = listRuns(Number(a["limit"] ?? 10));
31099
- const summary = runs.map((r) => `${r.id.slice(0, 8)} | ${r.createdAt.slice(0, 10)} | ${r.dataset} | ${r.stats.passed}/${r.stats.total} passed`).join(`
31174
+ }
31175
+ case "evals_get_results": {
31176
+ if (a["run_id"]) {
31177
+ const run = getRun(String(a["run_id"]));
31178
+ if (!run)
31179
+ return { content: [{ type: "text", text: "Run not found" }] };
31180
+ const fmt = String(a["format"] ?? "summary");
31181
+ const text = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `Run ${run.id.slice(0, 8)}: ${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%)`;
31182
+ return { content: [{ type: "text", text }] };
31183
+ } else {
31184
+ const runs = listRuns(Number(a["limit"] ?? 10));
31185
+ const summary = runs.map((r) => `${r.id.slice(0, 8)} | ${r.createdAt.slice(0, 10)} | ${r.dataset} | ${r.stats.passed}/${r.stats.total} passed`).join(`
31100
31186
  `);
31101
- return { content: [{ type: "text", text: summary || "No runs found" }] };
31102
- }
31103
- }
31104
- case "evals_compare": {
31105
- const { getBaseline: getBaseline2 } = await Promise.resolve().then(() => (init_store(), exports_store));
31106
- const before = getRun(String(a["before"])) ?? getBaseline2(String(a["before"]));
31107
- const after = getRun(String(a["after"])) ?? getBaseline2(String(a["after"]));
31108
- if (!before || !after)
31109
- return { content: [{ type: "text", text: "Run(s) not found" }] };
31110
- const diff = compareRuns(before, after);
31111
- const lines = [
31112
- `Score delta: ${diff.passRateDelta >= 0 ? "+" : ""}${(diff.passRateDelta * 100).toFixed(1)}%`,
31113
- ...diff.regressions.map((r) => `\u2193 REGRESSION: ${r.caseId} (${r.before} \u2192 ${r.after})`),
31114
- ...diff.improvements.map((i) => `\u2191 IMPROVEMENT: ${i.caseId} (${i.before} \u2192 ${i.after})`)
31115
- ];
31116
- return { content: [{ type: "text", text: lines.join(`
31187
+ return { content: [{ type: "text", text: summary || "No runs found" }] };
31188
+ }
31189
+ }
31190
+ case "evals_compare": {
31191
+ const { getBaseline: getBaseline2 } = await Promise.resolve().then(() => (init_store(), exports_store));
31192
+ const before = getRun(String(a["before"])) ?? getBaseline2(String(a["before"]));
31193
+ const after = getRun(String(a["after"])) ?? getBaseline2(String(a["after"]));
31194
+ if (!before || !after)
31195
+ return { content: [{ type: "text", text: "Run(s) not found" }] };
31196
+ const diff = compareRuns(before, after);
31197
+ const lines = [
31198
+ `Score delta: ${diff.passRateDelta >= 0 ? "+" : ""}${(diff.passRateDelta * 100).toFixed(1)}%`,
31199
+ ...diff.regressions.map((r) => `\u2193 REGRESSION: ${r.caseId} (${r.before} \u2192 ${r.after})`),
31200
+ ...diff.improvements.map((i) => `\u2191 IMPROVEMENT: ${i.caseId} (${i.before} \u2192 ${i.after})`)
31201
+ ];
31202
+ return { content: [{ type: "text", text: lines.join(`
31117
31203
  `) }] };
31118
- }
31119
- case "evals_create_case": {
31120
- const evalCase = {
31121
- id: String(a["id"]),
31122
- input: String(a["input"]),
31123
- expected: a["expected"] ? String(a["expected"]) : undefined,
31124
- judge: a["rubric"] ? { rubric: String(a["rubric"]) } : undefined,
31125
- assertions: a["assertions"] ?? undefined,
31126
- tags: a["tags"] ?? undefined
31127
- };
31128
- appendFileSync(String(a["dataset"]), JSON.stringify(evalCase) + `
31204
+ }
31205
+ case "evals_create_case": {
31206
+ const evalCase = {
31207
+ id: String(a["id"]),
31208
+ input: String(a["input"]),
31209
+ expected: a["expected"] ? String(a["expected"]) : undefined,
31210
+ judge: a["rubric"] ? { rubric: String(a["rubric"]) } : undefined,
31211
+ assertions: a["assertions"] ?? undefined,
31212
+ tags: a["tags"] ?? undefined
31213
+ };
31214
+ appendFileSync(String(a["dataset"]), JSON.stringify(evalCase) + `
31129
31215
  `);
31130
- return { content: [{ type: "text", text: `Case "${evalCase.id}" appended to ${a["dataset"]}` }] };
31131
- }
31132
- case "evals_generate_cases": {
31133
- const Anthropic2 = (await Promise.resolve().then(() => (init_sdk(), exports_sdk))).default;
31134
- const client = new Anthropic2;
31135
- const count = Number(a["count"] ?? 10);
31136
- const res = await client.messages.create({
31137
- model: "claude-sonnet-4-6",
31138
- max_tokens: 4096,
31139
- temperature: 1,
31140
- system: "Generate eval cases as JSONL. Each line: {id, input, expected, judge: {rubric}, tags}. Output only JSONL lines.",
31141
- messages: [{ role: "user", content: `Generate ${count} eval cases for: ${a["description"]}` }]
31142
- });
31143
- const text = res.content.filter((b) => b.type === "text").map((b) => b.text).join(`
31216
+ return { content: [{ type: "text", text: `Case "${evalCase.id}" appended to ${a["dataset"]}` }] };
31217
+ }
31218
+ case "evals_generate_cases": {
31219
+ const Anthropic2 = (await Promise.resolve().then(() => (init_sdk(), exports_sdk))).default;
31220
+ const client = new Anthropic2;
31221
+ const count = Number(a["count"] ?? 10);
31222
+ const res = await client.messages.create({
31223
+ model: "claude-sonnet-4-6",
31224
+ max_tokens: 4096,
31225
+ temperature: 1,
31226
+ system: "Generate eval cases as JSONL. Each line: {id, input, expected, judge: {rubric}, tags}. Output only JSONL lines.",
31227
+ messages: [{ role: "user", content: `Generate ${count} eval cases for: ${a["description"]}` }]
31228
+ });
31229
+ const text = res.content.filter((b) => b.type === "text").map((b) => b.text).join(`
31144
31230
  `);
31145
- const lines = text.split(`
31231
+ const lines = text.split(`
31146
31232
  `).filter((l) => l.trim().startsWith("{"));
31147
- const output = String(a["output"] ?? "generated.jsonl");
31148
- writeFileSync(output, lines.join(`
31233
+ const output = String(a["output"] ?? "generated.jsonl");
31234
+ writeFileSync(output, lines.join(`
31149
31235
  `) + `
31150
31236
  `);
31151
- return { content: [{ type: "text", text: `Generated ${lines.length} cases \u2192 ${output}` }] };
31237
+ return { content: [{ type: "text", text: `Generated ${lines.length} cases \u2192 ${output}` }] };
31238
+ }
31239
+ default:
31240
+ return { content: [{ type: "text", text: `Unknown tool: ${name}` }] };
31152
31241
  }
31153
- default:
31154
- return { content: [{ type: "text", text: `Unknown tool: ${name}` }] };
31242
+ } catch (err) {
31243
+ return {
31244
+ content: [{ type: "text", text: `Error: ${err instanceof Error ? err.message : String(err)}` }],
31245
+ isError: true
31246
+ };
31155
31247
  }
31156
- } catch (err) {
31157
- return {
31158
- content: [{ type: "text", text: `Error: ${err instanceof Error ? err.message : String(err)}` }],
31159
- isError: true
31160
- };
31248
+ });
31249
+ return server;
31250
+ }
31251
+
31252
+ // src/mcp/http.ts
31253
+ import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
31254
+ function isHttpMode(argv = process.argv.slice(2)) {
31255
+ return argv.includes("--http") || process.env["MCP_HTTP"] === "1";
31256
+ }
31257
+ function resolveHttpPort(argv = process.argv.slice(2)) {
31258
+ for (let i = 0;i < argv.length; i++) {
31259
+ const arg = argv[i];
31260
+ if (arg === "--port" || arg === "-p") {
31261
+ const raw = argv[i + 1];
31262
+ if (!raw)
31263
+ throw new Error(`Invalid port: ${raw ?? ""}`);
31264
+ return parsePort(raw, "port");
31265
+ }
31266
+ }
31267
+ const fromEnv = process.env["MCP_HTTP_PORT"];
31268
+ if (fromEnv)
31269
+ return parsePort(fromEnv, "MCP_HTTP_PORT");
31270
+ return DEFAULT_MCP_HTTP_PORT;
31271
+ }
31272
+ function parsePort(raw, label) {
31273
+ const value = Number(raw);
31274
+ if (!Number.isInteger(value) || value < 1 || value > 65535) {
31275
+ throw new Error(`Invalid ${label}: ${raw}`);
31276
+ }
31277
+ return value;
31278
+ }
31279
+ async function handleMcpHttpRequest(req) {
31280
+ const url2 = new URL(req.url);
31281
+ if (url2.pathname === "/health" && req.method === "GET") {
31282
+ return Response.json({ status: "ok", name: MCP_NAME });
31161
31283
  }
31284
+ if (url2.pathname === "/mcp") {
31285
+ const transport = new WebStandardStreamableHTTPServerTransport({
31286
+ sessionIdGenerator: undefined
31287
+ });
31288
+ const server = buildServer();
31289
+ await server.connect(transport);
31290
+ return transport.handleRequest(req);
31291
+ }
31292
+ return new Response("Not Found", { status: 404 });
31293
+ }
31294
+ function startHttpServer(options = {}) {
31295
+ const port = options.port ?? DEFAULT_MCP_HTTP_PORT;
31296
+ const hostname3 = options.hostname ?? "127.0.0.1";
31297
+ const log2 = options.log ?? console.error;
31298
+ const server = Bun.serve({
31299
+ port,
31300
+ hostname: hostname3,
31301
+ fetch: handleMcpHttpRequest
31302
+ });
31303
+ const address = `http://${hostname3}:${server.port}`;
31304
+ log2(`${MCP_NAME}-mcp HTTP listening on ${address}/mcp (health: ${address}/health)`);
31305
+ return server;
31306
+ }
31307
+
31308
+ // src/mcp/index.ts
31309
+ function printHelp() {
31310
+ console.log(`Usage: evals-mcp [options]
31311
+
31312
+ Runs the @hasna/evals MCP server (stdio by default).
31313
+
31314
+ Options:
31315
+ --http Serve MCP over Streamable HTTP on 127.0.0.1
31316
+ -p, --port <port> HTTP port (default: MCP_HTTP_PORT or 8817)
31317
+ -V, --version output the version number
31318
+ -h, --help display help for command
31319
+
31320
+ Environment:
31321
+ MCP_HTTP=1 Enable HTTP mode
31322
+ MCP_HTTP_PORT Override default HTTP port (8817)`);
31323
+ }
31324
+ var args = process.argv.slice(2);
31325
+ if (args.includes("--help") || args.includes("-h")) {
31326
+ printHelp();
31327
+ process.exit(0);
31328
+ }
31329
+ if (args.includes("--version") || args.includes("-V")) {
31330
+ console.log(package_default.version);
31331
+ process.exit(0);
31332
+ }
31333
+ async function main() {
31334
+ if (isHttpMode(args)) {
31335
+ startHttpServer({ port: resolveHttpPort(args) });
31336
+ await new Promise(() => {});
31337
+ return;
31338
+ }
31339
+ const server = buildServer();
31340
+ const transport = new StdioServerTransport;
31341
+ await server.connect(transport);
31342
+ }
31343
+ main().catch((error50) => {
31344
+ console.error("MCP server error:", error50);
31345
+ process.exit(1);
31162
31346
  });
31163
- var transport = new StdioServerTransport;
31164
- await server.connect(transport);
@@ -0,0 +1,5 @@
1
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
2
+ export declare const MCP_NAME = "evals";
3
+ export declare const DEFAULT_MCP_HTTP_PORT = 8817;
4
+ export declare function buildServer(): Server;
5
+ //# sourceMappingURL=server.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../../src/mcp/server.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,2CAA2C,CAAC;AAanE,eAAO,MAAM,QAAQ,UAAU,CAAC;AAChC,eAAO,MAAM,qBAAqB,OAAO,CAAC;AAE1C,wBAAgB,WAAW,IAAI,MAAM,CAmSpC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hasna/evals",
3
- "version": "0.1.20",
3
+ "version": "0.1.22",
4
4
  "description": "Open source AI evaluation framework — LLM-as-judge + assertion-based evals for any AI app. CLI + MCP server.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",