npm - @hasna/evals - Versions diffs - 0.1.20 → 0.1.22 - Mend

@hasna/evals 0.1.20 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/LICENSE +2 -1
package/README.md +13 -0
package/dist/cli/commands/compare.d.ts +2 -0
package/dist/cli/commands/compare.d.ts.map +1 -1
package/dist/cli/commands/compare.test.d.ts +2 -0
package/dist/cli/commands/compare.test.d.ts.map +1 -0
package/dist/cli/commands/doctor.d.ts.map +1 -1
package/dist/cli/commands/generate.d.ts.map +1 -1
package/dist/cli/index.js +57 -14
package/dist/mcp/http.d.ts +12 -0
package/dist/mcp/http.d.ts.map +1 -0
package/dist/mcp/http.test.d.ts +2 -0
package/dist/mcp/http.test.d.ts.map +1 -0
package/dist/mcp/index.js +441 -259
package/dist/mcp/server.d.ts +5 -0
package/dist/mcp/server.d.ts.map +1 -0
package/package.json +1 -1

package/LICENSE CHANGED Viewed

@@ -1,3 +1,4 @@
                                  Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
@@ -175,7 +176,7 @@
    END OF TERMS AND CONDITIONS
-   Copyright 2026 hasna
+   Copyright 2026 Hasna, Inc.
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

package/README.md CHANGED Viewed

@@ -240,6 +240,19 @@ evals_run_single(
 → PASS — The response correctly identifies Paris.
 ```
+## HTTP mode
+Shared Streamable HTTP transport for multi-agent sessions (stdio remains the default):
+```bash
+evals-mcp --http              # http://127.0.0.1:8817/mcp
+MCP_HTTP=1 evals-mcp          # same
+evals-mcp --http --port 8817  # explicit port
+```
+- Health: `GET http://127.0.0.1:8817/health` → `{"status":"ok","name":"evals"}`
+- Override port with `MCP_HTTP_PORT` or `--port`
 ---
 ## License

package/dist/cli/commands/compare.d.ts CHANGED Viewed

@@ -1,3 +1,5 @@
 import { Command } from "commander";
+import { compareRuns } from "../../core/reporter.js";
+export declare function renderMarkdownDiff(diff: ReturnType<typeof compareRuns>): string;
 export declare function compareCommand(): Command;
 //# sourceMappingURL=compare.d.ts.map

package/dist/cli/commands/compare.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"compare.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAIpC,wBAAgB,cAAc,IAAI,OAAO,CAgCxC"}
1	+ {"version":3,"file":"compare.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,WAAW,EAA+B,MAAM,wBAAwB,CAAC;AAGlF,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,GAAG,MAAM,CA2B/E;AAED,wBAAgB,cAAc,IAAI,OAAO,CAgCxC"}

package/dist/cli/commands/compare.test.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export {};
2	+ //# sourceMappingURL=compare.test.d.ts.map

package/dist/cli/commands/compare.test.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"compare.test.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/compare.test.ts"],"names":[],"mappings":""}

package/dist/cli/commands/doctor.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"doctor.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/doctor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAqCpC,wBAAgB,aAAa,IAAI,OAAO,CA4EvC"}
1	+ {"version":3,"file":"doctor.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/doctor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAqCpC,wBAAgB,aAAa,IAAI,OAAO,CAqFvC"}

package/dist/cli/commands/generate.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"generate.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/generate.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAiBpC,wBAAgB,eAAe,IAAI,OAAO,CA2CzC"}
1	+ {"version":3,"file":"generate.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/generate.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAiBpC,wBAAgB,eAAe,IAAI,OAAO,CAyDzC"}

package/dist/cli/index.js CHANGED Viewed

@@ -30508,7 +30508,7 @@ function parseAdapterConfig(opts) {
 // src/cli/commands/run.ts
 function runCommand() {
-  return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter, default: default)").option("--command <cmd>", "Shell command (for cli adapter, use {{input}} placeholder)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("--json", "Alias for --output json").action(async (dataset, opts) => {
+  return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter, default: default)").option("--command <cmd>", "Shell command (for cli adapter, use {{input}} placeholder)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("-j, --json", "Alias for --output json").action(async (dataset, opts) => {
     const { cases, warnings } = await loadDataset(dataset, {
       tags: opts["tags"] ? opts["tags"].split(",") : undefined
     });
@@ -30548,7 +30548,7 @@ init_loader();
 init_store();
 function ciCommand() {
   const cmd = new Command("ci").description("Run evals in CI mode \u2014 exit non-zero on regression");
-  cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter)").option("--command <cmd>", "Shell command (for cli adapter)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--no-judge", "Skip LLM judge, assertions only").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("--json", "Output JSON").action(async (dataset, opts) => {
+  cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter)").option("--command <cmd>", "Shell command (for cli adapter)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--no-judge", "Skip LLM judge, assertions only").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("-j, --json", "Output JSON").action(async (dataset, opts) => {
     const { cases } = await loadDataset(dataset);
     const adapter = parseAdapterConfig(opts);
     const run = await runEvals(cases, { dataset, adapter, skipJudge: opts["judge"] === false || opts["noJudge"] === true });
@@ -30596,7 +30596,7 @@ No baseline "${baselineName}" found \u2014 use "evals ci set-baseline" to create
 // src/cli/commands/judge.ts
 function judgeCommand() {
-  return new Command("judge").description("Ad-hoc: judge a single input/output pair against a rubric").requiredOption("--input <text>", "The input that was given to the AI").requiredOption("--output <text>", "The AI's response").requiredOption("--rubric <text>", "Plain-English grading criteria").option("--expected <text>", "Expected behavior description").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--provider <provider>", "Judge provider: anthropic|openai", "anthropic").option("--json", "Output JSON").action(async (opts) => {
+  return new Command("judge").description("Ad-hoc: judge a single input/output pair against a rubric").requiredOption("--input <text>", "The input that was given to the AI").requiredOption("--output <text>", "The AI's response").requiredOption("--rubric <text>", "Plain-English grading criteria").option("--expected <text>", "Expected behavior description").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--provider <provider>", "Judge provider: anthropic|openai", "anthropic").option("-j, --json", "Output JSON").action(async (opts) => {
     const result = await judgeOnce({
       input: opts["input"] ?? "",
       output: opts["output"] ?? "",
@@ -30622,8 +30622,34 @@ ${icon} ${result.verdict}
 // src/cli/commands/compare.ts
 init_store();
+function renderMarkdownDiff(diff) {
+  const lines = ["## Diff", ""];
+  if (diff.regressions.length === 0 && diff.improvements.length === 0) {
+    lines.push("- No changes between runs.");
+    return lines.join(`
+`);
+  }
+  if (diff.regressions.length > 0) {
+    lines.push("### Regressions");
+    for (const r of diff.regressions) {
+      lines.push(`- ${r.caseId}: ${r.before} -> ${r.after}`);
+    }
+    lines.push("");
+  }
+  if (diff.improvements.length > 0) {
+    lines.push("### Improvements");
+    for (const i of diff.improvements) {
+      lines.push(`- ${i.caseId}: ${i.before} -> ${i.after}`);
+    }
+    lines.push("");
+  }
+  const delta = diff.passRateDelta * 100;
+  lines.push(`- Score delta: ${delta >= 0 ? "+" : ""}${delta.toFixed(1)}%`);
+  return lines.join(`
+`);
+}
 function compareCommand() {
-  return new Command("compare").description("Compare two eval runs side-by-side").argument("<before>", "Before run ID or baseline name").argument("<after>", "After run ID (or 'latest')").option("--json", "Output JSON diff").option("--markdown", "Output markdown diff").action(async (beforeArg, afterArg, opts) => {
+  return new Command("compare").description("Compare two eval runs side-by-side").argument("<before>", "Before run ID or baseline name").argument("<after>", "After run ID (or 'latest')").option("-j, --json", "Output JSON diff").option("--markdown", "Output markdown diff").action(async (beforeArg, afterArg, opts) => {
     const { listRuns: listRuns2 } = await Promise.resolve().then(() => (init_store(), exports_store));
     const beforeRun = getRun(beforeArg) ?? getBaseline(beforeArg);
     const afterRun = afterArg === "latest" ? listRuns2(1)[0] : getRun(afterArg) ?? getBaseline(afterArg);
@@ -30640,10 +30666,8 @@ function compareCommand() {
       console.log(JSON.stringify(diff, null, 2));
     } else if (opts["markdown"]) {
       console.log(toMarkdown(afterRun));
-      console.log(`
-## Diff
-`);
-      printDiffReport(diff);
+      console.log();
+      console.log(renderMarkdownDiff(diff));
     } else {
       printDiffReport(diff);
     }
@@ -30662,7 +30686,7 @@ var COST_PER_1K_INPUT = {
 };
 var AVG_JUDGE_TOKENS = 800;
 function estimateCommand() {
-  return new Command("estimate").description("Estimate cost before running evals (no API calls made)").argument("<dataset>", "Path to JSONL/JSON dataset").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--no-judge", "Assume no judge calls").option("--json", "Output JSON").action(async (dataset, opts) => {
+  return new Command("estimate").description("Estimate cost before running evals (no API calls made)").argument("<dataset>", "Path to JSONL/JSON dataset").option("--model <model>", "Judge model", "claude-sonnet-4-6").option("--no-judge", "Assume no judge calls").option("-j, --json", "Output JSON").action(async (dataset, opts) => {
     const { cases, warnings } = await loadDataset(dataset);
     if (warnings.length > 0)
       for (const w of warnings)
@@ -30713,7 +30737,7 @@ Each case must be a valid JSON object on a single line with these fields:
 Generate varied cases that cover edge cases, typical usage, and boundary conditions.
 Output ONLY valid JSONL \u2014 one JSON object per line, no markdown, no explanation.`;
 function generateCommand() {
-  return new Command("generate").description("Generate eval cases from a description using Claude").requiredOption("--description <text>", "What behavior to test (e.g. 'refund policy responses')").option("--seeds <path>", "Path to JSONL file with seed examples").option("--count <n>", "Number of cases to generate", "10").option("--output <path>", "Output JSONL file path", "generated.jsonl").option("--model <model>", "Model to use for generation", "claude-sonnet-4-6").action(async (opts) => {
+  return new Command("generate").description("Generate eval cases from a description using Claude").requiredOption("--description <text>", "What behavior to test (e.g. 'refund policy responses')").option("--seeds <path>", "Path to JSONL file with seed examples").option("--count <n>", "Number of cases to generate", "10").option("--output <path>", "Output JSONL file path", "generated.jsonl").option("--model <model>", "Model to use for generation", "claude-sonnet-4-6").option("-j, --json", "Output JSON summary").action(async (opts) => {
     const client = new Anthropic;
     const count = parseInt(opts["count"] ?? "10");
     let seedText = "";
@@ -30746,11 +30770,22 @@ Output ${count} JSONL lines starting with {"id":"gen-001",...}`;
           valid.push(parsed);
       } catch {}
     }
+    const outputPath = opts["output"] ?? "generated.jsonl";
     const output = valid.map((c) => JSON.stringify(c)).join(`
 `);
-    writeFileSync(opts["output"] ?? "generated.jsonl", output + `
+    writeFileSync(outputPath, output + `
 `);
-    console.log(`\x1B[32m\u2713 Generated ${valid.length} cases \u2192 ${opts["output"]}\x1B[0m`);
+    if (opts["json"]) {
+      console.log(JSON.stringify({
+        generated: valid.length,
+        requested: count,
+        output: outputPath,
+        model: opts["model"] ?? "claude-sonnet-4-6",
+        description: opts["description"] ?? ""
+      }, null, 2));
+      return;
+    }
+    console.log(`\x1B[32m\u2713 Generated ${valid.length} cases \u2192 ${outputPath}\x1B[0m`);
   });
 }
@@ -30844,7 +30879,7 @@ function resolveApiKey(envVar, secretsPath, secretsKey) {
   return;
 }
 function doctorCommand() {
-  return new Command("doctor").description("Health check \u2014 verify API keys, DB, and config").action(async () => {
+  return new Command("doctor").description("Health check \u2014 verify API keys, DB, and config").option("-j, --json", "Output JSON").action(async (opts) => {
     const checks = [];
     const anthropicKey = resolveApiKey("ANTHROPIC_API_KEY", "hasnaxyz/anthropic/live.env", "HASNAXYZ_ANTHROPIC_LIVE_API_KEY");
     checks.push({
@@ -30888,6 +30923,15 @@ function doctorCommand() {
     } catch {
       checks.push({ name: "Example dataset (optional)", ok: false, hint: "datasets/examples/smoke.jsonl not found \u2014 install @hasna/evals globally to include examples" });
     }
+    const allOk = checks.every((c) => c.ok || c.name.toLowerCase().includes("optional"));
+    if (opts.json) {
+      console.log(JSON.stringify({
+        ok: allOk,
+        checks,
+        summary: allOk ? "All checks passed." : "Some checks failed \u2014 see hints above."
+      }, null, 2));
+      process.exit(allOk ? 0 : 1);
+    }
     console.log(`
 \x1B[1mevals doctor\x1B[0m
 `);
@@ -30896,7 +30940,6 @@ function doctorCommand() {
       console.log(`  ${icon} ${c.name}${!c.ok && c.hint ? `
       hint: ${c.hint}` : ""}`);
     }
-    const allOk = checks.every((c) => c.ok || c.name.toLowerCase().includes("optional"));
     console.log(allOk ? `
 \x1B[32m  All checks passed.\x1B[0m
 ` : `

package/dist/mcp/http.d.ts ADDED Viewed

@@ -0,0 +1,12 @@
+import { DEFAULT_MCP_HTTP_PORT, MCP_NAME } from "./server.js";
+export { DEFAULT_MCP_HTTP_PORT, MCP_NAME };
+export declare function isHttpMode(argv?: string[]): boolean;
+export declare function resolveHttpPort(argv?: string[]): number;
+export declare function handleMcpHttpRequest(req: Request): Promise<Response>;
+export interface StartHttpServerOptions {
+    port?: number;
+    hostname?: string;
+    log?: (message: string) => void;
+}
+export declare function startHttpServer(options?: StartHttpServerOptions): ReturnType<typeof Bun.serve>;
+//# sourceMappingURL=http.d.ts.map

package/dist/mcp/http.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"http.d.ts","sourceRoot":"","sources":["../../src/mcp/http.ts"],"names":[],"mappings":"AACA,OAAO,EAAe,qBAAqB,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE3E,OAAO,EAAE,qBAAqB,EAAE,QAAQ,EAAE,CAAC;AAE3C,wBAAgB,UAAU,CAAC,IAAI,GAAE,MAAM,EAA0B,GAAG,OAAO,CAE1E;AAED,wBAAgB,eAAe,CAAC,IAAI,GAAE,MAAM,EAA0B,GAAG,MAAM,CAa9E;AAUD,wBAAsB,oBAAoB,CAAC,GAAG,EAAE,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,CAiB1E;AAED,MAAM,WAAW,sBAAsB;IACrC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;CACjC;AAED,wBAAgB,eAAe,CAAC,OAAO,GAAE,sBAA2B,GAAG,UAAU,CAAC,OAAO,GAAG,CAAC,KAAK,CAAC,CAclG"}

package/dist/mcp/http.test.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ export {};
2	+ //# sourceMappingURL=http.test.d.ts.map

package/dist/mcp/http.test.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"http.test.d.ts","sourceRoot":"","sources":["../../src/mcp/http.test.ts"],"names":[],"mappings":""}

package/dist/mcp/index.js CHANGED Viewed

@@ -10587,8 +10587,91 @@ var _db = null;
 var init_store = () => {};
 // src/mcp/index.ts
-import { Server } from "@modelcontextprotocol/sdk/server/index.js";
 import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+// package.json
+var package_default = {
+  name: "@hasna/evals",
+  version: "0.1.22",
+  description: "Open source AI evaluation framework \u2014 LLM-as-judge + assertion-based evals for any AI app. CLI + MCP server.",
+  type: "module",
+  main: "dist/index.js",
+  types: "dist/index.d.ts",
+  bin: {
+    evals: "dist/cli/index.js",
+    "evals-mcp": "dist/mcp/index.js",
+    "evals-serve": "dist/server/index.js"
+  },
+  exports: {
+    ".": {
+      types: "./dist/index.d.ts",
+      import: "./dist/index.js"
+    }
+  },
+  files: [
+    "dist",
+    "dashboard/dist",
+    "datasets/examples",
+    "LICENSE",
+    "README.md"
+  ],
+  scripts: {
+    build: "cd dashboard && bun run build && cd .. && bun build src/cli/index.ts --outdir dist/cli --target bun --external ink --external react --external chalk --external @modelcontextprotocol/sdk && bun build src/mcp/index.ts --outdir dist/mcp --target bun --external @modelcontextprotocol/sdk && bun build src/server/index.ts --outdir dist/server --target bun && bun build src/index.ts --outdir dist --target bun && tsc --emitDeclarationOnly --outDir dist",
+    "build:dashboard": "cd dashboard && bun run build",
+    typecheck: "tsc --noEmit",
+    test: "bun test",
+    "dev:cli": "bun run src/cli/index.ts",
+    "dev:mcp": "bun run src/mcp/index.ts",
+    "dev:serve": "bun run src/server/index.ts",
+    prepublishOnly: "bun run typecheck && bun test && bun run build",
+    postinstall: "mkdir -p $HOME/.hasna/evals 2>/dev/null || true"
+  },
+  keywords: [
+    "evals",
+    "llm",
+    "ai",
+    "testing",
+    "evaluation",
+    "mcp",
+    "claude",
+    "llm-as-judge",
+    "typescript",
+    "cli"
+  ],
+  publishConfig: {
+    registry: "https://registry.npmjs.org",
+    access: "public"
+  },
+  repository: {
+    type: "git",
+    url: "https://github.com/hasna/open-evals.git"
+  },
+  homepage: "https://github.com/hasna/open-evals",
+  bugs: {
+    url: "https://github.com/hasna/open-evals/issues"
+  },
+  engines: {
+    bun: ">=1.0.0"
+  },
+  author: "Andrei Hasna <andrei@hasna.com>",
+  license: "Apache-2.0",
+  dependencies: {
+    "@anthropic-ai/sdk": "^0.82.0",
+    "@hasna/cloud": "^0.1.30",
+    "@modelcontextprotocol/sdk": "^1.29.0",
+    ajv: "^8.18.0",
+    chalk: "^5.4.1",
+    commander: "^14.0.3",
+    openai: "^6.33.0",
+    zod: "^4.3.6"
+  },
+  devDependencies: {
+    "@types/bun": "^1.2.4",
+    typescript: "^6.0.2"
+  }
+};
+// src/mcp/server.ts
+import { Server } from "@modelcontextprotocol/sdk/server/index.js";
 import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
 // node_modules/zod/v4/classic/external.js
@@ -30881,284 +30964,383 @@ function compareRuns(before, after) {
   };
 }
-// src/mcp/index.ts
+// src/mcp/server.ts
 init_store();
 import { writeFileSync, appendFileSync } from "fs";
 var pkg = await Bun.file(new URL("../../package.json", import.meta.url)).json();
-var server = new Server({ name: "evals", version: pkg.version }, { capabilities: { tools: {} } });
-var AdapterSchema = exports_external.object({
-  type: exports_external.enum(["http", "anthropic", "openai", "mcp", "function", "cli"]),
-  url: exports_external.string().optional(),
-  model: exports_external.string().optional(),
-  systemPrompt: exports_external.string().optional(),
-  command: exports_external.array(exports_external.string()).optional(),
-  tool: exports_external.string().optional(),
-  modulePath: exports_external.string().optional()
-}).passthrough();
-var tools = [
-  {
-    name: "evals_run",
-    description: "Run a full eval dataset against an app and return results",
-    inputSchema: {
-      type: "object",
-      properties: {
-        dataset: { type: "string", description: "Path to JSONL/JSON dataset" },
-        adapter: { type: "object", description: "Adapter config (type, url/model/command, etc.)" },
-        concurrency: { type: "number", description: "Parallel execution limit (default: 5)" },
-        skip_judge: { type: "boolean", description: "Skip LLM judge, run assertions only" },
-        tags: { type: "array", items: { type: "string" }, description: "Filter cases by tags" },
-        save: { type: "boolean", description: "Save run to database" },
-        output_format: { type: "string", enum: ["json", "markdown", "summary"], description: "Output format" }
-      },
-      required: ["dataset", "adapter"]
-    }
-  },
-  {
-    name: "evals_run_single",
-    description: "Run a single eval case ad-hoc \u2014 useful for agents to verify their own output quality",
-    inputSchema: {
-      type: "object",
-      properties: {
-        input: { type: "string", description: "Input to the AI app" },
-        output: { type: "string", description: "AI's response to evaluate" },
-        rubric: { type: "string", description: "Plain-English grading criteria" },
-        expected: { type: "string", description: "Expected behavior description" },
-        assertions: { type: "array", description: "Optional deterministic assertions" },
-        judge_model: { type: "string", description: "Judge model (default: claude-sonnet-4-6)" },
-        judge_provider: { type: "string", enum: ["anthropic", "openai"] }
-      },
-      required: ["input", "output", "rubric"]
-    }
-  },
-  {
-    name: "evals_judge",
-    description: "One-shot LLM judge \u2014 no dataset needed",
-    inputSchema: {
-      type: "object",
-      properties: {
-        input: { type: "string" },
-        output: { type: "string" },
-        rubric: { type: "string" },
-        expected: { type: "string" },
-        model: { type: "string" },
-        provider: { type: "string", enum: ["anthropic", "openai"] }
-      },
-      required: ["input", "output", "rubric"]
-    }
-  },
-  {
-    name: "evals_list_datasets",
-    description: "List available JSONL datasets in a directory",
-    inputSchema: {
-      type: "object",
-      properties: {
-        directory: { type: "string", description: "Directory to search (default: ./datasets)" }
+var MCP_NAME = "evals";
+var DEFAULT_MCP_HTTP_PORT = 8817;
+function buildServer() {
+  const server = new Server({ name: MCP_NAME, version: pkg.version }, { capabilities: { tools: {} } });
+  const AdapterSchema = exports_external.object({
+    type: exports_external.enum(["http", "anthropic", "openai", "mcp", "function", "cli"]),
+    url: exports_external.string().optional(),
+    model: exports_external.string().optional(),
+    systemPrompt: exports_external.string().optional(),
+    command: exports_external.array(exports_external.string()).optional(),
+    tool: exports_external.string().optional(),
+    modulePath: exports_external.string().optional()
+  }).passthrough();
+  const tools = [
+    {
+      name: "evals_run",
+      description: "Run a full eval dataset against an app and return results",
+      inputSchema: {
+        type: "object",
+        properties: {
+          dataset: { type: "string", description: "Path to JSONL/JSON dataset" },
+          adapter: { type: "object", description: "Adapter config (type, url/model/command, etc.)" },
+          concurrency: { type: "number", description: "Parallel execution limit (default: 5)" },
+          skip_judge: { type: "boolean", description: "Skip LLM judge, run assertions only" },
+          tags: { type: "array", items: { type: "string" }, description: "Filter cases by tags" },
+          save: { type: "boolean", description: "Save run to database" },
+          output_format: { type: "string", enum: ["json", "markdown", "summary"], description: "Output format" }
+        },
+        required: ["dataset", "adapter"]
       }
-    }
-  },
-  {
-    name: "evals_get_results",
-    description: "Get results for a past eval run",
-    inputSchema: {
-      type: "object",
-      properties: {
-        run_id: { type: "string", description: "Run ID or partial ID" },
-        format: { type: "string", enum: ["json", "markdown", "summary"] },
-        limit: { type: "number", description: "Max runs to list if no run_id given" }
+    },
+    {
+      name: "evals_run_single",
+      description: "Run a single eval case ad-hoc \u2014 useful for agents to verify their own output quality",
+      inputSchema: {
+        type: "object",
+        properties: {
+          input: { type: "string", description: "Input to the AI app" },
+          output: { type: "string", description: "AI's response to evaluate" },
+          rubric: { type: "string", description: "Plain-English grading criteria" },
+          expected: { type: "string", description: "Expected behavior description" },
+          assertions: { type: "array", description: "Optional deterministic assertions" },
+          judge_model: { type: "string", description: "Judge model (default: claude-sonnet-4-6)" },
+          judge_provider: { type: "string", enum: ["anthropic", "openai"] }
+        },
+        required: ["input", "output", "rubric"]
+      }
+    },
+    {
+      name: "evals_judge",
+      description: "One-shot LLM judge \u2014 no dataset needed",
+      inputSchema: {
+        type: "object",
+        properties: {
+          input: { type: "string" },
+          output: { type: "string" },
+          rubric: { type: "string" },
+          expected: { type: "string" },
+          model: { type: "string" },
+          provider: { type: "string", enum: ["anthropic", "openai"] }
+        },
+        required: ["input", "output", "rubric"]
+      }
+    },
+    {
+      name: "evals_list_datasets",
+      description: "List available JSONL datasets in a directory",
+      inputSchema: {
+        type: "object",
+        properties: {
+          directory: { type: "string", description: "Directory to search (default: ./datasets)" }
+        }
+      }
+    },
+    {
+      name: "evals_get_results",
+      description: "Get results for a past eval run",
+      inputSchema: {
+        type: "object",
+        properties: {
+          run_id: { type: "string", description: "Run ID or partial ID" },
+          format: { type: "string", enum: ["json", "markdown", "summary"] },
+          limit: { type: "number", description: "Max runs to list if no run_id given" }
+        }
+      }
+    },
+    {
+      name: "evals_compare",
+      description: "Compare two eval runs \u2014 show regressions and improvements",
+      inputSchema: {
+        type: "object",
+        properties: {
+          before: { type: "string", description: "Before run ID or baseline name" },
+          after: { type: "string", description: "After run ID" }
+        },
+        required: ["before", "after"]
+      }
+    },
+    {
+      name: "evals_create_case",
+      description: "Add a new eval case to a dataset file",
+      inputSchema: {
+        type: "object",
+        properties: {
+          dataset: { type: "string", description: "Path to JSONL file to append to" },
+          id: { type: "string", description: "Unique case ID" },
+          input: { type: "string" },
+          expected: { type: "string" },
+          rubric: { type: "string", description: "Judge rubric for this case" },
+          assertions: { type: "array" },
+          tags: { type: "array", items: { type: "string" } }
+        },
+        required: ["dataset", "id", "input"]
+      }
+    },
+    {
+      name: "evals_generate_cases",
+      description: "Auto-generate eval cases from a description using Claude",
+      inputSchema: {
+        type: "object",
+        properties: {
+          description: { type: "string", description: "What behavior to test" },
+          count: { type: "number", description: "Number of cases to generate (default: 10)" },
+          output: { type: "string", description: "Output JSONL path" },
+          seeds: { type: "string", description: "Path to seed examples JSONL" }
+        },
+        required: ["description"]
       }
     }
-  },
-  {
-    name: "evals_compare",
-    description: "Compare two eval runs \u2014 show regressions and improvements",
-    inputSchema: {
-      type: "object",
-      properties: {
-        before: { type: "string", description: "Before run ID or baseline name" },
-        after: { type: "string", description: "After run ID" }
-      },
-      required: ["before", "after"]
-    }
-  },
-  {
-    name: "evals_create_case",
-    description: "Add a new eval case to a dataset file",
-    inputSchema: {
-      type: "object",
-      properties: {
-        dataset: { type: "string", description: "Path to JSONL file to append to" },
-        id: { type: "string", description: "Unique case ID" },
-        input: { type: "string" },
-        expected: { type: "string" },
-        rubric: { type: "string", description: "Judge rubric for this case" },
-        assertions: { type: "array" },
-        tags: { type: "array", items: { type: "string" } }
-      },
-      required: ["dataset", "id", "input"]
-    }
-  },
-  {
-    name: "evals_generate_cases",
-    description: "Auto-generate eval cases from a description using Claude",
-    inputSchema: {
-      type: "object",
-      properties: {
-        description: { type: "string", description: "What behavior to test" },
-        count: { type: "number", description: "Number of cases to generate (default: 10)" },
-        output: { type: "string", description: "Output JSONL path" },
-        seeds: { type: "string", description: "Path to seed examples JSONL" }
-      },
-      required: ["description"]
-    }
-  }
-];
-server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools }));
-server.setRequestHandler(CallToolRequestSchema, async (req) => {
-  const { name, arguments: args } = req.params;
-  const a = args ?? {};
-  try {
-    switch (name) {
-      case "evals_run": {
-        const adapter = AdapterSchema.parse(a["adapter"]);
-        const { cases } = await loadDataset(String(a["dataset"]), {
-          tags: a["tags"]
-        });
-        const run = await runEvals(cases, {
-          dataset: String(a["dataset"]),
-          adapter,
-          concurrency: Number(a["concurrency"] ?? 5),
-          skipJudge: Boolean(a["skip_judge"])
-        });
-        if (a["save"])
-          saveRun(run);
-        const fmt = String(a["output_format"] ?? "summary");
-        const output = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%) in ${run.stats.totalDurationMs}ms. Run ID: ${run.id.slice(0, 8)}`;
-        return { content: [{ type: "text", text: output }] };
-      }
-      case "evals_run_single": {
-        const evalCase = {
-          id: "mcp-single",
-          input: String(a["input"]),
-          expected: a["expected"] ? String(a["expected"]) : undefined,
-          assertions: a["assertions"] ?? [],
-          judge: {
+  ];
+  server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools }));
+  server.setRequestHandler(CallToolRequestSchema, async (req) => {
+    const { name, arguments: args } = req.params;
+    const a = args ?? {};
+    try {
+      switch (name) {
+        case "evals_run": {
+          const adapter = AdapterSchema.parse(a["adapter"]);
+          const { cases } = await loadDataset(String(a["dataset"]), {
+            tags: a["tags"]
+          });
+          const run = await runEvals(cases, {
+            dataset: String(a["dataset"]),
+            adapter,
+            concurrency: Number(a["concurrency"] ?? 5),
+            skipJudge: Boolean(a["skip_judge"])
+          });
+          if (a["save"])
+            saveRun(run);
+          const fmt = String(a["output_format"] ?? "summary");
+          const output = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%) in ${run.stats.totalDurationMs}ms. Run ID: ${run.id.slice(0, 8)}`;
+          return { content: [{ type: "text", text: output }] };
+        }
+        case "evals_run_single": {
+          const evalCase = {
+            id: "mcp-single",
+            input: String(a["input"]),
+            expected: a["expected"] ? String(a["expected"]) : undefined,
+            assertions: a["assertions"] ?? [],
+            judge: {
+              rubric: String(a["rubric"]),
+              model: a["judge_model"] ? String(a["judge_model"]) : undefined,
+              provider: a["judge_provider"]
+            }
+          };
+          const mockAdapter = { type: "function", modulePath: "__mock__" };
+          const judgeResult = await judgeOnce({
+            input: String(a["input"]),
+            output: String(a["output"]),
             rubric: String(a["rubric"]),
+            expected: a["expected"] ? String(a["expected"]) : undefined,
             model: a["judge_model"] ? String(a["judge_model"]) : undefined,
             provider: a["judge_provider"]
-          }
-        };
-        const mockAdapter = { type: "function", modulePath: "__mock__" };
-        const judgeResult = await judgeOnce({
-          input: String(a["input"]),
-          output: String(a["output"]),
-          rubric: String(a["rubric"]),
-          expected: a["expected"] ? String(a["expected"]) : undefined,
-          model: a["judge_model"] ? String(a["judge_model"]) : undefined,
-          provider: a["judge_provider"]
-        });
-        return {
-          content: [{
-            type: "text",
-            text: `VERDICT: ${judgeResult.verdict}
+          });
+          return {
+            content: [{
+              type: "text",
+              text: `VERDICT: ${judgeResult.verdict}
 REASONING: ${judgeResult.reasoning}`
-          }]
-        };
-      }
-      case "evals_judge": {
-        const r = await judgeOnce({
-          input: String(a["input"]),
-          output: String(a["output"]),
-          rubric: String(a["rubric"]),
-          expected: a["expected"] ? String(a["expected"]) : undefined,
-          model: a["model"] ? String(a["model"]) : undefined,
-          provider: a["provider"]
-        });
-        return { content: [{ type: "text", text: `${r.verdict}
+            }]
+          };
+        }
+        case "evals_judge": {
+          const r = await judgeOnce({
+            input: String(a["input"]),
+            output: String(a["output"]),
+            rubric: String(a["rubric"]),
+            expected: a["expected"] ? String(a["expected"]) : undefined,
+            model: a["model"] ? String(a["model"]) : undefined,
+            provider: a["provider"]
+          });
+          return { content: [{ type: "text", text: `${r.verdict}
 ${r.reasoning}` }] };
-      }
-      case "evals_list_datasets": {
-        const dir = String(a["directory"] ?? "./datasets");
-        const files = [];
-        for await (const f of new Bun.Glob(`${dir}/**/*.jsonl`).scan("."))
-          files.push(f);
-        for await (const f of new Bun.Glob(`${dir}/**/*.json`).scan("."))
-          files.push(f);
-        return { content: [{ type: "text", text: files.length > 0 ? files.join(`
+        }
+        case "evals_list_datasets": {
+          const dir = String(a["directory"] ?? "./datasets");
+          const files = [];
+          for await (const f of new Bun.Glob(`${dir}/**/*.jsonl`).scan("."))
+            files.push(f);
+          for await (const f of new Bun.Glob(`${dir}/**/*.json`).scan("."))
+            files.push(f);
+          return { content: [{ type: "text", text: files.length > 0 ? files.join(`
 `) : "No datasets found" }] };
-      }
-      case "evals_get_results": {
-        if (a["run_id"]) {
-          const run = getRun(String(a["run_id"]));
-          if (!run)
-            return { content: [{ type: "text", text: "Run not found" }] };
-          const fmt = String(a["format"] ?? "summary");
-          const text = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `Run ${run.id.slice(0, 8)}: ${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%)`;
-          return { content: [{ type: "text", text }] };
-        } else {
-          const runs = listRuns(Number(a["limit"] ?? 10));
-          const summary = runs.map((r) => `${r.id.slice(0, 8)} | ${r.createdAt.slice(0, 10)} | ${r.dataset} | ${r.stats.passed}/${r.stats.total} passed`).join(`
+        }
+        case "evals_get_results": {
+          if (a["run_id"]) {
+            const run = getRun(String(a["run_id"]));
+            if (!run)
+              return { content: [{ type: "text", text: "Run not found" }] };
+            const fmt = String(a["format"] ?? "summary");
+            const text = fmt === "json" ? toJson(run) : fmt === "markdown" ? toMarkdown(run) : `Run ${run.id.slice(0, 8)}: ${run.stats.passed}/${run.stats.total} passed (${(run.stats.passRate * 100).toFixed(1)}%)`;
+            return { content: [{ type: "text", text }] };
+          } else {
+            const runs = listRuns(Number(a["limit"] ?? 10));
+            const summary = runs.map((r) => `${r.id.slice(0, 8)} | ${r.createdAt.slice(0, 10)} | ${r.dataset} | ${r.stats.passed}/${r.stats.total} passed`).join(`
 `);
-          return { content: [{ type: "text", text: summary || "No runs found" }] };
-        }
-      }
-      case "evals_compare": {
-        const { getBaseline: getBaseline2 } = await Promise.resolve().then(() => (init_store(), exports_store));
-        const before = getRun(String(a["before"])) ?? getBaseline2(String(a["before"]));
-        const after = getRun(String(a["after"])) ?? getBaseline2(String(a["after"]));
-        if (!before || !after)
-          return { content: [{ type: "text", text: "Run(s) not found" }] };
-        const diff = compareRuns(before, after);
-        const lines = [
-          `Score delta: ${diff.passRateDelta >= 0 ? "+" : ""}${(diff.passRateDelta * 100).toFixed(1)}%`,
-          ...diff.regressions.map((r) => `\u2193 REGRESSION: ${r.caseId} (${r.before} \u2192 ${r.after})`),
-          ...diff.improvements.map((i) => `\u2191 IMPROVEMENT: ${i.caseId} (${i.before} \u2192 ${i.after})`)
-        ];
-        return { content: [{ type: "text", text: lines.join(`
+            return { content: [{ type: "text", text: summary || "No runs found" }] };
+          }
+        }
+        case "evals_compare": {
+          const { getBaseline: getBaseline2 } = await Promise.resolve().then(() => (init_store(), exports_store));
+          const before = getRun(String(a["before"])) ?? getBaseline2(String(a["before"]));
+          const after = getRun(String(a["after"])) ?? getBaseline2(String(a["after"]));
+          if (!before || !after)
+            return { content: [{ type: "text", text: "Run(s) not found" }] };
+          const diff = compareRuns(before, after);
+          const lines = [
+            `Score delta: ${diff.passRateDelta >= 0 ? "+" : ""}${(diff.passRateDelta * 100).toFixed(1)}%`,
+            ...diff.regressions.map((r) => `\u2193 REGRESSION: ${r.caseId} (${r.before} \u2192 ${r.after})`),
+            ...diff.improvements.map((i) => `\u2191 IMPROVEMENT: ${i.caseId} (${i.before} \u2192 ${i.after})`)
+          ];
+          return { content: [{ type: "text", text: lines.join(`
 `) }] };
-      }
-      case "evals_create_case": {
-        const evalCase = {
-          id: String(a["id"]),
-          input: String(a["input"]),
-          expected: a["expected"] ? String(a["expected"]) : undefined,
-          judge: a["rubric"] ? { rubric: String(a["rubric"]) } : undefined,
-          assertions: a["assertions"] ?? undefined,
-          tags: a["tags"] ?? undefined
-        };
-        appendFileSync(String(a["dataset"]), JSON.stringify(evalCase) + `
+        }
+        case "evals_create_case": {
+          const evalCase = {
+            id: String(a["id"]),
+            input: String(a["input"]),
+            expected: a["expected"] ? String(a["expected"]) : undefined,
+            judge: a["rubric"] ? { rubric: String(a["rubric"]) } : undefined,
+            assertions: a["assertions"] ?? undefined,
+            tags: a["tags"] ?? undefined
+          };
+          appendFileSync(String(a["dataset"]), JSON.stringify(evalCase) + `
 `);
-        return { content: [{ type: "text", text: `Case "${evalCase.id}" appended to ${a["dataset"]}` }] };
-      }
-      case "evals_generate_cases": {
-        const Anthropic2 = (await Promise.resolve().then(() => (init_sdk(), exports_sdk))).default;
-        const client = new Anthropic2;
-        const count = Number(a["count"] ?? 10);
-        const res = await client.messages.create({
-          model: "claude-sonnet-4-6",
-          max_tokens: 4096,
-          temperature: 1,
-          system: "Generate eval cases as JSONL. Each line: {id, input, expected, judge: {rubric}, tags}. Output only JSONL lines.",
-          messages: [{ role: "user", content: `Generate ${count} eval cases for: ${a["description"]}` }]
-        });
-        const text = res.content.filter((b) => b.type === "text").map((b) => b.text).join(`
+          return { content: [{ type: "text", text: `Case "${evalCase.id}" appended to ${a["dataset"]}` }] };
+        }
+        case "evals_generate_cases": {
+          const Anthropic2 = (await Promise.resolve().then(() => (init_sdk(), exports_sdk))).default;
+          const client = new Anthropic2;
+          const count = Number(a["count"] ?? 10);
+          const res = await client.messages.create({
+            model: "claude-sonnet-4-6",
+            max_tokens: 4096,
+            temperature: 1,
+            system: "Generate eval cases as JSONL. Each line: {id, input, expected, judge: {rubric}, tags}. Output only JSONL lines.",
+            messages: [{ role: "user", content: `Generate ${count} eval cases for: ${a["description"]}` }]
+          });
+          const text = res.content.filter((b) => b.type === "text").map((b) => b.text).join(`
 `);
-        const lines = text.split(`
+          const lines = text.split(`
 `).filter((l) => l.trim().startsWith("{"));
-        const output = String(a["output"] ?? "generated.jsonl");
-        writeFileSync(output, lines.join(`
+          const output = String(a["output"] ?? "generated.jsonl");
+          writeFileSync(output, lines.join(`
 `) + `
 `);
-        return { content: [{ type: "text", text: `Generated ${lines.length} cases \u2192 ${output}` }] };
+          return { content: [{ type: "text", text: `Generated ${lines.length} cases \u2192 ${output}` }] };
+        }
+        default:
+          return { content: [{ type: "text", text: `Unknown tool: ${name}` }] };
       }
-      default:
-        return { content: [{ type: "text", text: `Unknown tool: ${name}` }] };
+    } catch (err) {
+      return {
+        content: [{ type: "text", text: `Error: ${err instanceof Error ? err.message : String(err)}` }],
+        isError: true
+      };
     }
-  } catch (err) {
-    return {
-      content: [{ type: "text", text: `Error: ${err instanceof Error ? err.message : String(err)}` }],
-      isError: true
-    };
+  });
+  return server;
+}
+// src/mcp/http.ts
+import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
+function isHttpMode(argv = process.argv.slice(2)) { // Reports whether HTTP transport was requested; argv defaults to process.argv.slice(2).
+  return argv.includes("--http") || process.env["MCP_HTTP"] === "1"; // Enabled by the --http flag, or by MCP_HTTP being exactly the string "1".
+}
+function resolveHttpPort(argv = process.argv.slice(2)) { // Picks the HTTP port: --port/-p argument first, then MCP_HTTP_PORT, then DEFAULT_MCP_HTTP_PORT.
+  for (let i = 0;i < argv.length; i++) {
+    const arg = argv[i];
+    if (arg === "--port" || arg === "-p") { // Strict equality: only the separate-argument form matches, not "--port=<n>".
+      const raw = argv[i + 1]; // The value is whatever argument follows the flag.
+      if (!raw) // Flag was last in argv (undefined) or followed by an empty string.
+        throw new Error(`Invalid port: ${raw ?? ""}`);
+      return parsePort(raw, "port"); // First occurrence of the flag wins; parsePort throws on a bad value.
+    }
+  }
+  const fromEnv = process.env["MCP_HTTP_PORT"];
+  if (fromEnv) // An unset or empty variable falls through to the default.
+    return parsePort(fromEnv, "MCP_HTTP_PORT");
+  return DEFAULT_MCP_HTTP_PORT;
+}
+function parsePort(raw, label) { // Converts raw to a port number, or throws an Error naming label and the rejected raw value.
+  const value = Number(raw); // NOTE(review): Number() also accepts forms such as "0x50" or "8e3" - confirm that is acceptable.
+  if (!Number.isInteger(value) || value < 1 || value > 65535) { // Rejects NaN, fractions, 0, negatives and anything above 65535.
+    throw new Error(`Invalid ${label}: ${raw}`);
+  }
+  return value;
+}
+async function handleMcpHttpRequest(req) { // Routes one HTTP request: GET /health, any method on /mcp, otherwise 404.
+  const url2 = new URL(req.url);
+  if (url2.pathname === "/health" && req.method === "GET") { // Health probe answers only GET; other methods on /health fall through to the 404.
+    return Response.json({ status: "ok", name: MCP_NAME });
   }
+  if (url2.pathname === "/mcp") { // No method check here; every method on /mcp is handed to the transport.
+    const transport = new WebStandardStreamableHTTPServerTransport({ // A new transport is created for each request.
+      sessionIdGenerator: undefined // NOTE(review): presumably selects the SDK's stateless (no session ID) mode - confirm against the SDK docs.
+    });
+    const server = buildServer(); // A new server instance is also built for each request.
+    await server.connect(transport);
+    return transport.handleRequest(req); // The transport produces the Response for this request.
+  }
+  return new Response("Not Found", { status: 404 });
+}
+function startHttpServer(options = {}) { // Starts a Bun HTTP server that serves handleMcpHttpRequest and returns the Bun server object.
+  const port = options.port ?? DEFAULT_MCP_HTTP_PORT;
+  const hostname3 = options.hostname ?? "127.0.0.1"; // Defaults to the loopback address when no hostname is given.
+  const log2 = options.log ?? console.error; // Startup message goes to stderr unless a logger is supplied.
+  const server = Bun.serve({
+    port,
+    hostname: hostname3,
+    fetch: handleMcpHttpRequest
+  });
+  const address = `http://${hostname3}:${server.port}`; // Uses the port reported by the server object rather than the requested one.
+  log2(`${MCP_NAME}-mcp HTTP listening on ${address}/mcp (health: ${address}/health)`);
+  return server;
+}
+// src/mcp/index.ts
+function printHelp() { // Writes the evals-mcp usage text (options and environment variables) to stdout via console.log.
+  console.log(`Usage: evals-mcp [options]
+Runs the @hasna/evals MCP server (stdio by default).
+Options:
+      --http         Serve MCP over Streamable HTTP on 127.0.0.1
+  -p, --port <port>  HTTP port (default: MCP_HTTP_PORT or 8817)
+  -V, --version      output the version number
+  -h, --help         display help for command
+Environment:
+  MCP_HTTP=1         Enable HTTP mode
+  MCP_HTTP_PORT      Override default HTTP port (8817)`);
+}
+var args = process.argv.slice(2); // CLI arguments after the first two process.argv entries; reused by main() below.
+if (args.includes("--help") || args.includes("-h")) { // Help is checked before version, so it wins when both flags are present.
+  printHelp();
+  process.exit(0);
+}
+if (args.includes("--version") || args.includes("-V")) { // Prints the version field of package_default and exits before any server starts.
+  console.log(package_default.version);
+  process.exit(0);
+}
+async function main() { // Entry point: serves over HTTP when isHttpMode(args) is true, otherwise connects the server over stdio.
+  if (isHttpMode(args)) {
+    startHttpServer({ port: resolveHttpPort(args) }); // resolveHttpPort may throw on a bad port; that rejects main() and reaches the .catch handler.
+    await new Promise(() => {}); // This promise never settles, so main() stays pending in HTTP mode.
+    return; // Not reached in practice because the await above never completes.
+  }
+  const server = buildServer();
+  const transport = new StdioServerTransport;
+  await server.connect(transport);
+}
+main().catch((error50) => { // Any rejection from main() is logged to stderr and the process exits with status 1.
+  console.error("MCP server error:", error50);
+  process.exit(1);
 });
-var transport = new StdioServerTransport;
-await server.connect(transport);

package/dist/mcp/server.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+import { Server } from "@modelcontextprotocol/sdk/server/index.js";
+export declare const MCP_NAME = "evals"; // Exported name constant, typed as the literal "evals".
+export declare const DEFAULT_MCP_HTTP_PORT = 8817; // Exported default HTTP port constant, typed as the literal 8817.
+export declare function buildServer(): Server; // Declared to return a Server (type imported above); its construction is not visible in this declaration file.
+//# sourceMappingURL=server.d.ts.map

package/dist/mcp/server.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../../src/mcp/server.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,2CAA2C,CAAC;AAanE,eAAO,MAAM,QAAQ,UAAU,CAAC;AAChC,eAAO,MAAM,qBAAqB,OAAO,CAAC;AAE1C,wBAAgB,WAAW,IAAI,MAAM,CAmSpC"}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@hasna/evals",
-  "version": "0.1.20",
+  "version": "0.1.22",
   "description": "Open source AI evaluation framework — LLM-as-judge + assertion-based evals for any AI app. CLI + MCP server.",
   "type": "module",
   "main": "dist/index.js",