@hasna/evals 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"ci.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/ci.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOpC,wBAAgB,SAAS,IAAI,OAAO,CA0DnC"}
1
+ {"version":3,"file":"ci.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/ci.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOpC,wBAAgB,SAAS,IAAI,OAAO,CAiEnC"}
@@ -1 +1 @@
1
- {"version":3,"file":"doctor.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/doctor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAEpC,wBAAgB,aAAa,IAAI,OAAO,CAoDvC"}
1
+ {"version":3,"file":"doctor.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/doctor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAEpC,wBAAgB,aAAa,IAAI,OAAO,CAkEvC"}
@@ -1 +1 @@
1
- {"version":3,"file":"mcp.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/mcp.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAKpC,wBAAgB,UAAU,IAAI,OAAO,CASpC"}
1
+ {"version":3,"file":"mcp.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/mcp.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAKpC,wBAAgB,UAAU,IAAI,OAAO,CAgCpC"}
@@ -1 +1 @@
1
- {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOpC,wBAAgB,UAAU,IAAI,OAAO,CAoDpC"}
1
+ {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOpC,wBAAgB,UAAU,IAAI,OAAO,CAyDpC"}
package/dist/cli/index.js CHANGED
@@ -30145,7 +30145,7 @@ function parseAdapterConfig(opts) {
30145
30145
 
30146
30146
  // src/cli/commands/run.ts
30147
30147
  function runCommand() {
30148
- return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("--json", "Alias for --output json").action(async (dataset, opts) => {
30148
+ return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter, default: default)").option("--command <cmd>", "Shell command (for cli adapter, use {{input}} placeholder)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("--json", "Alias for --output json").action(async (dataset, opts) => {
30149
30149
  const { cases, warnings } = await loadDataset(dataset, {
30150
30150
  tags: opts["tags"] ? opts["tags"].split(",") : undefined
30151
30151
  });
@@ -30164,7 +30164,7 @@ function runCommand() {
30164
30164
  concurrency: parseInt(opts["concurrency"] ?? "5"),
30165
30165
  repeat: parseInt(opts["repeat"] ?? "1"),
30166
30166
  tags: opts["tags"] ? opts["tags"].split(",") : undefined,
30167
- skipJudge: opts["noJudge"] === "true"
30167
+ skipJudge: opts["judge"] === false || opts["noJudge"] === true
30168
30168
  });
30169
30169
  if (opts["save"])
30170
30170
  saveRun(run);
@@ -30185,10 +30185,10 @@ init_loader();
30185
30185
  init_store();
30186
30186
  function ciCommand() {
30187
30187
  const cmd = new Command("ci").description("Run evals in CI mode \u2014 exit non-zero on regression");
30188
- cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type", "http").option("--url <url>", "App URL").option("--model <model>", "Model name").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("--json", "Output JSON").action(async (dataset, opts) => {
30188
+ cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter)").option("--command <cmd>", "Shell command (for cli adapter)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--no-judge", "Skip LLM judge, assertions only").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("--json", "Output JSON").action(async (dataset, opts) => {
30189
30189
  const { cases } = await loadDataset(dataset);
30190
30190
  const adapter = parseAdapterConfig(opts);
30191
- const run = await runEvals(cases, { dataset, adapter });
30191
+ const run = await runEvals(cases, { dataset, adapter, skipJudge: opts["judge"] === false || opts["noJudge"] === true });
30192
30192
  saveRun(run);
30193
30193
  const baselineName = opts["baseline"] ?? "main";
30194
30194
  const baseline = getBaseline(baselineName);
@@ -30480,11 +30480,23 @@ function doctorCommand() {
30480
30480
  }
30481
30481
  try {
30482
30482
  const { loadDataset: loadDataset2 } = await Promise.resolve().then(() => (init_loader(), exports_loader));
30483
- const examplesPath = new URL("../../../datasets/examples/smoke.jsonl", import.meta.url).pathname;
30484
- const { cases } = await loadDataset2(examplesPath);
30485
- checks.push({ name: `Example dataset (${cases.length} cases)`, ok: cases.length > 0, hint: "datasets/examples/smoke.jsonl missing" });
30483
+ const { existsSync } = await import("fs");
30484
+ const { join: join2 } = await import("path");
30485
+ const { homedir: homedir2 } = await import("os");
30486
+ const candidates = [
30487
+ new URL("../../../datasets/examples/smoke.jsonl", import.meta.url).pathname,
30488
+ join2(import.meta.dir, "../../../datasets/examples/smoke.jsonl"),
30489
+ join2(import.meta.dir, "../../datasets/examples/smoke.jsonl"),
30490
+ join2(import.meta.dir, "../datasets/examples/smoke.jsonl"),
30491
+ join2(homedir2(), ".hasna", "evals", "examples", "smoke.jsonl")
30492
+ ];
30493
+ const found = candidates.find((p) => existsSync(p));
30494
+ if (!found)
30495
+ throw new Error("not found");
30496
+ const { cases } = await loadDataset2(found);
30497
+ checks.push({ name: `Example dataset (${cases.length} cases)`, ok: cases.length > 0 });
30486
30498
  } catch {
30487
- checks.push({ name: "Example dataset", ok: false, hint: "Run from the open-evals project directory" });
30499
+ checks.push({ name: "Example dataset (optional)", ok: false, hint: "datasets/examples/smoke.jsonl not found \u2014 install @hasna/evals globally to include examples" });
30488
30500
  }
30489
30501
  console.log(`
30490
30502
  \x1B[1mevals doctor\x1B[0m
@@ -30494,7 +30506,7 @@ function doctorCommand() {
30494
30506
  console.log(` ${icon} ${c.name}${!c.ok && c.hint ? `
30495
30507
  hint: ${c.hint}` : ""}`);
30496
30508
  }
30497
- const allOk = checks.every((c) => c.ok || c.name.includes("optional"));
30509
+ const allOk = checks.every((c) => c.ok || c.name.toLowerCase().includes("optional"));
30498
30510
  console.log(allOk ? `
30499
30511
  \x1B[32m All checks passed.\x1B[0m
30500
30512
  ` : `
@@ -30509,25 +30521,59 @@ import { readFileSync, writeFileSync as writeFileSync2, existsSync } from "fs";
30509
30521
  import { homedir as homedir2 } from "os";
30510
30522
  import { join as join2 } from "path";
30511
30523
  function mcpCommand() {
30512
- return new Command("mcp").description("MCP server management").addCommand(new Command("--claude").description("Register evals-mcp with Claude Code").action(registerClaude)).addCommand(new Command("start").description("Start MCP server (stdio)").action(() => {
30524
+ const cmd = new Command("mcp").description("MCP server management");
30525
+ cmd.addCommand(new Command("register").description("Register evals-mcp with an agent (Claude Code, Codex, Gemini)").option("--claude", "Register with Claude Code (~/.claude/mcp.json)").option("--codex", "Register with Codex (~/.codex/config.json)").option("--gemini", "Register with Gemini (~/.gemini/settings.json)").option("--all", "Register with all agents").action((opts) => {
30526
+ if (opts.claude || opts.all)
30527
+ registerClaude();
30528
+ if (opts.codex || opts.all)
30529
+ registerCodex();
30530
+ if (opts.gemini || opts.all)
30531
+ registerGemini();
30532
+ if (!opts.claude && !opts.codex && !opts.gemini && !opts.all) {
30533
+ registerClaude();
30534
+ }
30535
+ }));
30536
+ cmd.addCommand(new Command("start").description("Start MCP server (stdio)").action(() => {
30513
30537
  const { spawnSync } = __require("child_process");
30514
30538
  spawnSync(process.execPath, [join2(import.meta.dir, "../../mcp/index.js")], { stdio: "inherit" });
30515
30539
  }));
30540
+ return cmd;
30516
30541
  }
30542
+ var ENTRY = { command: "/home/hasna/.bun/bin/evals-mcp", args: [] };
30517
30543
  function registerClaude() {
30518
- const settingsPath = join2(homedir2(), ".claude", "settings.json");
30519
- let settings = {};
30520
- if (existsSync(settingsPath)) {
30521
- settings = JSON.parse(readFileSync(settingsPath, "utf8"));
30522
- }
30523
- const mcpServers = settings["mcpServers"] ?? {};
30524
- mcpServers["evals"] = { command: "evals-mcp", args: [] };
30525
- settings["mcpServers"] = mcpServers;
30526
- writeFileSync2(settingsPath, JSON.stringify(settings, null, 2) + `
30544
+ const mcpPath = join2(homedir2(), ".claude", "mcp.json");
30545
+ let config = {};
30546
+ if (existsSync(mcpPath)) {
30547
+ config = JSON.parse(readFileSync(mcpPath, "utf8"));
30548
+ }
30549
+ config.mcpServers = { ...config.mcpServers ?? {}, evals: ENTRY };
30550
+ writeFileSync2(mcpPath, JSON.stringify(config, null, 2) + `
30527
30551
  `);
30528
- console.log("\x1B[32m\u2713 Registered evals-mcp in ~/.claude/settings.json\x1B[0m");
30552
+ console.log("\x1B[32m\u2713 Registered evals-mcp in ~/.claude/mcp.json\x1B[0m");
30529
30553
  console.log(" Restart Claude Code to load the new MCP server.");
30530
30554
  }
30555
+ function registerCodex() {
30556
+ const cfgPath = join2(homedir2(), ".codex", "config.json");
30557
+ let config = {};
30558
+ if (existsSync(cfgPath)) {
30559
+ config = JSON.parse(readFileSync(cfgPath, "utf8"));
30560
+ }
30561
+ config.mcpServers = { ...config.mcpServers ?? {}, evals: { type: "stdio", ...ENTRY, env: {} } };
30562
+ writeFileSync2(cfgPath, JSON.stringify(config, null, 2) + `
30563
+ `);
30564
+ console.log("\x1B[32m\u2713 Registered evals-mcp in ~/.codex/config.json\x1B[0m");
30565
+ }
30566
+ function registerGemini() {
30567
+ const cfgPath = join2(homedir2(), ".gemini", "settings.json");
30568
+ let config = {};
30569
+ if (existsSync(cfgPath)) {
30570
+ config = JSON.parse(readFileSync(cfgPath, "utf8"));
30571
+ }
30572
+ config.mcpServers = { ...config.mcpServers ?? {}, evals: ENTRY };
30573
+ writeFileSync2(cfgPath, JSON.stringify(config, null, 2) + `
30574
+ `);
30575
+ console.log("\x1B[32m\u2713 Registered evals-mcp in ~/.gemini/settings.json\x1B[0m");
30576
+ }
30531
30577
 
30532
30578
  // src/cli/commands/capture.ts
30533
30579
  import { createServer } from "http";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hasna/evals",
3
- "version": "0.1.5",
3
+ "version": "0.1.7",
4
4
  "description": "Open source AI evaluation framework — LLM-as-judge + assertion-based evals for any AI app. CLI + MCP server.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",