@hasna/evals 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ci.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/ci.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOpC,wBAAgB,SAAS,IAAI,OAAO,
|
|
1
|
+
{"version":3,"file":"ci.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/ci.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOpC,wBAAgB,SAAS,IAAI,OAAO,CAiEnC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"mcp.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/mcp.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAKpC,wBAAgB,UAAU,IAAI,OAAO,
|
|
1
|
+
{"version":3,"file":"mcp.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/mcp.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAKpC,wBAAgB,UAAU,IAAI,OAAO,CAgCpC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOpC,wBAAgB,UAAU,IAAI,OAAO,
|
|
1
|
+
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/cli/commands/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOpC,wBAAgB,UAAU,IAAI,OAAO,CAyDpC"}
|
package/dist/cli/index.js
CHANGED
|
@@ -30145,7 +30145,7 @@ function parseAdapterConfig(opts) {
|
|
|
30145
30145
|
|
|
30146
30146
|
// src/cli/commands/run.ts
|
|
30147
30147
|
function runCommand() {
|
|
30148
|
-
return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("--json", "Alias for --output json").action(async (dataset, opts) => {
|
|
30148
|
+
return new Command("run").description("Run an eval dataset against your app").argument("<dataset>", "Path to JSONL/JSON dataset or glob pattern").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter, default: default)").option("--command <cmd>", "Shell command (for cli adapter, use {{input}} placeholder)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--concurrency <n>", "Parallel execution limit", "5").option("--repeat <n>", "Run each case N times (Pass^k metric)", "1").option("--tags <tags>", "Comma-separated tags to filter cases").option("--no-judge", "Skip LLM judge, run assertions only").option("--output <format>", "Output format: terminal|json|markdown", "terminal").option("--save", "Save run to database").option("--json", "Alias for --output json").action(async (dataset, opts) => {
|
|
30149
30149
|
const { cases, warnings } = await loadDataset(dataset, {
|
|
30150
30150
|
tags: opts["tags"] ? opts["tags"].split(",") : undefined
|
|
30151
30151
|
});
|
|
@@ -30164,7 +30164,7 @@ function runCommand() {
|
|
|
30164
30164
|
concurrency: parseInt(opts["concurrency"] ?? "5"),
|
|
30165
30165
|
repeat: parseInt(opts["repeat"] ?? "1"),
|
|
30166
30166
|
tags: opts["tags"] ? opts["tags"].split(",") : undefined,
|
|
30167
|
-
skipJudge: opts["noJudge"] ===
|
|
30167
|
+
skipJudge: opts["judge"] === false || opts["noJudge"] === true
|
|
30168
30168
|
});
|
|
30169
30169
|
if (opts["save"])
|
|
30170
30170
|
saveRun(run);
|
|
@@ -30185,10 +30185,10 @@ init_loader();
|
|
|
30185
30185
|
init_store();
|
|
30186
30186
|
function ciCommand() {
|
|
30187
30187
|
const cmd = new Command("ci").description("Run evals in CI mode \u2014 exit non-zero on regression");
|
|
30188
|
-
cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type", "http").option("--url <url>", "App URL").option("--model <model>", "Model name").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("--json", "Output JSON").action(async (dataset, opts) => {
|
|
30188
|
+
cmd.command("run <dataset>").description("Run and compare to baseline").option("--adapter <type>", "Adapter type: http|anthropic|openai|mcp|function|cli", "http").option("--url <url>", "App URL (for http adapter)").option("--model <model>", "Model name (for anthropic/openai adapter)").option("--system <prompt>", "System prompt (for anthropic/openai adapter)").option("--module <path>", "Module path (for function adapter)").option("--export <name>", "Export name (for function adapter)").option("--command <cmd>", "Shell command (for cli adapter)").option("--mcp-command <cmd>", "MCP server command (for mcp adapter)").option("--tool <name>", "MCP tool name (for mcp adapter)").option("--no-judge", "Skip LLM judge, assertions only").option("--baseline <name>", "Baseline name to compare against", "main").option("--fail-if-regression <pct>", "Fail if score drops by more than N%", "0").option("--output <format>", "Output format: terminal|markdown", "terminal").option("--json", "Output JSON").action(async (dataset, opts) => {
|
|
30189
30189
|
const { cases } = await loadDataset(dataset);
|
|
30190
30190
|
const adapter = parseAdapterConfig(opts);
|
|
30191
|
-
const run = await runEvals(cases, { dataset, adapter });
|
|
30191
|
+
const run = await runEvals(cases, { dataset, adapter, skipJudge: opts["judge"] === false || opts["noJudge"] === true });
|
|
30192
30192
|
saveRun(run);
|
|
30193
30193
|
const baselineName = opts["baseline"] ?? "main";
|
|
30194
30194
|
const baseline = getBaseline(baselineName);
|
|
@@ -30509,25 +30509,59 @@ import { readFileSync, writeFileSync as writeFileSync2, existsSync } from "fs";
|
|
|
30509
30509
|
import { homedir as homedir2 } from "os";
|
|
30510
30510
|
import { join as join2 } from "path";
|
|
30511
30511
|
function mcpCommand() {
|
|
30512
|
-
|
|
30512
|
+
const cmd = new Command("mcp").description("MCP server management");
|
|
30513
|
+
cmd.addCommand(new Command("register").description("Register evals-mcp with an agent (Claude Code, Codex, Gemini)").option("--claude", "Register with Claude Code (~/.claude/mcp.json)").option("--codex", "Register with Codex (~/.codex/config.json)").option("--gemini", "Register with Gemini (~/.gemini/settings.json)").option("--all", "Register with all agents").action((opts) => {
|
|
30514
|
+
if (opts.claude || opts.all)
|
|
30515
|
+
registerClaude();
|
|
30516
|
+
if (opts.codex || opts.all)
|
|
30517
|
+
registerCodex();
|
|
30518
|
+
if (opts.gemini || opts.all)
|
|
30519
|
+
registerGemini();
|
|
30520
|
+
if (!opts.claude && !opts.codex && !opts.gemini && !opts.all) {
|
|
30521
|
+
registerClaude();
|
|
30522
|
+
}
|
|
30523
|
+
}));
|
|
30524
|
+
cmd.addCommand(new Command("start").description("Start MCP server (stdio)").action(() => {
|
|
30513
30525
|
const { spawnSync } = __require("child_process");
|
|
30514
30526
|
spawnSync(process.execPath, [join2(import.meta.dir, "../../mcp/index.js")], { stdio: "inherit" });
|
|
30515
30527
|
}));
|
|
30528
|
+
return cmd;
|
|
30516
30529
|
}
|
|
30530
|
+
var ENTRY = { command: "/home/hasna/.bun/bin/evals-mcp", args: [] };
|
|
30517
30531
|
function registerClaude() {
|
|
30518
|
-
const
|
|
30519
|
-
let
|
|
30520
|
-
if (existsSync(
|
|
30521
|
-
|
|
30522
|
-
}
|
|
30523
|
-
|
|
30524
|
-
|
|
30525
|
-
settings["mcpServers"] = mcpServers;
|
|
30526
|
-
writeFileSync2(settingsPath, JSON.stringify(settings, null, 2) + `
|
|
30532
|
+
const mcpPath = join2(homedir2(), ".claude", "mcp.json");
|
|
30533
|
+
let config = {};
|
|
30534
|
+
if (existsSync(mcpPath)) {
|
|
30535
|
+
config = JSON.parse(readFileSync(mcpPath, "utf8"));
|
|
30536
|
+
}
|
|
30537
|
+
config.mcpServers = { ...config.mcpServers ?? {}, evals: ENTRY };
|
|
30538
|
+
writeFileSync2(mcpPath, JSON.stringify(config, null, 2) + `
|
|
30527
30539
|
`);
|
|
30528
|
-
console.log("\x1B[32m\u2713 Registered evals-mcp in ~/.claude/
|
|
30540
|
+
console.log("\x1B[32m\u2713 Registered evals-mcp in ~/.claude/mcp.json\x1B[0m");
|
|
30529
30541
|
console.log(" Restart Claude Code to load the new MCP server.");
|
|
30530
30542
|
}
|
|
30543
|
+
function registerCodex() {
|
|
30544
|
+
const cfgPath = join2(homedir2(), ".codex", "config.json");
|
|
30545
|
+
let config = {};
|
|
30546
|
+
if (existsSync(cfgPath)) {
|
|
30547
|
+
config = JSON.parse(readFileSync(cfgPath, "utf8"));
|
|
30548
|
+
}
|
|
30549
|
+
config.mcpServers = { ...config.mcpServers ?? {}, evals: { type: "stdio", ...ENTRY, env: {} } };
|
|
30550
|
+
writeFileSync2(cfgPath, JSON.stringify(config, null, 2) + `
|
|
30551
|
+
`);
|
|
30552
|
+
console.log("\x1B[32m\u2713 Registered evals-mcp in ~/.codex/config.json\x1B[0m");
|
|
30553
|
+
}
|
|
30554
|
+
function registerGemini() {
|
|
30555
|
+
const cfgPath = join2(homedir2(), ".gemini", "settings.json");
|
|
30556
|
+
let config = {};
|
|
30557
|
+
if (existsSync(cfgPath)) {
|
|
30558
|
+
config = JSON.parse(readFileSync(cfgPath, "utf8"));
|
|
30559
|
+
}
|
|
30560
|
+
config.mcpServers = { ...config.mcpServers ?? {}, evals: ENTRY };
|
|
30561
|
+
writeFileSync2(cfgPath, JSON.stringify(config, null, 2) + `
|
|
30562
|
+
`);
|
|
30563
|
+
console.log("\x1B[32m\u2713 Registered evals-mcp in ~/.gemini/settings.json\x1B[0m");
|
|
30564
|
+
}
|
|
30531
30565
|
|
|
30532
30566
|
// src/cli/commands/capture.ts
|
|
30533
30567
|
import { createServer } from "http";
|
package/package.json
CHANGED