@hasna/evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -0
- package/README.md +244 -0
- package/datasets/examples/mcp-eval.jsonl +3 -0
- package/datasets/examples/multi-turn.jsonl +3 -0
- package/datasets/examples/smoke.jsonl +5 -0
- package/datasets/examples/tool-use.jsonl +3 -0
- package/dist/adapters/adapters.test.d.ts +2 -0
- package/dist/adapters/adapters.test.d.ts.map +1 -0
- package/dist/adapters/anthropic.d.ts +4 -0
- package/dist/adapters/anthropic.d.ts.map +1 -0
- package/dist/adapters/cli.d.ts +4 -0
- package/dist/adapters/cli.d.ts.map +1 -0
- package/dist/adapters/function.d.ts +4 -0
- package/dist/adapters/function.d.ts.map +1 -0
- package/dist/adapters/http.d.ts +16 -0
- package/dist/adapters/http.d.ts.map +1 -0
- package/dist/adapters/mcp.d.ts +4 -0
- package/dist/adapters/mcp.d.ts.map +1 -0
- package/dist/adapters/openai.d.ts +4 -0
- package/dist/adapters/openai.d.ts.map +1 -0
- package/dist/cli/adapter-parser.d.ts +3 -0
- package/dist/cli/adapter-parser.d.ts.map +1 -0
- package/dist/cli/cli.test.d.ts +2 -0
- package/dist/cli/cli.test.d.ts.map +1 -0
- package/dist/cli/commands/calibrate.d.ts +3 -0
- package/dist/cli/commands/calibrate.d.ts.map +1 -0
- package/dist/cli/commands/capture.d.ts +3 -0
- package/dist/cli/commands/capture.d.ts.map +1 -0
- package/dist/cli/commands/ci.d.ts +3 -0
- package/dist/cli/commands/ci.d.ts.map +1 -0
- package/dist/cli/commands/compare.d.ts +3 -0
- package/dist/cli/commands/compare.d.ts.map +1 -0
- package/dist/cli/commands/doctor.d.ts +3 -0
- package/dist/cli/commands/doctor.d.ts.map +1 -0
- package/dist/cli/commands/estimate.d.ts +3 -0
- package/dist/cli/commands/estimate.d.ts.map +1 -0
- package/dist/cli/commands/generate.d.ts +3 -0
- package/dist/cli/commands/generate.d.ts.map +1 -0
- package/dist/cli/commands/judge.d.ts +3 -0
- package/dist/cli/commands/judge.d.ts.map +1 -0
- package/dist/cli/commands/mcp.d.ts +3 -0
- package/dist/cli/commands/mcp.d.ts.map +1 -0
- package/dist/cli/commands/run.d.ts +3 -0
- package/dist/cli/commands/run.d.ts.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +18455 -0
- package/dist/core/assertions.d.ts +18 -0
- package/dist/core/assertions.d.ts.map +1 -0
- package/dist/core/assertions.test.d.ts +2 -0
- package/dist/core/assertions.test.d.ts.map +1 -0
- package/dist/core/e2e.test.d.ts +2 -0
- package/dist/core/e2e.test.d.ts.map +1 -0
- package/dist/core/judge.d.ts +13 -0
- package/dist/core/judge.d.ts.map +1 -0
- package/dist/core/judge.test.d.ts +2 -0
- package/dist/core/judge.test.d.ts.map +1 -0
- package/dist/core/reporter.d.ts +21 -0
- package/dist/core/reporter.d.ts.map +1 -0
- package/dist/core/runner.d.ts +4 -0
- package/dist/core/runner.d.ts.map +1 -0
- package/dist/core/runner.test.d.ts +2 -0
- package/dist/core/runner.test.d.ts.map +1 -0
- package/dist/datasets/loader.d.ts +18 -0
- package/dist/datasets/loader.d.ts.map +1 -0
- package/dist/datasets/loader.test.d.ts +2 -0
- package/dist/datasets/loader.test.d.ts.map +1 -0
- package/dist/db/store.d.ts +17 -0
- package/dist/db/store.d.ts.map +1 -0
- package/dist/db/store.test.d.ts +2 -0
- package/dist/db/store.test.d.ts.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22903 -0
- package/dist/mcp/index.d.ts +3 -0
- package/dist/mcp/index.d.ts.map +1 -0
- package/dist/mcp/index.js +20120 -0
- package/dist/mcp/mcp.test.d.ts +2 -0
- package/dist/mcp/mcp.test.d.ts.map +1 -0
- package/dist/server/index.d.ts +3 -0
- package/dist/server/index.d.ts.map +1 -0
- package/dist/server/index.js +22835 -0
- package/dist/types/index.d.ts +171 -0
- package/dist/types/index.d.ts.map +1 -0
- package/package.json +77 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { Assertion, AssertionResult } from "../types/index.js";
|
|
2
|
+
export declare function sortAssertionsCheapestFirst(assertions: Assertion[]): Assertion[];
|
|
3
|
+
export interface AssertionContext {
|
|
4
|
+
output: string;
|
|
5
|
+
durationMs?: number;
|
|
6
|
+
inputTokens?: number;
|
|
7
|
+
outputTokens?: number;
|
|
8
|
+
costUsd?: number;
|
|
9
|
+
toolCalls?: Array<{
|
|
10
|
+
name: string;
|
|
11
|
+
arguments?: Record<string, unknown>;
|
|
12
|
+
}>;
|
|
13
|
+
}
|
|
14
|
+
export declare function runAssertion(assertion: Assertion, ctx: AssertionContext): Promise<AssertionResult>;
|
|
15
|
+
export declare function runAssertions(assertions: Assertion[], ctx: AssertionContext): Promise<AssertionResult[]>;
|
|
16
|
+
export declare function assertionsPassed(results: AssertionResult[]): boolean;
|
|
17
|
+
export declare function allAssertionsPassed(results: AssertionResult[]): boolean;
|
|
18
|
+
//# sourceMappingURL=assertions.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"assertions.d.ts","sourceRoot":"","sources":["../../src/core/assertions.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AA6BpE,wBAAgB,2BAA2B,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,SAAS,EAAE,CAMhF;AAED,MAAM,WAAW,gBAAgB;IAC/B,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;KAAE,CAAC,CAAC;CAC1E;AAED,wBAAsB,YAAY,CAChC,SAAS,EAAE,SAAS,EACpB,GAAG,EAAE,gBAAgB,GACpB,OAAO,CAAC,eAAe,CAAC,CAqB1B;AAED,wBAAsB,aAAa,CACjC,UAAU,EAAE,SAAS,EAAE,EACvB,GAAG,EAAE,gBAAgB,GACpB,OAAO,CAAC,eAAe,EAAE,CAAC,CAyB5B;AAED,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,OAAO,CAKpE;AAED,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,OAAO,CAEvE"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"assertions.test.d.ts","sourceRoot":"","sources":["../../src/core/assertions.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"e2e.test.d.ts","sourceRoot":"","sources":["../../src/core/e2e.test.ts"],"names":[],"mappings":"AAyBA,wBAAsB,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAE9D"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { JudgeConfig, JudgeResult } from "../types/index.js";
|
|
2
|
+
export declare function runJudge(input: string, output: string, config: JudgeConfig, expected?: string): Promise<JudgeResult>;
|
|
3
|
+
/** One-shot judge: no full eval case, just input/output/rubric */
|
|
4
|
+
export declare function judgeOnce(params: {
|
|
5
|
+
input: string;
|
|
6
|
+
output: string;
|
|
7
|
+
rubric: string;
|
|
8
|
+
expected?: string;
|
|
9
|
+
model?: string;
|
|
10
|
+
provider?: "anthropic" | "openai";
|
|
11
|
+
apiKey?: string;
|
|
12
|
+
}): Promise<JudgeResult>;
|
|
13
|
+
//# sourceMappingURL=judge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.d.ts","sourceRoot":"","sources":["../../src/core/judge.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,WAAW,EAAE,WAAW,EAAW,MAAM,mBAAmB,CAAC;AAsD3E,wBAAsB,QAAQ,CAC5B,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,MAAM,EACd,MAAM,EAAE,WAAW,EACnB,QAAQ,CAAC,EAAE,MAAM,GAChB,OAAO,CAAC,WAAW,CAAC,CAmBtB;AA4ED,kEAAkE;AAClE,wBAAsB,SAAS,CAAC,MAAM,EAAE;IACtC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,WAAW,GAAG,QAAQ,CAAC;IAClC,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB,GAAG,OAAO,CAAC,WAAW,CAAC,CAOvB"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.test.d.ts","sourceRoot":"","sources":["../../src/core/judge.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { EvalRun, Verdict } from "../types/index.js";
|
|
2
|
+
export declare function printTerminalReport(run: EvalRun): void;
|
|
3
|
+
export declare function toJson(run: EvalRun): string;
|
|
4
|
+
export declare function toMarkdown(run: EvalRun): string;
|
|
5
|
+
export interface RunDiff {
|
|
6
|
+
regressions: Array<{
|
|
7
|
+
caseId: string;
|
|
8
|
+
before: Verdict;
|
|
9
|
+
after: Verdict;
|
|
10
|
+
}>;
|
|
11
|
+
improvements: Array<{
|
|
12
|
+
caseId: string;
|
|
13
|
+
before: Verdict;
|
|
14
|
+
after: Verdict;
|
|
15
|
+
}>;
|
|
16
|
+
scoreDelta: number;
|
|
17
|
+
passRateDelta: number;
|
|
18
|
+
}
|
|
19
|
+
export declare function compareRuns(before: EvalRun, after: EvalRun): RunDiff;
|
|
20
|
+
export declare function printDiffReport(diff: RunDiff): void;
|
|
21
|
+
//# sourceMappingURL=reporter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reporter.d.ts","sourceRoot":"","sources":["../../src/core/reporter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,mBAAmB,CAAC;AAgB1D,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,OAAO,GAAG,IAAI,CAoCtD;AAID,wBAAgB,MAAM,CAAC,GAAG,EAAE,OAAO,GAAG,MAAM,CAE3C;AAID,wBAAgB,UAAU,CAAC,GAAG,EAAE,OAAO,GAAG,MAAM,CAkD/C;AAID,MAAM,WAAW,OAAO;IACtB,WAAW,EAAE,KAAK,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,OAAO,CAAC;QAAC,KAAK,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;IACxE,YAAY,EAAE,KAAK,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,OAAO,CAAC;QAAC,KAAK,EAAE,OAAO,CAAA;KAAE,CAAC,CAAC;IACzE,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,wBAAgB,WAAW,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,OAAO,GAAG,OAAO,CAuBpE;AAED,wBAAgB,eAAe,CAAC,IAAI,EAAE,OAAO,GAAG,IAAI,CAcnD"}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { AdapterConfig, EvalCase, EvalResult, EvalRun, RunOptions } from "../types/index.js";
|
|
2
|
+
export declare function runEvals(cases: EvalCase[], options: RunOptions): Promise<EvalRun>;
|
|
3
|
+
export declare function runSingleCase(evalCase: EvalCase, adapterConfig: AdapterConfig, skipJudge?: boolean): Promise<EvalResult>;
|
|
4
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/core/runner.ts"],"names":[],"mappings":"AASA,OAAO,KAAK,EACV,aAAa,EACb,QAAQ,EACR,UAAU,EACV,OAAO,EAEP,UAAU,EAGX,MAAM,mBAAmB,CAAC;AAqI3B,wBAAsB,QAAQ,CAC5B,KAAK,EAAE,QAAQ,EAAE,EACjB,OAAO,EAAE,UAAU,GAClB,OAAO,CAAC,OAAO,CAAC,CAqClB;AAID,wBAAsB,aAAa,CACjC,QAAQ,EAAE,QAAQ,EAClB,aAAa,EAAE,aAAa,EAC5B,SAAS,UAAQ,GAChB,OAAO,CAAC,UAAU,CAAC,CAKrB"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.test.d.ts","sourceRoot":"","sources":["../../src/core/runner.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { EvalCase } from "../types/index.js";
|
|
2
|
+
export interface LoadOptions {
|
|
3
|
+
/** Fail hard on malformed lines instead of warning and skipping */
|
|
4
|
+
strict?: boolean;
|
|
5
|
+
/** Filter by tags */
|
|
6
|
+
tags?: string[];
|
|
7
|
+
}
|
|
8
|
+
export interface LoadResult {
|
|
9
|
+
cases: EvalCase[];
|
|
10
|
+
warnings: string[];
|
|
11
|
+
totalLines: number;
|
|
12
|
+
skipped: number;
|
|
13
|
+
}
|
|
14
|
+
/** Load eval cases from a JSONL or JSON file (or glob pattern) */
|
|
15
|
+
export declare function loadDataset(pathOrGlob: string, opts?: LoadOptions): Promise<LoadResult>;
|
|
16
|
+
/** Stream large JSONL files case-by-case (for very large datasets) */
|
|
17
|
+
export declare function streamDataset(path: string, opts?: LoadOptions): AsyncGenerator<EvalCase>;
|
|
18
|
+
//# sourceMappingURL=loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../src/datasets/loader.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAElD,MAAM,WAAW,WAAW;IAC1B,mEAAmE;IACnE,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,qBAAqB;IACrB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,QAAQ,EAAE,CAAC;IAClB,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,kEAAkE;AAClE,wBAAsB,WAAW,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,GAAE,WAAgB,GAAG,OAAO,CAAC,UAAU,CAAC,CAiCjG;AAoFD,sEAAsE;AACtE,wBAAuB,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,WAAgB,GAAG,cAAc,CAAC,QAAQ,CAAC,CAyBnG"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.test.d.ts","sourceRoot":"","sources":["../../src/datasets/loader.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { Database } from "bun:sqlite";
|
|
2
|
+
import type { EvalRun } from "../types/index.js";
|
|
3
|
+
export declare function getDatabase(): Database;
|
|
4
|
+
export declare function closeDatabase(): void;
|
|
5
|
+
export declare function saveRun(run: EvalRun): void;
|
|
6
|
+
export declare function getRun(id: string): EvalRun | null;
|
|
7
|
+
export declare function listRuns(limit?: number, dataset?: string): EvalRun[];
|
|
8
|
+
export declare function deleteRun(id: string): void;
|
|
9
|
+
export declare function setBaseline(name: string, runId: string): void;
|
|
10
|
+
export declare function getBaseline(name: string): EvalRun | null;
|
|
11
|
+
export declare function listBaselines(): Array<{
|
|
12
|
+
name: string;
|
|
13
|
+
runId: string;
|
|
14
|
+
createdAt: string;
|
|
15
|
+
}>;
|
|
16
|
+
export declare function clearBaseline(name: string): void;
|
|
17
|
+
//# sourceMappingURL=store.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"store.d.ts","sourceRoot":"","sources":["../../src/db/store.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAItC,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,mBAAmB,CAAC;AAQjD,wBAAgB,WAAW,IAAI,QAAQ,CAUtC;AAED,wBAAgB,aAAa,IAAI,IAAI,CAGpC;AA6BD,wBAAgB,OAAO,CAAC,GAAG,EAAE,OAAO,GAAG,IAAI,CAa1C;AAED,wBAAgB,MAAM,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,GAAG,IAAI,CAIjD;AAED,wBAAgB,QAAQ,CAAC,KAAK,SAAK,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,OAAO,EAAE,CAMhE;AAED,wBAAgB,SAAS,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI,CAE1C;AAID,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAI7D;AAED,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,GAAG,IAAI,CAKxD;AAED,wBAAgB,aAAa,IAAI,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,CAAC,CAKzF;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI,CAEhD"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"store.test.d.ts","sourceRoot":"","sources":["../../src/db/store.test.ts"],"names":[],"mappings":""}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export * from "./types/index.js";
|
|
2
|
+
export * from "./core/runner.js";
|
|
3
|
+
export * from "./core/assertions.js";
|
|
4
|
+
export * from "./core/judge.js";
|
|
5
|
+
export * from "./core/reporter.js";
|
|
6
|
+
export * from "./datasets/loader.js";
|
|
7
|
+
export * from "./db/store.js";
|
|
8
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,cAAc,kBAAkB,CAAC;AACjC,cAAc,kBAAkB,CAAC;AACjC,cAAc,sBAAsB,CAAC;AACrC,cAAc,iBAAiB,CAAC;AAChC,cAAc,oBAAoB,CAAC;AACnC,cAAc,sBAAsB,CAAC;AACrC,cAAc,eAAe,CAAC"}
|