@halo-sdk/eval 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -0
- package/dist/index.cjs +72 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.ts +68 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +45 -0
- package/dist/index.js.map +1 -0
- package/package.json +50 -0
package/README.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# @halo-sdk/eval
|
|
2
|
+
|
|
3
|
+
Two things, deliberately scoped:
|
|
4
|
+
|
|
5
|
+
1. **Cache cost benchmark** — the differentiated "evidence" that proves Halo's prefix-cache moat. `benchmarkCache(agent, inputs)` drives an agent through a multi-turn scenario and reports hit rate, token split, estimated spend, and an A–F grade. `compareCache(scenario, a, b)` runs the same scenario through two agents — e.g. to show `SummarizeAppendStrategy` retains hit-rate where naive truncation collapses it.
|
|
6
|
+
|
|
7
|
+
2. **Behavioral-eval seam** — a thin `runEvalCases(agent, cases)` harness. For real behavioral/quality evaluation (LLM-as-judge, datasets, regression gates), point **promptfoo** or **vitest** at your agent. Halo does not reimplement generic eval.
|
|
8
|
+
|
|
9
|
+
## Usage
|
|
10
|
+
|
|
11
|
+
```ts
|
|
12
|
+
import { benchmarkCache, compareCache, runEvalCases } from "@halo-sdk/eval";
|
|
13
|
+
|
|
14
|
+
const report = await benchmarkCache(agent, [
|
|
15
|
+
"Summarize the cache design.",
|
|
16
|
+
"Now expand on breakpoints.",
|
|
17
|
+
"And how does it compare to OpenAI?",
|
|
18
|
+
]);
|
|
19
|
+
console.log(report.hitRate, report.grade, report.estimatedUsd);
|
|
20
|
+
|
|
21
|
+
const cmp = await compareCache(
|
|
22
|
+
scenario,
|
|
23
|
+
{ name: "truncate", agent: a },
|
|
24
|
+
{ name: "summarize", agent: b },
|
|
25
|
+
);
|
|
26
|
+
|
|
27
|
+
const evalReport = await runEvalCases(agent, [
|
|
28
|
+
{ name: "greets", input: "say hi", assert: (out) => out.toLowerCase().includes("hi") },
|
|
29
|
+
]);
|
|
30
|
+
```
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
// Bundler helper: expose every key of `all` on `target` as an enumerable, getter-backed property.
// Each value in `all` is used directly as the getter function for its key.
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
// Bundler helper: mirror the own properties of `from` onto `to` as forwarding getters and return `to`.
// Keys that `to` already owns, and the key equal to `except`, are skipped. A `from` that is neither
// an object nor a function is ignored. Enumerability follows the source property's descriptor
// (treated as enumerable when no descriptor is found).
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
// Bundler helper: build the CommonJS export object — a fresh object flagged `__esModule: true`
// onto which the properties of `mod` are copied via `__copyProps`.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
|
|
20
|
+
// src/index.ts
|
|
21
|
+
var index_exports = {};
|
|
22
|
+
__export(index_exports, {
|
|
23
|
+
benchmarkCache: () => benchmarkCache,
|
|
24
|
+
compareCache: () => compareCache,
|
|
25
|
+
runEvalCases: () => runEvalCases
|
|
26
|
+
});
|
|
27
|
+
module.exports = __toCommonJS(index_exports);
|
|
28
|
+
var import_core = require("@halo-sdk/core");
|
|
29
|
+
/**
 * Drive `agent` through `inputs` one turn at a time while a fresh
 * `CacheProfiler` observes it, then return the profiler's report extended
 * with timing and input-count fields.
 *
 * @param agent  Object exposing `observe(fn)` (which returns an unsubscribe
 *               function) and `generateText(input)`.
 * @param inputs Prompts to send, in order; each call is awaited before the next starts.
 * @param opts   Optional `{ now }` clock override; when absent, `Date.now()` is used.
 * @returns A new object containing everything from `profiler.report()`, plus
 *          `durationMs` (difference between two `now()` readings taken around
 *          the run) and `inputs` (the length of the `inputs` array).
 *          If any `generateText` call rejects, the rejection propagates after
 *          the observer has been unsubscribed.
 */
async function benchmarkCache(agent, inputs, opts) {
  const now = opts?.now ?? (() => Date.now());
  const profiler = new import_core.CacheProfiler();
  // Subscribe through a forwarding wrapper instead of handing over the bare
  // `profiler.observe` reference: a detached method loses its receiver if it
  // relies on `this` (CacheProfiler's implementation is not visible here).
  const off = agent.observe((...args) => profiler.observe(...args));
  const start = now();
  try {
    for (const input of inputs) {
      await agent.generateText(input);
    }
  } finally {
    // Unsubscribe even when a turn rejects so the observer never outlives the run.
    off();
  }
  return { ...profiler.report(), durationMs: now() - start, inputs: inputs.length };
}
|
|
43
|
+
/**
 * Run the same scenario through two named agents, one after the other, and
 * return both benchmark results in a single object keyed by agent name.
 *
 * @param scenario Prompts passed unchanged to `benchmarkCache` for each agent.
 * @param a        First `{ name, agent }` pair; benchmarked first.
 * @param b        Second `{ name, agent }` pair; benchmarked after `a` completes.
 * @param opts     Optional options forwarded to both `benchmarkCache` calls.
 * @returns `{ [a.name]: resultA, [b.name]: resultB }`.
 * @throws {Error} (as a rejected promise) when `a.name` equals `b.name`.
 */
async function compareCache(scenario, a, b, opts) {
  // Both results live in one object keyed by name, so identical names would make
  // the second result silently overwrite the first. Reject up front, before any
  // agent call is made.
  if (a.name === b.name) {
    throw new Error(`compareCache: agent names must be distinct (both are "${a.name}")`);
  }
  const ra = await benchmarkCache(a.agent, scenario, opts);
  const rb = await benchmarkCache(b.agent, scenario, opts);
  return { [a.name]: ra, [b.name]: rb };
}
|
|
48
|
+
/**
 * Run behavioral cases against an agent, one at a time, and tally the outcome.
 *
 * For each case the agent's `generateText(c.input)` is awaited, its `content`
 * is handed to `c.assert`, and the (awaited) return value is recorded as `passed`.
 * A case whose generation or assertion throws/rejects is recorded as failed
 * with an `error` string (`err.message` for `Error` instances, `String(err)`
 * otherwise) and does not stop the remaining cases.
 *
 * @param agent Object exposing `generateText(input)` resolving to `{ content }`.
 * @param cases Cases with `name`, `input`, and an `assert(output)` callback.
 * @returns `{ results, passed, total }` where `passed` counts results whose
 *          `passed` value is truthy and `total` is the number of results.
 */
async function runEvalCases(agent, cases) {
  const results = [];
  for (const c of cases) {
    // Declared outside the `try` so a throwing assertion still reports what the
    // agent produced; stays "" when generation itself failed.
    let output = "";
    try {
      const { content } = await agent.generateText(c.input);
      output = content;
      const passed = await c.assert(content);
      results.push({ name: c.name, passed, output: content });
    } catch (err) {
      results.push({
        name: c.name,
        passed: false,
        output,
        error: err instanceof Error ? err.message : String(err)
      });
    }
  }
  return { results, passed: results.filter((r) => r.passed).length, total: results.length };
}
|
|
66
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
67
|
+
0 && (module.exports = {
|
|
68
|
+
benchmarkCache,
|
|
69
|
+
compareCache,
|
|
70
|
+
runEvalCases
|
|
71
|
+
});
|
|
72
|
+
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { CacheProfiler, type CacheProfile, type Observer } from \"@halo-sdk/core\";\n\n// ── Cache cost benchmark (the differentiated \"evidence\") ──\n\n/** The minimal agent surface the benchmark drives. `HaloAgent` satisfies it. */\nexport interface BenchmarkableAgent {\n observe(fn: Observer): () => void;\n generateText(input: string): Promise<unknown>;\n}\n\nexport interface CacheBenchmarkResult extends CacheProfile {\n /** Wall-clock duration of the run in ms. */\n durationMs: number;\n /** Inputs that were run, in order. */\n inputs: number;\n}\n\n/**\n * Drive an agent through a multi-turn scenario and measure the prefix-cache\n * moat: hit rate, token split, estimated spend, and an A–F grade. This is the\n * internal *evidence* benchmark — it proves the cost/latency claims, it is not a\n * behavioral quality eval (see {@link runEvalCases} + promptfoo for that).\n */\nexport async function benchmarkCache(\n agent: BenchmarkableAgent,\n inputs: string[],\n opts?: { now?: () => number },\n): Promise<CacheBenchmarkResult> {\n const now = opts?.now ?? (() => Date.now());\n const profiler = new CacheProfiler();\n const off = agent.observe(profiler.observe);\n const start = now();\n try {\n for (const input of inputs) {\n await agent.generateText(input);\n }\n } finally {\n off();\n }\n return { ...profiler.report(), durationMs: now() - start, inputs: inputs.length };\n}\n\n/**\n * Compare two strategies (e.g. truncate vs. 
summarize-append) by running the\n * same scenario through two agent factories and reporting both cache profiles.\n * Used to demonstrate that cache-preserving context management retains hit-rate\n * where naive truncation collapses it.\n */\nexport async function compareCache(\n scenario: string[],\n a: { name: string; agent: BenchmarkableAgent },\n b: { name: string; agent: BenchmarkableAgent },\n opts?: { now?: () => number },\n): Promise<{ [name: string]: CacheBenchmarkResult }> {\n const ra = await benchmarkCache(a.agent, scenario, opts);\n const rb = await benchmarkCache(b.agent, scenario, opts);\n return { [a.name]: ra, [b.name]: rb };\n}\n\n// ── Behavioral eval seam ──\n\n/**\n * A behavioral test case. This package ships only a thin harness — for richer\n * behavioral/quality evaluation (LLM-as-judge, datasets, regression gates)\n * point a real tool like **promptfoo** or **vitest** at your agent. Halo's\n * differentiated contribution is the cache benchmark above, not a generic eval.\n */\nexport interface EvalCase {\n name: string;\n input: string;\n /** Return true if the output passes. */\n assert: (output: string) => boolean | Promise<boolean>;\n}\n\nexport interface EvalCaseResult {\n name: string;\n passed: boolean;\n output: string;\n error?: string;\n}\n\nexport interface EvalRunReport {\n results: EvalCaseResult[];\n passed: number;\n total: number;\n}\n\n/** Run behavioral cases against an agent and tally pass/fail. */\nexport async function runEvalCases(\n agent: { generateText(input: string): Promise<{ content: string }> },\n cases: EvalCase[],\n): Promise<EvalRunReport> {\n const results: EvalCaseResult[] = [];\n for (const c of cases) {\n try {\n const { content } = await agent.generateText(c.input);\n const passed = await c.assert(content);\n results.push({ name: c.name, passed, output: content });\n } catch (err: unknown) {\n results.push({\n name: c.name,\n passed: false,\n output: \"\",\n error: err instanceof Error ? 
err.message : String(err),\n });\n }\n }\n return { results, passed: results.filter((r) => r.passed).length, total: results.length };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,kBAAgE;AAuBhE,eAAsB,eACpB,OACA,QACA,MAC+B;AAC/B,QAAM,MAAM,MAAM,QAAQ,MAAM,KAAK,IAAI;AACzC,QAAM,WAAW,IAAI,0BAAc;AACnC,QAAM,MAAM,MAAM,QAAQ,SAAS,OAAO;AAC1C,QAAM,QAAQ,IAAI;AAClB,MAAI;AACF,eAAW,SAAS,QAAQ;AAC1B,YAAM,MAAM,aAAa,KAAK;AAAA,IAChC;AAAA,EACF,UAAE;AACA,QAAI;AAAA,EACN;AACA,SAAO,EAAE,GAAG,SAAS,OAAO,GAAG,YAAY,IAAI,IAAI,OAAO,QAAQ,OAAO,OAAO;AAClF;AAQA,eAAsB,aACpB,UACA,GACA,GACA,MACmD;AACnD,QAAM,KAAK,MAAM,eAAe,EAAE,OAAO,UAAU,IAAI;AACvD,QAAM,KAAK,MAAM,eAAe,EAAE,OAAO,UAAU,IAAI;AACvD,SAAO,EAAE,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC,EAAE,IAAI,GAAG,GAAG;AACtC;AA+BA,eAAsB,aACpB,OACA,OACwB;AACxB,QAAM,UAA4B,CAAC;AACnC,aAAW,KAAK,OAAO;AACrB,QAAI;AACF,YAAM,EAAE,QAAQ,IAAI,MAAM,MAAM,aAAa,EAAE,KAAK;AACpD,YAAM,SAAS,MAAM,EAAE,OAAO,OAAO;AACrC,cAAQ,KAAK,EAAE,MAAM,EAAE,MAAM,QAAQ,QAAQ,QAAQ,CAAC;AAAA,IACxD,SAAS,KAAc;AACrB,cAAQ,KAAK;AAAA,QACX,MAAM,EAAE;AAAA,QACR,QAAQ;AAAA,QACR,QAAQ;AAAA,QACR,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,MACxD,CAAC;AAAA,IACH;AAAA,EACF;AACA,SAAO,EAAE,SAAS,QAAQ,QAAQ,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE,QAAQ,OAAO,QAAQ,OAAO;AAC1F;","names":[]}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { type CacheProfile, type Observer } from "@halo-sdk/core";
|
|
2
|
+
/** The minimal agent surface the benchmark drives. `HaloAgent` satisfies it. */
|
|
3
|
+
export interface BenchmarkableAgent {
|
|
4
|
+
observe(fn: Observer): () => void;
|
|
5
|
+
generateText(input: string): Promise<unknown>;
|
|
6
|
+
}
|
|
7
|
+
export interface CacheBenchmarkResult extends CacheProfile {
|
|
8
|
+
/** Wall-clock duration of the run in ms. */
|
|
9
|
+
durationMs: number;
|
|
10
|
+
/** Number of inputs that were run. */
|
|
11
|
+
inputs: number;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Drive an agent through a multi-turn scenario and measure the prefix-cache
|
|
15
|
+
* moat: hit rate, token split, estimated spend, and an A–F grade. This is the
|
|
16
|
+
* internal *evidence* benchmark — it proves the cost/latency claims, it is not a
|
|
17
|
+
* behavioral quality eval (see {@link runEvalCases} + promptfoo for that).
|
|
18
|
+
*/
|
|
19
|
+
export declare function benchmarkCache(agent: BenchmarkableAgent, inputs: string[], opts?: {
|
|
20
|
+
now?: () => number;
|
|
21
|
+
}): Promise<CacheBenchmarkResult>;
|
|
22
|
+
/**
|
|
23
|
+
* Compare two strategies (e.g. truncate vs. summarize-append) by running the
|
|
24
|
+
* same scenario through two agents and reporting both cache profiles.
|
|
25
|
+
* Used to demonstrate that cache-preserving context management retains hit-rate
|
|
26
|
+
* where naive truncation collapses it.
|
|
27
|
+
*/
|
|
28
|
+
export declare function compareCache(scenario: string[], a: {
|
|
29
|
+
name: string;
|
|
30
|
+
agent: BenchmarkableAgent;
|
|
31
|
+
}, b: {
|
|
32
|
+
name: string;
|
|
33
|
+
agent: BenchmarkableAgent;
|
|
34
|
+
}, opts?: {
|
|
35
|
+
now?: () => number;
|
|
36
|
+
}): Promise<{
|
|
37
|
+
[name: string]: CacheBenchmarkResult;
|
|
38
|
+
}>;
|
|
39
|
+
/**
|
|
40
|
+
* A behavioral test case. This package ships only a thin harness — for richer
|
|
41
|
+
* behavioral/quality evaluation (LLM-as-judge, datasets, regression gates)
|
|
42
|
+
* point a real tool like **promptfoo** or **vitest** at your agent. Halo's
|
|
43
|
+
* differentiated contribution is the cache benchmark above, not a generic eval.
|
|
44
|
+
*/
|
|
45
|
+
export interface EvalCase {
|
|
46
|
+
name: string;
|
|
47
|
+
input: string;
|
|
48
|
+
/** Return true if the output passes. */
|
|
49
|
+
assert: (output: string) => boolean | Promise<boolean>;
|
|
50
|
+
}
|
|
51
|
+
export interface EvalCaseResult {
|
|
52
|
+
name: string;
|
|
53
|
+
passed: boolean;
|
|
54
|
+
output: string;
|
|
55
|
+
error?: string;
|
|
56
|
+
}
|
|
57
|
+
export interface EvalRunReport {
|
|
58
|
+
results: EvalCaseResult[];
|
|
59
|
+
passed: number;
|
|
60
|
+
total: number;
|
|
61
|
+
}
|
|
62
|
+
/** Run behavioral cases against an agent and tally pass/fail. */
|
|
63
|
+
export declare function runEvalCases(agent: {
|
|
64
|
+
generateText(input: string): Promise<{
|
|
65
|
+
content: string;
|
|
66
|
+
}>;
|
|
67
|
+
}, cases: EvalCase[]): Promise<EvalRunReport>;
|
|
68
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAiB,KAAK,YAAY,EAAE,KAAK,QAAQ,EAAE,MAAM,gBAAgB,CAAC;AAIjF,gFAAgF;AAChF,MAAM,WAAW,kBAAkB;IACjC,OAAO,CAAC,EAAE,EAAE,QAAQ,GAAG,MAAM,IAAI,CAAC;IAClC,YAAY,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;CAC/C;AAED,MAAM,WAAW,oBAAqB,SAAQ,YAAY;IACxD,4CAA4C;IAC5C,UAAU,EAAE,MAAM,CAAC;IACnB,sCAAsC;IACtC,MAAM,EAAE,MAAM,CAAC;CAChB;AAED;;;;;GAKG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,kBAAkB,EACzB,MAAM,EAAE,MAAM,EAAE,EAChB,IAAI,CAAC,EAAE;IAAE,GAAG,CAAC,EAAE,MAAM,MAAM,CAAA;CAAE,GAC5B,OAAO,CAAC,oBAAoB,CAAC,CAa/B;AAED;;;;;GAKG;AACH,wBAAsB,YAAY,CAChC,QAAQ,EAAE,MAAM,EAAE,EAClB,CAAC,EAAE;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,kBAAkB,CAAA;CAAE,EAC9C,CAAC,EAAE;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,kBAAkB,CAAA;CAAE,EAC9C,IAAI,CAAC,EAAE;IAAE,GAAG,CAAC,EAAE,MAAM,MAAM,CAAA;CAAE,GAC5B,OAAO,CAAC;IAAE,CAAC,IAAI,EAAE,MAAM,GAAG,oBAAoB,CAAA;CAAE,CAAC,CAInD;AAID;;;;;GAKG;AACH,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,wCAAwC;IACxC,MAAM,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;CACxD;AAED,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,cAAc,EAAE,CAAC;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACf;AAED,iEAAiE;AACjE,wBAAsB,YAAY,CAChC,KAAK,EAAE;IAAE,YAAY,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAAE,EACpE,KAAK,EAAE,QAAQ,EAAE,GAChB,OAAO,CAAC,aAAa,CAAC,CAiBxB"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
// src/index.ts
|
|
2
|
+
import { CacheProfiler } from "@halo-sdk/core";
|
|
3
|
+
/**
 * Drive `agent` through `inputs` one turn at a time while a fresh
 * `CacheProfiler` observes it, then return the profiler's report extended
 * with timing and input-count fields.
 *
 * @param agent  Object exposing `observe(fn)` (which returns an unsubscribe
 *               function) and `generateText(input)`.
 * @param inputs Prompts to send, in order; each call is awaited before the next starts.
 * @param opts   Optional `{ now }` clock override; when absent, `Date.now()` is used.
 * @returns A new object containing everything from `profiler.report()`, plus
 *          `durationMs` (difference between two `now()` readings taken around
 *          the run) and `inputs` (the length of the `inputs` array).
 *          If any `generateText` call rejects, the rejection propagates after
 *          the observer has been unsubscribed.
 */
async function benchmarkCache(agent, inputs, opts) {
  const now = opts?.now ?? (() => Date.now());
  const profiler = new CacheProfiler();
  // Subscribe through a forwarding wrapper instead of handing over the bare
  // `profiler.observe` reference: a detached method loses its receiver if it
  // relies on `this` (CacheProfiler's implementation is not visible here).
  const off = agent.observe((...args) => profiler.observe(...args));
  const start = now();
  try {
    for (const input of inputs) {
      await agent.generateText(input);
    }
  } finally {
    // Unsubscribe even when a turn rejects so the observer never outlives the run.
    off();
  }
  return { ...profiler.report(), durationMs: now() - start, inputs: inputs.length };
}
|
|
17
|
+
/**
 * Run the same scenario through two named agents, one after the other, and
 * return both benchmark results in a single object keyed by agent name.
 *
 * @param scenario Prompts passed unchanged to `benchmarkCache` for each agent.
 * @param a        First `{ name, agent }` pair; benchmarked first.
 * @param b        Second `{ name, agent }` pair; benchmarked after `a` completes.
 * @param opts     Optional options forwarded to both `benchmarkCache` calls.
 * @returns `{ [a.name]: resultA, [b.name]: resultB }`.
 * @throws {Error} (as a rejected promise) when `a.name` equals `b.name`.
 */
async function compareCache(scenario, a, b, opts) {
  // Both results live in one object keyed by name, so identical names would make
  // the second result silently overwrite the first. Reject up front, before any
  // agent call is made.
  if (a.name === b.name) {
    throw new Error(`compareCache: agent names must be distinct (both are "${a.name}")`);
  }
  const ra = await benchmarkCache(a.agent, scenario, opts);
  const rb = await benchmarkCache(b.agent, scenario, opts);
  return { [a.name]: ra, [b.name]: rb };
}
|
|
22
|
+
/**
 * Run behavioral cases against an agent, one at a time, and tally the outcome.
 *
 * For each case the agent's `generateText(c.input)` is awaited, its `content`
 * is handed to `c.assert`, and the (awaited) return value is recorded as `passed`.
 * A case whose generation or assertion throws/rejects is recorded as failed
 * with an `error` string (`err.message` for `Error` instances, `String(err)`
 * otherwise) and does not stop the remaining cases.
 *
 * @param agent Object exposing `generateText(input)` resolving to `{ content }`.
 * @param cases Cases with `name`, `input`, and an `assert(output)` callback.
 * @returns `{ results, passed, total }` where `passed` counts results whose
 *          `passed` value is truthy and `total` is the number of results.
 */
async function runEvalCases(agent, cases) {
  const results = [];
  for (const c of cases) {
    // Declared outside the `try` so a throwing assertion still reports what the
    // agent produced; stays "" when generation itself failed.
    let output = "";
    try {
      const { content } = await agent.generateText(c.input);
      output = content;
      const passed = await c.assert(content);
      results.push({ name: c.name, passed, output: content });
    } catch (err) {
      results.push({
        name: c.name,
        passed: false,
        output,
        error: err instanceof Error ? err.message : String(err)
      });
    }
  }
  return { results, passed: results.filter((r) => r.passed).length, total: results.length };
}
|
|
40
|
+
export {
|
|
41
|
+
benchmarkCache,
|
|
42
|
+
compareCache,
|
|
43
|
+
runEvalCases
|
|
44
|
+
};
|
|
45
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { CacheProfiler, type CacheProfile, type Observer } from \"@halo-sdk/core\";\n\n// ── Cache cost benchmark (the differentiated \"evidence\") ──\n\n/** The minimal agent surface the benchmark drives. `HaloAgent` satisfies it. */\nexport interface BenchmarkableAgent {\n observe(fn: Observer): () => void;\n generateText(input: string): Promise<unknown>;\n}\n\nexport interface CacheBenchmarkResult extends CacheProfile {\n /** Wall-clock duration of the run in ms. */\n durationMs: number;\n /** Inputs that were run, in order. */\n inputs: number;\n}\n\n/**\n * Drive an agent through a multi-turn scenario and measure the prefix-cache\n * moat: hit rate, token split, estimated spend, and an A–F grade. This is the\n * internal *evidence* benchmark — it proves the cost/latency claims, it is not a\n * behavioral quality eval (see {@link runEvalCases} + promptfoo for that).\n */\nexport async function benchmarkCache(\n agent: BenchmarkableAgent,\n inputs: string[],\n opts?: { now?: () => number },\n): Promise<CacheBenchmarkResult> {\n const now = opts?.now ?? (() => Date.now());\n const profiler = new CacheProfiler();\n const off = agent.observe(profiler.observe);\n const start = now();\n try {\n for (const input of inputs) {\n await agent.generateText(input);\n }\n } finally {\n off();\n }\n return { ...profiler.report(), durationMs: now() - start, inputs: inputs.length };\n}\n\n/**\n * Compare two strategies (e.g. truncate vs. 
summarize-append) by running the\n * same scenario through two agent factories and reporting both cache profiles.\n * Used to demonstrate that cache-preserving context management retains hit-rate\n * where naive truncation collapses it.\n */\nexport async function compareCache(\n scenario: string[],\n a: { name: string; agent: BenchmarkableAgent },\n b: { name: string; agent: BenchmarkableAgent },\n opts?: { now?: () => number },\n): Promise<{ [name: string]: CacheBenchmarkResult }> {\n const ra = await benchmarkCache(a.agent, scenario, opts);\n const rb = await benchmarkCache(b.agent, scenario, opts);\n return { [a.name]: ra, [b.name]: rb };\n}\n\n// ── Behavioral eval seam ──\n\n/**\n * A behavioral test case. This package ships only a thin harness — for richer\n * behavioral/quality evaluation (LLM-as-judge, datasets, regression gates)\n * point a real tool like **promptfoo** or **vitest** at your agent. Halo's\n * differentiated contribution is the cache benchmark above, not a generic eval.\n */\nexport interface EvalCase {\n name: string;\n input: string;\n /** Return true if the output passes. */\n assert: (output: string) => boolean | Promise<boolean>;\n}\n\nexport interface EvalCaseResult {\n name: string;\n passed: boolean;\n output: string;\n error?: string;\n}\n\nexport interface EvalRunReport {\n results: EvalCaseResult[];\n passed: number;\n total: number;\n}\n\n/** Run behavioral cases against an agent and tally pass/fail. */\nexport async function runEvalCases(\n agent: { generateText(input: string): Promise<{ content: string }> },\n cases: EvalCase[],\n): Promise<EvalRunReport> {\n const results: EvalCaseResult[] = [];\n for (const c of cases) {\n try {\n const { content } = await agent.generateText(c.input);\n const passed = await c.assert(content);\n results.push({ name: c.name, passed, output: content });\n } catch (err: unknown) {\n results.push({\n name: c.name,\n passed: false,\n output: \"\",\n error: err instanceof Error ? 
err.message : String(err),\n });\n }\n }\n return { results, passed: results.filter((r) => r.passed).length, total: results.length };\n}\n"],"mappings":";AAAA,SAAS,qBAAuD;AAuBhE,eAAsB,eACpB,OACA,QACA,MAC+B;AAC/B,QAAM,MAAM,MAAM,QAAQ,MAAM,KAAK,IAAI;AACzC,QAAM,WAAW,IAAI,cAAc;AACnC,QAAM,MAAM,MAAM,QAAQ,SAAS,OAAO;AAC1C,QAAM,QAAQ,IAAI;AAClB,MAAI;AACF,eAAW,SAAS,QAAQ;AAC1B,YAAM,MAAM,aAAa,KAAK;AAAA,IAChC;AAAA,EACF,UAAE;AACA,QAAI;AAAA,EACN;AACA,SAAO,EAAE,GAAG,SAAS,OAAO,GAAG,YAAY,IAAI,IAAI,OAAO,QAAQ,OAAO,OAAO;AAClF;AAQA,eAAsB,aACpB,UACA,GACA,GACA,MACmD;AACnD,QAAM,KAAK,MAAM,eAAe,EAAE,OAAO,UAAU,IAAI;AACvD,QAAM,KAAK,MAAM,eAAe,EAAE,OAAO,UAAU,IAAI;AACvD,SAAO,EAAE,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC,EAAE,IAAI,GAAG,GAAG;AACtC;AA+BA,eAAsB,aACpB,OACA,OACwB;AACxB,QAAM,UAA4B,CAAC;AACnC,aAAW,KAAK,OAAO;AACrB,QAAI;AACF,YAAM,EAAE,QAAQ,IAAI,MAAM,MAAM,aAAa,EAAE,KAAK;AACpD,YAAM,SAAS,MAAM,EAAE,OAAO,OAAO;AACrC,cAAQ,KAAK,EAAE,MAAM,EAAE,MAAM,QAAQ,QAAQ,QAAQ,CAAC;AAAA,IACxD,SAAS,KAAc;AACrB,cAAQ,KAAK;AAAA,QACX,MAAM,EAAE;AAAA,QACR,QAAQ;AAAA,QACR,QAAQ;AAAA,QACR,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,MACxD,CAAC;AAAA,IACH;AAAA,EACF;AACA,SAAO,EAAE,SAAS,QAAQ,QAAQ,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE,QAAQ,OAAO,QAAQ,OAAO;AAC1F;","names":[]}
|
package/package.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@halo-sdk/eval",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Cache cost benchmark + behavioral-eval seam for Halo SDK — measure the prefix-cache moat, plug in promptfoo/vitest for quality",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"ai",
|
|
7
|
+
"benchmark",
|
|
8
|
+
"eval",
|
|
9
|
+
"llm",
|
|
10
|
+
"prefix-cache"
|
|
11
|
+
],
|
|
12
|
+
"license": "MIT",
|
|
13
|
+
"repository": {
|
|
14
|
+
"type": "git",
|
|
15
|
+
"url": "https://github.com/halo-sdk/halo-ai",
|
|
16
|
+
"directory": "packages/eval"
|
|
17
|
+
},
|
|
18
|
+
"files": [
|
|
19
|
+
"dist"
|
|
20
|
+
],
|
|
21
|
+
"type": "module",
|
|
22
|
+
"main": "./dist/index.js",
|
|
23
|
+
"types": "./dist/index.d.ts",
|
|
24
|
+
"exports": {
|
|
25
|
+
".": {
|
|
26
|
+
"types": "./dist/index.d.ts",
|
|
27
|
+
"import": "./dist/index.js",
|
|
28
|
+
"require": "./dist/index.cjs"
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"publishConfig": {
|
|
32
|
+
"access": "public"
|
|
33
|
+
},
|
|
34
|
+
"devDependencies": {
|
|
35
|
+
"typescript": "^5.8.0",
|
|
36
|
+
"vitest": "^3.0.0",
|
|
37
|
+
"@halo-sdk/core": "1.1.0"
|
|
38
|
+
},
|
|
39
|
+
"peerDependencies": {
|
|
40
|
+
"@halo-sdk/core": ">=1.1.0"
|
|
41
|
+
},
|
|
42
|
+
"scripts": {
|
|
43
|
+
"build": "tsc --build --emitDeclarationOnly && tsup",
|
|
44
|
+
"dev": "tsup --watch",
|
|
45
|
+
"clean": "del-cli dist *.tsbuildinfo",
|
|
46
|
+
"publint": "publint",
|
|
47
|
+
"test": "vitest run",
|
|
48
|
+
"test:watch": "vitest"
|
|
49
|
+
}
|
|
50
|
+
}
|