@speakeasy-api/docs-mcp-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +661 -0
- package/README.md +30 -0
- package/dist/bin.d.ts +3 -0
- package/dist/bin.d.ts.map +1 -0
- package/dist/bin.js +79 -0
- package/dist/bin.js.map +1 -0
- package/dist/delta.d.ts +28 -0
- package/dist/delta.d.ts.map +1 -0
- package/dist/delta.js +109 -0
- package/dist/delta.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -0
- package/dist/metrics.d.ts +27 -0
- package/dist/metrics.d.ts.map +1 -0
- package/dist/metrics.js +64 -0
- package/dist/metrics.js.map +1 -0
- package/dist/runner.d.ts +70 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +311 -0
- package/dist/runner.js.map +1 -0
- package/package.json +44 -0
package/README.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# @speakeasy-api/docs-mcp-eval
|
|
2
|
+
|
|
3
|
+
Evaluation and benchmarking harness for [Speakeasy Docs MCP](https://github.com/speakeasy-api/docs-mcp) search quality metrics.
|
|
4
|
+
|
|
5
|
+
**Beta.** Part of the [`docs-mcp`](https://github.com/speakeasy-api/docs-mcp) monorepo.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install -g @speakeasy-api/docs-mcp-eval
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
docs-mcp-eval --server-command docs-mcp-server --server-arg=--index-dir --server-arg=./index --cases ./cases.json
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Metrics
|
|
20
|
+
|
|
21
|
+
- **Recall\@K** -- fraction of expected chunks found in the top-K results
|
|
22
|
+
- **MRR** (Mean Reciprocal Rank) -- how early the first relevant result appears
|
|
23
|
+
- **Precision\@K** -- fraction of top-K results that are relevant
|
|
24
|
+
- **Delta reports** -- side-by-side comparison between evaluation runs
|
|
25
|
+
|
|
26
|
+
See [docs/eval.md](https://github.com/speakeasy-api/docs-mcp/blob/main/docs/eval.md) for the full evaluation framework specification.
|
|
27
|
+
|
|
28
|
+
## License
|
|
29
|
+
|
|
30
|
+
[AGPL-3.0](https://github.com/speakeasy-api/docs-mcp/blob/main/LICENSE)
|
package/dist/bin.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bin.d.ts","sourceRoot":"","sources":["../src/bin.ts"],"names":[],"mappings":""}
|
package/dist/bin.js
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { readFile, writeFile } from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { Command } from "commander";
|
|
5
|
+
import { generateDeltaMarkdown, toDeltaCases } from "./delta.js";
|
|
6
|
+
import { runEvaluationAgainstServer } from "./runner.js";
|
|
7
|
+
const program = new Command();
// CLI definition: loads the eval cases, optionally benchmarks an index build,
// runs the suite against an MCP server over stdio, and prints or writes the
// JSON result. When --baseline is given, a markdown delta report is attached to
// the payload and echoed to stderr.
program
    .name("docs-mcp-eval")
    .description("Run MCP docs eval suite against an MCP server over stdio")
    .requiredOption("--cases <path>", "Path to JSON array of eval cases")
    .requiredOption("--server-command <value>", "Command to launch the MCP server")
    .option("--server-arg <value>", "Server arg (repeatable)", collectValues, [])
    .option("--server-cwd <path>", "Working directory for server process")
    .option("--build-command <value>", "Optional command to run index build benchmark before eval")
    .option("--build-arg <value>", "Build arg (repeatable)", collectValues, [])
    .option("--build-cwd <path>", "Working directory for build command")
    .option("--warmup-queries <number>", "Number of warmup search_docs calls", parseIntOption, 0)
    .option("--baseline <path>", "Optional baseline eval JSON for delta markdown")
    .option("--out <path>", "Optional output JSON path")
    .action(async (options) => {
        // Validate the cases file before launching anything: the option is
        // documented as a JSON array, so reject any other JSON value early.
        const casesPath = path.resolve(options.cases);
        const cases = JSON.parse(await readFile(casesPath, "utf8"));
        if (!Array.isArray(cases)) {
            throw new Error(`expected a JSON array of eval cases in '${casesPath}'`);
        }

        // Read the baseline BEFORE the (potentially long) evaluation so that a
        // missing or malformed baseline fails fast instead of after the run.
        let baseline;
        if (options.baseline) {
            const baselinePath = path.resolve(options.baseline);
            baseline = JSON.parse(await readFile(baselinePath, "utf8"));
            if (baseline === null || typeof baseline !== "object" || baseline.summary == null) {
                throw new Error(`baseline '${baselinePath}' does not contain an eval summary`);
            }
        }

        const server = {
            command: options.serverCommand,
            args: options.serverArg,
            ...(options.serverCwd ? { cwd: path.resolve(options.serverCwd) } : {})
        };
        const build = options.buildCommand
            ? {
                command: options.buildCommand,
                args: options.buildArg,
                ...(options.buildCwd ? { cwd: path.resolve(options.buildCwd) } : {})
            }
            : undefined;

        const result = await runEvaluationAgainstServer({
            server,
            ...(build ? { build } : {}),
            cases,
            warmupQueries: options.warmupQueries,
            deterministic: true
        });

        let deltaMarkdown;
        if (baseline) {
            // A baseline that only carries a summary (no rankedCases) still
            // yields the metrics table; the per-case sections are skipped.
            const baselineRanked = Array.isArray(baseline.rankedCases) ? baseline.rankedCases : [];
            deltaMarkdown = generateDeltaMarkdown(
                { summary: result.summary, cases: toDeltaCases(result.rankedCases) },
                { summary: baseline.summary, cases: toDeltaCases(baselineRanked) }
            );
        }

        const payload = {
            ...result,
            deltaMarkdown
        };
        const serialized = `${JSON.stringify(payload, null, 2)}\n`;
        if (options.out) {
            const outPath = path.resolve(options.out);
            await writeFile(outPath, serialized);
            console.log(`wrote eval result to ${outPath}`);
        }
        else {
            process.stdout.write(serialized);
        }
        // The delta report goes to stderr so stdout stays machine-readable JSON.
        if (deltaMarkdown) {
            process.stderr.write(`${deltaMarkdown}\n`);
        }
    });
|
|
68
|
+
// Run the CLI. Failures thrown by the async action (unreadable files, invalid
// JSON, server errors) are reported on stderr and turned into a non-zero exit
// status explicitly, rather than relying on the runtime's unhandled-rejection
// policy.
program.parseAsync(process.argv).catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
|
|
69
|
+
/**
 * Accumulator for repeatable CLI options.
 *
 * @param {string} value - The value supplied for this occurrence of the option.
 * @param {string[]} previous - Values collected from earlier occurrences.
 * @returns {string[]} A new array holding the earlier values followed by `value`;
 *   `previous` itself is not modified.
 */
function collectValues(value, previous) {
    const collected = Array.from(previous);
    collected.push(value);
    return collected;
}
|
|
72
|
+
/**
 * Parse a CLI option value as a non-negative base-10 integer.
 *
 * The whole (trimmed) string must be digits, optionally preceded by `+`.
 * `Number.parseInt` alone stops at the first non-digit, so inputs such as
 * "5abc" or "1.5" would otherwise be silently accepted as 5 and 1.
 *
 * @param {string} value - Raw option text from the command line.
 * @returns {number} The parsed integer (>= 0).
 * @throws {Error} When `value` is not a non-negative safe integer.
 */
function parseIntOption(value) {
    const text = String(value).trim();
    const parsed = /^\+?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    // isSafeInteger also rejects NaN and digit strings too large to represent exactly.
    if (!Number.isSafeInteger(parsed) || parsed < 0) {
        throw new Error(`expected a non-negative integer, got '${value}'`);
    }
    return parsed;
}
|
|
79
|
+
//# sourceMappingURL=bin.js.map
|
package/dist/bin.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bin.js","sourceRoot":"","sources":["../src/bin.ts"],"names":[],"mappings":";AAEA,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACvD,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,qBAAqB,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AACjE,OAAO,EAAE,0BAA0B,EAA8C,MAAM,aAAa,CAAC;AAErG,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,eAAe,CAAC;KACrB,WAAW,CAAC,0DAA0D,CAAC;KACvE,cAAc,CAAC,gBAAgB,EAAE,kCAAkC,CAAC;KACpE,cAAc,CAAC,0BAA0B,EAAE,kCAAkC,CAAC;KAC9E,MAAM,CAAC,sBAAsB,EAAE,yBAAyB,EAAE,aAAa,EAAE,EAAc,CAAC;KACxF,MAAM,CAAC,qBAAqB,EAAE,sCAAsC,CAAC;KACrE,MAAM,CAAC,yBAAyB,EAAE,2DAA2D,CAAC;KAC9F,MAAM,CAAC,qBAAqB,EAAE,wBAAwB,EAAE,aAAa,EAAE,EAAc,CAAC;KACtF,MAAM,CAAC,oBAAoB,EAAE,qCAAqC,CAAC;KACnE,MAAM,CAAC,2BAA2B,EAAE,oCAAoC,EAAE,cAAc,EAAE,CAAC,CAAC;KAC5F,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,CAAC;KAC7E,MAAM,CAAC,cAAc,EAAE,2BAA2B,CAAC;KACnD,MAAM,CAAC,KAAK,EAAE,OAWd,EAAE,EAAE;IACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IAC9C,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IACnD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAoB,CAAC;IAEtD,MAAM,MAAM,GAAG;QACb,OAAO,EAAE,OAAO,CAAC,aAAa;QAC9B,IAAI,EAAE,OAAO,CAAC,SAAS;QACvB,GAAG,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KACvE,CAAC;IACF,MAAM,KAAK,GAAG,OAAO,CAAC,YAAY;QAChC,CAAC,CAAC;YACE,OAAO,EAAE,OAAO,CAAC,YAAY;YAC7B,IAAI,EAAE,OAAO,CAAC,QAAQ;YACtB,GAAG,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACrE;QACH,CAAC,CAAC,SAAS,CAAC;IAEd,MAAM,MAAM,GAAG,MAAM,0BAA0B,CAAC;QAC9C,MAAM;QACN,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3B,KAAK;QACL,aAAa,EAAE,OAAO,CAAC,aAAa;QACpC,aAAa,EAAE,IAAI;KACpB,CAAC,CAAC;IAEH,IAAI,aAAiC,CAAC;IACtC,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACpD,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;QACzD,MAAM,Q
AAQ,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAsB,CAAC;QAC9D,aAAa,GAAG,qBAAqB,CACnC,EAAE,OAAO,EAAE,MAAM,CAAC,OAAO,EAAE,KAAK,EAAE,YAAY,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,EACpE,EAAE,OAAO,EAAE,QAAQ,CAAC,OAAO,EAAE,KAAK,EAAE,YAAY,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CACzE,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG;QACd,GAAG,MAAM;QACT,aAAa;KACd,CAAC;IAEF,MAAM,UAAU,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC;IAC3D,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC1C,MAAM,SAAS,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,wBAAwB,OAAO,EAAE,CAAC,CAAC;IACjD,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,EAAE,CAAC;QAClB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,aAAa,IAAI,CAAC,CAAC;IAC7C,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,KAAK,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;AAEtC,SAAS,aAAa,CAAC,KAAa,EAAE,QAAkB;IACtD,OAAO,CAAC,GAAG,QAAQ,EAAE,KAAK,CAAC,CAAC;AAC9B,CAAC;AAED,SAAS,cAAc,CAAC,KAAa;IACnC,MAAM,MAAM,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAC1C,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3C,MAAM,IAAI,KAAK,CAAC,yCAAyC,KAAK,GAAG,CAAC,CAAC;IACrE,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
package/dist/delta.d.ts
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { EvalSummary, RankedCase } from "./metrics.js";
|
|
2
|
+
export interface DeltaCaseData {
|
|
3
|
+
/** Human-readable name for this case */
|
|
4
|
+
name: string;
|
|
5
|
+
/** Whether the expected chunk was found in the top 5 results */
|
|
6
|
+
passed: boolean;
|
|
7
|
+
/** The expected chunk ID for this case */
|
|
8
|
+
expectedChunkId: string;
|
|
9
|
+
/** Ranked chunk IDs returned from search */
|
|
10
|
+
rankedChunkIds: string[];
|
|
11
|
+
}
|
|
12
|
+
/** One side (current or baseline) of a delta comparison passed to generateDeltaMarkdown. */
export interface DeltaInput {
|
|
13
|
+
/** Aggregate metrics rendered in the metrics comparison table. */
summary: EvalSummary;
|
|
14
|
+
/** Case-level data; regression/improvement sections are only produced when both sides have at least one case. */
cases: DeltaCaseData[];
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Generate a markdown-formatted delta report comparing current eval run against a baseline.
|
|
18
|
+
*
|
|
19
|
+
* Includes a metrics comparison table and, when case-level data is provided,
|
|
20
|
+
* regression and improvement sections highlighting individual cases that changed.
|
|
21
|
+
*/
|
|
22
|
+
export declare function generateDeltaMarkdown(current: DeltaInput | EvalSummary, baseline: DeltaInput | EvalSummary): string;
|
|
23
|
+
/**
|
|
24
|
+
* Build a DeltaCaseData array from RankedCase array, using index-based names
|
|
25
|
+
* when no names are available.
|
|
26
|
+
*/
|
|
27
|
+
export declare function toDeltaCases(cases: RankedCase[]): DeltaCaseData[];
|
|
28
|
+
//# sourceMappingURL=delta.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"delta.d.ts","sourceRoot":"","sources":["../src/delta.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAE5D,MAAM,WAAW,aAAa;IAC5B,wCAAwC;IACxC,IAAI,EAAE,MAAM,CAAC;IACb,gEAAgE;IAChE,MAAM,EAAE,OAAO,CAAC;IAChB,0CAA0C;IAC1C,eAAe,EAAE,MAAM,CAAC;IACxB,4CAA4C;IAC5C,cAAc,EAAE,MAAM,EAAE,CAAC;CAC1B;AAED,MAAM,WAAW,UAAU;IACzB,OAAO,EAAE,WAAW,CAAC;IACrB,KAAK,EAAE,aAAa,EAAE,CAAC;CACxB;AAED;;;;;GAKG;AACH,wBAAgB,qBAAqB,CACnC,OAAO,EAAE,UAAU,GAAG,WAAW,EACjC,QAAQ,EAAE,UAAU,GAAG,WAAW,GACjC,MAAM,CA2ER;AAED;;;GAGG;AACH,wBAAgB,YAAY,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,aAAa,EAAE,CAOjE"}
|
package/dist/delta.js
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
 * Render a markdown delta report for a current eval run versus a baseline.
 *
 * The report always starts with a metrics table (baseline, current, delta).
 * When both sides carry case-level data, cases are matched by name and two
 * optional sections follow: "Regressions" (passed in baseline, fails now) and
 * "Improvements" (failed in baseline, passes now).
 *
 * @param current - Current run: either `{ summary, cases }` or a bare summary.
 * @param baseline - Baseline run in the same accepted shapes.
 * @returns {string} Markdown text with lines joined by "\n".
 */
export function generateDeltaMarkdown(current, baseline) {
    const now = normalizeDeltaInput(current);
    const before = normalizeDeltaInput(baseline);
    const nowSummary = normalizeSummary(now.summary);
    const beforeSummary = normalizeSummary(before.summary);

    // Table rows, in display order: [label, summary key].
    const metricColumns = [
        ["MRR@5", "mrrAt5"],
        ["NDCG@5", "ndcgAt5"],
        ["Avg Rounds to Right Doc", "avgRoundsToRightDoc"],
        ["Facet Precision", "facetPrecision"],
        ["Search p50 (ms)", "searchP50Ms"],
        ["Search p95 (ms)", "searchP95Ms"],
        ["Get Doc p50 (ms)", "getDocP50Ms"],
        ["Build Time (ms)", "buildTimeMs"],
        ["Peak RSS (MB)", "peakRssMb"]
    ];
    const report = [
        "| Metric | Baseline | Current | Delta |",
        "| --- | ---: | ---: | ---: |"
    ];
    for (const [label, key] of metricColumns) {
        report.push(metricRow(label, beforeSummary[key], nowSummary[key]));
    }

    // Case-level sections need data on both sides.
    if (now.cases.length === 0 || before.cases.length === 0) {
        return report.join("\n");
    }

    const priorByName = new Map();
    for (const prior of before.cases) {
        priorByName.set(prior.name, prior);
    }

    // Single pass over the current cases; cases without a baseline match are ignored.
    const regressed = [];
    const improved = [];
    for (const entry of now.cases) {
        const prior = priorByName.get(entry.name);
        if (!prior) {
            continue;
        }
        if (prior.passed && !entry.passed) {
            regressed.push([prior, entry]);
        }
        else if (!prior.passed && entry.passed) {
            improved.push(entry);
        }
    }

    if (regressed.length > 0) {
        report.push("", "### Regressions", "");
        for (const [prior, entry] of regressed) {
            report.push(`- **${entry.name}**: was rank ${rankLabel(prior)}, now rank ${rankLabel(entry)}`);
        }
    }
    if (improved.length > 0) {
        report.push("", "### Improvements", "");
        for (const entry of improved) {
            report.push(`- **${entry.name}**: now found at rank ${rankLabel(entry)}`);
        }
    }
    return report.join("\n");
}
|
|
66
|
+
/**
 * Convert ranked cases into the case-level shape used by the delta report.
 *
 * Each case is named by its position (`case-0`, `case-1`, ...) and counts as
 * passed when the expected chunk ID is among the first five ranked chunk IDs.
 *
 * @param cases - Ranked cases (`expectedChunkId`, `rankedChunkIds`).
 * @returns One delta case per input case, in the same order.
 */
export function toDeltaCases(cases) {
    return cases.map(function toDeltaCase(ranked, position) {
        const { expectedChunkId, rankedChunkIds } = ranked;
        const topFive = rankedChunkIds.slice(0, 5);
        return {
            name: `case-${position}`,
            passed: topFive.includes(expectedChunkId),
            expectedChunkId,
            rankedChunkIds
        };
    });
}
|
|
78
|
+
/**
 * Describe where a case's expected chunk sits in its ranked results.
 *
 * @param c - Case with `expectedChunkId` and `rankedChunkIds`.
 * @returns {string} The 1-based rank as a string, or "N/F" when the expected
 *   chunk does not appear anywhere in the ranked list.
 */
function rankLabel(c) {
    const { rankedChunkIds, expectedChunkId } = c;
    const zeroBased = rankedChunkIds.indexOf(expectedChunkId);
    if (zeroBased < 0) {
        return "N/F";
    }
    return String(zeroBased + 1);
}
|
|
82
|
+
/**
 * Normalize either accepted argument shape into `{ summary, cases }`.
 *
 * - An object with a `summary` property is treated as a delta input. It is
 *   returned as-is when `cases` is an array; otherwise a copy with `cases: []`
 *   is returned so callers can always read `cases.length`.
 * - Anything else is treated as a bare summary and wrapped with no cases.
 *
 * @param input - `{ summary, cases }` or a bare summary object.
 * @returns An object with `summary` and an array-valued `cases`.
 */
function normalizeDeltaInput(input) {
    if (!("summary" in input)) {
        return { summary: input, cases: [] };
    }
    if (Array.isArray(input.cases)) {
        return input;
    }
    // Summary present but case data missing or malformed: degrade to "no cases".
    return { ...input, cases: [] };
}
|
|
88
|
+
/**
 * Format one row of the metrics comparison table.
 *
 * @param {string} metric - Label shown in the first column.
 * @param {number} baseline - Baseline value.
 * @param {number} current - Current value.
 * @returns {string} A markdown table row; all numbers use six decimals and the
 *   delta (current - baseline) carries an explicit "+" when it is >= 0.
 */
function metricRow(metric, baseline, current) {
    const sixDecimals = (amount) => amount.toFixed(6);
    const change = current - baseline;
    const sign = change >= 0 ? "+" : "";
    const signedChange = `${sign}${sixDecimals(change)}`;
    return `| ${metric} | ${sixDecimals(baseline)} | ${sixDecimals(current)} | ${signedChange} |`;
}
|
|
93
|
+
/**
 * Produce a summary object whose nine metric fields are all finite numbers.
 *
 * Each field is passed through `coerceNumber`, so missing or non-finite values
 * become 0. Only the listed fields are copied.
 *
 * @param summary - Summary-like object (for example parsed from a baseline file).
 * @returns A new object with the metric fields in report order.
 */
function normalizeSummary(summary) {
    const metricKeys = [
        "mrrAt5",
        "ndcgAt5",
        "avgRoundsToRightDoc",
        "facetPrecision",
        "searchP50Ms",
        "searchP95Ms",
        "getDocP50Ms",
        "buildTimeMs",
        "peakRssMb"
    ];
    return Object.fromEntries(metricKeys.map((key) => [key, coerceNumber(summary[key])]));
}
|
|
106
|
+
/**
 * Return `value` when it is a finite number, otherwise 0.
 *
 * `Number.isFinite` does not coerce, so strings, `undefined`, `null`, `NaN`
 * and infinities all map to 0.
 */
function coerceNumber(value) {
    if (Number.isFinite(value)) {
        return value;
    }
    return 0;
}
|
|
109
|
+
//# sourceMappingURL=delta.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"delta.js","sourceRoot":"","sources":["../src/delta.ts"],"names":[],"mappings":"AAkBA;;;;;GAKG;AACH,MAAM,UAAU,qBAAqB,CACnC,OAAiC,EACjC,QAAkC;IAElC,MAAM,YAAY,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAC;IAClD,MAAM,aAAa,GAAG,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAEpD,MAAM,cAAc,GAAG,gBAAgB,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;IAC9D,MAAM,eAAe,GAAG,gBAAgB,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAEhE,MAAM,IAAI,GAAG;QACX,SAAS,CAAC,OAAO,EAAE,eAAe,CAAC,MAAM,EAAE,cAAc,CAAC,MAAM,CAAC;QACjE,SAAS,CAAC,QAAQ,EAAE,eAAe,CAAC,OAAO,EAAE,cAAc,CAAC,OAAO,CAAC;QACpE,SAAS,CACP,yBAAyB,EACzB,eAAe,CAAC,mBAAmB,EACnC,cAAc,CAAC,mBAAmB,CACnC;QACD,SAAS,CAAC,iBAAiB,EAAE,eAAe,CAAC,cAAc,EAAE,cAAc,CAAC,cAAc,CAAC;QAC3F,SAAS,CAAC,iBAAiB,EAAE,eAAe,CAAC,WAAW,EAAE,cAAc,CAAC,WAAW,CAAC;QACrF,SAAS,CAAC,iBAAiB,EAAE,eAAe,CAAC,WAAW,EAAE,cAAc,CAAC,WAAW,CAAC;QACrF,SAAS,CAAC,kBAAkB,EAAE,eAAe,CAAC,WAAW,EAAE,cAAc,CAAC,WAAW,CAAC;QACtF,SAAS,CAAC,iBAAiB,EAAE,eAAe,CAAC,WAAW,EAAE,cAAc,CAAC,WAAW,CAAC;QACrF,SAAS,CAAC,eAAe,EAAE,eAAe,CAAC,SAAS,EAAE,cAAc,CAAC,SAAS,CAAC;KAChF,CAAC;IAEF,MAAM,KAAK,GAAG;QACZ,yCAAyC;QACzC,8BAA8B;QAC9B,GAAG,IAAI;KACR,CAAC;IAEF,2EAA2E;IAC3E,IAAI,YAAY,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,aAAa,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpE,MAAM,eAAe,GAAG,IAAI,GAAG,CAC7B,aAAa,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAC5C,CAAC;QAEF,MAAM,WAAW,GAAG,YAAY,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YAClD,MAAM,EAAE,GAAG,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YACvC,IAAI,CAAC,EAAE;gBAAE,OAAO,KAAK,CAAC;YACtB,OAAO,EAAE,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,MAAM,YAAY,GAAG,YAAY,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACnD,MAAM,EAAE,GAAG,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YACvC,IAAI,CAAC,EAAE;gBAAE,OAAO,KAAK,CAAC;YACtB,OAAO,CAAC,EAAE,CAAC,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;YAC9B,KAAK,CAAC,IAAI,CAA
C,EAAE,CAAC,CAAC;YACf,KAAK,MAAM,CAAC,IAAI,WAAW,EAAE,CAAC;gBAC5B,MAAM,EAAE,GAAG,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAE,CAAC;gBACxC,MAAM,YAAY,GAAG,SAAS,CAAC,EAAE,CAAC,CAAC;gBACnC,MAAM,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;gBACjC,KAAK,CAAC,IAAI,CACR,OAAO,CAAC,CAAC,IAAI,gBAAgB,YAAY,cAAc,WAAW,EAAE,CACrE,CAAC;YACJ,CAAC;QACH,CAAC;QAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;YAC/B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,KAAK,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;gBAC/B,MAAM,WAAW,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;gBACnC,KAAK,CAAC,IAAI,CACR,OAAO,GAAG,CAAC,IAAI,yBAAyB,WAAW,EAAE,CACtD,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,YAAY,CAAC,KAAmB;IAC9C,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAC1B,IAAI,EAAE,QAAQ,CAAC,EAAE;QACjB,MAAM,EAAE,CAAC,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,eAAe,CAAC;QAChE,eAAe,EAAE,CAAC,CAAC,eAAe;QAClC,cAAc,EAAE,CAAC,CAAC,cAAc;KACjC,CAAC,CAAC,CAAC;AACN,CAAC;AAED,SAAS,SAAS,CAAC,CAAgB;IACjC,MAAM,GAAG,GAAG,CAAC,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC;IACxD,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;AAC5C,CAAC;AAED,SAAS,mBAAmB,CAAC,KAA+B;IAC1D,IAAI,SAAS,IAAI,KAAK,EAAE,CAAC;QACvB,OAAO,KAAK,CAAC;IACf,CAAC;IACD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;AACvC,CAAC;AAED,SAAS,SAAS,CAAC,MAAc,EAAE,QAAgB,EAAE,OAAe;IAClE,MAAM,KAAK,GAAG,OAAO,GAAG,QAAQ,CAAC;IACjC,MAAM,WAAW,GAAG,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;IAC3E,OAAO,KAAK,MAAM,MAAM,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,WAAW,IAAI,CAAC;AAC3F,CAAC;AAED,SAAS,gBAAgB,CAAC,OAAoB;IAC5C,OAAO;QACL,MAAM,EAAE,YAAY,CAAC,OAAO,CAAC,MAAM,CAAC;QACpC,OAAO,EAAE,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC;QACtC,mBAAmB,EAAE,YAAY,CAAC,OAAO,CAAC,mBAAmB,CAAC;QAC9D,cAAc,EAAE,
YAAY,CAAC,OAAO,CAAC,cAAc,CAAC;QACpD,WAAW,EAAE,YAAY,CAAC,OAAO,CAAC,WAAW,CAAC;QAC9C,WAAW,EAAE,YAAY,CAAC,OAAO,CAAC,WAAW,CAAC;QAC9C,WAAW,EAAE,YAAY,CAAC,OAAO,CAAC,WAAW,CAAC;QAC9C,WAAW,EAAE,YAAY,CAAC,OAAO,CAAC,WAAW,CAAC;QAC9C,SAAS,EAAE,YAAY,CAAC,OAAO,CAAC,SAAS,CAAC;KAC3C,CAAC;AACJ,CAAC;AAED,SAAS,YAAY,CAAC,KAAa;IACjC,OAAO,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;AAC5C,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,cAAc,CAAC;AAC7B,cAAc,YAAY,CAAC;AAC3B,cAAc,aAAa,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,cAAc,CAAC;AAC7B,cAAc,YAAY,CAAC;AAC3B,cAAc,aAAa,CAAC"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/** Result of one eval case, as consumed by the metric functions in this module. */
export interface RankedCase {
|
|
2
|
+
/** ID of the chunk the case expects to find; compared against rankedChunkIds. */
expectedChunkId: string;
|
|
3
|
+
/** Chunk IDs in ranked order; the MRR/NDCG computations only look at the first k entries. */
rankedChunkIds: string[];
|
|
4
|
+
/** Per-case round count; averaged by computeAvgRoundsToRightDoc. */
roundsToRightDoc: number;
|
|
5
|
+
}
|
|
6
|
+
/** Aggregate metrics for an eval run, as produced by summarizeCases (values rounded to 6 decimals). */
export interface EvalSummary {
|
|
7
|
+
/** Mean reciprocal rank of the expected chunk within the top 5 results (0 for cases where it is absent). */
mrrAt5: number;
|
|
8
|
+
/** Mean NDCG over the top 5 results, with a single relevant chunk per case. */
ndcgAt5: number;
|
|
9
|
+
/** Mean of RankedCase.roundsToRightDoc across all cases. */
avgRoundsToRightDoc: number;
|
|
10
|
+
/** Fraction of cases where the expected chunk was found in the top 5 results */
|
|
11
|
+
facetPrecision: number;
|
|
12
|
+
/** 50th percentile of the recorded search latencies, in milliseconds (0 when none were recorded). */
searchP50Ms: number;
|
|
13
|
+
/** 95th percentile of the recorded search latencies, in milliseconds (0 when none were recorded). */
searchP95Ms: number;
|
|
14
|
+
/** 50th percentile of the recorded get-doc latencies, in milliseconds (0 when none were recorded). */
getDocP50Ms: number;
|
|
15
|
+
/** Build time in milliseconds; 0 when no build timing was supplied. */
buildTimeMs: number;
|
|
16
|
+
/** Peak RSS in MB; 0 when no value was supplied. */
peakRssMb: number;
|
|
17
|
+
}
|
|
18
|
+
export declare function computeMrrAtK(cases: RankedCase[], k: number): number;
|
|
19
|
+
export declare function computeNdcgAtK(cases: RankedCase[], k: number): number;
|
|
20
|
+
export declare function computeAvgRoundsToRightDoc(cases: RankedCase[]): number;
|
|
21
|
+
export declare function summarizeCases(cases: RankedCase[], timings?: {
|
|
22
|
+
searchLatenciesMs?: number[];
|
|
23
|
+
getDocLatenciesMs?: number[];
|
|
24
|
+
buildTimeMs?: number;
|
|
25
|
+
peakRssMb?: number;
|
|
26
|
+
}): EvalSummary;
|
|
27
|
+
//# sourceMappingURL=metrics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metrics.d.ts","sourceRoot":"","sources":["../src/metrics.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,UAAU;IACzB,eAAe,EAAE,MAAM,CAAC;IACxB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,gBAAgB,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,WAAW;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,mBAAmB,EAAE,MAAM,CAAC;IAC5B,gFAAgF;IAChF,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,wBAAgB,aAAa,CAAC,KAAK,EAAE,UAAU,EAAE,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAcpE;AAED,wBAAgB,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAiBrE;AAED,wBAAgB,0BAA0B,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,MAAM,CAOtE;AAED,wBAAgB,cAAc,CAC5B,KAAK,EAAE,UAAU,EAAE,EACnB,OAAO,GAAE;IACP,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC7B,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC7B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACf,GACL,WAAW,CAeb"}
|
package/dist/metrics.js
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
 * Mean reciprocal rank of the expected chunk within the first `k` results.
 *
 * A case contributes 1/rank (1-based) when its expected chunk is among the
 * first `k` ranked chunk IDs, and 0 otherwise.
 *
 * @param cases - Ranked cases (`expectedChunkId`, `rankedChunkIds`).
 * @param {number} k - Cut-off; only the first `k` ranked IDs are considered.
 * @returns {number} The mean over all cases, or 0 for an empty list.
 */
export function computeMrrAtK(cases, k) {
    if (cases.length === 0) {
        return 0;
    }
    let reciprocalSum = 0;
    for (const { rankedChunkIds, expectedChunkId } of cases) {
        const rank = rankedChunkIds.slice(0, k).indexOf(expectedChunkId) + 1;
        if (rank > 0) {
            reciprocalSum += 1 / rank;
        }
    }
    return reciprocalSum / cases.length;
}
|
|
14
|
+
/**
 * Mean NDCG@k with exactly one relevant chunk per case.
 *
 * With a single relevant item the ideal DCG is 1, so a case scores
 * 1 / log2(position + 2) for the 0-based position of its expected chunk within
 * the first `k` results, and 0 when the chunk is not there.
 *
 * @param cases - Ranked cases (`expectedChunkId`, `rankedChunkIds`).
 * @param {number} k - Cut-off; only the first `k` ranked IDs are considered.
 * @returns {number} The mean over all cases, or 0 for an empty list.
 */
export function computeNdcgAtK(cases, k) {
    if (cases.length === 0) {
        return 0;
    }
    const IDEAL_DCG = 1;
    let gainSum = 0;
    for (const { rankedChunkIds, expectedChunkId } of cases) {
        const position = rankedChunkIds.slice(0, k).indexOf(expectedChunkId);
        if (position >= 0) {
            const discounted = 1 / log2(position + 2);
            gainSum += discounted / IDEAL_DCG;
        }
    }
    return gainSum / cases.length;
}
|
|
29
|
+
/**
 * Average of `roundsToRightDoc` across the given cases.
 *
 * @param cases - Ranked cases carrying a numeric `roundsToRightDoc`.
 * @returns {number} The arithmetic mean, or 0 for an empty list.
 */
export function computeAvgRoundsToRightDoc(cases) {
    if (cases.length === 0) {
        return 0;
    }
    let roundsSum = 0;
    for (const { roundsToRightDoc } of cases) {
        roundsSum += roundsToRightDoc;
    }
    return roundsSum / cases.length;
}
|
|
36
|
+
/**
 * Aggregate ranked cases and optional timing data into an eval summary.
 *
 * Ranking metrics use a cut-off of 5. Latency percentiles fall back to 0 when
 * no samples are supplied, as do build time and peak RSS. Every value is
 * rounded to six decimals.
 *
 * @param cases - Ranked cases to score.
 * @param timings - Optional `searchLatenciesMs`, `getDocLatenciesMs`,
 *   `buildTimeMs` and `peakRssMb`.
 * @returns The summary object with fields in report order.
 */
export function summarizeCases(cases, timings = {}) {
    const searchSamples = timings.searchLatenciesMs ?? [];
    const getDocSamples = timings.getDocLatenciesMs ?? [];

    // Share of cases whose expected chunk shows up in the top five results;
    // the `|| 1` keeps an empty case list from dividing by zero.
    const topFiveHits = cases.filter(({ rankedChunkIds, expectedChunkId }) => rankedChunkIds.slice(0, 5).includes(expectedChunkId));
    const hitRate = topFiveHits.length / (cases.length || 1);

    return {
        mrrAt5: round(computeMrrAtK(cases, 5)),
        ndcgAt5: round(computeNdcgAtK(cases, 5)),
        avgRoundsToRightDoc: round(computeAvgRoundsToRightDoc(cases)),
        facetPrecision: round(hitRate),
        searchP50Ms: round(percentile(searchSamples, 0.5)),
        searchP95Ms: round(percentile(searchSamples, 0.95)),
        getDocP50Ms: round(percentile(getDocSamples, 0.5)),
        buildTimeMs: round(timings.buildTimeMs ?? 0),
        peakRssMb: round(timings.peakRssMb ?? 0)
    };
}
|
|
50
|
+
/**
 * Base-2 logarithm via the change-of-base formula ln(value) / ln(2).
 *
 * @param {number} value - Number to take the logarithm of.
 * @returns {number} log2(value).
 */
function log2(value) {
    const naturalLog = Math.log(value);
    return naturalLog / Math.log(2);
}
|
|
53
|
+
/**
 * Round a number to six decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` formatted with `toFixed(6)` and parsed back to a number.
 */
function round(value) {
    const fixedText = value.toFixed(6);
    return Number.parseFloat(fixedText);
}
|
|
56
|
+
/**
 * Nearest-rank percentile of a list of numbers.
 *
 * The samples are sorted ascending on a copy, and the element at index
 * ceil(n * p) - 1 (clamped into the valid range) is returned.
 *
 * @param {number[]} values - Samples; the array is not modified.
 * @param {number} p - Percentile as a fraction, e.g. 0.5 or 0.95.
 * @returns {number} The selected sample, or 0 when there are no samples.
 */
function percentile(values, p) {
    if (values.length === 0) {
        return 0;
    }
    const ascending = Array.from(values);
    ascending.sort((left, right) => left - right);
    const lastIndex = ascending.length - 1;
    const nearestRank = Math.ceil(ascending.length * p) - 1;
    const clamped = Math.min(lastIndex, Math.max(0, nearestRank));
    const picked = ascending[clamped];
    return picked ?? 0;
}
|
|
64
|
+
//# sourceMappingURL=metrics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metrics.js","sourceRoot":"","sources":["../src/metrics.ts"],"names":[],"mappings":"AAmBA,MAAM,UAAU,aAAa,CAAC,KAAmB,EAAE,CAAS;IAC1D,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE;QAC3C,MAAM,QAAQ,GAAG,QAAQ,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC;QACvF,IAAI,QAAQ,GAAG,CAAC,EAAE,CAAC;YACjB,OAAO,GAAG,CAAC;QACb,CAAC;QACD,OAAO,GAAG,GAAG,CAAC,GAAG,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC;IAClC,CAAC,EAAE,CAAC,CAAC,CAAC;IAEN,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC;AAC9B,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,KAAmB,EAAE,CAAS;IAC3D,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE;QAC3C,MAAM,QAAQ,GAAG,QAAQ,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC;QACvF,IAAI,QAAQ,GAAG,CAAC,EAAE,CAAC;YACjB,OAAO,GAAG,CAAC;QACb,CAAC;QAED,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC;QACnC,MAAM,QAAQ,GAAG,CAAC,CAAC;QACnB,OAAO,GAAG,GAAG,GAAG,GAAG,QAAQ,CAAC;IAC9B,CAAC,EAAE,CAAC,CAAC,CAAC;IAEN,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC;AAC9B,CAAC;AAED,MAAM,UAAU,0BAA0B,CAAC,KAAmB;IAC5D,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE,CAAC,GAAG,GAAG,QAAQ,CAAC,gBAAgB,EAAE,CAAC,CAAC,CAAC;IAClF,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC;AAC9B,CAAC;AAED,MAAM,UAAU,cAAc,CAC5B,KAAmB,EACnB,UAKI,EAAE;IAEN,OAAO;QACL,MAAM,EAAE,KAAK,CAAC,aAAa,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtC,OAAO,EAAE,KAAK,CAAC,cAAc,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACxC,mBAAmB,EAAE,KAAK,CAAC,0BAA0B,CAAC,KAAK,CAAC,CAAC;QAC7D,cAAc,EAAE,KAAK,CACnB,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,MAAM;YAClF,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,CACtB;QACD,WAAW,EAAE,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,iBAAiB,IAAI,EAAE,EAA
E,GAAG,CAAC,CAAC;QACpE,WAAW,EAAE,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,iBAAiB,IAAI,EAAE,EAAE,IAAI,CAAC,CAAC;QACrE,WAAW,EAAE,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,iBAAiB,IAAI,EAAE,EAAE,GAAG,CAAC,CAAC;QACpE,WAAW,EAAE,KAAK,CAAC,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;QAC5C,SAAS,EAAE,KAAK,CAAC,OAAO,CAAC,SAAS,IAAI,CAAC,CAAC;KACzC,CAAC;AACJ,CAAC;AAED,SAAS,IAAI,CAAC,KAAa;IACzB,OAAO,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,KAAK,CAAC,KAAa;IAC1B,OAAO,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;AAClC,CAAC;AAED,SAAS,UAAU,CAAC,MAAgB,EAAE,CAAS;IAC7C,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,MAAM,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACjD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;IACzF,OAAO,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;AAC5B,CAAC"}
|
package/dist/runner.d.ts
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import type { EvalSummary, RankedCase } from "./metrics.js";
|
|
2
|
+
export interface EvalRunInput {
|
|
3
|
+
cases: RankedCase[];
|
|
4
|
+
timings?: {
|
|
5
|
+
searchLatenciesMs?: number[];
|
|
6
|
+
getDocLatenciesMs?: number[];
|
|
7
|
+
buildTimeMs?: number;
|
|
8
|
+
peakRssMb?: number;
|
|
9
|
+
};
|
|
10
|
+
model?: {
|
|
11
|
+
provider: string;
|
|
12
|
+
model: string;
|
|
13
|
+
};
|
|
14
|
+
deterministic?: boolean;
|
|
15
|
+
}
|
|
16
|
+
export interface EvalRunOutput {
|
|
17
|
+
summary: EvalSummary;
|
|
18
|
+
metadata: {
|
|
19
|
+
deterministic: boolean;
|
|
20
|
+
provider: string | null;
|
|
21
|
+
model: string | null;
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
/** How to launch the MCP server under evaluation (the CLI fills this from the --server-* options). */
export interface EvalServerConfig {
|
|
25
|
+
/** Command to launch the MCP server. */
command: string;
|
|
26
|
+
/** Arguments passed to the server command. */
args?: string[];
|
|
27
|
+
/** Working directory for the server process. */
cwd?: string;
|
|
28
|
+
/** NOTE(review): presumably extra environment variables for the server process; the CLI never sets it — confirm in the runner. */
env?: Record<string, string>;
|
|
29
|
+
}
|
|
30
|
+
/** Optional index-build benchmark command run before the eval (the CLI fills this from the --build-* options). */
export interface EvalBuildConfig {
|
|
31
|
+
/** Command to run for the index build benchmark. */
command: string;
|
|
32
|
+
/** Arguments passed to the build command. */
args?: string[];
|
|
33
|
+
/** Working directory for the build command. */
cwd?: string;
|
|
34
|
+
/** NOTE(review): presumably extra environment variables for the build process; the CLI never sets it — confirm in the runner. */
env?: Record<string, string>;
|
|
35
|
+
}
|
|
36
|
+
export interface EvalQueryCase {
|
|
37
|
+
query: string;
|
|
38
|
+
expectedChunkId: string;
|
|
39
|
+
filters?: Record<string, string>;
|
|
40
|
+
limit?: number;
|
|
41
|
+
maxRounds?: number;
|
|
42
|
+
}
|
|
43
|
+
export interface EvalHarnessInput {
|
|
44
|
+
server: EvalServerConfig;
|
|
45
|
+
build?: EvalBuildConfig;
|
|
46
|
+
cases: EvalQueryCase[];
|
|
47
|
+
warmupQueries?: number;
|
|
48
|
+
model?: {
|
|
49
|
+
provider: string;
|
|
50
|
+
model: string;
|
|
51
|
+
};
|
|
52
|
+
deterministic?: boolean;
|
|
53
|
+
}
|
|
54
|
+
export interface EvalHarnessOutput extends EvalRunOutput {
|
|
55
|
+
rankedCases: RankedCase[];
|
|
56
|
+
stats: {
|
|
57
|
+
searchLatenciesMs: number[];
|
|
58
|
+
getDocLatenciesMs: number[];
|
|
59
|
+
buildTimeMs: number;
|
|
60
|
+
peakRssMb: number;
|
|
61
|
+
};
|
|
62
|
+
}
|
|
63
|
+
export declare function runEvaluation(input: EvalRunInput): EvalRunOutput;
|
|
64
|
+
export declare function runEvaluationAgainstServer(input: EvalHarnessInput): Promise<EvalHarnessOutput>;
|
|
65
|
+
export declare function computeRoundsToRightDoc(input: {
|
|
66
|
+
found: boolean;
|
|
67
|
+
roundsExecuted: number;
|
|
68
|
+
maxRounds: number;
|
|
69
|
+
}): number;
|
|
70
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAK5D,MAAM,WAAW,YAAY;IAC3B,KAAK,EAAE,UAAU,EAAE,CAAC;IACpB,OAAO,CAAC,EAAE;QACR,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;QAC7B,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;QAC7B,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,KAAK,CAAC,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,KAAK,EAAE,MAAM,CAAC;KACf,CAAC;IACF,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,WAAW,CAAC;IACrB,QAAQ,EAAE;QACR,aAAa,EAAE,OAAO,CAAC;QACvB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;QACxB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;KACtB,CAAC;CACH;AAED,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC9B;AAED,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC9B;AAED,MAAM,WAAW,aAAa;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,eAAe,EAAE,MAAM,CAAC;IACxB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B,MAAM,EAAE,gBAAgB,CAAC;IACzB,KAAK,CAAC,EAAE,eAAe,CAAC;IACxB,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,KAAK,EAAE,MAAM,CAAC;KACf,CAAC;IACF,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,iBAAkB,SAAQ,aAAa;IACtD,WAAW,EAAE,UAAU,EAAE,CAAC;IAC1B,KAAK,EAAE;QACL,iBAAiB,EAAE,MAAM,EAAE,CAAC;QAC5B,iBAAiB,EAAE,MAAM,EAAE,CAAC;QAC5B,WAAW,EAAE,MAAM,CAAC;QACpB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;CACH;AAED,wBAAgB,aAAa,CAAC,KAAK,EAAE,YAAY,GAAG,aAAa,CAUhE;AAED,wBAAsB,0BAA0B,CAC9C,KAAK,EAAE,gBAAgB,GACtB,OAAO,CAAC,iBAAiB,CAAC,CAgF5B;AAoGD,wBAAgB,uBAAuB,CAAC,KAAK,EAAE;IAC7C,KAAK,EAAE,OAAO,CAAC;IACf,cAAc,EAAE,MAAM,CAAC;IACvB,SAAS,EAAE,MAAM,CAAC;CACnB,GAAG,MAAM,CAET"}
|