@deepagents/evals 0.19.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/dataset/index.d.ts +3 -0
- package/dist/dataset/index.d.ts.map +1 -1
- package/dist/dataset/index.js +84 -1
- package/dist/dataset/index.js.map +3 -3
- package/dist/dataset/record-selection.d.ts +8 -0
- package/dist/dataset/record-selection.d.ts.map +1 -0
- package/dist/engine/index.d.ts.map +1 -1
- package/dist/engine/index.js +6 -3
- package/dist/engine/index.js.map +2 -2
- package/dist/evaluate/index.d.ts +16 -3
- package/dist/evaluate/index.d.ts.map +1 -1
- package/dist/evaluate/index.js +225 -359
- package/dist/evaluate/index.js.map +3 -3
- package/dist/index.d.ts +5 -5
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +429 -110
- package/dist/index.js.map +4 -4
- package/dist/reporters/console.d.ts.map +1 -1
- package/dist/reporters/csv.d.ts.map +1 -1
- package/dist/reporters/html.d.ts.map +1 -1
- package/dist/reporters/index.js +129 -36
- package/dist/reporters/index.js.map +3 -3
- package/dist/reporters/markdown.d.ts.map +1 -1
- package/dist/scorers/index.d.ts +2 -6
- package/dist/scorers/index.d.ts.map +1 -1
- package/dist/scorers/index.js +32 -54
- package/dist/scorers/index.js.map +2 -2
- package/dist/store/index.d.ts +2 -0
- package/dist/store/index.d.ts.map +1 -1
- package/dist/store/index.js +22 -0
- package/dist/store/index.js.map +2 -2
- package/package.json +3 -2
package/dist/scorers/index.js
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
// packages/evals/src/scorers/index.ts
|
|
2
|
-
import {
|
|
3
|
-
|
|
2
|
+
import {
|
|
3
|
+
Factuality as AutoevalsFactuality,
|
|
4
|
+
Levenshtein as AutoevalsLevenshtein
|
|
5
|
+
} from "autoevals";
|
|
4
6
|
var exactMatch = async ({ output, expected }) => {
|
|
5
7
|
const exp = expected == null ? "" : String(expected);
|
|
6
8
|
if (output === exp) return { score: 1 };
|
|
@@ -22,32 +24,32 @@ function regex(pattern) {
|
|
|
22
24
|
return { score: pattern.test(output) ? 1 : 0 };
|
|
23
25
|
};
|
|
24
26
|
}
|
|
25
|
-
function
|
|
26
|
-
if (
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
27
|
+
function normalizeScore(score) {
|
|
28
|
+
if (typeof score !== "number" || !Number.isFinite(score)) return 0;
|
|
29
|
+
return Math.max(0, Math.min(1, score));
|
|
30
|
+
}
|
|
31
|
+
function reasonFromMetadata(metadata) {
|
|
32
|
+
if (!metadata) return void 0;
|
|
33
|
+
const candidates = [
|
|
34
|
+
metadata.reason,
|
|
35
|
+
metadata.rationale,
|
|
36
|
+
metadata.explanation
|
|
37
|
+
];
|
|
38
|
+
for (const candidate of candidates) {
|
|
39
|
+
if (typeof candidate === "string" && candidate.trim().length > 0) {
|
|
40
|
+
return candidate;
|
|
36
41
|
}
|
|
37
|
-
[prev, curr] = [curr, prev];
|
|
38
42
|
}
|
|
39
|
-
return
|
|
43
|
+
return void 0;
|
|
40
44
|
}
|
|
41
45
|
var levenshtein = async ({ output, expected }) => {
|
|
42
46
|
const exp = expected == null ? "" : String(expected);
|
|
43
|
-
|
|
44
|
-
const
|
|
45
|
-
const distance = levenshteinDistance(output, exp);
|
|
46
|
-
const score = Math.max(0, 1 - distance / maxLen);
|
|
47
|
-
if (score === 1) return { score };
|
|
47
|
+
const result = await AutoevalsLevenshtein({ output, expected: exp });
|
|
48
|
+
const score = normalizeScore(result.score);
|
|
48
49
|
return {
|
|
49
50
|
score,
|
|
50
|
-
reason:
|
|
51
|
+
reason: reasonFromMetadata(result.metadata),
|
|
52
|
+
metadata: result.metadata
|
|
51
53
|
};
|
|
52
54
|
};
|
|
53
55
|
function deepEqual(a, b) {
|
|
@@ -81,42 +83,19 @@ var jsonMatch = async ({ output, expected }) => {
|
|
|
81
83
|
return { score: 0, reason: "Failed to parse JSON" };
|
|
82
84
|
}
|
|
83
85
|
};
|
|
84
|
-
var llmScorerSchema = z.object({
|
|
85
|
-
score: z.number().min(0).max(1),
|
|
86
|
-
reason: z.string()
|
|
87
|
-
});
|
|
88
|
-
function llmJudge(config) {
|
|
89
|
-
return async ({ input, output, expected }) => {
|
|
90
|
-
const { object } = await generateObject({
|
|
91
|
-
model: config.model,
|
|
92
|
-
schema: llmScorerSchema,
|
|
93
|
-
prompt: `You are an expert evaluator. Grade the output based on the following criteria:
|
|
94
|
-
${config.criteria}
|
|
95
|
-
|
|
96
|
-
Input: ${JSON.stringify(input)}
|
|
97
|
-
Output: ${output}
|
|
98
|
-
${expected != null ? `Expected: ${JSON.stringify(expected)}` : ""}
|
|
99
|
-
|
|
100
|
-
Return a score from 0.0 to 1.0 and a brief reason.`
|
|
101
|
-
});
|
|
102
|
-
return { score: object.score, reason: object.reason };
|
|
103
|
-
};
|
|
104
|
-
}
|
|
105
86
|
function factuality(config) {
|
|
106
87
|
return async ({ input, output, expected }) => {
|
|
107
|
-
const
|
|
88
|
+
const result = await AutoevalsFactuality({
|
|
108
89
|
model: config.model,
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
Input: ${JSON.stringify(input)}
|
|
113
|
-
Output: ${output}
|
|
114
|
-
Expected reference: ${JSON.stringify(expected)}
|
|
115
|
-
|
|
116
|
-
Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.
|
|
117
|
-
Return a score from 0.0 to 1.0 and a brief reason.`
|
|
90
|
+
input: typeof input === "string" ? input : JSON.stringify(input),
|
|
91
|
+
output,
|
|
92
|
+
expected: expected == null ? void 0 : String(expected)
|
|
118
93
|
});
|
|
119
|
-
return {
|
|
94
|
+
return {
|
|
95
|
+
score: normalizeScore(result.score),
|
|
96
|
+
reason: reasonFromMetadata(result.metadata),
|
|
97
|
+
metadata: result.metadata
|
|
98
|
+
};
|
|
120
99
|
};
|
|
121
100
|
}
|
|
122
101
|
function all(...scorers) {
|
|
@@ -168,7 +147,6 @@ export {
|
|
|
168
147
|
includes,
|
|
169
148
|
jsonMatch,
|
|
170
149
|
levenshtein,
|
|
171
|
-
llmJudge,
|
|
172
150
|
regex,
|
|
173
151
|
weighted
|
|
174
152
|
};
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../src/scorers/index.ts"],
|
|
4
|
-
"sourcesContent": ["import
|
|
5
|
-
"mappings": ";
|
|
4
|
+
"sourcesContent": ["import {\n Factuality as AutoevalsFactuality,\n Levenshtein as AutoevalsLevenshtein,\n} from 'autoevals';\n\nexport interface ScorerArgs {\n input: unknown;\n output: string;\n expected?: unknown;\n}\n\nexport interface ScorerResult {\n score: number;\n reason?: string;\n metadata?: Record<string, unknown>;\n}\n\nexport type Scorer = (args: ScorerArgs) => Promise<ScorerResult>;\n\nexport const exactMatch: Scorer = async ({ output, expected }) => {\n const exp = expected == null ? '' : String(expected);\n if (output === exp) return { score: 1.0 };\n return {\n score: 0.0,\n reason: `Output does not exactly match expected. Expected \"${exp}\" but got \"${output}\".`,\n };\n};\n\nexport const includes: Scorer = async ({ output, expected }) => {\n const exp = expected == null ? '' : String(expected);\n if (output.includes(exp)) return { score: 1.0 };\n return {\n score: 0.0,\n reason: `Output does not include expected substring \"${exp}\".`,\n };\n};\n\nexport function regex(pattern: RegExp): Scorer {\n return async ({ output }) => {\n return { score: pattern.test(output) ? 1.0 : 0.0 };\n };\n}\n\nfunction normalizeScore(score: number | null | undefined): number {\n if (typeof score !== 'number' || !Number.isFinite(score)) return 0;\n return Math.max(0, Math.min(1, score));\n}\n\nfunction reasonFromMetadata(\n metadata?: Record<string, unknown>,\n): string | undefined {\n if (!metadata) return undefined;\n const candidates = [\n metadata.reason,\n metadata.rationale,\n metadata.explanation,\n ];\n for (const candidate of candidates) {\n if (typeof candidate === 'string' && candidate.trim().length > 0) {\n return candidate;\n }\n }\n return undefined;\n}\n\nexport const levenshtein: Scorer = async ({ output, expected }) => {\n const exp = expected == null ? '' : String(expected);\n const result = await AutoevalsLevenshtein({ output, expected: exp });\n const score = normalizeScore(result.score);\n return {\n score,\n reason: reasonFromMetadata(result.metadata),\n metadata: result.metadata,\n };\n};\n\nfunction deepEqual(a: unknown, b: unknown): boolean {\n if (a === b) return true;\n if (a == null || b == null) return false;\n if (typeof a !== typeof b) return false;\n\n if (Array.isArray(a)) {\n if (!Array.isArray(b) || a.length !== b.length) return false;\n return a.every((val, i) => deepEqual(val, b[i]));\n }\n\n if (typeof a === 'object') {\n const keysA = Object.keys(a as Record<string, unknown>).sort();\n const keysB = Object.keys(b as Record<string, unknown>).sort();\n if (keysA.length !== keysB.length) return false;\n return keysA.every(\n (key, i) =>\n keysB[i] === key &&\n deepEqual(\n (a as Record<string, unknown>)[key],\n (b as Record<string, unknown>)[key],\n ),\n );\n }\n\n return false;\n}\n\nexport const jsonMatch: Scorer = async ({ output, expected }) => {\n try {\n const parsedOutput = JSON.parse(output);\n const parsedExpected =\n typeof expected === 'string' ? JSON.parse(expected) : expected;\n if (deepEqual(parsedOutput, parsedExpected)) return { score: 1.0 };\n return { score: 0.0, reason: 'JSON payload differs from expected JSON.' };\n } catch {\n return { score: 0.0, reason: 'Failed to parse JSON' };\n }\n};\n\nexport function factuality(config: { model: string }): Scorer {\n return async ({ input, output, expected }) => {\n const result = await AutoevalsFactuality({\n model: config.model,\n input: typeof input === 'string' ? input : JSON.stringify(input),\n output,\n expected: expected == null ? undefined : String(expected),\n });\n return {\n score: normalizeScore(result.score),\n reason: reasonFromMetadata(result.metadata),\n metadata: result.metadata,\n };\n };\n}\n\nexport function all(...scorers: Scorer[]): Scorer {\n return async (args) => {\n if (scorers.length === 0) return { score: 1.0 };\n const results = await Promise.all(scorers.map((s) => s(args)));\n const minResult = results.reduce((min, r) =>\n r.score < min.score ? r : min,\n );\n const reasons = results\n .filter((r) => r.reason)\n .map((r) => r.reason)\n .join('; ');\n return { score: minResult.score, reason: reasons || undefined };\n };\n}\n\nexport function any(...scorers: Scorer[]): Scorer {\n return async (args) => {\n if (scorers.length === 0) return { score: 0.0 };\n const results = await Promise.all(scorers.map((s) => s(args)));\n const maxResult = results.reduce((max, r) =>\n r.score > max.score ? r : max,\n );\n return { score: maxResult.score, reason: maxResult.reason };\n };\n}\n\nexport function weighted(\n config: Record<string, { scorer: Scorer; weight: number }>,\n): Scorer {\n return async (args) => {\n const entries = Object.entries(config);\n const results = await Promise.all(\n entries.map(async ([name, { scorer, weight }]) => ({\n name,\n result: await scorer(args),\n weight,\n })),\n );\n const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);\n const weightedScore = results.reduce(\n (sum, r) => sum + r.result.score * r.weight,\n 0,\n );\n const score = totalWeight > 0 ? weightedScore / totalWeight : 0;\n const reasons = results\n .map((r) => `${r.name}: ${r.result.score.toFixed(2)} (w=${r.weight})`)\n .join(', ');\n return { score, reason: reasons || undefined };\n };\n}\n"],
|
|
5
|
+
"mappings": ";AAAA;AAAA,EACE,cAAc;AAAA,EACd,eAAe;AAAA,OACV;AAgBA,IAAM,aAAqB,OAAO,EAAE,QAAQ,SAAS,MAAM;AAChE,QAAM,MAAM,YAAY,OAAO,KAAK,OAAO,QAAQ;AACnD,MAAI,WAAW,IAAK,QAAO,EAAE,OAAO,EAAI;AACxC,SAAO;AAAA,IACL,OAAO;AAAA,IACP,QAAQ,qDAAqD,GAAG,cAAc,MAAM;AAAA,EACtF;AACF;AAEO,IAAM,WAAmB,OAAO,EAAE,QAAQ,SAAS,MAAM;AAC9D,QAAM,MAAM,YAAY,OAAO,KAAK,OAAO,QAAQ;AACnD,MAAI,OAAO,SAAS,GAAG,EAAG,QAAO,EAAE,OAAO,EAAI;AAC9C,SAAO;AAAA,IACL,OAAO;AAAA,IACP,QAAQ,+CAA+C,GAAG;AAAA,EAC5D;AACF;AAEO,SAAS,MAAM,SAAyB;AAC7C,SAAO,OAAO,EAAE,OAAO,MAAM;AAC3B,WAAO,EAAE,OAAO,QAAQ,KAAK,MAAM,IAAI,IAAM,EAAI;AAAA,EACnD;AACF;AAEA,SAAS,eAAe,OAA0C;AAChE,MAAI,OAAO,UAAU,YAAY,CAAC,OAAO,SAAS,KAAK,EAAG,QAAO;AACjE,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,KAAK,CAAC;AACvC;AAEA,SAAS,mBACP,UACoB;AACpB,MAAI,CAAC,SAAU,QAAO;AACtB,QAAM,aAAa;AAAA,IACjB,SAAS;AAAA,IACT,SAAS;AAAA,IACT,SAAS;AAAA,EACX;AACA,aAAW,aAAa,YAAY;AAClC,QAAI,OAAO,cAAc,YAAY,UAAU,KAAK,EAAE,SAAS,GAAG;AAChE,aAAO;AAAA,IACT;AAAA,EACF;AACA,SAAO;AACT;AAEO,IAAM,cAAsB,OAAO,EAAE,QAAQ,SAAS,MAAM;AACjE,QAAM,MAAM,YAAY,OAAO,KAAK,OAAO,QAAQ;AACnD,QAAM,SAAS,MAAM,qBAAqB,EAAE,QAAQ,UAAU,IAAI,CAAC;AACnE,QAAM,QAAQ,eAAe,OAAO,KAAK;AACzC,SAAO;AAAA,IACL;AAAA,IACA,QAAQ,mBAAmB,OAAO,QAAQ;AAAA,IAC1C,UAAU,OAAO;AAAA,EACnB;AACF;AAEA,SAAS,UAAU,GAAY,GAAqB;AAClD,MAAI,MAAM,EAAG,QAAO;AACpB,MAAI,KAAK,QAAQ,KAAK,KAAM,QAAO;AACnC,MAAI,OAAO,MAAM,OAAO,EAAG,QAAO;AAElC,MAAI,MAAM,QAAQ,CAAC,GAAG;AACpB,QAAI,CAAC,MAAM,QAAQ,CAAC,KAAK,EAAE,WAAW,EAAE,OAAQ,QAAO;AACvD,WAAO,EAAE,MAAM,CAAC,KAAK,MAAM,UAAU,KAAK,EAAE,CAAC,CAAC,CAAC;AAAA,EACjD;AAEA,MAAI,OAAO,MAAM,UAAU;AACzB,UAAM,QAAQ,OAAO,KAAK,CAA4B,EAAE,KAAK;AAC7D,UAAM,QAAQ,OAAO,KAAK,CAA4B,EAAE,KAAK;AAC7D,QAAI,MAAM,WAAW,MAAM,OAAQ,QAAO;AAC1C,WAAO,MAAM;AAAA,MACX,CAAC,KAAK,MACJ,MAAM,CAAC,MAAM,OACb;AAAA,QACG,EAA8B,GAAG;AAAA,QACjC,EAA8B,GAAG;AAAA,MACpC;AAAA,IACJ;AAAA,EACF;AAEA,SAAO;AACT;AAEO,IAAM,YAAoB,OAAO,EAAE,QAAQ,SAAS,MAAM;AAC/D,MAAI;AACF,UAAM,eAAe,KAAK,MAAM,MAAM;AACtC,UAAM,iBACJ,OAAO,aAAa,WAAW,KAAK,MAAM,QAAQ,IAAI;AACxD,QAAI,UAAU,cAAc,cAAc,EAAG,QAAO,EAAE,OAAO,EAAI;AACjE,WAAO,EAAE,OAAO,GAAK,QAAQ,2CAA2C;AAAA,EAC1E,QAAQ;AACN,WAAO,EAAE,OAAO,GAAK,QAAQ,uBAAuB;AAAA,EACtD;AACF;AAEO,SAAS,WAAW,QAAmC;AAC5D,SAAO,OAAO,EAAE,OAAO,QAAQ,SAAS,MAAM;AAC5C,UAAM,SAAS,MAAM,oBAAoB;AAAA,MACvC,OAAO,OAAO;AAAA,MACd,OAAO,OAAO,UAAU,WAAW,QAAQ,KAAK,UAAU,KAAK;AAAA,MAC/D;AAAA,MACA,UAAU,YAAY,OAAO,SAAY,OAAO,QAAQ;AAAA,IAC1D,CAAC;AACD,WAAO;AAAA,MACL,OAAO,eAAe,OAAO,KAAK;AAAA,MAClC,QAAQ,mBAAmB,OAAO,QAAQ;AAAA,MAC1C,UAAU,OAAO;AAAA,IACnB;AAAA,EACF;AACF;AAEO,SAAS,OAAO,SAA2B;AAChD,SAAO,OAAO,SAAS;AACrB,QAAI,QAAQ,WAAW,EAAG,QAAO,EAAE,OAAO,EAAI;AAC9C,UAAM,UAAU,MAAM,QAAQ,IAAI,QAAQ,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;AAC7D,UAAM,YAAY,QAAQ;AAAA,MAAO,CAAC,KAAK,MACrC,EAAE,QAAQ,IAAI,QAAQ,IAAI;AAAA,IAC5B;AACA,UAAM,UAAU,QACb,OAAO,CAAC,MAAM,EAAE,MAAM,EACtB,IAAI,CAAC,MAAM,EAAE,MAAM,EACnB,KAAK,IAAI;AACZ,WAAO,EAAE,OAAO,UAAU,OAAO,QAAQ,WAAW,OAAU;AAAA,EAChE;AACF;AAEO,SAAS,OAAO,SAA2B;AAChD,SAAO,OAAO,SAAS;AACrB,QAAI,QAAQ,WAAW,EAAG,QAAO,EAAE,OAAO,EAAI;AAC9C,UAAM,UAAU,MAAM,QAAQ,IAAI,QAAQ,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;AAC7D,UAAM,YAAY,QAAQ;AAAA,MAAO,CAAC,KAAK,MACrC,EAAE,QAAQ,IAAI,QAAQ,IAAI;AAAA,IAC5B;AACA,WAAO,EAAE,OAAO,UAAU,OAAO,QAAQ,UAAU,OAAO;AAAA,EAC5D;AACF;AAEO,SAAS,SACd,QACQ;AACR,SAAO,OAAO,SAAS;AACrB,UAAM,UAAU,OAAO,QAAQ,MAAM;AACrC,UAAM,UAAU,MAAM,QAAQ;AAAA,MAC5B,QAAQ,IAAI,OAAO,CAAC,MAAM,EAAE,QAAQ,OAAO,CAAC,OAAO;AAAA,QACjD;AAAA,QACA,QAAQ,MAAM,OAAO,IAAI;AAAA,QACzB;AAAA,MACF,EAAE;AAAA,IACJ;AACA,UAAM,cAAc,QAAQ,OAAO,CAAC,KAAK,MAAM,MAAM,EAAE,QAAQ,CAAC;AAChE,UAAM,gBAAgB,QAAQ;AAAA,MAC5B,CAAC,KAAK,MAAM,MAAM,EAAE,OAAO,QAAQ,EAAE;AAAA,MACrC;AAAA,IACF;AACA,UAAM,QAAQ,cAAc,IAAI,gBAAgB,cAAc;AAC9D,UAAM,UAAU,QACb,IAAI,CAAC,MAAM,GAAG,EAAE,IAAI,KAAK,EAAE,OAAO,MAAM,QAAQ,CAAC,CAAC,OAAO,EAAE,MAAM,GAAG,EACpE,KAAK,IAAI;AACZ,WAAO,EAAE,OAAO,QAAQ,WAAW,OAAU;AAAA,EAC/C;AACF;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
package/dist/store/index.d.ts
CHANGED
|
@@ -94,6 +94,8 @@ export declare class RunStore {
|
|
|
94
94
|
getCases(runId: string): CaseRow[];
|
|
95
95
|
getFailingCases(runId: string, threshold?: number): CaseWithScores[];
|
|
96
96
|
getRunSummary(runId: string, threshold?: number): RunSummary;
|
|
97
|
+
findSuiteByName(name: string): SuiteRow | undefined;
|
|
98
|
+
getLatestCompletedRun(suiteId: string, model?: string): RunRow | undefined;
|
|
97
99
|
listSuites(): SuiteRow[];
|
|
98
100
|
createPrompt(name: string, content: string): PromptRow;
|
|
99
101
|
listPrompts(): PromptRow[];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/store/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAI3C,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,MAAM;IACrB,EAAE,EAAE,MAAM,CAAC;IACX,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,MAAM,EAAE,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC;IAC3C,OAAO,EAAE,UAAU,GAAG,IAAI,CAAC;CAC5B;AAED,MAAM,WAAW,OAAO;IACtB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,OAAO,CAAC;IACf,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,QAAQ,EAAE,OAAO,GAAG,IAAI,CAAC;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;CACtB;AAED,MAAM,WAAW,cAAe,SAAQ,OAAO;IAC7C,MAAM,EAAE,KAAK,CAAC;QAAE,WAAW,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,GAAG,IAAI,CAAA;KAAE,CAAC,CAAC;CAC9E;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;CACvB;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,SAAS;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,OAAO,CAAC;IACf,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,SAAS;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,qBAAa,QAAQ;;gBAyBP,QAAQ,CAAC,EAAE,MAAM,GAAG,YAAY;IAmH5C,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ;IASnC,SAAS,CAAC,GAAG,EAAE;QACb,QAAQ,EAAE,MAAM,CAAC;QACjB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAClC,GAAG,MAAM;IAgBV,SAAS,CACP,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,WAAW,GAAG,QAAQ,EAC9B,OAAO,CAAC,EAAE,UAAU,GACnB,IAAI;IAMP,SAAS,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,IAAI;IAsBlC,UAAU,CAAC,MAAM,EAAE,SAAS,EAAE,GAAG,IAAI;IAWrC,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IA4BzC,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE;IA8BpC,QAAQ,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,EAAE;IA6BlC,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,SAAM,GAAG,cAAc,EAAE;IAmDjE,aAAa,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,SAAM,GAAG,UAAU;IAuDzD,UAAU,IAAI,QAAQ,EAAE;IAWxB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,SAAS;IAetD,WAAW,IAAI,SAAS,EAAE;IAmB1B,SAAS,CAAC,EAAE,EAAE,MAAM,GAAG,SAAS,GAAG,SAAS;IAoB5C,YAAY,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI;CAG/B"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/store/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAI3C,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,MAAM;IACrB,EAAE,EAAE,MAAM,CAAC;IACX,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;IACvC,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,MAAM,EAAE,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC;IAC3C,OAAO,EAAE,UAAU,GAAG,IAAI,CAAC;CAC5B;AAED,MAAM,WAAW,OAAO;IACtB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,OAAO,CAAC;IACf,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,QAAQ,EAAE,OAAO,GAAG,IAAI,CAAC;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;CACtB;AAED,MAAM,WAAW,cAAe,SAAQ,OAAO;IAC7C,MAAM,EAAE,KAAK,CAAC;QAAE,WAAW,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,GAAG,IAAI,CAAA;KAAE,CAAC,CAAC;CAC9E;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;CACvB;AAED,MAAM,WAAW,UAAU;IACzB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,SAAS;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,OAAO,CAAC;IACf,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,SAAS;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,qBAAa,QAAQ;;gBAyBP,QAAQ,CAAC,EAAE,MAAM,GAAG,YAAY;IAmH5C,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ;IASnC,SAAS,CAAC,GAAG,EAAE;QACb,QAAQ,EAAE,MAAM,CAAC;QACjB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAClC,GAAG,MAAM;IAgBV,SAAS,CACP,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,WAAW,GAAG,QAAQ,EAC9B,OAAO,CAAC,EAAE,UAAU,GACnB,IAAI;IAMP,SAAS,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,IAAI;IAsBlC,UAAU,CAAC,MAAM,EAAE,SAAS,EAAE,GAAG,IAAI;IAWrC,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IA4BzC,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE;IA8BpC,QAAQ,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,EAAE;IA6BlC,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,SAAM,GAAG,cAAc,EAAE;IAmDjE,aAAa,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,SAAM,GAAG,UAAU;IAuDzD,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ,GAAG,SAAS;IAOnD,qBAAqB,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAmC1E,UAAU,IAAI,QAAQ,EAAE;IAWxB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,SAAS;IAetD,WAAW,IAAI,SAAS,EAAE;IAmB1B,SAAS,CAAC,EAAE,EAAE,MAAM,GAAG,SAAS,GAAG,SAAS;IAoB5C,YAAY,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI;CAG/B"}
|
package/dist/store/index.js
CHANGED
|
@@ -306,6 +306,28 @@ var RunStore = class {
|
|
|
306
306
|
totalTokensOut: totals.totalTokensOut
|
|
307
307
|
};
|
|
308
308
|
}
|
|
309
|
+
findSuiteByName(name) {
|
|
310
|
+
const row = this.#stmt(
|
|
311
|
+
"SELECT * FROM suites WHERE name = ? ORDER BY created_at DESC LIMIT 1"
|
|
312
|
+
).get(name);
|
|
313
|
+
return row ?? void 0;
|
|
314
|
+
}
|
|
315
|
+
getLatestCompletedRun(suiteId, model) {
|
|
316
|
+
const sql = model ? "SELECT * FROM runs WHERE suite_id = ? AND status = ? AND model = ? ORDER BY started_at DESC LIMIT 1" : "SELECT * FROM runs WHERE suite_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1";
|
|
317
|
+
const row = model ? this.#stmt(sql).get(suiteId, "completed", model) : this.#stmt(sql).get(suiteId, "completed");
|
|
318
|
+
if (!row) return void 0;
|
|
319
|
+
return {
|
|
320
|
+
id: row.id,
|
|
321
|
+
suite_id: row.suite_id,
|
|
322
|
+
name: row.name,
|
|
323
|
+
model: row.model,
|
|
324
|
+
config: row.config ? JSON.parse(row.config) : null,
|
|
325
|
+
started_at: row.started_at,
|
|
326
|
+
finished_at: row.finished_at,
|
|
327
|
+
status: row.status,
|
|
328
|
+
summary: row.summary ? JSON.parse(row.summary) : null
|
|
329
|
+
};
|
|
330
|
+
}
|
|
309
331
|
listSuites() {
|
|
310
332
|
const rows = this.#stmt(
|
|
311
333
|
"SELECT * FROM suites ORDER BY created_at DESC"
|
package/dist/store/index.js.map
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../src/store/index.ts", "../../src/store/ddl.sqlite.sql"],
|
|
4
|
-
"sourcesContent": ["import { mkdirSync } from 'node:fs';\nimport { dirname } from 'node:path';\nimport { DatabaseSync } from 'node:sqlite';\n\nimport DDL from './ddl.sqlite.sql';\n\nexport interface SuiteRow {\n id: string;\n name: string;\n created_at: number;\n}\n\nexport interface RunRow {\n id: string;\n suite_id: string;\n name: string;\n model: string;\n config: Record<string, unknown> | null;\n started_at: number;\n finished_at: number | null;\n status: 'running' | 'completed' | 'failed';\n summary: RunSummary | null;\n}\n\nexport interface CaseRow {\n id: string;\n run_id: string;\n idx: number;\n input: unknown;\n output: string | null;\n expected: unknown | null;\n latency_ms: number;\n tokens_in: number;\n tokens_out: number;\n error: string | null;\n}\n\nexport interface CaseWithScores extends CaseRow {\n scores: Array<{ scorer_name: string; score: number; reason: string | null }>;\n}\n\nexport interface ScoreRow {\n id: string;\n case_id: string;\n scorer_name: string;\n score: number;\n reason: string | null;\n}\n\nexport interface RunSummary {\n totalCases: number;\n passCount: number;\n failCount: number;\n meanScores: Record<string, number>;\n totalLatencyMs: number;\n totalTokensIn: number;\n totalTokensOut: number;\n}\n\nexport interface PromptRow {\n id: string;\n name: string;\n version: number;\n content: string;\n created_at: number;\n}\n\nexport interface CaseData {\n id: string;\n run_id: string;\n idx: number;\n input: unknown;\n output: string | null;\n expected?: unknown;\n latency_ms: number;\n tokens_in: number;\n tokens_out: number;\n error?: string;\n}\n\nexport interface ScoreData {\n id: string;\n case_id: string;\n scorer_name: string;\n score: number;\n reason?: string;\n}\n\nexport class RunStore {\n #db: DatabaseSync;\n #statements = new Map<string, ReturnType<DatabaseSync['prepare']>>();\n\n #stmt(sql: string): ReturnType<DatabaseSync['prepare']> {\n let stmt = this.#statements.get(sql);\n if (!stmt) {\n stmt = this.#db.prepare(sql);\n this.#statements.set(sql, stmt);\n }\n return stmt;\n }\n\n #transaction<T>(fn: () => T): T {\n this.#db.exec('BEGIN TRANSACTION');\n try {\n const result = fn();\n this.#db.exec('COMMIT');\n return result;\n } catch (error) {\n this.#db.exec('ROLLBACK');\n throw error;\n }\n }\n\n constructor(pathOrDb?: string | DatabaseSync) {\n if (pathOrDb instanceof DatabaseSync) {\n this.#db = pathOrDb;\n } else {\n const dbPath = pathOrDb ?? '.evals/store.db';\n mkdirSync(dirname(dbPath), { recursive: true });\n this.#db = new DatabaseSync(dbPath);\n }\n this.#db.exec(DDL);\n this.#migrateRunsTableToSuiteRequired();\n this.#migratePromptsTableIfNeeded();\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_prompts_name_version ON prompts(name, version DESC)',\n );\n }\n\n #migratePromptsTableIfNeeded(): void {\n const columns = this.#stmt('PRAGMA table_info(prompts)').all() as Array<{\n name: string;\n }>;\n\n if (columns.length === 0) return;\n if (columns.some((column) => column.name === 'version')) return;\n\n this.#transaction(() => {\n this.#db.exec('ALTER TABLE prompts RENAME TO prompts_legacy');\n this.#db.exec(`\n CREATE TABLE prompts (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL,\n version INTEGER NOT NULL,\n content TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000),\n UNIQUE(name, version)\n )\n `);\n this.#db.exec(`\n INSERT INTO prompts (id, name, version, content, created_at)\n SELECT id, name, 1, content, created_at\n FROM prompts_legacy\n `);\n this.#db.exec('DROP TABLE prompts_legacy');\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_prompts_created_at ON prompts(created_at)',\n );\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_prompts_name_version ON prompts(name, version DESC)',\n );\n });\n }\n\n #migrateRunsTableToSuiteRequired(): void {\n const runColumns = this.#stmt('PRAGMA table_info(runs)').all() as Array<{\n name: string;\n notnull: number;\n }>;\n\n if (runColumns.length === 0) return;\n\n const suiteColumn = runColumns.find((column) => column.name === 'suite_id');\n const hasNonNullSuite = suiteColumn?.notnull === 1;\n\n const runForeignKeys = this.#stmt(\n 'PRAGMA foreign_key_list(runs)',\n ).all() as Array<{\n from: string;\n on_delete: string;\n table: string;\n }>;\n const suiteForeignKey = runForeignKeys.find(\n (fk) => fk.from === 'suite_id' && fk.table === 'suites',\n );\n const hasCascadeDelete = suiteForeignKey?.on_delete === 'CASCADE';\n\n if (hasNonNullSuite && hasCascadeDelete) return;\n\n this.#statements.clear();\n this.#transaction(() => {\n this.#db.exec(`\n CREATE TABLE runs_next (\n id TEXT PRIMARY KEY,\n suite_id TEXT NOT NULL,\n name TEXT NOT NULL,\n model TEXT NOT NULL,\n config TEXT,\n started_at INTEGER NOT NULL,\n finished_at INTEGER,\n status TEXT NOT NULL DEFAULT 'running' CHECK(status IN ('running', 'completed', 'failed')),\n summary TEXT,\n FOREIGN KEY (suite_id) REFERENCES suites(id) ON DELETE CASCADE\n )\n `);\n\n // Drop legacy orphaned runs that do not belong to a suite.\n this.#db.exec('DELETE FROM runs WHERE suite_id IS NULL');\n\n this.#db.exec(`\n INSERT INTO runs_next (id, suite_id, name, model, config, started_at, finished_at, status, summary)\n SELECT r.id, r.suite_id, r.name, r.model, r.config, r.started_at, r.finished_at, r.status, r.summary\n FROM runs r\n JOIN suites s ON s.id = r.suite_id\n `);\n\n this.#db.exec('DROP TABLE runs');\n this.#db.exec('ALTER TABLE runs_next RENAME TO runs');\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_runs_suite_id ON runs(suite_id)',\n );\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at)',\n );\n });\n this.#statements.clear();\n }\n\n createSuite(name: string): SuiteRow {\n const id = crypto.randomUUID();\n const now = Date.now();\n this.#stmt(\n 'INSERT INTO suites (id, name, created_at) VALUES (?, ?, ?)',\n ).run(id, name, now);\n return { id, name, created_at: now };\n }\n\n createRun(run: {\n suite_id: string;\n name: string;\n model: string;\n config?: Record<string, unknown>;\n }): string {\n const id = crypto.randomUUID();\n const now = Date.now();\n this.#stmt(\n 'INSERT INTO runs (id, suite_id, name, model, config, started_at) VALUES (?, ?, ?, ?, ?, ?)',\n ).run(\n id,\n run.suite_id,\n run.name,\n run.model,\n run.config ? JSON.stringify(run.config) : null,\n now,\n );\n return id;\n }\n\n finishRun(\n runId: string,\n status: 'completed' | 'failed',\n summary?: RunSummary,\n ): void {\n this.#stmt(\n 'UPDATE runs SET finished_at = ?, status = ?, summary = ? WHERE id = ?',\n ).run(Date.now(), status, summary ? JSON.stringify(summary) : null, runId);\n }\n\n saveCases(cases: CaseData[]): void {\n this.#transaction(() => {\n const stmt = this.#stmt(\n 'INSERT INTO cases (id, run_id, idx, input, output, expected, latency_ms, tokens_in, tokens_out, error) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',\n );\n for (const c of cases) {\n stmt.run(\n c.id,\n c.run_id,\n c.idx,\n JSON.stringify(c.input),\n c.output,\n c.expected != null ? JSON.stringify(c.expected) : null,\n c.latency_ms,\n c.tokens_in,\n c.tokens_out,\n c.error ?? null,\n );\n }\n });\n }\n\n saveScores(scores: ScoreData[]): void {\n this.#transaction(() => {\n const stmt = this.#stmt(\n 'INSERT INTO scores (id, case_id, scorer_name, score, reason) VALUES (?, ?, ?, ?, ?)',\n );\n for (const s of scores) {\n stmt.run(s.id, s.case_id, s.scorer_name, s.score, s.reason ?? null);\n }\n });\n }\n\n getRun(runId: string): RunRow | undefined {\n const row = this.#stmt('SELECT * FROM runs WHERE id = ?').get(runId) as\n | {\n id: string;\n suite_id: string;\n name: string;\n model: string;\n config: string | null;\n started_at: number;\n finished_at: number | null;\n status: string;\n summary: string | null;\n }\n | undefined;\n if (!row) return undefined;\n return {\n id: row.id,\n suite_id: row.suite_id,\n name: row.name,\n model: row.model,\n config: row.config ? JSON.parse(row.config) : null,\n started_at: row.started_at,\n finished_at: row.finished_at,\n status: row.status as RunRow['status'],\n summary: row.summary ? JSON.parse(row.summary) : null,\n };\n }\n\n listRuns(suiteId?: string): RunRow[] {\n const sql = suiteId\n ? 'SELECT * FROM runs WHERE suite_id = ? ORDER BY started_at'\n : 'SELECT * FROM runs ORDER BY started_at';\n const rows = (\n suiteId ? this.#stmt(sql).all(suiteId) : this.#stmt(sql).all()\n ) as Array<{\n id: string;\n suite_id: string;\n name: string;\n model: string;\n config: string | null;\n started_at: number;\n finished_at: number | null;\n status: string;\n summary: string | null;\n }>;\n return rows.map((row) => ({\n id: row.id,\n suite_id: row.suite_id,\n name: row.name,\n model: row.model,\n config: row.config ? JSON.parse(row.config) : null,\n started_at: row.started_at,\n finished_at: row.finished_at,\n status: row.status as RunRow['status'],\n summary: row.summary ? JSON.parse(row.summary) : null,\n }));\n }\n\n getCases(runId: string): CaseRow[] {\n const rows = this.#stmt(\n 'SELECT * FROM cases WHERE run_id = ? ORDER BY idx',\n ).all(runId) as Array<{\n id: string;\n run_id: string;\n idx: number;\n input: string;\n output: string | null;\n expected: string | null;\n latency_ms: number;\n tokens_in: number;\n tokens_out: number;\n error: string | null;\n }>;\n return rows.map((row) => ({\n id: row.id,\n run_id: row.run_id,\n idx: row.idx,\n input: JSON.parse(row.input),\n output: row.output,\n expected: row.expected ? JSON.parse(row.expected) : null,\n latency_ms: row.latency_ms,\n tokens_in: row.tokens_in,\n tokens_out: row.tokens_out,\n error: row.error,\n }));\n }\n\n getFailingCases(runId: string, threshold = 0.5): CaseWithScores[] {\n const rows = this.#stmt(\n `SELECT c.*, s.scorer_name, s.score, s.reason as score_reason\n FROM cases c\n JOIN scores s ON s.case_id = c.id\n WHERE c.run_id = ? AND s.score < ?\n ORDER BY c.idx`,\n ).all(runId, threshold) as Array<{\n id: string;\n run_id: string;\n idx: number;\n input: string;\n output: string | null;\n expected: string | null;\n latency_ms: number;\n tokens_in: number;\n tokens_out: number;\n error: string | null;\n scorer_name: string;\n score: number;\n score_reason: string | null;\n }>;\n\n const caseMap = new Map<string, CaseWithScores>();\n for (const row of rows) {\n let c = caseMap.get(row.id);\n if (!c) {\n c = {\n id: row.id,\n run_id: row.run_id,\n idx: row.idx,\n input: JSON.parse(row.input),\n output: row.output,\n expected: row.expected ? JSON.parse(row.expected) : null,\n latency_ms: row.latency_ms,\n tokens_in: row.tokens_in,\n tokens_out: row.tokens_out,\n error: row.error,\n scores: [],\n };\n caseMap.set(row.id, c);\n }\n c.scores.push({\n scorer_name: row.scorer_name,\n score: row.score,\n reason: row.score_reason,\n });\n }\n return Array.from(caseMap.values());\n }\n\n getRunSummary(runId: string, threshold = 0.5): RunSummary {\n const totals = this.#stmt(\n `SELECT\n COUNT(DISTINCT c.id) as totalCases,\n COALESCE(SUM(c.latency_ms), 0) as totalLatencyMs,\n COALESCE(SUM(c.tokens_in), 0) as totalTokensIn,\n COALESCE(SUM(c.tokens_out), 0) as totalTokensOut\n FROM cases c WHERE c.run_id = ?`,\n ).get(runId) as {\n totalCases: number;\n totalLatencyMs: number;\n totalTokensIn: number;\n totalTokensOut: number;\n };\n\n const scorerMeans = this.#stmt(\n `SELECT s.scorer_name, AVG(s.score) as meanScore\n FROM scores s\n JOIN cases c ON c.id = s.case_id\n WHERE c.run_id = ?\n GROUP BY s.scorer_name`,\n ).all(runId) as Array<{ scorer_name: string; meanScore: number }>;\n\n const meanScores: Record<string, number> = {};\n for (const row of scorerMeans) {\n meanScores[row.scorer_name] = row.meanScore;\n }\n\n const passFail = this.#stmt(\n `SELECT c.id,\n MIN(s.score) as minScore\n FROM cases c\n JOIN scores s ON s.case_id = c.id\n WHERE c.run_id = ?\n GROUP BY c.id`,\n ).all(runId) as Array<{ id: string; minScore: number }>;\n\n let passCount = 0;\n let failCount = 0;\n for (const row of passFail) {\n if (row.minScore >= threshold) passCount++;\n else failCount++;\n }\n\n return {\n totalCases: totals.totalCases,\n passCount,\n failCount,\n meanScores,\n totalLatencyMs: totals.totalLatencyMs,\n totalTokensIn: totals.totalTokensIn,\n totalTokensOut: totals.totalTokensOut,\n };\n }\n\n listSuites(): SuiteRow[] {\n const rows = this.#stmt(\n 'SELECT * FROM suites ORDER BY created_at DESC',\n ).all() as Array<{ id: string; name: string; created_at: number }>;\n return rows.map((row) => ({\n id: row.id,\n name: row.name,\n created_at: row.created_at,\n }));\n }\n\n createPrompt(name: string, content: string): PromptRow {\n const id = crypto.randomUUID();\n const now = Date.now();\n\n const latest = this.#stmt(\n 'SELECT MAX(version) as latestVersion FROM prompts WHERE name = ?',\n ).get(name) as { latestVersion: number | null } | undefined;\n const version = (latest?.latestVersion ?? 0) + 1;\n\n this.#stmt(\n 'INSERT INTO prompts (id, name, version, content, created_at) VALUES (?, ?, ?, ?, ?)',\n ).run(id, name, version, content, now);\n return { id, name, version, content, created_at: now };\n }\n\n listPrompts(): PromptRow[] {\n const rows = this.#stmt(\n 'SELECT * FROM prompts ORDER BY name COLLATE NOCASE ASC, version DESC',\n ).all() as Array<{\n id: string;\n name: string;\n version: number;\n content: string;\n created_at: number;\n }>;\n return rows.map((row) => ({\n id: row.id,\n name: row.name,\n version: row.version,\n content: row.content,\n created_at: row.created_at,\n }));\n }\n\n getPrompt(id: string): PromptRow | undefined {\n const row = this.#stmt('SELECT * FROM prompts WHERE id = ?').get(id) as\n | {\n id: string;\n name: string;\n version: number;\n content: string;\n created_at: number;\n }\n | undefined;\n if (!row) return undefined;\n return {\n id: row.id,\n name: row.name,\n version: row.version,\n content: row.content,\n created_at: row.created_at,\n };\n }\n\n deletePrompt(id: string): void {\n this.#stmt('DELETE FROM prompts WHERE id = ?').run(id);\n }\n}\n", "PRAGMA journal_mode = WAL;\nPRAGMA synchronous = NORMAL;\nPRAGMA foreign_keys = ON;\n\nCREATE TABLE IF NOT EXISTS suites (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000)\n);\n\nCREATE TABLE IF NOT EXISTS runs (\n id TEXT PRIMARY KEY,\n suite_id TEXT NOT NULL,\n name TEXT NOT NULL,\n model TEXT NOT NULL,\n config TEXT,\n started_at INTEGER NOT NULL,\n finished_at INTEGER,\n status TEXT NOT NULL DEFAULT 'running' CHECK(status IN ('running', 'completed', 'failed')),\n summary TEXT,\n FOREIGN KEY (suite_id) REFERENCES suites(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_runs_suite_id ON runs(suite_id);\nCREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);\n\nCREATE TABLE IF NOT EXISTS cases (\n id TEXT PRIMARY KEY,\n run_id TEXT NOT NULL,\n idx INTEGER NOT NULL,\n input TEXT NOT NULL,\n output TEXT,\n expected TEXT,\n latency_ms INTEGER,\n tokens_in INTEGER,\n tokens_out INTEGER,\n error TEXT,\n FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_cases_run_id ON cases(run_id);\n\nCREATE TABLE IF NOT EXISTS scores (\n id TEXT PRIMARY KEY,\n case_id TEXT NOT NULL,\n scorer_name TEXT NOT NULL,\n score REAL NOT NULL,\n reason TEXT,\n FOREIGN KEY (case_id) REFERENCES cases(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_scores_case_id ON scores(case_id);\n\nCREATE TABLE IF NOT EXISTS prompts (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL UNIQUE,\n content TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000)\n);\n\nCREATE INDEX IF NOT EXISTS idx_prompts_created_at ON prompts(created_at);\n"],
|
|
5
|
-
"mappings": ";AAAA,SAAS,iBAAiB;AAC1B,SAAS,eAAe;AACxB,SAAS,oBAAoB;;;ACF7B;;;ADwFO,IAAM,WAAN,MAAe;AAAA,EACpB;AAAA,EACA,cAAc,oBAAI,IAAiD;AAAA,EAEnE,MAAM,KAAkD;AACtD,QAAI,OAAO,KAAK,YAAY,IAAI,GAAG;AACnC,QAAI,CAAC,MAAM;AACT,aAAO,KAAK,IAAI,QAAQ,GAAG;AAC3B,WAAK,YAAY,IAAI,KAAK,IAAI;AAAA,IAChC;AACA,WAAO;AAAA,EACT;AAAA,EAEA,aAAgB,IAAgB;AAC9B,SAAK,IAAI,KAAK,mBAAmB;AACjC,QAAI;AACF,YAAM,SAAS,GAAG;AAClB,WAAK,IAAI,KAAK,QAAQ;AACtB,aAAO;AAAA,IACT,SAAS,OAAO;AACd,WAAK,IAAI,KAAK,UAAU;AACxB,YAAM;AAAA,IACR;AAAA,EACF;AAAA,EAEA,YAAY,UAAkC;AAC5C,QAAI,oBAAoB,cAAc;AACpC,WAAK,MAAM;AAAA,IACb,OAAO;AACL,YAAM,SAAS,YAAY;AAC3B,gBAAU,QAAQ,MAAM,GAAG,EAAE,WAAW,KAAK,CAAC;AAC9C,WAAK,MAAM,IAAI,aAAa,MAAM;AAAA,IACpC;AACA,SAAK,IAAI,KAAK,kBAAG;AACjB,SAAK,iCAAiC;AACtC,SAAK,6BAA6B;AAClC,SAAK,IAAI;AAAA,MACP;AAAA,IACF;AAAA,EACF;AAAA,EAEA,+BAAqC;AACnC,UAAM,UAAU,KAAK,MAAM,4BAA4B,EAAE,IAAI;AAI7D,QAAI,QAAQ,WAAW,EAAG;AAC1B,QAAI,QAAQ,KAAK,CAAC,WAAW,OAAO,SAAS,SAAS,EAAG;AAEzD,SAAK,aAAa,MAAM;AACtB,WAAK,IAAI,KAAK,8CAA8C;AAC5D,WAAK,IAAI,KAAK;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,OASb;AACD,WAAK,IAAI,KAAK;AAAA;AAAA;AAAA;AAAA,OAIb;AACD,WAAK,IAAI,KAAK,2BAA2B;AACzC,WAAK,IAAI;AAAA,QACP;AAAA,MACF;AACA,WAAK,IAAI;AAAA,QACP;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,mCAAyC;AACvC,UAAM,aAAa,KAAK,MAAM,yBAAyB,EAAE,IAAI;AAK7D,QAAI,WAAW,WAAW,EAAG;AAE7B,UAAM,cAAc,WAAW,KAAK,CAAC,WAAW,OAAO,SAAS,UAAU;AAC1E,UAAM,kBAAkB,aAAa,YAAY;AAEjD,UAAM,iBAAiB,KAAK;AAAA,MAC1B;AAAA,IACF,EAAE,IAAI;AAKN,UAAM,kBAAkB,eAAe;AAAA,MACrC,CAAC,OAAO,GAAG,SAAS,cAAc,GAAG,UAAU;AAAA,IACjD;AACA,UAAM,mBAAmB,iBAAiB,cAAc;AAExD,QAAI,mBAAmB,iBAAkB;AAEzC,SAAK,YAAY,MAAM;AACvB,SAAK,aAAa,MAAM;AACtB,WAAK,IAAI,KAAK;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,OAab;AAGD,WAAK,IAAI,KAAK,yCAAyC;AAEvD,WAAK,IAAI,KAAK;AAAA;AAAA;AAAA;AAAA;AAAA,OAKb;AAED,WAAK,IAAI,KAAK,iBAAiB;AAC/B,WAAK,IAAI,KAAK,sCAAsC;AACpD,WAAK,IAAI;AAAA,QACP;AAAA,MACF;AACA,WAAK,IAAI;AAAA,QACP;AAAA,MACF;AAAA,IACF,CAAC;AACD,SAAK,YAAY,MAAM;AAAA,EACzB;AAAA,EAEA,YAAY,MAAwB;AAClC,UAAM,KAAK,OAAO,WAAW;AAC7B,UAAM,MAAM,KAAK,IAAI;AACrB,SAAK;AAAA,MACH;AAAA,IACF,EAAE,IAAI,IAAI,MAAM,GAAG;AACnB,WAAO,EAAE,IAAI,MAAM,YAAY,IAAI;AAAA,EACrC;AAAA,EAEA,UAAU,KAKC;AACT,UAAM,KAAK,OAAO,WAAW;AAC7B,UAAM,MAAM,KAAK,IAAI;AACrB,SAAK;AAAA,MACH;AAAA,IACF,EAAE;AAAA,MACA;AAAA,MACA,IAAI;AAAA,MACJ,IAAI;AAAA,MACJ,IAAI;AAAA,MACJ,IAAI,SAAS,KAAK,UAAU,IAAI,MAAM,IAAI;AAAA,MAC1C;AAAA,IACF;AACA,WAAO;AAAA,EACT;AAAA,EAEA,UACE,OACA,QACA,SACM;AACN,SAAK;AAAA,MACH;AAAA,IACF,EAAE,IAAI,KAAK,IAAI,GAAG,QAAQ,UAAU,KAAK,UAAU,OAAO,IAAI,MAAM,KAAK;AAAA,EAC3E;AAAA,EAEA,UAAU,OAAyB;AACjC,SAAK,aAAa,MAAM;AACtB,YAAM,OAAO,KAAK;AAAA,QAChB;AAAA,MACF;AACA,iBAAW,KAAK,OAAO;AACrB,aAAK;AAAA,UACH,EAAE;AAAA,UACF,EAAE;AAAA,UACF,EAAE;AAAA,UACF,KAAK,UAAU,EAAE,KAAK;AAAA,UACtB,EAAE;AAAA,UACF,EAAE,YAAY,OAAO,KAAK,UAAU,EAAE,QAAQ,IAAI;AAAA,UAClD,EAAE;AAAA,UACF,EAAE;AAAA,UACF,EAAE;AAAA,UACF,EAAE,SAAS;AAAA,QACb;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,WAAW,QAA2B;AACpC,SAAK,aAAa,MAAM;AACtB,YAAM,OAAO,KAAK;AAAA,QAChB;AAAA,MACF;AACA,iBAAW,KAAK,QAAQ;AACtB,aAAK,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,aAAa,EAAE,OAAO,EAAE,UAAU,IAAI;AAAA,MACpE;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,OAAO,OAAmC;AACxC,UAAM,MAAM,KAAK,MAAM,iCAAiC,EAAE,IAAI,KAAK;AAanE,QAAI,CAAC,IAAK,QAAO;AACjB,WAAO;AAAA,MACL,IAAI,IAAI;AAAA,MACR,UAAU,IAAI;AAAA,MACd,MAAM,IAAI;AAAA,MACV,OAAO,IAAI;AAAA,MACX,QAAQ,IAAI,SAAS,KAAK,MAAM,IAAI,MAAM,IAAI;AAAA,MAC9C,YAAY,IAAI;AAAA,MAChB,aAAa,IAAI;AAAA,MACjB,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI,UAAU,KAAK,MAAM,IAAI,OAAO,IAAI;AAAA,IACnD;AAAA,EACF;AAAA,EAEA,SAAS,SAA4B;AACnC,UAAM,MAAM,UACR,8DACA;AACJ,UAAM,OACJ,UAAU,KAAK,MAAM,GAAG,EAAE,IAAI,OAAO,IAAI,KAAK,MAAM,GAAG,EAAE,IAAI;AAY/D,WAAO,KAAK,IAAI,CAAC,SAAS;AAAA,MACxB,IAAI,IAAI;AAAA,MACR,UAAU,IAAI;AAAA,MACd,MAAM,IAAI;AAAA,MACV,OAAO,IAAI;AAAA,MACX,QAAQ,IAAI,SAAS,KAAK,MAAM,IAAI,MAAM,IAAI;AAAA,MAC9C,YAAY,IAAI;AAAA,MAChB,aAAa,IAAI;AAAA,MACjB,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI,UAAU,KAAK,MAAM,IAAI,OAAO,IAAI;AAAA,IACnD,EAAE;AAAA,EACJ;AAAA,EAEA,SAAS,OAA0B;AACjC,UAAM,OAAO,KAAK;AAAA,MAChB;AAAA,IACF,EAAE,IAAI,KAAK;AAYX,WAAO,KAAK,IAAI,CAAC,SAAS;AAAA,MACxB,IAAI,IAAI;AAAA,MACR,QAAQ,IAAI;AAAA,MACZ,KAAK,IAAI;AAAA,MACT,OAAO,KAAK,MAAM,IAAI,KAAK;AAAA,MAC3B,QAAQ,IAAI;AAAA,MACZ,UAAU,IAAI,WAAW,KAAK,MAAM,IAAI,QAAQ,IAAI;AAAA,MACpD,YAAY,IAAI;AAAA,MAChB,WAAW,IAAI;AAAA,MACf,YAAY,IAAI;AAAA,MAChB,OAAO,IAAI;AAAA,IACb,EAAE;AAAA,EACJ;AAAA,EAEA,gBAAgB,OAAe,YAAY,KAAuB;AAChE,UAAM,OAAO,KAAK;AAAA,MAChB;AAAA;AAAA;AAAA;AAAA;AAAA,IAKF,EAAE,IAAI,OAAO,SAAS;AAgBtB,UAAM,UAAU,oBAAI,IAA4B;AAChD,eAAW,OAAO,MAAM;AACtB,UAAI,IAAI,QAAQ,IAAI,IAAI,EAAE;AAC1B,UAAI,CAAC,GAAG;AACN,YAAI;AAAA,UACF,IAAI,IAAI;AAAA,UACR,QAAQ,IAAI;AAAA,UACZ,KAAK,IAAI;AAAA,UACT,OAAO,KAAK,MAAM,IAAI,KAAK;AAAA,UAC3B,QAAQ,IAAI;AAAA,UACZ,UAAU,IAAI,WAAW,KAAK,MAAM,IAAI,QAAQ,IAAI;AAAA,UACpD,YAAY,IAAI;AAAA,UAChB,WAAW,IAAI;AAAA,UACf,YAAY,IAAI;AAAA,UAChB,OAAO,IAAI;AAAA,UACX,QAAQ,CAAC;AAAA,QACX;AACA,gBAAQ,IAAI,IAAI,IAAI,CAAC;AAAA,MACvB;AACA,QAAE,OAAO,KAAK;AAAA,QACZ,aAAa,IAAI;AAAA,QACjB,OAAO,IAAI;AAAA,QACX,QAAQ,IAAI;AAAA,MACd,CAAC;AAAA,IACH;AACA,WAAO,MAAM,KAAK,QAAQ,OAAO,CAAC;AAAA,EACpC;AAAA,EAEA,cAAc,OAAe,YAAY,KAAiB;AACxD,UAAM,SAAS,KAAK;AAAA,MAClB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,IAMF,EAAE,IAAI,KAAK;AAOX,UAAM,cAAc,KAAK;AAAA,MACvB;AAAA;AAAA;AAAA;AAAA;AAAA,IAKF,EAAE,IAAI,KAAK;AAEX,UAAM,aAAqC,CAAC;AAC5C,eAAW,OAAO,aAAa;AAC7B,iBAAW,IAAI,WAAW,IAAI,IAAI;AAAA,IACpC;AAEA,UAAM,WAAW,KAAK;AAAA,MACpB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,IAMF,EAAE,IAAI,KAAK;AAEX,QAAI,YAAY;AAChB,QAAI,YAAY;AAChB,eAAW,OAAO,UAAU;AAC1B,UAAI,IAAI,YAAY,UAAW;AAAA,UAC1B;AAAA,IACP;AAEA,WAAO;AAAA,MACL,YAAY,OAAO;AAAA,MACnB;AAAA,MACA;AAAA,MACA;AAAA,MACA,gBAAgB,OAAO;AAAA,MACvB,eAAe,OAAO;AAAA,MACtB,gBAAgB,OAAO;AAAA,IACzB;AAAA,EACF;AAAA,EAEA,aAAyB;AACvB,UAAM,OAAO,KAAK;AAAA,MAChB;AAAA,IACF,EAAE,IAAI;AACN,WAAO,KAAK,IAAI,CAAC,SAAS;AAAA,MACxB,IAAI,IAAI;AAAA,MACR,MAAM,IAAI;AAAA,MACV,YAAY,IAAI;AAAA,IAClB,EAAE;AAAA,EACJ;AAAA,EAEA,aAAa,MAAc,SAA4B;AACrD,UAAM,KAAK,OAAO,WAAW;AAC7B,UAAM,MAAM,KAAK,IAAI;AAErB,UAAM,SAAS,KAAK;AAAA,MAClB;AAAA,IACF,EAAE,IAAI,IAAI;AACV,UAAM,WAAW,QAAQ,iBAAiB,KAAK;AAE/C,SAAK;AAAA,MACH;AAAA,IACF,EAAE,IAAI,IAAI,MAAM,SAAS,SAAS,GAAG;AACrC,WAAO,EAAE,IAAI,MAAM,SAAS,SAAS,YAAY,IAAI;AAAA,EACvD;AAAA,EAEA,cAA2B;AACzB,UAAM,OAAO,KAAK;AAAA,MAChB;AAAA,IACF,EAAE,IAAI;AAON,WAAO,KAAK,IAAI,CAAC,SAAS;AAAA,MACxB,IAAI,IAAI;AAAA,MACR,MAAM,IAAI;AAAA,MACV,SAAS,IAAI;AAAA,MACb,SAAS,IAAI;AAAA,MACb,YAAY,IAAI;AAAA,IAClB,EAAE;AAAA,EACJ;AAAA,EAEA,UAAU,IAAmC;AAC3C,UAAM,MAAM,KAAK,MAAM,oCAAoC,EAAE,IAAI,EAAE;AASnE,QAAI,CAAC,IAAK,QAAO;AACjB,WAAO;AAAA,MACL,IAAI,IAAI;AAAA,MACR,MAAM,IAAI;AAAA,MACV,SAAS,IAAI;AAAA,MACb,SAAS,IAAI;AAAA,MACb,YAAY,IAAI;AAAA,IAClB;AAAA,EACF;AAAA,EAEA,aAAa,IAAkB;AAC7B,SAAK,MAAM,kCAAkC,EAAE,IAAI,EAAE;AAAA,EACvD;AACF;",
|
|
4
|
+
"sourcesContent": ["import { mkdirSync } from 'node:fs';\nimport { dirname } from 'node:path';\nimport { DatabaseSync } from 'node:sqlite';\n\nimport DDL from './ddl.sqlite.sql';\n\nexport interface SuiteRow {\n id: string;\n name: string;\n created_at: number;\n}\n\nexport interface RunRow {\n id: string;\n suite_id: string;\n name: string;\n model: string;\n config: Record<string, unknown> | null;\n started_at: number;\n finished_at: number | null;\n status: 'running' | 'completed' | 'failed';\n summary: RunSummary | null;\n}\n\nexport interface CaseRow {\n id: string;\n run_id: string;\n idx: number;\n input: unknown;\n output: string | null;\n expected: unknown | null;\n latency_ms: number;\n tokens_in: number;\n tokens_out: number;\n error: string | null;\n}\n\nexport interface CaseWithScores extends CaseRow {\n scores: Array<{ scorer_name: string; score: number; reason: string | null }>;\n}\n\nexport interface ScoreRow {\n id: string;\n case_id: string;\n scorer_name: string;\n score: number;\n reason: string | null;\n}\n\nexport interface RunSummary {\n totalCases: number;\n passCount: number;\n failCount: number;\n meanScores: Record<string, number>;\n totalLatencyMs: number;\n totalTokensIn: number;\n totalTokensOut: number;\n}\n\nexport interface PromptRow {\n id: string;\n name: string;\n version: number;\n content: string;\n created_at: number;\n}\n\nexport interface CaseData {\n id: string;\n run_id: string;\n idx: number;\n input: unknown;\n output: string | null;\n expected?: unknown;\n latency_ms: number;\n tokens_in: number;\n tokens_out: number;\n error?: string;\n}\n\nexport interface ScoreData {\n id: string;\n case_id: string;\n scorer_name: string;\n score: number;\n reason?: string;\n}\n\nexport class RunStore {\n #db: DatabaseSync;\n #statements = new Map<string, ReturnType<DatabaseSync['prepare']>>();\n\n #stmt(sql: string): ReturnType<DatabaseSync['prepare']> {\n let stmt = this.#statements.get(sql);\n if (!stmt) {\n stmt = this.#db.prepare(sql);\n this.#statements.set(sql, stmt);\n }\n return stmt;\n }\n\n #transaction<T>(fn: () => T): T {\n this.#db.exec('BEGIN TRANSACTION');\n try {\n const result = fn();\n this.#db.exec('COMMIT');\n return result;\n } catch (error) {\n this.#db.exec('ROLLBACK');\n throw error;\n }\n }\n\n constructor(pathOrDb?: string | DatabaseSync) {\n if (pathOrDb instanceof DatabaseSync) {\n this.#db = pathOrDb;\n } else {\n const dbPath = pathOrDb ?? '.evals/store.db';\n mkdirSync(dirname(dbPath), { recursive: true });\n this.#db = new DatabaseSync(dbPath);\n }\n this.#db.exec(DDL);\n this.#migrateRunsTableToSuiteRequired();\n this.#migratePromptsTableIfNeeded();\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_prompts_name_version ON prompts(name, version DESC)',\n );\n }\n\n #migratePromptsTableIfNeeded(): void {\n const columns = this.#stmt('PRAGMA table_info(prompts)').all() as Array<{\n name: string;\n }>;\n\n if (columns.length === 0) return;\n if (columns.some((column) => column.name === 'version')) return;\n\n this.#transaction(() => {\n this.#db.exec('ALTER TABLE prompts RENAME TO prompts_legacy');\n this.#db.exec(`\n CREATE TABLE prompts (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL,\n version INTEGER NOT NULL,\n content TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000),\n UNIQUE(name, version)\n )\n `);\n this.#db.exec(`\n INSERT INTO prompts (id, name, version, content, created_at)\n SELECT id, name, 1, content, created_at\n FROM prompts_legacy\n `);\n this.#db.exec('DROP TABLE prompts_legacy');\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_prompts_created_at ON prompts(created_at)',\n );\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_prompts_name_version ON prompts(name, version DESC)',\n );\n });\n }\n\n #migrateRunsTableToSuiteRequired(): void {\n const runColumns = this.#stmt('PRAGMA table_info(runs)').all() as Array<{\n name: string;\n notnull: number;\n }>;\n\n if (runColumns.length === 0) return;\n\n const suiteColumn = runColumns.find((column) => column.name === 'suite_id');\n const hasNonNullSuite = suiteColumn?.notnull === 1;\n\n const runForeignKeys = this.#stmt(\n 'PRAGMA foreign_key_list(runs)',\n ).all() as Array<{\n from: string;\n on_delete: string;\n table: string;\n }>;\n const suiteForeignKey = runForeignKeys.find(\n (fk) => fk.from === 'suite_id' && fk.table === 'suites',\n );\n const hasCascadeDelete = suiteForeignKey?.on_delete === 'CASCADE';\n\n if (hasNonNullSuite && hasCascadeDelete) return;\n\n this.#statements.clear();\n this.#transaction(() => {\n this.#db.exec(`\n CREATE TABLE runs_next (\n id TEXT PRIMARY KEY,\n suite_id TEXT NOT NULL,\n name TEXT NOT NULL,\n model TEXT NOT NULL,\n config TEXT,\n started_at INTEGER NOT NULL,\n finished_at INTEGER,\n status TEXT NOT NULL DEFAULT 'running' CHECK(status IN ('running', 'completed', 'failed')),\n summary TEXT,\n FOREIGN KEY (suite_id) REFERENCES suites(id) ON DELETE CASCADE\n )\n `);\n\n // Drop legacy orphaned runs that do not belong to a suite.\n this.#db.exec('DELETE FROM runs WHERE suite_id IS NULL');\n\n this.#db.exec(`\n INSERT INTO runs_next (id, suite_id, name, model, config, started_at, finished_at, status, summary)\n SELECT r.id, r.suite_id, r.name, r.model, r.config, r.started_at, r.finished_at, r.status, r.summary\n FROM runs r\n JOIN suites s ON s.id = r.suite_id\n `);\n\n this.#db.exec('DROP TABLE runs');\n this.#db.exec('ALTER TABLE runs_next RENAME TO runs');\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_runs_suite_id ON runs(suite_id)',\n );\n this.#db.exec(\n 'CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at)',\n );\n });\n this.#statements.clear();\n }\n\n createSuite(name: string): SuiteRow {\n const id = crypto.randomUUID();\n const now = Date.now();\n this.#stmt(\n 'INSERT INTO suites (id, name, created_at) VALUES (?, ?, ?)',\n ).run(id, name, now);\n return { id, name, created_at: now };\n }\n\n createRun(run: {\n suite_id: string;\n name: string;\n model: string;\n config?: Record<string, unknown>;\n }): string {\n const id = crypto.randomUUID();\n const now = Date.now();\n this.#stmt(\n 'INSERT INTO runs (id, suite_id, name, model, config, started_at) VALUES (?, ?, ?, ?, ?, ?)',\n ).run(\n id,\n run.suite_id,\n run.name,\n run.model,\n run.config ? JSON.stringify(run.config) : null,\n now,\n );\n return id;\n }\n\n finishRun(\n runId: string,\n status: 'completed' | 'failed',\n summary?: RunSummary,\n ): void {\n this.#stmt(\n 'UPDATE runs SET finished_at = ?, status = ?, summary = ? WHERE id = ?',\n ).run(Date.now(), status, summary ? JSON.stringify(summary) : null, runId);\n }\n\n saveCases(cases: CaseData[]): void {\n this.#transaction(() => {\n const stmt = this.#stmt(\n 'INSERT INTO cases (id, run_id, idx, input, output, expected, latency_ms, tokens_in, tokens_out, error) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',\n );\n for (const c of cases) {\n stmt.run(\n c.id,\n c.run_id,\n c.idx,\n JSON.stringify(c.input),\n c.output,\n c.expected != null ? JSON.stringify(c.expected) : null,\n c.latency_ms,\n c.tokens_in,\n c.tokens_out,\n c.error ?? null,\n );\n }\n });\n }\n\n saveScores(scores: ScoreData[]): void {\n this.#transaction(() => {\n const stmt = this.#stmt(\n 'INSERT INTO scores (id, case_id, scorer_name, score, reason) VALUES (?, ?, ?, ?, ?)',\n );\n for (const s of scores) {\n stmt.run(s.id, s.case_id, s.scorer_name, s.score, s.reason ?? null);\n }\n });\n }\n\n getRun(runId: string): RunRow | undefined {\n const row = this.#stmt('SELECT * FROM runs WHERE id = ?').get(runId) as\n | {\n id: string;\n suite_id: string;\n name: string;\n model: string;\n config: string | null;\n started_at: number;\n finished_at: number | null;\n status: string;\n summary: string | null;\n }\n | undefined;\n if (!row) return undefined;\n return {\n id: row.id,\n suite_id: row.suite_id,\n name: row.name,\n model: row.model,\n config: row.config ? JSON.parse(row.config) : null,\n started_at: row.started_at,\n finished_at: row.finished_at,\n status: row.status as RunRow['status'],\n summary: row.summary ? JSON.parse(row.summary) : null,\n };\n }\n\n listRuns(suiteId?: string): RunRow[] {\n const sql = suiteId\n ? 'SELECT * FROM runs WHERE suite_id = ? ORDER BY started_at'\n : 'SELECT * FROM runs ORDER BY started_at';\n const rows = (\n suiteId ? this.#stmt(sql).all(suiteId) : this.#stmt(sql).all()\n ) as Array<{\n id: string;\n suite_id: string;\n name: string;\n model: string;\n config: string | null;\n started_at: number;\n finished_at: number | null;\n status: string;\n summary: string | null;\n }>;\n return rows.map((row) => ({\n id: row.id,\n suite_id: row.suite_id,\n name: row.name,\n model: row.model,\n config: row.config ? JSON.parse(row.config) : null,\n started_at: row.started_at,\n finished_at: row.finished_at,\n status: row.status as RunRow['status'],\n summary: row.summary ? JSON.parse(row.summary) : null,\n }));\n }\n\n getCases(runId: string): CaseRow[] {\n const rows = this.#stmt(\n 'SELECT * FROM cases WHERE run_id = ? ORDER BY idx',\n ).all(runId) as Array<{\n id: string;\n run_id: string;\n idx: number;\n input: string;\n output: string | null;\n expected: string | null;\n latency_ms: number;\n tokens_in: number;\n tokens_out: number;\n error: string | null;\n }>;\n return rows.map((row) => ({\n id: row.id,\n run_id: row.run_id,\n idx: row.idx,\n input: JSON.parse(row.input),\n output: row.output,\n expected: row.expected ? JSON.parse(row.expected) : null,\n latency_ms: row.latency_ms,\n tokens_in: row.tokens_in,\n tokens_out: row.tokens_out,\n error: row.error,\n }));\n }\n\n getFailingCases(runId: string, threshold = 0.5): CaseWithScores[] {\n const rows = this.#stmt(\n `SELECT c.*, s.scorer_name, s.score, s.reason as score_reason\n FROM cases c\n JOIN scores s ON s.case_id = c.id\n WHERE c.run_id = ? AND s.score < ?\n ORDER BY c.idx`,\n ).all(runId, threshold) as Array<{\n id: string;\n run_id: string;\n idx: number;\n input: string;\n output: string | null;\n expected: string | null;\n latency_ms: number;\n tokens_in: number;\n tokens_out: number;\n error: string | null;\n scorer_name: string;\n score: number;\n score_reason: string | null;\n }>;\n\n const caseMap = new Map<string, CaseWithScores>();\n for (const row of rows) {\n let c = caseMap.get(row.id);\n if (!c) {\n c = {\n id: row.id,\n run_id: row.run_id,\n idx: row.idx,\n input: JSON.parse(row.input),\n output: row.output,\n expected: row.expected ? JSON.parse(row.expected) : null,\n latency_ms: row.latency_ms,\n tokens_in: row.tokens_in,\n tokens_out: row.tokens_out,\n error: row.error,\n scores: [],\n };\n caseMap.set(row.id, c);\n }\n c.scores.push({\n scorer_name: row.scorer_name,\n score: row.score,\n reason: row.score_reason,\n });\n }\n return Array.from(caseMap.values());\n }\n\n getRunSummary(runId: string, threshold = 0.5): RunSummary {\n const totals = this.#stmt(\n `SELECT\n COUNT(DISTINCT c.id) as totalCases,\n COALESCE(SUM(c.latency_ms), 0) as totalLatencyMs,\n COALESCE(SUM(c.tokens_in), 0) as totalTokensIn,\n COALESCE(SUM(c.tokens_out), 0) as totalTokensOut\n FROM cases c WHERE c.run_id = ?`,\n ).get(runId) as {\n totalCases: number;\n totalLatencyMs: number;\n totalTokensIn: number;\n totalTokensOut: number;\n };\n\n const scorerMeans = this.#stmt(\n `SELECT s.scorer_name, AVG(s.score) as meanScore\n FROM scores s\n JOIN cases c ON c.id = s.case_id\n WHERE c.run_id = ?\n GROUP BY s.scorer_name`,\n ).all(runId) as Array<{ scorer_name: string; meanScore: number }>;\n\n const meanScores: Record<string, number> = {};\n for (const row of scorerMeans) {\n meanScores[row.scorer_name] = row.meanScore;\n }\n\n const passFail = this.#stmt(\n `SELECT c.id,\n MIN(s.score) as minScore\n FROM cases c\n JOIN scores s ON s.case_id = c.id\n WHERE c.run_id = ?\n GROUP BY c.id`,\n ).all(runId) as Array<{ id: string; minScore: number }>;\n\n let passCount = 0;\n let failCount = 0;\n for (const row of passFail) {\n if (row.minScore >= threshold) passCount++;\n else failCount++;\n }\n\n return {\n totalCases: totals.totalCases,\n passCount,\n failCount,\n meanScores,\n totalLatencyMs: totals.totalLatencyMs,\n totalTokensIn: totals.totalTokensIn,\n totalTokensOut: totals.totalTokensOut,\n };\n }\n\n findSuiteByName(name: string): SuiteRow | undefined {\n const row = this.#stmt(\n 'SELECT * FROM suites WHERE name = ? ORDER BY created_at DESC LIMIT 1',\n ).get(name) as { id: string; name: string; created_at: number } | undefined;\n return row ?? undefined;\n }\n\n getLatestCompletedRun(suiteId: string, model?: string): RunRow | undefined {\n const sql = model\n ? 'SELECT * FROM runs WHERE suite_id = ? AND status = ? AND model = ? ORDER BY started_at DESC LIMIT 1'\n : 'SELECT * FROM runs WHERE suite_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1';\n const row = (\n model\n ? this.#stmt(sql).get(suiteId, 'completed', model)\n : this.#stmt(sql).get(suiteId, 'completed')\n ) as\n | {\n id: string;\n suite_id: string;\n name: string;\n model: string;\n config: string | null;\n started_at: number;\n finished_at: number | null;\n status: string;\n summary: string | null;\n }\n | undefined;\n if (!row) return undefined;\n return {\n id: row.id,\n suite_id: row.suite_id,\n name: row.name,\n model: row.model,\n config: row.config ? JSON.parse(row.config) : null,\n started_at: row.started_at,\n finished_at: row.finished_at,\n status: row.status as RunRow['status'],\n summary: row.summary ? JSON.parse(row.summary) : null,\n };\n }\n\n listSuites(): SuiteRow[] {\n const rows = this.#stmt(\n 'SELECT * FROM suites ORDER BY created_at DESC',\n ).all() as Array<{ id: string; name: string; created_at: number }>;\n return rows.map((row) => ({\n id: row.id,\n name: row.name,\n created_at: row.created_at,\n }));\n }\n\n createPrompt(name: string, content: string): PromptRow {\n const id = crypto.randomUUID();\n const now = Date.now();\n\n const latest = this.#stmt(\n 'SELECT MAX(version) as latestVersion FROM prompts WHERE name = ?',\n ).get(name) as { latestVersion: number | null } | undefined;\n const version = (latest?.latestVersion ?? 0) + 1;\n\n this.#stmt(\n 'INSERT INTO prompts (id, name, version, content, created_at) VALUES (?, ?, ?, ?, ?)',\n ).run(id, name, version, content, now);\n return { id, name, version, content, created_at: now };\n }\n\n listPrompts(): PromptRow[] {\n const rows = this.#stmt(\n 'SELECT * FROM prompts ORDER BY name COLLATE NOCASE ASC, version DESC',\n ).all() as Array<{\n id: string;\n name: string;\n version: number;\n content: string;\n created_at: number;\n }>;\n return rows.map((row) => ({\n id: row.id,\n name: row.name,\n version: row.version,\n content: row.content,\n created_at: row.created_at,\n }));\n }\n\n getPrompt(id: string): PromptRow | undefined {\n const row = this.#stmt('SELECT * FROM prompts WHERE id = ?').get(id) as\n | {\n id: string;\n name: string;\n version: number;\n content: string;\n created_at: number;\n }\n | undefined;\n if (!row) return undefined;\n return {\n id: row.id,\n name: row.name,\n version: row.version,\n content: row.content,\n created_at: row.created_at,\n };\n }\n\n deletePrompt(id: string): void {\n this.#stmt('DELETE FROM prompts WHERE id = ?').run(id);\n }\n}\n", "PRAGMA journal_mode = WAL;\nPRAGMA synchronous = NORMAL;\nPRAGMA foreign_keys = ON;\n\nCREATE TABLE IF NOT EXISTS suites (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000)\n);\n\nCREATE TABLE IF NOT EXISTS runs (\n id TEXT PRIMARY KEY,\n suite_id TEXT NOT NULL,\n name TEXT NOT NULL,\n model TEXT NOT NULL,\n config TEXT,\n started_at INTEGER NOT NULL,\n finished_at INTEGER,\n status TEXT NOT NULL DEFAULT 'running' CHECK(status IN ('running', 'completed', 'failed')),\n summary TEXT,\n FOREIGN KEY (suite_id) REFERENCES suites(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_runs_suite_id ON runs(suite_id);\nCREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);\n\nCREATE TABLE IF NOT EXISTS cases (\n id TEXT PRIMARY KEY,\n run_id TEXT NOT NULL,\n idx INTEGER NOT NULL,\n input TEXT NOT NULL,\n output TEXT,\n expected TEXT,\n latency_ms INTEGER,\n tokens_in INTEGER,\n tokens_out INTEGER,\n error TEXT,\n FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_cases_run_id ON cases(run_id);\n\nCREATE TABLE IF NOT EXISTS scores (\n id TEXT PRIMARY KEY,\n case_id TEXT NOT NULL,\n scorer_name TEXT NOT NULL,\n score REAL NOT NULL,\n reason TEXT,\n FOREIGN KEY (case_id) REFERENCES cases(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_scores_case_id ON scores(case_id);\n\nCREATE TABLE IF NOT EXISTS prompts (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL UNIQUE,\n content TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000)\n);\n\nCREATE INDEX IF NOT EXISTS idx_prompts_created_at ON prompts(created_at);\n"],
|
|
5
|
+
"mappings": ";AAAA,SAAS,iBAAiB;AAC1B,SAAS,eAAe;AACxB,SAAS,oBAAoB;;;ACF7B;;;ADwFO,IAAM,WAAN,MAAe;AAAA,EACpB;AAAA,EACA,cAAc,oBAAI,IAAiD;AAAA,EAEnE,MAAM,KAAkD;AACtD,QAAI,OAAO,KAAK,YAAY,IAAI,GAAG;AACnC,QAAI,CAAC,MAAM;AACT,aAAO,KAAK,IAAI,QAAQ,GAAG;AAC3B,WAAK,YAAY,IAAI,KAAK,IAAI;AAAA,IAChC;AACA,WAAO;AAAA,EACT;AAAA,EAEA,aAAgB,IAAgB;AAC9B,SAAK,IAAI,KAAK,mBAAmB;AACjC,QAAI;AACF,YAAM,SAAS,GAAG;AAClB,WAAK,IAAI,KAAK,QAAQ;AACtB,aAAO;AAAA,IACT,SAAS,OAAO;AACd,WAAK,IAAI,KAAK,UAAU;AACxB,YAAM;AAAA,IACR;AAAA,EACF;AAAA,EAEA,YAAY,UAAkC;AAC5C,QAAI,oBAAoB,cAAc;AACpC,WAAK,MAAM;AAAA,IACb,OAAO;AACL,YAAM,SAAS,YAAY;AAC3B,gBAAU,QAAQ,MAAM,GAAG,EAAE,WAAW,KAAK,CAAC;AAC9C,WAAK,MAAM,IAAI,aAAa,MAAM;AAAA,IACpC;AACA,SAAK,IAAI,KAAK,kBAAG;AACjB,SAAK,iCAAiC;AACtC,SAAK,6BAA6B;AAClC,SAAK,IAAI;AAAA,MACP;AAAA,IACF;AAAA,EACF;AAAA,EAEA,+BAAqC;AACnC,UAAM,UAAU,KAAK,MAAM,4BAA4B,EAAE,IAAI;AAI7D,QAAI,QAAQ,WAAW,EAAG;AAC1B,QAAI,QAAQ,KAAK,CAAC,WAAW,OAAO,SAAS,SAAS,EAAG;AAEzD,SAAK,aAAa,MAAM;AACtB,WAAK,IAAI,KAAK,8CAA8C;AAC5D,WAAK,IAAI,KAAK;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,OASb;AACD,WAAK,IAAI,KAAK;AAAA;AAAA;AAAA;AAAA,OAIb;AACD,WAAK,IAAI,KAAK,2BAA2B;AACzC,WAAK,IAAI;AAAA,QACP;AAAA,MACF;AACA,WAAK,IAAI;AAAA,QACP;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,mCAAyC;AACvC,UAAM,aAAa,KAAK,MAAM,yBAAyB,EAAE,IAAI;AAK7D,QAAI,WAAW,WAAW,EAAG;AAE7B,UAAM,cAAc,WAAW,KAAK,CAAC,WAAW,OAAO,SAAS,UAAU;AAC1E,UAAM,kBAAkB,aAAa,YAAY;AAEjD,UAAM,iBAAiB,KAAK;AAAA,MAC1B;AAAA,IACF,EAAE,IAAI;AAKN,UAAM,kBAAkB,eAAe;AAAA,MACrC,CAAC,OAAO,GAAG,SAAS,cAAc,GAAG,UAAU;AAAA,IACjD;AACA,UAAM,mBAAmB,iBAAiB,cAAc;AAExD,QAAI,mBAAmB,iBAAkB;AAEzC,SAAK,YAAY,MAAM;AACvB,SAAK,aAAa,MAAM;AACtB,WAAK,IAAI,KAAK;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,OAab;AAGD,WAAK,IAAI,KAAK,yCAAyC;AAEvD,WAAK,IAAI,KAAK;AAAA;AAAA;AAAA;AAAA;AAAA,OAKb;AAED,WAAK,IAAI,KAAK,iBAAiB;AAC/B,WAAK,IAAI,KAAK,sCAAsC;AACpD,WAAK,IAAI;AAAA,QACP;AAAA,MACF;AACA,WAAK,IAAI;AAAA,QACP;AAAA,MACF;AAAA,IACF,CAAC;AACD,SAAK,YAAY,MAAM;AAAA,EACzB;AAAA,EAEA,YAAY,MAAwB;AAClC,UAAM,KAAK,OAAO,WAAW;AAC7B,UAAM,MAAM,KAAK,IAAI;AACrB,SAAK;AAAA,MACH;AAAA,IACF,EAAE,IAAI,IAAI,MAAM,GAAG;AACnB,WAAO,EAAE,IAAI,MAAM,YAAY,IAAI;AAAA,EACrC;AAAA,EAEA,UAAU,KAKC;AACT,UAAM,KAAK,OAAO,WAAW;AAC7B,UAAM,MAAM,KAAK,IAAI;AACrB,SAAK;AAAA,MACH;AAAA,IACF,EAAE;AAAA,MACA;AAAA,MACA,IAAI;AAAA,MACJ,IAAI;AAAA,MACJ,IAAI;AAAA,MACJ,IAAI,SAAS,KAAK,UAAU,IAAI,MAAM,IAAI;AAAA,MAC1C;AAAA,IACF;AACA,WAAO;AAAA,EACT;AAAA,EAEA,UACE,OACA,QACA,SACM;AACN,SAAK;AAAA,MACH;AAAA,IACF,EAAE,IAAI,KAAK,IAAI,GAAG,QAAQ,UAAU,KAAK,UAAU,OAAO,IAAI,MAAM,KAAK;AAAA,EAC3E;AAAA,EAEA,UAAU,OAAyB;AACjC,SAAK,aAAa,MAAM;AACtB,YAAM,OAAO,KAAK;AAAA,QAChB;AAAA,MACF;AACA,iBAAW,KAAK,OAAO;AACrB,aAAK;AAAA,UACH,EAAE;AAAA,UACF,EAAE;AAAA,UACF,EAAE;AAAA,UACF,KAAK,UAAU,EAAE,KAAK;AAAA,UACtB,EAAE;AAAA,UACF,EAAE,YAAY,OAAO,KAAK,UAAU,EAAE,QAAQ,IAAI;AAAA,UAClD,EAAE;AAAA,UACF,EAAE;AAAA,UACF,EAAE;AAAA,UACF,EAAE,SAAS;AAAA,QACb;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,WAAW,QAA2B;AACpC,SAAK,aAAa,MAAM;AACtB,YAAM,OAAO,KAAK;AAAA,QAChB;AAAA,MACF;AACA,iBAAW,KAAK,QAAQ;AACtB,aAAK,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,aAAa,EAAE,OAAO,EAAE,UAAU,IAAI;AAAA,MACpE;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEA,OAAO,OAAmC;AACxC,UAAM,MAAM,KAAK,MAAM,iCAAiC,EAAE,IAAI,KAAK;AAanE,QAAI,CAAC,IAAK,QAAO;AACjB,WAAO;AAAA,MACL,IAAI,IAAI;AAAA,MACR,UAAU,IAAI;AAAA,MACd,MAAM,IAAI;AAAA,MACV,OAAO,IAAI;AAAA,MACX,QAAQ,IAAI,SAAS,KAAK,MAAM,IAAI,MAAM,IAAI;AAAA,MAC9C,YAAY,IAAI;AAAA,MAChB,aAAa,IAAI;AAAA,MACjB,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI,UAAU,KAAK,MAAM,IAAI,OAAO,IAAI;AAAA,IACnD;AAAA,EACF;AAAA,EAEA,SAAS,SAA4B;AACnC,UAAM,MAAM,UACR,8DACA;AACJ,UAAM,OACJ,UAAU,KAAK,MAAM,GAAG,EAAE,IAAI,OAAO,IAAI,KAAK,MAAM,GAAG,EAAE,IAAI;AAY/D,WAAO,KAAK,IAAI,CAAC,SAAS;AAAA,MACxB,IAAI,IAAI;AAAA,MACR,UAAU,IAAI;AAAA,MACd,MAAM,IAAI;AAAA,MACV,OAAO,IAAI;AAAA,MACX,QAAQ,IAAI,SAAS,KAAK,MAAM,IAAI,MAAM,IAAI;AAAA,MAC9C,YAAY,IAAI;AAAA,MAChB,aAAa,IAAI;AAAA,MACjB,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI,UAAU,KAAK,MAAM,IAAI,OAAO,IAAI;AAAA,IACnD,EAAE;AAAA,EACJ;AAAA,EAEA,SAAS,OAA0B;AACjC,UAAM,OAAO,KAAK;AAAA,MAChB;AAAA,IACF,EAAE,IAAI,KAAK;AAYX,WAAO,KAAK,IAAI,CAAC,SAAS;AAAA,MACxB,IAAI,IAAI;AAAA,MACR,QAAQ,IAAI;AAAA,MACZ,KAAK,IAAI;AAAA,MACT,OAAO,KAAK,MAAM,IAAI,KAAK;AAAA,MAC3B,QAAQ,IAAI;AAAA,MACZ,UAAU,IAAI,WAAW,KAAK,MAAM,IAAI,QAAQ,IAAI;AAAA,MACpD,YAAY,IAAI;AAAA,MAChB,WAAW,IAAI;AAAA,MACf,YAAY,IAAI;AAAA,MAChB,OAAO,IAAI;AAAA,IACb,EAAE;AAAA,EACJ;AAAA,EAEA,gBAAgB,OAAe,YAAY,KAAuB;AAChE,UAAM,OAAO,KAAK;AAAA,MAChB;AAAA;AAAA;AAAA;AAAA;AAAA,IAKF,EAAE,IAAI,OAAO,SAAS;AAgBtB,UAAM,UAAU,oBAAI,IAA4B;AAChD,eAAW,OAAO,MAAM;AACtB,UAAI,IAAI,QAAQ,IAAI,IAAI,EAAE;AAC1B,UAAI,CAAC,GAAG;AACN,YAAI;AAAA,UACF,IAAI,IAAI;AAAA,UACR,QAAQ,IAAI;AAAA,UACZ,KAAK,IAAI;AAAA,UACT,OAAO,KAAK,MAAM,IAAI,KAAK;AAAA,UAC3B,QAAQ,IAAI;AAAA,UACZ,UAAU,IAAI,WAAW,KAAK,MAAM,IAAI,QAAQ,IAAI;AAAA,UACpD,YAAY,IAAI;AAAA,UAChB,WAAW,IAAI;AAAA,UACf,YAAY,IAAI;AAAA,UAChB,OAAO,IAAI;AAAA,UACX,QAAQ,CAAC;AAAA,QACX;AACA,gBAAQ,IAAI,IAAI,IAAI,CAAC;AAAA,MACvB;AACA,QAAE,OAAO,KAAK;AAAA,QACZ,aAAa,IAAI;AAAA,QACjB,OAAO,IAAI;AAAA,QACX,QAAQ,IAAI;AAAA,MACd,CAAC;AAAA,IACH;AACA,WAAO,MAAM,KAAK,QAAQ,OAAO,CAAC;AAAA,EACpC;AAAA,EAEA,cAAc,OAAe,YAAY,KAAiB;AACxD,UAAM,SAAS,KAAK;AAAA,MAClB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,IAMF,EAAE,IAAI,KAAK;AAOX,UAAM,cAAc,KAAK;AAAA,MACvB;AAAA;AAAA;AAAA;AAAA;AAAA,IAKF,EAAE,IAAI,KAAK;AAEX,UAAM,aAAqC,CAAC;AAC5C,eAAW,OAAO,aAAa;AAC7B,iBAAW,IAAI,WAAW,IAAI,IAAI;AAAA,IACpC;AAEA,UAAM,WAAW,KAAK;AAAA,MACpB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,IAMF,EAAE,IAAI,KAAK;AAEX,QAAI,YAAY;AAChB,QAAI,YAAY;AAChB,eAAW,OAAO,UAAU;AAC1B,UAAI,IAAI,YAAY,UAAW;AAAA,UAC1B;AAAA,IACP;AAEA,WAAO;AAAA,MACL,YAAY,OAAO;AAAA,MACnB;AAAA,MACA;AAAA,MACA;AAAA,MACA,gBAAgB,OAAO;AAAA,MACvB,eAAe,OAAO;AAAA,MACtB,gBAAgB,OAAO;AAAA,IACzB;AAAA,EACF;AAAA,EAEA,gBAAgB,MAAoC;AAClD,UAAM,MAAM,KAAK;AAAA,MACf;AAAA,IACF,EAAE,IAAI,IAAI;AACV,WAAO,OAAO;AAAA,EAChB;AAAA,EAEA,sBAAsB,SAAiB,OAAoC;AACzE,UAAM,MAAM,QACR,wGACA;AACJ,UAAM,MACJ,QACI,KAAK,MAAM,GAAG,EAAE,IAAI,SAAS,aAAa,KAAK,IAC/C,KAAK,MAAM,GAAG,EAAE,IAAI,SAAS,WAAW;AAc9C,QAAI,CAAC,IAAK,QAAO;AACjB,WAAO;AAAA,MACL,IAAI,IAAI;AAAA,MACR,UAAU,IAAI;AAAA,MACd,MAAM,IAAI;AAAA,MACV,OAAO,IAAI;AAAA,MACX,QAAQ,IAAI,SAAS,KAAK,MAAM,IAAI,MAAM,IAAI;AAAA,MAC9C,YAAY,IAAI;AAAA,MAChB,aAAa,IAAI;AAAA,MACjB,QAAQ,IAAI;AAAA,MACZ,SAAS,IAAI,UAAU,KAAK,MAAM,IAAI,OAAO,IAAI;AAAA,IACnD;AAAA,EACF;AAAA,EAEA,aAAyB;AACvB,UAAM,OAAO,KAAK;AAAA,MAChB;AAAA,IACF,EAAE,IAAI;AACN,WAAO,KAAK,IAAI,CAAC,SAAS;AAAA,MACxB,IAAI,IAAI;AAAA,MACR,MAAM,IAAI;AAAA,MACV,YAAY,IAAI;AAAA,IAClB,EAAE;AAAA,EACJ;AAAA,EAEA,aAAa,MAAc,SAA4B;AACrD,UAAM,KAAK,OAAO,WAAW;AAC7B,UAAM,MAAM,KAAK,IAAI;AAErB,UAAM,SAAS,KAAK;AAAA,MAClB;AAAA,IACF,EAAE,IAAI,IAAI;AACV,UAAM,WAAW,QAAQ,iBAAiB,KAAK;AAE/C,SAAK;AAAA,MACH;AAAA,IACF,EAAE,IAAI,IAAI,MAAM,SAAS,SAAS,GAAG;AACrC,WAAO,EAAE,IAAI,MAAM,SAAS,SAAS,YAAY,IAAI;AAAA,EACvD;AAAA,EAEA,cAA2B;AACzB,UAAM,OAAO,KAAK;AAAA,MAChB;AAAA,IACF,EAAE,IAAI;AAON,WAAO,KAAK,IAAI,CAAC,SAAS;AAAA,MACxB,IAAI,IAAI;AAAA,MACR,MAAM,IAAI;AAAA,MACV,SAAS,IAAI;AAAA,MACb,SAAS,IAAI;AAAA,MACb,YAAY,IAAI;AAAA,IAClB,EAAE;AAAA,EACJ;AAAA,EAEA,UAAU,IAAmC;AAC3C,UAAM,MAAM,KAAK,MAAM,oCAAoC,EAAE,IAAI,EAAE;AASnE,QAAI,CAAC,IAAK,QAAO;AACjB,WAAO;AAAA,MACL,IAAI,IAAI;AAAA,MACR,MAAM,IAAI;AAAA,MACV,SAAS,IAAI;AAAA,MACb,SAAS,IAAI;AAAA,MACb,YAAY,IAAI;AAAA,IAClB;AAAA,EACF;AAAA,EAEA,aAAa,IAAkB;AAC7B,SAAK,MAAM,kCAAkC,EAAE,IAAI,EAAE;AAAA,EACvD;AACF;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@deepagents/evals",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.22.0",
|
|
4
4
|
"license": "MIT",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -78,10 +78,11 @@
|
|
|
78
78
|
"@hono/vite-build": "^1.9.3",
|
|
79
79
|
"@hono/vite-dev-server": "^0.25.0",
|
|
80
80
|
"ai": "^6.0.90",
|
|
81
|
+
"autoevals": "0.0.132",
|
|
81
82
|
"chalk": "^5.6.0",
|
|
82
83
|
"zod": "^3.25.76 || ^4.0.0",
|
|
83
84
|
"hono": "4.10.6",
|
|
84
|
-
"@deepagents/agent": "0.
|
|
85
|
+
"@deepagents/agent": "0.22.0",
|
|
85
86
|
"@hono/node-server": "1.19.6",
|
|
86
87
|
"@ai-sdk/anthropic": "^3.0.44",
|
|
87
88
|
"@ai-sdk/google": "^3.0.29",
|