@goreal-ai/echo-pdk 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/ai-judge/index.d.ts +11 -21
- package/dist/ai-judge/index.d.ts.map +1 -1
- package/dist/ai-judge/index.js +36 -90
- package/dist/ai-judge/index.js.map +1 -1
- package/dist/embeddings/cosine.d.ts +15 -0
- package/dist/embeddings/cosine.d.ts.map +1 -0
- package/dist/embeddings/cosine.js +37 -0
- package/dist/embeddings/cosine.js.map +1 -0
- package/dist/embeddings/index.d.ts +9 -0
- package/dist/embeddings/index.d.ts.map +1 -0
- package/dist/embeddings/index.js +11 -0
- package/dist/embeddings/index.js.map +1 -0
- package/dist/embeddings/openai.d.ts +11 -0
- package/dist/embeddings/openai.d.ts.map +1 -0
- package/dist/embeddings/openai.js +38 -0
- package/dist/embeddings/openai.js.map +1 -0
- package/dist/embeddings/registry.d.ts +13 -0
- package/dist/embeddings/registry.d.ts.map +1 -0
- package/dist/embeddings/registry.js +29 -0
- package/dist/embeddings/registry.js.map +1 -0
- package/dist/embeddings/types.d.ts +35 -0
- package/dist/embeddings/types.d.ts.map +1 -0
- package/dist/embeddings/types.js +8 -0
- package/dist/embeddings/types.js.map +1 -0
- package/dist/embeddings/voyage.d.ts +12 -0
- package/dist/embeddings/voyage.d.ts.map +1 -0
- package/dist/embeddings/voyage.js +39 -0
- package/dist/embeddings/voyage.js.map +1 -0
- package/dist/eval/assertions.d.ts +35 -0
- package/dist/eval/assertions.d.ts.map +1 -0
- package/dist/eval/assertions.js +349 -0
- package/dist/eval/assertions.js.map +1 -0
- package/dist/eval/dataset.d.ts +42 -0
- package/dist/eval/dataset.d.ts.map +1 -0
- package/dist/eval/dataset.js +101 -0
- package/dist/eval/dataset.js.map +1 -0
- package/dist/eval/index.d.ts +14 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +17 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/loader.d.ts +30 -0
- package/dist/eval/loader.d.ts.map +1 -0
- package/dist/eval/loader.js +170 -0
- package/dist/eval/loader.js.map +1 -0
- package/dist/eval/reporter.d.ts +26 -0
- package/dist/eval/reporter.d.ts.map +1 -0
- package/dist/eval/reporter.js +164 -0
- package/dist/eval/reporter.js.map +1 -0
- package/dist/eval/runner.d.ts +28 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +232 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/eval/types.d.ts +257 -0
- package/dist/eval/types.d.ts.map +1 -0
- package/dist/eval/types.js +11 -0
- package/dist/eval/types.js.map +1 -0
- package/dist/evaluator/evaluator.d.ts +2 -2
- package/dist/evaluator/evaluator.js +5 -5
- package/dist/evaluator/evaluator.js.map +1 -1
- package/dist/evaluator/index.d.ts +1 -1
- package/dist/evaluator/index.d.ts.map +1 -1
- package/dist/evaluator/index.js +1 -1
- package/dist/evaluator/index.js.map +1 -1
- package/dist/evaluator/operators.d.ts +9 -5
- package/dist/evaluator/operators.d.ts.map +1 -1
- package/dist/evaluator/operators.js +26 -33
- package/dist/evaluator/operators.js.map +1 -1
- package/dist/index.d.ts +8 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +38 -20
- package/dist/index.js.map +1 -1
- package/dist/parser/ast.js +1 -1
- package/dist/parser/ast.js.map +1 -1
- package/dist/parser/lexer.d.ts +1 -1
- package/dist/parser/lexer.js +1 -1
- package/dist/project/index.d.ts.map +1 -1
- package/dist/project/index.js +10 -3
- package/dist/project/index.js.map +1 -1
- package/dist/project/types.d.ts +19 -4
- package/dist/project/types.d.ts.map +1 -1
- package/dist/project/types.js +3 -0
- package/dist/project/types.js.map +1 -1
- package/dist/providers/anthropic.d.ts +18 -0
- package/dist/providers/anthropic.d.ts.map +1 -0
- package/dist/providers/anthropic.js +123 -0
- package/dist/providers/anthropic.js.map +1 -0
- package/dist/providers/base.d.ts +45 -0
- package/dist/providers/base.d.ts.map +1 -0
- package/dist/providers/base.js +107 -0
- package/dist/providers/base.js.map +1 -0
- package/dist/providers/index.d.ts +14 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +16 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/providers/openai.d.ts +18 -0
- package/dist/providers/openai.d.ts.map +1 -0
- package/dist/providers/openai.js +106 -0
- package/dist/providers/openai.js.map +1 -0
- package/dist/providers/registry.d.ts +80 -0
- package/dist/providers/registry.d.ts.map +1 -0
- package/dist/providers/registry.js +118 -0
- package/dist/providers/registry.js.map +1 -0
- package/dist/providers/run-prompt.d.ts +69 -0
- package/dist/providers/run-prompt.d.ts.map +1 -0
- package/dist/providers/run-prompt.js +79 -0
- package/dist/providers/run-prompt.js.map +1 -0
- package/dist/providers/types.d.ts +123 -0
- package/dist/providers/types.d.ts.map +1 -0
- package/dist/providers/types.js +9 -0
- package/dist/providers/types.js.map +1 -0
- package/dist/types.d.ts +5 -5
- package/dist/types.d.ts.map +1 -1
- package/package.json +10 -7
- package/scripts/bundle-for-graaljs.mjs +45 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Loader for .eval and .dset YAML files
|
|
3
|
+
*
|
|
4
|
+
* Parses and validates eval suite definitions and dataset files.
|
|
5
|
+
*/
|
|
6
|
+
import { readFile } from 'fs/promises';
|
|
7
|
+
import { parse as parseYaml } from 'yaml';
|
|
8
|
+
// =============================================================================
|
|
9
|
+
// EVAL FILE LOADER
|
|
10
|
+
// =============================================================================
|
|
11
|
+
/**
 * Read an .eval file from disk and hand its text to parseEvalContent.
 *
 * @param {string} filePath - Path of the .eval file; also passed on as the
 *   source label used in error messages.
 * @returns {Promise<object>} The value produced by parseEvalContent.
 */
export async function loadEvalFile(filePath) {
    const yamlText = await readFile(filePath, 'utf-8');
    const suite = parseEvalContent(yamlText, filePath);
    return suite;
}
|
|
18
|
+
/**
 * Turn the YAML text of an .eval file into a validated suite object.
 *
 * @param {string} content - Raw YAML text.
 * @param {string} [source] - Origin label (typically the file path) attached
 *   to any EvalLoadError raised here.
 * @returns {{suite: string, config: object, tests: object[]}} Suite name,
 *   parsed config and parsed tests.
 * @throws {EvalLoadError} If the document is not an object, `suite` is not a
 *   string, or `tests` is not a non-empty array. Errors raised while parsing
 *   individual tests propagate unchanged.
 */
export function parseEvalContent(content, source) {
    const doc = parseYaml(content);
    const isObject = Boolean(doc) && typeof doc === 'object';
    if (!isObject) {
        throw new EvalLoadError('Invalid .eval file: expected YAML object', source);
    }

    // Required top-level fields.
    if (typeof doc.suite !== 'string') {
        throw new EvalLoadError('Missing required field: suite', source);
    }
    const hasTests = Array.isArray(doc.tests) && doc.tests.length > 0;
    if (!hasTests) {
        throw new EvalLoadError('Missing or empty required field: tests', source);
    }

    // Config is parsed before the tests, matching the validation order above.
    const config = parseConfig(doc.config, source);
    const tests = doc.tests.map((entry, position) => parseTest(entry, position, source));

    return { suite: doc.suite, config, tests };
}
|
|
43
|
+
/**
 * Normalise the optional `config` section of an .eval file.
 *
 * @param {*} raw - Value found under `config`; anything that is not a
 *   non-null object is treated as "no config".
 * @param {string} [_source] - Unused; kept for signature parity with the
 *   other parse helpers.
 * @returns {{target: string, model?: string, timeout?: number}} Config with
 *   `target` defaulting to 'prompt.pdk'. When a config object is present,
 *   `model` and `timeout` are always set, to `undefined` if of the wrong type.
 */
function parseConfig(raw, _source) {
    const fallbackTarget = 'prompt.pdk';
    const hasSettings = Boolean(raw) && typeof raw === 'object';
    if (!hasSettings) {
        return { target: fallbackTarget };
    }

    const settings = { target: fallbackTarget, model: undefined, timeout: undefined };
    if (typeof raw.target === 'string') {
        settings.target = raw.target;
    }
    if (typeof raw.model === 'string') {
        settings.model = raw.model;
    }
    if (typeof raw.timeout === 'number') {
        settings.timeout = raw.timeout;
    }
    return settings;
}
|
|
53
|
+
/**
 * Validate one raw entry of the `tests` list and copy its recognised fields.
 *
 * @param {*} raw - The YAML value for this test entry.
 * @param {number} index - Position of the entry in `tests`, used in errors.
 * @param {string} [source] - Origin label attached to any EvalLoadError.
 * @returns {object} Test with `name` plus whichever of `given`, `dataset`,
 *   `params`, `expect_render`, `expect_llm` were present with the right type.
 * @throws {EvalLoadError} If the entry has no string `name` (including when
 *   the entry itself is null/undefined) or defines neither `expect_render`
 *   nor `expect_llm` as an array.
 */
function parseTest(raw, index, source) {
    // Optional chaining: a null list entry must surface as an EvalLoadError
    // carrying the source label, not as a bare TypeError on `raw.name`.
    if (typeof raw?.name !== 'string') {
        throw new EvalLoadError(`Test at index ${index} missing required field: name`, source);
    }
    const test = {
        name: raw.name,
    };
    if (raw.given && typeof raw.given === 'object') {
        test.given = raw.given;
    }
    if (typeof raw.dataset === 'string') {
        test.dataset = raw.dataset;
    }
    if (typeof raw.params === 'string') {
        test.params = raw.params;
    }
    if (Array.isArray(raw.expect_render)) {
        test.expect_render = raw.expect_render.map((a, i) => parseAssertion(a, i, raw.name, source));
    }
    if (Array.isArray(raw.expect_llm)) {
        test.expect_llm = raw.expect_llm.map((a, i) => parseAssertion(a, i, raw.name, source));
    }
    if (!test.expect_render && !test.expect_llm) {
        throw new EvalLoadError(`Test "${test.name}" has no assertions (need expect_render or expect_llm)`, source);
    }
    return test;
}
|
|
80
|
+
// Operators accepted in expect_render / expect_llm lists. Built once at
// module load instead of on every parseAssertion call.
const KNOWN_ASSERTION_OPERATORS = new Set([
    'contains', 'not_contains', 'equals', 'matches',
    'starts_with', 'ends_with', 'length', 'word_count',
    'json_valid', 'json_schema', 'llm_judge', 'similar_to',
    'sentiment', 'latency', 'token_count', 'cost',
]);

/**
 * Parse a single assertion from YAML.
 * Each assertion is a single-key object: the key is the operator, the value
 * its argument. Only the first key is read; any further keys are ignored.
 *
 * @param {*} raw - The YAML value for this assertion.
 * @param {number} index - Position within the assertion list, used in errors.
 * @param {string} testName - Name of the owning test, used in errors.
 * @param {string} [source] - Origin label attached to any EvalLoadError.
 * @returns {object} A fresh `{ [operator]: value }` object; the value is not
 *   type-checked here.
 * @throws {EvalLoadError} If `raw` is not a plain mapping, has no keys, or
 *   its first key is not a known operator.
 */
function parseAssertion(raw, index, testName, source) {
    // Guard before Object.keys: null would throw a TypeError, and strings or
    // arrays would expose their indices ("0", "1", ...) as bogus operators.
    if (raw === null || typeof raw !== 'object' || Array.isArray(raw)) {
        throw new EvalLoadError(`Invalid assertion at index ${index} in test "${testName}": expected a single-key object`, source);
    }
    const keys = Object.keys(raw);
    if (keys.length === 0) {
        throw new EvalLoadError(`Empty assertion at index ${index} in test "${testName}"`, source);
    }
    // The first key is the operator
    const operator = keys[0];
    const value = raw[operator];
    if (!KNOWN_ASSERTION_OPERATORS.has(operator)) {
        throw new EvalLoadError(`Unknown assertion operator "${operator}" in test "${testName}"`, source);
    }
    // Return as-is — the runner will handle type checking per operator
    return { [operator]: value };
}
|
|
105
|
+
// =============================================================================
|
|
106
|
+
// DATASET FILE LOADER
|
|
107
|
+
// =============================================================================
|
|
108
|
+
/**
 * Read a .dset file from disk and hand its text to parseDatasetContent.
 *
 * @param {string} filePath - Path of the .dset file; also passed on as the
 *   source label used in error messages.
 * @returns {Promise<object>} The value produced by parseDatasetContent.
 */
export async function loadDatasetFile(filePath) {
    const yamlText = await readFile(filePath, 'utf-8');
    const dataset = parseDatasetContent(yamlText, filePath);
    return dataset;
}
|
|
115
|
+
/**
 * Parse .dset YAML content into a dataset object.
 *
 * @param {string} content - Raw YAML text.
 * @param {string} [source] - Origin label (typically the file path) attached
 *   to any EvalLoadError raised here.
 * @returns {{name: string, description: (string|undefined), golden: (object|undefined), parameters: object[]}}
 *   The dataset. `golden` is only set when the document has a `golden`
 *   object; its `response` defaults to '' and its other fields to undefined
 *   when missing or of the wrong type. Parameter sets are returned as-is.
 * @throws {EvalLoadError} If the document is not an object, `name` is not a
 *   string, `parameters` is not a non-empty array, or any parameter set
 *   (including a null entry) lacks a string `name`.
 */
export function parseDatasetContent(content, source) {
    const raw = parseYaml(content);
    if (!raw || typeof raw !== 'object') {
        throw new EvalLoadError('Invalid .dset file: expected YAML object', source);
    }
    if (typeof raw.name !== 'string') {
        throw new EvalLoadError('Missing required field: name', source);
    }
    if (!Array.isArray(raw.parameters) || raw.parameters.length === 0) {
        throw new EvalLoadError('Missing or empty required field: parameters', source);
    }
    // Parse golden
    let golden;
    if (raw.golden && typeof raw.golden === 'object') {
        const g = raw.golden;
        golden = {
            response: typeof g.response === 'string' ? g.response : '',
            model: typeof g.model === 'string' ? g.model : undefined,
            recorded_at: typeof g.recorded_at === 'string' ? g.recorded_at : undefined,
            metadata: g.metadata && typeof g.metadata === 'object'
                ? g.metadata
                : undefined,
        };
    }
    // Parse parameters
    const parameters = raw.parameters.map((p, i) => {
        // Optional chaining: a null list entry must be reported as an
        // EvalLoadError, not crash with a TypeError on `p.name`.
        if (typeof p?.name !== 'string') {
            throw new EvalLoadError(`Parameter set at index ${i} missing required field: name`, source);
        }
        return p;
    });
    return {
        name: raw.name,
        description: typeof raw.description === 'string' ? raw.description : undefined,
        golden,
        parameters,
    };
}
|
|
156
|
+
// =============================================================================
|
|
157
|
+
// ERROR TYPE
|
|
158
|
+
// =============================================================================
|
|
159
|
+
/**
 * Error raised when an eval or dataset file cannot be loaded or validated.
 *
 * When a source label is given, it is appended to the message as
 * " (in <source>)" and also kept on the `source` property.
 */
export class EvalLoadError extends Error {
    source;
    /**
     * @param {string} message - Description of the problem.
     * @param {string} [source] - Origin label, typically a file path.
     */
    constructor(message, source) {
        let text = message;
        if (source) {
            text = `${message} (in ${source})`;
        }
        super(text);
        this.source = source;
        this.name = 'EvalLoadError';
    }
}
|
|
170
|
+
//# sourceMappingURL=loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/eval/loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAW1C,gFAAgF;AAChF,mBAAmB;AACnB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB;IACjD,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAClD,OAAO,gBAAgB,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;AAC7C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAAe,EAAE,MAAe;IAC/D,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,CAA4B,CAAC;IAE1D,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,0CAA0C,EAAE,MAAM,CAAC,CAAC;IAC9E,CAAC;IAED,2BAA2B;IAC3B,IAAI,OAAO,GAAG,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;QAClC,MAAM,IAAI,aAAa,CAAC,+BAA+B,EAAE,MAAM,CAAC,CAAC;IACnE,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,aAAa,CAAC,wCAAwC,EAAE,MAAM,CAAC,CAAC;IAC5E,CAAC;IAED,eAAe;IACf,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,CAAC,MAA6C,EAAE,MAAM,CAAC,CAAC;IAEtF,cAAc;IACd,MAAM,KAAK,GAAI,GAAG,CAAC,KAAmC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAClE,SAAS,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,CACxB,CAAC;IAEF,OAAO;QACL,KAAK,EAAE,GAAG,CAAC,KAAe;QAC1B,MAAM;QACN,KAAK;KACN,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAClB,GAAwC,EACxC,OAAgB;IAEhB,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QACpC,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC;IAClC,CAAC;IAED,OAAO;QACL,MAAM,EAAE,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,YAAY;QAClE,KAAK,EAAE,OAAO,GAAG,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;QAC5D,OAAO,EAAE,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS;KACnE,CAAC;AACJ,CAAC;AAED,SAAS,SAAS,CAChB,GAA4B,EAC5B,KAAa,EACb,MAAe;IAEf,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QACjC,MAAM,IAAI,aAAa,CAAC,iBAAiB,KAAK,+BAA+B,EAAE,MAAM,CAAC,CAAC;IACzF,CAAC;IAED,MAAM,IAAI,GAAa;QACrB,IAAI,EAAE,GAAG,CAAC,IAAc;KACzB,CAAC;IAEF,IAAI,GAAG,CAAC,KAAK,IAAI,OAAO,GAAG,CAAC,KAAK,K
AAK,QAAQ,EAAE,CAAC;QAC/C,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAgC,CAAC;IACpD,CAAC;IAED,IAAI,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;QACpC,IAAI,CAAC,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC;IAC7B,CAAC;IAED,IAAI,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;QACnC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC;IAC3B,CAAC;IAED,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,EAAE,CAAC;QACrC,IAAI,CAAC,aAAa,GAAG,GAAG,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAClD,cAAc,CAAC,CAA4B,EAAE,CAAC,EAAE,GAAG,CAAC,IAAc,EAAE,MAAM,CAAC,CAC5E,CAAC;IACJ,CAAC;IAED,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;QAClC,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAC5C,cAAc,CAAC,CAA4B,EAAE,CAAC,EAAE,GAAG,CAAC,IAAc,EAAE,MAAM,CAAC,CAC5E,CAAC;IACJ,CAAC;IAED,IAAI,CAAC,IAAI,CAAC,aAAa,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;QAC5C,MAAM,IAAI,aAAa,CACrB,SAAS,IAAI,CAAC,IAAI,wDAAwD,EAC1E,MAAM,CACP,CAAC;IACJ,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,SAAS,cAAc,CACrB,GAA4B,EAC5B,KAAa,EACb,QAAgB,EAChB,MAAe;IAEf,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC9B,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,MAAM,IAAI,aAAa,CACrB,4BAA4B,KAAK,aAAa,QAAQ,GAAG,EACzD,MAAM,CACP,CAAC;IACJ,CAAC;IAED,gCAAgC;IAChC,MAAM,QAAQ,GAAG,IAAI,CAAC,CAAC,CAAW,CAAC;IACnC,MAAM,KAAK,GAAG,GAAG,CAAC,QAAQ,CAAC,CAAC;IAE5B,2BAA2B;IAC3B,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;QAC7B,UAAU,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAS;QAC/C,aAAa,EAAE,WAAW,EAAE,QAAQ,EAAE,YAAY;QAClD,YAAY,EAAE,aAAa,EAAE,WAAW,EAAE,YAAY;QACtD,WAAW,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM;KAC9C,CAAC,CAAC;IAEH,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;QAClC,MAAM,IAAI,aAAa,CACrB,+BAA+B,QAAQ,cAAc,QAAQ,GAAG,EAChE,MAAM,CACP,CAAC;IACJ,CAAC;IAED,mEAAmE;IACnE,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE,KAAK,EAAe,CAAC;AAC5C,CAAC;AAED,gFAAgF;AAChF,sBAAsB;AACtB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,QAAgB;IACpD,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAClD,OAAO,mBAAmB,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;AAChD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,OAAe,EA
AE,MAAe;IAClE,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,CAA4B,CAAC;IAE1D,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,0CAA0C,EAAE,MAAM,CAAC,CAAC;IAC9E,CAAC;IAED,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QACjC,MAAM,IAAI,aAAa,CAAC,8BAA8B,EAAE,MAAM,CAAC,CAAC;IAClE,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClE,MAAM,IAAI,aAAa,CAAC,6CAA6C,EAAE,MAAM,CAAC,CAAC;IACjF,CAAC;IAED,eAAe;IACf,IAAI,MAA8B,CAAC;IACnC,IAAI,GAAG,CAAC,MAAM,IAAI,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;QACjD,MAAM,CAAC,GAAG,GAAG,CAAC,MAAiC,CAAC;QAChD,MAAM,GAAG;YACP,QAAQ,EAAE,OAAO,CAAC,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE;YAC1D,KAAK,EAAE,OAAO,CAAC,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;YACxD,WAAW,EAAE,OAAO,CAAC,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;YAC1E,QAAQ,EAAE,CAAC,CAAC,QAAQ,IAAI,OAAO,CAAC,CAAC,QAAQ,KAAK,QAAQ;gBACpD,CAAC,CAAC,CAAC,CAAC,QAAmC;gBACvC,CAAC,CAAC,SAAS;SACd,CAAC;IACJ,CAAC;IAED,mBAAmB;IACnB,MAAM,UAAU,GAAwB,GAAG,CAAC,UAAwC,CAAC,GAAG,CACtF,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACP,IAAI,OAAO,CAAC,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC/B,MAAM,IAAI,aAAa,CAAC,0BAA0B,CAAC,+BAA+B,EAAE,MAAM,CAAC,CAAC;QAC9F,CAAC;QACD,OAAO,CAAqB,CAAC;IAC/B,CAAC,CACF,CAAC;IAEF,OAAO;QACL,IAAI,EAAE,GAAG,CAAC,IAAc;QACxB,WAAW,EAAE,OAAO,GAAG,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;QAC9E,MAAM;QACN,UAAU;KACX,CAAC;AACJ,CAAC;AAED,gFAAgF;AAChF,aAAa;AACb,gFAAgF;AAEhF;;GAEG;AACH,MAAM,OAAO,aAAc,SAAQ,KAAK;IAG7B;IAFT,YACE,OAAe,EACR,MAAe;QAEtB,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,OAAO,QAAQ,MAAM,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAF/C,WAAM,GAAN,MAAM,CAAS;QAGtB,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;IAC9B,CAAC;CACF"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Eval result reporters
|
|
3
|
+
*
|
|
4
|
+
* Formats EvalSuiteResult for different output targets:
|
|
5
|
+
* - Console: colorful human-readable output
|
|
6
|
+
* - JSON: structured data for programmatic use
|
|
7
|
+
* - JUnit XML: CI integration format
|
|
8
|
+
*/
|
|
9
|
+
import type { EvalSuiteResult } from './types.js';
|
|
10
|
+
/**
|
|
11
|
+
* Format eval results for console output with colors (ANSI).
|
|
12
|
+
*/
|
|
13
|
+
export declare function formatConsole(result: EvalSuiteResult): string;
|
|
14
|
+
/**
|
|
15
|
+
* Format eval results as JSON string.
|
|
16
|
+
*/
|
|
17
|
+
export declare function formatJson(result: EvalSuiteResult): string;
|
|
18
|
+
/**
|
|
19
|
+
* Format eval results as JUnit XML for CI integration.
|
|
20
|
+
*/
|
|
21
|
+
export declare function formatJunit(result: EvalSuiteResult): string;
|
|
22
|
+
/**
|
|
23
|
+
* Format eval results using the specified reporter.
|
|
24
|
+
*/
|
|
25
|
+
export declare function formatResults(result: EvalSuiteResult, reporter?: 'console' | 'json' | 'junit'): string;
|
|
26
|
+
//# sourceMappingURL=reporter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reporter.d.ts","sourceRoot":"","sources":["../../src/eval/reporter.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMlD;;GAEG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CA4E7D;AAMD;;GAEG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CAE1D;AAMD;;GAEG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CAuC3D;AAMD;;GAEG;AACH,wBAAgB,aAAa,CAC3B,MAAM,EAAE,eAAe,EACvB,QAAQ,GAAE,SAAS,GAAG,MAAM,GAAG,OAAmB,GACjD,MAAM,CASR"}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Eval result reporters
|
|
3
|
+
*
|
|
4
|
+
* Formats EvalSuiteResult for different output targets:
|
|
5
|
+
* - Console: colorful human-readable output
|
|
6
|
+
* - JSON: structured data for programmatic use
|
|
7
|
+
* - JUnit XML: CI integration format
|
|
8
|
+
*/
|
|
9
|
+
// =============================================================================
|
|
10
|
+
// CONSOLE REPORTER
|
|
11
|
+
// =============================================================================
|
|
12
|
+
/**
 * Render an eval suite result as ANSI-colored, human-readable text.
 *
 * Output order: a header with the suite status and name, one section per
 * test (status line, error, rendered prompt, LLM response, assertions), then
 * a summary line with pass/fail/error tallies and total duration.
 *
 * @param {object} result - Suite result; reads `status`, `suiteName`,
 *   `tests` and `summary`.
 * @returns {string} The lines joined with '\n'.
 */
export function formatConsole(result) {
    const GREEN = '\x1b[32m';
    const RED = '\x1b[31m';
    const YELLOW = '\x1b[33m';
    const RESET = '\x1b[0m';
    const DIM = '\x1b[2m';
    const BOLD = '\x1b[1m';
    const RULE = `${DIM}${'─'.repeat(60)}${RESET}`;

    // Three-way choice on a status string: 'pass', 'fail', or anything else.
    const byStatus = (status, onPass, onFail, otherwise) => {
        if (status === 'pass') {
            return onPass;
        }
        return status === 'fail' ? onFail : otherwise;
    };

    const { suiteName, tests, summary } = result;
    const out = [];

    // Header
    const headerColor = byStatus(result.status, GREEN, RED, YELLOW);
    const headerIcon = byStatus(result.status, '✓', '✗', '⚠');
    out.push(`\n${BOLD}${headerColor}${headerIcon} ${suiteName}${RESET}`);
    out.push(RULE);

    // Tests
    for (const test of tests) {
        const marker = byStatus(test.status, `${GREEN}✓`, `${RED}✗`, `${YELLOW}⚠`);
        let timing = '';
        if (test.durationMs != null) {
            timing = ` ${DIM}(${test.durationMs}ms)${RESET}`;
        }
        out.push(` ${marker} ${test.name}${RESET}${timing}`);

        if (test.error) {
            out.push(` ${RED}Error: ${test.error}${RESET}`);
        }

        if (test.renderedOutput) {
            out.push(` ${DIM}Rendered prompt:${RESET}`);
            out.push(...test.renderedOutput.split('\n').map((text) => ` ${DIM} ${text}${RESET}`));
        }

        // Compared against undefined so an empty response still gets a section.
        if (test.llmResponse !== undefined) {
            out.push(` ${DIM}LLM response:${RESET}`);
            if (!test.llmResponse) {
                out.push(` ${DIM} (empty — model may have returned an error)${RESET}`);
            }
            else {
                out.push(...test.llmResponse.split('\n').map((text) => ` ${DIM} ${text}${RESET}`));
            }
        }

        for (const assertion of test.assertions) {
            const passed = assertion.status === 'pass';
            if (passed) {
                // Passing assertions stay silent, except the AI-based ones,
                // whose reasoning is always shown.
                const isAiAssertion = assertion.operator === 'llm_judge' || assertion.operator === 'sentiment';
                if (isAiAssertion) {
                    out.push(` ${GREEN}✓ ${assertion.operator}:${RESET} ${DIM}${assertion.message ?? 'passed'}${RESET}`);
                }
                continue;
            }
            const badge = assertion.status === 'fail' ? `${RED}✗` : `${YELLOW}⚠`;
            const detail = assertion.message ?? `${assertion.operator} failed`;
            out.push(` ${badge} ${assertion.operator}: ${detail}${RESET}`);
            if (assertion.expected) {
                out.push(` ${DIM}expected: ${assertion.expected}${RESET}`);
            }
            if (assertion.actual) {
                out.push(` ${DIM}actual: ${assertion.actual}${RESET}`);
            }
        }
    }

    // Summary
    out.push(RULE);
    const tallies = [
        [summary.passed, GREEN, 'passed'],
        [summary.failed, RED, 'failed'],
        [summary.errored, YELLOW, 'errored'],
    ]
        .filter(([count]) => count > 0)
        .map(([count, color, label]) => `${color}${count} ${label}${RESET}`);
    tallies.push(`${summary.total} total`);
    out.push(` ${tallies.join(', ')} ${DIM}(${summary.durationMs}ms)${RESET}`);
    out.push('');
    return out.join('\n');
}
|
|
88
|
+
// =============================================================================
|
|
89
|
+
// JSON REPORTER
|
|
90
|
+
// =============================================================================
|
|
91
|
+
/**
 * Serialise an eval suite result as pretty-printed JSON.
 *
 * @param {object} result - Suite result to serialise.
 * @returns {string} JSON text indented with two spaces.
 */
export function formatJson(result) {
    const indentWidth = 2;
    const serialized = JSON.stringify(result, null, indentWidth);
    return serialized;
}
|
|
97
|
+
// =============================================================================
|
|
98
|
+
// JUNIT XML REPORTER
|
|
99
|
+
// =============================================================================
|
|
100
|
+
/**
 * Format eval results as JUnit XML for CI integration.
 *
 * Emits one <testsuite> containing a <testcase> per test. Tests with status
 * 'fail' get a <failure> element listing their failed assertions; tests with
 * status 'error' get an <error> element carrying `test.error`.
 *
 * @param {object} result - Suite result; reads `suiteName`, `tests` and
 *   `summary` (`total`, `failed`, `errored`, `durationMs`).
 * @returns {string} The XML document, lines joined with '\n'.
 */
export function formatJunit(result) {
    const { suiteName, tests, summary } = result;
    const lines = [];
    lines.push('<?xml version="1.0" encoding="UTF-8"?>');
    lines.push(`<testsuite name="${escapeXml(suiteName)}" tests="${summary.total}" ` +
        `failures="${summary.failed}" errors="${summary.errored}" ` +
        `time="${(summary.durationMs / 1000).toFixed(3)}">`);
    for (const test of tests) {
        // Durations are stored in milliseconds; JUnit expects seconds.
        const time = test.durationMs != null ? ` time="${(test.durationMs / 1000).toFixed(3)}"` : '';
        lines.push(` <testcase name="${escapeXml(test.name)}"${time}>`);
        if (test.status === 'fail') {
            const failedAssertions = test.assertions.filter((a) => a.status === 'fail');
            const message = failedAssertions
                .map((a) => a.message ?? `${a.operator} failed`)
                .join('; ');
            lines.push(` <failure message="${escapeXml(message)}">`);
            for (const a of failedAssertions) {
                // Element text needs escaping just like attribute values:
                // messages and expected/actual values can contain '<' or '&',
                // which would otherwise make the report malformed XML.
                // Escaping the finished template string also covers
                // non-string values, since interpolation stringifies them.
                lines.push(escapeXml(` [${a.operator}] ${a.message ?? 'failed'}`));
                if (a.expected)
                    lines.push(escapeXml(` expected: ${a.expected}`));
                if (a.actual)
                    lines.push(escapeXml(` actual: ${a.actual}`));
            }
            lines.push(' </failure>');
        }
        if (test.status === 'error') {
            const errorMsg = test.error ?? 'Unknown error';
            lines.push(` <error message="${escapeXml(errorMsg)}">${escapeXml(errorMsg)}</error>`);
        }
        lines.push(' </testcase>');
    }
    lines.push('</testsuite>');
    return lines.join('\n');
}
|
|
137
|
+
// =============================================================================
|
|
138
|
+
// FORMAT SELECTOR
|
|
139
|
+
// =============================================================================
|
|
140
|
+
/**
 * Format eval results with the reporter selected by name.
 *
 * @param {object} result - Suite result to format.
 * @param {'console'|'json'|'junit'} [reporter='console'] - Reporter name.
 * @returns {string|undefined} The formatted output; `undefined` when the
 *   name matches none of the three reporters.
 */
export function formatResults(result, reporter = 'console') {
    if (reporter === 'console') {
        return formatConsole(result);
    }
    if (reporter === 'json') {
        return formatJson(result);
    }
    if (reporter === 'junit') {
        return formatJunit(result);
    }
    return undefined;
}
|
|
153
|
+
// =============================================================================
|
|
154
|
+
// HELPERS
|
|
155
|
+
// =============================================================================
|
|
156
|
+
/**
 * Escape the five characters that have special meaning in XML so the text
 * can be placed safely in attribute values or element content.
 *
 * @param {string} text - Text to escape.
 * @returns {string} The text with & < > " ' replaced by their predefined
 *   XML entities.
 */
function escapeXml(text) {
    const entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&apos;',
    };
    // Single pass: each character is replaced exactly once, so the '&' of an
    // entity that was just produced can never be escaped a second time.
    return text.replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
164
|
+
//# sourceMappingURL=reporter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reporter.js","sourceRoot":"","sources":["../../src/eval/reporter.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,gFAAgF;AAChF,mBAAmB;AACnB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,MAAuB;IACnD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,MAAM,CAAC;IAE7C,SAAS;IACT,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IACzF,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC;IAC/G,MAAM,KAAK,GAAG,SAAS,CAAC;IACxB,MAAM,GAAG,GAAG,SAAS,CAAC;IACtB,MAAM,IAAI,GAAG,SAAS,CAAC;IAEvB,KAAK,CAAC,IAAI,CAAC,KAAK,IAAI,GAAG,WAAW,GAAG,UAAU,IAAI,SAAS,GAAG,KAAK,EAAE,CAAC,CAAC;IACxE,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,CAAC;IAE9C,QAAQ;IACR,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC;QACvG,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,GAAG,IAAI,IAAI,CAAC,UAAU,MAAM,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACxF,KAAK,CAAC,IAAI,CAAC,KAAK,IAAI,IAAI,IAAI,CAAC,IAAI,GAAG,KAAK,GAAG,QAAQ,EAAE,CAAC,CAAC;QAExD,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,sBAAsB,IAAI,CAAC,KAAK,GAAG,KAAK,EAAE,CAAC,CAAC;QACzD,CAAC;QAED,oCAAoC;QACpC,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACxB,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,mBAAmB,KAAK,EAAE,CAAC,CAAC;YACjD,MAAM,aAAa,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACtD,KAAK,MAAM,EAAE,IAAI,aAAa,EAAE,CAAC;gBAC/B,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,KAAK,EAAE,GAAG,KAAK,EAAE,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;QAED,6EAA6E;QAC7E,IAAI,IAAI,CAAC,WAAW,KAAK,SAAS,EAAE,CAAC;YACnC,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,gBAAgB,KAAK,EAAE,CAAC,CAAC;YAC9C,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;gBACrB,MAAM,aAAa,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;gBACnD,KAAK,MAAM,EAAE,IAAI,
aAAa,EAAE,CAAC;oBAC/B,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,KAAK,EAAE,GAAG,KAAK,EAAE,CAAC,CAAC;gBAC1C,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,+CAA+C,KAAK,EAAE,CAAC,CAAC;YAC/E,CAAC;QACH,CAAC;QAED,kBAAkB;QAClB,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACxC,IAAI,SAAS,CAAC,MAAM,KAAK,MAAM,EAAE,CAAC;gBAChC,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC;gBACtE,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,IAAI,GAAG,SAAS,CAAC,QAAQ,SAAS,CAAC;gBAChE,KAAK,CAAC,IAAI,CAAC,OAAO,KAAK,IAAI,SAAS,CAAC,QAAQ,KAAK,GAAG,GAAG,KAAK,EAAE,CAAC,CAAC;gBACjE,IAAI,SAAS,CAAC,QAAQ,EAAE,CAAC;oBACvB,KAAK,CAAC,IAAI,CAAC,SAAS,GAAG,aAAa,SAAS,CAAC,QAAQ,GAAG,KAAK,EAAE,CAAC,CAAC;gBACpE,CAAC;gBACD,IAAI,SAAS,CAAC,MAAM,EAAE,CAAC;oBACrB,KAAK,CAAC,IAAI,CAAC,SAAS,GAAG,aAAa,SAAS,CAAC,MAAM,GAAG,KAAK,EAAE,CAAC,CAAC;gBAClE,CAAC;YACH,CAAC;iBAAM,IAAI,SAAS,CAAC,QAAQ,KAAK,WAAW,IAAI,SAAS,CAAC,QAAQ,KAAK,WAAW,EAAE,CAAC;gBACpF,wDAAwD;gBACxD,KAAK,CAAC,IAAI,CAAC,iBAAiB,SAAS,CAAC,QAAQ,IAAI,KAAK,IAAI,GAAG,GAAG,SAAS,CAAC,OAAO,IAAI,QAAQ,GAAG,KAAK,EAAE,CAAC,CAAC;YAC5G,CAAC;QACH,CAAC;IACH,CAAC;IAED,UAAU;IACV,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,CAAC;IAC9C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,WAAW,OAAO,CAAC,MAAM,UAAU,KAAK,EAAE,CAAC,CAAC;IAC/E,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,WAAW,OAAO,CAAC,MAAM,UAAU,KAAK,EAAE,CAAC,CAAC;IAC/E,IAAI,OAAO,CAAC,OAAO,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,WAAW,OAAO,CAAC,OAAO,WAAW,KAAK,EAAE,CAAC,CAAC;IAClF,KAAK,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,QAAQ,CAAC,CAAC;IACrC,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,IAAI,OAAO,CAAC,UAAU,MAAM,KAAK,EAAE,CAAC,CAAC;IAC5E,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,gFAAgF;AAChF,gBAAgB;AAChB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,MAAuB;IAChD,OAAO,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AACzC,CAAC;AAED,gFAAgF;AAChF,qBAAqB;AACr
B,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,MAAuB;IACjD,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,MAAM,CAAC;IAE7C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,wCAAwC,CAAC,CAAC;IACrD,KAAK,CAAC,IAAI,CACR,oBAAoB,SAAS,CAAC,SAAS,CAAC,YAAY,OAAO,CAAC,KAAK,IAAI;QACnE,aAAa,OAAO,CAAC,MAAM,aAAa,OAAO,CAAC,OAAO,IAAI;QAC3D,SAAS,CAAC,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CACtD,CAAC;IAEF,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7F,KAAK,CAAC,IAAI,CAAC,qBAAqB,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,GAAG,CAAC,CAAC;QAEjE,IAAI,IAAI,CAAC,MAAM,KAAK,MAAM,EAAE,CAAC;YAC3B,MAAM,gBAAgB,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC;YAC5E,MAAM,OAAO,GAAG,gBAAgB;iBAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,IAAI,GAAG,CAAC,CAAC,QAAQ,SAAS,CAAC;iBAC/C,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,KAAK,CAAC,IAAI,CAAC,yBAAyB,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC5D,KAAK,MAAM,CAAC,IAAI,gBAAgB,EAAE,CAAC;gBACjC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,QAAQ,KAAK,CAAC,CAAC,OAAO,IAAI,QAAQ,EAAE,CAAC,CAAC;gBAC7D,IAAI,CAAC,CAAC,QAAQ;oBAAE,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;gBAC9D,IAAI,CAAC,CAAC,MAAM;oBAAE,KAAK,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YAC1D,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAC/B,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,KAAK,OAAO,EAAE,CAAC;YAC5B,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,IAAI,eAAe,CAAC;YAC/C,KAAK,CAAC,IAAI,CAAC,uBAAuB,SAAS,CAAC,QAAQ,CAAC,KAAK,SAAS,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC;QAC3F,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC9B,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IAC3B,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,gFAAgF;AAChF,kBAAkB;AAClB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,aAAa,CAC3B,MAAuB,EACvB,WAAyC,SAAS;IAElD,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,SAAS;YACZ,OAAO,aAAa,CAAC,MAAM,CAAC,CAAC;QAC/B,KAAK,MAAM;YACT,OAAO,UAAU,C
AAC,MAAM,CAAC,CAAC;QAC5B,KAAK,OAAO;YACV,OAAO,WAAW,CAAC,MAAM,CAAC,CAAC;IAC/B,CAAC;AACH,CAAC;AAED,gFAAgF;AAChF,UAAU;AACV,gFAAgF;AAEhF,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC;SACtB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC;SACvB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;AAC7B,CAAC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Eval test runner
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* .eval file (YAML)
|
|
6
|
+
* ↓ load + validate
|
|
7
|
+
* EvalSuite { config, tests[] }
|
|
8
|
+
* ↓ for each test
|
|
9
|
+
* ├── Load variables (from given: or from dataset + params)
|
|
10
|
+
* ├── Read target prompt.pdk
|
|
11
|
+
* ├── Render with echo-pdk
|
|
12
|
+
* ├── If expect_render: run assertions on rendered output
|
|
13
|
+
* ├── If expect_llm: send to LLM → run assertions on response
|
|
14
|
+
* ├── If record mode: save response as golden in .dset file
|
|
15
|
+
* └── Collect results
|
|
16
|
+
* ↓
|
|
17
|
+
* EvalSuiteResult { tests[], summary }
|
|
18
|
+
*/
|
|
19
|
+
import type { EvalSuite, EvalSuiteResult, EvalRunnerConfig } from './types.js';
|
|
20
|
+
/**
|
|
21
|
+
* Create and run an eval suite from a file.
|
|
22
|
+
*/
|
|
23
|
+
// evalFilePath: path of the eval file to load; config: runner options passed through unchanged to the suite run.
// Resolves with the result of running the loaded suite.
export declare function runEvalFile(evalFilePath: string, config: EvalRunnerConfig): Promise<EvalSuiteResult>;
|
|
24
|
+
/**
|
|
25
|
+
* Run an eval suite.
|
|
26
|
+
*/
|
|
27
|
+
// suite: an already-loaded suite; evalFilePath: location of the eval file the suite came from; config: runner options.
// Resolves with the per-test results plus a summary for the suite.
export declare function runEvalSuite(suite: EvalSuite, evalFilePath: string, config: EvalRunnerConfig): Promise<EvalSuiteResult>;
|
|
28
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAaH,OAAO,KAAK,EACV,SAAS,EAET,eAAe,EAIf,gBAAgB,EAGjB,MAAM,YAAY,CAAC;AAMpB;;GAEG;AACH,wBAAsB,WAAW,CAC/B,YAAY,EAAE,MAAM,EACpB,MAAM,EAAE,gBAAgB,GACvB,OAAO,CAAC,eAAe,CAAC,CAG1B;AAED;;GAEG;AACH,wBAAsB,YAAY,CAChC,KAAK,EAAE,SAAS,EAChB,YAAY,EAAE,MAAM,EACpB,MAAM,EAAE,gBAAgB,GACvB,OAAO,CAAC,eAAe,CAAC,CAkF1B"}
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Eval test runner
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* .eval file (YAML)
|
|
6
|
+
* ↓ load + validate
|
|
7
|
+
* EvalSuite { config, tests[] }
|
|
8
|
+
* ↓ for each test
|
|
9
|
+
* ├── Load variables (from given: or from dataset + params)
|
|
10
|
+
* ├── Read target prompt.pdk
|
|
11
|
+
* ├── Render with echo-pdk
|
|
12
|
+
* ├── If expect_render: run assertions on rendered output
|
|
13
|
+
* ├── If expect_llm: send to LLM → run assertions on response
|
|
14
|
+
* ├── If record mode: save response as golden in .dset file
|
|
15
|
+
* └── Collect results
|
|
16
|
+
* ↓
|
|
17
|
+
* EvalSuiteResult { tests[], summary }
|
|
18
|
+
*/
|
|
19
|
+
import { readFile } from 'fs/promises';
|
|
20
|
+
import { resolve, dirname } from 'path';
|
|
21
|
+
import { createEcho } from '../index.js';
|
|
22
|
+
import { loadEvalFile } from './loader.js';
|
|
23
|
+
import { runAssertions } from './assertions.js';
|
|
24
|
+
import { DatasetManager } from './dataset.js';
|
|
25
|
+
import { createProvider } from '../providers/registry.js';
|
|
26
|
+
import { toLLMProvider } from '../providers/base.js';
|
|
27
|
+
import { createEmbeddingProvider } from '../embeddings/registry.js';
|
|
28
|
+
import { cosineSimilarity } from '../embeddings/cosine.js';
|
|
29
|
+
// =============================================================================
|
|
30
|
+
// EVAL RUNNER
|
|
31
|
+
// =============================================================================
|
|
32
|
+
/**
 * Load an eval file and run the suite it describes.
 *
 * @param {string} evalFilePath - Path of the eval file; also forwarded to the
 *   suite runner, which uses it to locate the prompt directory.
 * @param {object} config - Runner configuration, forwarded unchanged.
 * @returns {Promise<object>} The result produced by `runEvalSuite`.
 */
export async function runEvalFile(evalFilePath, config) {
    const loadedSuite = await loadEvalFile(evalFilePath);
    const outcome = runEvalSuite(loadedSuite, evalFilePath, config);
    return outcome;
}
|
|
39
|
+
/**
 * Run every (optionally filtered) test of an eval suite, one after another,
 * and aggregate the outcomes.
 *
 * @param {object} suite - Loaded suite; `suite.suite` is its name,
 *   `suite.config.target` the prompt file (relative to the prompt directory)
 *   and `suite.tests` the test definitions.
 * @param {string} evalFilePath - Path of the eval file. The prompt directory
 *   is taken to be two levels above the directory containing this file.
 * @param {object} config - Runner configuration (`filter`, `aiProvider`,
 *   `embeddingProvider`, ...).
 * @returns {Promise<object>} `{ suiteName, status, tests, summary }`. When the
 *   target prompt cannot be read, the value of `createErrorResult` is returned.
 */
export async function runEvalSuite(suite, evalFilePath, config) {
    const startedAt = Date.now();

    // Eval files live in <prompt>/eval/tests/, so the prompt directory is two
    // levels above the directory that holds the eval file.
    const promptDir = resolve(dirname(resolve(evalFilePath)), '..', '..');
    const datasetManager = new DatasetManager(promptDir);

    // Read the prompt template the suite targets.
    const targetPath = resolve(promptDir, suite.config.target);
    let template;
    try {
        template = await readFile(targetPath, 'utf-8');
    }
    catch {
        return createErrorResult(suite.suite, `Failed to load target prompt: ${targetPath}`);
    }

    const echo = createEcho({ strict: false });

    // Embedding provider: an explicit configuration wins; otherwise an OpenAI
    // AI provider's API key is reused for embeddings.
    const embeddingConfig = config.embeddingProvider
        || (config.aiProvider?.type === 'openai'
            ? { type: 'openai', apiKey: config.aiProvider.apiKey }
            : undefined);
    const embeddingProvider = embeddingConfig
        ? createEmbeddingProvider(embeddingConfig)
        : undefined;

    // Similarity closure handed to assertions; left undefined when no
    // embedding provider is available.
    const embeddingSimilarity = embeddingProvider
        ? async (textA, textB) => {
            const embedded = await embeddingProvider.embed([textA, textB]);
            return cosineSimilarity(embedded[0], embedded[1]);
        }
        : undefined;

    // Case-insensitive substring filter on the test name.
    const needle = config.filter ? config.filter.toLowerCase() : undefined;
    const selectedTests = config.filter
        ? suite.tests.filter((candidate) => candidate.name.toLowerCase().includes(needle))
        : suite.tests;

    // Tests run strictly one at a time, in declaration order.
    const sharedContext = {
        template,
        echo,
        datasetManager,
        suiteConfig: suite.config,
        runnerConfig: config,
        promptDir,
        embeddingSimilarity,
    };
    const testResults = [];
    for (const current of selectedTests) {
        testResults.push(await runSingleTest(current, sharedContext));
    }

    const summary = computeSummary(testResults, Date.now() - startedAt);

    // Any errored test makes the suite 'error'; otherwise any failed test
    // makes it 'fail'; otherwise it passes.
    const seenStatuses = new Set(testResults.map((entry) => entry.status));
    let status = 'pass';
    if (seenStatuses.has('error')) {
        status = 'error';
    }
    else if (seenStatuses.has('fail')) {
        status = 'fail';
    }

    return {
        suiteName: suite.suite,
        status,
        tests: testResults,
        summary,
    };
}
|
|
116
|
+
/**
 * Extract a printable message from any thrown value.
 *
 * JavaScript allows throwing non-Error values (strings, undefined, plain
 * objects), so `.message` cannot be read unconditionally.
 */
function toErrorMessage(thrown) {
    return typeof thrown?.message === 'string' ? thrown.message : String(thrown);
}

/**
 * Resolve the template variables for a test.
 *
 * Precedence: inline `given`, then a named parameter set of a dataset
 * (`dataset` + `params`), then the first parameter set of `dataset` (with its
 * `name` key removed), and finally an empty object.
 */
async function resolveTestVariables(test, datasetManager) {
    if (test.given) {
        return test.given;
    }
    if (!test.dataset) {
        return {};
    }
    if (test.params) {
        return datasetManager.getParams(test.dataset, test.params);
    }
    const dataset = await datasetManager.load(test.dataset);
    if (dataset.parameters.length === 0) {
        throw new Error(`Dataset "${test.dataset}" has no parameter sets`);
    }
    // `name` identifies the parameter set; it is not a template variable.
    const { name: _name, ...vars } = dataset.parameters[0];
    return vars;
}

/**
 * Run one eval test: resolve its variables, render the prompt template, run
 * the `expect_render` assertions on the rendered text and, when `expect_llm`
 * is present, call the LLM (if a provider is configured) and run the
 * `expect_llm` assertions on its response.
 *
 * Anything thrown while the test runs is caught and reported as a result with
 * status 'error', so one broken test does not abort the rest of the suite.
 *
 * @param {object} test - Test definition (`name`, `given` or `dataset`
 *   [+ `params`], `expect_render`, `expect_llm`).
 * @param {object} ctx - Suite-level context: `template`, `echo`,
 *   `datasetManager`, `suiteConfig`, `runnerConfig`, `embeddingSimilarity`.
 * @returns {Promise<object>} On completion: `{ name, status, assertions,
 *   durationMs, renderedOutput, llmResponse }`. On an unexpected failure:
 *   `{ name, status: 'error', assertions: [], durationMs, error }`.
 */
async function runSingleTest(test, ctx) {
    const testStart = Date.now();
    try {
        // 1. Resolve variables
        const variables = await resolveTestVariables(test, ctx.datasetManager);
        const allAssertions = [];
        let llmResponseText;

        // 2. Render the template
        const renderedOutput = await ctx.echo.render(ctx.template, variables);

        // 3. Run expect_render assertions
        if (test.expect_render) {
            const renderCtx = { text: renderedOutput };
            const results = await runAssertions(test.expect_render, renderCtx);
            allAssertions.push(...results);
        }

        // 4. Run expect_llm assertions (requires LLM call)
        if (test.expect_llm) {
            // The LLM provider is optional; without one the assertions still
            // run, against an empty text and an undefined provider.
            const llmProvider = ctx.runnerConfig.aiProvider
                ? toLLMProvider(createProvider(ctx.runnerConfig.aiProvider))
                : undefined;
            let llmResponse;
            if (llmProvider) {
                // A suite-level model overrides the provider's default model.
                const model = ctx.suiteConfig.model ?? ctx.runnerConfig.aiProvider?.model;
                try {
                    llmResponse = await llmProvider.complete(renderedOutput, model);
                    llmResponseText = llmResponse.text;
                }
                catch (llmErr) {
                    // Record the failure as an assertion so the remaining
                    // assertions still run (against an empty response).
                    llmResponseText = '';
                    allAssertions.push({
                        operator: 'llm_call',
                        status: 'error',
                        message: `LLM call failed: ${toErrorMessage(llmErr)}`,
                    });
                }
                // Record mode: save the response as golden, only when the call succeeded.
                if (ctx.runnerConfig.record && test.dataset && llmResponse) {
                    await ctx.datasetManager.recordGolden(test.dataset, llmResponse.text, llmResponse);
                }
            }
            const llmCtx = {
                text: llmResponseText ?? '',
                llmResponse,
                llmProvider,
                loadGolden: (name) => ctx.datasetManager.getGolden(name),
                embeddingSimilarity: ctx.embeddingSimilarity,
            };
            const results = await runAssertions(test.expect_llm, llmCtx);
            allAssertions.push(...results);
        }

        // 5. Determine status: 'error' outranks 'fail', which outranks 'pass'.
        const status = allAssertions.some((a) => a.status === 'error')
            ? 'error'
            : allAssertions.some((a) => a.status === 'fail')
                ? 'fail'
                : 'pass';
        return {
            name: test.name,
            status,
            assertions: allAssertions,
            durationMs: Date.now() - testStart,
            renderedOutput,
            llmResponse: llmResponseText,
        };
    }
    catch (err) {
        return {
            name: test.name,
            status: 'error',
            assertions: [],
            durationMs: Date.now() - testStart,
            // Safe for non-Error throws (strings, undefined, ...).
            error: toErrorMessage(err),
        };
    }
}
|
|
212
|
+
// =============================================================================
|
|
213
|
+
// HELPERS
|
|
214
|
+
// =============================================================================
|
|
215
|
+
/**
 * Build the summary counters for a list of test results.
 *
 * @param {Array<{status: string}>} tests - Per-test results.
 * @param {number} durationMs - Elapsed time to report, passed through as-is.
 * @returns {{total: number, passed: number, failed: number, errored: number, durationMs: number}}
 *   `total` counts every result; the other counters only count results whose
 *   status is exactly 'pass', 'fail' or 'error' respectively.
 */
function computeSummary(tests, durationMs) {
    const countWithStatus = (wanted) =>
        tests.reduce((count, entry) => (entry.status === wanted ? count + 1 : count), 0);
    const total = tests.length;
    const passed = countWithStatus('pass');
    const failed = countWithStatus('fail');
    const errored = countWithStatus('error');
    return { total, passed, failed, errored, durationMs };
}
|
|
224
|
+
/**
 * Build the result for a suite that could not be run at all.
 *
 * The failure reason is attached as `error` so callers can report why the
 * suite errored; previously the message was accepted but discarded. This
 * mirrors the `error` field carried by errored per-test results.
 *
 * @param {string} suiteName - Name of the suite.
 * @param {string} error - Human-readable reason the suite could not run.
 * @returns {object} `{ suiteName, status: 'error', tests: [], summary, error }`
 *   with an all-zero summary.
 */
function createErrorResult(suiteName, error) {
    return {
        suiteName,
        status: 'error',
        tests: [],
        summary: { total: 0, passed: 0, failed: 0, errored: 0, durationMs: 0 },
        error,
    };
}
|
|
232
|
+
//# sourceMappingURL=runner.js.map
|