@hemia-ai/agents-evals 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +340 -0
- package/dist/index.js +578 -0
- package/dist/index.js.map +1 -0
- package/package.json +30 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
import { AgentKey, JsonObject, AgentExecutionResult } from '@hemia-ai/agents-core';
|
|
2
|
+
import { ModelGateway } from '@hemia-ai/agents-models';
|
|
3
|
+
|
|
4
|
+
/**
 * One evaluation case: the input given to an agent together with the optional
 * expectation that evaluators compare the execution result against.
 */
interface EvalCase<TInput = unknown, TExpected = unknown> {
    id: string;
    name: string;
    agentKey: AgentKey;
    /** Payload handed to the agent under test. */
    input: TInput;
    /** Expected value; optional because not every evaluator needs one. */
    expected?: TExpected;
    tags?: string[];
    metadata?: JsonObject;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Identifies the run an evaluation belongs to. It is passed to evaluators
 * (via `EvalEvaluatorInput`) and to the case executor.
 */
interface EvalRunContext {
    runId: string;
    datasetId: string;
    datasetVersion: string;
    agentKey?: AgentKey;
    modelId?: string;
    promptVersion?: string;
    metadata?: JsonObject;
}
|
|
23
|
+
|
|
24
|
+
/** Everything an evaluator receives when scoring one executed case. */
interface EvalEvaluatorInput {
    /** The case that was executed. */
    testCase: EvalCase;
    /** The agent execution result to be scored. */
    result: AgentExecutionResult;
    /** The run this evaluation belongs to. */
    context: EvalRunContext;
}
|
|
29
|
+
/** The verdict a single evaluator produces for a single case. */
interface EvalScore {
    /** Name of the evaluator that produced this score. */
    name: string;
    /** Numeric score; the built-in pass/fail evaluators emit 0 or 1. */
    score: number;
    passed: boolean;
    /** Optional human-readable explanation of the verdict. */
    reason?: string;
    metadata?: JsonObject;
}
|
|
36
|
+
/** Contract implemented by every evaluator: a name plus an async scoring function. */
interface EvalEvaluator {
    name: string;
    /** Scores one executed case and resolves to the resulting `EvalScore`. */
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
}
|
|
40
|
+
/** Typed helper for building an `EvalScore`; the implementation returns its argument unchanged. */
declare function createEvalScore(input: EvalScore): EvalScore;
|
|
41
|
+
|
|
42
|
+
/** A named aggregate value computed over an eval run. */
interface EvalMetric {
    name: string;
    value: number;
    /** Optional unit label; the built-in metric helpers use "ratio" and "usd". */
    unit?: string;
    metadata?: JsonObject;
}
|
|
48
|
+
|
|
49
|
+
/** Outcome of one case: the case, its execution result and all evaluator scores. */
interface EvalCaseResult {
    caseId: string;
    testCase: EvalCase;
    result: AgentExecutionResult;
    /** One entry per evaluator that scored the case. */
    scores: EvalScore[];
    passed: boolean;
    metadata?: JsonObject;
}
|
|
57
|
+
/** Headline counters for an eval run. */
interface EvalRunSummary {
    totalCases: number;
    passedCases: number;
    failedCases: number;
    /** The Markdown renderer formats this as a percentage (value * 100). */
    passRate: number;
    /** Formatted as a percentage by the Markdown renderer, like `passRate`. */
    averageScore: number;
}
|
|
64
|
+
/** Complete result of running a dataset: per-case results, summary and metrics. */
interface EvalRunResult {
    runId: string;
    datasetId: string;
    datasetVersion: string;
    results: EvalCaseResult[];
    summary: EvalRunSummary;
    /** Aggregate metrics; these are what `BaselineComparator` compares by name. */
    metrics: EvalMetric[];
    passed: boolean;
    metadata?: JsonObject;
}
|
|
74
|
+
|
|
75
|
+
/** A stored eval run, identified by id and name, kept as a baseline. */
interface EvalBaseline {
    id: string;
    name: string;
    /** The run recorded as the baseline. */
    run: EvalRunResult;
    metadata?: JsonObject;
}
|
|
81
|
+
|
|
82
|
+
/** Per-metric comparison between a baseline run and a current run. */
interface EvalRegressionResult {
    metricName: string;
    baselineValue: number;
    currentValue: number;
    /** `currentValue - baselineValue`. */
    delta: number;
    /** Whether the change exceeded `threshold` in the unfavourable direction. */
    regressed: boolean;
    /** Tolerance that was applied to this metric. */
    threshold: number;
    metadata?: JsonObject;
}
|
|
91
|
+
|
|
92
|
+
/** Options for `BaselineComparator`. */
interface BaselineComparatorOptions {
    /** Tolerance per metric name; metrics without an entry use 0. */
    thresholds?: Record<string, number>;
}
|
|
95
|
+
/**
 * Compares the metrics of a current run with those of a baseline run.
 * Only metrics present in both runs are reported.
 */
declare class BaselineComparator {
    private readonly thresholds;
    constructor(options?: BaselineComparatorOptions);
    /** Returns one entry per metric name found in both `baseline` and `current`. */
    compare(baseline: EvalRunResult, current: EvalRunResult): EvalRegressionResult[];
}
|
|
100
|
+
|
|
101
|
+
/** Input accepted by `createEvalCase`; it has the same fields as `EvalCase`. */
interface EvalCaseBuilderInput<TInput = unknown, TExpected = unknown> {
    id: string;
    name: string;
    agentKey: AgentKey;
    input: TInput;
    /** Copied to the case only when defined. */
    expected?: TExpected;
    /** Copied to the case only when defined. */
    tags?: string[];
    /** Copied to the case only when defined. */
    metadata?: JsonObject;
}
|
|
110
|
+
/** Builds an `EvalCase` from `input`, leaving out optional fields that are undefined. */
declare function createEvalCase<TInput = unknown, TExpected = unknown>(input: EvalCaseBuilderInput<TInput, TExpected>): EvalCase<TInput, TExpected>;
|
|
111
|
+
|
|
112
|
+
/** Split label a dataset is tagged with. */
type EvalDatasetSplit = 'dev' | 'test' | 'holdout';
|
|
113
|
+
|
|
114
|
+
/** A versioned collection of eval cases. */
interface EvalDataset {
    id: string;
    name: string;
    /** Version string; with `id` it forms the key used by the in-memory store. */
    version: string;
    split: EvalDatasetSplit;
    cases: EvalCase[];
    metadata?: JsonObject;
}
|
|
122
|
+
|
|
123
|
+
/** Asynchronous storage contract for eval datasets. */
interface EvalDatasetStore {
    /** Resolves to every stored dataset. */
    list(): Promise<EvalDataset[]>;
    /** Resolves to the matching dataset, or `undefined` if there is none. */
    get(id: string, version?: string): Promise<EvalDataset | undefined>;
    /** Stores `dataset`. */
    put(dataset: EvalDataset): Promise<void>;
}
|
|
128
|
+
|
|
129
|
+
/** `EvalDatasetStore` backed by an in-process map keyed by `id@version`. */
declare class InMemoryEvalDatasetStore implements EvalDatasetStore {
    private readonly datasets;
    /** Optionally pre-populates the store with `datasets`. */
    constructor(datasets?: EvalDataset[]);
    list(): Promise<EvalDataset[]>;
    /** Without `version`, resolves to the stored dataset for `id` whose version sorts highest. */
    get(id: string, version?: string): Promise<EvalDataset | undefined>;
    /** Inserts the dataset, replacing any entry with the same id and version. */
    put(dataset: EvalDataset): Promise<void>;
    private key;
}
|
|
137
|
+
|
|
138
|
+
/** Machine-readable codes carried by `EvalError`. */
type EvalErrorCode = 'EVAL_DATASET_NOT_FOUND' | 'EVAL_CASE_EXECUTION_FAILED' | 'EVAL_EVALUATOR_FAILED' | 'EVAL_REPORT_FAILED' | 'EVAL_GATE_FAILED' | 'EVAL_UNKNOWN_ERROR';
|
|
139
|
+
|
|
140
|
+
/** Error type of this package; its `name` is set to "EvalError". */
declare class EvalError extends Error {
    readonly code: EvalErrorCode;
    /** Present only when metadata was passed to the constructor. */
    readonly metadata?: JsonObject;
    constructor(code: EvalErrorCode, message: string, metadata?: JsonObject);
}
|
|
145
|
+
|
|
146
|
+
/** An evaluator paired with its weight inside a `CompositeEvaluator`. */
interface WeightedEvalEvaluator {
    evaluator: EvalEvaluator;
    /** Relative weight; defaults to 1. */
    weight?: number;
}
|
|
150
|
+
/** Options for `CompositeEvaluator`. */
interface CompositeEvaluatorOptions {
    /** Name of the resulting score; defaults to "composite". */
    name?: string;
    evaluators: WeightedEvalEvaluator[];
    /** Minimum weighted score required to pass; defaults to 1. */
    threshold?: number;
}
|
|
155
|
+
/**
 * Combines several evaluators into one weighted-average score. The composite
 * passes only if that score reaches the threshold and every child passed.
 */
declare class CompositeEvaluator implements EvalEvaluator {
    readonly name: string;
    private readonly evaluators;
    private readonly threshold;
    constructor(options: CompositeEvaluatorOptions);
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
}
|
|
162
|
+
|
|
163
|
+
/** Options for `ExactMatchEvaluator`. */
interface ExactMatchEvaluatorOptions {
    /** Name of the resulting score; defaults to "exact_match". */
    name?: string;
    /** Compare case-insensitively; defaults to false. */
    ignoreCase?: boolean;
    /** Trim surrounding whitespace before comparing; defaults to true. */
    trim?: boolean;
}
|
|
168
|
+
/**
 * Passes when the normalized agent output equals the normalized expected
 * value. Non-string values are compared by their JSON serialization.
 */
declare class ExactMatchEvaluator implements EvalEvaluator {
    readonly name: string;
    private readonly ignoreCase;
    private readonly trim;
    constructor(options?: ExactMatchEvaluatorOptions);
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
    private normalize;
}
|
|
176
|
+
|
|
177
|
+
/** Options for `JsonSchemaEvaluator`. */
interface JsonSchemaEvaluatorOptions {
    /** Name of the resulting score; defaults to "json_schema". */
    name?: string;
    /** Schema object; the validator reads its `type`, `required` and `properties` keys. */
    schema: JsonObject;
}
|
|
181
|
+
/**
 * Checks the agent output against a schema with a small built-in validator
 * that understands `type`, `required` and nested `properties`.
 */
declare class JsonSchemaEvaluator implements EvalEvaluator {
    readonly name: string;
    private readonly schema;
    constructor(options: JsonSchemaEvaluatorOptions);
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
}
|
|
187
|
+
|
|
188
|
+
/** Options for `LlmJudgeEvaluator`. */
interface LlmJudgeEvaluatorOptions {
    /** Name of the resulting score; defaults to "llm_judge". */
    name?: string;
    /** Gateway used to call the judge model; only `generate` is required. */
    modelGateway: Pick<ModelGateway, 'generate'>;
    /** Forwarded to the gateway request when set. */
    model?: string;
    /** Pass mark used when the judge gives no explicit `passed`; defaults to 0.8. */
    threshold?: number;
    /** Included in the JSON payload sent to the judge. */
    rubric?: string;
    /** Forwarded to the gateway request when set. */
    metadata?: JsonObject;
}
|
|
196
|
+
/**
 * Asks a model to grade the agent output and turns its JSON reply
 * (`score`, `passed`, `reason`) into an `EvalScore`.
 */
declare class LlmJudgeEvaluator implements EvalEvaluator {
    readonly name: string;
    private readonly modelGateway;
    private readonly model?;
    private readonly threshold;
    private readonly rubric?;
    private readonly metadata?;
    constructor(options: LlmJudgeEvaluatorOptions);
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
}
|
|
206
|
+
|
|
207
|
+
/** Options for `RegexEvaluator`. */
interface RegexEvaluatorOptions {
    /** Name of the resulting score; defaults to "regex_match". */
    name?: string;
    /** Pattern tested against the output. */
    pattern: RegExp;
}
|
|
211
|
+
/** Passes when the agent output (JSON-serialized if not a string) matches a pattern. */
declare class RegexEvaluator implements EvalEvaluator {
    readonly name: string;
    private readonly pattern;
    constructor(options: RegexEvaluatorOptions);
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
}
|
|
217
|
+
|
|
218
|
+
/** Options for `SemanticSimilarityEvaluator`. */
interface SemanticSimilarityEvaluatorOptions {
    /** Name of the resulting score; defaults to "semantic_similarity". */
    name?: string;
    /** Minimum similarity required to pass; defaults to 0.8. */
    threshold?: number;
    /** Caller-supplied similarity function; its result is used directly as the score. */
    similarity: (actual: string, expected: string) => Promise<number> | number;
}
|
|
223
|
+
/** Scores a case with a caller-supplied similarity function and a pass threshold. */
declare class SemanticSimilarityEvaluator implements EvalEvaluator {
    readonly name: string;
    private readonly threshold;
    private readonly similarity;
    constructor(options: SemanticSimilarityEvaluatorOptions);
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
}
|
|
230
|
+
|
|
231
|
+
/** A tool call the agent is expected to make. */
interface ExpectedToolCall {
    /** Tool name that must match exactly. */
    name: string;
    /** When omitted, only the name is compared. */
    input?: unknown;
}
|
|
235
|
+
/** Options for `ToolCallEvaluator`. */
interface ToolCallEvaluatorOptions {
    /** Name of the resulting score; defaults to "tool_calls". */
    name?: string;
    /** When omitted, expectations are read from `expected.toolCalls` of the case. */
    expectedToolCalls?: ExpectedToolCall[];
    /** Compare calls position by position; defaults to true. */
    requireOrder?: boolean;
}
|
|
240
|
+
/** Compares the `toolCalls` array of the agent output with the expected tool calls. */
declare class ToolCallEvaluator implements EvalEvaluator {
    readonly name: string;
    private readonly expectedToolCalls?;
    private readonly requireOrder;
    constructor(options?: ToolCallEvaluatorOptions);
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
}
|
|
247
|
+
|
|
248
|
+
/** Builds the "pass_rate" metric as `passed / total`, or 0 when `total` is 0. */
declare function createPassRateMetric(passed: number, total: number): EvalMetric;
|
|
249
|
+
/** Builds the "average_score" metric as the arithmetic mean of `scores`, or 0 when empty. */
declare function createAverageScoreMetric(scores: number[]): EvalMetric;
|
|
250
|
+
/** Builds the "estimated_cost_usd" metric by summing numeric `metadata.estimatedCostUsd` values. */
declare function createCostMetric(results: AgentExecutionResult[]): EvalMetric;
|
|
251
|
+
|
|
252
|
+
/** Input for report renderers: a run plus optional baseline comparison results. */
interface EvalReport {
    run: EvalRunResult;
    /** The Markdown renderer lists only entries whose `regressed` flag is true. */
    regressions?: EvalRegressionResult[];
    metadata?: JsonObject;
}
|
|
257
|
+
/** Turns an `EvalReport` into text. */
interface EvalReportRenderer {
    render(report: EvalReport): string;
}
|
|
260
|
+
|
|
261
|
+
/** Renders a report as JSON indented by two spaces. */
declare class JsonReportRenderer implements EvalReportRenderer {
    render(report: EvalReport): string;
}
|
|
264
|
+
|
|
265
|
+
/** Renders a report as Markdown with metrics, failed cases and regressions sections. */
declare class MarkdownReportRenderer implements EvalReportRenderer {
    render(report: EvalReport): string;
}
|
|
268
|
+
|
|
269
|
+
/** One criterion of a rubric. */
interface EvalRubricCriterion {
    /** Identifier handed to the criterion scorer and used to look up the weight. */
    key: string;
    description: string;
    /** Relative weight; defaults to 1. */
    weight?: number;
    threshold?: number;
    metadata?: JsonObject;
}
|
|
276
|
+
/** A versioned rubric made of weighted criteria. */
interface EvalRubric {
    id: string;
    name: string;
    version: string;
    criteria: EvalRubricCriterion[];
    /** Minimum weighted score required to pass; the evaluator defaults it to 1. */
    threshold?: number;
    metadata?: JsonObject;
}
|
|
284
|
+
|
|
285
|
+
/** Score returned for one rubric criterion. */
interface RubricCriterionScore {
    /** Key of the criterion that was scored. */
    key: string;
    score: number;
    passed: boolean;
    reason?: string;
    metadata?: JsonObject;
}
|
|
292
|
+
/** Aggregate rubric verdict together with its per-criterion scores. */
interface RubricScore {
    rubricId: string;
    rubricVersion: string;
    score: number;
    passed: boolean;
    criteria: RubricCriterionScore[];
    reason?: string;
    metadata?: JsonObject;
}
|
|
301
|
+
|
|
302
|
+
/** Options for `WeightedRubricEvaluator`. */
interface WeightedRubricEvaluatorOptions {
    rubric: EvalRubric;
    /** Called once per rubric criterion; may return the score synchronously or as a promise. */
    scoreCriterion: (criterionKey: string, input: EvalEvaluatorInput) => Promise<RubricCriterionScore> | RubricCriterionScore;
}
|
|
306
|
+
/** Scores a case against a rubric; the evaluator is named `rubric:<rubric id>`. */
declare class WeightedRubricEvaluator implements EvalEvaluator {
    readonly name: string;
    private readonly rubric;
    private readonly scoreCriterion;
    constructor(options: WeightedRubricEvaluatorOptions);
    evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;
}
|
|
313
|
+
|
|
314
|
+
/** Function that executes one case in a run context and resolves to the agent's execution result. */
type EvalCaseExecutor = (testCase: EvalCase, context: EvalRunContext) => Promise<AgentExecutionResult>;
|
|
315
|
+
|
|
316
|
+
/** Parameters for a single `EvalRunner.run` call; only `datasetId` is mandatory. */
interface EvalRunConfig {
    runId?: string;
    datasetId: string;
    datasetVersion?: string;
    agentKey?: AgentKey;
    modelId?: string;
    promptVersion?: string;
    threshold?: number;
    metadata?: JsonObject;
}
|
|
326
|
+
|
|
327
|
+
/** Collaborators an `EvalRunner` is constructed with. */
interface EvalRunnerOptions {
    datasetStore: EvalDatasetStore;
    evaluators: EvalEvaluator[];
    executor: EvalCaseExecutor;
}
|
|
332
|
+
/** Runs a dataset using the configured store, evaluators and case executor. */
declare class EvalRunner {
    private readonly datasetStore;
    private readonly evaluators;
    private readonly executor;
    constructor(options: EvalRunnerOptions);
    /** Resolves to the result of the run described by `config`. */
    run(config: EvalRunConfig): Promise<EvalRunResult>;
}
|
|
339
|
+
|
|
340
|
+
export { BaselineComparator, type BaselineComparatorOptions, CompositeEvaluator, type CompositeEvaluatorOptions, type EvalBaseline, type EvalCase, type EvalCaseBuilderInput, type EvalCaseExecutor, type EvalCaseResult, type EvalDataset, type EvalDatasetSplit, type EvalDatasetStore, EvalError, type EvalErrorCode, type EvalEvaluator, type EvalEvaluatorInput, type EvalMetric, type EvalRegressionResult, type EvalReport, type EvalReportRenderer, type EvalRubric, type EvalRubricCriterion, type EvalRunConfig, type EvalRunContext, type EvalRunResult, type EvalRunSummary, EvalRunner, type EvalRunnerOptions, type EvalScore, ExactMatchEvaluator, type ExactMatchEvaluatorOptions, type ExpectedToolCall, InMemoryEvalDatasetStore, JsonReportRenderer, JsonSchemaEvaluator, type JsonSchemaEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorOptions, MarkdownReportRenderer, RegexEvaluator, type RegexEvaluatorOptions, type RubricCriterionScore, type RubricScore, SemanticSimilarityEvaluator, type SemanticSimilarityEvaluatorOptions, ToolCallEvaluator, type ToolCallEvaluatorOptions, type WeightedEvalEvaluator, WeightedRubricEvaluator, type WeightedRubricEvaluatorOptions, createAverageScoreMetric, createCostMetric, createEvalCase, createEvalScore, createPassRateMetric };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,578 @@
|
|
|
1
|
+
// src/baselines/baseline-comparator.ts
|
|
2
|
+
/**
 * Compares the metrics of a current run against a baseline run.
 *
 * Only metrics present in both runs are reported. A metric whose name contains
 * "cost" or "latency" is treated as lower-is-better; every other metric is
 * treated as higher-is-better. Thresholds default to 0 per metric.
 */
var BaselineComparator = class {
  thresholds;
  constructor(options = {}) {
    this.thresholds = options.thresholds ?? {};
  }
  /**
   * @param baseline Run whose metrics act as the reference.
   * @param current Run being checked.
   * @returns One regression record per metric name found in both runs.
   */
  compare(baseline, current) {
    const baselineByName = new Map();
    for (const { name, value } of baseline.metrics) {
      baselineByName.set(name, value);
    }
    const comparisons = [];
    for (const metric of current.metrics) {
      if (!baselineByName.has(metric.name)) continue;
      const baselineValue = baselineByName.get(metric.name) ?? 0;
      const threshold = this.thresholds[metric.name] ?? 0;
      const delta = metric.value - baselineValue;
      const lowerIsBetter = ["cost", "latency"].some((marker) => metric.name.includes(marker));
      // Lower-is-better metrics regress when they grow; the others when they shrink.
      const regressed = lowerIsBetter ? delta > threshold : delta < -threshold;
      comparisons.push({
        metricName: metric.name,
        baselineValue,
        currentValue: metric.value,
        delta,
        regressed,
        threshold
      });
    }
    return comparisons;
  }
};
|
|
26
|
+
|
|
27
|
+
// src/cases/eval-case-builder.ts
|
|
28
|
+
/**
 * Builds an eval case from builder input.
 *
 * The four mandatory fields are always copied; `expected`, `tags` and
 * `metadata` are copied only when they are not undefined, so the resulting
 * object never carries explicit undefined properties.
 */
function createEvalCase(input) {
  const { id, name, agentKey } = input;
  const built = { id, name, agentKey, input: input.input };
  for (const optionalKey of ["expected", "tags", "metadata"]) {
    const optionalValue = input[optionalKey];
    if (optionalValue !== void 0) built[optionalKey] = optionalValue;
  }
  return built;
}
|
|
40
|
+
|
|
41
|
+
// src/datasets/in-memory-dataset-store.ts
|
|
42
|
+
/**
 * Dataset store backed by an in-process Map keyed by `id@version`.
 *
 * Fix: the "latest version" lookup used a plain lexicographic `localeCompare`,
 * which ranks "9.0.0" above "10.0.0". It now uses numeric-aware collation so
 * that digit runs are compared as numbers.
 */
var InMemoryEvalDatasetStore = class {
  datasets = /* @__PURE__ */ new Map();
  /** @param datasets Optional datasets to pre-populate the store with. */
  constructor(datasets = []) {
    for (const dataset of datasets) {
      this.datasets.set(this.key(dataset.id, dataset.version), dataset);
    }
  }
  /** @returns Every stored dataset, in insertion order. */
  async list() {
    return [...this.datasets.values()];
  }
  /**
   * @param id Dataset id.
   * @param version Exact version; when omitted the highest stored version for `id` is returned.
   * @returns The matching dataset, or undefined when none is stored.
   */
  async get(id, version) {
    if (version !== void 0) return this.datasets.get(this.key(id, version));
    let latest;
    for (const dataset of this.datasets.values()) {
      if (dataset.id !== id) continue;
      // numeric: true compares digit sequences by value ("10" > "9").
      if (latest === void 0 || dataset.version.localeCompare(latest.version, void 0, { numeric: true }) > 0) {
        latest = dataset;
      }
    }
    return latest;
  }
  /** Inserts the dataset, replacing any entry with the same id and version. */
  async put(dataset) {
    this.datasets.set(this.key(dataset.id, dataset.version), dataset);
  }
  /** Builds the composite map key for an id/version pair. */
  key(id, version) {
    return `${id}@${version}`;
  }
};
|
|
63
|
+
|
|
64
|
+
// src/errors/eval-error.ts
|
|
65
|
+
/**
 * Error type used by the eval package. It carries a machine-readable `code`
 * and, when supplied, a `metadata` object alongside the message.
 */
var EvalError = class extends Error {
  code;
  metadata;
  constructor(code, message, metadata) {
    super(message);
    this.code = code;
    this.name = "EvalError";
    // Leave `metadata` unassigned when the caller did not provide any.
    if (metadata === void 0) return;
    this.metadata = metadata;
  }
};
|
|
75
|
+
|
|
76
|
+
// src/evaluators/composite-evaluator.ts
|
|
77
|
+
/**
 * Runs several evaluators and folds their scores into one weighted average.
 *
 * The composite passes only when the weighted score reaches the threshold
 * (default 1) and every child evaluator passed. A missing weight counts as 1.
 */
var CompositeEvaluator = class {
  name;
  evaluators;
  threshold;
  constructor(options) {
    const { name, evaluators, threshold } = options;
    this.name = name ?? "composite";
    this.evaluators = evaluators;
    this.threshold = threshold ?? 1;
  }
  /**
   * @param input Evaluator input forwarded unchanged to every child evaluator.
   * @returns The combined score; metadata lists the threshold and each child score.
   */
  async evaluate(input) {
    const scores = await Promise.all(this.evaluators.map((entry) => entry.evaluator.evaluate(input)));
    let weightSum = 0;
    for (const entry of this.evaluators) {
      weightSum += entry.weight ?? 1;
    }
    // Fall back to 1 so a zero total weight never divides by zero.
    const totalWeight = weightSum || 1;
    let weightedSum = 0;
    scores.forEach((item, index) => {
      weightedSum += item.score * (this.evaluators[index]?.weight ?? 1);
    });
    const score = weightedSum / totalWeight;
    const passed = score >= this.threshold && scores.every((item) => item.passed);
    const details = scores.map((item) => ({
      name: item.name,
      score: item.score,
      passed: item.passed,
      reason: item.reason ?? null
    }));
    return {
      name: this.name,
      score,
      passed,
      reason: passed ? "Composite score passed." : "Composite score failed.",
      metadata: { threshold: this.threshold, scores: details }
    };
  }
};
|
|
108
|
+
|
|
109
|
+
// src/evaluators/evaluator.interface.ts
|
|
110
|
+
/**
 * Identity helper for building a score object; it returns the very same
 * object it was given.
 */
function createEvalScore(input) {
  const score = input;
  return score;
}
|
|
113
|
+
|
|
114
|
+
// src/evaluators/exact-match-evaluator.ts
|
|
115
|
+
/**
 * Passes when the normalized agent output equals the normalized expected value.
 *
 * Strings are compared directly and other values through their JSON
 * serialization. Normalization optionally trims (default on) and lower-cases
 * (default off).
 *
 * Fix: a run that did not complete used to normalize to "" and therefore
 * "matched" a case whose expected value was undefined or empty. Such runs now
 * always fail.
 */
var ExactMatchEvaluator = class {
  name;
  ignoreCase;
  trim;
  constructor(options = {}) {
    this.name = options.name ?? "exact_match";
    this.ignoreCase = options.ignoreCase ?? false;
    this.trim = options.trim ?? true;
  }
  /**
   * @param input Evaluator input; `result.output` is read only when `result.status` is "completed".
   * @returns A score of 1 (match) or 0 (no match, or execution not completed).
   */
  async evaluate(input) {
    if (input.result.status !== "completed") {
      // There is no output to compare, so this can never count as a match.
      return {
        name: this.name,
        score: 0,
        passed: false,
        reason: "Agent execution did not complete."
      };
    }
    const passed = this.normalize(input.result.output) === this.normalize(input.testCase.expected);
    return {
      name: this.name,
      score: passed ? 1 : 0,
      passed,
      reason: passed ? "Output matches expected value." : "Output does not match expected value."
    };
  }
  /** Serializes a value to text and applies the configured trim and case folding. */
  normalize(value) {
    // JSON.stringify yields undefined for undefined/functions; treat that as "".
    const raw = typeof value === "string" ? value : (JSON.stringify(value) ?? "");
    const trimmed = this.trim ? raw.trim() : raw;
    return this.ignoreCase ? trimmed.toLowerCase() : trimmed;
  }
};
|
|
141
|
+
|
|
142
|
+
// src/evaluators/json-schema-evaluator.ts
|
|
143
|
+
/**
 * Validates the agent output against the configured schema and reports every
 * validation error, joined with "; ", as the failure reason.
 */
var JsonSchemaEvaluator = class {
  name;
  schema;
  constructor(options) {
    const { name, schema } = options;
    this.name = name ?? "json_schema";
    this.schema = schema;
  }
  /**
   * @param input Evaluator input; a result that is not "completed" is validated as undefined.
   * @returns A score of 1 when no validation error was found, otherwise 0.
   */
  async evaluate(input) {
    const { result } = input;
    const output = result.status === "completed" ? result.output : void 0;
    const problems = validateAgainstSchema(output, this.schema);
    if (problems.length === 0) {
      return { name: this.name, score: 1, passed: true, reason: "Output matches schema." };
    }
    return { name: this.name, score: 0, passed: false, reason: problems.join("; ") };
  }
};
|
|
162
|
+
/**
 * Minimal schema validator supporting `type`, `required` and nested
 * `properties`.
 *
 * Fixes:
 * - `required` and `properties` used the `in` operator, so inherited keys such
 *   as "constructor" or "toString" counted as present. Only own properties are
 *   considered now.
 * - A value that is not a plain object (null, array) under `type: "object"` is
 *   reported explicitly instead of silently producing no errors.
 *
 * @param value Value to validate.
 * @param schema Schema object.
 * @param path Path prefix used in error messages; "$" denotes the root.
 * @returns Human-readable error messages; empty when the value is valid.
 */
function validateAgainstSchema(value, schema, path = "$") {
  const errors = [];
  const type = schema.type;
  if (typeof type === "string" && !matchesType(value, type)) {
    errors.push(`${path} expected ${type}`);
    return errors;
  }
  if (type !== "object") return errors;
  if (!isRecord(value)) {
    errors.push(`${path} expected object`);
    return errors;
  }
  const hasOwn = (key) => Object.prototype.hasOwnProperty.call(value, key);
  const required = Array.isArray(schema.required) ? schema.required.filter((item) => typeof item === "string") : [];
  for (const key of required) {
    if (!hasOwn(key)) errors.push(`${path}.${key} is required`);
  }
  const properties = isRecord(schema.properties) ? schema.properties : {};
  for (const [key, childSchema] of Object.entries(properties)) {
    // Absent optional properties are skipped; present ones are validated recursively.
    if (hasOwn(key) && isRecord(childSchema)) {
      errors.push(...validateAgainstSchema(value[key], childSchema, `${path}.${key}`));
    }
  }
  return errors;
}
|
|
183
|
+
/**
 * Tells whether `value` matches a schema primitive type name.
 *
 * Fix: "object" fell through to `typeof value === type`, which is also true
 * for null and arrays. It now accepts plain (non-null, non-array) objects only.
 *
 * @param value Value to classify.
 * @param type Schema type name such as "string", "integer", "array" or "object".
 * @returns true when the value is of the given type.
 */
function matchesType(value, type) {
  if (type === "array") return Array.isArray(value);
  if (type === "null") return value === null;
  if (type === "integer") return Number.isInteger(value);
  if (type === "object") return typeof value === "object" && value !== null && !Array.isArray(value);
  return typeof value === type;
}
|
|
189
|
+
/** Returns true for non-null objects that are not arrays. */
function isRecord(value) {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === "object";
}
|
|
192
|
+
|
|
193
|
+
// src/evaluators/llm-judge-evaluator.ts
|
|
194
|
+
/**
 * Uses a model as judge: it sends the rubric, case input, expected value and
 * actual output to the model gateway and converts the JSON reply into a score.
 *
 * If the reply carries no `passed` flag, the score is compared with the
 * threshold (default 0.8). If no score can be read, the score is 0.
 */
var LlmJudgeEvaluator = class {
  name;
  modelGateway;
  model;
  threshold;
  rubric;
  metadata;
  constructor(options) {
    const { name, modelGateway, threshold, model, rubric, metadata } = options;
    this.name = name ?? "llm_judge";
    this.modelGateway = modelGateway;
    this.threshold = threshold ?? 0.8;
    // Optional settings stay unassigned when the caller did not provide them.
    if (model !== void 0) this.model = model;
    if (rubric !== void 0) this.rubric = rubric;
    if (metadata !== void 0) this.metadata = metadata;
  }
  /**
   * @param input Evaluator input; for a result that is not "completed" the whole result object is sent as `actual`.
   * @returns The judge verdict; metadata records threshold, judge model and token usage.
   */
  async evaluate(input) {
    const { testCase, result } = input;
    const systemMessage = {
      role: "system",
      content: 'You are an evaluator. Return JSON only with shape {"score": number, "passed": boolean, "reason": string}.'
    };
    const userMessage = {
      role: "user",
      content: JSON.stringify({
        rubric: this.rubric,
        input: testCase.input,
        expected: testCase.expected,
        actual: result.status === "completed" ? result.output : result
      })
    };
    const request = {
      taskType: "eval_judge",
      agentKey: testCase.agentKey,
      messages: [systemMessage, userMessage]
    };
    if (this.model !== void 0) request.model = this.model;
    if (this.metadata !== void 0) request.metadata = this.metadata;
    const response = await this.modelGateway.generate(request);
    const verdict = parseJudgeResponse(response.content);
    const score = verdict.score ?? 0;
    const passed = verdict.passed ?? score >= this.threshold;
    // Round-trip through JSON so the usage object is stored as plain JSON data.
    let usage = null;
    if (response.usage !== void 0) usage = JSON.parse(JSON.stringify(response.usage));
    return {
      name: this.name,
      score,
      passed,
      reason: verdict.reason ?? response.content,
      metadata: {
        threshold: this.threshold,
        judgeModel: this.model ?? null,
        usage
      }
    };
  }
};
|
|
250
|
+
/**
 * Extracts `{ score, passed, reason }` from a judge model reply.
 *
 * Improvements:
 * - Models often wrap JSON in a Markdown code fence or surround it with prose.
 *   When the raw reply is not valid JSON, the fenced block and then the
 *   outermost `{ ... }` span are tried before giving up.
 * - Non-finite scores (e.g. `1e999` parses to Infinity) are ignored.
 *
 * A reply that is valid JSON but not an object still yields `{}`.
 *
 * @param content Raw text returned by the judge model.
 * @returns An object holding only the correctly typed fields that were found.
 */
function parseJudgeResponse(content) {
  if (typeof content !== "string") return {};
  const isPlainObject = (value) => typeof value === "object" && value !== null && !Array.isArray(value);
  const pickFields = (parsed) => {
    if (!isPlainObject(parsed)) return {};
    const result = {};
    if (typeof parsed.score === "number" && Number.isFinite(parsed.score)) result.score = parsed.score;
    if (typeof parsed.passed === "boolean") result.passed = parsed.passed;
    if (typeof parsed.reason === "string") result.reason = parsed.reason;
    return result;
  };
  try {
    return pickFields(JSON.parse(content));
  } catch {
    // Not pure JSON; fall through to the lenient extraction below.
  }
  const candidates = [];
  const fenced = content.match(/```(?:json)?\s*([\s\S]*?)```/i);
  if (fenced) candidates.push(fenced[1]);
  const firstBrace = content.indexOf("{");
  const lastBrace = content.lastIndexOf("}");
  if (firstBrace !== -1 && lastBrace > firstBrace) candidates.push(content.slice(firstBrace, lastBrace + 1));
  for (const candidate of candidates) {
    try {
      const parsed = JSON.parse(candidate);
      if (isPlainObject(parsed)) return pickFields(parsed);
    } catch {
      // Try the next candidate.
    }
  }
  return {};
}
|
|
263
|
+
/** Returns true for non-null objects that are not arrays. */
function isRecord2(value) {
  const isObjectLike = typeof value === "object" && value !== null;
  return isObjectLike && !Array.isArray(value);
}
|
|
266
|
+
|
|
267
|
+
// src/evaluators/regex-evaluator.ts
|
|
268
|
+
/**
 * Passes when the agent output matches the configured pattern. Non-string
 * output is matched against its JSON serialization.
 *
 * Fixes:
 * - A pattern with the `g` or `y` flag keeps `lastIndex` between `test()`
 *   calls, so repeated evaluations alternated between pass and fail.
 *   `lastIndex` is now reset before every test.
 * - For a run that did not complete, the pattern was tested against the
 *   literal text "undefined". Such runs now fail without running the pattern.
 */
var RegexEvaluator = class {
  name;
  pattern;
  constructor(options) {
    this.name = options.name ?? "regex_match";
    this.pattern = options.pattern;
  }
  /**
   * @param input Evaluator input; `result.output` is read only when `result.status` is "completed".
   * @returns A score of 1 on match, otherwise 0; metadata carries the pattern source.
   */
  async evaluate(input) {
    const metadata = { pattern: this.pattern.source };
    if (input.result.status !== "completed") {
      return {
        name: this.name,
        score: 0,
        passed: false,
        reason: "Agent execution did not complete.",
        metadata
      };
    }
    const actual = input.result.output;
    // JSON.stringify(undefined) is undefined; match against "" in that case.
    const content = typeof actual === "string" ? actual : (JSON.stringify(actual) ?? "");
    // Global and sticky regexes are stateful; always start matching at index 0.
    this.pattern.lastIndex = 0;
    const passed = this.pattern.test(content);
    return {
      name: this.name,
      score: passed ? 1 : 0,
      passed,
      reason: passed ? "Output matches pattern." : "Output does not match pattern.",
      metadata
    };
  }
};
|
|
288
|
+
|
|
289
|
+
// src/evaluators/semantic-similarity-evaluator.ts
|
|
290
|
+
/**
 * Delegates scoring to a caller-supplied similarity function and passes when
 * the returned value reaches the threshold (default 0.8).
 */
var SemanticSimilarityEvaluator = class {
  name;
  threshold;
  similarity;
  constructor(options) {
    const { name, threshold, similarity } = options;
    this.name = name ?? "semantic_similarity";
    this.threshold = threshold ?? 0.8;
    this.similarity = similarity;
  }
  /**
   * @param input Evaluator input; a result that is not "completed" contributes undefined as output.
   * @returns The similarity as score, with the threshold recorded in metadata.
   */
  async evaluate(input) {
    const output = input.result.status === "completed" ? input.result.output : void 0;
    const actual = stringify(output);
    const expected = stringify(input.testCase.expected);
    const score = await this.similarity(actual, expected);
    const passed = score >= this.threshold;
    return {
      name: this.name,
      score,
      passed,
      reason: passed ? "Semantic similarity passed." : "Semantic similarity below threshold.",
      metadata: { threshold: this.threshold }
    };
  }
};
|
|
312
|
+
/**
 * Converts a value to text for similarity scoring: strings pass through and
 * everything else is JSON-serialized.
 *
 * Fix: `JSON.stringify` returns undefined for undefined, functions and
 * symbols, which leaked a non-string into the `(actual, expected)` similarity
 * callback. Those values now become "", matching how the exact-match
 * evaluator normalizes them.
 *
 * @param value Any value.
 * @returns Always a string.
 */
function stringify(value) {
  if (typeof value === "string") return value;
  return JSON.stringify(value) ?? "";
}
|
|
315
|
+
|
|
316
|
+
// src/evaluators/tool-call-evaluator.ts
|
|
317
|
+
/**
 * Compares the tool calls found in the agent output with the expected ones.
 *
 * Expected calls come from the constructor options or, when not configured,
 * from `expected.toolCalls` of the case. With `requireOrder` (default true)
 * calls are compared position by position; otherwise in any order.
 */
var ToolCallEvaluator = class {
  name;
  expectedToolCalls;
  requireOrder;
  constructor(options = {}) {
    const { name, expectedToolCalls, requireOrder } = options;
    this.name = name ?? "tool_calls";
    if (expectedToolCalls !== void 0) this.expectedToolCalls = expectedToolCalls;
    this.requireOrder = requireOrder ?? true;
  }
  /**
   * @param input Evaluator input; a result that is not "completed" counts as having made no tool calls.
   * @returns A score of 1 on match, otherwise 0; metadata lists expected and actual tool names.
   */
  async evaluate(input) {
    const { result, testCase } = input;
    const actualCalls = result.status === "completed" ? extractToolCalls(result.output) : [];
    const expectedCalls = this.expectedToolCalls ?? extractExpectedToolCalls(testCase.expected);
    const matcher = this.requireOrder ? matchesInOrder : matchesAnyOrder;
    const passed = matcher(actualCalls, expectedCalls);
    const toName = (call) => call.name;
    return {
      name: this.name,
      score: passed ? 1 : 0,
      passed,
      reason: passed ? "Tool calls match expected calls." : "Tool calls do not match expected calls.",
      metadata: {
        expected: expectedCalls.map(toName),
        actual: actualCalls.map(toName)
      }
    };
  }
};
|
|
342
|
+
/**
 * Reads `value.toolCalls` and keeps the entries that look like tool calls.
 * Returns an empty array when the value is not an object or has no such array.
 */
function extractToolCalls(value) {
  if (!isRecord3(value)) return [];
  const { toolCalls } = value;
  return Array.isArray(toolCalls) ? toolCalls.filter(isToolCall) : [];
}
|
|
348
|
+
/**
 * Reads `value.toolCalls` from an expected value and keeps the well-formed
 * entries. Returns an empty array when no such list is present.
 */
function extractExpectedToolCalls(value) {
  const candidates = isRecord3(value) ? value.toolCalls : void 0;
  if (!Array.isArray(candidates)) return [];
  return candidates.filter(isExpectedToolCall);
}
|
|
354
|
+
/**
 * Position-by-position comparison: the i-th expected call must match the i-th
 * actual call. Actual calls beyond the expected list are not inspected.
 */
function matchesInOrder(actual, expected) {
  const hasMismatch = expected.some((expectedCall, position) => !matchesCall(actual[position], expectedCall));
  return !hasMismatch;
}
|
|
357
|
+
/**
 * Order-insensitive comparison: every expected call must be matched by a
 * distinct actual call.
 *
 * Fix: a single actual call could satisfy several expected calls, so expecting
 * the same tool twice passed when it was invoked once. Matched actual calls
 * are now consumed. Expected calls that pin an input are matched first so a
 * name-only expectation cannot take the only actual call a stricter
 * expectation needs.
 *
 * @param actual Tool calls made by the agent.
 * @param expected Tool calls that must all be present.
 * @returns true when each expected call can be paired with its own actual call.
 */
function matchesAnyOrder(actual, expected) {
  const remaining = [...actual];
  const mostSpecificFirst = expected.slice().sort(
    (a, b) => Number(b.input !== void 0) - Number(a.input !== void 0)
  );
  return mostSpecificFirst.every((expectedCall) => {
    const index = remaining.findIndex((actualCall) => matchesCall(actualCall, expectedCall));
    if (index === -1) return false;
    remaining.splice(index, 1);
    return true;
  });
}
|
|
360
|
+
/**
 * Tells whether an actual tool call satisfies an expected one.
 *
 * Names must be equal. When the expectation specifies an input, the inputs
 * must be structurally equal.
 *
 * Fix: inputs were compared with plain `JSON.stringify`, which depends on key
 * insertion order, so `{a, b}` and `{b, a}` counted as different. Both sides
 * are now serialized with object keys sorted. Array order still matters.
 *
 * @param actual Actual call, or undefined when there is none at that position.
 * @param expected Expected call; `input` is optional.
 * @returns true when the call matches.
 */
function matchesCall(actual, expected) {
  if (actual === void 0 || actual.name !== expected.name) return false;
  if (expected.input === void 0) return true;
  return canonicalToolInputJson(actual.input) === canonicalToolInputJson(expected.input);
}
/** Serializes a value to JSON with the keys of every nested object sorted. */
function canonicalToolInputJson(value) {
  return JSON.stringify(value, (_key, item) => {
    if (typeof item !== "object" || item === null || Array.isArray(item)) return item;
    // Null prototype so that a "__proto__" key is kept as ordinary data.
    const sorted = Object.create(null);
    for (const key of Object.keys(item).sort()) sorted[key] = item[key];
    return sorted;
  });
}
|
|
365
|
+
/** Returns true for a plain object whose `name` property is a string. */
function isToolCall(value) {
  if (typeof value !== "object" || value === null || Array.isArray(value)) return false;
  return typeof value.name === "string";
}
|
|
368
|
+
/** Returns true for a plain object whose `name` property is a string. */
function isExpectedToolCall(value) {
  const isPlainObject = typeof value === "object" && value !== null && !Array.isArray(value);
  return isPlainObject && typeof value.name === "string";
}
|
|
371
|
+
/** Returns true for non-null objects that are not arrays. */
function isRecord3(value) {
  if (typeof value !== "object") return false;
  return value !== null && !Array.isArray(value);
}
|
|
374
|
+
|
|
375
|
+
// src/metrics/metric-helpers.ts
|
|
376
|
+
/**
 * Builds the "pass_rate" metric as the ratio of passed to total cases.
 * A total of 0 yields 0 rather than dividing by zero. Both raw counts are
 * kept in the metric metadata.
 */
function createPassRateMetric(passed, total) {
  let value = 0;
  if (total !== 0) {
    value = passed / total;
  }
  return { name: "pass_rate", value, unit: "ratio", metadata: { passed, total } };
}
|
|
384
|
+
/**
 * Builds the "average_score" metric as the arithmetic mean of the given
 * scores. An empty list yields 0.
 */
function createAverageScoreMetric(scores) {
  let value = 0;
  if (scores.length !== 0) {
    let total = 0;
    for (const score of scores) {
      total += score;
    }
    value = total / scores.length;
  }
  return { name: "average_score", value, unit: "ratio" };
}
|
|
391
|
+
/**
 * Builds the "estimated_cost_usd" metric by summing
 * `metadata.estimatedCostUsd` over all results. Results without metadata, or
 * with a non-numeric value, add nothing.
 */
function createCostMetric(results) {
  let total = 0;
  for (const result of results) {
    const estimate = result.metadata?.estimatedCostUsd;
    if (typeof estimate === "number") {
      total += estimate;
    } else {
      total += 0;
    }
  }
  return { name: "estimated_cost_usd", value: total, unit: "usd" };
}
|
|
402
|
+
|
|
403
|
+
// src/reports/json-report-renderer.ts
|
|
404
|
+
/** Renders an eval report as pretty-printed JSON with two-space indentation. */
var JsonReportRenderer = class {
  render(report) {
    const indentWidth = 2;
    return JSON.stringify(report, null, indentWidth);
  }
};
|
|
409
|
+
|
|
410
|
+
// src/reports/markdown-report-renderer.ts
|
|
411
|
+
/**
 * Renders an eval report as Markdown: a header with the run summary, then
 * "Metrics", "Failed Cases" and "Regressions" sections. Empty failure and
 * regression sections show "- none".
 */
var MarkdownReportRenderer = class {
  render(report) {
    const { run } = report;
    const failedCases = run.results.filter((entry) => !entry.passed);
    // Only comparisons flagged as regressed are listed.
    const regressed = report.regressions?.filter((entry) => entry.regressed) ?? [];
    const lines = [
      `# Eval Report ${run.runId}`,
      "",
      `Dataset: ${run.datasetId}@${run.datasetVersion}`,
      `Passed: ${run.passed ? "yes" : "no"}`,
      `Pass rate: ${formatRatio(run.summary.passRate)}`,
      `Average score: ${formatRatio(run.summary.averageScore)}`,
      "",
      "## Metrics"
    ];
    for (const metric of run.metrics) {
      const unitSuffix = metric.unit ? ` ${metric.unit}` : "";
      lines.push(`- ${metric.name}: ${metric.value}${unitSuffix}`);
    }
    lines.push("", "## Failed Cases");
    if (failedCases.length === 0) {
      lines.push("- none");
    } else {
      for (const entry of failedCases) {
        const reasons = entry.scores
          .filter((score) => !score.passed)
          .map((score) => `${score.name}: ${score.reason ?? "failed"}`);
        lines.push(`- ${entry.caseId}: ${reasons.join(", ")}`);
      }
    }
    lines.push("", "## Regressions");
    if (regressed.length === 0) {
      lines.push("- none");
    } else {
      for (const entry of regressed) {
        lines.push(`- ${entry.metricName}: ${entry.baselineValue} -> ${entry.currentValue}`);
      }
    }
    return lines.join("\n");
  }
};
|
|
437
|
+
function formatRatio(value) {
|
|
438
|
+
return `${Math.round(value * 1e4) / 100}%`;
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
// src/rubrics/weighted-rubric-evaluator.ts
|
|
442
|
+
/**
 * Evaluator that scores a result against every criterion of a rubric and
 * combines the per-criterion scores into a single weighted score.
 */
var WeightedRubricEvaluator = class {
  name;
  rubric;
  scoreCriterion;

  /**
   * @param options - `rubric` (its `id`, `version`, `criteria` with optional
   *   `weight`, and optional `threshold`) and `scoreCriterion`, a sync or async
   *   callback invoked as `(criterionKey, input)` once per criterion.
   */
  constructor(options) {
    this.name = `rubric:${options.rubric.id}`;
    this.rubric = options.rubric;
    this.scoreCriterion = options.scoreCriterion;
  }

  /**
   * Scores every rubric criterion and aggregates the results.
   *
   * The score is the weighted mean of the criterion scores (a missing weight
   * counts as 1). The rubric passes only when that score reaches the rubric
   * threshold (default 1) and every criterion passed individually.
   *
   * @param input - Evaluator input, forwarded unchanged to `scoreCriterion`.
   * @returns The rubric score, with per-criterion details in `metadata.criteria`.
   */
  async evaluate(input) {
    const rubricCriteria = this.rubric.criteria;
    const criteria = await Promise.all(
      rubricCriteria.map((criterion) => this.scoreCriterion(criterion.key, input))
    );
    // `|| 1` avoids dividing by zero for an empty rubric or all-zero weights.
    const totalWeight = rubricCriteria.reduce((sum, item) => sum + (item.weight ?? 1), 0) || 1;
    // Promise.all keeps input order, so criteria[i] is the score for
    // rubricCriteria[i]. Pairing by position keeps each score on the same
    // weight that was counted in totalWeight, even when the scorer returns a
    // different or missing key or the rubric repeats a key.
    const weightedSum = criteria.reduce(
      (sum, item, index) => sum + item.score * (rubricCriteria[index]?.weight ?? 1),
      0
    );
    const score = weightedSum / totalWeight;
    const threshold = this.rubric.threshold ?? 1;
    const passed = score >= threshold && criteria.every((criterion) => criterion.passed);
    return {
      name: this.name,
      score,
      passed,
      reason: passed ? "Rubric passed." : "Rubric failed.",
      metadata: {
        rubricId: this.rubric.id,
        rubricVersion: this.rubric.version,
        threshold,
        criteria: criteria.map((criterion) => ({
          key: criterion.key,
          score: criterion.score,
          passed: criterion.passed,
          reason: criterion.reason ?? null
        }))
      }
    };
  }
};
|
|
481
|
+
|
|
482
|
+
// src/runners/eval-runner.ts
|
|
483
|
+
/**
 * Runs every case of a dataset through an executor, scores each execution
 * result with the configured evaluators, and aggregates a run result.
 */
var EvalRunner = class {
  datasetStore;
  evaluators;
  executor;

  /**
   * @param options - `datasetStore` (looked up via `get(id, version)`),
   *   `evaluators` (each exposing `evaluate`), and `executor`, invoked as
   *   `(testCase, context)` for every case.
   */
  constructor(options) {
    const { datasetStore, evaluators, executor } = options;
    this.datasetStore = datasetStore;
    this.evaluators = evaluators;
    this.executor = executor;
  }

  /**
   * Executes and scores all cases of the configured dataset, one case at a time.
   *
   * @param config - Run configuration: `datasetId`, optional `datasetVersion`,
   *   `runId`, `agentKey`, `modelId`, `promptVersion`, `metadata`, `threshold`.
   * @returns The run result with per-case results, summary, and metrics.
   * @throws EvalError with code "EVAL_DATASET_NOT_FOUND" when the store returns undefined.
   */
  async run(config) {
    const dataset = await this.datasetStore.get(config.datasetId, config.datasetVersion);
    if (dataset === undefined) {
      throw new EvalError("EVAL_DATASET_NOT_FOUND", `Eval dataset not found: ${config.datasetId}`);
    }

    // Optional settings are added only when supplied, so absent keys stay absent.
    const context = {
      runId: config.runId ?? createRunId(),
      datasetId: dataset.id,
      datasetVersion: dataset.version,
      ...(config.agentKey !== undefined ? { agentKey: config.agentKey } : {}),
      ...(config.modelId !== undefined ? { modelId: config.modelId } : {}),
      ...(config.promptVersion !== undefined ? { promptVersion: config.promptVersion } : {}),
      ...(config.metadata !== undefined ? { metadata: config.metadata } : {})
    };

    const caseResults = [];
    for (const testCase of dataset.cases) {
      // Cases are awaited in sequence; the evaluators of one case run concurrently.
      const result = await this.executor(testCase, context);
      const pendingScores = this.evaluators.map((evaluator) => evaluator.evaluate({ testCase, result, context }));
      const caseScores = await Promise.all(pendingScores);

      let caseTotal = 0;
      caseScores.forEach((entry) => {
        caseTotal = caseTotal + entry.score;
      });
      const caseAverage = caseScores.length === 0 ? 0 : caseTotal / caseScores.length;

      // A case passes only if it completed, every evaluator passed, and the
      // plain (unweighted) average reaches the threshold (default 1).
      let passed = false;
      if (result.status === "completed" && caseScores.every((entry) => entry.passed)) {
        passed = caseAverage >= (config.threshold ?? 1);
      }

      caseResults.push({ caseId: testCase.id, testCase, result, scores: caseScores, passed });
    }

    const totalCases = caseResults.length;
    const passedCases = caseResults.filter((entry) => entry.passed).length;

    // Flatten every individual evaluator score across all cases.
    const scores = [];
    for (const entry of caseResults) {
      for (const item of entry.scores) {
        scores.push(item.score);
      }
    }
    let scoreTotal = 0;
    scores.forEach((value) => {
      scoreTotal = scoreTotal + value;
    });

    const summary = {
      totalCases,
      passedCases,
      failedCases: totalCases - passedCases,
      passRate: totalCases === 0 ? 0 : passedCases / totalCases,
      averageScore: scores.length === 0 ? 0 : scoreTotal / scores.length
    };

    return {
      runId: context.runId,
      datasetId: dataset.id,
      datasetVersion: dataset.version,
      results: caseResults,
      summary,
      metrics: [
        createPassRateMetric(summary.passedCases, summary.totalCases),
        createAverageScoreMetric(scores),
        createCostMetric(caseResults.map((entry) => entry.result))
      ],
      passed: summary.failedCases === 0,
      ...(config.metadata !== undefined ? { metadata: config.metadata } : {})
    };
  }
};
|
|
554
|
+
/**
 * Creates an id for an eval run when the caller did not supply one.
 *
 * The id has the form `eval_<timestamp>_<suffix>`: the current millisecond
 * timestamp in base 36 followed by eight random base-36 characters. The
 * suffix keeps ids apart when several runs start within the same millisecond,
 * which the timestamp alone cannot do. It uses Math.random, so ids are
 * collision-resistant but not cryptographically unpredictable.
 *
 * @returns A run id string starting with "eval_".
 */
function createRunId() {
  const timestamp = Date.now().toString(36);
  // slice(2) drops the leading "0."; padEnd covers short random expansions.
  const suffix = Math.random().toString(36).slice(2, 10).padEnd(8, "0");
  return `eval_${timestamp}_${suffix}`;
}
|
|
557
|
+
export {
|
|
558
|
+
BaselineComparator,
|
|
559
|
+
CompositeEvaluator,
|
|
560
|
+
EvalError,
|
|
561
|
+
EvalRunner,
|
|
562
|
+
ExactMatchEvaluator,
|
|
563
|
+
InMemoryEvalDatasetStore,
|
|
564
|
+
JsonReportRenderer,
|
|
565
|
+
JsonSchemaEvaluator,
|
|
566
|
+
LlmJudgeEvaluator,
|
|
567
|
+
MarkdownReportRenderer,
|
|
568
|
+
RegexEvaluator,
|
|
569
|
+
SemanticSimilarityEvaluator,
|
|
570
|
+
ToolCallEvaluator,
|
|
571
|
+
WeightedRubricEvaluator,
|
|
572
|
+
createAverageScoreMetric,
|
|
573
|
+
createCostMetric,
|
|
574
|
+
createEvalCase,
|
|
575
|
+
createEvalScore,
|
|
576
|
+
createPassRateMetric
|
|
577
|
+
};
|
|
578
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/baselines/baseline-comparator.ts","../src/cases/eval-case-builder.ts","../src/datasets/in-memory-dataset-store.ts","../src/errors/eval-error.ts","../src/evaluators/composite-evaluator.ts","../src/evaluators/evaluator.interface.ts","../src/evaluators/exact-match-evaluator.ts","../src/evaluators/json-schema-evaluator.ts","../src/evaluators/llm-judge-evaluator.ts","../src/evaluators/regex-evaluator.ts","../src/evaluators/semantic-similarity-evaluator.ts","../src/evaluators/tool-call-evaluator.ts","../src/metrics/metric-helpers.ts","../src/reports/json-report-renderer.ts","../src/reports/markdown-report-renderer.ts","../src/rubrics/weighted-rubric-evaluator.ts","../src/runners/eval-runner.ts"],"sourcesContent":["import type { EvalRunResult } from '../runners/eval-run-result';\nimport type { EvalRegressionResult } from './regression-result';\n\nexport interface BaselineComparatorOptions {\n thresholds?: Record<string, number>;\n}\n\nexport class BaselineComparator {\n private readonly thresholds: Record<string, number>;\n\n constructor(options: BaselineComparatorOptions = {}) {\n this.thresholds = options.thresholds ?? {};\n }\n\n compare(baseline: EvalRunResult, current: EvalRunResult): EvalRegressionResult[] {\n const baselineMetrics = new Map(baseline.metrics.map((metric) => [metric.name, metric.value]));\n\n return current.metrics\n .filter((metric) => baselineMetrics.has(metric.name))\n .map((metric) => {\n const baselineValue = baselineMetrics.get(metric.name) ?? 0;\n const threshold = this.thresholds[metric.name] ?? 0;\n const delta = metric.value - baselineValue;\n const lowerIsBetter = metric.name.includes('cost') || metric.name.includes('latency');\n const regressed = lowerIsBetter ? 
delta > threshold : delta < -threshold;\n\n return {\n metricName: metric.name,\n baselineValue,\n currentValue: metric.value,\n delta,\n regressed,\n threshold,\n };\n });\n }\n}\n","import type { AgentKey, JsonObject } from '@hemia-ai/agents-core';\nimport type { EvalCase } from './eval-case';\n\nexport interface EvalCaseBuilderInput<TInput = unknown, TExpected = unknown> {\n id: string;\n name: string;\n agentKey: AgentKey;\n input: TInput;\n expected?: TExpected;\n tags?: string[];\n metadata?: JsonObject;\n}\n\nexport function createEvalCase<TInput = unknown, TExpected = unknown>(\n input: EvalCaseBuilderInput<TInput, TExpected>,\n): EvalCase<TInput, TExpected> {\n const testCase: EvalCase<TInput, TExpected> = {\n id: input.id,\n name: input.name,\n agentKey: input.agentKey,\n input: input.input,\n };\n\n if (input.expected !== undefined) testCase.expected = input.expected;\n if (input.tags !== undefined) testCase.tags = input.tags;\n if (input.metadata !== undefined) testCase.metadata = input.metadata;\n\n return testCase;\n}\n","import type { EvalDatasetStore } from './dataset-registry';\nimport type { EvalDataset } from './eval-dataset';\n\nexport class InMemoryEvalDatasetStore implements EvalDatasetStore {\n private readonly datasets = new Map<string, EvalDataset>();\n\n constructor(datasets: EvalDataset[] = []) {\n for (const dataset of datasets) {\n this.datasets.set(this.key(dataset.id, dataset.version), dataset);\n }\n }\n\n async list(): Promise<EvalDataset[]> {\n return [...this.datasets.values()];\n }\n\n async get(id: string, version?: string): Promise<EvalDataset | undefined> {\n if (version !== undefined) return this.datasets.get(this.key(id, version));\n\n return [...this.datasets.values()]\n .filter((dataset) => dataset.id === id)\n .sort((a, b) => b.version.localeCompare(a.version))[0];\n }\n\n async put(dataset: EvalDataset): Promise<void> {\n this.datasets.set(this.key(dataset.id, dataset.version), dataset);\n }\n\n private key(id: string, 
version: string): string {\n return `${id}@${version}`;\n }\n}\n","import type { JsonObject } from '@hemia-ai/agents-core';\nimport type { EvalErrorCode } from './eval-error-code';\n\nexport class EvalError extends Error {\n readonly code: EvalErrorCode;\n readonly metadata?: JsonObject;\n\n constructor(code: EvalErrorCode, message: string, metadata?: JsonObject) {\n super(message);\n this.name = 'EvalError';\n this.code = code;\n if (metadata !== undefined) this.metadata = metadata;\n }\n}\n","import type { EvalEvaluator, EvalEvaluatorInput, EvalScore } from './evaluator.interface';\n\nexport interface WeightedEvalEvaluator {\n evaluator: EvalEvaluator;\n weight?: number;\n}\n\nexport interface CompositeEvaluatorOptions {\n name?: string;\n evaluators: WeightedEvalEvaluator[];\n threshold?: number;\n}\n\nexport class CompositeEvaluator implements EvalEvaluator {\n readonly name: string;\n private readonly evaluators: WeightedEvalEvaluator[];\n private readonly threshold: number;\n\n constructor(options: CompositeEvaluatorOptions) {\n this.name = options.name ?? 'composite';\n this.evaluators = options.evaluators;\n this.threshold = options.threshold ?? 1;\n }\n\n async evaluate(input: EvalEvaluatorInput): Promise<EvalScore> {\n const scores = await Promise.all(this.evaluators.map(({ evaluator }) => evaluator.evaluate(input)));\n const totalWeight = this.evaluators.reduce((sum, item) => sum + (item.weight ?? 1), 0) || 1;\n const score = scores.reduce((sum, item, index) => sum + item.score * (this.evaluators[index]?.weight ?? 1), 0) / totalWeight;\n const passed = score >= this.threshold && scores.every((item) => item.passed);\n\n return {\n name: this.name,\n score,\n passed,\n reason: passed ? 'Composite score passed.' : 'Composite score failed.',\n metadata: {\n threshold: this.threshold,\n scores: scores.map((item) => ({\n name: item.name,\n score: item.score,\n passed: item.passed,\n reason: item.reason ?? 
null,\n })),\n },\n };\n }\n}\n","import type { AgentExecutionResult, JsonObject } from '@hemia-ai/agents-core';\nimport type { EvalCase } from '../cases/eval-case';\nimport type { EvalRunContext } from '../runners/eval-run-context';\n\nexport interface EvalEvaluatorInput {\n testCase: EvalCase;\n result: AgentExecutionResult;\n context: EvalRunContext;\n}\n\nexport interface EvalScore {\n name: string;\n score: number;\n passed: boolean;\n reason?: string;\n metadata?: JsonObject;\n}\n\nexport interface EvalEvaluator {\n name: string;\n evaluate(input: EvalEvaluatorInput): Promise<EvalScore>;\n}\n\nexport function createEvalScore(input: EvalScore): EvalScore {\n return input;\n}\n","import type { EvalEvaluator, EvalEvaluatorInput, EvalScore } from './evaluator.interface';\n\nexport interface ExactMatchEvaluatorOptions {\n name?: string;\n ignoreCase?: boolean;\n trim?: boolean;\n}\n\nexport class ExactMatchEvaluator implements EvalEvaluator {\n readonly name: string;\n private readonly ignoreCase: boolean;\n private readonly trim: boolean;\n\n constructor(options: ExactMatchEvaluatorOptions = {}) {\n this.name = options.name ?? 'exact_match';\n this.ignoreCase = options.ignoreCase ?? false;\n this.trim = options.trim ?? true;\n }\n\n async evaluate(input: EvalEvaluatorInput): Promise<EvalScore> {\n const actual = input.result.status === 'completed' ? input.result.output : undefined;\n const expected = input.testCase.expected;\n const passed = this.normalize(actual) === this.normalize(expected);\n\n return {\n name: this.name,\n score: passed ? 1 : 0,\n passed,\n reason: passed ? 'Output matches expected value.' : 'Output does not match expected value.',\n };\n }\n\n private normalize(value: unknown): string {\n const raw = typeof value === 'string' ? value : JSON.stringify(value) ?? '';\n const trimmed = this.trim ? raw.trim() : raw;\n return this.ignoreCase ? 
trimmed.toLowerCase() : trimmed;\n }\n}\n","import type { JsonObject } from '@hemia-ai/agents-core';\nimport type { EvalEvaluator, EvalEvaluatorInput, EvalScore } from './evaluator.interface';\n\nexport interface JsonSchemaEvaluatorOptions {\n name?: string;\n schema: JsonObject;\n}\n\nexport class JsonSchemaEvaluator implements EvalEvaluator {\n readonly name: string;\n private readonly schema: JsonObject;\n\n constructor(options: JsonSchemaEvaluatorOptions) {\n this.name = options.name ?? 'json_schema';\n this.schema = options.schema;\n }\n\n async evaluate(input: EvalEvaluatorInput): Promise<EvalScore> {\n const actual = input.result.status === 'completed' ? input.result.output : undefined;\n const errors = validateAgainstSchema(actual, this.schema);\n const passed = errors.length === 0;\n\n return {\n name: this.name,\n score: passed ? 1 : 0,\n passed,\n reason: passed ? 'Output matches schema.' : errors.join('; '),\n };\n }\n}\n\nfunction validateAgainstSchema(value: unknown, schema: JsonObject, path = '$'): string[] {\n const errors: string[] = [];\n const type = schema.type;\n\n if (typeof type === 'string' && !matchesType(value, type)) {\n errors.push(`${path} expected ${type}`);\n return errors;\n }\n\n if (type === 'object' && isRecord(value)) {\n const required = Array.isArray(schema.required) ? schema.required.filter((item): item is string => typeof item === 'string') : [];\n for (const key of required) {\n if (!(key in value)) errors.push(`${path}.${key} is required`);\n }\n\n const properties = isRecord(schema.properties) ? 
schema.properties : {};\n for (const [key, childSchema] of Object.entries(properties)) {\n if (key in value && isRecord(childSchema)) {\n errors.push(...validateAgainstSchema(value[key], childSchema, `${path}.${key}`));\n }\n }\n }\n\n return errors;\n}\n\nfunction matchesType(value: unknown, type: string): boolean {\n if (type === 'array') return Array.isArray(value);\n if (type === 'null') return value === null;\n if (type === 'integer') return Number.isInteger(value);\n return typeof value === type;\n}\n\nfunction isRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === 'object' && value !== null && !Array.isArray(value);\n}\n","import type { JsonObject } from '@hemia-ai/agents-core';\nimport type { AiGenerateInput } from '@hemia-ai/agents-models';\nimport type { ModelGateway } from '@hemia-ai/agents-models';\nimport type { EvalEvaluator, EvalEvaluatorInput, EvalScore } from './evaluator.interface';\n\nexport interface LlmJudgeEvaluatorOptions {\n name?: string;\n modelGateway: Pick<ModelGateway, 'generate'>;\n model?: string;\n threshold?: number;\n rubric?: string;\n metadata?: JsonObject;\n}\n\nexport class LlmJudgeEvaluator implements EvalEvaluator {\n readonly name: string;\n private readonly modelGateway: Pick<ModelGateway, 'generate'>;\n private readonly model?: string;\n private readonly threshold: number;\n private readonly rubric?: string;\n private readonly metadata?: JsonObject;\n\n constructor(options: LlmJudgeEvaluatorOptions) {\n this.name = options.name ?? 'llm_judge';\n this.modelGateway = options.modelGateway;\n this.threshold = options.threshold ?? 
0.8;\n if (options.model !== undefined) this.model = options.model;\n if (options.rubric !== undefined) this.rubric = options.rubric;\n if (options.metadata !== undefined) this.metadata = options.metadata;\n }\n\n async evaluate(input: EvalEvaluatorInput): Promise<EvalScore> {\n const request: AiGenerateInput = {\n taskType: 'eval_judge',\n agentKey: input.testCase.agentKey,\n messages: [\n {\n role: 'system',\n content:\n 'You are an evaluator. Return JSON only with shape {\"score\": number, \"passed\": boolean, \"reason\": string}.',\n },\n {\n role: 'user',\n content: JSON.stringify({\n rubric: this.rubric,\n input: input.testCase.input,\n expected: input.testCase.expected,\n actual: input.result.status === 'completed' ? input.result.output : input.result,\n }),\n },\n ],\n };\n const response = await this.modelGateway.generate({\n ...request,\n ...(this.model !== undefined ? { model: this.model } : {}),\n ...(this.metadata !== undefined ? { metadata: this.metadata } : {}),\n });\n\n const parsed = parseJudgeResponse(response.content);\n const score = parsed.score ?? 0;\n const passed = parsed.passed ?? score >= this.threshold;\n\n return {\n name: this.name,\n score,\n passed,\n reason: parsed.reason ?? response.content,\n metadata: {\n threshold: this.threshold,\n judgeModel: this.model ?? null,\n usage: response.usage === undefined ? 
null : JSON.parse(JSON.stringify(response.usage)),\n },\n };\n }\n}\n\nfunction parseJudgeResponse(content: string): { score?: number; passed?: boolean; reason?: string } {\n try {\n const parsed = JSON.parse(content) as unknown;\n if (!isRecord(parsed)) return {};\n\n const result: { score?: number; passed?: boolean; reason?: string } = {};\n if (typeof parsed.score === 'number') result.score = parsed.score;\n if (typeof parsed.passed === 'boolean') result.passed = parsed.passed;\n if (typeof parsed.reason === 'string') result.reason = parsed.reason;\n return result;\n } catch {\n return {};\n }\n}\n\nfunction isRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === 'object' && value !== null && !Array.isArray(value);\n}\n","import type { EvalEvaluator, EvalEvaluatorInput, EvalScore } from './evaluator.interface';\n\nexport interface RegexEvaluatorOptions {\n name?: string;\n pattern: RegExp;\n}\n\nexport class RegexEvaluator implements EvalEvaluator {\n readonly name: string;\n private readonly pattern: RegExp;\n\n constructor(options: RegexEvaluatorOptions) {\n this.name = options.name ?? 'regex_match';\n this.pattern = options.pattern;\n }\n\n async evaluate(input: EvalEvaluatorInput): Promise<EvalScore> {\n const actual = input.result.status === 'completed' ? input.result.output : undefined;\n const content = typeof actual === 'string' ? actual : JSON.stringify(actual);\n const passed = this.pattern.test(content);\n\n return {\n name: this.name,\n score: passed ? 1 : 0,\n passed,\n reason: passed ? 'Output matches pattern.' 
: 'Output does not match pattern.',\n metadata: { pattern: this.pattern.source },\n };\n }\n}\n","import type { EvalEvaluator, EvalEvaluatorInput, EvalScore } from './evaluator.interface';\n\nexport interface SemanticSimilarityEvaluatorOptions {\n name?: string;\n threshold?: number;\n similarity: (actual: string, expected: string) => Promise<number> | number;\n}\n\nexport class SemanticSimilarityEvaluator implements EvalEvaluator {\n readonly name: string;\n private readonly threshold: number;\n private readonly similarity: (actual: string, expected: string) => Promise<number> | number;\n\n constructor(options: SemanticSimilarityEvaluatorOptions) {\n this.name = options.name ?? 'semantic_similarity';\n this.threshold = options.threshold ?? 0.8;\n this.similarity = options.similarity;\n }\n\n async evaluate(input: EvalEvaluatorInput): Promise<EvalScore> {\n const actual = stringify(input.result.status === 'completed' ? input.result.output : undefined);\n const expected = stringify(input.testCase.expected);\n const score = await this.similarity(actual, expected);\n\n return {\n name: this.name,\n score,\n passed: score >= this.threshold,\n reason: score >= this.threshold ? 'Semantic similarity passed.' : 'Semantic similarity below threshold.',\n metadata: { threshold: this.threshold },\n };\n }\n}\n\nfunction stringify(value: unknown): string {\n return typeof value === 'string' ? 
value : JSON.stringify(value);\n}\n","import type { ToolCall } from '@hemia-ai/agents-core';\nimport type { EvalEvaluator, EvalEvaluatorInput, EvalScore } from './evaluator.interface';\n\nexport interface ExpectedToolCall {\n name: string;\n input?: unknown;\n}\n\nexport interface ToolCallEvaluatorOptions {\n name?: string;\n expectedToolCalls?: ExpectedToolCall[];\n requireOrder?: boolean;\n}\n\nexport class ToolCallEvaluator implements EvalEvaluator {\n readonly name: string;\n private readonly expectedToolCalls?: ExpectedToolCall[];\n private readonly requireOrder: boolean;\n\n constructor(options: ToolCallEvaluatorOptions = {}) {\n this.name = options.name ?? 'tool_calls';\n if (options.expectedToolCalls !== undefined) this.expectedToolCalls = options.expectedToolCalls;\n this.requireOrder = options.requireOrder ?? true;\n }\n\n async evaluate(input: EvalEvaluatorInput): Promise<EvalScore> {\n const actualToolCalls = input.result.status === 'completed' ? extractToolCalls(input.result.output) : [];\n const expectedToolCalls = this.expectedToolCalls ?? extractExpectedToolCalls(input.testCase.expected);\n const passed = this.requireOrder\n ? matchesInOrder(actualToolCalls, expectedToolCalls)\n : matchesAnyOrder(actualToolCalls, expectedToolCalls);\n\n return {\n name: this.name,\n score: passed ? 1 : 0,\n passed,\n reason: passed ? 'Tool calls match expected calls.' 
: 'Tool calls do not match expected calls.',\n metadata: {\n expected: expectedToolCalls.map((call) => call.name),\n actual: actualToolCalls.map((call) => call.name),\n },\n };\n }\n}\n\nfunction extractToolCalls(value: unknown): ToolCall[] {\n if (isRecord(value) && Array.isArray(value.toolCalls)) {\n return value.toolCalls.filter(isToolCall);\n }\n\n return [];\n}\n\nfunction extractExpectedToolCalls(value: unknown): ExpectedToolCall[] {\n if (isRecord(value) && Array.isArray(value.toolCalls)) {\n return value.toolCalls.filter(isExpectedToolCall);\n }\n\n return [];\n}\n\nfunction matchesInOrder(actual: ToolCall[], expected: ExpectedToolCall[]): boolean {\n return expected.every((expectedCall, index) => matchesCall(actual[index], expectedCall));\n}\n\nfunction matchesAnyOrder(actual: ToolCall[], expected: ExpectedToolCall[]): boolean {\n return expected.every((expectedCall) => actual.some((actualCall) => matchesCall(actualCall, expectedCall)));\n}\n\nfunction matchesCall(actual: ToolCall | undefined, expected: ExpectedToolCall): boolean {\n if (actual === undefined || actual.name !== expected.name) return false;\n if (expected.input === undefined) return true;\n return JSON.stringify(actual.input) === JSON.stringify(expected.input);\n}\n\nfunction isToolCall(value: unknown): value is ToolCall {\n return isRecord(value) && typeof value.name === 'string';\n}\n\nfunction isExpectedToolCall(value: unknown): value is ExpectedToolCall {\n return isRecord(value) && typeof value.name === 'string';\n}\n\nfunction isRecord(value: unknown): value is Record<string, unknown> {\n return typeof value === 'object' && value !== null && !Array.isArray(value);\n}\n","import type { AgentExecutionResult } from '@hemia-ai/agents-core';\nimport type { EvalMetric } from './eval-metric';\n\nexport function createPassRateMetric(passed: number, total: number): EvalMetric {\n return {\n name: 'pass_rate',\n value: total === 0 ? 
0 : passed / total,\n unit: 'ratio',\n metadata: { passed, total },\n };\n}\n\nexport function createAverageScoreMetric(scores: number[]): EvalMetric {\n return {\n name: 'average_score',\n value: scores.length === 0 ? 0 : scores.reduce((sum, score) => sum + score, 0) / scores.length,\n unit: 'ratio',\n };\n}\n\nexport function createCostMetric(results: AgentExecutionResult[]): EvalMetric {\n const cost = results.reduce((sum, result) => {\n const value = result.metadata?.estimatedCostUsd;\n return sum + (typeof value === 'number' ? value : 0);\n }, 0);\n\n return {\n name: 'estimated_cost_usd',\n value: cost,\n unit: 'usd',\n };\n}\n","import type { EvalReport, EvalReportRenderer } from './eval-report';\n\nexport class JsonReportRenderer implements EvalReportRenderer {\n render(report: EvalReport): string {\n return JSON.stringify(report, null, 2);\n }\n}\n","import type { EvalReport, EvalReportRenderer } from './eval-report';\n\nexport class MarkdownReportRenderer implements EvalReportRenderer {\n render(report: EvalReport): string {\n const failed = report.run.results.filter((result) => !result.passed);\n const regressions = report.regressions?.filter((result) => result.regressed) ?? [];\n\n return [\n `# Eval Report ${report.run.runId}`,\n '',\n `Dataset: ${report.run.datasetId}@${report.run.datasetVersion}`,\n `Passed: ${report.run.passed ? 'yes' : 'no'}`,\n `Pass rate: ${formatRatio(report.run.summary.passRate)}`,\n `Average score: ${formatRatio(report.run.summary.averageScore)}`,\n '',\n '## Metrics',\n ...report.run.metrics.map((metric) => `- ${metric.name}: ${metric.value}${metric.unit ? ` ${metric.unit}` : ''}`),\n '',\n '## Failed Cases',\n ...(failed.length === 0\n ? ['- none']\n : failed.map((result) => {\n const reasons = result.scores.filter((score) => !score.passed).map((score) => `${score.name}: ${score.reason ?? 'failed'}`);\n return `- ${result.caseId}: ${reasons.join(', ')}`;\n })),\n '',\n '## Regressions',\n ...(regressions.length === 0\n ? 
['- none']\n : regressions.map((result) => `- ${result.metricName}: ${result.baselineValue} -> ${result.currentValue}`)),\n ].join('\\n');\n }\n}\n\nfunction formatRatio(value: number): string {\n return `${Math.round(value * 10000) / 100}%`;\n}\n","import type { EvalEvaluator, EvalEvaluatorInput, EvalScore } from '../evaluators/evaluator.interface';\nimport type { EvalRubric } from './eval-rubric';\nimport type { RubricCriterionScore } from './rubric-score';\n\nexport interface WeightedRubricEvaluatorOptions {\n rubric: EvalRubric;\n scoreCriterion: (criterionKey: string, input: EvalEvaluatorInput) => Promise<RubricCriterionScore> | RubricCriterionScore;\n}\n\nexport class WeightedRubricEvaluator implements EvalEvaluator {\n readonly name: string;\n private readonly rubric: EvalRubric;\n private readonly scoreCriterion: (criterionKey: string, input: EvalEvaluatorInput) => Promise<RubricCriterionScore> | RubricCriterionScore;\n\n constructor(options: WeightedRubricEvaluatorOptions) {\n this.name = `rubric:${options.rubric.id}`;\n this.rubric = options.rubric;\n this.scoreCriterion = options.scoreCriterion;\n }\n\n async evaluate(input: EvalEvaluatorInput): Promise<EvalScore> {\n const criteria = await Promise.all(\n this.rubric.criteria.map((criterion) => this.scoreCriterion(criterion.key, input)),\n );\n const totalWeight = this.rubric.criteria.reduce((sum, item) => sum + (item.weight ?? 1), 0) || 1;\n const score = criteria.reduce((sum, item) => {\n const weight = this.rubric.criteria.find((criterion) => criterion.key === item.key)?.weight ?? 1;\n return sum + item.score * weight;\n }, 0) / totalWeight;\n const threshold = this.rubric.threshold ?? 1;\n const passed = score >= threshold && criteria.every((criterion) => criterion.passed);\n\n return {\n name: this.name,\n score,\n passed,\n reason: passed ? 'Rubric passed.' 
: 'Rubric failed.',\n metadata: {\n rubricId: this.rubric.id,\n rubricVersion: this.rubric.version,\n threshold,\n criteria: criteria.map((criterion) => ({\n key: criterion.key,\n score: criterion.score,\n passed: criterion.passed,\n reason: criterion.reason ?? null,\n })),\n },\n };\n }\n}\n","import type { EvalDatasetStore } from '../datasets/dataset-registry';\nimport { EvalError } from '../errors/eval-error';\nimport type { EvalEvaluator } from '../evaluators/evaluator.interface';\nimport { createAverageScoreMetric, createCostMetric, createPassRateMetric } from '../metrics/metric-helpers';\nimport type { EvalCaseExecutor } from './eval-case-executor';\nimport type { EvalRunConfig } from './eval-run-config';\nimport type { EvalRunContext } from './eval-run-context';\nimport type { EvalCaseResult, EvalRunResult } from './eval-run-result';\n\nexport interface EvalRunnerOptions {\n datasetStore: EvalDatasetStore;\n evaluators: EvalEvaluator[];\n executor: EvalCaseExecutor;\n}\n\nexport class EvalRunner {\n private readonly datasetStore: EvalDatasetStore;\n private readonly evaluators: EvalEvaluator[];\n private readonly executor: EvalCaseExecutor;\n\n constructor(options: EvalRunnerOptions) {\n this.datasetStore = options.datasetStore;\n this.evaluators = options.evaluators;\n this.executor = options.executor;\n }\n\n async run(config: EvalRunConfig): Promise<EvalRunResult> {\n const dataset = await this.datasetStore.get(config.datasetId, config.datasetVersion);\n if (dataset === undefined) {\n throw new EvalError('EVAL_DATASET_NOT_FOUND', `Eval dataset not found: ${config.datasetId}`);\n }\n\n const context: EvalRunContext = {\n runId: config.runId ?? 
createRunId(),\n datasetId: dataset.id,\n datasetVersion: dataset.version,\n };\n if (config.agentKey !== undefined) context.agentKey = config.agentKey;\n if (config.modelId !== undefined) context.modelId = config.modelId;\n if (config.promptVersion !== undefined) context.promptVersion = config.promptVersion;\n if (config.metadata !== undefined) context.metadata = config.metadata;\n\n const results: EvalCaseResult[] = [];\n\n for (const testCase of dataset.cases) {\n const result = await this.executor(testCase, context);\n const scores = await Promise.all(\n this.evaluators.map((evaluator) =>\n evaluator.evaluate({\n testCase,\n result,\n context,\n }),\n ),\n );\n const averageScore = scores.length === 0 ? 0 : scores.reduce((sum, score) => sum + score.score, 0) / scores.length;\n const passed = result.status === 'completed' && scores.every((score) => score.passed) && averageScore >= (config.threshold ?? 1);\n\n results.push({\n caseId: testCase.id,\n testCase,\n result,\n scores,\n passed,\n });\n }\n\n const passedCases = results.filter((result) => result.passed).length;\n const scores = results.flatMap((result) => result.scores.map((score) => score.score));\n const summary = {\n totalCases: results.length,\n passedCases,\n failedCases: results.length - passedCases,\n passRate: results.length === 0 ? 0 : passedCases / results.length,\n averageScore: scores.length === 0 ? 
0 : scores.reduce((sum, score) => sum + score, 0) / scores.length,\n };\n\n const runResult: EvalRunResult = {\n runId: context.runId,\n datasetId: dataset.id,\n datasetVersion: dataset.version,\n results,\n summary,\n metrics: [\n createPassRateMetric(summary.passedCases, summary.totalCases),\n createAverageScoreMetric(scores),\n createCostMetric(results.map((result) => result.result)),\n ],\n passed: summary.failedCases === 0,\n };\n if (config.metadata !== undefined) runResult.metadata = config.metadata;\n\n return runResult;\n }\n}\n\nfunction createRunId(): string {\n return `eval_${Date.now().toString(36)}`;\n}\n"],"mappings":";AAOO,IAAM,qBAAN,MAAyB;AAAA,EACb;AAAA,EAEjB,YAAY,UAAqC,CAAC,GAAG;AACnD,SAAK,aAAa,QAAQ,cAAc,CAAC;AAAA,EAC3C;AAAA,EAEA,QAAQ,UAAyB,SAAgD;AAC/E,UAAM,kBAAkB,IAAI,IAAI,SAAS,QAAQ,IAAI,CAAC,WAAW,CAAC,OAAO,MAAM,OAAO,KAAK,CAAC,CAAC;AAE7F,WAAO,QAAQ,QACZ,OAAO,CAAC,WAAW,gBAAgB,IAAI,OAAO,IAAI,CAAC,EACnD,IAAI,CAAC,WAAW;AACf,YAAM,gBAAgB,gBAAgB,IAAI,OAAO,IAAI,KAAK;AAC1D,YAAM,YAAY,KAAK,WAAW,OAAO,IAAI,KAAK;AAClD,YAAM,QAAQ,OAAO,QAAQ;AAC7B,YAAM,gBAAgB,OAAO,KAAK,SAAS,MAAM,KAAK,OAAO,KAAK,SAAS,SAAS;AACpF,YAAM,YAAY,gBAAgB,QAAQ,YAAY,QAAQ,CAAC;AAE/D,aAAO;AAAA,QACL,YAAY,OAAO;AAAA,QACnB;AAAA,QACA,cAAc,OAAO;AAAA,QACrB;AAAA,QACA;AAAA,QACA;AAAA,MACF;AAAA,IACF,CAAC;AAAA,EACL;AACF;;;ACvBO,SAAS,eACd,OAC6B;AAC7B,QAAM,WAAwC;AAAA,IAC5C,IAAI,MAAM;AAAA,IACV,MAAM,MAAM;AAAA,IACZ,UAAU,MAAM;AAAA,IAChB,OAAO,MAAM;AAAA,EACf;AAEA,MAAI,MAAM,aAAa,OAAW,UAAS,WAAW,MAAM;AAC5D,MAAI,MAAM,SAAS,OAAW,UAAS,OAAO,MAAM;AACpD,MAAI,MAAM,aAAa,OAAW,UAAS,WAAW,MAAM;AAE5D,SAAO;AACT;;;ACzBO,IAAM,2BAAN,MAA2D;AAAA,EAC/C,WAAW,oBAAI,IAAyB;AAAA,EAEzD,YAAY,WAA0B,CAAC,GAAG;AACxC,eAAW,WAAW,UAAU;AAC9B,WAAK,SAAS,IAAI,KAAK,IAAI,QAAQ,IAAI,QAAQ,OAAO,GAAG,OAAO;AAAA,IAClE;AAAA,EACF;AAAA,EAEA,MAAM,OAA+B;AACnC,WAAO,CAAC,GAAG,KAAK,SAAS,OAAO,CAAC;AAAA,EACnC;AAAA,EAEA,MAAM,IAAI,IAAY,SAAoD;AACxE,QAAI,YAAY,OAAW,QAAO,KAAK,SAAS,IAAI,KAAK,IAAI,IAAI,OAAO,CAAC;AAEzE,WAAO,CAAC,GAAG,KAAK,SAAS,OAAO,CAAC,EAC9B,OAAO,CAAC,YAAY,QAAQ,OAAO,EA
AE,EACrC,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,cAAc,EAAE,OAAO,CAAC,EAAE,CAAC;AAAA,EACzD;AAAA,EAEA,MAAM,IAAI,SAAqC;AAC7C,SAAK,SAAS,IAAI,KAAK,IAAI,QAAQ,IAAI,QAAQ,OAAO,GAAG,OAAO;AAAA,EAClE;AAAA,EAEQ,IAAI,IAAY,SAAyB;AAC/C,WAAO,GAAG,EAAE,IAAI,OAAO;AAAA,EACzB;AACF;;;AC5BO,IAAM,YAAN,cAAwB,MAAM;AAAA,EAC1B;AAAA,EACA;AAAA,EAET,YAAY,MAAqB,SAAiB,UAAuB;AACvE,UAAM,OAAO;AACb,SAAK,OAAO;AACZ,SAAK,OAAO;AACZ,QAAI,aAAa,OAAW,MAAK,WAAW;AAAA,EAC9C;AACF;;;ACAO,IAAM,qBAAN,MAAkD;AAAA,EAC9C;AAAA,EACQ;AAAA,EACA;AAAA,EAEjB,YAAY,SAAoC;AAC9C,SAAK,OAAO,QAAQ,QAAQ;AAC5B,SAAK,aAAa,QAAQ;AAC1B,SAAK,YAAY,QAAQ,aAAa;AAAA,EACxC;AAAA,EAEA,MAAM,SAAS,OAA+C;AAC5D,UAAM,SAAS,MAAM,QAAQ,IAAI,KAAK,WAAW,IAAI,CAAC,EAAE,UAAU,MAAM,UAAU,SAAS,KAAK,CAAC,CAAC;AAClG,UAAM,cAAc,KAAK,WAAW,OAAO,CAAC,KAAK,SAAS,OAAO,KAAK,UAAU,IAAI,CAAC,KAAK;AAC1F,UAAM,QAAQ,OAAO,OAAO,CAAC,KAAK,MAAM,UAAU,MAAM,KAAK,SAAS,KAAK,WAAW,KAAK,GAAG,UAAU,IAAI,CAAC,IAAI;AACjH,UAAM,SAAS,SAAS,KAAK,aAAa,OAAO,MAAM,CAAC,SAAS,KAAK,MAAM;AAE5E,WAAO;AAAA,MACL,MAAM,KAAK;AAAA,MACX;AAAA,MACA;AAAA,MACA,QAAQ,SAAS,4BAA4B;AAAA,MAC7C,UAAU;AAAA,QACR,WAAW,KAAK;AAAA,QAChB,QAAQ,OAAO,IAAI,CAAC,UAAU;AAAA,UAC5B,MAAM,KAAK;AAAA,UACX,OAAO,KAAK;AAAA,UACZ,QAAQ,KAAK;AAAA,UACb,QAAQ,KAAK,UAAU;AAAA,QACzB,EAAE;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AACF;;;ACvBO,SAAS,gBAAgB,OAA6B;AAC3D,SAAO;AACT;;;ACjBO,IAAM,sBAAN,MAAmD;AAAA,EAC/C;AAAA,EACQ;AAAA,EACA;AAAA,EAEjB,YAAY,UAAsC,CAAC,GAAG;AACpD,SAAK,OAAO,QAAQ,QAAQ;AAC5B,SAAK,aAAa,QAAQ,cAAc;AACxC,SAAK,OAAO,QAAQ,QAAQ;AAAA,EAC9B;AAAA,EAEA,MAAM,SAAS,OAA+C;AAC5D,UAAM,SAAS,MAAM,OAAO,WAAW,cAAc,MAAM,OAAO,SAAS;AAC3E,UAAM,WAAW,MAAM,SAAS;AAChC,UAAM,SAAS,KAAK,UAAU,MAAM,MAAM,KAAK,UAAU,QAAQ;AAEjE,WAAO;AAAA,MACL,MAAM,KAAK;AAAA,MACX,OAAO,SAAS,IAAI;AAAA,MACpB;AAAA,MACA,QAAQ,SAAS,mCAAmC;AAAA,IACtD;AAAA,EACF;AAAA,EAEQ,UAAU,OAAwB;AACxC,UAAM,MAAM,OAAO,UAAU,WAAW,QAAQ,KAAK,UAAU,KAAK,KAAK;AACzE,UAAM,UAAU,KAAK,OAAO,IAAI,KAAK,IAAI;AACzC,WAAO,KAAK,aAAa,QAAQ,YAAY,IAAI;AAAA,EACnD;AACF;;;AC7BO,IAAM,sBAAN,MAAmD;AAAA,EAC/C;AAAA,EACQ;AAAA,EAEjB,YAAY,SAAqC;AAC/C,SAAK,OAAO,QAAQ,QAAQ;AAC5B,SAAK,SAAS,QAA
Q;AAAA,EACxB;AAAA,EAEA,MAAM,SAAS,OAA+C;AAC5D,UAAM,SAAS,MAAM,OAAO,WAAW,cAAc,MAAM,OAAO,SAAS;AAC3E,UAAM,SAAS,sBAAsB,QAAQ,KAAK,MAAM;AACxD,UAAM,SAAS,OAAO,WAAW;AAEjC,WAAO;AAAA,MACL,MAAM,KAAK;AAAA,MACX,OAAO,SAAS,IAAI;AAAA,MACpB;AAAA,MACA,QAAQ,SAAS,2BAA2B,OAAO,KAAK,IAAI;AAAA,IAC9D;AAAA,EACF;AACF;AAEA,SAAS,sBAAsB,OAAgB,QAAoB,OAAO,KAAe;AACvF,QAAM,SAAmB,CAAC;AAC1B,QAAM,OAAO,OAAO;AAEpB,MAAI,OAAO,SAAS,YAAY,CAAC,YAAY,OAAO,IAAI,GAAG;AACzD,WAAO,KAAK,GAAG,IAAI,aAAa,IAAI,EAAE;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,YAAY,SAAS,KAAK,GAAG;AACxC,UAAM,WAAW,MAAM,QAAQ,OAAO,QAAQ,IAAI,OAAO,SAAS,OAAO,CAAC,SAAyB,OAAO,SAAS,QAAQ,IAAI,CAAC;AAChI,eAAW,OAAO,UAAU;AAC1B,UAAI,EAAE,OAAO,OAAQ,QAAO,KAAK,GAAG,IAAI,IAAI,GAAG,cAAc;AAAA,IAC/D;AAEA,UAAM,aAAa,SAAS,OAAO,UAAU,IAAI,OAAO,aAAa,CAAC;AACtE,eAAW,CAAC,KAAK,WAAW,KAAK,OAAO,QAAQ,UAAU,GAAG;AAC3D,UAAI,OAAO,SAAS,SAAS,WAAW,GAAG;AACzC,eAAO,KAAK,GAAG,sBAAsB,MAAM,GAAG,GAAG,aAAa,GAAG,IAAI,IAAI,GAAG,EAAE,CAAC;AAAA,MACjF;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AACT;AAEA,SAAS,YAAY,OAAgB,MAAuB;AAC1D,MAAI,SAAS,QAAS,QAAO,MAAM,QAAQ,KAAK;AAChD,MAAI,SAAS,OAAQ,QAAO,UAAU;AACtC,MAAI,SAAS,UAAW,QAAO,OAAO,UAAU,KAAK;AACrD,SAAO,OAAO,UAAU;AAC1B;AAEA,SAAS,SAAS,OAAkD;AAClE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;;;ACpDO,IAAM,oBAAN,MAAiD;AAAA,EAC7C;AAAA,EACQ;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EAEjB,YAAY,SAAmC;AAC7C,SAAK,OAAO,QAAQ,QAAQ;AAC5B,SAAK,eAAe,QAAQ;AAC5B,SAAK,YAAY,QAAQ,aAAa;AACtC,QAAI,QAAQ,UAAU,OAAW,MAAK,QAAQ,QAAQ;AACtD,QAAI,QAAQ,WAAW,OAAW,MAAK,SAAS,QAAQ;AACxD,QAAI,QAAQ,aAAa,OAAW,MAAK,WAAW,QAAQ;AAAA,EAC9D;AAAA,EAEA,MAAM,SAAS,OAA+C;AAC5D,UAAM,UAA2B;AAAA,MAC/B,UAAU;AAAA,MACV,UAAU,MAAM,SAAS;AAAA,MACzB,UAAU;AAAA,QACR;AAAA,UACE,MAAM;AAAA,UACN,SACE;AAAA,QACJ;AAAA,QACA;AAAA,UACE,MAAM;AAAA,UACN,SAAS,KAAK,UAAU;AAAA,YACtB,QAAQ,KAAK;AAAA,YACb,OAAO,MAAM,SAAS;AAAA,YACtB,UAAU,MAAM,SAAS;AAAA,YACzB,QAAQ,MAAM,OAAO,WAAW,cAAc,MAAM,OAAO,SAAS,MAAM;AAAA,UAC5E,CAAC;AAAA,QACH;AAAA,MACF;AAAA,IACF;AACA,UAAM,WAAW,MAAM,KAAK,aAAa,SAAS;AAAA,MAChD,GAAG;AAAA,MACH,GAAI,KAAK,UAAU,SAAY,EAAE,OAAO,KAAK,
MAAM,IAAI,CAAC;AAAA,MACxD,GAAI,KAAK,aAAa,SAAY,EAAE,UAAU,KAAK,SAAS,IAAI,CAAC;AAAA,IACnE,CAAC;AAED,UAAM,SAAS,mBAAmB,SAAS,OAAO;AAClD,UAAM,QAAQ,OAAO,SAAS;AAC9B,UAAM,SAAS,OAAO,UAAU,SAAS,KAAK;AAE9C,WAAO;AAAA,MACL,MAAM,KAAK;AAAA,MACX;AAAA,MACA;AAAA,MACA,QAAQ,OAAO,UAAU,SAAS;AAAA,MAClC,UAAU;AAAA,QACR,WAAW,KAAK;AAAA,QAChB,YAAY,KAAK,SAAS;AAAA,QAC1B,OAAO,SAAS,UAAU,SAAY,OAAO,KAAK,MAAM,KAAK,UAAU,SAAS,KAAK,CAAC;AAAA,MACxF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,mBAAmB,SAAwE;AAClG,MAAI;AACF,UAAM,SAAS,KAAK,MAAM,OAAO;AACjC,QAAI,CAACA,UAAS,MAAM,EAAG,QAAO,CAAC;AAE/B,UAAM,SAAgE,CAAC;AACvE,QAAI,OAAO,OAAO,UAAU,SAAU,QAAO,QAAQ,OAAO;AAC5D,QAAI,OAAO,OAAO,WAAW,UAAW,QAAO,SAAS,OAAO;AAC/D,QAAI,OAAO,OAAO,WAAW,SAAU,QAAO,SAAS,OAAO;AAC9D,WAAO;AAAA,EACT,QAAQ;AACN,WAAO,CAAC;AAAA,EACV;AACF;AAEA,SAASA,UAAS,OAAkD;AAClE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;;;ACtFO,IAAM,iBAAN,MAA8C;AAAA,EAC1C;AAAA,EACQ;AAAA,EAEjB,YAAY,SAAgC;AAC1C,SAAK,OAAO,QAAQ,QAAQ;AAC5B,SAAK,UAAU,QAAQ;AAAA,EACzB;AAAA,EAEA,MAAM,SAAS,OAA+C;AAC5D,UAAM,SAAS,MAAM,OAAO,WAAW,cAAc,MAAM,OAAO,SAAS;AAC3E,UAAM,UAAU,OAAO,WAAW,WAAW,SAAS,KAAK,UAAU,MAAM;AAC3E,UAAM,SAAS,KAAK,QAAQ,KAAK,OAAO;AAExC,WAAO;AAAA,MACL,MAAM,KAAK;AAAA,MACX,OAAO,SAAS,IAAI;AAAA,MACpB;AAAA,MACA,QAAQ,SAAS,4BAA4B;AAAA,MAC7C,UAAU,EAAE,SAAS,KAAK,QAAQ,OAAO;AAAA,IAC3C;AAAA,EACF;AACF;;;ACrBO,IAAM,8BAAN,MAA2D;AAAA,EACvD;AAAA,EACQ;AAAA,EACA;AAAA,EAEjB,YAAY,SAA6C;AACvD,SAAK,OAAO,QAAQ,QAAQ;AAC5B,SAAK,YAAY,QAAQ,aAAa;AACtC,SAAK,aAAa,QAAQ;AAAA,EAC5B;AAAA,EAEA,MAAM,SAAS,OAA+C;AAC5D,UAAM,SAAS,UAAU,MAAM,OAAO,WAAW,cAAc,MAAM,OAAO,SAAS,MAAS;AAC9F,UAAM,WAAW,UAAU,MAAM,SAAS,QAAQ;AAClD,UAAM,QAAQ,MAAM,KAAK,WAAW,QAAQ,QAAQ;AAEpD,WAAO;AAAA,MACL,MAAM,KAAK;AAAA,MACX;AAAA,MACA,QAAQ,SAAS,KAAK;AAAA,MACtB,QAAQ,SAAS,KAAK,YAAY,gCAAgC;AAAA,MAClE,UAAU,EAAE,WAAW,KAAK,UAAU;AAAA,IACxC;AAAA,EACF;AACF;AAEA,SAAS,UAAU,OAAwB;AACzC,SAAO,OAAO,UAAU,WAAW,QAAQ,KAAK,UAAU,KAAK;AACjE;;;ACtBO,IAAM,oBAAN,MAAiD;AAAA,EAC7C;AAAA,EACQ;AAAA,EACA;AAAA,EAEjB,YAAY,UAAoC,CAAC,GAAG;AAClD,SAAK,OAAO,QAAQ,QAAQ;AAC5B,QAAI,QAAQ,sBAAsB,OAAW,
MAAK,oBAAoB,QAAQ;AAC9E,SAAK,eAAe,QAAQ,gBAAgB;AAAA,EAC9C;AAAA,EAEA,MAAM,SAAS,OAA+C;AAC5D,UAAM,kBAAkB,MAAM,OAAO,WAAW,cAAc,iBAAiB,MAAM,OAAO,MAAM,IAAI,CAAC;AACvG,UAAM,oBAAoB,KAAK,qBAAqB,yBAAyB,MAAM,SAAS,QAAQ;AACpG,UAAM,SAAS,KAAK,eAChB,eAAe,iBAAiB,iBAAiB,IACjD,gBAAgB,iBAAiB,iBAAiB;AAEtD,WAAO;AAAA,MACL,MAAM,KAAK;AAAA,MACX,OAAO,SAAS,IAAI;AAAA,MACpB;AAAA,MACA,QAAQ,SAAS,qCAAqC;AAAA,MACtD,UAAU;AAAA,QACR,UAAU,kBAAkB,IAAI,CAAC,SAAS,KAAK,IAAI;AAAA,QACnD,QAAQ,gBAAgB,IAAI,CAAC,SAAS,KAAK,IAAI;AAAA,MACjD;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,iBAAiB,OAA4B;AACpD,MAAIC,UAAS,KAAK,KAAK,MAAM,QAAQ,MAAM,SAAS,GAAG;AACrD,WAAO,MAAM,UAAU,OAAO,UAAU;AAAA,EAC1C;AAEA,SAAO,CAAC;AACV;AAEA,SAAS,yBAAyB,OAAoC;AACpE,MAAIA,UAAS,KAAK,KAAK,MAAM,QAAQ,MAAM,SAAS,GAAG;AACrD,WAAO,MAAM,UAAU,OAAO,kBAAkB;AAAA,EAClD;AAEA,SAAO,CAAC;AACV;AAEA,SAAS,eAAe,QAAoB,UAAuC;AACjF,SAAO,SAAS,MAAM,CAAC,cAAc,UAAU,YAAY,OAAO,KAAK,GAAG,YAAY,CAAC;AACzF;AAEA,SAAS,gBAAgB,QAAoB,UAAuC;AAClF,SAAO,SAAS,MAAM,CAAC,iBAAiB,OAAO,KAAK,CAAC,eAAe,YAAY,YAAY,YAAY,CAAC,CAAC;AAC5G;AAEA,SAAS,YAAY,QAA8B,UAAqC;AACtF,MAAI,WAAW,UAAa,OAAO,SAAS,SAAS,KAAM,QAAO;AAClE,MAAI,SAAS,UAAU,OAAW,QAAO;AACzC,SAAO,KAAK,UAAU,OAAO,KAAK,MAAM,KAAK,UAAU,SAAS,KAAK;AACvE;AAEA,SAAS,WAAW,OAAmC;AACrD,SAAOA,UAAS,KAAK,KAAK,OAAO,MAAM,SAAS;AAClD;AAEA,SAAS,mBAAmB,OAA2C;AACrE,SAAOA,UAAS,KAAK,KAAK,OAAO,MAAM,SAAS;AAClD;AAEA,SAASA,UAAS,OAAkD;AAClE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;;;AClFO,SAAS,qBAAqB,QAAgB,OAA2B;AAC9E,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,UAAU,IAAI,IAAI,SAAS;AAAA,IAClC,MAAM;AAAA,IACN,UAAU,EAAE,QAAQ,MAAM;AAAA,EAC5B;AACF;AAEO,SAAS,yBAAyB,QAA8B;AACrE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,KAAK,UAAU,MAAM,OAAO,CAAC,IAAI,OAAO;AAAA,IACxF,MAAM;AAAA,EACR;AACF;AAEO,SAAS,iBAAiB,SAA6C;AAC5E,QAAM,OAAO,QAAQ,OAAO,CAAC,KAAK,WAAW;AAC3C,UAAM,QAAQ,OAAO,UAAU;AAC/B,WAAO,OAAO,OAAO,UAAU,WAAW,QAAQ;AAAA,EACpD,GAAG,CAAC;AAEJ,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO;AAAA,IACP,MAAM;AAAA,EACR;AACF;;;AC7BO,IAAM,qBAAN,MAAuD;AAAA,EAC5D,OAAO,QAA4B;AACjC,WAAO,KAAK,UAAU
,QAAQ,MAAM,CAAC;AAAA,EACvC;AACF;;;ACJO,IAAM,yBAAN,MAA2D;AAAA,EAChE,OAAO,QAA4B;AACjC,UAAM,SAAS,OAAO,IAAI,QAAQ,OAAO,CAAC,WAAW,CAAC,OAAO,MAAM;AACnE,UAAM,cAAc,OAAO,aAAa,OAAO,CAAC,WAAW,OAAO,SAAS,KAAK,CAAC;AAEjF,WAAO;AAAA,MACL,iBAAiB,OAAO,IAAI,KAAK;AAAA,MACjC;AAAA,MACA,YAAY,OAAO,IAAI,SAAS,IAAI,OAAO,IAAI,cAAc;AAAA,MAC7D,WAAW,OAAO,IAAI,SAAS,QAAQ,IAAI;AAAA,MAC3C,cAAc,YAAY,OAAO,IAAI,QAAQ,QAAQ,CAAC;AAAA,MACtD,kBAAkB,YAAY,OAAO,IAAI,QAAQ,YAAY,CAAC;AAAA,MAC9D;AAAA,MACA;AAAA,MACA,GAAG,OAAO,IAAI,QAAQ,IAAI,CAAC,WAAW,KAAK,OAAO,IAAI,KAAK,OAAO,KAAK,GAAG,OAAO,OAAO,IAAI,OAAO,IAAI,KAAK,EAAE,EAAE;AAAA,MAChH;AAAA,MACA;AAAA,MACA,GAAI,OAAO,WAAW,IAClB,CAAC,QAAQ,IACT,OAAO,IAAI,CAAC,WAAW;AACrB,cAAM,UAAU,OAAO,OAAO,OAAO,CAAC,UAAU,CAAC,MAAM,MAAM,EAAE,IAAI,CAAC,UAAU,GAAG,MAAM,IAAI,KAAK,MAAM,UAAU,QAAQ,EAAE;AAC1H,eAAO,KAAK,OAAO,MAAM,KAAK,QAAQ,KAAK,IAAI,CAAC;AAAA,MAClD,CAAC;AAAA,MACL;AAAA,MACA;AAAA,MACA,GAAI,YAAY,WAAW,IACvB,CAAC,QAAQ,IACT,YAAY,IAAI,CAAC,WAAW,KAAK,OAAO,UAAU,KAAK,OAAO,aAAa,OAAO,OAAO,YAAY,EAAE;AAAA,IAC7G,EAAE,KAAK,IAAI;AAAA,EACb;AACF;AAEA,SAAS,YAAY,OAAuB;AAC1C,SAAO,GAAG,KAAK,MAAM,QAAQ,GAAK,IAAI,GAAG;AAC3C;;;AC3BO,IAAM,0BAAN,MAAuD;AAAA,EACnD;AAAA,EACQ;AAAA,EACA;AAAA,EAEjB,YAAY,SAAyC;AACnD,SAAK,OAAO,UAAU,QAAQ,OAAO,EAAE;AACvC,SAAK,SAAS,QAAQ;AACtB,SAAK,iBAAiB,QAAQ;AAAA,EAChC;AAAA,EAEA,MAAM,SAAS,OAA+C;AAC5D,UAAM,WAAW,MAAM,QAAQ;AAAA,MAC7B,KAAK,OAAO,SAAS,IAAI,CAAC,cAAc,KAAK,eAAe,UAAU,KAAK,KAAK,CAAC;AAAA,IACnF;AACA,UAAM,cAAc,KAAK,OAAO,SAAS,OAAO,CAAC,KAAK,SAAS,OAAO,KAAK,UAAU,IAAI,CAAC,KAAK;AAC/F,UAAM,QAAQ,SAAS,OAAO,CAAC,KAAK,SAAS;AAC3C,YAAM,SAAS,KAAK,OAAO,SAAS,KAAK,CAAC,cAAc,UAAU,QAAQ,KAAK,GAAG,GAAG,UAAU;AAC/F,aAAO,MAAM,KAAK,QAAQ;AAAA,IAC5B,GAAG,CAAC,IAAI;AACR,UAAM,YAAY,KAAK,OAAO,aAAa;AAC3C,UAAM,SAAS,SAAS,aAAa,SAAS,MAAM,CAAC,cAAc,UAAU,MAAM;AAEnF,WAAO;AAAA,MACL,MAAM,KAAK;AAAA,MACX;AAAA,MACA;AAAA,MACA,QAAQ,SAAS,mBAAmB;AAAA,MACpC,UAAU;AAAA,QACR,UAAU,KAAK,OAAO;AAAA,QACtB,eAAe,KAAK,OAAO;AAAA,QAC3B;AAAA,QACA,UAAU,SAAS,IAAI,CAAC,eAAe;AAAA,UACrC,KAAK,UAAU;AAAA,UACf,OAAO,UAAU;AAAA,UACjB,QAAQ,UAAU;AAAA,UACl
B,QAAQ,UAAU,UAAU;AAAA,QAC9B,EAAE;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AACF;;;ACnCO,IAAM,aAAN,MAAiB;AAAA,EACL;AAAA,EACA;AAAA,EACA;AAAA,EAEjB,YAAY,SAA4B;AACtC,SAAK,eAAe,QAAQ;AAC5B,SAAK,aAAa,QAAQ;AAC1B,SAAK,WAAW,QAAQ;AAAA,EAC1B;AAAA,EAEA,MAAM,IAAI,QAA+C;AACvD,UAAM,UAAU,MAAM,KAAK,aAAa,IAAI,OAAO,WAAW,OAAO,cAAc;AACnF,QAAI,YAAY,QAAW;AACzB,YAAM,IAAI,UAAU,0BAA0B,2BAA2B,OAAO,SAAS,EAAE;AAAA,IAC7F;AAEA,UAAM,UAA0B;AAAA,MAC9B,OAAO,OAAO,SAAS,YAAY;AAAA,MACnC,WAAW,QAAQ;AAAA,MACnB,gBAAgB,QAAQ;AAAA,IAC1B;AACA,QAAI,OAAO,aAAa,OAAW,SAAQ,WAAW,OAAO;AAC7D,QAAI,OAAO,YAAY,OAAW,SAAQ,UAAU,OAAO;AAC3D,QAAI,OAAO,kBAAkB,OAAW,SAAQ,gBAAgB,OAAO;AACvE,QAAI,OAAO,aAAa,OAAW,SAAQ,WAAW,OAAO;AAE7D,UAAM,UAA4B,CAAC;AAEnC,eAAW,YAAY,QAAQ,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,UAAU,OAAO;AACpD,YAAMC,UAAS,MAAM,QAAQ;AAAA,QAC3B,KAAK,WAAW;AAAA,UAAI,CAAC,cACnB,UAAU,SAAS;AAAA,YACjB;AAAA,YACA;AAAA,YACA;AAAA,UACF,CAAC;AAAA,QACH;AAAA,MACF;AACA,YAAM,eAAeA,QAAO,WAAW,IAAI,IAAIA,QAAO,OAAO,CAAC,KAAK,UAAU,MAAM,MAAM,OAAO,CAAC,IAAIA,QAAO;AAC5G,YAAM,SAAS,OAAO,WAAW,eAAeA,QAAO,MAAM,CAAC,UAAU,MAAM,MAAM,KAAK,iBAAiB,OAAO,aAAa;AAE9H,cAAQ,KAAK;AAAA,QACX,QAAQ,SAAS;AAAA,QACjB;AAAA,QACA;AAAA,QACA,QAAAA;AAAA,QACA;AAAA,MACF,CAAC;AAAA,IACH;AAEA,UAAM,cAAc,QAAQ,OAAO,CAAC,WAAW,OAAO,MAAM,EAAE;AAC9D,UAAM,SAAS,QAAQ,QAAQ,CAAC,WAAW,OAAO,OAAO,IAAI,CAAC,UAAU,MAAM,KAAK,CAAC;AACpF,UAAM,UAAU;AAAA,MACd,YAAY,QAAQ;AAAA,MACpB;AAAA,MACA,aAAa,QAAQ,SAAS;AAAA,MAC9B,UAAU,QAAQ,WAAW,IAAI,IAAI,cAAc,QAAQ;AAAA,MAC3D,cAAc,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,KAAK,UAAU,MAAM,OAAO,CAAC,IAAI,OAAO;AAAA,IACjG;AAEA,UAAM,YAA2B;AAAA,MAC/B,OAAO,QAAQ;AAAA,MACf,WAAW,QAAQ;AAAA,MACnB,gBAAgB,QAAQ;AAAA,MACxB;AAAA,MACA;AAAA,MACA,SAAS;AAAA,QACP,qBAAqB,QAAQ,aAAa,QAAQ,UAAU;AAAA,QAC5D,yBAAyB,MAAM;AAAA,QAC/B,iBAAiB,QAAQ,IAAI,CAAC,WAAW,OAAO,MAAM,CAAC;AAAA,MACzD;AAAA,MACA,QAAQ,QAAQ,gBAAgB;AAAA,IAClC;AACA,QAAI,OAAO,aAAa,OAAW,WAAU,WAAW,OAAO;AAE/D,WAAO;AAAA,EACT;AACF;AAEA,SAAS,cAAsB;AAC7B,SAAO,QAAQ,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC;AACxC;","names":["isRecord","isRecord","scores"]}
|
package/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@hemia-ai/agents-evals",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "Evaluation contracts for Hemia AI agent quality, golden datasets, and regression checks.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"sideEffects": false,
|
|
7
|
+
"files": ["dist"],
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"publishConfig": {
|
|
15
|
+
"access": "public"
|
|
16
|
+
},
|
|
17
|
+
"scripts": {
|
|
18
|
+
"build": "tsup src/index.ts --format esm --dts --sourcemap --clean",
|
|
19
|
+
"clean": "rm -rf dist *.tsbuildinfo",
|
|
20
|
+
"dev": "tsup src/index.ts --format esm --dts --watch",
|
|
21
|
+
"lint": "tsc --noEmit",
|
|
22
|
+
"test": "vitest run --passWithNoTests",
|
|
23
|
+
"typecheck": "tsc --noEmit",
|
|
24
|
+
"prepack": "npm run build"
|
|
25
|
+
},
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"@hemia-ai/agents-core": "^0.0.1",
|
|
28
|
+
"@hemia-ai/agents-models": "^0.0.1"
|
|
29
|
+
}
|
|
30
|
+
}
|