@radaros/eval 0.3.20 → 0.3.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,371 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+
20
+ // src/index.ts
// Public API table. Each entry is a thunk, so the bindings declared further
// down the bundle are only read when the corresponding export is accessed.
var publicApi = {};
__export(publicApi, {
  ConsoleReporter: () => ConsoleReporter,
  EvalSuite: () => EvalSuite,
  JsonReporter: () => JsonReporter,
  contains: () => contains,
  custom: () => custom,
  jsonMatch: () => jsonMatch,
  llmJudge: () => llmJudge,
  regexMatch: () => regexMatch,
  semanticSimilarity: () => semanticSimilarity
});
module.exports = __toCommonJS(publicApi);
34
+
35
+ // src/reporters/console.ts
36
+ var ConsoleReporter = class {
37
+ report(result) {
38
+ const line = "\u2500".repeat(70);
39
+ console.log(`
40
+ ${line}`);
41
+ console.log(` Eval Suite: ${result.name}`);
42
+ console.log(`${line}`);
43
+ for (const r of result.results) {
44
+ const status = r.pass ? "PASS" : "FAIL";
45
+ const icon = r.pass ? "+" : "-";
46
+ console.log(`
47
+ [${icon}] ${status} | ${r.caseName} (${r.durationMs}ms)`);
48
+ console.log(` Input: ${r.input.slice(0, 80)}${r.input.length > 80 ? "..." : ""}`);
49
+ for (const [scorerName, score] of Object.entries(r.scores)) {
50
+ const mark = score.pass ? "+" : "-";
51
+ console.log(
52
+ ` [${mark}] ${scorerName}: ${score.score.toFixed(3)}${score.reason ? ` \u2014 ${score.reason}` : ""}`
53
+ );
54
+ }
55
+ }
56
+ console.log(`
57
+ ${line}`);
58
+ console.log(
59
+ ` Results: ${result.passed}/${result.total} passed | Avg score: ${result.averageScore.toFixed(3)} | ${result.durationMs}ms`
60
+ );
61
+ console.log(`${line}
62
+ `);
63
+ }
64
+ };
65
+
66
+ // src/reporters/json.ts
67
+ var import_promises = require("fs/promises");
/**
 * Reporter that writes the suite result to a JSON file.
 */
var JsonReporter = class {
  outputPath;
  /**
   * @param outputPath Destination file. Defaults to `eval-results-<timestamp>.json`,
   *   where the timestamp is taken when the reporter is constructed.
   */
  constructor(outputPath) {
    this.outputPath = outputPath ?? `eval-results-${Date.now()}.json`;
  }
  /**
   * Serialises the suite result (pretty-printed, 2-space indent) and writes it
   * to `outputPath`. Write failures are logged with `console.warn` and do not
   * reject, so a broken report path cannot fail the eval run.
   *
   * @param result Suite result; each case is flattened to a plain record.
   */
  async report(result) {
    const serializable = {
      ...result,
      results: result.results.map((r) => ({
        caseName: r.caseName,
        input: r.input,
        // Only the text and usage of the agent output are persisted.
        outputText: r.output?.text,
        scores: r.scores,
        durationMs: r.durationMs,
        pass: r.pass,
        usage: r.output?.usage,
        // Keep the failure reason of cases that errored; JSON.stringify drops
        // the key when it is undefined, so successful cases are unchanged.
        error: r.error
      }))
    };
    try {
      await (0, import_promises.writeFile)(this.outputPath, JSON.stringify(serializable, null, 2));
      console.log(`Eval results written to ${this.outputPath}`);
    } catch (err) {
      console.warn(`Failed to write eval report to ${this.outputPath}:`, err instanceof Error ? err.message : err);
    }
  }
};
94
+
95
+ // src/scorers/contains.ts
96
+ function contains(expected, options) {
97
+ const caseSensitive = options?.caseSensitive ?? false;
98
+ return {
99
+ name: "contains",
100
+ async score(_input, output, _expected) {
101
+ const text = caseSensitive ? output.text ?? "" : (output.text ?? "").toLowerCase();
102
+ const target = caseSensitive ? expected : expected.toLowerCase();
103
+ const pass = text.includes(target);
104
+ return {
105
+ score: pass ? 1 : 0,
106
+ pass,
107
+ reason: pass ? void 0 : `Output does not contain "${expected}"`
108
+ };
109
+ }
110
+ };
111
+ }
112
+
113
+ // src/scorers/custom.ts
114
+ function custom(name, fn) {
115
+ return {
116
+ name,
117
+ async score(input, output, expected) {
118
+ return fn(input, output, expected);
119
+ }
120
+ };
121
+ }
122
+
123
+ // src/scorers/json-match.ts
/**
 * Structural equality for JSON-like values (primitives, arrays, plain objects).
 *
 * Values are equal when they are strictly identical, or when both are arrays of
 * the same length with pairwise-equal elements, or both are non-array objects
 * with the same own enumerable keys and pairwise-equal values.
 *
 * @param a First value.
 * @param b Second value.
 * @returns `true` when the two values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) return true;
  // Anything that is not a non-null object could only have matched via ===.
  if (typeof a !== "object" || typeof b !== "object") return false;
  if (a === null || b === null) return false;
  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) return false;
  if (aIsArray) {
    return a.length === b.length && a.every((v, i) => deepEqual(v, b[i]));
  }
  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;
  // The key must exist on `b` as well: comparing values alone would treat a
  // missing key as equal to an explicit `undefined` ({a: undefined} vs {b: undefined}).
  return aKeys.every((k) => Object.prototype.hasOwnProperty.call(b, k) && deepEqual(a[k], b[k]));
}
/**
 * Builds a scorer that compares selected fields of the structured output with
 * expected values using deep equality.
 *
 * @param expectedFields Map of field name to the value it must deep-equal.
 * @returns Scorer named "json-match". The score is the fraction of expected
 *   fields that matched (1 when no fields are expected) and passes only when
 *   every field matched. Output without a truthy `structured` value scores 0.
 */
function jsonMatch(expectedFields) {
  return {
    name: "json-match",
    async score(_input, output) {
      const actual = output.structured;
      if (!actual) {
        return { score: 0, pass: false, reason: "No structured output" };
      }
      const wanted = Object.entries(expectedFields);
      const mismatches = wanted
        .filter(([key, expected]) => !deepEqual(actual[key], expected))
        .map(
          ([key, expected]) => `${key}: expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual[key])}`
        );
      const totalFields = wanted.length;
      const score = totalFields > 0 ? (totalFields - mismatches.length) / totalFields : 1;
      const pass = score >= 1;
      const reason = pass ? void 0 : `Mismatches: ${mismatches.join("; ")}`;
      return { score, pass, reason };
    }
  };
}
169
+
170
+ // src/scorers/llm-judge.ts
171
+ function llmJudge(config) {
172
+ const criteria = config.criteria ?? ["relevance", "helpfulness"];
173
+ return {
174
+ name: "llm-judge",
175
+ async score(input, output, expected) {
176
+ const criteriaList = criteria.map((c) => `- ${c}`).join("\n");
177
+ const prompt = config.customPrompt ?? `You are an expert evaluator. Rate the following AI response on a scale of 0.0 to 1.0.
178
+
179
+ Criteria:
180
+ ${criteriaList}
181
+
182
+ User input: ${input}
183
+ ${expected ? `Expected output: ${expected}
184
+ ` : ""}
185
+ AI response: ${output.text}
186
+
187
+ Respond with ONLY a JSON object: {"score": <0.0-1.0>, "reason": "<brief explanation>"}`;
188
+ try {
189
+ const response = await config.model.generate([{ role: "user", content: prompt }]);
190
+ const text = typeof response.message.content === "string" ? response.message.content : "";
191
+ const jsonMatch2 = text.match(/\{[\s\S]*\}/);
192
+ if (!jsonMatch2) {
193
+ return { score: 0, pass: false, reason: "Judge failed to return valid JSON" };
194
+ }
195
+ const parsed = JSON.parse(jsonMatch2[0]);
196
+ const score = Math.max(0, Math.min(1, Number(parsed.score) || 0));
197
+ return {
198
+ score,
199
+ pass: score >= (config.threshold ?? 0.7),
200
+ reason: parsed.reason ?? void 0
201
+ };
202
+ } catch (err) {
203
+ return {
204
+ score: 0,
205
+ pass: false,
206
+ reason: `Judge error: ${err.message}`
207
+ };
208
+ }
209
+ }
210
+ };
211
+ }
212
+
213
+ // src/scorers/regex.ts
214
+ function regexMatch(pattern) {
215
+ const regex = typeof pattern === "string" ? new RegExp(pattern) : pattern;
216
+ return {
217
+ name: "regex",
218
+ async score(_input, output) {
219
+ regex.lastIndex = 0;
220
+ const pass = regex.test(output.text);
221
+ return {
222
+ score: pass ? 1 : 0,
223
+ pass,
224
+ reason: pass ? void 0 : `Output does not match pattern ${regex}`
225
+ };
226
+ }
227
+ };
228
+ }
229
+
230
+ // src/scorers/similarity.ts
231
+ function semanticSimilarity(config) {
232
+ const threshold = config.threshold ?? 0.8;
233
+ return {
234
+ name: "similarity",
235
+ async score(_input, output) {
236
+ const vecs = await config.embedding.embedBatch([output.text, config.expected]);
237
+ if (!vecs || vecs.length < 2 || !vecs[0] || !vecs[1]) {
238
+ return { score: 0, pass: false, reason: "Embedding failed: insufficient vectors returned" };
239
+ }
240
+ const [outputVec, expectedVec] = vecs;
241
+ if (outputVec.length !== expectedVec.length) {
242
+ return {
243
+ score: 0,
244
+ pass: false,
245
+ reason: `Vector dimension mismatch: ${outputVec.length} vs ${expectedVec.length}`
246
+ };
247
+ }
248
+ const sim = cosineSimilarity(outputVec, expectedVec);
249
+ const pass = sim >= threshold;
250
+ return {
251
+ score: sim,
252
+ pass,
253
+ reason: pass ? void 0 : `Similarity ${sim.toFixed(3)} < threshold ${threshold}`
254
+ };
255
+ }
256
+ };
257
+ }
258
+ function cosineSimilarity(a, b) {
259
+ let dot = 0;
260
+ let magA = 0;
261
+ let magB = 0;
262
+ for (let i = 0; i < a.length; i++) {
263
+ dot += a[i] * b[i];
264
+ magA += a[i] * a[i];
265
+ magB += b[i] * b[i];
266
+ }
267
+ const denom = Math.sqrt(magA) * Math.sqrt(magB);
268
+ return denom === 0 ? 0 : dot / denom;
269
+ }
270
+
271
+ // src/suite.ts
272
+ var import_core = require("@radaros/core");
273
+ var EvalSuite = class {
274
+ config;
275
+ constructor(config) {
276
+ this.config = config;
277
+ }
278
+ async run(reporters) {
279
+ const startTime = Date.now();
280
+ const threshold = this.config.threshold ?? 0.7;
281
+ const concurrency = this.config.concurrency ?? 1;
282
+ const results = [];
283
+ const chunks = this.chunk(this.config.cases, concurrency);
284
+ for (const batch of chunks) {
285
+ const batchResults = await Promise.allSettled(batch.map((c) => this.runCase(c, threshold)));
286
+ for (const result of batchResults) {
287
+ if (result.status === "fulfilled") {
288
+ results.push(result.value);
289
+ } else {
290
+ results.push({
291
+ input: "unknown",
292
+ scores: {},
293
+ pass: false,
294
+ durationMs: 0,
295
+ error: result.reason instanceof Error ? result.reason.message : String(result.reason)
296
+ });
297
+ }
298
+ }
299
+ }
300
+ const passed = results.filter((r) => r.pass).length;
301
+ const failed = results.length - passed;
302
+ const allScores = results.flatMap((r) => Object.values(r.scores).map((s) => s.score));
303
+ const averageScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : 0;
304
+ const suiteResult = {
305
+ name: this.config.name,
306
+ results,
307
+ passed,
308
+ failed,
309
+ total: results.length,
310
+ averageScore,
311
+ durationMs: Date.now() - startTime
312
+ };
313
+ if (reporters) {
314
+ for (const reporter of reporters) {
315
+ await reporter.report(suiteResult);
316
+ }
317
+ }
318
+ return suiteResult;
319
+ }
320
+ async runCase(evalCase, threshold) {
321
+ const startTime = Date.now();
322
+ const inputText = typeof evalCase.input === "string" ? evalCase.input : (0, import_core.getTextContent)(evalCase.input);
323
+ const timeoutMs = this.config.timeoutMs ?? 3e4;
324
+ const output = await Promise.race([
325
+ this.config.agent.run(evalCase.input, evalCase.runOpts),
326
+ new Promise(
327
+ (_, reject) => setTimeout(() => reject(new Error(`Eval case timed out after ${timeoutMs}ms`)), timeoutMs)
328
+ )
329
+ ]);
330
+ const scores = {};
331
+ for (const scorer of this.config.scorers) {
332
+ try {
333
+ scores[scorer.name] = await scorer.score(inputText, output, evalCase.expected);
334
+ } catch (err) {
335
+ scores[scorer.name] = {
336
+ score: 0,
337
+ pass: false,
338
+ reason: `Scorer error: ${err instanceof Error ? err.message : String(err)}`
339
+ };
340
+ }
341
+ }
342
+ const allPass = Object.values(scores).every((s) => s.score >= threshold);
343
+ return {
344
+ caseName: evalCase.name,
345
+ input: inputText,
346
+ output,
347
+ scores,
348
+ durationMs: Date.now() - startTime,
349
+ pass: allPass
350
+ };
351
+ }
352
+ chunk(arr, size) {
353
+ const chunks = [];
354
+ for (let i = 0; i < arr.length; i += size) {
355
+ chunks.push(arr.slice(i, i + size));
356
+ }
357
+ return chunks;
358
+ }
359
+ };
360
+ // Annotate the CommonJS export names for ESM import in node: the leading `0 &&` short-circuits, so this assignment is never executed and only serves as a statically readable list of the export names.
361
+ 0 && (module.exports = {
362
+ ConsoleReporter,
363
+ EvalSuite,
364
+ JsonReporter,
365
+ contains,
366
+ custom,
367
+ jsonMatch,
368
+ llmJudge,
369
+ regexMatch,
370
+ semanticSimilarity
371
+ });
@@ -0,0 +1,92 @@
1
+ import { RunOutput, MessageContent, RunOpts, Agent, ModelProvider, EmbeddingProvider } from '@radaros/core';
2
+
/** A single evaluation scenario run against the agent. */
interface EvalCase {
  /** Label of the case; reported as `caseName` in its result. */
  name: string;
  /** Input handed to the agent, as plain text or message content. */
  input: string | MessageContent;
  /** Reference value forwarded to every scorer. */
  expected?: string;
  /** Free-form data attached to the case. */
  metadata?: Record<string, unknown>;
  /** Options passed to the agent run for this case. */
  runOpts?: RunOpts;
}

/** Outcome of one scorer for one case. */
interface ScorerResult {
  /** Numeric score. */
  score: number;
  /** Whether the scorer itself considers the output acceptable. */
  pass: boolean;
  /** Optional explanation, typically set on failure. */
  reason?: string;
}

/** Strategy that grades an agent output. */
interface Scorer {
  /** Name under which the result is stored in `EvalResult.scores`. */
  name: string;
  /**
   * Grades one output.
   * @param input Text form of the case input.
   * @param output Output returned by the agent.
   * @param expected Reference value of the case, when provided.
   */
  score(input: string, output: RunOutput, expected?: string): Promise<ScorerResult>;
}

/** Result of running one case. */
interface EvalResult {
  /** Name of the case, when known. */
  caseName?: string;
  /** Text form of the case input. */
  input: string;
  /** Agent output; absent when the run failed. */
  output?: RunOutput;
  /** Scorer results keyed by scorer name. */
  scores: Record<string, ScorerResult>;
  /** Wall-clock duration of the case in milliseconds. */
  durationMs: number;
  /** Whether the case passed. */
  pass: boolean;
  /** Failure message when the case could not be run. */
  error?: string;
}

/** Aggregated result of a suite run. */
interface EvalSuiteResult {
  /** Suite name. */
  name: string;
  /** One entry per case. */
  results: EvalResult[];
  /** Number of passing cases. */
  passed: number;
  /** Number of failing cases. */
  failed: number;
  /** Total number of cases. */
  total: number;
  /** Mean of all individual scorer scores. */
  averageScore: number;
  /** Wall-clock duration of the whole run in milliseconds. */
  durationMs: number;
}

/** Configuration accepted by `EvalSuite`. */
interface EvalSuiteConfig {
  /** Suite name shown in reports. */
  name: string;
  /** Agent under test. */
  agent: Agent;
  /** Cases to run. */
  cases: EvalCase[];
  /** Scorers applied to every case. */
  scorers: Scorer[];
  /** Minimum score each scorer must reach for a case to pass. */
  threshold?: number;
  /** Number of cases run in parallel. */
  concurrency?: number;
  /** Per-case time limit in milliseconds. */
  timeoutMs?: number;
}

/** Sink for suite results. */
interface Reporter {
  /** Receives the aggregated result once the run has finished. */
  report(result: EvalSuiteResult): void | Promise<void>;
}

/** Reporter that prints a summary to the console. */
declare class ConsoleReporter implements Reporter {
  report(result: EvalSuiteResult): void;
}

/** Reporter that writes the result to a JSON file. */
declare class JsonReporter implements Reporter {
  private outputPath;
  /** @param outputPath Destination file; a default name is chosen when omitted. */
  constructor(outputPath?: string);
  report(result: EvalSuiteResult): Promise<void>;
}

/** Options of the `contains` scorer. */
interface ContainsOptions {
  /** Compare with exact casing instead of case-insensitively. */
  caseSensitive?: boolean;
}

/** Scorer that passes when the output text contains `expected`. */
declare function contains(expected: string, options?: ContainsOptions): Scorer;

/** Wraps a user-supplied grading function as a scorer called `name`. */
declare function custom(
  name: string,
  fn: (input: string, output: RunOutput, expected?: string) => Promise<ScorerResult> | ScorerResult
): Scorer;

/** Scorer that compares fields of the structured output with `expectedFields`. */
declare function jsonMatch(expectedFields: Record<string, unknown>): Scorer;

/** Criteria the judge model can be asked to grade. */
type JudgeCriteria =
  | "faithfulness"
  | "relevance"
  | "helpfulness"
  | "safety"
  | "conciseness";

/** Configuration of the `llmJudge` scorer. */
interface LlmJudgeConfig {
  /** Model used as the judge. */
  model: ModelProvider;
  /** Criteria listed in the default prompt. */
  criteria?: JudgeCriteria[];
  /** Prompt sent instead of the default one. */
  customPrompt?: string;
  /** Minimum score for the scorer to pass. */
  threshold?: number;
}

/** Scorer that asks a model to grade the output. */
declare function llmJudge(config: LlmJudgeConfig): Scorer;

/** Scorer that passes when the output text matches `pattern`. */
declare function regexMatch(pattern: string | RegExp): Scorer;

/** Configuration of the `semanticSimilarity` scorer. */
interface SemanticSimilarityConfig {
  /** Reference text the output is compared with. */
  expected: string;
  /** Provider used to embed both texts. */
  embedding: EmbeddingProvider;
  /** Minimum similarity for the scorer to pass. */
  threshold?: number;
}

/** Scorer based on the embedding similarity between output and reference text. */
declare function semanticSimilarity(config: SemanticSimilarityConfig): Scorer;

/** Runs eval cases against an agent and aggregates the scores. */
declare class EvalSuite {
  private config;
  constructor(config: EvalSuiteConfig);
  /** Runs every case and forwards the aggregated result to `reporters`. */
  run(reporters?: Reporter[]): Promise<EvalSuiteResult>;
  private runCase;
  private chunk;
}
91
+
92
+ export { ConsoleReporter, type EvalCase, type EvalResult, EvalSuite, type EvalSuiteConfig, type EvalSuiteResult, JsonReporter, type JudgeCriteria, type Reporter, type Scorer, type ScorerResult, contains, custom, jsonMatch, llmJudge, regexMatch, semanticSimilarity };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@radaros/eval",
3
- "version": "0.3.20",
3
+ "version": "0.3.22",
4
4
  "description": "Evaluation framework for testing and scoring RadarOS agents",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -17,24 +17,27 @@
17
17
  "benchmarks"
18
18
  ],
19
19
  "type": "module",
20
- "main": "./dist/index.js",
20
+ "main": "./dist/index.cjs",
21
+ "module": "./dist/index.js",
21
22
  "types": "./dist/index.d.ts",
22
23
  "exports": {
23
24
  ".": {
25
+ "types": "./dist/index.d.ts",
24
26
  "import": "./dist/index.js",
25
- "types": "./dist/index.d.ts"
27
+ "require": "./dist/index.cjs",
28
+ "default": "./dist/index.js"
26
29
  }
27
30
  },
28
31
  "files": [
29
32
  "dist"
30
33
  ],
31
34
  "scripts": {
32
- "build": "tsup src/index.ts --format esm --dts --clean",
33
- "dev": "tsup src/index.ts --format esm --dts --watch",
35
+ "build": "tsup src/index.ts --format esm,cjs --dts --clean",
36
+ "dev": "tsup src/index.ts --format esm,cjs --dts --watch",
34
37
  "prepublishOnly": "npm run build"
35
38
  },
36
39
  "peerDependencies": {
37
- "@radaros/core": "^0.3.20"
40
+ "@radaros/core": "^0.3.22"
38
41
  },
39
42
  "devDependencies": {
40
43
  "@types/node": "^25.3.1",