@massiangelone/rag-eval 0.1.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1067 @@
1
+ "use strict";
2
// Module-interop helpers. The first six are short aliases for Object built-ins
// that the helpers below rely on.
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
// __esm(fn) returns an initializer. On its first call it invokes the function
// stored under the first own property name of `fn`, sets `fn` to 0 so later
// calls skip the body, and caches the return value in `res`; every call
// returns `res`.
8
+ var __esm = (fn, res) => function __init() {
9
+ return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
10
+ };
// __export defines one enumerable getter on `target` per key of `all`,
// using all[name] itself as the getter function.
11
+ var __export = (target, all) => {
12
+ for (var name in all)
13
+ __defProp(target, name, { get: all[name], enumerable: true });
14
+ };
// __copyProps mirrors the own properties of `from` onto `to` as getters,
// skipping keys `to` already owns and the single key named by `except`.
// Enumerability follows the source descriptor (enumerable when none is found).
15
+ var __copyProps = (to, from, except, desc) => {
16
+ if (from && typeof from === "object" || typeof from === "function") {
17
+ for (let key of __getOwnPropNames(from))
18
+ if (!__hasOwnProp.call(to, key) && key !== except)
19
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
20
+ }
21
+ return to;
22
+ };
// __toESM builds a namespace-like object for a required module: a fresh object
// inheriting from mod's prototype (or a plain {} when mod is null/undefined),
// optionally given a "default" property pointing at `mod`, then populated with
// getters for mod's own properties via __copyProps.
23
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
24
+ // If the importer is in node compatibility mode or this is not an ESM
25
+ // file that has been converted to a CommonJS file using a Babel-
26
+ // compatible transform (i.e. "__esModule" has not been set), then set
27
+ // "default" to the CommonJS "module.exports" for node compatibility.
28
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
29
+ mod
30
+ ));
31
+
32
+ // node_modules/.pnpm/tsup@8.5.1_tsx@4.21.0_typescript@6.0.3/node_modules/tsup/assets/cjs_shims.js
33
// Shim standing in for `import.meta.url` in this CommonJS build.
// getImportMetaUrl returns a file: URL built from __filename when `document`
// is undefined; otherwise the current <script> element's src, falling back to
// "main.js" resolved against document.baseURI. importMetaUrl holds the result
// computed once, when init_cjs_shims() first runs.
+ var getImportMetaUrl, importMetaUrl;
34
+ var init_cjs_shims = __esm({
35
+ "node_modules/.pnpm/tsup@8.5.1_tsx@4.21.0_typescript@6.0.3/node_modules/tsup/assets/cjs_shims.js"() {
36
+ "use strict";
37
+ getImportMetaUrl = () => typeof document === "undefined" ? new URL(`file:${__filename}`).href : document.currentScript && document.currentScript.tagName.toUpperCase() === "SCRIPT" ? document.currentScript.src : new URL("main.js", document.baseURI).href;
38
+ importMetaUrl = /* @__PURE__ */ getImportMetaUrl();
39
+ }
40
+ });
41
+
42
+ // src/config/schema.ts
43
// Zod schemas for eval-set entries, the tool configuration and the judge's
// JSON output. They are assigned when init_schema() first runs.
var import_zod, EvalEntrySchema, EndpointConfigSchema, JudgeConfigSchema, JudgeOutputJSONSchema, ScoringConfigSchema, ConfigSchema;
var init_schema = __esm({
  "src/config/schema.ts"() {
    "use strict";
    init_cjs_shims();
    import_zod = require("zod");
    const { z } = import_zod;

    // A number constrained to the closed interval [0, 1].
    const unitInterval = () => z.number().min(0).max(1);
    // A judge score: either a [0, 1] number or an explicit null.
    const nullableScore = () => z.union([unitInterval(), z.null()]);

    // One line of the JSONL eval-set.
    EvalEntrySchema = z.object({
      id: z.string().min(1),
      question: z.string().min(1),
      expected_source: z.union([z.string(), z.array(z.string())]),
      expected_answer: z.string().optional()
    });

    // How to call the RAG endpoint and where to find fields in its response.
    EndpointConfigSchema = z.object({
      url: z.string().url(),
      method: z.enum(["GET", "POST"]).default("POST"),
      headers: z.record(z.string(), z.string()).optional(),
      body: z.record(z.string(), z.unknown()).optional(),
      responsePaths: z.object({
        answer: z.string(),
        sources: z.string(),
        sourceContents: z.string().optional()
      }),
      timeoutMs: z.number().int().positive().default(3e4)
    });

    // Judge provider plus model; the model falls back to a per-provider default.
    JudgeConfigSchema = z.object({
      provider: z.enum(["claude", "openai"]).default("claude"),
      model: z.string().optional()
    }).transform(({ provider, model }) => {
      const fallbackModel = provider === "openai" ? "gpt-4o-mini" : "claude-sonnet-4-6";
      return { provider, model: model ?? fallbackModel };
    });

    // Shape the judge model must return.
    JudgeOutputJSONSchema = z.object({
      faithfulness: nullableScore(),
      correctness: nullableScore(),
      rationale: z.string().min(1).max(2e3)
    });

    // Retrieval cut-off and score weights; weights must sum to 1 within 0.01.
    ScoringConfigSchema = z.object({
      retrievalK: z.number().int().positive().default(5),
      weights: z.object({
        retrieval: unitInterval().default(0.4),
        faithfulness: unitInterval().default(0.3),
        correctness: unitInterval().default(0.3)
      }).refine(
        (weights) => Math.abs(weights.retrieval + weights.faithfulness + weights.correctness - 1) < 0.01,
        { message: "Scoring weights must sum to 1.0" }
      )
    });

    // Top-level configuration file.
    ConfigSchema = z.object({
      endpoint: EndpointConfigSchema,
      judge: JudgeConfigSchema.optional(),
      scoring: ScoringConfigSchema.default({
        retrievalK: 5,
        weights: { retrieval: 0.4, faithfulness: 0.3, correctness: 0.3 }
      })
    });
  }
});
103
+
104
+ // src/config/loader.ts
105
/**
 * Reads, parses and validates the JSON configuration file.
 *
 * @param {string} path - Config path, resolved against the current working directory.
 * @returns {object} The data produced by ConfigSchema for a valid config.
 * @throws {ConfigError} With source "load" (unreadable file), "parse"
 *   (invalid JSON) or "validate" (schema violations, one bullet per issue).
 */
function loadConfig(path) {
  const configPath = (0, import_node_path.resolve)(process.cwd(), path);

  let fileText;
  try {
    fileText = (0, import_node_fs.readFileSync)(configPath, "utf-8");
  } catch {
    throw new ConfigError(`Could not read config file: ${configPath}`, "load");
  }

  let json;
  try {
    json = JSON.parse(fileText);
  } catch (parseError) {
    throw new ConfigError(`Config is not valid JSON: ${parseError.message}`, "parse");
  }

  const validation = ConfigSchema.safeParse(json);
  if (validation.success) {
    return validation.data;
  }
  const bulletList = validation.error.issues
    .map((issue) => ` \u2022 ${issue.path.join(".")}: ${issue.message}`)
    .join("\n");
  throw new ConfigError(`Config validation failed:\n${bulletList}`, "validate");
}
130
/**
 * Loads a JSONL eval-set: one JSON object per non-blank line, each validated
 * against EvalEntrySchema.
 *
 * Blank lines are skipped, but error messages report the line's real position
 * in the file (1-based) rather than its position among the non-blank lines.
 *
 * @param {string} path - Eval-set path, resolved against the current working directory.
 * @returns {object[]} The validated entries in file order.
 * @throws {ConfigError} With source "load-evalset" (unreadable file),
 *   "evalset-empty" (no non-blank lines), "evalset-parse" (a line is not
 *   JSON) or "evalset-validate" (a line fails the schema).
 */
function loadEvalSet(path) {
  const absPath = (0, import_node_path.resolve)(process.cwd(), path);
  let raw;
  try {
    raw = (0, import_node_fs.readFileSync)(absPath, "utf-8");
  } catch {
    throw new ConfigError(`Could not read eval-set: ${absPath}`, "load-evalset");
  }

  const entries = [];
  const fileLines = raw.split("\n");
  for (let index = 0; index < fileLines.length; index++) {
    const line = fileLines[index];
    if (line.trim().length === 0) continue;
    // Number lines by their position in the file so the message points at
    // the line the user has to fix, even when blank lines precede it.
    const lineNumber = index + 1;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch (e) {
      throw new ConfigError(
        `Line ${lineNumber} is not valid JSON: ${e.message}`,
        "evalset-parse"
      );
    }

    const result = EvalEntrySchema.safeParse(parsed);
    if (!result.success) {
      const issues = result.error.issues.map((iss) => `${iss.path.join(".")}: ${iss.message}`).join(", ");
      throw new ConfigError(`Line ${lineNumber} invalid: ${issues}`, "evalset-validate");
    }
    entries.push(result.data);
  }

  // Reaching this point with no entries means every line was blank.
  if (entries.length === 0) {
    throw new ConfigError(`Eval-set is empty: ${absPath}`, "evalset-empty");
  }
  return entries;
}
162
// Module state for src/config/loader.ts. init_loader() requires "fs" and
// "path", initialises the schema module, and defines ConfigError.
+ var import_node_fs, import_node_path, ConfigError;
163
+ var init_loader = __esm({
164
+ "src/config/loader.ts"() {
165
+ "use strict";
166
+ init_cjs_shims();
167
+ import_node_fs = require("fs");
168
+ import_node_path = require("path");
169
+ init_schema();
// Error raised by the config/eval-set loaders. `source` names the failing
// stage (e.g. "load", "parse", "validate") and is also embedded in the
// message as a "[config:<source>] " prefix.
170
+ ConfigError = class extends Error {
171
+ constructor(message, source) {
172
+ super(`[config:${source}] ${message}`);
173
+ this.source = source;
174
+ this.name = "ConfigError";
175
+ }
176
+ source;
177
+ };
178
+ }
179
+ });
180
+
181
+ // src/adapters/endpoint.ts
182
/**
 * Recursively replaces the `{{question}}` and `{{id}}` placeholders in a
 * request template with the values from an eval entry.
 *
 * Strings are substituted, arrays and plain objects are walked and copied,
 * and every other value (numbers, booleans, null, ...) is returned unchanged.
 *
 * @param {*} template - String, array, object or primitive template value.
 * @param {{question: string, id: string}} entry - Source of placeholder values.
 * @returns {*} A copy of the template with placeholders filled in.
 */
function substitutePlaceholders(template, entry) {
  if (typeof template === "string") {
    // One pass with a replacer function, for two reasons:
    //  - a string replacement would interpret "$$", "$&", "$1"... inside the
    //    question, silently corrupting text such as "costs $$5";
    //  - chained replacements would re-scan the inserted question, so a
    //    question that contains "{{id}}" would be rewritten a second time.
    return template.replace(/\{\{(question|id)\}\}/g, (_match, key) => entry[key]);
  }
  if (Array.isArray(template)) {
    return template.map((item) => substitutePlaceholders(item, entry));
  }
  if (template && typeof template === "object") {
    return Object.fromEntries(
      Object.entries(template).map(([key, value]) => [key, substitutePlaceholders(value, entry)])
    );
  }
  return template;
}
198
/**
 * Resolves a dotted path against a value.
 *
 * A segment written as `name[]` selects the array stored under `name`; the
 * rest of the path is then resolved against every element and the mapped
 * array is returned (or the array itself when `name[]` is the last segment).
 *
 * @param {*} obj - Root value to walk.
 * @param {string} path - Dotted path such as "data.answer" or "hits[].id".
 * @returns {*} The resolved value; undefined when a null/undefined value is
 *   met before the path is exhausted or a `name[]` segment is not an array.
 */
function getPath(obj, path) {
  const segments = path.split(".");
  let node = obj;
  let position = 0;
  for (const segment of segments) {
    if (node === null || node === undefined) {
      return undefined;
    }
    const wildcard = /^(.+?)\[\]$/.exec(segment);
    if (wildcard === null) {
      node = node[segment];
      position += 1;
      continue;
    }
    const list = node[wildcard[1]];
    if (!Array.isArray(list)) {
      return undefined;
    }
    const rest = segments.slice(position + 1).join(".");
    return rest ? list.map((element) => getPath(element, rest)) : list;
  }
  return node;
}
216
/**
 * Calls the configured RAG endpoint for one eval entry and extracts the
 * answer, sources and (optionally) source contents from its JSON response.
 *
 * The `timeoutMs` budget covers the whole exchange, including reading the
 * response body. When a JSON body is sent and the configured headers carry no
 * Content-Type (any casing), "application/json" is added.
 *
 * @param {object} config - Endpoint config: url, method, headers, body,
 *   responsePaths ({answer, sources, sourceContents?}) and timeoutMs.
 * @param {{id: string, question: string}} entry - Entry used to fill body placeholders.
 * @returns {Promise<{answer: string, sources: string[], sourceContents: (string[]|undefined), raw: *, latencyMs: number}>}
 * @throws {EndpointError} On timeout, network failure, non-2xx status (with
 *   the status code), a non-JSON body, or response paths that resolve to an
 *   unexpected type.
 */
async function callEndpoint(config, entry) {
  const start = Date.now();
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), config.timeoutMs);
  const timedOut = () => new EndpointError(`Request timed out after ${config.timeoutMs}ms`);

  const init = {
    method: config.method,
    headers: { ...config.headers },
    signal: controller.signal
  };
  if (config.method === "POST" && config.body) {
    init.body = JSON.stringify(substitutePlaceholders(config.body, entry));
    // A string body would otherwise be labelled text/plain by fetch.
    const hasContentType = Object.keys(init.headers).some(
      (name) => name.toLowerCase() === "content-type"
    );
    if (!hasContentType) {
      init.headers["Content-Type"] = "application/json";
    }
  }

  let raw;
  try {
    let response;
    try {
      response = await fetch(config.url, init);
    } catch (e) {
      if (e?.name === "AbortError") throw timedOut();
      throw new EndpointError(`Network error: ${e?.message}`);
    }
    if (!response.ok) {
      throw new EndpointError(
        `Endpoint returned ${response.status}: ${response.statusText}`,
        response.status
      );
    }
    try {
      // Still inside the timeout window: a body that stalls is aborted too.
      raw = await response.json();
    } catch (e) {
      if (e?.name === "AbortError") throw timedOut();
      throw new EndpointError(`Endpoint response is not valid JSON`);
    }
  } finally {
    // Single cleanup point so the timer never outlives the request.
    clearTimeout(timeout);
  }

  const answerRaw = getPath(raw, config.responsePaths.answer);
  const sourcesRaw = getPath(raw, config.responsePaths.sources);
  if (typeof answerRaw !== "string") {
    throw new EndpointError(
      `Response path "${config.responsePaths.answer}" did not resolve to a string`
    );
  }

  // Sources may come back as an array, a single string, or be absent.
  let sources = [];
  if (Array.isArray(sourcesRaw)) {
    sources = sourcesRaw.map((s) => String(s));
  } else if (typeof sourcesRaw === "string") {
    sources = [sourcesRaw];
  } else if (sourcesRaw !== undefined && sourcesRaw !== null) {
    throw new EndpointError(
      `Response path "${config.responsePaths.sources}" did not resolve to array or string`
    );
  }

  let sourceContents;
  if (config.responsePaths.sourceContents) {
    const contentsRaw = getPath(raw, config.responsePaths.sourceContents);
    if (Array.isArray(contentsRaw)) {
      sourceContents = contentsRaw.map((c) => String(c));
    } else if (typeof contentsRaw === "string") {
      sourceContents = [contentsRaw];
    } else if (contentsRaw !== undefined && contentsRaw !== null) {
      throw new EndpointError(
        `Response path "${config.responsePaths.sourceContents}" did not resolve to array or string`
      );
    }
  }

  return {
    answer: answerRaw,
    sources,
    sourceContents,
    raw,
    latencyMs: Date.now() - start
  };
}
290
// Module state for src/adapters/endpoint.ts; init_endpoint() defines EndpointError.
+ var EndpointError;
291
+ var init_endpoint = __esm({
292
+ "src/adapters/endpoint.ts"() {
293
+ "use strict";
294
+ init_cjs_shims();
// Error thrown by callEndpoint. `statusCode` is whatever the caller passes as
// the second constructor argument (undefined when omitted).
295
+ EndpointError = class extends Error {
296
+ constructor(message, statusCode) {
297
+ super(message);
298
+ this.statusCode = statusCode;
299
+ this.name = "EndpointError";
300
+ }
301
+ statusCode;
302
+ };
303
+ }
304
+ });
305
+
306
+ // src/core/scorer.ts
307
/**
 * Scores retrieval for one entry against the first `k` returned sources.
 *
 * @param {{expected_source: (string|string[])}} entry - Entry naming the expected source(s).
 * @param {{sources: string[]}} response - Endpoint response with ranked sources.
 * @param {number} k - How many leading sources to consider.
 * @returns {{precision: number, found: boolean, topKSources: string[], expectedSources: string[]}}
 *   `precision` is the share of the considered sources that are expected
 *   (0 when none were returned); `found` is true when at least one matches.
 */
function scoreRetrieval(entry, response, k) {
  const { expected_source: wanted } = entry;
  const expectedSources = Array.isArray(wanted) ? wanted : [wanted];
  const topKSources = response.sources.slice(0, k);

  const hitCount = topKSources.filter((source) => expectedSources.includes(source)).length;
  const precision = topKSources.length === 0 ? 0 : hitCount / topKSources.length;

  // At least one hit among the top-k is the same as some expected source being in the top-k.
  const found = hitCount > 0;

  return { precision, found, topKSources, expectedSources };
}
323
/**
 * Tells whether a value is a usable score: a number other than NaN.
 *
 * @param {*} n - Candidate score.
 * @returns {boolean} True for any non-NaN number, false for everything else.
 */
function isValidScore(n) {
  if (typeof n !== "number") {
    return false;
  }
  return !Number.isNaN(n);
}
326
/**
 * Combines retrieval and judge scores into one weighted score.
 *
 * The retrieval component is 1 when a source was found, otherwise the
 * retrieval precision (0 when there is no retrieval result). Judge scores
 * that are not valid numbers are left out and the remaining weights are
 * renormalised; with no judge, or no valid judge score, the retrieval
 * component alone is returned.
 *
 * @param {?{found: boolean, precision: number}} retrieval
 * @param {?{faithfulness: number, correctness: number}} judge
 * @param {{retrieval: number, faithfulness: number, correctness: number}} weights
 * @returns {number} The overall score.
 */
function computeOverallScore(retrieval, judge, weights) {
  let retrievalScore = 0;
  if (retrieval) {
    retrievalScore = retrieval.found ? 1 : retrieval.precision;
  }
  if (!judge) {
    return retrievalScore;
  }

  const usable = [
    [judge.faithfulness, weights.faithfulness],
    [judge.correctness, weights.correctness]
  ].filter(([score]) => isValidScore(score));
  if (usable.length === 0) {
    return retrievalScore;
  }

  let weightedSum = retrievalScore * weights.retrieval;
  let totalWeight = weights.retrieval;
  for (const [score, weight] of usable) {
    weightedSum += score * weight;
    totalWeight += weight;
  }
  return totalWeight > 0 ? weightedSum / totalWeight : retrievalScore;
}
344
// Lazy initializer for src/core/scorer.ts. The scorer functions are plain
// function declarations, so the only work done here is initialising the shims.
+ var init_scorer = __esm({
345
+ "src/core/scorer.ts"() {
346
+ "use strict";
347
+ init_cjs_shims();
348
+ }
349
+ });
350
+
351
+ // src/providers/prompts.ts
352
/**
 * Builds the user message sent to the judge model.
 *
 * @param {{question: string, answer: string, retrievedContext: string[], expectedAnswer: (string|undefined)}} input
 * @returns {string} Prompt listing the question, the numbered context chunks,
 *   the RAG answer and the expected answer. A note asking for a null
 *   faithfulness is appended when every chunk looks like an ID (shorter than
 *   80 characters and without spaces); a note asking for a null correctness
 *   replaces the expected answer when none is given.
 */
function buildJudgeUserPrompt(input) {
  const chunks = input.retrievedContext;
  const hasContext = chunks.length > 0;
  const isBareId = (chunk) => chunk.length < 80 && !chunk.includes(" ");

  let contextBlock = "(no context retrieved)";
  if (hasContext) {
    contextBlock = chunks.map((chunk, index) => `[${index + 1}] ${chunk}`).join("\n\n");
  }

  let contextNote = "";
  if (hasContext && chunks.every(isBareId)) {
    contextNote = "\n\n(NOTE: The retrieved context appears to be document IDs, not full text. Faithfulness cannot be reliably assessed. Return null for faithfulness.)";
  }

  let expectedBlock = "\n\n(No expected_answer provided \u2014 return null for correctness.)";
  if (input.expectedAnswer) {
    expectedBlock = `\n\nExpected answer (ground truth):\n${input.expectedAnswer}`;
  }

  return [
    "Question:",
    `${input.question}`,
    "",
    "Retrieved context:",
    `${contextBlock}${contextNote}`,
    "",
    "RAG-generated answer:",
    `${input.answer}${expectedBlock}`,
    "",
    "Now score and return JSON."
  ].join("\n");
}
371
// System prompt shared by the judge providers; assigned when init_prompts()
// first runs. It asks the model for two scores (faithfulness, correctness),
// each a number in [0, 1] or null, plus a short rationale, returned as bare JSON.
+ var JUDGE_SYSTEM_PROMPT;
372
+ var init_prompts = __esm({
373
+ "src/providers/prompts.ts"() {
374
+ "use strict";
375
+ init_cjs_shims();
376
+ JUDGE_SYSTEM_PROMPT = `You are a strict evaluator of RAG (retrieval-augmented generation) pipelines.
377
+
378
+ Your job: given a question, a RAG-generated answer, and the retrieved context the answer was supposed to be grounded in, score two dimensions:
379
+
380
+ 1. FAITHFULNESS (0.0 to 1.0 or null): Is the answer supported by the retrieved context? Penalize claims that are not in the context. A score of 1.0 means every claim in the answer is grounded in the context. A score of 0.0 means the answer is entirely hallucinated. Return null if the context is uninterpretable (e.g., only opaque IDs, not readable text).
381
+
382
+ 2. CORRECTNESS (0.0 to 1.0 or null): Does the answer match the expected answer in substance? Be lenient on phrasing, strict on facts. If no expected_answer is provided, return null for this field. A score of 1.0 means semantically equivalent. A score of 0.0 means contradictory or unrelated.
383
+
384
+ Return ONLY valid JSON in this exact shape, no markdown, no commentary:
385
+
386
+ {
387
+ "faithfulness": <number 0.0-1.0 or null if context is uninterpretable>,
388
+ "correctness": <number 0.0-1.0 or null>,
389
+ "rationale": "<one or two sentences explaining the scores>"
390
+ }`;
391
+ }
392
+ });
393
+
394
+ // src/providers/types.ts
395
// Module state for src/providers/types.ts; init_types() defines JudgeError.
+ var JudgeError;
396
+ var init_types = __esm({
397
+ "src/providers/types.ts"() {
398
+ "use strict";
399
+ init_cjs_shims();
// Error raised by the judge providers. The message is prefixed with
// "[judge:<provider>:<code>] "; `code`, `provider` and the optional `cause`
// are also kept as properties on the instance.
400
+ JudgeError = class extends Error {
401
+ constructor(message, code, provider, cause) {
402
+ super(`[judge:${provider}:${code}] ${message}`);
403
+ this.code = code;
404
+ this.provider = provider;
405
+ this.cause = cause;
406
+ this.name = "JudgeError";
407
+ }
408
+ code;
409
+ provider;
410
+ cause;
411
+ };
412
+ }
413
+ });
414
+
415
+ // src/providers/claude.ts
416
// Export object for src/providers/claude.ts. `claudeJudge` is exposed through
// a getter, so reading claude_exports.claudeJudge yields the binding's value
// at the time of the read.
+ var claude_exports = {};
417
+ __export(claude_exports, {
418
+ claudeJudge: () => claudeJudge
419
+ });
420
/**
 * Creates an Anthropic SDK client from the ANTHROPIC_API_KEY environment variable.
 *
 * @returns {object} A new client instance constructed with `{ apiKey }`.
 * @throws {JudgeError} Code "auth" when the variable is unset or empty.
 */
function getClient() {
  const { ANTHROPIC_API_KEY: apiKey } = process.env;
  if (apiKey) {
    return new import_sdk.default({ apiKey });
  }
  throw new JudgeError(
    "ANTHROPIC_API_KEY not set in environment",
    "auth",
    "claude"
  );
}
431
/**
 * Converts anything thrown by the Claude SDK call into a JudgeError.
 *
 * Classification uses the thrown value's `status` property: 401/403 -> "auth",
 * 429 -> "rate_limit", >= 500 -> "server", any other status >= 400 ->
 * "invalid", and everything else (including values without a status) ->
 * "network". An existing JudgeError is returned unchanged.
 *
 * @param {*} e - The thrown value; may be null, undefined or a non-Error.
 * @returns {JudgeError} The mapped error, with `e` attached as its cause.
 */
function mapError(e) {
  if (e instanceof JudgeError) return e;
  // Optional chaining: `throw null` / `throw undefined` must not make the
  // error mapper itself crash with a TypeError.
  const status = e?.status;
  const message = e?.message;
  if (status === 401 || status === 403) {
    return new JudgeError("Authentication failed", "auth", "claude", e);
  }
  if (status === 429) {
    return new JudgeError("Rate limit exceeded", "rate_limit", "claude", e);
  }
  if (status && status >= 500) {
    return new JudgeError("Provider server error", "server", "claude", e);
  }
  if (status && status >= 400) {
    return new JudgeError(message ?? "Invalid request", "invalid", "claude", e);
  }
  return new JudgeError(message ?? "Network error", "network", "claude", e);
}
448
/**
 * Parses a JSON value out of a model reply that may be wrapped in a Markdown
 * code fence or surrounded by extra text.
 *
 * @param {string} text - Raw model output.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} When the remaining text is not valid JSON.
 */
function extractJSON(text) {
  const trimmed = text.trim();

  // Drop a leading ``` / ```json fence and its closing counterpart.
  const unfenced = trimmed.startsWith("```")
    ? trimmed.replace(/^```(?:json)?\s*\n?/, "").replace(/\n?```\s*$/, "")
    : trimmed;

  // Keep only the outermost {...} span when one exists.
  const open = unfenced.indexOf("{");
  const close = unfenced.lastIndexOf("}");
  const candidate = open !== -1 && close > open ? unfenced.slice(open, close + 1) : unfenced;

  return JSON.parse(candidate);
}
460
// Module state for src/providers/claude.ts; init_claude() loads the Anthropic
// SDK and defines the `claudeJudge` provider.
var import_sdk, claudeJudge;
var init_claude = __esm({
  "src/providers/claude.ts"() {
    "use strict";
    init_cjs_shims();
    import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
    init_prompts();
    init_schema();
    init_types();
    claudeJudge = {
      name: "claude",
      /**
       * Asks a Claude model to score one answer.
       *
       * @param {object} input - Question, answer, retrieved context and optional expected answer.
       * @param {string} model - Model identifier passed to the SDK.
       * @returns {Promise<{faithfulness: number, correctness: number, rationale: string, rawResponse: object}>}
       *   Scores the model returned as null are reported as NaN.
       * @throws {JudgeError} For SDK failures (mapped by mapError) and for
       *   replies that are not JSON or fail schema validation (code "parse").
       */
      async judge(input, model) {
        const client = getClient();
        const userPrompt = buildJudgeUserPrompt(input);

        let response;
        try {
          const request = {
            model,
            max_tokens: 600,
            temperature: 0,
            system: JUDGE_SYSTEM_PROMPT,
            messages: [{ role: "user", content: userPrompt }]
          };
          response = await client.messages.create(request);
        } catch (sdkError) {
          throw mapError(sdkError);
        }

        // Concatenate the text blocks of the reply, ignoring other block types.
        const text = response.content
          .flatMap((block) => (block.type === "text" ? [block.text] : []))
          .join("");

        let parsed;
        try {
          parsed = extractJSON(text);
        } catch (parseError) {
          throw new JudgeError(
            `Judge response was not valid JSON: ${text.slice(0, 200)}...`,
            "parse",
            "claude",
            parseError
          );
        }

        const validation = JudgeOutputJSONSchema.safeParse(parsed);
        if (!validation.success) {
          throw new JudgeError(
            `Judge JSON failed schema validation: ${validation.error.message}`,
            "parse",
            "claude"
          );
        }

        const { faithfulness, correctness, rationale } = validation.data;
        return {
          faithfulness: faithfulness ?? NaN,
          correctness: correctness ?? NaN,
          rationale,
          rawResponse: response
        };
      }
    };
  }
});
516
+
517
+ // src/providers/openai.ts
518
// Export object for src/providers/openai.ts. `openaiJudge` is exposed through
// a getter, so reading openai_exports.openaiJudge yields the binding's value
// at the time of the read.
+ var openai_exports = {};
519
+ __export(openai_exports, {
520
+ openaiJudge: () => openaiJudge
521
+ });
522
/**
 * Creates an OpenAI SDK client from the OPENAI_API_KEY environment variable.
 *
 * @returns {object} A new client instance constructed with `{ apiKey }`.
 * @throws {JudgeError} Code "auth" when the variable is unset or empty.
 */
function getClient2() {
  const { OPENAI_API_KEY: apiKey } = process.env;
  if (apiKey) {
    return new import_openai.default({ apiKey });
  }
  throw new JudgeError(
    "OPENAI_API_KEY not set in environment",
    "auth",
    "openai"
  );
}
533
/**
 * Converts anything thrown by the OpenAI SDK call into a JudgeError.
 *
 * Classification uses the thrown value's `status` property: 401/403 -> "auth",
 * 429 -> "rate_limit", >= 500 -> "server", any other status >= 400 ->
 * "invalid", and everything else (including values without a status) ->
 * "network". An existing JudgeError is returned unchanged.
 *
 * @param {*} e - The thrown value; may be null, undefined or a non-Error.
 * @returns {JudgeError} The mapped error, with `e` attached as its cause.
 */
function mapError2(e) {
  if (e instanceof JudgeError) return e;
  // Optional chaining: `throw null` / `throw undefined` must not make the
  // error mapper itself crash with a TypeError.
  const status = e?.status;
  const message = e?.message;
  if (status === 401 || status === 403) {
    return new JudgeError("Authentication failed", "auth", "openai", e);
  }
  if (status === 429) {
    return new JudgeError("Rate limit exceeded", "rate_limit", "openai", e);
  }
  if (status && status >= 500) {
    return new JudgeError("Provider server error", "server", "openai", e);
  }
  if (status && status >= 400) {
    return new JudgeError(message ?? "Invalid request", "invalid", "openai", e);
  }
  return new JudgeError(message ?? "Network error", "network", "openai", e);
}
550
// Module state for src/providers/openai.ts; init_openai() loads the OpenAI
// SDK and defines the response JSON schema and the `openaiJudge` provider.
var import_openai, OPENAI_JSON_SCHEMA, openaiJudge;
var init_openai = __esm({
  "src/providers/openai.ts"() {
    "use strict";
    init_cjs_shims();
    import_openai = __toESM(require("openai"), 1);
    init_prompts();
    init_schema();
    init_types();

    // JSON Schema sent as the structured-output format for the judge reply.
    const nullableNumber = (description) => ({ type: ["number", "null"], description });
    OPENAI_JSON_SCHEMA = {
      type: "object",
      properties: {
        faithfulness: nullableNumber(
          "How well the answer is supported by retrieved context, null if context is uninterpretable (e.g., only IDs not text)"
        ),
        correctness: nullableNumber(
          "How well the answer matches expected_answer, null if not provided"
        ),
        rationale: {
          type: "string",
          description: "One or two sentences explaining the scores"
        }
      },
      required: ["faithfulness", "correctness", "rationale"],
      additionalProperties: false
    };

    openaiJudge = {
      name: "openai",
      /**
       * Asks an OpenAI chat model to score one answer.
       *
       * @param {object} input - Question, answer, retrieved context and optional expected answer.
       * @param {string} model - Model identifier passed to the SDK.
       * @returns {Promise<{faithfulness: number, correctness: number, rationale: string, rawResponse: object}>}
       *   Scores the model returned as null are reported as NaN.
       * @throws {JudgeError} For SDK failures (mapped by mapError2) and for
       *   empty, non-JSON or schema-violating replies (code "parse").
       */
      async judge(input, model) {
        const client = getClient2();
        const userPrompt = buildJudgeUserPrompt(input);

        let response;
        try {
          const request = {
            model,
            temperature: 0,
            max_tokens: 600,
            messages: [
              { role: "system", content: JUDGE_SYSTEM_PROMPT },
              { role: "user", content: userPrompt }
            ],
            response_format: {
              type: "json_schema",
              json_schema: {
                name: "judge_output",
                strict: true,
                schema: OPENAI_JSON_SCHEMA
              }
            }
          };
          response = await client.chat.completions.create(request);
        } catch (sdkError) {
          throw mapError2(sdkError);
        }

        const text = response.choices[0]?.message?.content ?? "";
        if (!text) {
          throw new JudgeError("Empty response from judge", "parse", "openai");
        }

        let parsed;
        try {
          parsed = JSON.parse(text);
        } catch (parseError) {
          throw new JudgeError(
            `Judge response was not valid JSON: ${text.slice(0, 200)}`,
            "parse",
            "openai",
            parseError
          );
        }

        const validation = JudgeOutputJSONSchema.safeParse(parsed);
        if (!validation.success) {
          throw new JudgeError(
            `Judge JSON failed schema validation: ${validation.error.message}`,
            "parse",
            "openai"
          );
        }

        const { faithfulness, correctness, rationale } = validation.data;
        return {
          faithfulness: faithfulness ?? NaN,
          correctness: correctness ?? NaN,
          rationale,
          rawResponse: response
        };
      }
    };
  }
});
638
+
639
+ // src/core/judge.ts
640
/**
 * Lazily initialises and returns the judge provider named in the config.
 *
 * @param {{provider: string}} config - Judge configuration.
 * @returns {Promise<object>} The `claudeJudge` or `openaiJudge` provider object.
 * @throws {Error} When the provider name is neither "claude" nor "openai".
 */
async function getJudgeProvider(config) {
  switch (config.provider) {
    case "claude": {
      const claudeModule = await Promise.resolve().then(() => (init_claude(), claude_exports));
      return claudeModule.claudeJudge;
    }
    case "openai": {
      const openaiModule = await Promise.resolve().then(() => (init_openai(), openai_exports));
      return openaiModule.openaiJudge;
    }
    default:
      throw new Error(`Unknown judge provider: ${String(config.provider)}`);
  }
}
651
/**
 * Runs the configured judge provider on one input.
 *
 * @param {object} input - Judge input passed to the provider's `judge` method.
 * @param {{provider: string, model: string}} config - Judge configuration.
 * @returns {Promise<object>} Whatever the provider's `judge` method resolves to.
 */
async function runJudge(input, config) {
  return (await getJudgeProvider(config)).judge(input, config.model);
}
655
// Lazy initializer for src/core/judge.ts. Only the shims are initialised
// here; the provider modules are initialised on demand by getJudgeProvider.
+ var init_judge = __esm({
656
+ "src/core/judge.ts"() {
657
+ "use strict";
658
+ init_cjs_shims();
659
+ }
660
+ });
661
+
662
+ // src/core/runner.ts
663
/**
 * Runs the evaluation: calls the endpoint for every entry (sequentially),
 * scores retrieval, optionally asks the judge, and aggregates a summary.
 *
 * Endpoint failures are recorded on the entry's result (score 0) rather than
 * aborting the run. Judge failures are recorded as NaN scores with the error
 * in the rationale, so the overall score falls back to retrieval only.
 *
 * @param {object} opts
 * @param {object[]} opts.entries - Eval entries ({id, question, expected_source, expected_answer?}).
 * @param {object} opts.config - Validated config ({endpoint, judge?, scoring}).
 * @param {boolean} opts.enableJudge - Whether to call the judge when one is configured.
 * @param {number} opts.threshold - Minimum average overall score required to pass.
 * @returns {Promise<{results: object[], summary: object}>} Per-entry results
 *   and the aggregate summary. With no entries the average overall score is 0
 *   and the run does not pass.
 */
async function runEval(opts) {
  const startTime = Date.now();
  const results = [];
  const total = opts.entries.length;
  // Thrown values are not guaranteed to be Error instances.
  const describe = (e) => (e instanceof Error ? e.message : String(e));
  const spinner = (0, import_ora.default)({ text: "Starting evaluation...", spinner: "dots" }).start();
  try {
    for (let i = 0; i < total; i++) {
      const entry = opts.entries[i];
      spinner.text = `[${i + 1}/${total}] ${entry.id} \u2014 ${entry.question.slice(0, 50)}...`;
      let result;
      try {
        const response = await callEndpoint(opts.config.endpoint, entry);
        const retrieval = scoreRetrieval(entry, response, opts.config.scoring.retrievalK);
        let judge = null;
        if (opts.enableJudge && opts.config.judge) {
          spinner.text = `[${i + 1}/${total}] ${entry.id} \u2014 judging...`;
          try {
            const k = opts.config.scoring.retrievalK;
            // Prefer full source text for the judge; fall back to source IDs.
            const retrievedContext = response.sourceContents && response.sourceContents.length > 0 ? response.sourceContents.slice(0, k) : response.sources.slice(0, k);
            const judgeOut = await runJudge(
              {
                question: entry.question,
                answer: response.answer,
                retrievedContext,
                expectedAnswer: entry.expected_answer
              },
              opts.config.judge
            );
            judge = {
              faithfulness: judgeOut.faithfulness,
              correctness: judgeOut.correctness,
              rationale: judgeOut.rationale
            };
          } catch (e) {
            judge = {
              faithfulness: NaN,
              correctness: NaN,
              rationale: `Judge error: ${describe(e)}`
            };
          }
        }
        const overallScore = computeOverallScore(retrieval, judge, opts.config.scoring.weights);
        result = {
          entry,
          response,
          error: null,
          retrieval,
          judge,
          overallScore
        };
      } catch (e) {
        const message = e instanceof EndpointError ? e.message : `Unexpected error: ${describe(e)}`;
        result = {
          entry,
          response: null,
          error: message,
          retrieval: null,
          judge: null,
          overallScore: 0
        };
      }
      results.push(result);
    }
  } finally {
    // Stop the spinner even if an entry blows up outside the inner try
    // (e.g. a malformed entry), so the terminal is not left animating.
    spinner.stop();
  }

  const successful = results.filter((r) => r.error === null).length;
  const failed = results.length - successful;
  const successResults = results.filter((r) => r.retrieval !== null);
  const avgRetrievalPrecision = successResults.length > 0 ? successResults.reduce((s, r) => s + r.retrieval.precision, 0) / successResults.length : 0;
  const faithScores = results.map((r) => r.judge?.faithfulness).filter((n) => typeof n === "number" && !Number.isNaN(n));
  const corrScores = results.map((r) => r.judge?.correctness).filter((n) => typeof n === "number" && !Number.isNaN(n));
  const avgFaithfulness = faithScores.length > 0 ? faithScores.reduce((s, n) => s + n, 0) / faithScores.length : NaN;
  const avgCorrectness = corrScores.length > 0 ? corrScores.reduce((s, n) => s + n, 0) / corrScores.length : NaN;
  // Guard against 0/0: an empty run averages to 0 instead of NaN.
  const avgOverallScore = results.length > 0 ? results.reduce((s, r) => s + r.overallScore, 0) / results.length : 0;
  const summary = {
    total: results.length,
    successful,
    failed,
    avgRetrievalPrecision,
    avgFaithfulness,
    avgCorrectness,
    avgOverallScore,
    // An empty run never passes, whatever the threshold.
    passed: results.length > 0 && avgOverallScore >= opts.threshold,
    durationMs: Date.now() - startTime
  };
  return { results, summary };
}
747
// Module state for src/core/runner.ts. init_runner() loads "ora" (the spinner
// used by runEval) and initialises the endpoint, scorer and judge modules.
+ var import_ora;
748
+ var init_runner = __esm({
749
+ "src/core/runner.ts"() {
750
+ "use strict";
751
+ init_cjs_shims();
752
+ import_ora = __toESM(require("ora"), 1);
753
+ init_endpoint();
754
+ init_scorer();
755
+ init_judge();
756
+ }
757
+ });
758
+
759
+ // src/formatters/table.ts
760
/**
 * Picks the chalk colour used to display a score.
 *
 * @param {number} score - Score to classify.
 * @returns {Function} chalk's green for scores >= 0.8, yellow for >= 0.5,
 *   red for everything else (including NaN).
 */
function scoreColor(score) {
  const palette = import_chalk.default;
  return score >= 0.8 ? palette.green : score >= 0.5 ? palette.yellow : palette.red;
}
765
/**
 * Formats an optional score for the table and summary output.
 *
 * @param {(number|undefined)} s - Score; undefined or NaN means "not available".
 * @returns {string} A grey em dash when unavailable, otherwise the score with
 *   two decimals, coloured by scoreColor.
 */
function fmtScore(s) {
  const unavailable = s === undefined || isNaN(s);
  return unavailable
    ? import_chalk.default.gray("\u2014")
    : scoreColor(s)(s.toFixed(2));
}
769
/**
 * Renders the per-entry results as a CLI table.
 *
 * Failed entries show dashes in the score columns and the first 18
 * characters of the error in the status column; questions are cut to 30
 * characters.
 *
 * @param {object[]} results - Per-entry results produced by the runner.
 * @returns {string} The table rendered by cli-table3.
 */
function renderTable(results) {
  const chalk = import_chalk.default;
  const headings = ["ID", "Question", "Retr", "Faith", "Corr", "Score", "Status"];
  const table = new import_cli_table3.default({
    head: headings.map((label) => chalk.bold(label)),
    colWidths: [10, 32, 8, 8, 8, 8, 20],
    wordWrap: true
  });

  for (const result of results) {
    const idCell = result.entry.id;
    const questionCell = result.entry.question.slice(0, 30);

    if (result.error) {
      const dash = () => chalk.gray("\u2014");
      table.push([
        idCell,
        questionCell,
        dash(),
        dash(),
        dash(),
        dash(),
        chalk.red(result.error.slice(0, 18))
      ]);
    } else {
      const { retrieval, judge, overallScore } = result;
      table.push([
        idCell,
        questionCell,
        retrieval.found ? chalk.green("\u2713") : chalk.red("\u2717"),
        fmtScore(judge?.faithfulness),
        fmtScore(judge?.correctness),
        scoreColor(overallScore)(overallScore.toFixed(2)),
        chalk.green("ok")
      ]);
    }
  }
  return table.toString();
}
810
/**
 * Renders the run summary as a multi-line, coloured text block.
 *
 * @param {object} summary - Aggregate summary produced by the runner.
 * @param {number} threshold - Pass threshold, shown with two decimals.
 * @returns {string} The summary, starting and ending with an empty line. The
 *   "Failed" row is shown only when at least one entry failed.
 */
function renderSummary(summary, threshold) {
  const chalk = import_chalk.default;
  const rule = () => chalk.gray("\u2500".repeat(60));
  const colored3 = (value) => scoreColor(value)(value.toFixed(3));

  const header = [
    "",
    chalk.bold("Summary"),
    rule(),
    `Total questions: ${summary.total}`,
    `Successful: ${chalk.green(String(summary.successful))}`
  ];
  const failedRow = summary.failed > 0 ? [`Failed: ${chalk.red(String(summary.failed))}`] : [];
  const body = [
    `Avg retrieval precision: ${colored3(summary.avgRetrievalPrecision)}`,
    `Avg faithfulness: ${fmtScore(summary.avgFaithfulness)}`,
    `Avg correctness: ${fmtScore(summary.avgCorrectness)}`,
    `Avg overall score: ${colored3(summary.avgOverallScore)}`,
    `Threshold: ${threshold.toFixed(2)}`,
    `Duration: ${(summary.durationMs / 1e3).toFixed(1)}s`,
    rule()
  ];
  const verdict = summary.passed
    ? chalk.green.bold("\u2713 PASSED")
    : chalk.red.bold("\u2717 FAILED \u2014 below threshold");

  return [...header, ...failedRow, ...body, verdict, ""].join("\n");
}
837
// Module state for src/formatters/table.ts. init_table() loads "cli-table3"
// and "chalk", which the table/summary renderers read through `.default`.
+ var import_cli_table3, import_chalk;
838
+ var init_table = __esm({
839
+ "src/formatters/table.ts"() {
840
+ "use strict";
841
+ init_cjs_shims();
842
+ import_cli_table3 = __toESM(require("cli-table3"), 1);
843
+ import_chalk = __toESM(require("chalk"), 1);
844
+ }
845
+ });
846
+
847
+ // src/formatters/csv.ts
848
/**
 * Converts a value into a CSV field.
 *
 * null, undefined and NaN become an empty field; other numbers are written
 * as-is. Any other value is stringified and, when it contains a comma, a
 * double quote or a line break (LF or CR), wrapped in double quotes with
 * embedded quotes doubled.
 *
 * @param {*} value - Cell value.
 * @returns {string} The encoded field.
 */
function escape(value) {
  if (value === undefined || value === null) return "";
  if (typeof value === "number") return Number.isNaN(value) ? "" : String(value);
  const text = String(value);
  // A bare carriage return also ends a record for CSV readers, so it has to
  // trigger quoting just like "\n" does.
  const needsQuoting = /[",\r\n]/.test(text);
  return needsQuoting ? `"${text.replaceAll('"', '""')}"` : text;
}
857
/**
 * Renders the results as CSV: a header row, one row per entry, a blank line,
 * then the summary as "# summary,..." lines.
 *
 * @param {object[]} results - Per-entry results produced by the runner.
 * @param {object} summary - Aggregate summary produced by the runner.
 * @returns {string} The CSV text, lines joined with "\n".
 */
function renderCSV(results, summary) {
  const columns = [
    "id",
    "question",
    "expected_answer",
    "rag_answer",
    "expected_sources",
    "retrieved_sources",
    "retrieval_found",
    "retrieval_precision",
    "faithfulness",
    "correctness",
    "overall_score",
    "judge_rationale",
    "latency_ms",
    "error"
  ];

  // Builds the raw cell values of one row, in column order.
  const toCells = ({ entry, response, retrieval, judge, overallScore, error }) => {
    const expected = entry.expected_source;
    let foundCell;
    if (retrieval) {
      foundCell = retrieval.found ? "true" : "false";
    }
    return [
      entry.id,
      entry.question,
      entry.expected_answer,
      response?.answer,
      Array.isArray(expected) ? expected.join("|") : expected,
      response?.sources.join("|"),
      foundCell,
      retrieval?.precision,
      judge?.faithfulness,
      judge?.correctness,
      overallScore,
      judge?.rationale,
      response?.latencyMs,
      error ?? undefined
    ];
  };

  // Averages that could not be computed are reported as "n/a".
  const orNotAvailable = (value) => (isNaN(value) ? "n/a" : value.toFixed(4));

  const headerLine = columns.map(escape).join(",");
  const dataLines = results.map((result) => toCells(result).map((cell) => escape(cell)).join(","));
  const summaryLines = [
    `# summary,total=${summary.total},successful=${summary.successful},failed=${summary.failed}`,
    `# summary,avg_retrieval_precision=${summary.avgRetrievalPrecision.toFixed(4)}`,
    `# summary,avg_faithfulness=${orNotAvailable(summary.avgFaithfulness)}`,
    `# summary,avg_correctness=${orNotAvailable(summary.avgCorrectness)}`,
    `# summary,avg_overall_score=${summary.avgOverallScore.toFixed(4)}`,
    `# summary,passed=${summary.passed}`,
    `# summary,duration_ms=${summary.durationMs}`
  ];

  return [headerLine, ...dataLines, "", ...summaryLines].join("\n");
}
915
+ var init_csv = __esm({
916
+ "src/formatters/csv.ts"() {
917
+ "use strict";
918
+ init_cjs_shims();
919
+ }
920
+ });
921
+
922
+ // src/formatters/json.ts
923
/**
 * Round a number to four decimal places.
 *
 * @param {number} n - Value to round.
 * @returns {number} `n` rounded via `Math.round` at a scale of 10^4.
 */
function round4(n) {
  const scale = 1e4;
  const scaled = Math.round(n * scale);
  return scaled / scale;
}
926
/**
 * Normalize an optional numeric score for JSON output.
 *
 * @param {number|null|undefined} v - Raw value.
 * @param {(value: number) => number} [formatter] - Optional transform applied
 *   to a usable value.
 * @returns {number|null} `null` when `v` is `undefined`, `null`, or coerces to
 *   NaN; otherwise `formatter(v)` when a formatter is given, else `v`.
 */
function numOrNull(v, formatter) {
  // `null` must be caught here: it coerces to 0, so a NaN check alone would let
  // a missing score through and report it as a real zero.
  if (v === undefined || v === null) return null;
  // Coerce explicitly so the check matches the value the formatter would see.
  if (Number.isNaN(Number(v))) return null;
  return formatter ? formatter(v) : v;
}
931
/**
 * Build the plain-object form of an evaluation report.
 *
 * @param {Array<object>} results - Per-question results; each one is read for
 *   `entry`, `response`, `retrieval`, `judge`, `overallScore` and `error`.
 * @param {object} summary - Aggregate metrics for the whole run.
 * @returns {{summary: object, results: Array<object>}} Report object. Scores
 *   are rounded with `round4`; optional values that are absent become `null`.
 *   `summary.timestamp` is the ISO time at which this function was called.
 */
function renderJSON(results, summary) {
  const summaryBlock = {
    total: summary.total,
    successful: summary.successful,
    failed: summary.failed,
    avgRetrievalPrecision: round4(summary.avgRetrievalPrecision),
    avgFaithfulness: numOrNull(summary.avgFaithfulness, round4),
    avgCorrectness: numOrNull(summary.avgCorrectness, round4),
    avgOverallScore: round4(summary.avgOverallScore),
    passed: summary.passed,
    durationMs: summary.durationMs,
    timestamp: new Date().toISOString()
  };

  // Convert one internal result into its report row.
  const toRow = (r) => {
    const { entry, response, retrieval, judge } = r;

    // A single expected source is normalized to a one-element list.
    let expectedSources = entry.expected_source;
    if (!Array.isArray(expectedSources)) {
      expectedSources = [expectedSources];
    }

    let retrievalBlock = null;
    if (retrieval) {
      retrievalBlock = {
        found: retrieval.found,
        precision: round4(retrieval.precision)
      };
    }

    let judgeBlock = null;
    if (judge) {
      judgeBlock = {
        faithfulness: numOrNull(judge.faithfulness, round4),
        correctness: numOrNull(judge.correctness, round4),
        rationale: judge.rationale ?? ""
      };
    }

    return {
      id: entry.id,
      question: entry.question,
      expectedAnswer: entry.expected_answer,
      ragAnswer: response?.answer ?? null,
      expectedSources,
      retrievedSources: response?.sources ?? null,
      retrieval: retrievalBlock,
      judge: judgeBlock,
      overallScore: round4(r.overallScore),
      latencyMs: response?.latencyMs ?? null,
      error: r.error
    };
  };

  return {
    summary: summaryBlock,
    results: results.map((r) => toRow(r))
  };
}
964
+ var init_json = __esm({
965
+ "src/formatters/json.ts"() {
966
+ "use strict";
967
+ init_cjs_shims();
968
+ }
969
+ });
970
+
971
+ // src/commands/run.ts
972
+ var run_exports = {};
973
+ __export(run_exports, {
974
+ runCommand: () => runCommand
975
+ });
976
/**
 * Produce printable text for any thrown value.
 * Uses `message` when the value carries a string one, else stringifies it,
 * so non-Error throwables (including null/undefined) cannot break a handler.
 */
function runCommandErrorText(err) {
  return typeof err?.message === "string" ? err.message : String(err);
}

/**
 * Write the CSV and JSON reports into `outputDir` (created if missing) and
 * log the path of each file after it is written.
 */
function writeEvalReports(outputDir, results, summary) {
  import_node_fs2.mkdirSync(outputDir, { recursive: true });
  // ":" and "." are replaced so the ISO timestamp is usable in a file name.
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const csvPath = import_node_path2.join(outputDir, `eval-${stamp}.csv`);
  import_node_fs2.writeFileSync(csvPath, renderCSV(results, summary));
  console.log(import_chalk2.default.gray(`CSV report: ${csvPath}`));
  const jsonPath = import_node_path2.join(outputDir, `eval-${stamp}.json`);
  import_node_fs2.writeFileSync(jsonPath, JSON.stringify(renderJSON(results, summary), null, 2));
  console.log(import_chalk2.default.gray(`JSON report: ${jsonPath}`));
}

/**
 * Implementation of the `run` CLI command: load config and eval set, run the
 * evaluation, print the table and summary, and write CSV/JSON reports.
 *
 * @param {object} opts - Parsed CLI options: `config`, `questions`,
 *   `threshold`, `judge` (provider string, or `false` for --no-judge), `output`.
 * @returns {Promise<number>} Exit code:
 *   0 when `summary.passed` is true, 1 when it is false,
 *   2 for a ConfigError or an invalid option value,
 *   3 for any other error.
 */
async function runCommand(opts) {
  try {
    const config = loadConfig(opts.config);
    const questionsPath = opts.questions ?? "eval-set.jsonl";
    const entries = loadEvalSet(questionsPath);

    const threshold = Number.parseFloat(opts.threshold);
    if (Number.isNaN(threshold) || threshold < 0 || threshold > 1) {
      console.error(import_chalk2.default.red(`Invalid threshold: ${opts.threshold} (must be 0-1)`));
      return 2;
    }

    const enableJudge = opts.judge !== false;
    if (typeof opts.judge === "string") {
      // Reject a bad provider name even when there is nothing to override, so
      // a typo never passes silently.
      if (opts.judge !== "claude" && opts.judge !== "openai") {
        console.error(import_chalk2.default.red(`Invalid --judge value: "${opts.judge}" (use claude|openai)`));
        return 2;
      }
      if (config.judge) {
        config.judge = {
          provider: opts.judge,
          model: opts.judge === "openai" ? "gpt-4o-mini" : "claude-sonnet-4-6"
        };
      } else {
        // The override only replaces an existing judge section; say so rather
        // than dropping the flag without a trace.
        console.error(
          import_chalk2.default.yellow(
            `Warning: --judge ${opts.judge} ignored because the config has no judge section`
          )
        );
      }
    }

    console.log(import_chalk2.default.gray(`Loaded ${entries.length} questions from ${questionsPath}`));
    console.log(import_chalk2.default.gray(`Endpoint: ${config.endpoint.url}`));
    if (enableJudge && config.judge) {
      console.log(import_chalk2.default.gray(`Judge: ${config.judge.provider} (${config.judge.model})`));
    } else {
      console.log(import_chalk2.default.gray("Judge: disabled (retrieval-only)"));
    }
    console.log("");

    const { results, summary } = await runEval({ config, entries, threshold, enableJudge });
    console.log(renderTable(results));
    console.log(renderSummary(summary, threshold));

    // Report files are best-effort: a write failure is reported as a warning
    // and does not change the exit code.
    try {
      writeEvalReports(opts.output, results, summary);
    } catch (e) {
      console.error(
        import_chalk2.default.yellow(`Warning: failed to write reports: ${runCommandErrorText(e)}`)
      );
    }
    return summary.passed ? 0 : 1;
  } catch (e) {
    if (e instanceof ConfigError) {
      console.error(import_chalk2.default.red(e.message));
      return 2;
    }
    console.error(import_chalk2.default.red(`Unexpected error: ${runCommandErrorText(e)}`));
    if (process.env["DEBUG"] && e?.stack) {
      console.error(e.stack);
    }
    return 3;
  }
}
1033
+ var import_node_fs2, import_node_path2, import_chalk2;
1034
+ var init_run = __esm({
1035
+ "src/commands/run.ts"() {
1036
+ "use strict";
1037
+ init_cjs_shims();
1038
+ import_node_fs2 = require("fs");
1039
+ import_node_path2 = require("path");
1040
+ import_chalk2 = __toESM(require("chalk"), 1);
1041
+ init_loader();
1042
+ init_runner();
1043
+ init_table();
1044
+ init_csv();
1045
+ init_json();
1046
+ }
1047
+ });
1048
+
1049
+ // src/index.ts
1050
+ init_cjs_shims();
1051
+ var import_commander = require("commander");
1052
+ var import_node_fs3 = require("fs");
1053
+ var import_node_url = require("url");
1054
+ var import_node_path3 = require("path");
1055
// CLI bootstrap: read the package version, declare the commands, then parse argv.
// These bindings stay `var`: inside a CommonJS module `__dirname` is already a
// wrapper parameter, and redeclaring it with `const`/`let` is a SyntaxError.
var __filename2 = import_node_url.fileURLToPath(importMetaUrl);
var __dirname = import_node_path3.dirname(__filename2);
// package.json is read from the parent of this file's directory.
var pkg = JSON.parse(
  import_node_fs3.readFileSync(import_node_path3.join(__dirname, "..", "package.json"), "utf-8")
);
var program = new import_commander.Command();
program
  .name("rag-eval")
  .description("Evaluate RAG pipelines: retrieval, faithfulness, correctness.")
  .version(pkg.version);
program
  .command("run")
  .description("Run evaluation against a RAG endpoint")
  .option("-c, --config <path>", "config file path", "rag-eval.config.json")
  .option("-q, --questions <path>", "eval-set JSONL file")
  .option("-j, --judge <provider>", "judge LLM provider: claude|openai (overrides config)")
  .option("--no-judge", "skip judge LLM (retrieval scoring only, no API costs)")
  .option("-o, --output <dir>", "output directory for reports", "./rag-eval-output")
  .option("--threshold <number>", "min score to exit 0 (0-1)", "0.7")
  .action(async (opts) => {
    // The command module is initialized lazily, only when `run` is invoked.
    const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_run(), run_exports));
    const code = await runCommand2(opts);
    process.exit(code);
  });
// The action handler is async, so use parseAsync and handle its rejection.
// Otherwise a failure while loading the command would surface as an unhandled
// rejection with exit code 1, which this CLI also uses for "below threshold".
program.parseAsync().catch((err) => {
  console.error(`Unexpected error: ${typeof err?.message === "string" ? err.message : String(err)}`);
  process.exit(3);
});