@massiangelone/rag-eval 0.1.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1043 @@
1
#!/usr/bin/env node
// Bundler (tsup/esbuild) runtime helpers for lazy module initialization.
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;
// Wraps a module body so it runs at most once: `res` caches the module's
// result and `fn = 0` releases the body closure after the first call.
var __esm = (fn, res) => function __init() {
  return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
};
// Installs lazy, enumerable getters on `target` for each export in `all`.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
11
+
12
// node_modules/.pnpm/tsup@8.5.1_tsx@4.21.0_typescript@6.0.3/node_modules/tsup/assets/esm_shims.js
import path from "path";
import { fileURLToPath } from "url";
// ESM shim module emitted by tsup; body is empty because no CJS globals
// (__dirname/__filename) are referenced by this bundle.
var init_esm_shims = __esm({
  "node_modules/.pnpm/tsup@8.5.1_tsx@4.21.0_typescript@6.0.3/node_modules/tsup/assets/esm_shims.js"() {
    "use strict";
  }
});
20
+
21
// src/config/schema.ts
import { z } from "zod";
var EvalEntrySchema, EndpointConfigSchema, JudgeConfigSchema, JudgeOutputJSONSchema, ScoringConfigSchema, ConfigSchema;
var init_schema = __esm({
  "src/config/schema.ts"() {
    "use strict";
    init_esm_shims();
    // One line of the eval-set JSONL: a question, the source document(s)
    // expected in retrieval, and an optional ground-truth answer.
    EvalEntrySchema = z.object({
      id: z.string().min(1),
      question: z.string().min(1),
      expected_source: z.union([z.string(), z.array(z.string())]),
      expected_answer: z.string().optional()
    });
    // How to call the RAG endpoint under test and where to find the answer
    // and sources in its JSON response (dot paths; `[]` maps over arrays).
    EndpointConfigSchema = z.object({
      url: z.string().url(),
      method: z.enum(["GET", "POST"]).default("POST"),
      headers: z.record(z.string(), z.string()).optional(),
      body: z.record(z.string(), z.unknown()).optional(),
      responsePaths: z.object({
        answer: z.string(),
        sources: z.string(),
        sourceContents: z.string().optional()
      }),
      timeoutMs: z.number().int().positive().default(3e4)
    });
    // Judge selection; the transform fills in a provider-specific default
    // model when none is configured.
    JudgeConfigSchema = z.object({
      provider: z.enum(["claude", "openai"]).default("claude"),
      model: z.string().optional()
    }).transform((data) => {
      const defaultModel = data.provider === "openai" ? "gpt-4o-mini" : "claude-sonnet-4-6";
      return {
        provider: data.provider,
        model: data.model ?? defaultModel
      };
    });
    // Shape of the judge LLM's JSON reply; null marks "not assessable".
    JudgeOutputJSONSchema = z.object({
      faithfulness: z.union([z.number().min(0).max(1), z.null()]),
      correctness: z.union([z.number().min(0).max(1), z.null()]),
      rationale: z.string().min(1).max(2e3)
    });
    // Scoring weights must sum to 1 within a 0.01 float tolerance.
    ScoringConfigSchema = z.object({
      retrievalK: z.number().int().positive().default(5),
      weights: z.object({
        retrieval: z.number().min(0).max(1).default(0.4),
        faithfulness: z.number().min(0).max(1).default(0.3),
        correctness: z.number().min(0).max(1).default(0.3)
      }).refine(
        (w) => Math.abs(w.retrieval + w.faithfulness + w.correctness - 1) < 0.01,
        { message: "Scoring weights must sum to 1.0" }
      )
    });
    // Top-level config: endpoint required, judge optional, scoring defaulted.
    ConfigSchema = z.object({
      endpoint: EndpointConfigSchema,
      judge: JudgeConfigSchema.optional(),
      scoring: ScoringConfigSchema.default({
        retrievalK: 5,
        weights: { retrieval: 0.4, faithfulness: 0.3, correctness: 0.3 }
      })
    });
  }
});
82
+
83
+ // src/config/loader.ts
84
+ import { readFileSync } from "fs";
85
+ import { resolve } from "path";
86
// Reads, JSON-parses, and schema-validates the config file at `path2`
// (resolved against the CWD). Throws ConfigError with a stage-specific
// source tag ("load" | "parse" | "validate") on any failure.
function loadConfig(path2) {
  const absPath = resolve(process.cwd(), path2);
  let raw;
  try {
    raw = readFileSync(absPath, "utf-8");
  } catch {
    throw new ConfigError(`Could not read config file: ${absPath}`, "load");
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    throw new ConfigError(`Config is not valid JSON: ${e.message}`, "parse");
  }
  const validation = ConfigSchema.safeParse(parsed);
  if (validation.success) {
    return validation.data;
  }
  const issueLines = validation.error.issues
    .map((issue) => ` \u2022 ${issue.path.join(".")}: ${issue.message}`)
    .join("\n");
  throw new ConfigError(`Config validation failed:\n${issueLines}`, "validate");
}
111
// Loads a JSONL eval-set: one EvalEntry JSON object per non-blank line.
// Throws ConfigError on unreadable file, empty file, bad JSON, or schema
// violations. Error messages cite the real 1-based file line number.
function loadEvalSet(path2) {
  const absPath = resolve(process.cwd(), path2);
  let raw;
  try {
    raw = readFileSync(absPath, "utf-8");
  } catch {
    throw new ConfigError(`Could not read eval-set: ${absPath}`, "load-evalset");
  }
  // Record original line numbers BEFORE filtering blanks: the previous code
  // numbered only the non-blank lines, so any blank line made every later
  // error message point at the wrong line of the file.
  const lines = raw
    .split("\n")
    .map((text, idx) => ({ text, lineNo: idx + 1 }))
    .filter((l) => l.text.trim().length > 0);
  if (lines.length === 0) {
    throw new ConfigError(`Eval-set is empty: ${absPath}`, "evalset-empty");
  }
  const entries = [];
  for (const { text, lineNo } of lines) {
    let parsed;
    try {
      parsed = JSON.parse(text);
    } catch (e) {
      throw new ConfigError(
        `Line ${lineNo} is not valid JSON: ${e.message}`,
        "evalset-parse"
      );
    }
    const result = EvalEntrySchema.safeParse(parsed);
    if (!result.success) {
      const issues = result.error.issues.map((iss) => `${iss.path.join(".")}: ${iss.message}`).join(", ");
      throw new ConfigError(`Line ${lineNo} invalid: ${issues}`, "evalset-validate");
    }
    entries.push(result.data);
  }
  return entries;
}
143
var ConfigError;
var init_loader = __esm({
  "src/config/loader.ts"() {
    "use strict";
    init_esm_shims();
    init_schema();
    // Error for config/eval-set loading failures; `source` tags which stage
    // failed (load, parse, validate, evalset-*) and is prefixed into the
    // message for easy grepping.
    ConfigError = class extends Error {
      constructor(message, source) {
        super(`[config:${source}] ${message}`);
        this.source = source;
        this.name = "ConfigError";
      }
      source;
    };
  }
});
159
+
160
+ // src/adapters/endpoint.ts
161
// Deep-copies `template`, replacing the literal placeholders {{question}}
// and {{id}} in every string with values from `entry`. Arrays and plain
// objects are walked recursively; all other values pass through untouched.
function substitutePlaceholders(template, entry) {
  if (typeof template === "string") {
    return template
      .replaceAll("{{question}}", entry.question)
      .replaceAll("{{id}}", entry.id);
  }
  if (Array.isArray(template)) {
    return template.map((element) => substitutePlaceholders(element, entry));
  }
  if (template !== null && typeof template === "object") {
    const substituted = {};
    for (const key of Object.keys(template)) {
      substituted[key] = substitutePlaceholders(template[key], entry);
    }
    return substituted;
  }
  return template;
}
177
// Resolves a dot-separated path against `obj`. A segment of the form
// "name[]" requires `name` to be an array and maps the remaining path over
// each element (or returns the array itself when the path ends there).
// Returns undefined whenever a step cannot be resolved.
function getPath(obj, path2) {
  const segments = path2.split(".");
  let node = obj;
  for (let idx = 0; idx < segments.length; idx++) {
    if (node == null) return void 0;
    const segment = segments[idx];
    const arrayMatch = segment.match(/^(.+?)\[\]$/);
    if (arrayMatch) {
      const arr = node[arrayMatch[1]];
      if (!Array.isArray(arr)) return void 0;
      const rest = segments.slice(idx + 1).join(".");
      return rest ? arr.map((item) => getPath(item, rest)) : arr;
    }
    node = node[segment];
  }
  return node;
}
195
// Calls the RAG endpoint for one eval entry and extracts the answer and
// retrieved sources from its JSON response via the configured dot paths.
// Enforces config.timeoutMs with an AbortController; throws EndpointError
// for timeouts, network failures, non-2xx statuses, non-JSON bodies, and
// response paths that resolve to the wrong shape.
async function callEndpoint(config, entry) {
  const start = Date.now();
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), config.timeoutMs);
  let response;
  try {
    const init = {
      method: config.method,
      headers: config.headers,
      signal: controller.signal
    };
    // Placeholders ({{question}}/{{id}}) are substituted into the body
    // template for POST only; GET requests carry no body.
    if (config.method === "POST" && config.body) {
      const body = substitutePlaceholders(config.body, entry);
      init.body = JSON.stringify(body);
    }
    response = await fetch(config.url, init);
  } catch (e) {
    clearTimeout(timeout);
    // AbortError here means our own timeout fired, not a caller cancel.
    if (e.name === "AbortError") {
      throw new EndpointError(`Request timed out after ${config.timeoutMs}ms`);
    }
    throw new EndpointError(`Network error: ${e.message}`);
  }
  clearTimeout(timeout);
  if (!response.ok) {
    throw new EndpointError(
      `Endpoint returned ${response.status}: ${response.statusText}`,
      response.status
    );
  }
  let raw;
  try {
    raw = await response.json();
  } catch {
    throw new EndpointError(`Endpoint response is not valid JSON`);
  }
  const answerRaw = getPath(raw, config.responsePaths.answer);
  const sourcesRaw = getPath(raw, config.responsePaths.sources);
  if (typeof answerRaw !== "string") {
    throw new EndpointError(
      `Response path "${config.responsePaths.answer}" did not resolve to a string`
    );
  }
  // Sources: array is coerced element-wise to strings, a bare string is
  // wrapped, missing (undefined/null) yields []; anything else is an error.
  let sources = [];
  if (Array.isArray(sourcesRaw)) {
    sources = sourcesRaw.map((s) => String(s));
  } else if (typeof sourcesRaw === "string") {
    sources = [sourcesRaw];
  } else if (sourcesRaw !== void 0 && sourcesRaw !== null) {
    throw new EndpointError(
      `Response path "${config.responsePaths.sources}" did not resolve to array or string`
    );
  }
  // Optional full-text contents of the retrieved sources, same coercion
  // rules as `sources`; left undefined when not configured or absent.
  let sourceContents;
  if (config.responsePaths.sourceContents) {
    const contentsRaw = getPath(raw, config.responsePaths.sourceContents);
    if (Array.isArray(contentsRaw)) {
      sourceContents = contentsRaw.map((c) => String(c));
    } else if (typeof contentsRaw === "string") {
      sourceContents = [contentsRaw];
    } else if (contentsRaw !== void 0 && contentsRaw !== null) {
      throw new EndpointError(
        `Response path "${config.responsePaths.sourceContents}" did not resolve to array or string`
      );
    }
  }
  return {
    answer: answerRaw,
    sources,
    sourceContents,
    raw,
    latencyMs: Date.now() - start
  };
}
269
var EndpointError;
var init_endpoint = __esm({
  "src/adapters/endpoint.ts"() {
    "use strict";
    init_esm_shims();
    // Error for transport/HTTP/response-shape failures when calling the RAG
    // endpoint; statusCode is set only for non-2xx HTTP responses.
    EndpointError = class extends Error {
      constructor(message, statusCode) {
        super(message);
        this.statusCode = statusCode;
        this.name = "EndpointError";
      }
      statusCode;
    };
  }
});
284
+
285
+ // src/core/scorer.ts
286
// Scores retrieval quality for one entry: precision of the top-k returned
// sources against the expected source(s), plus a boolean `found` flag set
// when any expected source appears in the top k.
function scoreRetrieval(entry, response, k) {
  const expected = Array.isArray(entry.expected_source)
    ? entry.expected_source
    : [entry.expected_source];
  const topK = response.sources.slice(0, k);
  const expectedSet = new Set(expected);
  const hits = topK.filter((src) => expectedSet.has(src)).length;
  return {
    precision: topK.length > 0 ? hits / topK.length : 0,
    found: topK.some((src) => expectedSet.has(src)),
    topKSources: topK,
    expectedSources: expected
  };
}
302
// True when n is a real (non-NaN) number; NaN is the internal marker for
// "judge score unavailable". Number.isNaN avoids the coercing global isNaN.
function isValidScore(n) {
  return typeof n === "number" && !Number.isNaN(n);
}
305
// Combines the retrieval score with available judge scores into a single
// weighted average. Retrieval scores 1 when an expected source was found,
// else its precision (0 when retrieval is absent). NaN judge scores are
// excluded and the remaining weights are renormalized; with no valid judge
// scores the bare retrieval score is returned.
function computeOverallScore(retrieval, judge, weights) {
  const valid = (n) => typeof n === "number" && !isNaN(n);
  const base = retrieval ? (retrieval.found ? 1 : retrieval.precision) : 0;
  if (!judge || (!valid(judge.faithfulness) && !valid(judge.correctness))) {
    return base;
  }
  const parts = [[base, weights.retrieval]];
  if (valid(judge.faithfulness)) {
    parts.push([judge.faithfulness, weights.faithfulness]);
  }
  if (valid(judge.correctness)) {
    parts.push([judge.correctness, weights.correctness]);
  }
  const totalWeight = parts.reduce((sum, [, w]) => sum + w, 0);
  const weightedSum = parts.reduce((sum, [v, w]) => sum + v * w, 0);
  return totalWeight > 0 ? weightedSum / totalWeight : base;
}
323
// Scorer module initializer; no module-level state beyond the ESM shims.
var init_scorer = __esm({
  "src/core/scorer.ts"() {
    "use strict";
    init_esm_shims();
  }
});
329
+
330
+ // src/providers/prompts.ts
331
+ function buildJudgeUserPrompt(input) {
332
+ const looksLikeIds = input.retrievedContext.length > 0 && input.retrievedContext.every((c) => c.length < 80 && !c.includes(" "));
333
+ const contextBlock = input.retrievedContext.length > 0 ? input.retrievedContext.map((c, i) => `[${i + 1}] ${c}`).join("\n\n") : "(no context retrieved)";
334
+ const contextNote = looksLikeIds ? "\n\n(NOTE: The retrieved context appears to be document IDs, not full text. Faithfulness cannot be reliably assessed. Return null for faithfulness.)" : "";
335
+ const expectedBlock = input.expectedAnswer ? `
336
+
337
+ Expected answer (ground truth):
338
+ ${input.expectedAnswer}` : "\n\n(No expected_answer provided \u2014 return null for correctness.)";
339
+ return `Question:
340
+ ${input.question}
341
+
342
+ Retrieved context:
343
+ ${contextBlock}${contextNote}
344
+
345
+ RAG-generated answer:
346
+ ${input.answer}${expectedBlock}
347
+
348
+ Now score and return JSON.`;
349
+ }
350
var JUDGE_SYSTEM_PROMPT;
var init_prompts = __esm({
  "src/providers/prompts.ts"() {
    "use strict";
    init_esm_shims();
    // Shared system prompt for both judge providers; defines the two scored
    // dimensions and the exact JSON reply shape the parsers expect.
    JUDGE_SYSTEM_PROMPT = `You are a strict evaluator of RAG (retrieval-augmented generation) pipelines.

Your job: given a question, a RAG-generated answer, and the retrieved context the answer was supposed to be grounded in, score two dimensions:

1. FAITHFULNESS (0.0 to 1.0 or null): Is the answer supported by the retrieved context? Penalize claims that are not in the context. A score of 1.0 means every claim in the answer is grounded in the context. A score of 0.0 means the answer is entirely hallucinated. Return null if the context is uninterpretable (e.g., only opaque IDs, not readable text).

2. CORRECTNESS (0.0 to 1.0 or null): Does the answer match the expected answer in substance? Be lenient on phrasing, strict on facts. If no expected_answer is provided, return null for this field. A score of 1.0 means semantically equivalent. A score of 0.0 means contradictory or unrelated.

Return ONLY valid JSON in this exact shape, no markdown, no commentary:

{
  "faithfulness": <number 0.0-1.0 or null if context is uninterpretable>,
  "correctness": <number 0.0-1.0 or null>,
  "rationale": "<one or two sentences explaining the scores>"
}`;
  }
});
372
+
373
// src/providers/types.ts
var JudgeError;
var init_types = __esm({
  "src/providers/types.ts"() {
    "use strict";
    init_esm_shims();
    // Error for judge-provider failures. `code` is a coarse category
    // (auth | rate_limit | server | invalid | network | parse), `provider`
    // names the backend, and `cause` keeps the original thrown value.
    JudgeError = class extends Error {
      constructor(message, code, provider, cause) {
        super(`[judge:${provider}:${code}] ${message}`);
        this.code = code;
        this.provider = provider;
        this.cause = cause;
        this.name = "JudgeError";
      }
      code;
      provider;
      cause;
    };
  }
});
393
+
394
// src/providers/claude.ts
// Lazy-export shell for the claude provider module (populated by init_claude).
var claude_exports = {};
__export(claude_exports, {
  claudeJudge: () => claudeJudge
});
import Anthropic from "@anthropic-ai/sdk";
400
// Builds an Anthropic client from ANTHROPIC_API_KEY, failing fast with an
// auth-coded JudgeError when the key is missing from the environment.
function getClient() {
  const apiKey = process.env["ANTHROPIC_API_KEY"];
  if (apiKey) {
    return new Anthropic({ apiKey });
  }
  throw new JudgeError(
    "ANTHROPIC_API_KEY not set in environment",
    "auth",
    "claude"
  );
}
411
// Normalizes any value thrown by the Anthropic SDK into a JudgeError whose
// code is derived from the HTTP status when one is present; statusless
// failures are classified as network errors. Existing JudgeErrors pass through.
function mapError(e) {
  if (e instanceof JudgeError) return e;
  const err = e;
  const status = err.status;
  if (status === 401 || status === 403) {
    return new JudgeError("Authentication failed", "auth", "claude", e);
  }
  if (status === 429) {
    return new JudgeError("Rate limit exceeded", "rate_limit", "claude", e);
  }
  if (status >= 500) {
    return new JudgeError("Provider server error", "server", "claude", e);
  }
  if (status >= 400) {
    return new JudgeError(err.message ?? "Invalid request", "invalid", "claude", e);
  }
  return new JudgeError(err.message ?? "Network error", "network", "claude", e);
}
428
// Extracts and parses the JSON object from a judge LLM reply, tolerating a
// surrounding markdown code fence and stray commentary around the braces.
// Throws (from JSON.parse) when no valid object can be recovered.
function extractJSON(text) {
  let candidate = text.trim();
  // Strip a ``` / ```json fence if the model wrapped its output in one.
  if (candidate.startsWith("```")) {
    candidate = candidate
      .replace(/^```(?:json)?\s*\n?/, "")
      .replace(/\n?```\s*$/, "");
  }
  // Narrow to the outermost {...} span to drop leading/trailing prose.
  const open = candidate.indexOf("{");
  const close = candidate.lastIndexOf("}");
  if (open !== -1 && close > open) {
    candidate = candidate.slice(open, close + 1);
  }
  return JSON.parse(candidate);
}
440
var claudeJudge;
var init_claude = __esm({
  "src/providers/claude.ts"() {
    "use strict";
    init_esm_shims();
    init_prompts();
    init_schema();
    init_types();
    // Judge provider backed by the Anthropic Messages API.
    claudeJudge = {
      name: "claude",
      async judge(input, model) {
        const client = getClient();
        const userPrompt = buildJudgeUserPrompt(input);
        let response;
        try {
          response = await client.messages.create({
            model,
            max_tokens: 600,
            temperature: 0,
            // temperature 0 for (near-)deterministic scoring
            system: JUDGE_SYSTEM_PROMPT,
            messages: [{ role: "user", content: userPrompt }]
          });
        } catch (e) {
          throw mapError(e);
        }
        // Concatenate all text blocks of the reply before JSON extraction.
        const text = response.content.filter((b) => b.type === "text").map((b) => b.text).join("");
        let parsed;
        try {
          parsed = extractJSON(text);
        } catch (e) {
          throw new JudgeError(
            `Judge response was not valid JSON: ${text.slice(0, 200)}...`,
            "parse",
            "claude",
            e
          );
        }
        const result = JudgeOutputJSONSchema.safeParse(parsed);
        if (!result.success) {
          throw new JudgeError(
            `Judge JSON failed schema validation: ${result.error.message}`,
            "parse",
            "claude"
          );
        }
        // null scores ("not assessable") surface to callers as NaN.
        return {
          faithfulness: result.data.faithfulness ?? NaN,
          correctness: result.data.correctness ?? NaN,
          rationale: result.data.rationale,
          rawResponse: response
        };
      }
    };
  }
});
495
+
496
// src/providers/openai.ts
// Lazy-export shell for the openai provider module (populated by init_openai).
var openai_exports = {};
__export(openai_exports, {
  openaiJudge: () => openaiJudge
});
import OpenAI from "openai";
502
// Builds an OpenAI client from OPENAI_API_KEY, failing fast with an
// auth-coded JudgeError when the key is missing from the environment.
function getClient2() {
  const apiKey = process.env["OPENAI_API_KEY"];
  if (apiKey) {
    return new OpenAI({ apiKey });
  }
  throw new JudgeError(
    "OPENAI_API_KEY not set in environment",
    "auth",
    "openai"
  );
}
513
// Normalizes any value thrown by the OpenAI SDK into a JudgeError whose
// code is derived from the HTTP status when one is present; statusless
// failures are classified as network errors. Existing JudgeErrors pass through.
function mapError2(e) {
  if (e instanceof JudgeError) return e;
  const err = e;
  const status = err.status;
  if (status === 401 || status === 403) {
    return new JudgeError("Authentication failed", "auth", "openai", e);
  }
  if (status === 429) {
    return new JudgeError("Rate limit exceeded", "rate_limit", "openai", e);
  }
  if (status >= 500) {
    return new JudgeError("Provider server error", "server", "openai", e);
  }
  if (status >= 400) {
    return new JudgeError(err.message ?? "Invalid request", "invalid", "openai", e);
  }
  return new JudgeError(err.message ?? "Network error", "network", "openai", e);
}
530
var OPENAI_JSON_SCHEMA, openaiJudge;
var init_openai = __esm({
  "src/providers/openai.ts"() {
    "use strict";
    init_esm_shims();
    init_prompts();
    init_schema();
    init_types();
    // JSON Schema passed to OpenAI structured outputs; mirrors
    // JudgeOutputJSONSchema so the reply parses without repair.
    OPENAI_JSON_SCHEMA = {
      type: "object",
      properties: {
        faithfulness: {
          type: ["number", "null"],
          description: "How well the answer is supported by retrieved context, null if context is uninterpretable (e.g., only IDs not text)"
        },
        correctness: {
          type: ["number", "null"],
          description: "How well the answer matches expected_answer, null if not provided"
        },
        rationale: {
          type: "string",
          description: "One or two sentences explaining the scores"
        }
      },
      required: ["faithfulness", "correctness", "rationale"],
      additionalProperties: false
    };
    // Judge provider backed by OpenAI chat completions with strict
    // json_schema response formatting.
    openaiJudge = {
      name: "openai",
      async judge(input, model) {
        const client = getClient2();
        const userPrompt = buildJudgeUserPrompt(input);
        let response;
        try {
          response = await client.chat.completions.create({
            model,
            temperature: 0,
            // temperature 0 for (near-)deterministic scoring
            max_tokens: 600,
            messages: [
              { role: "system", content: JUDGE_SYSTEM_PROMPT },
              { role: "user", content: userPrompt }
            ],
            response_format: {
              type: "json_schema",
              json_schema: {
                name: "judge_output",
                strict: true,
                schema: OPENAI_JSON_SCHEMA
              }
            }
          });
        } catch (e) {
          throw mapError2(e);
        }
        const text = response.choices[0]?.message?.content ?? "";
        if (!text) {
          throw new JudgeError("Empty response from judge", "parse", "openai");
        }
        let parsed;
        try {
          parsed = JSON.parse(text);
        } catch (e) {
          throw new JudgeError(
            `Judge response was not valid JSON: ${text.slice(0, 200)}`,
            "parse",
            "openai",
            e
          );
        }
        const result = JudgeOutputJSONSchema.safeParse(parsed);
        if (!result.success) {
          throw new JudgeError(
            `Judge JSON failed schema validation: ${result.error.message}`,
            "parse",
            "openai"
          );
        }
        // null scores ("not assessable") surface to callers as NaN.
        return {
          faithfulness: result.data.faithfulness ?? NaN,
          correctness: result.data.correctness ?? NaN,
          rationale: result.data.rationale,
          rawResponse: response
        };
      }
    };
  }
});
617
+
618
+ // src/core/judge.ts
619
// Lazily loads and returns the judge provider module named by
// config.provider; throws for any unrecognized provider value.
async function getJudgeProvider(config) {
  switch (config.provider) {
    case "claude": {
      const { claudeJudge: claudeJudge2 } = await Promise.resolve().then(() => (init_claude(), claude_exports));
      return claudeJudge2;
    }
    case "openai": {
      const { openaiJudge: openaiJudge2 } = await Promise.resolve().then(() => (init_openai(), openai_exports));
      return openaiJudge2;
    }
    default:
      throw new Error(`Unknown judge provider: ${String(config.provider)}`);
  }
}
630
// Resolves the configured judge provider and delegates one judging call.
async function runJudge(input, config) {
  const provider = await getJudgeProvider(config);
  return provider.judge(input, config.model);
}
var init_judge = __esm({
  "src/core/judge.ts"() {
    "use strict";
    init_esm_shims();
  }
});
640
+
641
+ // src/core/runner.ts
642
+ import ora from "ora";
643
// Runs the full evaluation: calls the endpoint for each entry sequentially
// (never concurrent), optionally judges each answer, scores everything, and
// returns per-entry results plus an aggregate summary. Individual failures
// never abort the run — endpoint errors become zero-scored rows and judge
// errors degrade to NaN scores.
async function runEval(opts) {
  const startTime = Date.now();
  const results = [];
  const spinner = ora({ text: "Starting evaluation...", spinner: "dots" }).start();
  for (let i = 0; i < opts.entries.length; i++) {
    const entry = opts.entries[i];
    spinner.text = `[${i + 1}/${opts.entries.length}] ${entry.id} \u2014 ${entry.question.slice(0, 50)}...`;
    let result;
    try {
      const response = await callEndpoint(opts.config.endpoint, entry);
      const retrieval = scoreRetrieval(entry, response, opts.config.scoring.retrievalK);
      let judge = null;
      if (opts.enableJudge && opts.config.judge) {
        spinner.text = `[${i + 1}/${opts.entries.length}] ${entry.id} \u2014 judging...`;
        try {
          const k = opts.config.scoring.retrievalK;
          // Prefer full source contents as judge context; fall back to the
          // (possibly ID-only) source list when contents are unavailable.
          const retrievedContext = response.sourceContents && response.sourceContents.length > 0 ? response.sourceContents.slice(0, k) : response.sources.slice(0, k);
          const judgeOut = await runJudge(
            {
              question: entry.question,
              answer: response.answer,
              retrievedContext,
              expectedAnswer: entry.expected_answer
            },
            opts.config.judge
          );
          judge = {
            faithfulness: judgeOut.faithfulness,
            correctness: judgeOut.correctness,
            rationale: judgeOut.rationale
          };
        } catch (e) {
          // Judge failure: keep the entry, mark scores NaN, record the error.
          judge = {
            faithfulness: NaN,
            correctness: NaN,
            rationale: `Judge error: ${e.message}`
          };
        }
      }
      const overallScore = computeOverallScore(retrieval, judge, opts.config.scoring.weights);
      result = {
        entry,
        response,
        error: null,
        retrieval,
        judge,
        overallScore
      };
    } catch (e) {
      // Endpoint failure: zero-scored row with the error message attached.
      const message = e instanceof EndpointError ? e.message : `Unexpected error: ${e.message}`;
      result = {
        entry,
        response: null,
        error: message,
        retrieval: null,
        judge: null,
        overallScore: 0
      };
    }
    results.push(result);
  }
  spinner.stop();
  // Aggregates: judge averages are NaN when no valid judge scores exist.
  const successful = results.filter((r) => r.error === null).length;
  const failed = results.length - successful;
  const successResults = results.filter((r) => r.retrieval !== null);
  const avgRetrievalPrecision = successResults.length > 0 ? successResults.reduce((s, r) => s + r.retrieval.precision, 0) / successResults.length : 0;
  const faithScores = results.map((r) => r.judge?.faithfulness).filter((n) => typeof n === "number" && !isNaN(n));
  const corrScores = results.map((r) => r.judge?.correctness).filter((n) => typeof n === "number" && !isNaN(n));
  const avgFaithfulness = faithScores.length > 0 ? faithScores.reduce((s, n) => s + n, 0) / faithScores.length : NaN;
  const avgCorrectness = corrScores.length > 0 ? corrScores.reduce((s, n) => s + n, 0) / corrScores.length : NaN;
  const avgOverallScore = results.reduce((s, r) => s + r.overallScore, 0) / results.length;
  const summary = {
    total: results.length,
    successful,
    failed,
    avgRetrievalPrecision,
    avgFaithfulness,
    avgCorrectness,
    avgOverallScore,
    passed: avgOverallScore >= opts.threshold,
    durationMs: Date.now() - startTime
  };
  return { results, summary };
}
var init_runner = __esm({
  "src/core/runner.ts"() {
    "use strict";
    init_esm_shims();
    init_endpoint();
    init_scorer();
    init_judge();
  }
});
736
+
737
+ // src/formatters/table.ts
738
+ import Table from "cli-table3";
739
+ import chalk from "chalk";
740
// Maps a 0-1 score to a traffic-light chalk color: green at >= 0.8,
// yellow at >= 0.5, red below.
function scoreColor(score) {
  if (score >= 0.8) return chalk.green;
  return score >= 0.5 ? chalk.yellow : chalk.red;
}
745
// Formats a score to 2 decimals with a score-dependent color, or a gray em
// dash when the score is missing (undefined) or NaN (judge unavailable).
// Number.isNaN replaces the coercing global isNaN.
function fmtScore(s) {
  if (s === void 0 || Number.isNaN(s)) return chalk.gray("\u2014");
  return scoreColor(s)(s.toFixed(2));
}
749
// Renders the per-entry results as a colored CLI table. Errored entries get
// a red status cell with dashes for all scores; successful entries show a
// retrieval check mark, judge scores, and the colored overall score.
function renderTable(results) {
  const table = new Table({
    head: [
      chalk.bold("ID"),
      chalk.bold("Question"),
      chalk.bold("Retr"),
      chalk.bold("Faith"),
      chalk.bold("Corr"),
      chalk.bold("Score"),
      chalk.bold("Status")
    ],
    colWidths: [10, 32, 8, 8, 8, 8, 20],
    wordWrap: true
  });
  for (const r of results) {
    if (r.error) {
      table.push([
        r.entry.id,
        r.entry.question.slice(0, 30),
        chalk.gray("\u2014"),
        chalk.gray("\u2014"),
        chalk.gray("\u2014"),
        chalk.gray("\u2014"),
        chalk.red(r.error.slice(0, 18))
      ]);
      continue;
    }
    const ret = r.retrieval;
    const j = r.judge;
    table.push([
      r.entry.id,
      r.entry.question.slice(0, 30),
      ret.found ? chalk.green("\u2713") : chalk.red("\u2717"),
      fmtScore(j?.faithfulness),
      fmtScore(j?.correctness),
      scoreColor(r.overallScore)(r.overallScore.toFixed(2)),
      chalk.green("ok")
    ]);
  }
  return table.toString();
}
790
// Renders the aggregate summary block: counts, averaged scores, threshold,
// duration, and a final PASSED/FAILED verdict line.
function renderSummary(summary, threshold) {
  const lines = [];
  lines.push("");
  lines.push(chalk.bold("Summary"));
  lines.push(chalk.gray("\u2500".repeat(60)));
  lines.push(`Total questions: ${summary.total}`);
  lines.push(`Successful: ${chalk.green(String(summary.successful))}`);
  // Failed row is only printed when there is something to report.
  if (summary.failed > 0) {
    lines.push(`Failed: ${chalk.red(String(summary.failed))}`);
  }
  lines.push(
    `Avg retrieval precision: ${scoreColor(summary.avgRetrievalPrecision)(summary.avgRetrievalPrecision.toFixed(3))}`
  );
  lines.push(`Avg faithfulness: ${fmtScore(summary.avgFaithfulness)}`);
  lines.push(`Avg correctness: ${fmtScore(summary.avgCorrectness)}`);
  lines.push(
    `Avg overall score: ${scoreColor(summary.avgOverallScore)(summary.avgOverallScore.toFixed(3))}`
  );
  lines.push(`Threshold: ${threshold.toFixed(2)}`);
  lines.push(`Duration: ${(summary.durationMs / 1e3).toFixed(1)}s`);
  lines.push(chalk.gray("\u2500".repeat(60)));
  lines.push(
    summary.passed ? chalk.green.bold("\u2713 PASSED") : chalk.red.bold("\u2717 FAILED \u2014 below threshold")
  );
  lines.push("");
  return lines.join("\n");
}
// Table formatter module initializer; no module-level state.
var init_table = __esm({
  "src/formatters/table.ts"() {
    "use strict";
    init_esm_shims();
  }
});
823
+
824
+ // src/formatters/csv.ts
825
// Serializes one CSV cell: nullish and NaN become "", numbers are
// stringified, and any value containing a delimiter, quote, or line break
// is quoted with doubled inner quotes (RFC 4180).
function escape(value) {
  if (value === void 0 || value === null) return "";
  if (typeof value === "number") return Number.isNaN(value) ? "" : String(value);
  const s = String(value);
  // \r added: bare carriage returns also break rows in most CSV parsers,
  // and RFC 4180 requires quoting fields that contain CR.
  if (s.includes(",") || s.includes('"') || s.includes("\n") || s.includes("\r")) {
    return `"${s.replaceAll('"', '""')}"`;
  }
  return s;
}
834
// Renders results as CSV: one header row, one row per entry, then
// `# summary,...` comment lines with the aggregate stats. Multi-valued
// source lists are joined with "|"; NaN judge averages print as "n/a".
function renderCSV(results, summary) {
  const lines = [];
  lines.push(
    [
      "id",
      "question",
      "expected_answer",
      "rag_answer",
      "expected_sources",
      "retrieved_sources",
      "retrieval_found",
      "retrieval_precision",
      "faithfulness",
      "correctness",
      "overall_score",
      "judge_rationale",
      "latency_ms",
      "error"
    ].map(escape).join(",")
  );
  for (const r of results) {
    lines.push(
      [
        escape(r.entry.id),
        escape(r.entry.question),
        escape(r.entry.expected_answer),
        escape(r.response?.answer),
        escape(
          Array.isArray(r.entry.expected_source) ? r.entry.expected_source.join("|") : r.entry.expected_source
        ),
        escape(r.response?.sources.join("|")),
        escape(r.retrieval ? r.retrieval.found ? "true" : "false" : void 0),
        escape(r.retrieval?.precision),
        escape(r.judge?.faithfulness),
        escape(r.judge?.correctness),
        escape(r.overallScore),
        escape(r.judge?.rationale),
        escape(r.response?.latencyMs),
        escape(r.error ?? void 0)
      ].join(",")
    );
  }
  lines.push("");
  lines.push(
    `# summary,total=${summary.total},successful=${summary.successful},failed=${summary.failed}`
  );
  lines.push(`# summary,avg_retrieval_precision=${summary.avgRetrievalPrecision.toFixed(4)}`);
  lines.push(
    `# summary,avg_faithfulness=${isNaN(summary.avgFaithfulness) ? "n/a" : summary.avgFaithfulness.toFixed(4)}`
  );
  lines.push(
    `# summary,avg_correctness=${isNaN(summary.avgCorrectness) ? "n/a" : summary.avgCorrectness.toFixed(4)}`
  );
  lines.push(`# summary,avg_overall_score=${summary.avgOverallScore.toFixed(4)}`);
  lines.push(`# summary,passed=${summary.passed}`);
  lines.push(`# summary,duration_ms=${summary.durationMs}`);
  return lines.join("\n");
}
// CSV formatter module initializer; no module-level state.
var init_csv = __esm({
  "src/formatters/csv.ts"() {
    "use strict";
    init_esm_shims();
  }
});
898
+
899
+ // src/formatters/json.ts
900
// Rounds to 4 decimal places for stable, compact JSON output.
function round4(n) {
  const scaled = n * 1e4;
  return Math.round(scaled) / 1e4;
}
903
// Maps undefined/NaN to null (JSON-safe — NaN would serialize as null
// anyway, and undefined would drop the key); otherwise applies the optional
// formatter (e.g. round4). Number.isNaN replaces the coercing global isNaN.
function numOrNull(v, formatter) {
  if (v === void 0) return null;
  if (Number.isNaN(v)) return null;
  return formatter ? formatter(v) : v;
}
908
/**
 * Build the JSON report object: a run-level summary plus one record per
 * evaluated question. Missing response/judge data is rendered as null so the
 * output schema stays stable across partial failures.
 */
function renderJSON(results, summary) {
  const summaryOut = {
    total: summary.total,
    successful: summary.successful,
    failed: summary.failed,
    avgRetrievalPrecision: round4(summary.avgRetrievalPrecision),
    // Judge averages may be NaN when the judge was disabled -> null.
    avgFaithfulness: numOrNull(summary.avgFaithfulness, round4),
    avgCorrectness: numOrNull(summary.avgCorrectness, round4),
    avgOverallScore: round4(summary.avgOverallScore),
    passed: summary.passed,
    durationMs: summary.durationMs,
    timestamp: new Date().toISOString()
  };
  const toRecord = (r) => {
    const { entry, response, retrieval, judge } = r;
    // expected_source may be a single string or an array; normalize to array.
    const expectedSources = Array.isArray(entry.expected_source)
      ? entry.expected_source
      : [entry.expected_source];
    return {
      id: entry.id,
      question: entry.question,
      expectedAnswer: entry.expected_answer,
      ragAnswer: response?.answer ?? null,
      expectedSources,
      retrievedSources: response?.sources ?? null,
      retrieval: retrieval
        ? { found: retrieval.found, precision: round4(retrieval.precision) }
        : null,
      judge: judge
        ? {
            faithfulness: numOrNull(judge.faithfulness, round4),
            correctness: numOrNull(judge.correctness, round4),
            rationale: judge.rationale ?? ""
          }
        : null,
      overallScore: round4(r.overallScore),
      latencyMs: response?.latencyMs ?? null,
      error: r.error
    };
  };
  return { summary: summaryOut, results: results.map(toRecord) };
}
941
// Bundler (tsup/esbuild) lazy-init registration for src/formatters/json.ts.
// Same pattern as the other init_* wrappers: __esm runs the module body once
// and memoizes it.
var init_json = __esm({
  "src/formatters/json.ts"() {
    "use strict";
    init_esm_shims();
  }
});
947
+
948
// src/commands/run.ts
// Bundler module glue: run_exports is this module's namespace object;
// __export installs a lazy getter so `runCommand` is resolved at access time
// (required for the dynamic import in the CLI's action handler below).
var run_exports = {};
__export(run_exports, {
  runCommand: () => runCommand
});
import { mkdirSync, writeFileSync } from "fs";
import { join } from "path";
import chalk2 from "chalk";
956
/**
 * Execute the `run` subcommand end-to-end: load config and eval set, run the
 * evaluation, print a results table and summary, and write CSV/JSON reports.
 *
 * Returned exit codes (caller calls process.exit):
 *   0 = evaluation passed the threshold
 *   1 = evaluation ran but scored below the threshold
 *   2 = usage/config error (bad threshold, bad --judge value, ConfigError)
 *   3 = unexpected error
 *
 * @param {{config: string, questions?: string, judge?: string|boolean,
 *          output: string, threshold: string}} opts - commander options
 * @returns {Promise<number>} exit code
 */
async function runCommand(opts) {
  try {
    const config = loadConfig(opts.config);
    const questionsPath = opts.questions ?? "eval-set.jsonl";
    const entries = loadEvalSet(questionsPath);
    // Fix: non-coercing Number.parseFloat / Number.isNaN instead of globals.
    const threshold = Number.parseFloat(opts.threshold);
    if (Number.isNaN(threshold) || threshold < 0 || threshold > 1) {
      console.error(chalk2.red(`Invalid threshold: ${opts.threshold} (must be 0-1)`));
      return 2;
    }
    // commander sets opts.judge === false for --no-judge; a string means a
    // provider override from -j/--judge.
    const enableJudge = opts.judge !== false;
    // NOTE(review): the `config.judge` guard means a -j override is silently
    // ignored when the config file has no judge section — confirm intended.
    if (enableJudge && typeof opts.judge === "string" && config.judge) {
      if (opts.judge !== "claude" && opts.judge !== "openai") {
        console.error(chalk2.red(`Invalid --judge value: "${opts.judge}" (use claude|openai)`));
        return 2;
      }
      config.judge = {
        provider: opts.judge,
        model: opts.judge === "openai" ? "gpt-4o-mini" : "claude-sonnet-4-6"
      };
    }
    console.log(chalk2.gray(`Loaded ${entries.length} questions from ${questionsPath}`));
    console.log(chalk2.gray(`Endpoint: ${config.endpoint.url}`));
    if (enableJudge && config.judge) {
      console.log(chalk2.gray(`Judge: ${config.judge.provider} (${config.judge.model})`));
    } else {
      console.log(chalk2.gray("Judge: disabled (retrieval-only)"));
    }
    console.log("");
    const { results, summary } = await runEval({ config, entries, threshold, enableJudge });
    console.log(renderTable(results));
    console.log(renderSummary(summary, threshold));
    // Report writing is best-effort: a failure warns but does not change the
    // exit code, since the evaluation itself already completed.
    try {
      mkdirSync(opts.output, { recursive: true });
      // Filesystem-safe timestamp: ':' and '.' are invalid on some platforms.
      const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
      const csvPath = join(opts.output, `eval-${timestamp}.csv`);
      writeFileSync(csvPath, renderCSV(results, summary));
      console.log(chalk2.gray(`CSV report: ${csvPath}`));
      const jsonPath = join(opts.output, `eval-${timestamp}.json`);
      writeFileSync(jsonPath, JSON.stringify(renderJSON(results, summary), null, 2));
      console.log(chalk2.gray(`JSON report: ${jsonPath}`));
    } catch (e) {
      // Fix: a non-Error throw previously printed "undefined" as the message.
      const msg = e instanceof Error ? e.message : String(e);
      console.error(chalk2.yellow(`Warning: failed to write reports: ${msg}`));
    }
    return summary.passed ? 0 : 1;
  } catch (e) {
    if (e instanceof ConfigError) {
      console.error(chalk2.red(e.message));
      return 2;
    }
    // Fix: a non-Error throw previously printed "undefined" as the message.
    const msg = e instanceof Error ? e.message : String(e);
    console.error(chalk2.red(`Unexpected error: ${msg}`));
    // Fix: only dump a stack when one exists (non-Error throws have none).
    if (process.env["DEBUG"] && e instanceof Error) {
      console.error(e.stack);
    }
    return 3;
  }
}
1013
// Bundler (tsup/esbuild) lazy-init registration for src/commands/run.ts.
// Running init_run() initializes this module's dependencies (config loader,
// eval runner, and the three report formatters) exactly once.
var init_run = __esm({
  "src/commands/run.ts"() {
    "use strict";
    init_esm_shims();
    init_loader();
    init_runner();
    init_table();
    init_csv();
    init_json();
  }
});
1024
+
1025
// src/index.ts
// CLI entry point: defines the `rag-eval` program and its `run` subcommand,
// then parses process.argv.
init_esm_shims();
import { Command } from "commander";
import { readFileSync as readFileSync2 } from "fs";
import { fileURLToPath as fileURLToPath2 } from "url";
import { dirname, join as join2 } from "path";
// Resolve this module's directory (ESM has no __dirname) so package.json can
// be located relative to the built dist/ file.
var __filename2 = fileURLToPath2(import.meta.url);
var __dirname2 = dirname(__filename2);
// Read the CLI version from package.json, one level above dist/.
var pkg = JSON.parse(
  readFileSync2(join2(__dirname2, "..", "package.json"), "utf-8")
);
var program = new Command();
program.name("rag-eval").description("Evaluate RAG pipelines: retrieval, faithfulness, correctness.").version(pkg.version);
program.command("run").description("Run evaluation against a RAG endpoint").option("-c, --config <path>", "config file path", "rag-eval.config.json").option("-q, --questions <path>", "eval-set JSONL file").option("-j, --judge <provider>", "judge LLM provider: claude|openai (overrides config)").option("--no-judge", "skip judge LLM (retrieval scoring only, no API costs)").option("-o, --output <dir>", "output directory for reports", "./rag-eval-output").option("--threshold <number>", "min score to exit 0 (0-1)", "0.7").action(async (opts) => {
  // Lazy-load the run command module (bundler dynamic-import shape: run the
  // module initializer, then read from its namespace object) and exit with
  // the status code it returns.
  const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_run(), run_exports));
  const code = await runCommand2(opts);
  process.exit(code);
});
program.parse();