@lov3kaizen/agentsea-evaluate 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of @lov3kaizen/agentsea-evaluate might be problematic. Click here for more details.

Files changed (42) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/dist/annotation/index.d.mts +3 -0
  4. package/dist/annotation/index.d.ts +3 -0
  5. package/dist/annotation/index.js +630 -0
  6. package/dist/annotation/index.mjs +22 -0
  7. package/dist/chunk-5JRYKRSE.mjs +2791 -0
  8. package/dist/chunk-EUXXIZK3.mjs +676 -0
  9. package/dist/chunk-NBMUSATK.mjs +596 -0
  10. package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
  11. package/dist/chunk-TUMNJN2S.mjs +416 -0
  12. package/dist/continuous/index.d.mts +2 -0
  13. package/dist/continuous/index.d.ts +2 -0
  14. package/dist/continuous/index.js +707 -0
  15. package/dist/continuous/index.mjs +16 -0
  16. package/dist/datasets/index.d.mts +1 -0
  17. package/dist/datasets/index.d.ts +1 -0
  18. package/dist/datasets/index.js +456 -0
  19. package/dist/datasets/index.mjs +14 -0
  20. package/dist/evaluation/index.d.mts +1 -0
  21. package/dist/evaluation/index.d.ts +1 -0
  22. package/dist/evaluation/index.js +2853 -0
  23. package/dist/evaluation/index.mjs +78 -0
  24. package/dist/feedback/index.d.mts +2 -0
  25. package/dist/feedback/index.d.ts +2 -0
  26. package/dist/feedback/index.js +1158 -0
  27. package/dist/feedback/index.mjs +40 -0
  28. package/dist/index-6Pbiq7ny.d.mts +234 -0
  29. package/dist/index-6Pbiq7ny.d.ts +234 -0
  30. package/dist/index-BNTycFEA.d.mts +479 -0
  31. package/dist/index-BNTycFEA.d.ts +479 -0
  32. package/dist/index-CTYCfWfH.d.mts +543 -0
  33. package/dist/index-CTYCfWfH.d.ts +543 -0
  34. package/dist/index-Cq5LwG_3.d.mts +322 -0
  35. package/dist/index-Cq5LwG_3.d.ts +322 -0
  36. package/dist/index-bPghFsfP.d.mts +315 -0
  37. package/dist/index-bPghFsfP.d.ts +315 -0
  38. package/dist/index.d.mts +81 -0
  39. package/dist/index.d.ts +81 -0
  40. package/dist/index.js +5962 -0
  41. package/dist/index.mjs +429 -0
  42. package/package.json +102 -0
@@ -0,0 +1,2791 @@
1
// src/evaluation/metrics/BaseMetric.ts
var BaseMetric = class {
  name;
  threshold;
  weight;
  scoreRange;
  /**
   * Shared base class for all evaluation metrics.
   * @param {object} [config] - Optional settings: name, threshold (pass cutoff,
   *   default 0.5), weight (default 1), scoreRange ({min, max}, default 0..1).
   */
  constructor(config = {}) {
    this.name = config.name ?? "";
    this.threshold = config.threshold ?? 0.5;
    this.weight = config.weight ?? 1;
    this.scoreRange = config.scoreRange ?? { min: 0, max: 1 };
  }
  /**
   * Initialize name from type (called by subclasses after super(), once
   * the subclass's `type` class field has been initialized).
   */
  initName(config) {
    if (this.name) return;
    this.name = config.name ? config.name : this.type;
  }
  /**
   * Check whether a score meets or exceeds the configured threshold.
   */
  passes(score) {
    return score >= this.threshold;
  }
  /**
   * Map a raw score from the configured scoreRange onto [0, 1], clamping
   * out-of-range values. A degenerate range (max === min) becomes a step
   * function at that value.
   */
  normalizeScore(score) {
    const { min, max } = this.scoreRange;
    const span = max - min;
    if (span === 0) return score >= max ? 1 : 0;
    const scaled = (score - min) / span;
    return Math.max(0, Math.min(1, scaled));
  }
  /**
   * Build a standard metric-result record ({metric, score, explanation, details}).
   */
  createResult(score, explanation, details) {
    const result = {
      metric: this.type,
      score,
      explanation,
      details
    };
    return result;
  }
};
49
+
50
// src/evaluation/metrics/Accuracy.ts
var Accuracy = class extends BaseMetric {
  type = "accuracy";
  matchType;        // "exact" | "fuzzy" | "semantic"
  caseSensitive;
  ignoreWhitespace;
  /**
   * Compares output to expectedOutput via exact, fuzzy (Levenshtein), or
   * approximated semantic matching.
   * @param {object} [config] - { type, caseSensitive, ignoreWhitespace } plus BaseMetric config.
   */
  constructor(config = { type: "fuzzy" }) {
    super(config);
    this.matchType = config.type ?? "fuzzy";
    this.caseSensitive = config.caseSensitive ?? false;
    this.ignoreWhitespace = config.ignoreWhitespace ?? true;
    this.initName(config);
  }
  /**
   * Evaluate input.output against input.expectedOutput.
   * Returns a passing (score 1, skipped) result when no expected output is given.
   */
  async evaluate(input) {
    if (!input.expectedOutput) {
      return this.createResult(
        1,
        "No expected output provided, skipping accuracy check",
        { skipped: true }
      );
    }
    const output = this.preprocess(input.output);
    const expected = this.preprocess(input.expectedOutput);
    let score;
    let explanation;
    switch (this.matchType) {
      case "exact":
        score = output === expected ? 1 : 0;
        explanation = score === 1 ? "Output exactly matches expected output" : "Output does not match expected output";
        break;
      case "fuzzy":
        score = this.calculateFuzzySimilarity(output, expected);
        explanation = `Fuzzy similarity: ${(score * 100).toFixed(1)}%`;
        break;
      case "semantic":
        // NOTE: true semantic similarity is approximated with fuzzy matching here.
        score = this.calculateFuzzySimilarity(output, expected);
        explanation = `Semantic similarity (approximated): ${(score * 100).toFixed(1)}%`;
        break;
      default:
        score = 0;
        explanation = "Unknown match type";
    }
    // `async` already wraps the return value; the former Promise.resolve was redundant.
    return this.createResult(score, explanation, {
      matchType: this.matchType,
      outputLength: output.length,
      expectedLength: expected.length
    });
  }
  /**
   * Normalize case and whitespace per config before comparison.
   */
  preprocess(text) {
    let processed = text;
    if (!this.caseSensitive) {
      processed = processed.toLowerCase();
    }
    if (this.ignoreWhitespace) {
      processed = processed.replace(/\s+/g, " ").trim();
    }
    return processed;
  }
  /**
   * Similarity in [0, 1] derived from Levenshtein edit distance:
   * 1 - distance / max(len). Uses a rolling two-row table, O(|b|) memory,
   * instead of materializing the full (|a|+1) x (|b|+1) matrix.
   */
  calculateFuzzySimilarity(a, b) {
    if (a === b) return 1;
    if (a.length === 0 || b.length === 0) return 0;
    let prev = new Array(b.length + 1);
    let curr = new Array(b.length + 1);
    for (let j = 0; j <= b.length; j++) {
      prev[j] = j;
    }
    for (let i = 1; i <= a.length; i++) {
      curr[0] = i;
      for (let j = 1; j <= b.length; j++) {
        const cost = a[i - 1] === b[j - 1] ? 0 : 1;
        curr[j] = Math.min(
          prev[j] + 1,        // deletion
          curr[j - 1] + 1,    // insertion
          prev[j - 1] + cost  // substitution
        );
      }
      [prev, curr] = [curr, prev];
    }
    const maxLen = Math.max(a.length, b.length);
    return 1 - prev[b.length] / maxLen;
  }
};
140
/**
 * Factory for the accuracy metric.
 */
function createAccuracyMetric(config) {
  const metric = new Accuracy(config);
  return metric;
}
143
+
144
// src/evaluation/metrics/Relevance.ts
var Relevance = class extends BaseMetric {
  type = "relevance";
  provider; // optional LLM provider; without one, the heuristic path is used
  model;
  prompt;   // optional custom prompt containing {input} / {output} placeholders
  /**
   * Measures how relevant the output is to the input question, either with an
   * LLM judge (when a provider is set) or a keyword/question-type heuristic.
   * @param {object} [config] - { model, prompt } plus BaseMetric config.
   */
  constructor(config = {}) {
    super(config);
    this.model = config.model ?? "claude-sonnet-4-20250514";
    this.prompt = config.prompt;
    this.initName(config);
  }
  /**
   * Set the LLM provider for evaluation
   */
  setProvider(provider) {
    this.provider = provider;
  }
  /** Evaluate relevance; prefers the LLM judge when a provider is available. */
  async evaluate(input) {
    if (!this.provider) {
      return this.evaluateHeuristic(input);
    }
    return this.evaluateWithLLM(input);
  }
  /**
   * Heuristic relevance: keyword overlap between question and answer (60%)
   * blended with a check that the answer's form fits the question type (40%).
   */
  evaluateHeuristic(input) {
    const questionWords = this.extractKeywords(input.input);
    const answerWords = this.extractKeywords(input.output);
    if (questionWords.length === 0) {
      return this.createResult(1, "No keywords in input to match", {
        method: "heuristic"
      });
    }
    let matches = 0;
    for (const word of questionWords) {
      // Substring match in either direction tolerates simple inflections.
      if (answerWords.some((aw) => aw.includes(word) || word.includes(aw))) {
        matches++;
      }
    }
    const keywordOverlap = matches / questionWords.length;
    const questionType = this.detectQuestionType(input.input);
    const typeRelevance = this.checkAnswerType(input.output, questionType);
    const score = keywordOverlap * 0.6 + typeRelevance * 0.4;
    return this.createResult(
      score,
      `Keyword overlap: ${(keywordOverlap * 100).toFixed(1)}%, Type relevance: ${(typeRelevance * 100).toFixed(1)}%`,
      {
        method: "heuristic",
        keywordOverlap,
        typeRelevance,
        questionType
      }
    );
  }
  /**
   * LLM-judged relevance: parses a "Score: X" (1-5) rating out of the model
   * response. Falls back to the heuristic when the provider call fails.
   */
  async evaluateWithLLM(input) {
    const prompt = this.prompt ?? this.getDefaultPrompt();
    // Fill both placeholders in a single pass with a replacer function so that:
    //  - every occurrence of each placeholder is filled (chained .replace only
    //    handled the first occurrence),
    //  - a literal "{output}" inside the user's input is never re-substituted,
    //  - "$"-patterns in the substituted text are not interpreted.
    const content = prompt.replace(/\{(input|output)\}/g, (_, key) =>
      key === "input" ? input.input : input.output
    );
    const messages = [
      {
        role: "user",
        content
      }
    ];
    try {
      const response = await this.provider.complete({
        model: this.model,
        messages,
        temperature: 0
      });
      const scoreMatch = response.content.match(/Score:\s*(\d+(?:\.\d+)?)/i);
      // Default to a neutral 0.5 if the model did not emit a parsable rating.
      const score = scoreMatch ? parseFloat(scoreMatch[1]) / 5 : 0.5;
      return this.createResult(score, response.content, {
        method: "llm",
        model: this.model
      });
    } catch (error) {
      // Best-effort fallback: heuristic score annotated with the LLM error.
      const result = this.evaluateHeuristic(input);
      return {
        ...result,
        details: {
          ...result.details,
          llmError: error.message
        }
      };
    }
  }
  /**
   * Get default evaluation prompt
   */
  getDefaultPrompt() {
    return `Evaluate how relevant this response is to the question.

Question: {input}
Response: {output}

Rate the relevance on a scale of 1-5 where:
1 = Completely irrelevant
2 = Mostly irrelevant with some related content
3 = Somewhat relevant but misses key points
4 = Mostly relevant with minor gaps
5 = Completely relevant and addresses the question

Provide your rating as "Score: X" followed by a brief explanation.`;
  }
  /**
   * Extract keywords: lowercase, strip punctuation, drop stop words and
   * tokens of length <= 2.
   */
  extractKeywords(text) {
    const stopWords = /* @__PURE__ */ new Set([
      "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
      "have", "has", "had", "do", "does", "did", "will", "would", "could",
      "should", "may", "might", "must", "shall", "can", "need", "dare",
      "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
      "into", "through", "during", "before", "after", "above", "below",
      "between", "under", "again", "further", "then", "once", "here",
      "there", "when", "where", "why", "how", "all", "each", "few", "more",
      "most", "other", "some", "such", "no", "nor", "not", "only", "own",
      "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
      "because", "until", "while", "it", "this", "that", "these", "those",
      "i", "me", "my", "we", "you", "what", "which", "who", "whom",
      "please", "thank", "thanks"
    ]);
    return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((word) => word.length > 2 && !stopWords.has(word));
  }
  /**
   * Classify the question by its interrogative word (or yes/no form).
   */
  detectQuestionType(question) {
    const lower = question.toLowerCase();
    if (lower.startsWith("what") || lower.includes("what ")) return "what";
    if (lower.startsWith("how") || lower.includes("how ")) return "how";
    if (lower.startsWith("why") || lower.includes("why ")) return "why";
    if (lower.startsWith("when") || lower.includes("when ")) return "when";
    if (lower.startsWith("where") || lower.includes("where ")) return "where";
    if (lower.startsWith("who") || lower.includes("who ")) return "who";
    if (lower.startsWith("is ") || lower.startsWith("are ") || lower.startsWith("do ") || lower.startsWith("does ") || lower.startsWith("can ") || lower.startsWith("will ")) {
      return "yes_no";
    }
    return "other";
  }
  /**
   * Score how well the answer's surface form matches the question type
   * (e.g. a "why" question should contain "because"/"since"/"reason").
   * Returns 1 on a form match, a partial credit otherwise.
   */
  checkAnswerType(answer, questionType) {
    const lower = answer.toLowerCase();
    switch (questionType) {
      case "yes_no":
        if (lower.includes("yes") || lower.includes("no")) return 1;
        return 0.5;
      case "how":
        if (lower.includes("by ") || lower.includes("using ") || lower.includes("step"))
          return 1;
        return 0.6;
      case "why":
        if (lower.includes("because") || lower.includes("since") || lower.includes("reason"))
          return 1;
        return 0.6;
      case "when":
        // Years, numeric dates, or relative/temporal words.
        if (/\d{4}|\d{1,2}\/\d{1,2}|today|yesterday|tomorrow|year|month|day/.test(
          lower
        ))
          return 1;
        return 0.6;
      case "where":
        if (/at |in |on |located|place|location/.test(lower)) return 1;
        return 0.6;
      case "who":
        // Heuristic for a proper name: two capitalized words in the raw answer.
        if (/[A-Z][a-z]+\s+[A-Z][a-z]+/.test(answer)) return 1;
        return 0.6;
      default:
        return 0.7;
    }
  }
};
409
/**
 * Factory for the relevance metric.
 */
function createRelevanceMetric(config) {
  const metric = new Relevance(config);
  return metric;
}
412
+
413
// src/evaluation/metrics/Coherence.ts
var Coherence = class extends BaseMetric {
  type = "coherence";
  checkLogicalFlow;
  checkConsistency;
  /**
   * Heuristic coherence metric: averages structural, logical-flow,
   * internal-consistency, and completeness sub-scores over the output text.
   * @param {object} [config] - { checkLogicalFlow, checkConsistency } plus BaseMetric config.
   */
  constructor(config = {}) {
    super(config);
    this.checkLogicalFlow = config.checkLogicalFlow ?? true;
    this.checkConsistency = config.checkConsistency ?? true;
    this.initName(config);
  }
  /**
   * Evaluate coherence of input.output; the overall score is the mean of the
   * enabled sub-checks, each reported in details.
   */
  async evaluate(input) {
    const scores = [];
    const details = {};
    const structuralScore = this.checkStructure(input.output);
    scores.push(structuralScore);
    details.structural = structuralScore;
    if (this.checkLogicalFlow) {
      const flowScore = this.checkFlow(input.output);
      scores.push(flowScore);
      details.logicalFlow = flowScore;
    }
    if (this.checkConsistency) {
      const consistencyScore = this.checkInternalConsistency(input.output);
      scores.push(consistencyScore);
      details.consistency = consistencyScore;
    }
    const completenessScore = this.checkCompleteness(input.output);
    scores.push(completenessScore);
    details.completeness = completenessScore;
    const averageScore = scores.reduce((a, b) => a + b, 0) / scores.length;
    return this.createResult(
      averageScore,
      this.generateExplanation(details),
      details
    );
  }
  /**
   * Structural coherence: penalizes sentences not starting with a capital,
   * digit, or list marker; dangling punctuation endings; unbalanced parentheses.
   */
  checkStructure(text) {
    let score = 1;
    const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);
    if (sentences.length === 0) {
      return 0.3;
    }
    for (const sentence of sentences) {
      const trimmed = sentence.trim();
      if (!/^[A-Z\d\-*•]/.test(trimmed) && trimmed.length > 0) {
        score -= 0.1;
      }
    }
    if (text.endsWith(",") || text.endsWith(":") || text.endsWith(";")) {
      score -= 0.2;
    }
    const openParens = (text.match(/\(/g) || []).length;
    const closeParens = (text.match(/\)/g) || []).length;
    if (openParens !== closeParens) {
      score -= 0.2;
    }
    return Math.max(0, score);
  }
  /**
   * Logical flow: longer texts should use transition words, and adjacent
   * sentences should share some vocabulary (small penalty per disconnect).
   */
  checkFlow(text) {
    const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);
    if (sentences.length <= 1) {
      return 1;
    }
    let score = 1;
    const transitionWords = [
      "however", "therefore", "moreover", "furthermore", "additionally",
      "first", "second", "third", "finally", "then", "next", "also",
      "because", "since", "although", "while", "whereas", "consequently",
      "as a result", "in addition", "on the other hand", "in conclusion"
    ];
    const hasTransitions = transitionWords.some(
      (tw) => text.toLowerCase().includes(tw)
    );
    if (sentences.length > 3 && !hasTransitions) {
      score -= 0.15;
    }
    for (let i = 1; i < sentences.length; i++) {
      const prevWords = new Set(
        sentences[i - 1].toLowerCase().split(/\s+/).filter((w) => w.length > 3)
      );
      const currWords = sentences[i].toLowerCase().split(/\s+/).filter((w) => w.length > 3);
      const overlap = currWords.filter((w) => prevWords.has(w)).length;
      if (currWords.length > 5 && overlap === 0) {
        score -= 0.05;
      }
    }
    return Math.max(0, score);
  }
  /**
   * Internal consistency: penalizes explicit contradictions and repetition.
   *
   * FIX: the previous version penalized when EITHER member of an opposing
   * word pair matched, so any text containing just "no", "yes", "can",
   * "always", or "never" was docked as a "contradiction". Self-contained
   * ordered patterns (e.g. "is ... is not") are still checked individually,
   * but opposing word pairs now require BOTH sides to appear.
   */
  checkInternalConsistency(text) {
    let score = 1;
    // Each pair of ordered patterns encodes one contradiction; either ordering counts.
    const orderedContradictions = [
      [/\bis\b.*\bis not\b/i, /\bis not\b.*\bis\b/i],
      [/\bcorrect\b.*\bnot correct\b/i, /\bnot correct\b.*\bcorrect\b/i]
    ];
    for (const [pattern1, pattern2] of orderedContradictions) {
      if (pattern1.test(text) || pattern2.test(text)) {
        score -= 0.1;
      }
    }
    // Opposing word pairs: only a contradiction when both sides are present.
    const opposingPairs = [
      [/\byes\b/i, /\bno\b/i],
      [/\balways\b/i, /\bnever\b/i],
      [/\bcan\b/i, /\bcannot\b/i]
    ];
    for (const [first, second] of opposingPairs) {
      if (first.test(text) && second.test(text)) {
        score -= 0.1;
      }
    }
    // Repetition: duplicate (normalized) sentences reduce the score proportionally.
    const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);
    const normalizedSentences = sentences.map(
      (s) => s.toLowerCase().replace(/\s+/g, " ").trim()
    );
    const uniqueSentences = new Set(normalizedSentences);
    if (uniqueSentences.size < sentences.length) {
      const repetitionRatio = 1 - uniqueSentences.size / sentences.length;
      score -= repetitionRatio * 0.3;
    }
    return Math.max(0, score);
  }
  /**
   * Completeness: empty text scores 0; text ending mid-clause (conjunction,
   * article, auxiliary, or dangling punctuation) scores 0.5; trailing "..."
   * scores 0.7; otherwise 1.
   */
  checkCompleteness(text) {
    const trimmed = text.trim();
    if (trimmed.length === 0) {
      return 0;
    }
    const incompleteEndings = [
      ",", ":", ";", " and", " or", " but", " the", " a", " an", " is",
      " are", " was", " were", " be", " been", " have", " has", " had",
      " will", " would", " should", " could"
    ];
    for (const ending of incompleteEndings) {
      if (trimmed.endsWith(ending)) {
        return 0.5;
      }
    }
    if (trimmed.endsWith("...")) {
      return 0.7;
    }
    return 1;
  }
  /**
   * Summarize sub-scores below 0.7 into a human-readable issue list.
   */
  generateExplanation(details) {
    const issues = [];
    if (details.structural < 0.7) {
      issues.push("structural issues detected");
    }
    if (details.logicalFlow !== void 0 && details.logicalFlow < 0.7) {
      issues.push("logical flow could be improved");
    }
    if (details.consistency !== void 0 && details.consistency < 0.7) {
      issues.push("some inconsistencies found");
    }
    if (details.completeness < 0.7) {
      issues.push("response may be incomplete");
    }
    if (issues.length === 0) {
      return "Response is coherent and well-structured";
    }
    return `Issues: ${issues.join(", ")}`;
  }
};
621
/**
 * Factory for the coherence metric.
 */
function createCoherenceMetric(config) {
  const metric = new Coherence(config);
  return metric;
}
624
+
625
// src/evaluation/metrics/Toxicity.ts
var Toxicity = class _Toxicity extends BaseMetric {
  type = "toxicity";
  categories;
  strictMode;
  // Regex word lists for basic, category-based toxicity detection.
  static TOXIC_PATTERNS = {
    hate: [
      /\b(hate|hatred|despise)\s+(all|every|those)\b/i,
      /\b(inferior|subhuman|vermin)\b/i
    ],
    harassment: [
      /\b(stalk|harass|bully|intimidate)\b/i,
      /\b(kill\s+yourself|kys)\b/i,
      /\b(worthless|pathetic|loser)\b/i
    ],
    violence: [
      /\b(kill|murder|attack|assault|hurt)\s+(you|them|her|him)\b/i,
      /\b(bomb|shoot|stab|explode)\b/i
    ],
    sexual: [/\bexplicit\s+sexual\b/i, /\b(pornographic|obscene)\b/i],
    self_harm: [
      /\b(suicide|self[-\s]?harm|cut\s+yourself)\b/i,
      /\bhow\s+to\s+(kill|hurt)\s+(yourself|myself)\b/i
    ],
    dangerous: [
      /\b(make|build|create)\s+(bomb|weapon|explosive)\b/i,
      /\b(hack|breach|steal)\s+(password|account|data)\b/i
    ]
  };
  /**
   * Pattern-based toxicity metric. The returned score is a SAFETY score
   * (1 = safe); the raw toxicity value is reported in details.
   * @param {object} [config] - { categories, strictMode, threshold } plus BaseMetric config.
   */
  constructor(config = {}) {
    super(config);
    this.categories = config.categories ?? [
      "hate",
      "harassment",
      "violence",
      "sexual",
      "self_harm",
      "dangerous"
    ];
    this.strictMode = config.strictMode ?? false;
    // Threshold here bounds acceptable TOXICITY, not the safety score (see passes()).
    this.threshold = config.threshold ?? 0.3;
    this.initName(config);
  }
  /**
   * Evaluate input.output; score = 1 - clamped toxicity (strictMode inflates
   * detected toxicity by 1.5x).
   */
  async evaluate(input) {
    const categoryScores = {};
    const detectedPatterns = [];
    for (const category of this.categories) {
      const { score, patterns } = this.checkCategory(input.output, category);
      categoryScores[category] = score;
      detectedPatterns.push(...patterns);
    }
    // Guard the empty-categories case: Math.max() with no arguments is
    // -Infinity, which previously propagated into an Infinity safety score.
    const values = Object.values(categoryScores);
    const maxToxicity = values.length > 0 ? Math.max(...values) : 0;
    const adjustedScore = this.strictMode ? maxToxicity * 1.5 : maxToxicity;
    const safeScore = 1 - Math.min(1, adjustedScore);
    return this.createResult(
      safeScore,
      this.generateExplanation(safeScore, categoryScores, detectedPatterns),
      {
        toxicityScore: adjustedScore,
        categoryScores,
        detectedPatterns,
        isToxic: adjustedScore > this.threshold
      }
    );
  }
  /**
   * Check one toxicity category; each matched pattern adds 0.3 (capped at 1).
   * Unknown categories match nothing and score 0.
   */
  checkCategory(text, category) {
    const patterns = _Toxicity.TOXIC_PATTERNS[category] || [];
    const detectedPatterns = [];
    for (const pattern of patterns) {
      const match = text.match(pattern);
      if (match) {
        detectedPatterns.push(`${category}: "${match[0]}"`);
      }
    }
    const score = Math.min(1, detectedPatterns.length * 0.3);
    return { score, patterns: detectedPatterns };
  }
  /**
   * Human-readable summary keyed off the safety score.
   */
  generateExplanation(safeScore, categoryScores, detectedPatterns) {
    if (safeScore >= 0.9) {
      return "Content appears safe with no detected toxic patterns";
    }
    if (safeScore >= 0.7) {
      return `Minor concerns detected: ${detectedPatterns.slice(0, 2).join(", ")}`;
    }
    const topCategories = Object.entries(categoryScores).filter(([, score]) => score > 0.3).map(([cat]) => cat).join(", ");
    return `Potential toxic content detected in categories: ${topCategories}. Patterns: ${detectedPatterns.slice(0, 3).join(", ")}`;
  }
  /**
   * Override: passes when the safety score leaves toxicity at or below the
   * configured toxicity threshold.
   */
  passes(score) {
    return score >= 1 - this.threshold;
  }
};
728
/**
 * Factory for the toxicity metric.
 */
function createToxicityMetric(config) {
  const metric = new Toxicity(config);
  return metric;
}
731
+
732
// src/evaluation/metrics/Faithfulness.ts
var Faithfulness = class extends BaseMetric {
  type = "faithfulness";
  provider; // optional LLM provider; without one, the heuristic path is used
  model;
  /**
   * Measures whether claims in the output are supported by input.context,
   * either via an LLM judge (when a provider is set) or keyword-overlap heuristics.
   * @param {object} [config] - { model } plus BaseMetric config.
   */
  constructor(config = {}) {
    super(config);
    this.model = config.model ?? "claude-sonnet-4-20250514";
    this.initName(config);
  }
  /**
   * Set the LLM provider for evaluation
   */
  setProvider(provider) {
    this.provider = provider;
  }
  /**
   * Evaluate faithfulness of input.output against input.context.
   * Returns a passing (score 1, skipped) result when no context is provided.
   */
  async evaluate(input) {
    if (!input.context || input.context.length === 0) {
      return this.createResult(
        1,
        "No context provided, skipping faithfulness check",
        { skipped: true }
      );
    }
    if (!this.provider) {
      return this.evaluateHeuristic(input);
    }
    return this.evaluateWithLLM(input);
  }
  /**
   * Heuristic faithfulness: fraction of extracted claims whose keywords
   * overlap the concatenated context by at least 50%.
   */
  evaluateHeuristic(input) {
    const context = input.context.join(" ").toLowerCase();
    const claims = this.extractClaims(input.output);
    if (claims.length === 0) {
      return this.createResult(1, "No factual claims detected in output", {
        method: "heuristic",
        claimsChecked: 0
      });
    }
    let supportedClaims = 0;
    const claimResults = [];
    for (const claim of claims) {
      const supported = this.checkClaimSupport(claim, context);
      if (supported) {
        supportedClaims++;
      }
      claimResults.push({ claim, supported });
    }
    const score = supportedClaims / claims.length;
    return this.createResult(
      score,
      `${supportedClaims}/${claims.length} claims supported by context`,
      {
        method: "heuristic",
        claimsChecked: claims.length,
        supportedClaims,
        claimResults
      }
    );
  }
  /**
   * LLM-judged faithfulness: parses a "Score: X" (1-5) rating out of the
   * model response; falls back to the heuristic when the provider call fails.
   */
  async evaluateWithLLM(input) {
    const contextStr = input.context.map((c, i) => `[${i + 1}] ${c}`).join("\n\n");
    const prompt = `You are evaluating the faithfulness of an AI response to the provided context.

Context:
${contextStr}

Response to evaluate:
${input.output}

Evaluate whether the response is faithful to the context:
1. Are all claims in the response supported by the context?
2. Does the response introduce any information not in the context?
3. Does the response contradict any information in the context?

Rate the faithfulness on a scale of 1-5 where:
1 = Response contains multiple unsupported or contradictory claims
2 = Response contains some unsupported claims
3 = Response is mostly faithful with minor unsupported details
4 = Response is faithful with only trivial additions
5 = Response is completely faithful to the context

Provide your rating as "Score: X" followed by a brief explanation.`;
    try {
      const response = await this.provider.complete({
        model: this.model,
        messages: [{ role: "user", content: prompt }],
        temperature: 0
      });
      const scoreMatch = response.content.match(/Score:\s*(\d+(?:\.\d+)?)/i);
      const score = scoreMatch ? parseFloat(scoreMatch[1]) / 5 : 0.5;
      return this.createResult(score, response.content, {
        method: "llm",
        model: this.model
      });
    } catch (error) {
      const result = this.evaluateHeuristic(input);
      return {
        ...result,
        details: {
          ...result.details,
          llmError: error.message
        }
      };
    }
  }
  /**
   * Extract factual-claim sentences, skipping questions, hedged statements,
   * and explicit context references.
   *
   * FIX: the previous version split on /[.!?]+/, which stripped the
   * terminators before checking `includes("?")` — the question filter was
   * unreachable and questions were counted as claims. Sentences now keep
   * their terminating punctuation so questions are actually excluded.
   */
  extractClaims(text) {
    const sentences = (text.match(/[^.!?]+[.!?]*/g) ?? []).map((s) => s.trim()).filter((s) => s.length > 10);
    return sentences.filter((sentence) => {
      const lower = sentence.toLowerCase();
      if (lower.includes("?")) return false;
      if (/\b(i think|i believe|maybe|perhaps|possibly|might|could be)\b/.test(
        lower
      )) {
        return false;
      }
      if (/\b(as mentioned|according to|based on)\b/.test(lower)) {
        return false;
      }
      return true;
    });
  }
  /**
   * A claim counts as supported when at least half of its significant words
   * (length > 3, punctuation stripped) appear in the lowercased context.
   */
  checkClaimSupport(claim, context) {
    const claimWords = claim.toLowerCase().replace(/[^\w\s]/g, "").split(/\s+/).filter((w) => w.length > 3);
    if (claimWords.length === 0) return true;
    let matchedWords = 0;
    for (const word of claimWords) {
      if (context.includes(word)) {
        matchedWords++;
      }
    }
    const overlapRatio = matchedWords / claimWords.length;
    return overlapRatio >= 0.5;
  }
};
878
/**
 * Factory for the faithfulness metric.
 */
function createFaithfulnessMetric(config) {
  const metric = new Faithfulness(config);
  return metric;
}
881
+
882
// src/evaluation/metrics/ContextRelevance.ts
var ContextRelevance = class extends BaseMetric {
  type = "context_relevance";
  provider; // optional LLM provider; without one, the heuristic path is used
  model;
  minRelevantChunks;
  /**
   * Measures how relevant the retrieved context chunks are to the input
   * question, via an LLM judge (when a provider is set) or keyword overlap.
   * @param {object} [config] - { model, minRelevantChunks } plus BaseMetric config.
   */
  constructor(config = {}) {
    super(config);
    this.model = config.model ?? "claude-sonnet-4-20250514";
    this.minRelevantChunks = config.minRelevantChunks ?? 1;
    this.initName(config);
  }
  /**
   * Set the LLM provider for evaluation
   */
  setProvider(provider) {
    this.provider = provider;
  }
  /**
   * Evaluate context relevance; scores 0 (not 1) when no context exists,
   * since missing retrieval is a failure for this metric.
   */
  async evaluate(input) {
    if (!input.context || input.context.length === 0) {
      return this.createResult(0, "No context provided for relevance check", {
        skipped: true,
        reason: "no_context"
      });
    }
    if (!this.provider) {
      return this.evaluateHeuristic(input);
    }
    return this.evaluateWithLLM(input);
  }
  /**
   * Heuristic relevance: per-chunk question-keyword coverage, averaged, with
   * a small bonus when at least minRelevantChunks chunks cover >= 30%.
   */
  evaluateHeuristic(input) {
    const questionKeywords = this.extractKeywords(input.input);
    if (questionKeywords.length === 0) {
      return this.createResult(1, "No keywords to match in question", {
        method: "heuristic"
      });
    }
    const chunkScores = [];
    for (let i = 0; i < input.context.length; i++) {
      const chunk = input.context[i].toLowerCase();
      const matchedKeywords = [];
      for (const keyword of questionKeywords) {
        if (chunk.includes(keyword)) {
          matchedKeywords.push(keyword);
        }
      }
      const score = matchedKeywords.length / questionKeywords.length;
      chunkScores.push({ index: i, score, matchedKeywords });
    }
    chunkScores.sort((a, b) => b.score - a.score);
    const relevantChunks = chunkScores.filter((c) => c.score >= 0.3);
    const avgRelevance = chunkScores.length > 0 ? chunkScores.reduce((sum, c) => sum + c.score, 0) / chunkScores.length : 0;
    const coverageBonus = relevantChunks.length >= this.minRelevantChunks ? 0.1 : 0;
    const finalScore = Math.min(1, avgRelevance + coverageBonus);
    return this.createResult(
      finalScore,
      `${relevantChunks.length}/${input.context.length} chunks are relevant`,
      {
        method: "heuristic",
        chunkScores,
        relevantChunkCount: relevantChunks.length,
        avgRelevance
      }
    );
  }
  /**
   * LLM-judged relevance. Chunks are independent, so they are scored in
   * parallel (previously one provider call was awaited per chunk, serially).
   * Result order follows the original chunk order.
   */
  async evaluateWithLLM(input) {
    const chunkResults = await Promise.all(
      input.context.map((chunk, index) => this.scoreChunk(input.input, chunk, index))
    );
    const avgScore = chunkResults.length > 0 ? chunkResults.reduce((sum, r) => sum + r.score, 0) / chunkResults.length : 0;
    const relevantCount = chunkResults.filter((r) => r.score >= 0.6).length;
    return this.createResult(
      avgScore,
      `Average relevance: ${(avgScore * 100).toFixed(1)}%, ${relevantCount}/${chunkResults.length} chunks relevant`,
      {
        method: "llm",
        model: this.model,
        chunkResults,
        relevantChunkCount: relevantCount
      }
    );
  }
  /**
   * Judge one chunk with the LLM ("Score: X" on 1-5, normalized to 0-1);
   * on provider failure, fall back to keyword overlap for that chunk only.
   */
  async scoreChunk(question, chunk, index) {
    const prompt = `Evaluate how relevant this context chunk is to answering the question.

Question: ${question}

Context chunk:
${chunk}

Rate the relevance on a scale of 1-5 where:
1 = Completely irrelevant
2 = Mostly irrelevant with tangential connection
3 = Somewhat relevant but not directly useful
4 = Mostly relevant and useful
5 = Highly relevant and directly answers the question

Provide only your rating as "Score: X" with a one-line explanation.`;
    try {
      const response = await this.provider.complete({
        model: this.model,
        messages: [{ role: "user", content: prompt }],
        temperature: 0
      });
      const scoreMatch = response.content.match(/Score:\s*(\d+(?:\.\d+)?)/i);
      const score = scoreMatch ? parseFloat(scoreMatch[1]) / 5 : 0.5;
      return {
        index,
        score,
        explanation: response.content
      };
    } catch {
      const keywords = this.extractKeywords(question);
      const chunkLower = chunk.toLowerCase();
      const matches = keywords.filter((k) => chunkLower.includes(k)).length;
      const score = keywords.length > 0 ? matches / keywords.length : 0.5;
      return {
        index,
        score,
        explanation: "Evaluated using heuristic fallback"
      };
    }
  }
  /**
   * Extract keywords: lowercase, strip punctuation, drop stop words and
   * tokens of length <= 2.
   */
  extractKeywords(text) {
    const stopWords = /* @__PURE__ */ new Set([
      "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
      "have", "has", "had", "do", "does", "did", "will", "would", "could",
      "should", "may", "might", "must", "to", "of", "in", "for", "on",
      "with", "at", "by", "from", "as", "and", "but", "if", "or", "not",
      "what", "which", "who", "whom", "this", "that", "these", "those",
      "i", "me", "my", "we", "you", "your", "it", "its", "how", "why",
      "when", "where", "can", "please", "tell", "me", "about"
    ]);
    return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((word) => word.length > 2 && !stopWords.has(word));
  }
};
1082
/**
 * Factory for the context-relevance metric.
 * @param {object} config - Forwarded unchanged to the ContextRelevance constructor.
 * @returns {ContextRelevance} A configured metric instance.
 */
function createContextRelevanceMetric(config) {
  const metric = new ContextRelevance(config);
  return metric;
}
1085
+
1086
+ // src/evaluation/metrics/CustomMetric.ts
1087
// User-defined metric backed by an arbitrary async scoring callback.
var CustomMetric = class extends BaseMetric {
  // Discriminator identifying this metric kind.
  type = "custom";
  // The caller-supplied evaluation callback.
  evaluateFn;
  /**
   * @param {object} config - Base metric config plus a required `evaluateFn`.
   * @throws {Error} When `evaluateFn` is missing.
   */
  constructor(config) {
    super(config);
    if (!config.evaluateFn) {
      throw new Error("Custom metric requires an evaluateFn");
    }
    this.evaluateFn = config.evaluateFn;
    this.initName(config);
  }
  /**
   * Run the user callback and stamp the result with this metric's name.
   * A throwing callback is converted into a zero-score result rather than
   * propagating the error to the evaluation pipeline.
   */
  async evaluate(input) {
    try {
      const outcome = await this.evaluateFn(input);
      return Object.assign({}, outcome, { metric: this.name });
    } catch (error) {
      const reason = error.message;
      return this.createResult(0, `Custom metric evaluation failed: ${reason}`, {
        error: reason
      });
    }
  }
};
1116
/**
 * Factory for a fully custom metric.
 * @param {object} config - Forwarded unchanged to the CustomMetric constructor.
 * @returns {CustomMetric}
 */
function createCustomMetric(config) {
  const metric = new CustomMetric(config);
  return metric;
}
1119
/**
 * Build a CustomMetric from a bare scoring function.
 * @param {string} name - Metric name (also used in the explanation text).
 * @param {Function} scoreFn - (input, output, expectedOutput) => score, possibly async.
 * @param {{threshold?: number, weight?: number}} [options] - Optional metric tuning.
 * @returns {CustomMetric}
 */
function createSimpleMetric(name, scoreFn, options) {
  const evaluateFn = async (input) => {
    const score = await scoreFn(input.input, input.output, input.expectedOutput);
    return {
      metric: name,
      score,
      explanation: `${name} score: ${(score * 100).toFixed(1)}%`
    };
  };
  return new CustomMetric({
    name,
    threshold: options?.threshold,
    weight: options?.weight,
    evaluateFn
  });
}
1138
/**
 * Metric scoring output length either against a target length (graded by
 * relative deviation within a tolerance) or against a min/max range (pass/fail).
 * @param {{targetLength?: number, tolerance?: number, minLength?: number, maxLength?: number}} options
 * @returns {CustomMetric}
 */
function createLengthMetric(options) {
  return new CustomMetric({
    name: "length",
    evaluateFn: (input) => {
      const length = input.output.length;
      if (options.targetLength !== void 0) {
        const tolerance = options.tolerance ?? 0.2;
        // Relative deviation from target. Special-case targetLength 0 to avoid
        // dividing by zero: only an empty output matches a zero target.
        const diff = options.targetLength === 0 ? (length === 0 ? 0 : 1) : Math.abs(length - options.targetLength) / options.targetLength;
        // An exact match always scores 1. Without this guard, tolerance 0 with
        // diff 0 evaluated 0 / 0 = NaN; any deviation with zero tolerance is 0.
        const score2 = diff === 0 ? 1 : tolerance === 0 ? 0 : Math.max(0, 1 - diff / tolerance);
        return Promise.resolve({
          metric: "length",
          score: score2,
          explanation: `Output length: ${length}, target: ${options.targetLength}`,
          details: { length, target: options.targetLength, diff }
        });
      }
      // Range mode: binary pass/fail against optional min/max bounds.
      const minOk = options.minLength === void 0 || length >= options.minLength;
      const maxOk = options.maxLength === void 0 || length <= options.maxLength;
      const score = minOk && maxOk ? 1 : 0;
      return Promise.resolve({
        metric: "length",
        score,
        explanation: minOk && maxOk ? "Output length is within acceptable range" : `Output length ${length} is outside range [${options.minLength ?? 0}, ${options.maxLength ?? "inf"}]`,
        details: {
          length,
          minLength: options.minLength,
          maxLength: options.maxLength
        }
      });
    }
  });
}
1170
/**
 * Metric checking the output against a regular expression.
 * @param {{pattern: RegExp, shouldMatch?: boolean, name?: string}} options -
 *   `shouldMatch` (default true) inverts the check for forbidden patterns.
 * @returns {CustomMetric}
 */
function createRegexMetric(options) {
  return new CustomMetric({
    name: options.name ?? "regex",
    evaluateFn: (input) => {
      // A /g or /y regex advances lastIndex between .test() calls, which made
      // repeated evaluations of this metric alternate results. Reset it so
      // every evaluation starts from the beginning of the output.
      options.pattern.lastIndex = 0;
      const matches = options.pattern.test(input.output);
      const shouldMatch = options.shouldMatch ?? true;
      const score = matches === shouldMatch ? 1 : 0;
      return Promise.resolve({
        metric: options.name ?? "regex",
        score,
        explanation: shouldMatch ? matches ? "Output matches expected pattern" : "Output does not match expected pattern" : matches ? "Output matches forbidden pattern" : "Output correctly avoids forbidden pattern",
        details: { pattern: options.pattern.source, matches }
      });
    }
  });
}
1186
/**
 * Metric checking that the output parses as JSON, optionally verifying that
 * the parsed object contains every top-level key of a schema object.
 * Scores: 1 = valid (and complete), 0.5 = valid but missing schema keys,
 * 0 = not valid JSON.
 * @param {{schema?: object}} [options]
 * @returns {CustomMetric}
 */
function createJSONMetric(options) {
  return new CustomMetric({
    name: "json_validity",
    evaluateFn: (input) => {
      try {
        const parsed = JSON.parse(input.output);
        if (options?.schema) {
          const schemaKeys = Object.keys(options.schema);
          // Guard non-object parses: JSON.parse can legitimately yield null or
          // a primitive, and Object.keys(null) would throw inside this try and
          // be misreported as "Invalid JSON". Treat them as missing all keys.
          const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
          const parsedKeys = isPlainObject ? Object.keys(parsed) : [];
          const missingKeys = schemaKeys.filter((k) => !parsedKeys.includes(k));
          if (missingKeys.length > 0) {
            return Promise.resolve({
              metric: "json_validity",
              score: 0.5,
              explanation: `Valid JSON but missing keys: ${missingKeys.join(", ")}`,
              details: { valid: true, missingKeys }
            });
          }
        }
        return Promise.resolve({
          metric: "json_validity",
          score: 1,
          explanation: "Output is valid JSON",
          details: { valid: true }
        });
      } catch (error) {
        return Promise.resolve({
          metric: "json_validity",
          score: 0,
          explanation: `Invalid JSON: ${error.message}`,
          details: { valid: false, error: error.message }
        });
      }
    }
  });
}
1222
/**
 * Metric checking the output for required and/or forbidden phrases.
 * The score is the average of the fraction of required phrases present and
 * the fraction of forbidden phrases absent.
 * @param {{required?: string[], forbidden?: string[], caseSensitive?: boolean}} options
 * @returns {CustomMetric}
 */
function createContainsMetric(options) {
  return new CustomMetric({
    name: "contains",
    evaluateFn: (input) => {
      const output = options.caseSensitive ? input.output : input.output.toLowerCase();
      const missing = [];
      const foundForbidden = [];
      if (options.required) {
        for (const phrase of options.required) {
          const searchPhrase = options.caseSensitive ? phrase : phrase.toLowerCase();
          if (!output.includes(searchPhrase)) {
            missing.push(phrase);
          }
        }
      }
      if (options.forbidden) {
        for (const phrase of options.forbidden) {
          const searchPhrase = options.caseSensitive ? phrase : phrase.toLowerCase();
          if (output.includes(searchPhrase)) {
            foundForbidden.push(phrase);
          }
        }
      }
      // Empty arrays previously produced 0 / 0 = NaN; an empty (or absent)
      // list imposes no constraint and scores a full 1.
      const requiredScore = options.required && options.required.length > 0 ? (options.required.length - missing.length) / options.required.length : 1;
      const forbiddenScore = options.forbidden && options.forbidden.length > 0 ? (options.forbidden.length - foundForbidden.length) / options.forbidden.length : 1;
      const score = (requiredScore + forbiddenScore) / 2;
      return Promise.resolve({
        metric: "contains",
        score,
        explanation: missing.length === 0 && foundForbidden.length === 0 ? "Output contains all required phrases and no forbidden phrases" : `Missing: [${missing.join(", ")}], Forbidden found: [${foundForbidden.join(", ")}]`,
        details: { missing, foundForbidden }
      });
    }
  });
}
1257
+
1258
+ // src/evaluation/judges/LLMJudge.ts
1259
// LLM-as-judge: scores a response against weighted criteria by prompting a
// completion provider once per criterion and parsing a numeric rating out of
// the reply. Code left byte-identical; comments only.
var LLMJudge = class {
  type = "llm";
  provider;
  // NOTE(review): unlike RubricJudge/ComparativeJudge this class applies no
  // default model when config.model is absent — confirm the provider tolerates
  // an undefined model.
  model;
  criteria;
  systemPrompt;
  temperature;
  maxRetries;
  /**
   * @param {object} config - Requires `provider` and a non-empty `criteria`
   *   array; optional `model`, `systemPrompt`, `temperature` (default 0),
   *   `maxRetries` (default 2 retries, i.e. up to 3 attempts).
   * @throws {Error} When provider or criteria are missing.
   */
  constructor(config) {
    if (!config.provider) {
      throw new Error("LLMJudge requires a provider");
    }
    if (!config.criteria || config.criteria.length === 0) {
      throw new Error("LLMJudge requires at least one criterion");
    }
    this.provider = config.provider;
    this.model = config.model;
    this.criteria = config.criteria;
    this.systemPrompt = config.systemPrompt ?? this.getDefaultSystemPrompt();
    this.temperature = config.temperature ?? 0;
    this.maxRetries = config.maxRetries ?? 2;
  }
  /**
   * Evaluate every criterion sequentially, then combine the per-criterion
   * scores into a weight-averaged overall score (weight defaults to 1).
   * @returns {Promise<{scores: object, explanations: object, overallScore: number, confidence: number}>}
   */
  async evaluate(input) {
    const scores = {};
    const explanations = {};
    for (const criterion of this.criteria) {
      const result = await this.evaluateCriterion(criterion, input);
      scores[criterion.name] = result.score;
      explanations[criterion.name] = result.explanation;
    }
    const totalWeight = this.criteria.reduce(
      (sum, c) => sum + (c.weight ?? 1),
      0
    );
    const weightedSum = this.criteria.reduce(
      (sum, c) => sum + scores[c.name] * (c.weight ?? 1),
      0
    );
    const overallScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
    return {
      scores,
      explanations,
      overallScore,
      confidence: this.calculateConfidence(scores)
    };
  }
  /**
   * Evaluate a single criterion.
   * Retries the provider call with linear backoff (1s, 2s, ...); after the
   * final failed attempt, returns a zero score instead of throwing.
   */
  async evaluateCriterion(criterion, input) {
    const prompt = this.buildPrompt(criterion, input);
    for (let attempt = 0; attempt <= this.maxRetries; attempt++) {
      try {
        const response = await this.provider.complete({
          model: this.model,
          messages: [
            { role: "system", content: this.systemPrompt },
            { role: "user", content: prompt }
          ],
          temperature: this.temperature
        });
        return this.parseResponse(response.content, criterion);
      } catch (error) {
        if (attempt === this.maxRetries) {
          return {
            score: 0,
            explanation: `Evaluation failed after ${this.maxRetries + 1} attempts: ${error.message}`
          };
        }
        // Linear backoff before the next attempt.
        await new Promise(
          (resolve) => setTimeout(resolve, 1e3 * (attempt + 1))
        );
      }
    }
    // Unreachable in practice (the loop always returns), kept as a safeguard.
    return { score: 0, explanation: "Evaluation failed" };
  }
  /**
   * Build the evaluation prompt by substituting the first occurrence of each
   * placeholder ({input}, {output}, {expected}, {reference}, {context}) in the
   * criterion's prompt template. Optional placeholders are only substituted
   * when the corresponding input field is present.
   */
  buildPrompt(criterion, input) {
    let prompt = criterion.prompt.replace("{input}", input.input).replace("{output}", input.output);
    if (input.expectedOutput) {
      prompt = prompt.replace("{expected}", input.expectedOutput);
    }
    if (input.reference) {
      prompt = prompt.replace("{reference}", input.reference);
    }
    if (input.context) {
      prompt = prompt.replace("{context}", input.context.join("\n\n"));
    }
    return prompt;
  }
  /**
   * Parse LLM response to extract score.
   * Tries several "Score:/Rating:/X out of 5" patterns in order, then a bare
   * leading number; normalizes into [0, 1] using the criterion's scoreRange
   * (default 1-5) and clamps. An unparseable response scores 0.5.
   */
  parseResponse(response, criterion) {
    const scorePatterns = [
      /Score:\s*(\d+(?:\.\d+)?)/i,
      /Rating:\s*(\d+(?:\.\d+)?)/i,
      /(\d+(?:\.\d+)?)\s*\/\s*5/,
      /(\d+(?:\.\d+)?)\s*out\s+of\s+5/i
    ];
    let rawScore = null;
    for (const pattern of scorePatterns) {
      const match = response.match(pattern);
      if (match) {
        rawScore = parseFloat(match[1]);
        break;
      }
    }
    if (rawScore === null) {
      // Fallback: a response that simply starts with a number.
      const numMatch = response.match(/^(\d+(?:\.\d+)?)/);
      if (numMatch) {
        rawScore = parseFloat(numMatch[1]);
      }
    }
    const range = criterion.scoreRange ?? { min: 1, max: 5 };
    let normalizedScore = 0.5;
    if (rawScore !== null) {
      normalizedScore = (rawScore - range.min) / (range.max - range.min);
      normalizedScore = Math.max(0, Math.min(1, normalizedScore));
    }
    let explanation = response;
    // Keep the text from the score marker onward, dropping any preamble.
    const scoreIndex = response.search(/Score:|Rating:|\d+\s*\/\s*5/i);
    if (scoreIndex > 0) {
      explanation = response.substring(scoreIndex);
    }
    return { score: normalizedScore, explanation: explanation.trim() };
  }
  /**
   * Calculate confidence based on score consistency: 1 minus the standard
   * deviation across criterion scores, floored at 0.5. A single criterion
   * always yields full confidence.
   */
  calculateConfidence(scores) {
    const values = Object.values(scores);
    if (values.length <= 1) return 1;
    const mean = values.reduce((a, b) => a + b, 0) / values.length;
    const variance = values.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / values.length;
    const std = Math.sqrt(variance);
    return Math.max(0.5, 1 - std);
  }
  /**
   * Get default system prompt used when config.systemPrompt is not supplied.
   */
  getDefaultSystemPrompt() {
    return `You are an expert evaluator for AI-generated responses. Your task is to objectively assess responses based on specific criteria.

Guidelines:
- Be consistent in your scoring
- Provide clear explanations for your ratings
- Focus on the specific criterion being evaluated
- Use the full range of the scoring scale
- Be fair and unbiased`;
  }
  /**
   * Add a new criterion (no duplicate-name check is performed).
   */
  addCriterion(criterion) {
    this.criteria.push(criterion);
  }
  /**
   * Remove a criterion by name.
   * @returns {boolean} True when a criterion was found and removed.
   */
  removeCriterion(name) {
    const index = this.criteria.findIndex((c) => c.name === name);
    if (index >= 0) {
      this.criteria.splice(index, 1);
      return true;
    }
    return false;
  }
  /**
   * Get criteria (shallow copy; the criterion objects themselves are shared).
   */
  getCriteria() {
    return [...this.criteria];
  }
};
1436
/**
 * Factory for an LLM-backed judge.
 * @param {object} config - Forwarded unchanged to the LLMJudge constructor.
 * @returns {LLMJudge}
 */
function createLLMJudge(config) {
  const judge = new LLMJudge(config);
  return judge;
}
1439
+
1440
+ // src/evaluation/judges/RubricJudge.ts
1441
// Judge that scores a response against a fixed rubric: the LLM picks one of
// the rubric's discrete levels and the chosen level is normalized to [0, 1].
var RubricJudge = class {
  type = "rubric";
  provider;
  model;
  rubric;
  temperature;
  /**
   * @param {object} config - Requires `provider` and a `rubric` with at least
   *   one level; optional `model` (defaults to "claude-sonnet-4-20250514")
   *   and `temperature` (default 0).
   * @throws {Error} When provider or a usable rubric is missing.
   */
  constructor(config) {
    if (!config.provider) {
      throw new Error("RubricJudge requires a provider");
    }
    if (!config.rubric) {
      throw new Error("RubricJudge requires a rubric");
    }
    if (!config.rubric.levels || config.rubric.levels.length === 0) {
      throw new Error("Rubric must have at least one level");
    }
    this.provider = config.provider;
    this.model = config.model ?? "claude-sonnet-4-20250514";
    this.rubric = config.rubric;
    this.temperature = config.temperature ?? 0;
  }
  /**
   * Prompt the provider with the rubric and parse the chosen level.
   * A provider failure yields a zero score with the error in the explanation.
   */
  async evaluate(input) {
    const prompt = this.buildPrompt(input);
    try {
      const response = await this.provider.complete({
        model: this.model,
        messages: [
          { role: "system", content: this.getSystemPrompt() },
          { role: "user", content: prompt }
        ],
        temperature: this.temperature
      });
      return this.parseResponse(response.content);
    } catch (error) {
      return {
        scores: { [this.rubric.criteria]: 0 },
        explanations: {
          [this.rubric.criteria]: `Evaluation failed: ${error.message}`
        },
        overallScore: 0
      };
    }
  }
  /**
   * Build the evaluation prompt, listing every rubric level (with examples
   * when provided) and the response to grade.
   */
  buildPrompt(input) {
    const levelsDescription = this.rubric.levels.map((level) => {
      let desc = `Score ${level.score}: ${level.description}`;
      if (level.examples && level.examples.length > 0) {
        desc += `
  Examples: ${level.examples.join("; ")}`;
      }
      return desc;
    }).join("\n");
    return `Evaluate the following response using this rubric.

Criterion: ${this.rubric.criteria}

Scoring Rubric:
${levelsDescription}

Input/Question: ${input.input}

Response to Evaluate:
${input.output}

${input.expectedOutput ? `Expected/Reference Output:
${input.expectedOutput}
` : ""}
${input.context ? `Context:
${input.context.join("\n")}
` : ""}

Based on the rubric, provide:
1. The score (${this.rubric.levels.map((l) => l.score).join(", ")})
2. A brief justification for your choice

Format: "Score: X - [justification]"`;
  }
  /**
   * Get system prompt used for all rubric evaluations.
   */
  getSystemPrompt() {
    return `You are an expert evaluator using a predefined rubric.
Your task is to carefully match the response to the most appropriate rubric level.
Be consistent and fair in your assessment.`;
  }
  /**
   * Parse the "Score: X" response, accepting only scores that correspond to a
   * defined rubric level, and normalize into [0, 1] across the rubric's
   * min/max level scores. Unparseable or unknown scores yield 0.
   */
  parseResponse(response) {
    const scoreMatch = response.match(/Score:\s*(\d+)/i);
    let score = 0;
    if (scoreMatch) {
      const rawScore = parseInt(scoreMatch[1], 10);
      const level = this.rubric.levels.find((l) => l.score === rawScore);
      if (level) {
        const minScore = Math.min(...this.rubric.levels.map((l) => l.score));
        const maxScore = Math.max(...this.rubric.levels.map((l) => l.score));
        // Guard single-level rubrics (the constructor allows them): min equals
        // max there, and the old (raw - min) / (max - min) evaluated 0 / 0 =
        // NaN. A matched level in that case is by definition a full score.
        score = maxScore === minScore ? 1 : (rawScore - minScore) / (maxScore - minScore);
      }
    }
    return {
      scores: { [this.rubric.criteria]: score },
      explanations: { [this.rubric.criteria]: response },
      overallScore: score
    };
  }
  /**
   * Get rubric (shallow copy; `levels` is still the shared array).
   */
  getRubric() {
    return { ...this.rubric };
  }
  /**
   * Update rubric.
   * @throws {Error} When the new rubric has no levels.
   */
  setRubric(rubric) {
    if (!rubric.levels || rubric.levels.length === 0) {
      throw new Error("Rubric must have at least one level");
    }
    this.rubric = rubric;
  }
};
1566
/**
 * Factory for a rubric-based judge.
 * @param {object} config - Forwarded unchanged to the RubricJudge constructor.
 * @returns {RubricJudge}
 */
function createRubricJudge(config) {
  const judge = new RubricJudge(config);
  return judge;
}
1569
// Prebuilt 1-5 rubric for overall response quality. Level `score` values are
// matched verbatim by RubricJudge.parseResponse and then normalized to [0, 1]
// across the rubric's min/max scores.
var QualityRubric = {
  criteria: "response_quality",
  levels: [
    {
      score: 1,
      description: "Poor quality - Incorrect, irrelevant, or harmful response",
      examples: ["Wrong answer", "Off-topic response", "Gibberish"]
    },
    {
      score: 2,
      description: "Below average - Partially addresses question but significant issues",
      examples: ["Missing key information", "Contains errors", "Confusing"]
    },
    {
      score: 3,
      description: "Average - Addresses question adequately but room for improvement",
      examples: ["Correct but lacks depth", "Could be clearer"]
    },
    {
      score: 4,
      description: "Good - Well-written, accurate, and helpful response",
      examples: ["Clear explanation", "Addresses all parts of question"]
    },
    {
      score: 5,
      description: "Excellent - Outstanding response that exceeds expectations",
      examples: ["Comprehensive", "Insightful", "Well-structured"]
    }
  ]
};
1599
// Prebuilt 1-5 rubric for grading generated code (no per-level examples).
var CodeQualityRubric = {
  criteria: "code_quality",
  levels: [
    {
      score: 1,
      description: "Critical issues - Code has bugs, security issues, or does not compile"
    },
    {
      score: 2,
      description: "Significant issues - Code works but has major problems"
    },
    {
      score: 3,
      description: "Functional - Code works with minor issues or style problems"
    },
    {
      score: 4,
      description: "Good - Clean, efficient code with minor improvements possible"
    },
    {
      score: 5,
      description: "Excellent - Production-ready, well-documented, follows best practices"
    }
  ]
};
1624
// Prebuilt 1-5 rubric for grading how helpful a response is to the user.
var HelpfulnessRubric = {
  criteria: "helpfulness",
  levels: [
    {
      score: 1,
      description: "Not helpful at all - Response does not address user needs"
    },
    {
      score: 2,
      description: "Slightly helpful - Provides minimal useful information"
    },
    { score: 3, description: "Moderately helpful - Addresses some user needs" },
    {
      score: 4,
      description: "Very helpful - Addresses most user needs effectively"
    },
    {
      score: 5,
      description: "Extremely helpful - Exceeds expectations in addressing needs"
    }
  ]
};
1646
+
1647
+ // src/evaluation/judges/ComparativeJudge.ts
1648
// Judge that compares two responses (A vs. B) on a list of criteria via an
// LLM and parses a per-criterion and overall winner out of the reply.
var ComparativeJudge = class {
  type = "comparative";
  provider;
  model;
  criteria;
  tieBreaker;
  temperature;
  /**
   * @param {object} config - Requires `provider` and a non-empty `criteria`
   *   array of strings; optional `model` (default "claude-sonnet-4-20250514"),
   *   `tieBreaker` criterion name, and `temperature` (default 0).
   * @throws {Error} When provider or criteria are missing.
   */
  constructor(config) {
    if (!config.provider) {
      throw new Error("ComparativeJudge requires a provider");
    }
    if (!config.criteria || config.criteria.length === 0) {
      throw new Error("ComparativeJudge requires at least one criterion");
    }
    this.provider = config.provider;
    this.model = config.model ?? "claude-sonnet-4-20250514";
    this.criteria = config.criteria;
    this.tieBreaker = config.tieBreaker;
    this.temperature = config.temperature ?? 0;
  }
  /**
   * Evaluate using standard input (compares output to expected).
   * Scores 1 when the output beats the expected answer, 0 when it loses,
   * and 0.5 on a tie; without an expectedOutput the evaluation scores 0.
   */
  async evaluate(input) {
    if (!input.expectedOutput) {
      return {
        scores: {},
        explanations: { error: "No expected output to compare against" },
        overallScore: 0
      };
    }
    const comparison = await this.compare({
      input: input.input,
      responseA: input.output,
      responseB: input.expectedOutput,
      context: input.context
    });
    const score = comparison.winner === "A" ? 1 : comparison.winner === "B" ? 0 : 0.5;
    return {
      scores: { comparison: score },
      explanations: { comparison: comparison.reasoning },
      overallScore: score,
      metadata: {
        winner: comparison.winner,
        criteriaScores: comparison.criteriaScores,
        confidence: comparison.confidence
      }
    };
  }
  /**
   * Compare two responses.
   * A provider failure is reported as a zero-confidence tie.
   */
  async compare(input) {
    const prompt = this.buildComparisonPrompt(input);
    try {
      const response = await this.provider.complete({
        model: this.model,
        messages: [
          { role: "system", content: this.getSystemPrompt() },
          { role: "user", content: prompt }
        ],
        temperature: this.temperature
      });
      return this.parseComparisonResponse(response.content);
    } catch (error) {
      return {
        winner: "tie",
        reasoning: `Comparison failed: ${error.message}`,
        confidence: 0
      };
    }
  }
  /**
   * Build comparison prompt listing both responses, the criteria, the optional
   * tie-breaker, and the expected output format.
   */
  buildComparisonPrompt(input) {
    const criteriaList = this.criteria.map((c, i) => `${i + 1}. ${c}`).join("\n");
    return `Compare these two responses and determine which is better.

Question/Input: ${input.input}

${input.context ? `Context:
${input.context.join("\n")}

` : ""}

Response A:
${input.responseA}

Response B:
${input.responseB}

Evaluate both responses on the following criteria:
${criteriaList}

For each criterion, indicate which response is better (A, B, or tie).
Then provide an overall winner.

${this.tieBreaker ? `In case of an overall tie, use "${this.tieBreaker}" as the tie-breaker criterion.` : ""}

Format your response as:
Criterion 1: [A/B/tie] - [brief reason]
Criterion 2: [A/B/tie] - [brief reason]
...
Overall Winner: [A/B/tie]
Reasoning: [explanation]
Confidence: [high/medium/low]`;
  }
  /**
   * Get system prompt used for all comparisons.
   */
  getSystemPrompt() {
    return `You are an expert at comparing AI-generated responses.
Be objective and fair in your comparisons.
Consider all provided criteria carefully.
Provide clear reasoning for your choices.`;
  }
  /**
   * Parse comparison response: per-criterion winners, the overall winner,
   * a confidence level (high/medium/low -> 0.9/0.7/0.5), and the reasoning.
   */
  parseComparisonResponse(response) {
    const criteriaScores = {};
    for (const criterion of this.criteria) {
      // Escape the criterion before embedding it in a RegExp: names containing
      // metacharacters (e.g. "cost ($)") previously produced a broken or
      // never-matching pattern and their per-criterion result was lost.
      const escaped = criterion.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      const pattern = new RegExp(`${escaped}[^:]*:\\s*(A|B|tie)`, "i");
      const match = response.match(pattern);
      if (match) {
        const winner2 = match[1].toUpperCase();
        criteriaScores[criterion] = {
          A: winner2 === "A" ? 1 : winner2 === "TIE" ? 0.5 : 0,
          B: winner2 === "B" ? 1 : winner2 === "TIE" ? 0.5 : 0
        };
      }
    }
    const winnerMatch = response.match(/Overall\s*Winner:\s*(A|B|tie)/i);
    let winner = "tie";
    if (winnerMatch) {
      const w = winnerMatch[1].toUpperCase();
      winner = w === "A" ? "A" : w === "B" ? "B" : "tie";
    }
    const confMatch = response.match(/Confidence:\s*(high|medium|low)/i);
    let confidence = 0.5;
    if (confMatch) {
      const conf = confMatch[1].toLowerCase();
      confidence = conf === "high" ? 0.9 : conf === "medium" ? 0.7 : 0.5;
    }
    const reasoningMatch = response.match(
      /Reasoning:\s*(.+?)(?=Confidence:|$)/is
    );
    const reasoning = reasoningMatch ? reasoningMatch[1].trim() : response;
    return {
      winner,
      reasoning,
      criteriaScores: Object.keys(criteriaScores).length > 0 ? criteriaScores : void 0,
      confidence
    };
  }
  /**
   * Get criteria (shallow copy).
   */
  getCriteria() {
    return [...this.criteria];
  }
  /**
   * Set criteria.
   * @throws {Error} When the new list is empty.
   */
  setCriteria(criteria) {
    if (criteria.length === 0) {
      throw new Error("At least one criterion is required");
    }
    this.criteria = criteria;
  }
};
1820
/**
 * Factory for a pairwise comparative judge.
 * @param {object} config - Forwarded unchanged to the ComparativeJudge constructor.
 * @returns {ComparativeJudge}
 */
function createComparativeJudge(config) {
  const judge = new ComparativeJudge(config);
  return judge;
}
1823
+
1824
+ // src/evaluation/judges/ConsensusJudge.ts
1825
// Meta-judge that runs several judges in parallel and aggregates their results
// by majority vote, plain average, or weighted average.
var ConsensusJudge = class {
  type = "consensus";
  judges;
  aggregation;
  weights;
  minAgreement;
  /**
   * @param {object} config - Requires a non-empty `judges` array; optional
   *   `aggregation` ("majority" | "average" | "weighted", default average),
   *   `weights` (must match judges length when given), and `minAgreement`
   *   (default 0.5) used by majority aggregation.
   * @throws {Error} When judges are missing or weights are mismatched.
   */
  constructor(config) {
    if (!config.judges || config.judges.length === 0) {
      throw new Error("ConsensusJudge requires at least one judge");
    }
    this.judges = config.judges;
    this.aggregation = config.aggregation;
    this.weights = config.weights;
    this.minAgreement = config.minAgreement ?? 0.5;
    if (this.weights && this.weights.length !== this.judges.length) {
      throw new Error("Weights array must match number of judges");
    }
  }
  /**
   * Run all judges concurrently and aggregate per the configured strategy.
   */
  async evaluate(input) {
    const results = await Promise.all(
      this.judges.map((judge) => judge.evaluate(input))
    );
    switch (this.aggregation) {
      case "majority":
        return this.aggregateMajority(results);
      case "average":
        return this.aggregateAverage(results);
      case "weighted":
        return this.aggregateWeighted(results);
      default:
        return this.aggregateAverage(results);
    }
  }
  /**
   * Aggregate using majority voting.
   * Scores are rounded to the nearest 0.5 before voting; the modal value wins
   * and agreement is the fraction of judges voting for it. Confidence is only
   * reported when overall agreement reaches minAgreement.
   */
  aggregateMajority(results) {
    const allScores = {};
    const allExplanations = {};
    for (const result of results) {
      for (const [metric, score] of Object.entries(result.scores)) {
        if (!allScores[metric]) {
          allScores[metric] = [];
          allExplanations[metric] = [];
        }
        allScores[metric].push(score);
      }
      for (const [metric, explanation] of Object.entries(result.explanations)) {
        if (!allExplanations[metric]) {
          allExplanations[metric] = [];
        }
        allExplanations[metric].push(explanation);
      }
    }
    const consensusScores = {};
    const consensusExplanations = {};
    const agreementScores = {};
    for (const [metric, scores] of Object.entries(allScores)) {
      const rounded = scores.map((s) => Math.round(s * 2) / 2);
      const counts = /* @__PURE__ */ new Map();
      for (const s of rounded) {
        counts.set(s, (counts.get(s) ?? 0) + 1);
      }
      let maxCount = 0;
      let consensusScore = 0;
      for (const [score, count] of counts) {
        if (count > maxCount) {
          maxCount = count;
          consensusScore = score;
        }
      }
      consensusScores[metric] = consensusScore;
      agreementScores[metric] = maxCount / scores.length;
      // Keep only explanations from judges that voted with the majority.
      const agreeingExplanations = scores.map((s, i) => ({
        score: s,
        explanation: allExplanations[metric]?.[i]
      })).filter((item) => Math.round(item.score * 2) / 2 === consensusScore).map((item) => item.explanation).filter(Boolean);
      consensusExplanations[metric] = agreeingExplanations.join(" | ") || "No consensus explanation";
    }
    const overallScores = results.map((r) => r.overallScore ?? 0);
    const roundedOverall = overallScores.map((s) => Math.round(s * 2) / 2);
    const overallCounts = /* @__PURE__ */ new Map();
    for (const s of roundedOverall) {
      overallCounts.set(s, (overallCounts.get(s) ?? 0) + 1);
    }
    let overallConsensus = 0;
    let maxOverallCount = 0;
    for (const [score, count] of overallCounts) {
      if (count > maxOverallCount) {
        maxOverallCount = count;
        overallConsensus = score;
      }
    }
    const agreement = maxOverallCount / results.length;
    return {
      scores: consensusScores,
      explanations: consensusExplanations,
      overallScore: overallConsensus,
      confidence: agreement >= this.minAgreement ? agreement : void 0,
      metadata: {
        aggregation: "majority",
        agreement,
        judgeCount: results.length,
        agreementScores,
        meetsMinAgreement: agreement >= this.minAgreement
      }
    };
  }
  /**
   * Aggregate using simple average.
   * Confidence is 1 minus the standard deviation of the overall scores,
   * floored at 0.5.
   */
  aggregateAverage(results) {
    const allScores = {};
    const allExplanations = {};
    for (const result of results) {
      for (const [metric, score] of Object.entries(result.scores)) {
        if (!allScores[metric]) {
          allScores[metric] = [];
          allExplanations[metric] = [];
        }
        allScores[metric].push(score);
      }
      for (const [metric, explanation] of Object.entries(result.explanations)) {
        if (!allExplanations[metric]) {
          allExplanations[metric] = [];
        }
        allExplanations[metric].push(explanation);
      }
    }
    const avgScores = {};
    const stdScores = {};
    for (const [metric, scores] of Object.entries(allScores)) {
      const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
      avgScores[metric] = mean;
      const variance = scores.reduce((sum, s) => sum + Math.pow(s - mean, 2), 0) / scores.length;
      stdScores[metric] = Math.sqrt(variance);
    }
    const combinedExplanations = {};
    for (const [metric, explanations] of Object.entries(allExplanations)) {
      combinedExplanations[metric] = explanations.join(" | ");
    }
    const overallScores = results.map((r) => r.overallScore ?? 0);
    const overallMean = overallScores.reduce((a, b) => a + b, 0) / overallScores.length;
    const overallVariance = overallScores.reduce((sum, s) => sum + Math.pow(s - overallMean, 2), 0) / overallScores.length;
    const overallStd = Math.sqrt(overallVariance);
    const confidence = Math.max(0.5, 1 - overallStd);
    return {
      scores: avgScores,
      explanations: combinedExplanations,
      overallScore: overallMean,
      confidence,
      metadata: {
        aggregation: "average",
        judgeCount: results.length,
        standardDeviations: stdScores,
        overallStd
      }
    };
  }
  /**
   * Aggregate using weighted average (all-ones weights when none configured).
   */
  aggregateWeighted(results) {
    const weights = this.weights ?? results.map(() => 1);
    const totalWeight = weights.reduce((a, b) => a + b, 0);
    const weightedScores = {};
    const allExplanations = {};
    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      const weight = weights[i];
      for (const [metric, score] of Object.entries(result.scores)) {
        if (!weightedScores[metric]) {
          weightedScores[metric] = 0;
          allExplanations[metric] = [];
        }
        weightedScores[metric] += score * weight;
      }
      for (const [metric, explanation] of Object.entries(result.explanations)) {
        if (!allExplanations[metric]) {
          allExplanations[metric] = [];
        }
        allExplanations[metric].push(explanation);
      }
    }
    for (const metric of Object.keys(weightedScores)) {
      weightedScores[metric] /= totalWeight;
    }
    const combinedExplanations = {};
    for (const [metric, explanations] of Object.entries(allExplanations)) {
      combinedExplanations[metric] = explanations.join(" | ");
    }
    const weightedOverall = results.reduce(
      (sum, r, i) => sum + (r.overallScore ?? 0) * weights[i],
      0
    ) / totalWeight;
    return {
      scores: weightedScores,
      explanations: combinedExplanations,
      overallScore: weightedOverall,
      metadata: {
        aggregation: "weighted",
        judgeCount: results.length,
        weights
      }
    };
  }
  /**
   * Add a judge, keeping the weights array (when in use) aligned with judges.
   * Previously the weight was silently dropped when no weights array existed,
   * and omitting the weight while weights existed desynced the two arrays,
   * producing NaN scores in weighted aggregation.
   */
  addJudge(judge, weight) {
    this.judges.push(judge);
    if (this.weights) {
      // Default a missing weight to 1 so weights always match judges.
      this.weights.push(weight ?? 1);
    } else if (weight !== void 0) {
      // Lazily create the weights array, backfilling existing judges with 1.
      this.weights = this.judges.slice(0, -1).map(() => 1);
      this.weights.push(weight);
    }
  }
  /**
   * Remove a judge by index (and its weight, when weights are in use).
   * @returns {boolean} True when the index was valid and a judge was removed.
   */
  removeJudge(index) {
    if (index >= 0 && index < this.judges.length) {
      this.judges.splice(index, 1);
      if (this.weights) {
        this.weights.splice(index, 1);
      }
      return true;
    }
    return false;
  }
  /**
   * Get judge count.
   */
  getJudgeCount() {
    return this.judges.length;
  }
};
2060
/**
 * Factory helper for building a ConsensusJudge.
 *
 * @param config - consensus judge configuration (judges, strategy, weights)
 * @returns a freshly constructed ConsensusJudge
 */
function createConsensusJudge(config) {
  const judge = new ConsensusJudge(config);
  return judge;
}
2063
+
2064
+ // src/evaluation/EvalDataset.ts
2065
+ import { nanoid } from "nanoid";
2066
var EvalDataset = class _EvalDataset {
  name;
  items;
  metadata;
  /**
   * Build a dataset from a config object.
   *
   * @param config - { name?, items, metadata? }; each item missing an `id`
   *   is assigned a fresh nanoid so items are always addressable.
   */
  constructor(config) {
    this.name = config.name ?? "eval-dataset";
    this.items = config.items.map((item) => ({
      ...item,
      id: item.id ?? nanoid()
    }));
    this.metadata = config.metadata;
  }
  /**
   * Number of items in the dataset.
   */
  get size() {
    return this.items.length;
  }
  /**
   * Get a defensive (shallow) copy of all items.
   */
  getItems() {
    return [...this.items];
  }
  /**
   * Look up a single item by its id.
   *
   * @returns the item, or undefined when not found
   */
  getItem(id) {
    return this.items.find((item) => item.id === id);
  }
  /**
   * Create a new dataset containing only the items matching the predicate.
   */
  filter(predicate) {
    return new _EvalDataset({
      name: this.name,
      items: this.items.filter(predicate),
      metadata: this.metadata
    });
  }
  /**
   * Sample `count` random items (Fisher-Yates shuffle, then slice).
   *
   * @param count - number of items to draw; count >= size returns a full copy
   * @param seed - optional seed for reproducible sampling
   */
  sample(count, seed) {
    if (count >= this.items.length) {
      return new _EvalDataset({
        name: this.name,
        items: [...this.items],
        metadata: this.metadata
      });
    }
    const shuffled = [...this.items];
    const rng = seed !== void 0 ? this.seededRandom(seed) : Math.random;
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(rng() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    return new _EvalDataset({
      name: this.name,
      items: shuffled.slice(0, count),
      metadata: this.metadata
    });
  }
  /**
   * Shuffle (unseeded) and split into [train, test] datasets.
   *
   * @param ratio - fraction of items that go to the train split (0..1)
   */
  split(ratio) {
    const splitIndex = Math.floor(this.items.length * ratio);
    const shuffled = [...this.items];
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    return [
      new _EvalDataset({
        name: `${this.name}-train`,
        items: shuffled.slice(0, splitIndex),
        metadata: this.metadata
      }),
      new _EvalDataset({
        name: `${this.name}-test`,
        items: shuffled.slice(splitIndex),
        metadata: this.metadata
      })
    ];
  }
  /**
   * Keep items carrying the given tags. Untagged items never match.
   *
   * @param mode - "any" (default): at least one tag matches; "all": every tag
   */
  filterByTags(tags, mode = "any") {
    return this.filter((item) => {
      if (!item.tags) return false;
      if (mode === "any") {
        return tags.some((tag) => item.tags.includes(tag));
      }
      return tags.every((tag) => item.tags.includes(tag));
    });
  }
  /**
   * Collect the set of unique tags used across all items.
   */
  getTags() {
    const tags = new Set();
    for (const item of this.items) {
      if (item.tags) {
        for (const tag of item.tags) {
          tags.add(tag);
        }
      }
    }
    return Array.from(tags);
  }
  /**
   * Append items in place, assigning ids where missing.
   */
  addItems(items) {
    for (const item of items) {
      this.items.push({
        ...item,
        id: item.id ?? nanoid()
      });
    }
  }
  /**
   * Remove an item by id.
   *
   * @returns true when an item was removed, false when the id was not found
   */
  removeItem(id) {
    const index = this.items.findIndex((item) => item.id === id);
    if (index >= 0) {
      this.items.splice(index, 1);
      return true;
    }
    return false;
  }
  /**
   * Deterministic pseudo-random generator (simple LCG) for seeded sampling.
   * Not suitable for anything security-sensitive.
   */
  seededRandom(seed) {
    return () => {
      seed = (seed * 9301 + 49297) % 233280;
      return seed / 233280;
    };
  }
  /**
   * Export the whole dataset (name, items, metadata) as pretty-printed JSON.
   */
  toJSON() {
    return JSON.stringify(
      {
        name: this.name,
        items: this.items,
        metadata: this.metadata
      },
      null,
      2
    );
  }
  /**
   * Export items as JSON Lines (one JSON object per line).
   */
  toJSONL() {
    return this.items.map((item) => JSON.stringify(item)).join("\n");
  }
  /**
   * Create a dataset from an array of plain objects.
   * An `id` present on an item takes precedence over the generated `item-N`.
   */
  static fromJSON(data, name) {
    return new _EvalDataset({
      name: name ?? "json-dataset",
      items: data.map((item, index) => ({
        id: `item-${index}`,
        ...item
      }))
    });
  }
  /**
   * Create a dataset from a JSON Lines string. Blank lines are ignored;
   * a malformed line throws (JSON.parse).
   */
  static fromJSONL(jsonl, name) {
    const lines = jsonl.trim().split("\n").filter((line) => line.trim());
    const items = lines.map((line, index) => {
      const parsed = JSON.parse(line);
      return {
        id: parsed.id ?? `item-${index}`,
        ...parsed
      };
    });
    return new _EvalDataset({
      name: name ?? "jsonl-dataset",
      items
    });
  }
  /**
   * Create from a HuggingFace dataset (stub — would need actual HF
   * integration; always resolves to an empty dataset).
   */
  static fromHuggingFace(datasetName, config) {
    console.warn(
      "HuggingFace integration not implemented. Please install @huggingface/hub and implement the loader."
    );
    return Promise.resolve(
      new _EvalDataset({
        name: datasetName,
        items: [],
        metadata: {
          source: "huggingface",
          datasetName,
          config
        }
      })
    );
  }
  /**
   * Create a dataset from a CSV string.
   *
   * Column mapping: explicit options win; otherwise the first header
   * containing "input" (else the first column) is the input, a header
   * containing "output"/"expected" is the expected output, and a header
   * containing "context" is the context.
   *
   * Bug fix: parsing now honors double-quoted fields — a quoted field may
   * contain the delimiter, and `""` inside quotes is un-escaped to `"`.
   * The previous naive split(delimiter) broke such rows.
   *
   * Note: assumes one record per line (no embedded newlines inside quotes).
   */
  static fromCSV(csv, options) {
    const delimiter = options?.delimiter ?? ",";
    // Split one CSV line into fields, honoring quotes and "" escapes.
    const parseLine = (line) => {
      const fields = [];
      let current = "";
      let inQuotes = false;
      for (let i = 0; i < line.length; i++) {
        const ch = line[i];
        if (inQuotes) {
          if (ch === '"') {
            if (line[i + 1] === '"') {
              current += '"';
              i++;
            } else {
              inQuotes = false;
            }
          } else {
            current += ch;
          }
        } else if (ch === '"') {
          inQuotes = true;
        } else if (ch === delimiter) {
          fields.push(current.trim());
          current = "";
        } else {
          current += ch;
        }
      }
      fields.push(current.trim());
      return fields;
    };
    const lines = csv.trim().split("\n");
    if (lines.length < 2) {
      return new _EvalDataset({ name: "csv-dataset", items: [] });
    }
    const headers = parseLine(lines[0]);
    const inputCol = options?.inputColumn ?? headers.find((h) => h.toLowerCase().includes("input")) ?? headers[0];
    const outputCol = options?.outputColumn ?? headers.find(
      (h) => h.toLowerCase().includes("output") || h.toLowerCase().includes("expected")
    );
    const contextCol = options?.contextColumn ?? headers.find((h) => h.toLowerCase().includes("context"));
    const inputIdx = headers.indexOf(inputCol);
    const outputIdx = outputCol ? headers.indexOf(outputCol) : -1;
    const contextIdx = contextCol ? headers.indexOf(contextCol) : -1;
    const items = [];
    for (let i = 1; i < lines.length; i++) {
      const values = parseLine(lines[i]);
      if (inputIdx >= 0 && values[inputIdx]) {
        items.push({
          id: `csv-${i}`,
          input: values[inputIdx],
          expectedOutput: outputIdx >= 0 ? values[outputIdx] : void 0,
          context: contextIdx >= 0 && values[contextIdx] ? [values[contextIdx]] : void 0
        });
      }
    }
    return new _EvalDataset({
      name: "csv-dataset",
      items
    });
  }
};
2313
/**
 * Factory helper for building an EvalDataset.
 *
 * @param config - dataset configuration (name, items, metadata)
 * @returns a freshly constructed EvalDataset
 */
function createEvalDataset(config) {
  const dataset = new EvalDataset(config);
  return dataset;
}
2316
+
2317
+ // src/evaluation/EvalRunner.ts
2318
var EvalRunner = class {
  parallelism;
  timeout;
  retries;
  onItemComplete;
  onError;
  /**
   * @param config - { parallelism? (default 5), timeout? ms (default 30000),
   *   retries? (default 1), onItemComplete?, onError? } callbacks fire per item.
   */
  constructor(config = {}) {
    this.parallelism = config.parallelism ?? 5;
    this.timeout = config.timeout ?? 3e4;
    this.retries = config.retries ?? 1;
    this.onItemComplete = config.onItemComplete;
    this.onError = config.onError;
  }
  /**
   * Run evaluation on a dataset, processing items in batches of
   * `parallelism` and collecting every result.
   *
   * @param dataset - exposes getItems()
   * @param generateFn - async (input, context) => output string
   * @param metrics - metric objects with name + evaluate()
   * @param judge - optional LLM judge applied after the metrics
   */
  async run(dataset, generateFn, metrics, judge) {
    const items = dataset.getItems();
    const results = [];
    for (let i = 0; i < items.length; i += this.parallelism) {
      const batch = items.slice(i, i + this.parallelism);
      const batchResults = await Promise.all(
        batch.map(
          (item) => this.evaluateItem(item, generateFn, metrics, judge)
        )
      );
      results.push(...batchResults);
    }
    return results;
  }
  /**
   * Same as run(), but yields each result as it becomes available
   * (batched by `parallelism`).
   */
  async *runStream(dataset, generateFn, metrics, judge) {
    const items = dataset.getItems();
    for (let i = 0; i < items.length; i += this.parallelism) {
      const batch = items.slice(i, i + this.parallelism);
      const batchResults = await Promise.all(
        batch.map(
          (item) => this.evaluateItem(item, generateFn, metrics, judge)
        )
      );
      for (const result of batchResults) {
        yield result;
      }
    }
  }
  /**
   * Evaluate a single dataset item: generate the output (with retries and a
   * timeout), score it with every metric, then optionally run the judge.
   * Metric/judge errors are reported via onError and scored 0 rather than
   * aborting; generation failure after all retries yields a failed result.
   */
  async evaluateItem(item, generateFn, metrics, judge) {
    const startTime = performance.now();
    let output = "";
    let generationError = null;
    for (let attempt = 0; attempt <= this.retries; attempt++) {
      try {
        output = await this.withTimeout(
          generateFn(item.input, item.context),
          this.timeout
        );
        break;
      } catch (error) {
        generationError = error;
        if (attempt === this.retries) {
          const evalError = {
            itemId: item.id,
            input: item.input,
            error: generationError,
            phase: "generation"
          };
          this.onError?.(evalError);
          // Generation failed for good: return an empty, failed result.
          return {
            itemId: item.id,
            input: item.input,
            output: "",
            expectedOutput: item.expectedOutput,
            context: item.context,
            scores: {},
            passed: false,
            durationMs: performance.now() - startTime
          };
        }
      }
    }
    const evalInput = {
      input: item.input,
      output,
      expectedOutput: item.expectedOutput,
      context: item.context,
      reference: item.reference,
      metadata: item.metadata
    };
    const scores = {};
    const explanations = {};
    for (const metric of metrics) {
      try {
        const result2 = await this.withTimeout(
          metric.evaluate(evalInput),
          this.timeout
        );
        scores[metric.name] = result2.score;
        if (result2.explanation) {
          explanations[metric.name] = result2.explanation;
        }
      } catch (error) {
        const evalError = {
          itemId: item.id,
          input: item.input,
          error,
          phase: "evaluation"
        };
        this.onError?.(evalError);
        // A failing metric scores 0 instead of aborting the item.
        scores[metric.name] = 0;
        explanations[metric.name] = `Error: ${error.message}`;
      }
    }
    let judgeResult;
    if (judge) {
      try {
        judgeResult = await this.withTimeout(
          judge.evaluate(evalInput),
          this.timeout
        );
        // Judge scores are namespaced so they cannot shadow metric scores.
        for (const [key, value] of Object.entries(judgeResult.scores)) {
          scores[`judge_${key}`] = value;
        }
      } catch (error) {
        const evalError = {
          itemId: item.id,
          input: item.input,
          error,
          phase: "evaluation"
        };
        this.onError?.(evalError);
      }
    }
    // An item passes only when every score (metrics + judge) reaches 0.5.
    const passed = Object.values(scores).every((score) => score >= 0.5);
    const result = {
      itemId: item.id,
      input: item.input,
      output,
      expectedOutput: item.expectedOutput,
      context: item.context,
      scores,
      explanations: Object.keys(explanations).length > 0 ? explanations : void 0,
      judgeResult,
      passed,
      durationMs: performance.now() - startTime
    };
    this.onItemComplete?.(result);
    return result;
  }
  /**
   * Race a promise against a timeout.
   *
   * Bug fix: the timer is now cleared once the race settles; previously
   * every call leaked a live setTimeout for the full `timeoutMs`, keeping
   * the Node event loop alive and firing stray late rejections.
   *
   * @throws Error("Evaluation timeout") when the promise is too slow
   */
  async withTimeout(promise, timeoutMs) {
    let timer;
    try {
      return await Promise.race([
        promise,
        new Promise((_, reject) => {
          timer = setTimeout(
            () => reject(new Error("Evaluation timeout")),
            timeoutMs
          );
        })
      ]);
    } finally {
      clearTimeout(timer);
    }
  }
};
2482
/**
 * Factory helper for building an EvalRunner.
 *
 * @param config - optional runner configuration
 * @returns a freshly constructed EvalRunner
 */
function createEvalRunner(config) {
  const runner = new EvalRunner(config);
  return runner;
}
2485
+
2486
+ // src/evaluation/EvaluationPipeline.ts
2487
var EvaluationPipeline = class {
  metrics;
  llmJudge;
  runner;
  /**
   * @param config - { metrics, llmJudge?, parallelism? (5), timeout? ms
   *   (30000), retries? (1) }
   */
  constructor(config) {
    this.metrics = config.metrics;
    this.llmJudge = config.llmJudge;
    this.runner = new EvalRunner({
      parallelism: config.parallelism ?? 5,
      timeout: config.timeout ?? 3e4,
      retries: config.retries ?? 1
    });
  }
  /**
   * Run the evaluation pipeline over a dataset and return the aggregated
   * result object (results, per-metric summary, failures, exporters).
   * Progress/error callbacks fire per item; stopOnError rethrows the first
   * item error.
   */
  async evaluate(options) {
    const startTime = performance.now();
    const results = [];
    const total = options.dataset.size;
    let completed = 0;
    // Bug fix: the per-run runner previously dropped the configured
    // parallelism/timeout/retries and silently fell back to the defaults;
    // the stored runner's settings are now propagated.
    const runner = new EvalRunner({
      parallelism: this.runner.parallelism,
      timeout: this.runner.timeout,
      retries: this.runner.retries,
      onItemComplete: (result) => {
        results.push(result);
        completed++;
        if (options.onProgress) {
          const elapsed = performance.now() - startTime;
          const avgTime = elapsed / completed;
          const remaining = (total - completed) * avgTime;
          options.onProgress({
            completed,
            total,
            currentItem: result.itemId,
            elapsedMs: elapsed,
            estimatedRemainingMs: remaining
          });
        }
      },
      onError: (error) => {
        if (options.onError) {
          options.onError(error);
        }
        if (options.stopOnError) {
          throw error.error;
        }
      }
    });
    await runner.run(
      options.dataset,
      options.generateFn,
      this.metrics,
      this.llmJudge
    );
    const totalDurationMs = performance.now() - startTime;
    const metricsSummary = this.calculateMetricsSummary(results);
    const failures = this.analyzeFailures(results);
    const summary = this.createSummary(results, totalDurationMs);
    return this.createResult(results, metricsSummary, failures, summary);
  }
  /**
   * Streaming variant of evaluate(): yields each item result as it arrives,
   * then returns the aggregated result object.
   */
  async *evaluateStream(options) {
    const startTime = performance.now();
    const results = [];
    const total = options.dataset.size;
    for await (const result of this.runner.runStream(
      options.dataset,
      options.generateFn,
      this.metrics,
      this.llmJudge
    )) {
      results.push(result);
      if (options.onProgress) {
        const elapsed = performance.now() - startTime;
        const avgTime = elapsed / results.length;
        const remaining = (total - results.length) * avgTime;
        options.onProgress({
          completed: results.length,
          total,
          currentItem: result.itemId,
          elapsedMs: elapsed,
          estimatedRemainingMs: remaining
        });
      }
      yield result;
    }
    const totalDurationMs = performance.now() - startTime;
    const metricsSummary = this.calculateMetricsSummary(results);
    const failures = this.analyzeFailures(results);
    const summary = this.createSummary(results, totalDurationMs);
    return this.createResult(results, metricsSummary, failures, summary);
  }
  /**
   * Compute per-metric statistics (mean, std, min/max, median, p90, p95,
   * pass rate at the 0.5 threshold) across all results.
   */
  calculateMetricsSummary(results) {
    const summary = {};
    if (results.length === 0) return summary;
    const metricNames = new Set();
    for (const result of results) {
      for (const name of Object.keys(result.scores)) {
        metricNames.add(name);
      }
    }
    for (const name of metricNames) {
      const scores = results.map((r) => r.scores[name]).filter((s) => s !== void 0);
      if (scores.length === 0) continue;
      const sorted = [...scores].sort((a, b) => a - b);
      const sum = scores.reduce((a, b) => a + b, 0);
      const mean = sum / scores.length;
      const variance = scores.reduce((s, v) => s + Math.pow(v - mean, 2), 0) / scores.length;
      const std = Math.sqrt(variance);
      const passCount = scores.filter((s) => s >= 0.5).length;
      summary[name] = {
        mean,
        std,
        min: sorted[0],
        max: sorted[sorted.length - 1],
        median: sorted[Math.floor(sorted.length / 2)],
        p90: sorted[Math.floor(sorted.length * 0.9)],
        p95: sorted[Math.floor(sorted.length * 0.95)],
        passRate: passCount / scores.length
      };
    }
    return summary;
  }
  /**
   * Build failure records for every non-passing result: which metrics fell
   * below 0.5 and their joined explanations (if any).
   */
  analyzeFailures(results) {
    return results.filter((r) => !r.passed).map((r) => {
      const failedMetrics = Object.entries(r.scores).filter(([, score]) => score < 0.5).map(([name]) => name);
      const explanations = failedMetrics.map((m) => r.explanations?.[m]).filter(Boolean).join("; ");
      return {
        itemId: r.itemId,
        input: r.input,
        output: r.output,
        expectedOutput: r.expectedOutput,
        scores: r.scores,
        failedMetrics,
        explanation: explanations || void 0
      };
    });
  }
  /**
   * Build the top-level run summary (counts, pass rate, average score,
   * timing); all ratios guard against an empty result set.
   */
  createSummary(results, totalDurationMs) {
    const passedItems = results.filter((r) => r.passed).length;
    const allScores = results.flatMap((r) => Object.values(r.scores));
    const avgScore = allScores.length > 0 ? allScores.reduce((a, b) => a + b, 0) / allScores.length : 0;
    return {
      totalItems: results.length,
      passedItems,
      failedItems: results.length - passedItems,
      passRate: results.length > 0 ? passedItems / results.length : 0,
      avgScore,
      totalDurationMs,
      avgDurationMs: results.length > 0 ? totalDurationMs / results.length : 0,
      timestamp: Date.now()
    };
  }
  /**
   * Assemble the result object returned by evaluate()/evaluateStream(),
   * including JSON/CSV exporters and a failure-query helper (all closures
   * over the collected data).
   */
  createResult(results, metrics, failures, summary) {
    return {
      results,
      metrics,
      failures,
      summary,
      exportJSON() {
        return JSON.stringify(
          {
            results,
            metrics,
            failures,
            summary
          },
          null,
          2
        );
      },
      exportCSV() {
        if (results.length === 0) return "";
        const scoreColumns = new Set();
        for (const r of results) {
          for (const name of Object.keys(r.scores)) {
            scoreColumns.add(name);
          }
        }
        const headers = [
          "itemId",
          "input",
          "output",
          "passed",
          ...scoreColumns
        ];
        const rows = results.map((r) => {
          // Quote free-text columns and double embedded quotes (CSV escaping).
          const values = [
            r.itemId,
            `"${r.input.replace(/"/g, '""')}"`,
            `"${r.output.replace(/"/g, '""')}"`,
            r.passed.toString(),
            ...Array.from(scoreColumns).map(
              (c) => r.scores[c]?.toFixed(4) ?? ""
            )
          ];
          return values.join(",");
        });
        return [headers.join(","), ...rows].join("\n");
      },
      getFailures(options) {
        let filtered = [...failures];
        if (options?.threshold !== void 0) {
          filtered = filtered.filter(
            (f) => Object.values(f.scores).some((s) => s < options.threshold)
          );
        }
        if (options?.metric) {
          filtered = filtered.filter(
            (f) => f.failedMetrics.includes(options.metric)
          );
        }
        if (options?.limit) {
          filtered = filtered.slice(0, options.limit);
        }
        return filtered;
      }
    };
  }
  /**
   * Add a metric to the pipeline.
   */
  addMetric(metric) {
    this.metrics.push(metric);
  }
  /**
   * Remove a metric by name.
   *
   * @returns true when a metric was removed
   */
  removeMetric(name) {
    const index = this.metrics.findIndex((m) => m.name === name);
    if (index >= 0) {
      this.metrics.splice(index, 1);
      return true;
    }
    return false;
  }
  /**
   * Replace the LLM judge used for subsequent runs.
   */
  setJudge(judge) {
    this.llmJudge = judge;
  }
  /**
   * Get a defensive copy of the configured metrics.
   */
  getMetrics() {
    return [...this.metrics];
  }
};
2749
/**
 * Factory helper for building an EvaluationPipeline.
 *
 * @param config - pipeline configuration (metrics, judge, runner options)
 * @returns a freshly constructed EvaluationPipeline
 */
function createEvaluationPipeline(config) {
  const pipeline = new EvaluationPipeline(config);
  return pipeline;
}
2752
+
2753
+ export {
2754
+ BaseMetric,
2755
+ Accuracy,
2756
+ createAccuracyMetric,
2757
+ Relevance,
2758
+ createRelevanceMetric,
2759
+ Coherence,
2760
+ createCoherenceMetric,
2761
+ Toxicity,
2762
+ createToxicityMetric,
2763
+ Faithfulness,
2764
+ createFaithfulnessMetric,
2765
+ ContextRelevance,
2766
+ createContextRelevanceMetric,
2767
+ CustomMetric,
2768
+ createCustomMetric,
2769
+ createSimpleMetric,
2770
+ createLengthMetric,
2771
+ createRegexMetric,
2772
+ createJSONMetric,
2773
+ createContainsMetric,
2774
+ LLMJudge,
2775
+ createLLMJudge,
2776
+ RubricJudge,
2777
+ createRubricJudge,
2778
+ QualityRubric,
2779
+ CodeQualityRubric,
2780
+ HelpfulnessRubric,
2781
+ ComparativeJudge,
2782
+ createComparativeJudge,
2783
+ ConsensusJudge,
2784
+ createConsensusJudge,
2785
+ EvalDataset,
2786
+ createEvalDataset,
2787
+ EvalRunner,
2788
+ createEvalRunner,
2789
+ EvaluationPipeline,
2790
+ createEvaluationPipeline
2791
+ };