rag-eval-node-ts 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +204 -0
  2. package/dist/__tests__/evaluate.test.d.ts +2 -0
  3. package/dist/__tests__/evaluate.test.d.ts.map +1 -0
  4. package/dist/__tests__/evaluate.test.js +130 -0
  5. package/dist/__tests__/evaluate.test.js.map +1 -0
  6. package/dist/__tests__/evaluator.test.d.ts +2 -0
  7. package/dist/__tests__/evaluator.test.d.ts.map +1 -0
  8. package/dist/__tests__/evaluator.test.js +92 -0
  9. package/dist/__tests__/evaluator.test.js.map +1 -0
  10. package/dist/__tests__/heuristic/ngrams.test.d.ts +2 -0
  11. package/dist/__tests__/heuristic/ngrams.test.d.ts.map +1 -0
  12. package/dist/__tests__/heuristic/ngrams.test.js +89 -0
  13. package/dist/__tests__/heuristic/ngrams.test.js.map +1 -0
  14. package/dist/__tests__/heuristic/tfidf.test.d.ts +2 -0
  15. package/dist/__tests__/heuristic/tfidf.test.d.ts.map +1 -0
  16. package/dist/__tests__/heuristic/tfidf.test.js +57 -0
  17. package/dist/__tests__/heuristic/tfidf.test.js.map +1 -0
  18. package/dist/__tests__/heuristic/token-f1.test.d.ts +2 -0
  19. package/dist/__tests__/heuristic/token-f1.test.d.ts.map +1 -0
  20. package/dist/__tests__/heuristic/token-f1.test.js +40 -0
  21. package/dist/__tests__/heuristic/token-f1.test.js.map +1 -0
  22. package/dist/__tests__/metrics/faithfulness.test.d.ts +2 -0
  23. package/dist/__tests__/metrics/faithfulness.test.d.ts.map +1 -0
  24. package/dist/__tests__/metrics/faithfulness.test.js +66 -0
  25. package/dist/__tests__/metrics/faithfulness.test.js.map +1 -0
  26. package/dist/__tests__/types.test.d.ts +2 -0
  27. package/dist/__tests__/types.test.d.ts.map +1 -0
  28. package/dist/__tests__/types.test.js +531 -0
  29. package/dist/__tests__/types.test.js.map +1 -0
  30. package/dist/evaluate.d.ts +14 -0
  31. package/dist/evaluate.d.ts.map +1 -0
  32. package/dist/evaluate.js +208 -0
  33. package/dist/evaluate.js.map +1 -0
  34. package/dist/evaluator.d.ts +10 -0
  35. package/dist/evaluator.d.ts.map +1 -0
  36. package/dist/evaluator.js +39 -0
  37. package/dist/evaluator.js.map +1 -0
  38. package/dist/heuristic/ngrams.d.ts +22 -0
  39. package/dist/heuristic/ngrams.d.ts.map +1 -0
  40. package/dist/heuristic/ngrams.js +70 -0
  41. package/dist/heuristic/ngrams.js.map +1 -0
  42. package/dist/heuristic/sentences.d.ts +13 -0
  43. package/dist/heuristic/sentences.d.ts.map +1 -0
  44. package/dist/heuristic/sentences.js +23 -0
  45. package/dist/heuristic/sentences.js.map +1 -0
  46. package/dist/heuristic/tfidf.d.ts +21 -0
  47. package/dist/heuristic/tfidf.d.ts.map +1 -0
  48. package/dist/heuristic/tfidf.js +87 -0
  49. package/dist/heuristic/tfidf.js.map +1 -0
  50. package/dist/heuristic/token-f1.d.ts +12 -0
  51. package/dist/heuristic/token-f1.d.ts.map +1 -0
  52. package/dist/heuristic/token-f1.js +41 -0
  53. package/dist/heuristic/token-f1.js.map +1 -0
  54. package/dist/index.d.ts +9 -0
  55. package/dist/index.d.ts.map +1 -0
  56. package/dist/index.js +37 -0
  57. package/dist/index.js.map +1 -0
  58. package/dist/metrics/answer-correctness.d.ts +7 -0
  59. package/dist/metrics/answer-correctness.d.ts.map +1 -0
  60. package/dist/metrics/answer-correctness.js +51 -0
  61. package/dist/metrics/answer-correctness.js.map +1 -0
  62. package/dist/metrics/answer-relevance.d.ts +6 -0
  63. package/dist/metrics/answer-relevance.d.ts.map +1 -0
  64. package/dist/metrics/answer-relevance.js +37 -0
  65. package/dist/metrics/answer-relevance.js.map +1 -0
  66. package/dist/metrics/context-precision.d.ts +6 -0
  67. package/dist/metrics/context-precision.d.ts.map +1 -0
  68. package/dist/metrics/context-precision.js +57 -0
  69. package/dist/metrics/context-precision.js.map +1 -0
  70. package/dist/metrics/context-recall.d.ts +7 -0
  71. package/dist/metrics/context-recall.d.ts.map +1 -0
  72. package/dist/metrics/context-recall.js +66 -0
  73. package/dist/metrics/context-recall.js.map +1 -0
  74. package/dist/metrics/context-relevance.d.ts +6 -0
  75. package/dist/metrics/context-relevance.d.ts.map +1 -0
  76. package/dist/metrics/context-relevance.js +48 -0
  77. package/dist/metrics/context-relevance.js.map +1 -0
  78. package/dist/metrics/faithfulness.d.ts +6 -0
  79. package/dist/metrics/faithfulness.d.ts.map +1 -0
  80. package/dist/metrics/faithfulness.js +64 -0
  81. package/dist/metrics/faithfulness.js.map +1 -0
  82. package/dist/metrics/hallucination-rate.d.ts +7 -0
  83. package/dist/metrics/hallucination-rate.d.ts.map +1 -0
  84. package/dist/metrics/hallucination-rate.js +65 -0
  85. package/dist/metrics/hallucination-rate.js.map +1 -0
  86. package/dist/metrics/index.d.ts +14 -0
  87. package/dist/metrics/index.d.ts.map +1 -0
  88. package/dist/metrics/index.js +40 -0
  89. package/dist/metrics/index.js.map +1 -0
  90. package/dist/types.d.ts +169 -0
  91. package/dist/types.d.ts.map +1 -0
  92. package/dist/types.js +4 -0
  93. package/dist/types.js.map +1 -0
  94. package/package.json +53 -0
package/dist/index.js ADDED
@@ -0,0 +1,37 @@
1
"use strict";
// rag-eval-node-ts - Lightweight RAG evaluation metrics for CI/CD pipelines
// Compiled CommonJS barrel file (tsc output). Each re-export is wired up with a
// defineProperty getter so the bindings stay live, mirroring TS `export { x } from ...`.
// NOTE: the `exports.x = ... = void 0` preamble and the statement order are
// significant for circular-require safety — do not reorder.
Object.defineProperty(exports, "__esModule", { value: true });
exports.tokenF1 = exports.tfidfSimilarity = exports.cosineSimilarity = exports.buildTfIdfVectors = exports.weightedNgramOverlap = exports.ngramOverlap = exports.getNgrams = exports.tokenize = exports.filterFactualSentences = exports.splitSentences = exports.computeMetric = exports.scoreHallucinationRate = exports.scoreAnswerCorrectness = exports.scoreContextRelevance = exports.scoreContextRecall = exports.scoreContextPrecision = exports.scoreAnswerRelevance = exports.scoreFaithfulness = exports.createEvaluator = exports.evaluateBatch = exports.evaluate = void 0;
// Core evaluation functions
var evaluate_1 = require("./evaluate");
Object.defineProperty(exports, "evaluate", { enumerable: true, get: function () { return evaluate_1.evaluate; } });
Object.defineProperty(exports, "evaluateBatch", { enumerable: true, get: function () { return evaluate_1.evaluateBatch; } });
// Evaluator factory
var evaluator_1 = require("./evaluator");
Object.defineProperty(exports, "createEvaluator", { enumerable: true, get: function () { return evaluator_1.createEvaluator; } });
// Metric functions
var index_1 = require("./metrics/index");
Object.defineProperty(exports, "scoreFaithfulness", { enumerable: true, get: function () { return index_1.scoreFaithfulness; } });
Object.defineProperty(exports, "scoreAnswerRelevance", { enumerable: true, get: function () { return index_1.scoreAnswerRelevance; } });
Object.defineProperty(exports, "scoreContextPrecision", { enumerable: true, get: function () { return index_1.scoreContextPrecision; } });
Object.defineProperty(exports, "scoreContextRecall", { enumerable: true, get: function () { return index_1.scoreContextRecall; } });
Object.defineProperty(exports, "scoreContextRelevance", { enumerable: true, get: function () { return index_1.scoreContextRelevance; } });
Object.defineProperty(exports, "scoreAnswerCorrectness", { enumerable: true, get: function () { return index_1.scoreAnswerCorrectness; } });
Object.defineProperty(exports, "scoreHallucinationRate", { enumerable: true, get: function () { return index_1.scoreHallucinationRate; } });
Object.defineProperty(exports, "computeMetric", { enumerable: true, get: function () { return index_1.computeMetric; } });
// Heuristic primitives
var sentences_1 = require("./heuristic/sentences");
Object.defineProperty(exports, "splitSentences", { enumerable: true, get: function () { return sentences_1.splitSentences; } });
Object.defineProperty(exports, "filterFactualSentences", { enumerable: true, get: function () { return sentences_1.filterFactualSentences; } });
var ngrams_1 = require("./heuristic/ngrams");
Object.defineProperty(exports, "tokenize", { enumerable: true, get: function () { return ngrams_1.tokenize; } });
Object.defineProperty(exports, "getNgrams", { enumerable: true, get: function () { return ngrams_1.getNgrams; } });
Object.defineProperty(exports, "ngramOverlap", { enumerable: true, get: function () { return ngrams_1.ngramOverlap; } });
Object.defineProperty(exports, "weightedNgramOverlap", { enumerable: true, get: function () { return ngrams_1.weightedNgramOverlap; } });
var tfidf_1 = require("./heuristic/tfidf");
Object.defineProperty(exports, "buildTfIdfVectors", { enumerable: true, get: function () { return tfidf_1.buildTfIdfVectors; } });
Object.defineProperty(exports, "cosineSimilarity", { enumerable: true, get: function () { return tfidf_1.cosineSimilarity; } });
Object.defineProperty(exports, "tfidfSimilarity", { enumerable: true, get: function () { return tfidf_1.tfidfSimilarity; } });
var token_f1_1 = require("./heuristic/token-f1");
Object.defineProperty(exports, "tokenF1", { enumerable: true, get: function () { return token_f1_1.tokenF1; } });
//# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAAA,4EAA4E;;;AAwB5E,4BAA4B;AAC5B,uCAAqD;AAA5C,oGAAA,QAAQ,OAAA;AAAE,yGAAA,aAAa,OAAA;AAEhC,oBAAoB;AACpB,yCAA8C;AAArC,4GAAA,eAAe,OAAA;AAExB,mBAAmB;AACnB,yCASyB;AARvB,0GAAA,iBAAiB,OAAA;AACjB,6GAAA,oBAAoB,OAAA;AACpB,8GAAA,qBAAqB,OAAA;AACrB,2GAAA,kBAAkB,OAAA;AAClB,8GAAA,qBAAqB,OAAA;AACrB,+GAAA,sBAAsB,OAAA;AACtB,+GAAA,sBAAsB,OAAA;AACtB,sGAAA,aAAa,OAAA;AAGf,uBAAuB;AACvB,mDAA+E;AAAtE,2GAAA,cAAc,OAAA;AAAE,mHAAA,sBAAsB,OAAA;AAC/C,6CAA6F;AAApF,kGAAA,QAAQ,OAAA;AAAE,mGAAA,SAAS,OAAA;AAAE,sGAAA,YAAY,OAAA;AAAE,8GAAA,oBAAoB,OAAA;AAChE,2CAAyF;AAAhF,0GAAA,iBAAiB,OAAA;AAAE,yGAAA,gBAAgB,OAAA;AAAE,wGAAA,eAAe,OAAA;AAC7D,iDAA+C;AAAtC,mGAAA,OAAO,OAAA"}
@@ -0,0 +1,7 @@
1
/**
 * Answer correctness metric: is the answer factually correct vs ground truth?
 * Requires groundTruth.
 */
import type { EvalSample, HeuristicOptions, MetricResult } from '../types';
/**
 * Heuristic answer-correctness score for a single RAG sample.
 *
 * @param sample - Sample under evaluation; when `groundTruth` is absent the
 *   metric is skipped (result has null score/passed).
 * @param _options - Heuristic tuning options (currently unused by this metric).
 * @returns Promise resolving to the metric result.
 */
export declare function scoreAnswerCorrectness(sample: EvalSample, _options?: HeuristicOptions): Promise<MetricResult>;
//# sourceMappingURL=answer-correctness.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"answer-correctness.d.ts","sourceRoot":"","sources":["../../src/metrics/answer-correctness.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAc,MAAM,UAAU,CAAC;AAMvF,wBAAsB,sBAAsB,CAC1C,MAAM,EAAE,UAAU,EAClB,QAAQ,CAAC,EAAE,gBAAgB,GAC1B,OAAO,CAAC,YAAY,CAAC,CA2CvB"}
@@ -0,0 +1,51 @@
1
"use strict";
/**
 * Answer correctness metric: is the answer factually correct vs ground truth?
 * Requires groundTruth.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.scoreAnswerCorrectness = scoreAnswerCorrectness;
const token_f1_1 = require("../heuristic/token-f1");
const ngrams_1 = require("../heuristic/ngrams");
// Minimum blended score required for the metric to pass.
const DEFAULT_THRESHOLD = 0.6;
/**
 * Scores the answer against the ground truth as a 70/30 blend of token-level
 * F1 and unigram overlap. Purely heuristic — makes no LLM calls.
 * When `groundTruth` is missing the metric is skipped (score/passed = null).
 */
async function scoreAnswerCorrectness(sample, _options) {
    const startedAt = Date.now();
    const threshold = DEFAULT_THRESHOLD;
    const signals = [];
    // Every exit path shares the same result shape; only score/passed/explanation vary.
    const buildResult = (score, passed, explanation) => ({
        metricId: 'answerCorrectness',
        score,
        mode: 'heuristic',
        passed,
        threshold,
        explanation,
        signals,
        llmCalls: 0,
        durationMs: Date.now() - startedAt,
    });
    // Ground truth is mandatory for this metric — report a skipped result otherwise.
    if (!sample.groundTruth) {
        return buildResult(null, null, 'groundTruth is required for answerCorrectness but was not provided.');
    }
    const f1 = (0, token_f1_1.tokenF1)(sample.groundTruth, sample.answer);
    const unigram = (0, ngrams_1.ngramOverlap)(sample.groundTruth, sample.answer, 1);
    const blended = 0.7 * f1 + 0.3 * unigram;
    // Surface a warning signal for clearly weak correctness (independent of pass cutoff).
    if (blended < 0.5) {
        signals.push({
            id: 'answer-correctness-low',
            metricId: 'answerCorrectness',
            severity: 'warning',
            message: `Answer correctness is low (score=${blended.toFixed(3)}).`,
        });
    }
    return buildResult(blended, blended >= threshold, `Token F1: ${f1.toFixed(3)}, unigram overlap: ${unigram.toFixed(3)}, blended score (70%F1 + 30%ngram): ${blended.toFixed(3)}.`);
}
//# sourceMappingURL=answer-correctness.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"answer-correctness.js","sourceRoot":"","sources":["../../src/metrics/answer-correctness.ts"],"names":[],"mappings":";AAAA;;;GAGG;;AAQH,wDA8CC;AAnDD,oDAAgD;AAChD,gDAAmD;AAEnD,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAEvB,KAAK,UAAU,sBAAsB,CAC1C,MAAkB,EAClB,QAA2B;IAE3B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,iBAAiB,CAAC;IACpC,MAAM,OAAO,GAAiB,EAAE,CAAC;IAEjC,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;QACxB,OAAO;YACL,QAAQ,EAAE,mBAAmB;YAC7B,KAAK,EAAE,IAAI;YACX,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,IAAI;YACZ,SAAS;YACT,WAAW,EAAE,qEAAqE;YAClF,OAAO;YACP,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC/B,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,GAAG,IAAA,kBAAO,EAAC,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;IACtD,MAAM,KAAK,GAAG,IAAA,qBAAY,EAAC,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACjE,MAAM,KAAK,GAAG,GAAG,GAAG,EAAE,GAAG,GAAG,GAAG,KAAK,CAAC;IAErC,IAAI,KAAK,GAAG,GAAG,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC;YACX,EAAE,EAAE,wBAAwB;YAC5B,QAAQ,EAAE,mBAAmB;YAC7B,QAAQ,EAAE,SAAS;YACnB,OAAO,EAAE,oCAAoC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;SAClE,CAAC,CAAC;IACL,CAAC;IAED,OAAO;QACL,QAAQ,EAAE,mBAAmB;QAC7B,KAAK;QACL,IAAI,EAAE,WAAW;QACjB,MAAM,EAAE,KAAK,IAAI,SAAS;QAC1B,SAAS;QACT,WAAW,EAAE,aAAa,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,sBAAsB,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,uCAAuC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;QACvI,OAAO;QACP,QAAQ,EAAE,CAAC;QACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;KAC/B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,6 @@
1
/**
 * Answer relevance metric: is the answer relevant to the question?
 */
import type { EvalSample, HeuristicOptions, MetricResult } from '../types';
/**
 * Heuristic answer-relevance score for a single RAG sample.
 *
 * @param sample - Sample under evaluation (question and answer are compared).
 * @param _options - Heuristic tuning options (currently unused by this metric).
 * @returns Promise resolving to the metric result.
 */
export declare function scoreAnswerRelevance(sample: EvalSample, _options?: HeuristicOptions): Promise<MetricResult>;
//# sourceMappingURL=answer-relevance.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"answer-relevance.d.ts","sourceRoot":"","sources":["../../src/metrics/answer-relevance.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAc,MAAM,UAAU,CAAC;AAMvF,wBAAsB,oBAAoB,CACxC,MAAM,EAAE,UAAU,EAClB,QAAQ,CAAC,EAAE,gBAAgB,GAC1B,OAAO,CAAC,YAAY,CAAC,CA6BvB"}
@@ -0,0 +1,37 @@
1
"use strict";
/**
 * Answer relevance metric: is the answer relevant to the question?
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.scoreAnswerRelevance = scoreAnswerRelevance;
const tfidf_1 = require("../heuristic/tfidf");
const ngrams_1 = require("../heuristic/ngrams");
// Minimum averaged similarity required for the metric to pass.
const DEFAULT_THRESHOLD = 0.7;
/**
 * Scores question/answer relevance as the arithmetic mean of TF-IDF
 * similarity and unigram overlap. Purely heuristic — makes no LLM calls.
 */
async function scoreAnswerRelevance(sample, _options) {
    const startedAt = Date.now();
    const threshold = DEFAULT_THRESHOLD;
    const signals = [];
    const tfidfScore = (0, tfidf_1.tfidfSimilarity)(sample.question, sample.answer);
    const unigram = (0, ngrams_1.ngramOverlap)(sample.question, sample.answer, 1);
    const averaged = (tfidfScore + unigram) / 2;
    // Flag clearly weak relevance with a warning signal (independent of pass cutoff).
    if (averaged < 0.5) {
        signals.push({
            id: 'answer-relevance-low',
            metricId: 'answerRelevance',
            severity: 'warning',
            message: `Answer may not be relevant to the question (score=${averaged.toFixed(3)}).`,
        });
    }
    return {
        metricId: 'answerRelevance',
        score: averaged,
        mode: 'heuristic',
        passed: averaged >= threshold,
        threshold,
        explanation: `TF-IDF similarity: ${tfidfScore.toFixed(3)}, unigram overlap: ${unigram.toFixed(3)}, average: ${averaged.toFixed(3)}.`,
        signals,
        llmCalls: 0,
        durationMs: Date.now() - startedAt,
    };
}
//# sourceMappingURL=answer-relevance.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"answer-relevance.js","sourceRoot":"","sources":["../../src/metrics/answer-relevance.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAQH,oDAgCC;AArCD,8CAAqD;AACrD,gDAAmD;AAEnD,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAEvB,KAAK,UAAU,oBAAoB,CACxC,MAAkB,EAClB,QAA2B;IAE3B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,iBAAiB,CAAC;IACpC,MAAM,OAAO,GAAiB,EAAE,CAAC;IAEjC,MAAM,KAAK,GAAG,IAAA,uBAAe,EAAC,MAAM,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;IAC9D,MAAM,KAAK,GAAG,IAAA,qBAAY,EAAC,MAAM,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,KAAK,GAAG,CAAC,KAAK,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;IAElC,IAAI,KAAK,GAAG,GAAG,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC;YACX,EAAE,EAAE,sBAAsB;YAC1B,QAAQ,EAAE,iBAAiB;YAC3B,QAAQ,EAAE,SAAS;YACnB,OAAO,EAAE,qDAAqD,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;SACnF,CAAC,CAAC;IACL,CAAC;IAED,OAAO;QACL,QAAQ,EAAE,iBAAiB;QAC3B,KAAK;QACL,IAAI,EAAE,WAAW;QACjB,MAAM,EAAE,KAAK,IAAI,SAAS;QAC1B,SAAS;QACT,WAAW,EAAE,sBAAsB,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,sBAAsB,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;QAC1H,OAAO;QACP,QAAQ,EAAE,CAAC;QACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;KAC/B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,6 @@
1
/**
 * Context precision metric: are the retrieved contexts relevant to answering the question?
 */
import type { EvalSample, HeuristicOptions, MetricResult } from '../types';
/**
 * Heuristic context-precision score for a single RAG sample.
 *
 * @param sample - Sample under evaluation; each context chunk is compared to
 *   the question (an empty context list scores 0 and fails).
 * @param _options - Heuristic tuning options (currently unused by this metric).
 * @returns Promise resolving to the metric result.
 */
export declare function scoreContextPrecision(sample: EvalSample, _options?: HeuristicOptions): Promise<MetricResult>;
//# sourceMappingURL=context-precision.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-precision.d.ts","sourceRoot":"","sources":["../../src/metrics/context-precision.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAc,MAAM,UAAU,CAAC;AAMvF,wBAAsB,qBAAqB,CACzC,MAAM,EAAE,UAAU,EAClB,QAAQ,CAAC,EAAE,gBAAgB,GAC1B,OAAO,CAAC,YAAY,CAAC,CAoDvB"}
@@ -0,0 +1,57 @@
1
"use strict";
/**
 * Context precision metric: are the retrieved contexts relevant to answering the question?
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.scoreContextPrecision = scoreContextPrecision;
const tfidf_1 = require("../heuristic/tfidf");
// Minimum average chunk relevance required for the metric to pass.
const DEFAULT_THRESHOLD = 0.7;
// Chunks scoring below this are surfaced in an informational signal.
const LOW_RELEVANCE_THRESHOLD = 0.3;
/**
 * Averages per-chunk TF-IDF relevance of each retrieved context against the
 * question. An empty context list scores 0 and fails. Purely heuristic —
 * makes no LLM calls.
 */
async function scoreContextPrecision(sample, _options) {
    const startedAt = Date.now();
    const threshold = DEFAULT_THRESHOLD;
    const signals = [];
    // Every exit path shares the same result shape; only score/passed/explanation vary.
    const buildResult = (score, passed, explanation) => ({
        metricId: 'contextPrecision',
        score,
        mode: 'heuristic',
        passed,
        threshold,
        explanation,
        signals,
        llmCalls: 0,
        durationMs: Date.now() - startedAt,
    });
    if (sample.contexts.length === 0) {
        return buildResult(0, false, 'No context chunks provided.');
    }
    // Per-chunk relevance of each context to the question.
    const relevances = sample.contexts.map((ctx) => (0, tfidf_1.tfidfSimilarity)(sample.question, ctx));
    // Indices of chunks that look only weakly related to the question.
    const lowChunks = relevances
        .map((relevance, index) => (relevance < LOW_RELEVANCE_THRESHOLD ? index : -1))
        .filter((index) => index !== -1);
    if (lowChunks.length > 0) {
        signals.push({
            id: 'context-precision-low-relevance-chunks',
            metricId: 'contextPrecision',
            severity: 'info',
            message: `Context chunk(s) with low relevance (< ${LOW_RELEVANCE_THRESHOLD}): indices [${lowChunks.join(', ')}].`,
        });
    }
    const average = relevances.reduce((sum, value) => sum + value, 0) / relevances.length;
    return buildResult(average, average >= threshold, `Average TF-IDF relevance of ${sample.contexts.length} context chunk(s) to the question: ${average.toFixed(3)}.`);
}
//# sourceMappingURL=context-precision.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-precision.js","sourceRoot":"","sources":["../../src/metrics/context-precision.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAQH,sDAuDC;AA5DD,8CAAqD;AAErD,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAC9B,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE7B,KAAK,UAAU,qBAAqB,CACzC,MAAkB,EAClB,QAA2B;IAE3B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,iBAAiB,CAAC;IACpC,MAAM,OAAO,GAAiB,EAAE,CAAC;IAEjC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACjC,OAAO;YACL,QAAQ,EAAE,kBAAkB;YAC5B,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,KAAK;YACb,SAAS;YACT,WAAW,EAAE,6BAA6B;YAC1C,OAAO;YACP,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC/B,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAa,EAAE,CAAC;IAChC,MAAM,SAAS,GAAa,EAAE,CAAC;IAE/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAChD,MAAM,SAAS,GAAG,IAAA,uBAAe,EAAC,MAAM,CAAC,QAAQ,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QACvE,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAC3B,IAAI,SAAS,GAAG,uBAAuB,EAAE,CAAC;YACxC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,CAAC;IACH,CAAC;IAED,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,IAAI,CAAC;YACX,EAAE,EAAE,wCAAwC;YAC5C,QAAQ,EAAE,kBAAkB;YAC5B,QAAQ,EAAE,MAAM;YAChB,OAAO,EAAE,0CAA0C,uBAAuB,eAAe,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI;SAClH,CAAC,CAAC;IACL,CAAC;IAED,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;IAExE,OAAO;QACL,QAAQ,EAAE,kBAAkB;QAC5B,KAAK;QACL,IAAI,EAAE,WAAW;QACjB,MAAM,EAAE,KAAK,IAAI,SAAS;QAC1B,SAAS;QACT,WAAW,EAAE,+BAA+B,MAAM,CAAC,QAAQ,CAAC,MAAM,sCAAsC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;QAC3H,OAAO;QACP,QAAQ,EAAE,CAAC;QACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;KAC/B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,7 @@
1
/**
 * Context recall metric: do contexts cover the ground truth?
 * Requires groundTruth.
 */
import type { EvalSample, HeuristicOptions, MetricResult } from '../types';
/**
 * Heuristic context-recall score for a single RAG sample.
 *
 * @param sample - Sample under evaluation; when `groundTruth` is absent the
 *   metric is skipped (result has null score/passed).
 * @param _options - Heuristic tuning options (currently unused by this metric).
 * @returns Promise resolving to the metric result.
 */
export declare function scoreContextRecall(sample: EvalSample, _options?: HeuristicOptions): Promise<MetricResult>;
//# sourceMappingURL=context-recall.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-recall.d.ts","sourceRoot":"","sources":["../../src/metrics/context-recall.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAO3E,wBAAsB,kBAAkB,CACtC,MAAM,EAAE,UAAU,EAClB,QAAQ,CAAC,EAAE,gBAAgB,GAC1B,OAAO,CAAC,YAAY,CAAC,CAyDvB"}
@@ -0,0 +1,66 @@
1
"use strict";
/**
 * Context recall metric: do contexts cover the ground truth?
 * Requires groundTruth.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.scoreContextRecall = scoreContextRecall;
const sentences_1 = require("../heuristic/sentences");
const ngrams_1 = require("../heuristic/ngrams");
// Minimum fraction of covered ground-truth sentences required to pass.
const DEFAULT_THRESHOLD = 0.7;
// Minimum unigram overlap for a sentence to count as "covered" by some chunk.
const COVERAGE_THRESHOLD = 0.3;
/**
 * Measures the fraction of ground-truth sentences covered by at least one
 * context chunk (unigram overlap >= COVERAGE_THRESHOLD). Skipped with null
 * score/passed when groundTruth is absent. Purely heuristic — no LLM calls.
 */
async function scoreContextRecall(sample, _options) {
    const startedAt = Date.now();
    const threshold = DEFAULT_THRESHOLD;
    // Every exit path shares the same result shape; only score/passed/explanation vary.
    const buildResult = (score, passed, explanation) => ({
        metricId: 'contextRecall',
        score,
        mode: 'heuristic',
        passed,
        threshold,
        explanation,
        signals: [],
        llmCalls: 0,
        durationMs: Date.now() - startedAt,
    });
    // Ground truth is mandatory for this metric — report a skipped result otherwise.
    if (!sample.groundTruth) {
        return buildResult(null, null, 'groundTruth is required for contextRecall but was not provided.');
    }
    const gtSentences = (0, sentences_1.splitSentences)(sample.groundTruth);
    if (gtSentences.length === 0 || sample.contexts.length === 0) {
        return buildResult(0, false, 'No ground truth sentences or context chunks to evaluate.');
    }
    // A sentence counts as covered when any single chunk overlaps it enough
    // (equivalent to taking the max overlap across chunks and comparing it).
    const isCovered = (sentence) => sample.contexts.some((ctx) => (0, ngrams_1.ngramOverlap)(sentence, ctx, 1) >= COVERAGE_THRESHOLD);
    const coveredCount = gtSentences.filter(isCovered).length;
    const recall = coveredCount / gtSentences.length;
    return buildResult(recall, recall >= threshold, `${coveredCount}/${gtSentences.length} ground truth sentence(s) covered by contexts (overlap >= ${COVERAGE_THRESHOLD}): score=${recall.toFixed(3)}.`);
}
//# sourceMappingURL=context-recall.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-recall.js","sourceRoot":"","sources":["../../src/metrics/context-recall.ts"],"names":[],"mappings":";AAAA;;;GAGG;;AASH,gDA4DC;AAlED,sDAAwD;AACxD,gDAAmD;AAEnD,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAC9B,MAAM,kBAAkB,GAAG,GAAG,CAAC;AAExB,KAAK,UAAU,kBAAkB,CACtC,MAAkB,EAClB,QAA2B;IAE3B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,iBAAiB,CAAC;IAEpC,IAAI,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;QACxB,OAAO;YACL,QAAQ,EAAE,eAAe;YACzB,KAAK,EAAE,IAAI;YACX,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,IAAI;YACZ,SAAS;YACT,WAAW,EAAE,iEAAiE;YAC9E,OAAO,EAAE,EAAE;YACX,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC/B,CAAC;IACJ,CAAC;IAED,MAAM,WAAW,GAAG,IAAA,0BAAc,EAAC,MAAM,CAAC,WAAW,CAAC,CAAC;IAEvD,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC7D,OAAO;YACL,QAAQ,EAAE,eAAe;YACzB,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,KAAK;YACb,SAAS;YACT,WAAW,EAAE,0DAA0D;YACvE,OAAO,EAAE,EAAE;YACX,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC/B,CAAC;IACJ,CAAC;IAED,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;QACnC,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;YAClC,MAAM,OAAO,GAAG,IAAA,qBAAY,EAAC,QAAQ,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;YAC/C,IAAI,OAAO,GAAG,UAAU;gBAAE,UAAU,GAAG,OAAO,CAAC;QACjD,CAAC;QACD,IAAI,UAAU,IAAI,kBAAkB;YAAE,YAAY,EAAE,CAAC;IACvD,CAAC;IAED,MAAM,KAAK,GAAG,YAAY,GAAG,WAAW,CAAC,MAAM,CAAC;IAEhD,OAAO;QACL,QAAQ,EAAE,eAAe;QACzB,KAAK;QACL,IAAI,EAAE,WAAW;QACjB,MAAM,EAAE,KAAK,IAAI,SAAS;QAC1B,SAAS;QACT,WAAW,EAAE,GAAG,YAAY,IAAI,WAAW,CAAC,MAAM,6DAA6D,kBAAkB,YAAY,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;QAChK,OAAO,EAAE,EAAE;QACX,QAAQ,EAAE,CAAC;QACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;KAC/B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,6 @@
1
/**
 * Context relevance metric: are context chunks relevant to the question (stricter than precision)?
 */
import type { EvalSample, HeuristicOptions, MetricResult } from '../types';
/**
 * Heuristic context-relevance score for a single RAG sample.
 *
 * @param sample - Sample under evaluation; each context chunk is tested
 *   against the question (an empty context list scores 0 and fails).
 * @param options - Heuristic tuning (chunkRelevanceThreshold, ngramSizes, ngramWeights).
 * @returns Promise resolving to the metric result.
 */
export declare function scoreContextRelevance(sample: EvalSample, options?: HeuristicOptions): Promise<MetricResult>;
//# sourceMappingURL=context-relevance.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-relevance.d.ts","sourceRoot":"","sources":["../../src/metrics/context-relevance.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AAM3E,wBAAsB,qBAAqB,CACzC,MAAM,EAAE,UAAU,EAClB,OAAO,CAAC,EAAE,gBAAgB,GACzB,OAAO,CAAC,YAAY,CAAC,CAwCvB"}
@@ -0,0 +1,48 @@
1
"use strict";
/**
 * Context relevance metric: are context chunks relevant to the question (stricter than precision)?
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.scoreContextRelevance = scoreContextRelevance;
const ngrams_1 = require("../heuristic/ngrams");
// Minimum fraction of relevant chunks required for the metric to pass.
const DEFAULT_THRESHOLD = 0.6;
// Default per-chunk weighted n-gram overlap needed to count a chunk as relevant.
const DEFAULT_CHUNK_RELEVANCE_THRESHOLD = 0.2;
/**
 * Fraction of context chunks whose weighted n-gram overlap with the question
 * meets the per-chunk relevance threshold. The chunk threshold, n-gram sizes,
 * and weights are tunable via options. Purely heuristic — no LLM calls.
 */
async function scoreContextRelevance(sample, options) {
    const startedAt = Date.now();
    const threshold = DEFAULT_THRESHOLD;
    // Fall back to library defaults only when the option is null/undefined.
    const chunkRelevanceThreshold = options?.chunkRelevanceThreshold ?? DEFAULT_CHUNK_RELEVANCE_THRESHOLD;
    const ngramSizes = options?.ngramSizes ?? [1, 2];
    const ngramWeights = options?.ngramWeights ?? [0.7, 0.3];
    // Every exit path shares the same result shape; only score/passed/explanation vary.
    const buildResult = (score, passed, explanation) => ({
        metricId: 'contextRelevance',
        score,
        mode: 'heuristic',
        passed,
        threshold,
        explanation,
        signals: [],
        llmCalls: 0,
        durationMs: Date.now() - startedAt,
    });
    if (sample.contexts.length === 0) {
        return buildResult(0, false, 'No context chunks provided.');
    }
    const isRelevant = (ctx) => (0, ngrams_1.weightedNgramOverlap)(sample.question, ctx, ngramSizes, ngramWeights) >= chunkRelevanceThreshold;
    const relevantCount = sample.contexts.filter(isRelevant).length;
    const fraction = relevantCount / sample.contexts.length;
    return buildResult(fraction, fraction >= threshold, `${relevantCount}/${sample.contexts.length} context chunk(s) meet relevance threshold (>= ${chunkRelevanceThreshold}): score=${fraction.toFixed(3)}.`);
}
//# sourceMappingURL=context-relevance.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-relevance.js","sourceRoot":"","sources":["../../src/metrics/context-relevance.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAQH,sDA2CC;AAhDD,gDAA2D;AAE3D,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAC9B,MAAM,iCAAiC,GAAG,GAAG,CAAC;AAEvC,KAAK,UAAU,qBAAqB,CACzC,MAAkB,EAClB,OAA0B;IAE1B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,iBAAiB,CAAC;IACpC,MAAM,uBAAuB,GAAG,OAAO,EAAE,uBAAuB,IAAI,iCAAiC,CAAC;IACtG,MAAM,UAAU,GAAG,OAAO,EAAE,UAAU,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACjD,MAAM,YAAY,GAAG,OAAO,EAAE,YAAY,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAEzD,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACjC,OAAO;YACL,QAAQ,EAAE,kBAAkB;YAC5B,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,KAAK;YACb,SAAS;YACT,WAAW,EAAE,6BAA6B;YAC1C,OAAO,EAAE,EAAE;YACX,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC/B,CAAC;IACJ,CAAC;IAED,IAAI,aAAa,GAAG,CAAC,CAAC;IACtB,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;QAClC,MAAM,OAAO,GAAG,IAAA,6BAAoB,EAAC,MAAM,CAAC,QAAQ,EAAE,GAAG,EAAE,UAAU,EAAE,YAAY,CAAC,CAAC;QACrF,IAAI,OAAO,IAAI,uBAAuB;YAAE,aAAa,EAAE,CAAC;IAC1D,CAAC;IAED,MAAM,KAAK,GAAG,aAAa,GAAG,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;IAErD,OAAO;QACL,QAAQ,EAAE,kBAAkB;QAC5B,KAAK;QACL,IAAI,EAAE,WAAW;QACjB,MAAM,EAAE,KAAK,IAAI,SAAS;QAC1B,SAAS;QACT,WAAW,EAAE,GAAG,aAAa,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,kDAAkD,uBAAuB,YAAY,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;QAC/J,OAAO,EAAE,EAAE;QACX,QAAQ,EAAE,CAAC;QACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;KAC/B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,6 @@
1
/**
 * Faithfulness metric: is the answer supported by the retrieved contexts?
 */
import type { EvalSample, HeuristicOptions, MetricResult } from '../types';
/**
 * Heuristic faithfulness score for a single RAG sample.
 *
 * @param sample - Sample under evaluation; answer sentences are checked for
 *   support in the context chunks (empty answer or contexts scores 0).
 * @param options - Heuristic tuning (ngramSizes, ngramWeights) — presumably
 *   other HeuristicOptions fields are ignored; verify against implementation.
 * @returns Promise resolving to the metric result.
 */
export declare function scoreFaithfulness(sample: EvalSample, options?: HeuristicOptions): Promise<MetricResult>;
//# sourceMappingURL=faithfulness.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"faithfulness.d.ts","sourceRoot":"","sources":["../../src/metrics/faithfulness.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAc,MAAM,UAAU,CAAC;AAOvF,wBAAsB,iBAAiB,CACrC,MAAM,EAAE,UAAU,EAClB,OAAO,CAAC,EAAE,gBAAgB,GACzB,OAAO,CAAC,YAAY,CAAC,CA0DvB"}
@@ -0,0 +1,64 @@
1
+ "use strict";
2
+ /**
3
+ * Faithfulness metric: is the answer supported by the retrieved contexts?
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.scoreFaithfulness = scoreFaithfulness;
7
+ const sentences_1 = require("../heuristic/sentences");
8
+ const ngrams_1 = require("../heuristic/ngrams");
9
+ const DEFAULT_THRESHOLD = 0.7;
10
+ const LOW_SUPPORT_THRESHOLD = 0.3;
11
+ async function scoreFaithfulness(sample, options) {
12
+ const start = Date.now();
13
+ const threshold = DEFAULT_THRESHOLD;
14
+ const ngramSizes = options?.ngramSizes ?? [1, 2];
15
+ const ngramWeights = options?.ngramWeights ?? [0.7, 0.3];
16
+ const sentences = (0, sentences_1.splitSentences)(sample.answer);
17
+ const signals = [];
18
+ if (sentences.length === 0 || sample.contexts.length === 0) {
19
+ return {
20
+ metricId: 'faithfulness',
21
+ score: 0,
22
+ mode: 'heuristic',
23
+ passed: false,
24
+ threshold,
25
+ explanation: 'No answer sentences or context chunks to evaluate.',
26
+ signals,
27
+ llmCalls: 0,
28
+ durationMs: Date.now() - start,
29
+ };
30
+ }
31
+ const sentenceScores = [];
32
+ for (let i = 0; i < sentences.length; i++) {
33
+ const sentence = sentences[i];
34
+ let maxOverlap = 0;
35
+ for (const ctx of sample.contexts) {
36
+ const overlap = (0, ngrams_1.weightedNgramOverlap)(sentence, ctx, ngramSizes, ngramWeights);
37
+ if (overlap > maxOverlap)
38
+ maxOverlap = overlap;
39
+ }
40
+ sentenceScores.push(maxOverlap);
41
+ if (maxOverlap < LOW_SUPPORT_THRESHOLD) {
42
+ signals.push({
43
+ id: `faithfulness-low-support-${i}`,
44
+ metricId: 'faithfulness',
45
+ severity: 'warning',
46
+ message: `Answer sentence has low context support (overlap=${maxOverlap.toFixed(3)}).`,
47
+ evidence: sentence,
48
+ });
49
+ }
50
+ }
51
+ const score = sentenceScores.reduce((a, b) => a + b, 0) / sentenceScores.length;
52
+ return {
53
+ metricId: 'faithfulness',
54
+ score,
55
+ mode: 'heuristic',
56
+ passed: score >= threshold,
57
+ threshold,
58
+ explanation: `Average max-context overlap across ${sentences.length} answer sentence(s): ${score.toFixed(3)}.`,
59
+ signals,
60
+ llmCalls: 0,
61
+ durationMs: Date.now() - start,
62
+ };
63
+ }
64
+ //# sourceMappingURL=faithfulness.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"faithfulness.js","sourceRoot":"","sources":["../../src/metrics/faithfulness.ts"],"names":[],"mappings":";AAAA;;GAEG;;AASH,8CA6DC;AAnED,sDAAwD;AACxD,gDAA2D;AAE3D,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAC9B,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAE3B,KAAK,UAAU,iBAAiB,CACrC,MAAkB,EAClB,OAA0B;IAE1B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,iBAAiB,CAAC;IACpC,MAAM,UAAU,GAAG,OAAO,EAAE,UAAU,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACjD,MAAM,YAAY,GAAG,OAAO,EAAE,YAAY,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;IAEzD,MAAM,SAAS,GAAG,IAAA,0BAAc,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAChD,MAAM,OAAO,GAAiB,EAAE,CAAC;IAEjC,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3D,OAAO;YACL,QAAQ,EAAE,cAAc;YACxB,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,KAAK;YACb,SAAS;YACT,WAAW,EAAE,oDAAoD;YACjE,OAAO;YACP,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC/B,CAAC;IACJ,CAAC;IAED,MAAM,cAAc,GAAa,EAAE,CAAC;IAEpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;QAC9B,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;YAClC,MAAM,OAAO,GAAG,IAAA,6BAAoB,EAAC,QAAQ,EAAE,GAAG,EAAE,UAAU,EAAE,YAAY,CAAC,CAAC;YAC9E,IAAI,OAAO,GAAG,UAAU;gBAAE,UAAU,GAAG,OAAO,CAAC;QACjD,CAAC;QACD,cAAc,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAEhC,IAAI,UAAU,GAAG,qBAAqB,EAAE,CAAC;YACvC,OAAO,CAAC,IAAI,CAAC;gBACX,EAAE,EAAE,4BAA4B,CAAC,EAAE;gBACnC,QAAQ,EAAE,cAAc;gBACxB,QAAQ,EAAE,SAAS;gBACnB,OAAO,EAAE,oDAAoD,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;gBACtF,QAAQ,EAAE,QAAQ;aACnB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,MAAM,KAAK,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC;IAEhF,OAAO;QACL,QAAQ,EAAE,cAAc;QACxB,KAAK;QACL,IAAI,EAAE,WAAW;QACjB,MAAM,EAAE,KAAK,IAAI,SAAS;QAC1B,SAAS;QACT,WAAW,EAAE,sCAAsC,SAAS,CAAC,MAAM,wBAAwB,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;QAC9G,OAAO;QACP,QAAQ,EAAE,CAAC;QACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;KAC/B,CAA
C;AACJ,CAAC"}
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Hallucination rate metric: does the answer contain claims not supported by any context?
3
+ * Score = 1 - hallucination_rate (higher is better).
4
+ */
5
+ import type { EvalSample, HeuristicOptions, MetricResult } from '../types';
6
+ export declare function scoreHallucinationRate(sample: EvalSample, options?: HeuristicOptions): Promise<MetricResult>;
7
+ //# sourceMappingURL=hallucination-rate.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hallucination-rate.d.ts","sourceRoot":"","sources":["../../src/metrics/hallucination-rate.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAc,MAAM,UAAU,CAAC;AAOvF,wBAAsB,sBAAsB,CAC1C,MAAM,EAAE,UAAU,EAClB,OAAO,CAAC,EAAE,gBAAgB,GACzB,OAAO,CAAC,YAAY,CAAC,CA0DvB"}
@@ -0,0 +1,65 @@
1
+ "use strict";
2
+ /**
3
+ * Hallucination rate metric: does the answer contain claims not supported by any context?
4
+ * Score = 1 - hallucination_rate (higher is better).
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.scoreHallucinationRate = scoreHallucinationRate;
8
+ const sentences_1 = require("../heuristic/sentences");
9
+ const ngrams_1 = require("../heuristic/ngrams");
10
+ const DEFAULT_THRESHOLD = 0.7;
11
+ const DEFAULT_CLAIM_SUPPORT_THRESHOLD = 0.15;
12
+ async function scoreHallucinationRate(sample, options) {
13
+ const start = Date.now();
14
+ const threshold = DEFAULT_THRESHOLD;
15
+ const claimSupportThreshold = options?.claimSupportThreshold ?? DEFAULT_CLAIM_SUPPORT_THRESHOLD;
16
+ const signals = [];
17
+ const sentences = (0, sentences_1.splitSentences)(sample.answer);
18
+ if (sentences.length === 0 || sample.contexts.length === 0) {
19
+ return {
20
+ metricId: 'hallucinationRate',
21
+ score: sample.contexts.length === 0 ? 0 : 1,
22
+ mode: 'heuristic',
23
+ passed: sample.contexts.length === 0 ? false : true,
24
+ threshold,
25
+ explanation: 'No answer sentences or context chunks to evaluate.',
26
+ signals,
27
+ llmCalls: 0,
28
+ durationMs: Date.now() - start,
29
+ };
30
+ }
31
+ let unsupportedCount = 0;
32
+ for (let i = 0; i < sentences.length; i++) {
33
+ const sentence = sentences[i];
34
+ let maxOverlap = 0;
35
+ for (const ctx of sample.contexts) {
36
+ const overlap = (0, ngrams_1.ngramOverlap)(sentence, ctx, 1);
37
+ if (overlap > maxOverlap)
38
+ maxOverlap = overlap;
39
+ }
40
+ if (maxOverlap < claimSupportThreshold) {
41
+ unsupportedCount++;
42
+ signals.push({
43
+ id: `hallucination-unsupported-${i}`,
44
+ metricId: 'hallucinationRate',
45
+ severity: 'critical',
46
+ message: `Answer sentence appears unsupported by any context (max overlap=${maxOverlap.toFixed(3)}).`,
47
+ evidence: sentence,
48
+ });
49
+ }
50
+ }
51
+ const hallucinationRate = unsupportedCount / sentences.length;
52
+ const score = 1 - hallucinationRate;
53
+ return {
54
+ metricId: 'hallucinationRate',
55
+ score,
56
+ mode: 'heuristic',
57
+ passed: score >= threshold,
58
+ threshold,
59
+ explanation: `${unsupportedCount}/${sentences.length} answer sentence(s) appear unsupported (hallucination rate=${hallucinationRate.toFixed(3)}, score=${score.toFixed(3)}).`,
60
+ signals,
61
+ llmCalls: 0,
62
+ durationMs: Date.now() - start,
63
+ };
64
+ }
65
+ //# sourceMappingURL=hallucination-rate.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hallucination-rate.js","sourceRoot":"","sources":["../../src/metrics/hallucination-rate.ts"],"names":[],"mappings":";AAAA;;;GAGG;;AASH,wDA6DC;AAnED,sDAAwD;AACxD,gDAAmD;AAEnD,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAC9B,MAAM,+BAA+B,GAAG,IAAI,CAAC;AAEtC,KAAK,UAAU,sBAAsB,CAC1C,MAAkB,EAClB,OAA0B;IAE1B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,iBAAiB,CAAC;IACpC,MAAM,qBAAqB,GAAG,OAAO,EAAE,qBAAqB,IAAI,+BAA+B,CAAC;IAChG,MAAM,OAAO,GAAiB,EAAE,CAAC;IAEjC,MAAM,SAAS,GAAG,IAAA,0BAAc,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAEhD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3D,OAAO;YACL,QAAQ,EAAE,mBAAmB;YAC7B,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC3C,IAAI,EAAE,WAAW;YACjB,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI;YACnD,SAAS;YACT,WAAW,EAAE,oDAAoD;YACjE,OAAO;YACP,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC/B,CAAC;IACJ,CAAC;IAED,IAAI,gBAAgB,GAAG,CAAC,CAAC;IAEzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;QAC9B,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;YAClC,MAAM,OAAO,GAAG,IAAA,qBAAY,EAAC,QAAQ,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC;YAC/C,IAAI,OAAO,GAAG,UAAU;gBAAE,UAAU,GAAG,OAAO,CAAC;QACjD,CAAC;QAED,IAAI,UAAU,GAAG,qBAAqB,EAAE,CAAC;YACvC,gBAAgB,EAAE,CAAC;YACnB,OAAO,CAAC,IAAI,CAAC;gBACX,EAAE,EAAE,6BAA6B,CAAC,EAAE;gBACpC,QAAQ,EAAE,mBAAmB;gBAC7B,QAAQ,EAAE,UAAU;gBACpB,OAAO,EAAE,mEAAmE,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;gBACrG,QAAQ,EAAE,QAAQ;aACnB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,MAAM,iBAAiB,GAAG,gBAAgB,GAAG,SAAS,CAAC,MAAM,CAAC;IAC9D,MAAM,KAAK,GAAG,CAAC,GAAG,iBAAiB,CAAC;IAEpC,OAAO;QACL,QAAQ,EAAE,mBAAmB;QAC7B,KAAK;QACL,IAAI,EAAE,WAAW;QACjB,MAAM,EAAE,KAAK,IAAI,SAAS;QAC1B,SAAS;QACT,WAAW,EAAE,GAAG,gBAAgB,IAAI,SAAS,CAAC,MAAM,8DAA8D,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;QAC7K,OAAO;QACP,QAAQ,E
AAE,CAAC;QACX,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;KAC/B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Metric dispatcher: route MetricId to the correct scoring function.
3
+ */
4
+ import type { MetricId, EvalSample, HeuristicOptions, MetricResult } from '../types';
5
+ import { scoreFaithfulness } from './faithfulness';
6
+ import { scoreAnswerRelevance } from './answer-relevance';
7
+ import { scoreContextPrecision } from './context-precision';
8
+ import { scoreContextRecall } from './context-recall';
9
+ import { scoreContextRelevance } from './context-relevance';
10
+ import { scoreAnswerCorrectness } from './answer-correctness';
11
+ import { scoreHallucinationRate } from './hallucination-rate';
12
+ export { scoreFaithfulness, scoreAnswerRelevance, scoreContextPrecision, scoreContextRecall, scoreContextRelevance, scoreAnswerCorrectness, scoreHallucinationRate, };
13
+ export declare function computeMetric(metricId: MetricId, sample: EvalSample, options?: HeuristicOptions): Promise<MetricResult>;
14
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/metrics/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,UAAU,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AACrF,OAAO,EAAE,iBAAiB,EAAE,MAAM,gBAAgB,CAAC;AACnD,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAC1D,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAC9D,OAAO,EAAE,sBAAsB,EAAE,MAAM,sBAAsB,CAAC;AAE9D,OAAO,EACL,iBAAiB,EACjB,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,EAClB,qBAAqB,EACrB,sBAAsB,EACtB,sBAAsB,GACvB,CAAC;AAEF,wBAAsB,aAAa,CACjC,QAAQ,EAAE,QAAQ,EAClB,MAAM,EAAE,UAAU,EAClB,OAAO,CAAC,EAAE,gBAAgB,GACzB,OAAO,CAAC,YAAY,CAAC,CAiBvB"}
@@ -0,0 +1,40 @@
1
+ "use strict";
2
+ /**
3
+ * Metric dispatcher: route MetricId to the correct scoring function.
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.scoreHallucinationRate = exports.scoreAnswerCorrectness = exports.scoreContextRelevance = exports.scoreContextRecall = exports.scoreContextPrecision = exports.scoreAnswerRelevance = exports.scoreFaithfulness = void 0;
7
+ exports.computeMetric = computeMetric;
8
+ const faithfulness_1 = require("./faithfulness");
9
+ Object.defineProperty(exports, "scoreFaithfulness", { enumerable: true, get: function () { return faithfulness_1.scoreFaithfulness; } });
10
+ const answer_relevance_1 = require("./answer-relevance");
11
+ Object.defineProperty(exports, "scoreAnswerRelevance", { enumerable: true, get: function () { return answer_relevance_1.scoreAnswerRelevance; } });
12
+ const context_precision_1 = require("./context-precision");
13
+ Object.defineProperty(exports, "scoreContextPrecision", { enumerable: true, get: function () { return context_precision_1.scoreContextPrecision; } });
14
+ const context_recall_1 = require("./context-recall");
15
+ Object.defineProperty(exports, "scoreContextRecall", { enumerable: true, get: function () { return context_recall_1.scoreContextRecall; } });
16
+ const context_relevance_1 = require("./context-relevance");
17
+ Object.defineProperty(exports, "scoreContextRelevance", { enumerable: true, get: function () { return context_relevance_1.scoreContextRelevance; } });
18
+ const answer_correctness_1 = require("./answer-correctness");
19
+ Object.defineProperty(exports, "scoreAnswerCorrectness", { enumerable: true, get: function () { return answer_correctness_1.scoreAnswerCorrectness; } });
20
+ const hallucination_rate_1 = require("./hallucination-rate");
21
+ Object.defineProperty(exports, "scoreHallucinationRate", { enumerable: true, get: function () { return hallucination_rate_1.scoreHallucinationRate; } });
22
+ async function computeMetric(metricId, sample, options) {
23
+ switch (metricId) {
24
+ case 'faithfulness':
25
+ return (0, faithfulness_1.scoreFaithfulness)(sample, options);
26
+ case 'answerRelevance':
27
+ return (0, answer_relevance_1.scoreAnswerRelevance)(sample, options);
28
+ case 'contextPrecision':
29
+ return (0, context_precision_1.scoreContextPrecision)(sample, options);
30
+ case 'contextRecall':
31
+ return (0, context_recall_1.scoreContextRecall)(sample, options);
32
+ case 'contextRelevance':
33
+ return (0, context_relevance_1.scoreContextRelevance)(sample, options);
34
+ case 'answerCorrectness':
35
+ return (0, answer_correctness_1.scoreAnswerCorrectness)(sample, options);
36
+ case 'hallucinationRate':
37
+ return (0, hallucination_rate_1.scoreHallucinationRate)(sample, options);
38
+ }
39
+ }
40
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/metrics/index.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAqBH,sCAqBC;AAvCD,iDAAmD;AASjD,kGATO,gCAAiB,OASP;AARnB,yDAA0D;AASxD,qGATO,uCAAoB,OASP;AARtB,2DAA4D;AAS1D,sGATO,yCAAqB,OASP;AARvB,qDAAsD;AASpD,mGATO,mCAAkB,OASP;AARpB,2DAA4D;AAS1D,sGATO,yCAAqB,OASP;AARvB,6DAA8D;AAS5D,uGATO,2CAAsB,OASP;AARxB,6DAA8D;AAS5D,uGATO,2CAAsB,OASP;AAGjB,KAAK,UAAU,aAAa,CACjC,QAAkB,EAClB,MAAkB,EAClB,OAA0B;IAE1B,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,cAAc;YACjB,OAAO,IAAA,gCAAiB,EAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC5C,KAAK,iBAAiB;YACpB,OAAO,IAAA,uCAAoB,EAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC/C,KAAK,kBAAkB;YACrB,OAAO,IAAA,yCAAqB,EAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAChD,KAAK,eAAe;YAClB,OAAO,IAAA,mCAAkB,EAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC7C,KAAK,kBAAkB;YACrB,OAAO,IAAA,yCAAqB,EAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAChD,KAAK,mBAAmB;YACtB,OAAO,IAAA,2CAAsB,EAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACjD,KAAK,mBAAmB;YACtB,OAAO,IAAA,2CAAsB,EAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACnD,CAAC;AACH,CAAC"}