evalsense 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +678 -0
- package/bin/evalsense.js +3 -0
- package/dist/chunk-5P7LNNO6.js +747 -0
- package/dist/chunk-5P7LNNO6.js.map +1 -0
- package/dist/chunk-BRPM6AB6.js +925 -0
- package/dist/chunk-BRPM6AB6.js.map +1 -0
- package/dist/chunk-HDJID3GC.cjs +779 -0
- package/dist/chunk-HDJID3GC.cjs.map +1 -0
- package/dist/chunk-Y23VHTD3.cjs +942 -0
- package/dist/chunk-Y23VHTD3.cjs.map +1 -0
- package/dist/cli.cjs +65 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +63 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +1126 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +604 -0
- package/dist/index.d.ts +604 -0
- package/dist/index.js +1043 -0
- package/dist/index.js.map +1 -0
- package/dist/metrics/index.cjs +275 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.cts +299 -0
- package/dist/metrics/index.d.ts +299 -0
- package/dist/metrics/index.js +191 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/opinionated/index.cjs +24 -0
- package/dist/metrics/opinionated/index.cjs.map +1 -0
- package/dist/metrics/opinionated/index.d.cts +163 -0
- package/dist/metrics/opinionated/index.d.ts +163 -0
- package/dist/metrics/opinionated/index.js +3 -0
- package/dist/metrics/opinionated/index.js.map +1 -0
- package/dist/types-C71p0wzM.d.cts +265 -0
- package/dist/types-C71p0wzM.d.ts +265 -0
- package/package.json +91 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
export { batchItems, createJSONSchema, createLLMError, extractScore, faithfulness, fillPrompt, getLLMClient, hallucination, parseJSONResponse, relevance, requireLLMClient, resetLLMClient, setLLMClient, toxicity, validateResponse, withTimeout } from '../chunk-BRPM6AB6.js';
|
|
2
|
+
|
|
3
|
+
// src/metrics/custom/index.ts
|
|
4
|
+
var customMetrics = /* @__PURE__ */ new Map();

/**
 * Registers a custom metric under a unique name.
 * Throws if the name is already taken so an existing metric is never
 * silently replaced.
 */
function registerMetric(name, fn) {
  const alreadyTaken = customMetrics.has(name);
  if (alreadyTaken) {
    throw new Error(`Metric "${name}" is already registered`);
  }
  customMetrics.set(name, fn);
}
|
|
11
|
+
/**
 * Looks up a previously registered custom metric.
 * Returns undefined when no metric with that name exists.
 */
function getMetric(name) {
  const registered = customMetrics.get(name);
  return registered;
}
|
|
14
|
+
/**
 * Executes a registered metric with the given config.
 * Throws an Error when no metric is registered under `name`.
 */
async function runMetric(name, config) {
  const metricFn = customMetrics.get(name);
  if (!metricFn) {
    throw new Error(`Metric "${name}" is not registered`);
  }
  return metricFn(config);
}
|
|
21
|
+
/**
 * Returns the names of all currently registered custom metrics.
 */
function listMetrics() {
  return [...customMetrics.keys()];
}
|
|
24
|
+
/**
 * Removes a metric from the registry.
 * Returns true when a metric was actually removed (mainly for tests).
 */
function unregisterMetric(name) {
  const removed = customMetrics.delete(name);
  return removed;
}
|
|
27
|
+
/**
 * Empties the custom-metric registry (intended for test teardown).
 */
function clearMetrics() {
  customMetrics.clear();
}
|
|
30
|
+
/**
 * Builds a metric that scores each output by whether ANY of the given
 * regexes matches it. A match scores `matchScore` with label "detected";
 * otherwise `noMatchScore` with label "not_detected".
 */
function createPatternMetric(name, patterns, options = {}) {
  const { matchScore = 1, noMatchScore = 0 } = options;
  return async (config) => {
    const results = [];
    for (const candidate of config.outputs) {
      const matched = patterns.some((pattern) => pattern.test(candidate.output));
      results.push({
        id: candidate.id,
        metric: name,
        score: matched ? matchScore : noMatchScore,
        label: matched ? "detected" : "not_detected"
      });
    }
    return results;
  };
}
|
|
44
|
+
/**
 * Builds a metric that scores each output by the fraction of `keywords`
 * it contains (substring match, case-insensitive unless `caseSensitive`).
 * The label is "detected" when the fraction meets `threshold`.
 */
function createKeywordMetric(name, keywords, options = {}) {
  const { caseSensitive = false, threshold = 0.5 } = options;
  const normalizedKeywords = caseSensitive ? keywords : keywords.map((k) => k.toLowerCase());
  return async (config) => {
    return config.outputs.map((o) => {
      const text = caseSensitive ? o.output : o.output.toLowerCase();
      const matches = normalizedKeywords.filter((k) => text.includes(k));
      // Guard the empty-keyword case: 0/0 would yield NaN, which then
      // produces a meaningless score and always labels "not_detected".
      const score = keywords.length === 0 ? 0 : matches.length / keywords.length;
      return {
        id: o.id,
        metric: name,
        score,
        label: score >= threshold ? "detected" : "not_detected"
      };
    });
  };
}
|
|
61
|
+
|
|
62
|
+
// src/metrics/utils/index.ts
|
|
63
|
+
/**
 * Maps `score` from the range [min, max] onto [0, 1], clamping values
 * that fall outside. A degenerate range (min === max) yields 0.
 */
function normalizeScore(score, min = 0, max = 1) {
  const span = max - min;
  if (span === 0) {
    return 0;
  }
  const scaled = (score - min) / span;
  return Math.min(1, Math.max(0, scaled));
}
|
|
68
|
+
/**
 * Converts a numeric score to a label using min-thresholds.
 * Thresholds are checked from highest `min` downward; the first one the
 * score meets wins. If the score is below every threshold, the lowest
 * threshold's label is returned ("unknown" when the list is empty).
 */
function scoreToLabel(score, thresholds) {
  // Sort descending by min so the strictest threshold is tried first.
  const sorted = [...thresholds].sort((a, b) => b.min - a.min);
  for (const { label, min } of sorted) {
    if (score >= min) {
      return label;
    }
  }
  // Fix: fall back to the lowest threshold of the SORTED list. The original
  // indexed the caller's (possibly unsorted) array, returning an arbitrary
  // label when the score was below every threshold.
  return sorted[sorted.length - 1]?.label ?? "unknown";
}
|
|
77
|
+
/**
 * Wraps a raw score into a metric-output record, normalizing it to [0, 1].
 * With explicit thresholds the label comes from scoreToLabel; otherwise a
 * simple 0.5 cutoff yields "high" or "low".
 */
function createMetricOutput(id, metric, score, labelThresholds) {
  const normalizedScore = normalizeScore(score);
  let label;
  if (labelThresholds) {
    label = scoreToLabel(normalizedScore, labelThresholds);
  } else {
    label = normalizedScore >= 0.5 ? "high" : "low";
  }
  return { id, metric, score: normalizedScore, label };
}
|
|
87
|
+
// Default thresholds for binary (true/false) metrics: >= 0.5 maps to "true".
var BINARY_THRESHOLDS = [
  { label: "true", min: 0.5 },
  { label: "false", min: 0 }
];
// Default thresholds for severity-graded metrics:
// high >= 0.7, medium >= 0.4, otherwise low.
var SEVERITY_THRESHOLDS = [
  { label: "high", min: 0.7 },
  { label: "medium", min: 0.4 },
  { label: "low", min: 0 }
];
|
|
96
|
+
/**
 * Splits `items` into consecutive chunks of at most `size` elements.
 * The final chunk may be shorter when the length is not a multiple of size.
 */
function batch(items, size) {
  const chunks = [];
  let start = 0;
  while (start < items.length) {
    chunks.push(items.slice(start, start + size));
    start += size;
  }
  return chunks;
}
|
|
103
|
+
/**
 * Resolves after roughly `ms` milliseconds.
 */
function delay(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
106
|
+
|
|
107
|
+
// src/metrics/llm/adapters/mock.ts
|
|
108
|
+
function createMockLLMClient(config = {}) {
|
|
109
|
+
const {
|
|
110
|
+
response,
|
|
111
|
+
responses,
|
|
112
|
+
delay: delay2 = 0,
|
|
113
|
+
shouldError = false,
|
|
114
|
+
errorMessage = "Mock LLM error",
|
|
115
|
+
onPrompt
|
|
116
|
+
} = config;
|
|
117
|
+
let callCount = 0;
|
|
118
|
+
const getResponse = () => {
|
|
119
|
+
if (responses && responses.length > 0) {
|
|
120
|
+
const resp = responses[Math.min(callCount, responses.length - 1)];
|
|
121
|
+
if (!resp) {
|
|
122
|
+
return JSON.stringify({ score: 0.5, reasoning: "Mock response" });
|
|
123
|
+
}
|
|
124
|
+
callCount++;
|
|
125
|
+
return resp;
|
|
126
|
+
}
|
|
127
|
+
if (response !== void 0) {
|
|
128
|
+
return response;
|
|
129
|
+
}
|
|
130
|
+
return JSON.stringify({ score: 0.5, reasoning: "Mock response" });
|
|
131
|
+
};
|
|
132
|
+
return {
|
|
133
|
+
async complete(prompt) {
|
|
134
|
+
if (onPrompt) {
|
|
135
|
+
onPrompt(prompt);
|
|
136
|
+
}
|
|
137
|
+
if (delay2 > 0) {
|
|
138
|
+
await new Promise((resolve) => setTimeout(resolve, delay2));
|
|
139
|
+
}
|
|
140
|
+
if (shouldError) {
|
|
141
|
+
throw new Error(errorMessage);
|
|
142
|
+
}
|
|
143
|
+
const resp = getResponse();
|
|
144
|
+
return typeof resp === "string" ? resp : JSON.stringify(resp);
|
|
145
|
+
},
|
|
146
|
+
async completeStructured(prompt, _schema) {
|
|
147
|
+
if (onPrompt) {
|
|
148
|
+
onPrompt(prompt);
|
|
149
|
+
}
|
|
150
|
+
if (delay2 > 0) {
|
|
151
|
+
await new Promise((resolve) => setTimeout(resolve, delay2));
|
|
152
|
+
}
|
|
153
|
+
if (shouldError) {
|
|
154
|
+
throw new Error(errorMessage);
|
|
155
|
+
}
|
|
156
|
+
const resp = getResponse();
|
|
157
|
+
if (typeof resp === "string") {
|
|
158
|
+
try {
|
|
159
|
+
return JSON.parse(resp);
|
|
160
|
+
} catch {
|
|
161
|
+
throw new Error(`Mock response is not valid JSON: ${resp}`);
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
return resp;
|
|
165
|
+
}
|
|
166
|
+
};
|
|
167
|
+
}
|
|
168
|
+
/**
 * Mock client that replays `responses` in order; once exhausted, the last
 * response repeats. Accepts an optional per-call delay.
 */
function createSequentialMockClient(responses, options = {}) {
  const { delay: delayMs } = options;
  return createMockLLMClient({ responses, delay: delayMs });
}
|
|
174
|
+
/**
 * Mock client whose every call fails with `errorMessage`.
 * Useful for exercising error-handling paths.
 */
function createErrorMockClient(errorMessage = "Mock LLM error") {
  return createMockLLMClient({ errorMessage, shouldError: true });
}
|
|
180
|
+
/**
 * Mock client that records every prompt it receives.
 * Returns the client together with the shared `prompts` array, which is
 * appended to on each call.
 */
function createSpyMockClient(response) {
  const prompts = [];
  const record = (prompt) => {
    prompts.push(prompt);
  };
  const client = createMockLLMClient({ response, onPrompt: record });
  return { client, prompts };
}
|
|
188
|
+
|
|
189
|
+
export { BINARY_THRESHOLDS, SEVERITY_THRESHOLDS, batch, clearMetrics, createErrorMockClient, createKeywordMetric, createMetricOutput, createMockLLMClient, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, getMetric, listMetrics, normalizeScore, registerMetric, runMetric, scoreToLabel, unregisterMetric };
|
|
190
|
+
//# sourceMappingURL=index.js.map
|
|
191
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/metrics/custom/index.ts","../../src/metrics/utils/index.ts","../../src/metrics/llm/adapters/mock.ts"],"names":["delay"],"mappings":";;;AASA,IAAM,aAAA,uBAAoB,GAAA,EAAsB;AAiBzC,SAAS,cAAA,CAAe,MAAc,EAAA,EAAoB;AAC/D,EAAA,IAAI,aAAA,CAAc,GAAA,CAAI,IAAI,CAAA,EAAG;AAC3B,IAAA,MAAM,IAAI,KAAA,CAAM,CAAA,QAAA,EAAW,IAAI,CAAA,uBAAA,CAAyB,CAAA;AAAA,EAC1D;AACA,EAAA,aAAA,CAAc,GAAA,CAAI,MAAM,EAAE,CAAA;AAC5B;AAKO,SAAS,UAAU,IAAA,EAAoC;AAC5D,EAAA,OAAO,aAAA,CAAc,IAAI,IAAI,CAAA;AAC/B;AAKA,eAAsB,SAAA,CAAU,MAAc,MAAA,EAA+C;AAC3F,EAAA,MAAM,EAAA,GAAK,aAAA,CAAc,GAAA,CAAI,IAAI,CAAA;AACjC,EAAA,IAAI,CAAC,EAAA,EAAI;AACP,IAAA,MAAM,IAAI,KAAA,CAAM,CAAA,QAAA,EAAW,IAAI,CAAA,mBAAA,CAAqB,CAAA;AAAA,EACtD;AACA,EAAA,OAAO,GAAG,MAAM,CAAA;AAClB;AAKO,SAAS,WAAA,GAAwB;AACtC,EAAA,OAAO,KAAA,CAAM,IAAA,CAAK,aAAA,CAAc,IAAA,EAAM,CAAA;AACxC;AAKO,SAAS,iBAAiB,IAAA,EAAuB;AACtD,EAAA,OAAO,aAAA,CAAc,OAAO,IAAI,CAAA;AAClC;AAKO,SAAS,YAAA,GAAqB;AACnC,EAAA,aAAA,CAAc,KAAA,EAAM;AACtB;AAKO,SAAS,mBAAA,CACd,IAAA,EACA,QAAA,EACA,OAAA,GAA0D,EAAC,EACjD;AACV,EAAA,MAAM,EAAE,UAAA,GAAa,CAAA,EAAG,YAAA,GAAe,GAAE,GAAI,OAAA;AAE7C,EAAA,OAAO,OAAO,MAAA,KAAkD;AAC9D,IAAA,OAAO,MAAA,CAAO,OAAA,CAAQ,GAAA,CAAI,CAAC,CAAA,KAAM;AAC/B,MAAA,MAAM,QAAA,GAAW,SAAS,IAAA,CAAK,CAAC,MAAM,CAAA,CAAE,IAAA,CAAK,CAAA,CAAE,MAAM,CAAC,CAAA;AACtD,MAAA,OAAO;AAAA,QACL,IAAI,CAAA,CAAE,EAAA;AAAA,QACN,MAAA,EAAQ,IAAA;AAAA,QACR,KAAA,EAAO,WAAW,UAAA,GAAa,YAAA;AAAA,QAC/B,KAAA,EAAO,WAAW,UAAA,GAAa;AAAA,OACjC;AAAA,IACF,CAAC,CAAA;AAAA,EACH,CAAA;AACF;AAKO,SAAS,mBAAA,CACd,IAAA,EACA,QAAA,EACA,OAAA,GAA2D,EAAC,EAClD;AACV,EAAA,MAAM,EAAE,aAAA,GAAgB,KAAA,EAAO,SAAA,GAAY,KAAI,GAAI,OAAA;AAEnD,EAAA,MAAM,kBAAA,GAAqB,gBACvB,QAAA,GACA,QAAA,CAAS,IAAI,CAAC,CAAA,KAAM,CAAA,CAAE,WAAA,EAAa,CAAA;AAEvC,EAAA,OAAO,OAAO,MAAA,KAAkD;AAC9D,IAAA,OAAO,MAAA,CAAO,OAAA,CAAQ,GAAA,CAAI,CAAC,CAAA,KAAM;AAC/B,MAAA,MAAM,OAAO,aAAA,GAAgB,CAAA,CAAE,MAAA,GAAS,CAAA,CAAE,OAAO,WAAA,EAAY;AAC7D,MAAA,MAAM,OAAA,GAAU,mBAAmB,MAAA,CAAO,CAAC,MAAM,IAAA,CAAK,QAAA,CAAS,CAAC,CAAC,CAAA;AACjE,MAAA,MAAM,KAAA,GAAQ,OAAA,CAA
Q,MAAA,GAAS,QAAA,CAAS,MAAA;AAExC,MAAA,OAAO;AAAA,QACL,IAAI,CAAA,CAAE,EAAA;AAAA,QACN,MAAA,EAAQ,IAAA;AAAA,QACR,KAAA;AAAA,QACA,KAAA,EAAO,KAAA,IAAS,SAAA,GAAY,UAAA,GAAa;AAAA,OAC3C;AAAA,IACF,CAAC,CAAA;AAAA,EACH,CAAA;AACF;;;AClHO,SAAS,cAAA,CAAe,KAAA,EAAe,GAAA,GAAM,CAAA,EAAG,MAAM,CAAA,EAAW;AACtE,EAAA,MAAM,QAAQ,GAAA,GAAM,GAAA;AACpB,EAAA,IAAI,KAAA,KAAU,GAAG,OAAO,CAAA;AACxB,EAAA,OAAO,IAAA,CAAK,IAAI,CAAA,EAAG,IAAA,CAAK,IAAI,CAAA,EAAA,CAAI,KAAA,GAAQ,GAAA,IAAO,KAAK,CAAC,CAAA;AACvD;AAKO,SAAS,YAAA,CACd,OACA,UAAA,EACQ;AAER,EAAA,MAAM,MAAA,GAAS,CAAC,GAAG,UAAU,CAAA,CAAE,IAAA,CAAK,CAAC,CAAA,EAAG,CAAA,KAAM,CAAA,CAAE,GAAA,GAAM,CAAA,CAAE,GAAG,CAAA;AAE3D,EAAA,KAAA,MAAW,EAAE,KAAA,EAAO,GAAA,EAAI,IAAK,MAAA,EAAQ;AACnC,IAAA,IAAI,SAAS,GAAA,EAAK;AAChB,MAAA,OAAO,KAAA;AAAA,IACT;AAAA,EACF;AAEA,EAAA,OAAO,UAAA,CAAW,UAAA,CAAW,MAAA,GAAS,CAAC,GAAG,KAAA,IAAS,SAAA;AACrD;AAKO,SAAS,kBAAA,CACd,EAAA,EACA,MAAA,EACA,KAAA,EACA,eAAA,EACc;AACd,EAAA,MAAM,eAAA,GAAkB,eAAe,KAAK,CAAA;AAC5C,EAAA,MAAM,KAAA,GAAQ,kBACV,YAAA,CAAa,eAAA,EAAiB,eAAe,CAAA,GAC7C,eAAA,IAAmB,MACjB,MAAA,GACA,KAAA;AAEN,EAAA,OAAO;AAAA,IACL,EAAA;AAAA,IACA,MAAA;AAAA,IACA,KAAA,EAAO,eAAA;AAAA,IACP;AAAA,GACF;AACF;AAKO,IAAM,iBAAA,GAAoB;AAAA,EAC/B,EAAE,KAAA,EAAO,MAAA,EAAQ,GAAA,EAAK,GAAA,EAAI;AAAA,EAC1B,EAAE,KAAA,EAAO,OAAA,EAAS,GAAA,EAAK,CAAA;AACzB;AAKO,IAAM,mBAAA,GAAsB;AAAA,EACjC,EAAE,KAAA,EAAO,MAAA,EAAQ,GAAA,EAAK,GAAA,EAAI;AAAA,EAC1B,EAAE,KAAA,EAAO,QAAA,EAAU,GAAA,EAAK,GAAA,EAAI;AAAA,EAC5B,EAAE,KAAA,EAAO,KAAA,EAAO,GAAA,EAAK,CAAA;AACvB;AAKO,SAAS,KAAA,CAAS,OAAY,IAAA,EAAqB;AACxD,EAAA,MAAM,UAAiB,EAAC;AACxB,EAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,KAAA,CAAM,MAAA,EAAQ,KAAK,IAAA,EAAM;AAC3C,IAAA,OAAA,CAAQ,KAAK,KAAA,CAAM,KAAA,CAAM,CAAA,EAAG,CAAA,GAAI,IAAI,CAAC,CAAA;AAAA,EACvC;AACA,EAAA,OAAO,OAAA;AACT;AAKO,SAAS,MAAM,EAAA,EAA2B;AAC/C,EAAA,OAAO,IAAI,OAAA,CAAQ,CAAC,YAAY,UAAA,CAAW,OAAA,EAAS,EAAE,CAAC,CAAA;AACzD;;;AC/CO,SAAS,mBAAA,CAAoB,MAAA,GAAwB,EAAC,EAAc;AACzE,EAAA,MAAM;AAAA,IACJ,QAAA;AAAA,IACA,SAAA;AAAA,IACA,OAAAA,MAAAA,GAAQ,CAAA;AAAA,IACR,WAAA,GAAc,KAAA;AAAA,IACd,YAAA,GAA
e,gBAAA;AAAA,IACf;AAAA,GACF,GAAI,MAAA;AAEJ,EAAA,IAAI,SAAA,GAAY,CAAA;AAEhB,EAAA,MAAM,cAAc,MAAwC;AAC1D,IAAA,IAAI,SAAA,IAAa,SAAA,CAAU,MAAA,GAAS,CAAA,EAAG;AACrC,MAAA,MAAM,IAAA,GAAO,UAAU,IAAA,CAAK,GAAA,CAAI,WAAW,SAAA,CAAU,MAAA,GAAS,CAAC,CAAC,CAAA;AAChE,MAAA,IAAI,CAAC,IAAA,EAAM;AACT,QAAA,OAAO,KAAK,SAAA,CAAU,EAAE,OAAO,GAAA,EAAK,SAAA,EAAW,iBAAiB,CAAA;AAAA,MAClE;AACA,MAAA,SAAA,EAAA;AACA,MAAA,OAAO,IAAA;AAAA,IACT;AAEA,IAAA,IAAI,aAAa,MAAA,EAAW;AAC1B,MAAA,OAAO,QAAA;AAAA,IACT;AAGA,IAAA,OAAO,KAAK,SAAA,CAAU,EAAE,OAAO,GAAA,EAAK,SAAA,EAAW,iBAAiB,CAAA;AAAA,EAClE,CAAA;AAEA,EAAA,OAAO;AAAA,IACL,MAAM,SAAS,MAAA,EAAiC;AAE9C,MAAA,IAAI,QAAA,EAAU;AACZ,QAAA,QAAA,CAAS,MAAM,CAAA;AAAA,MACjB;AAGA,MAAA,IAAIA,SAAQ,CAAA,EAAG;AACb,QAAA,MAAM,IAAI,OAAA,CAAQ,CAAC,YAAY,UAAA,CAAW,OAAA,EAASA,MAAK,CAAC,CAAA;AAAA,MAC3D;AAGA,MAAA,IAAI,WAAA,EAAa;AACf,QAAA,MAAM,IAAI,MAAM,YAAY,CAAA;AAAA,MAC9B;AAGA,MAAA,MAAM,OAAO,WAAA,EAAY;AACzB,MAAA,OAAO,OAAO,IAAA,KAAS,QAAA,GAAW,IAAA,GAAO,IAAA,CAAK,UAAU,IAAI,CAAA;AAAA,IAC9D,CAAA;AAAA,IAEA,MAAM,kBAAA,CAAsB,MAAA,EAAgB,OAAA,EAAiC;AAE3E,MAAA,IAAI,QAAA,EAAU;AACZ,QAAA,QAAA,CAAS,MAAM,CAAA;AAAA,MACjB;AAGA,MAAA,IAAIA,SAAQ,CAAA,EAAG;AACb,QAAA,MAAM,IAAI,OAAA,CAAQ,CAAC,YAAY,UAAA,CAAW,OAAA,EAASA,MAAK,CAAC,CAAA;AAAA,MAC3D;AAGA,MAAA,IAAI,WAAA,EAAa;AACf,QAAA,MAAM,IAAI,MAAM,YAAY,CAAA;AAAA,MAC9B;AAGA,MAAA,MAAM,OAAO,WAAA,EAAY;AACzB,MAAA,IAAI,OAAO,SAAS,QAAA,EAAU;AAC5B,QAAA,IAAI;AACF,UAAA,OAAO,IAAA,CAAK,MAAM,IAAI,CAAA;AAAA,QACxB,CAAA,CAAA,MAAQ;AACN,UAAA,MAAM,IAAI,KAAA,CAAM,CAAA,iCAAA,EAAoC,IAAI,CAAA,CAAE,CAAA;AAAA,QAC5D;AAAA,MACF;AAEA,MAAA,OAAO,IAAA;AAAA,IACT;AAAA,GACF;AACF;AAeO,SAAS,0BAAA,CACd,SAAA,EACA,OAAA,GAA8B,EAAC,EACpB;AACX,EAAA,OAAO,mBAAA,CAAoB;AAAA,IACzB,SAAA;AAAA,IACA,OAAO,OAAA,CAAQ;AAAA,GAChB,CAAA;AACH;AAOO,SAAS,qBAAA,CAAsB,eAAe,gBAAA,EAA6B;AAChF,EAAA,OAAO,mBAAA,CAAoB;AAAA,IACzB,WAAA,EAAa,IAAA;AAAA,IACb;AAAA,GACD,CAAA;AACH;AAcO,SAAS,oBACd,QAAA,EAC0C;AAC1C,EAAA,MAAM,UAAoB,EAAC;AAE3B,EAAA,MAAM,SAAS,mBAAA,CAAoB;AAAA,IACjC,QAAA;AAAA,IACA,QAAA,EAAU,CAAC,MAAA,KAAW,OAAA,CAAQ,KAAK,MAAM;AAAA,GAC1C,C
AAA;AAED,EAAA,OAAO,EAAE,QAAQ,OAAA,EAAQ;AAC3B","file":"index.js","sourcesContent":["/**\n * Custom metric registration\n */\n\nimport type { MetricFn, MetricOutput, MetricConfig } from \"../../core/types.js\";\n\n/**\n * Registry of custom metrics\n */\nconst customMetrics = new Map<string, MetricFn>();\n\n/**\n * Registers a custom metric\n *\n * @example\n * ```ts\n * registerMetric(\"custom-relevance\", async ({ outputs, query }) => {\n * // Custom evaluation logic\n * return outputs.map(o => ({\n * id: o.id,\n * metric: \"custom-relevance\",\n * score: evaluateRelevance(o.output, query),\n * }));\n * });\n * ```\n */\nexport function registerMetric(name: string, fn: MetricFn): void {\n if (customMetrics.has(name)) {\n throw new Error(`Metric \"${name}\" is already registered`);\n }\n customMetrics.set(name, fn);\n}\n\n/**\n * Gets a registered custom metric\n */\nexport function getMetric(name: string): MetricFn | undefined {\n return customMetrics.get(name);\n}\n\n/**\n * Runs a registered metric\n */\nexport async function runMetric(name: string, config: MetricConfig): Promise<MetricOutput[]> {\n const fn = customMetrics.get(name);\n if (!fn) {\n throw new Error(`Metric \"${name}\" is not registered`);\n }\n return fn(config);\n}\n\n/**\n * Lists all registered custom metrics\n */\nexport function listMetrics(): string[] {\n return Array.from(customMetrics.keys());\n}\n\n/**\n * Unregisters a metric (mainly for testing)\n */\nexport function unregisterMetric(name: string): boolean {\n return customMetrics.delete(name);\n}\n\n/**\n * Clears all registered metrics (mainly for testing)\n */\nexport function clearMetrics(): void {\n customMetrics.clear();\n}\n\n/**\n * Creates a simple string-matching metric\n */\nexport function createPatternMetric(\n name: string,\n patterns: RegExp[],\n options: { matchScore?: number; noMatchScore?: number } = {}\n): MetricFn {\n const { matchScore = 1, noMatchScore = 0 } = options;\n\n return async (config: MetricConfig): 
Promise<MetricOutput[]> => {\n return config.outputs.map((o) => {\n const hasMatch = patterns.some((p) => p.test(o.output));\n return {\n id: o.id,\n metric: name,\n score: hasMatch ? matchScore : noMatchScore,\n label: hasMatch ? \"detected\" : \"not_detected\",\n };\n });\n };\n}\n\n/**\n * Creates a keyword-based metric\n */\nexport function createKeywordMetric(\n name: string,\n keywords: string[],\n options: { caseSensitive?: boolean; threshold?: number } = {}\n): MetricFn {\n const { caseSensitive = false, threshold = 0.5 } = options;\n\n const normalizedKeywords = caseSensitive\n ? keywords\n : keywords.map((k) => k.toLowerCase());\n\n return async (config: MetricConfig): Promise<MetricOutput[]> => {\n return config.outputs.map((o) => {\n const text = caseSensitive ? o.output : o.output.toLowerCase();\n const matches = normalizedKeywords.filter((k) => text.includes(k));\n const score = matches.length / keywords.length;\n\n return {\n id: o.id,\n metric: name,\n score,\n label: score >= threshold ? \"detected\" : \"not_detected\",\n };\n });\n };\n}\n","/**\n * Metric utilities\n */\n\nimport type { MetricOutput } from \"../../core/types.js\";\n\n/**\n * Normalizes a score to 0-1 range\n */\nexport function normalizeScore(score: number, min = 0, max = 1): number {\n const range = max - min;\n if (range === 0) return 0;\n return Math.max(0, Math.min(1, (score - min) / range));\n}\n\n/**\n * Converts a numeric score to a label based on thresholds\n */\nexport function scoreToLabel(\n score: number,\n thresholds: { label: string; min: number }[]\n): string {\n // Sort thresholds by min descending\n const sorted = [...thresholds].sort((a, b) => b.min - a.min);\n\n for (const { label, min } of sorted) {\n if (score >= min) {\n return label;\n }\n }\n\n return thresholds[thresholds.length - 1]?.label ?? 
\"unknown\";\n}\n\n/**\n * Creates a metric output from a score\n */\nexport function createMetricOutput(\n id: string,\n metric: string,\n score: number,\n labelThresholds?: { label: string; min: number }[]\n): MetricOutput {\n const normalizedScore = normalizeScore(score);\n const label = labelThresholds\n ? scoreToLabel(normalizedScore, labelThresholds)\n : normalizedScore >= 0.5\n ? \"high\"\n : \"low\";\n\n return {\n id,\n metric,\n score: normalizedScore,\n label,\n };\n}\n\n/**\n * Default thresholds for binary metrics\n */\nexport const BINARY_THRESHOLDS = [\n { label: \"true\", min: 0.5 },\n { label: \"false\", min: 0 },\n];\n\n/**\n * Default thresholds for severity metrics\n */\nexport const SEVERITY_THRESHOLDS = [\n { label: \"high\", min: 0.7 },\n { label: \"medium\", min: 0.4 },\n { label: \"low\", min: 0 },\n];\n\n/**\n * Batches items for parallel processing\n */\nexport function batch<T>(items: T[], size: number): T[][] {\n const batches: T[][] = [];\n for (let i = 0; i < items.length; i += size) {\n batches.push(items.slice(i, i + size));\n }\n return batches;\n}\n\n/**\n * Delays execution\n */\nexport function delay(ms: number): Promise<void> {\n return new Promise((resolve) => setTimeout(resolve, ms));\n}\n","/**\n * Mock LLM client for testing\n *\n * Provides a configurable mock implementation of LLMClient for unit tests.\n */\n\nimport type { LLMClient, JSONSchema } from \"../../../core/types.js\";\n\n/**\n * Configuration for mock LLM client\n */\nexport interface MockLLMConfig {\n /** Fixed response to return (can be string or object for JSON mode) */\n response?: string | Record<string, unknown>;\n\n /** Multiple responses for sequential calls */\n responses?: Array<string | Record<string, unknown>>;\n\n /** Delay in milliseconds before responding */\n delay?: number;\n\n /** Whether to throw an error */\n shouldError?: boolean;\n\n /** Error message to throw */\n errorMessage?: string;\n\n /** Function to validate prompts */\n 
onPrompt?: (prompt: string) => void;\n}\n\n/**\n * Creates a mock LLM client for testing\n *\n * @example\n * ```ts\n * const mock = createMockLLMClient({\n * response: JSON.stringify({ score: 0.8, reasoning: \"test\" }),\n * delay: 100\n * });\n *\n * setLLMClient(mock);\n * ```\n */\nexport function createMockLLMClient(config: MockLLMConfig = {}): LLMClient {\n const {\n response,\n responses,\n delay = 0,\n shouldError = false,\n errorMessage = \"Mock LLM error\",\n onPrompt,\n } = config;\n\n let callCount = 0;\n\n const getResponse = (): string | Record<string, unknown> => {\n if (responses && responses.length > 0) {\n const resp = responses[Math.min(callCount, responses.length - 1)];\n if (!resp) {\n return JSON.stringify({ score: 0.5, reasoning: \"Mock response\" });\n }\n callCount++;\n return resp;\n }\n\n if (response !== undefined) {\n return response;\n }\n\n // Default response\n return JSON.stringify({ score: 0.5, reasoning: \"Mock response\" });\n };\n\n return {\n async complete(prompt: string): Promise<string> {\n // Call validation hook if provided\n if (onPrompt) {\n onPrompt(prompt);\n }\n\n // Simulate delay\n if (delay > 0) {\n await new Promise((resolve) => setTimeout(resolve, delay));\n }\n\n // Simulate error\n if (shouldError) {\n throw new Error(errorMessage);\n }\n\n // Return response\n const resp = getResponse();\n return typeof resp === \"string\" ? 
resp : JSON.stringify(resp);\n },\n\n async completeStructured<T>(prompt: string, _schema: JSONSchema): Promise<T> {\n // Call validation hook if provided\n if (onPrompt) {\n onPrompt(prompt);\n }\n\n // Simulate delay\n if (delay > 0) {\n await new Promise((resolve) => setTimeout(resolve, delay));\n }\n\n // Simulate error\n if (shouldError) {\n throw new Error(errorMessage);\n }\n\n // Return response as object\n const resp = getResponse();\n if (typeof resp === \"string\") {\n try {\n return JSON.parse(resp) as T;\n } catch {\n throw new Error(`Mock response is not valid JSON: ${resp}`);\n }\n }\n\n return resp as T;\n },\n };\n}\n\n/**\n * Creates a mock client that returns sequential responses\n *\n * Useful for testing multiple calls with different responses.\n *\n * @example\n * ```ts\n * const mock = createSequentialMockClient([\n * { score: 0.2, reasoning: \"First call\" },\n * { score: 0.8, reasoning: \"Second call\" }\n * ]);\n * ```\n */\nexport function createSequentialMockClient(\n responses: Array<string | Record<string, unknown>>,\n options: { delay?: number } = {}\n): LLMClient {\n return createMockLLMClient({\n responses,\n delay: options.delay,\n });\n}\n\n/**\n * Creates a mock client that always errors\n *\n * Useful for testing error handling.\n */\nexport function createErrorMockClient(errorMessage = \"Mock LLM error\"): LLMClient {\n return createMockLLMClient({\n shouldError: true,\n errorMessage,\n });\n}\n\n/**\n * Creates a spy mock client that records all prompts\n *\n * Useful for testing what prompts are being sent to the LLM.\n *\n * @example\n * ```ts\n * const { client, prompts } = createSpyMockClient({ score: 0.5 });\n * await metric({ outputs, context, llmClient: client });\n * console.log(prompts); // See all prompts that were sent\n * ```\n */\nexport function createSpyMockClient(\n response: string | Record<string, unknown>\n): { client: LLMClient; prompts: string[] } {\n const prompts: string[] = [];\n\n const client = 
createMockLLMClient({\n response,\n onPrompt: (prompt) => prompts.push(prompt),\n });\n\n return { client, prompts };\n}\n"]}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var chunkY23VHTD3_cjs = require('../../chunk-Y23VHTD3.cjs');
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// CJS re-export shim: forwards the four LLM-based metrics from the shared
// chunk. Accessor properties defer the lookup to call time instead of
// copying the value at require time.
Object.defineProperty(exports, "faithfulness", {
  enumerable: true,
  get: function () { return chunkY23VHTD3_cjs.faithfulness; }
});
Object.defineProperty(exports, "hallucination", {
  enumerable: true,
  get: function () { return chunkY23VHTD3_cjs.hallucination; }
});
Object.defineProperty(exports, "relevance", {
  enumerable: true,
  get: function () { return chunkY23VHTD3_cjs.relevance; }
});
Object.defineProperty(exports, "toxicity", {
  enumerable: true,
  get: function () { return chunkY23VHTD3_cjs.toxicity; }
});
|
|
23
|
+
//# sourceMappingURL=index.cjs.map
|
|
24
|
+
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"index.cjs"}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import { a as MetricConfig, b as MetricOutput } from '../../types-C71p0wzM.cjs';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Hallucination detection metric (LLM-based)
|
|
5
|
+
*
|
|
6
|
+
* Detects statements in the output that are not supported by the provided context.
|
|
7
|
+
* Uses LLM evaluation for accurate hallucination detection.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
 * Configuration for hallucination metric
 */
interface HallucinationConfig extends MetricConfig {
    /** Model outputs to evaluate */
    outputs: Array<{
        id: string;
        output: string;
    }>;
    /** Context/source material that outputs should be faithful to */
    context: string[];
}
/**
 * Detects potential hallucinations by checking if output content
 * is supported by the provided context.
 *
 * This metric requires an LLM client. Set one globally with setLLMClient()
 * or pass llmClient in the config.
 *
 * @param config - Outputs to evaluate plus the context they must be grounded in.
 * @returns One metric result per evaluated output.
 *
 * @example
 * ```ts
 * import { setLLMClient, hallucination } from "evalsense/metrics";
 *
 * // Configure LLM client once
 * setLLMClient({
 *   async complete(prompt) {
 *     return await yourLLM.generate(prompt);
 *   }
 * });
 *
 * // Use the metric
 * const results = await hallucination({
 *   outputs: [{ id: "1", output: "The capital of France is Paris." }],
 *   context: ["France is a country in Europe. Its capital is Paris."]
 * });
 * ```
 */
declare function hallucination(config: HallucinationConfig): Promise<MetricOutput[]>;
|
|
48
|
+
|
|
49
|
+
/**
|
|
50
|
+
* Relevance metric (LLM-based)
|
|
51
|
+
*
|
|
52
|
+
* Measures how relevant the output is to the input query.
|
|
53
|
+
* Uses LLM evaluation for accurate relevance assessment.
|
|
54
|
+
*/
|
|
55
|
+
|
|
56
|
+
/**
 * Configuration for relevance metric
 */
interface RelevanceConfig extends MetricConfig {
    /** Model outputs to evaluate */
    outputs: Array<{
        id: string;
        output: string;
    }>;
    /** Queries that the outputs should be relevant to */
    query: string[];
}
/**
 * Measures the relevance of outputs to their queries.
 *
 * This metric requires an LLM client. Set one globally with setLLMClient()
 * or pass llmClient in the config.
 *
 * @param config - Outputs to evaluate plus the queries they should answer.
 * @returns One metric result per evaluated output.
 *
 * @example
 * ```ts
 * import { setLLMClient, relevance } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await relevance({
 *   outputs: [{ id: "1", output: "Paris is the capital of France." }],
 *   query: ["What is the capital of France?"]
 * });
 * ```
 */
declare function relevance(config: RelevanceConfig): Promise<MetricOutput[]>;
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Faithfulness metric (LLM-based)
|
|
90
|
+
*
|
|
91
|
+
* Measures how faithful the output is to the source material.
|
|
92
|
+
* Uses LLM evaluation to detect contradictions and misrepresentations.
|
|
93
|
+
*/
|
|
94
|
+
|
|
95
|
+
/**
 * Configuration for faithfulness metric
 */
interface FaithfulnessConfig extends MetricConfig {
    /** Model outputs to evaluate */
    outputs: Array<{
        id: string;
        output: string;
    }>;
    /** Source material that outputs should be faithful to */
    source: string[];
}
/**
 * Measures the faithfulness of outputs to their source material.
 *
 * This metric requires an LLM client. Set one globally with setLLMClient()
 * or pass llmClient in the config.
 *
 * @param config - Outputs to evaluate plus the source material they must not contradict.
 * @returns One metric result per evaluated output.
 *
 * @example
 * ```ts
 * import { setLLMClient, faithfulness } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await faithfulness({
 *   outputs: [{ id: "1", output: "The document discusses climate change." }],
 *   source: ["This report covers the impacts of climate change on biodiversity."]
 * });
 * ```
 */
declare function faithfulness(config: FaithfulnessConfig): Promise<MetricOutput[]>;
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* Toxicity detection metric (LLM-based)
|
|
129
|
+
*
|
|
130
|
+
* Detects potentially toxic, harmful, or inappropriate content.
|
|
131
|
+
* Uses LLM evaluation for nuanced toxicity detection.
|
|
132
|
+
*/
|
|
133
|
+
|
|
134
|
+
/**
 * Configuration for toxicity metric
 */
interface ToxicityConfig extends MetricConfig {
    /** Model outputs to evaluate */
    outputs: Array<{
        id: string;
        output: string;
    }>;
}
/**
 * Detects potential toxicity in outputs.
 *
 * This metric requires an LLM client. Set one globally with setLLMClient()
 * or pass llmClient in the config.
 *
 * @param config - Outputs to screen for toxic, harmful, or inappropriate content.
 * @returns One metric result per evaluated output.
 *
 * @example
 * ```ts
 * import { setLLMClient, toxicity } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await toxicity({
 *   outputs: [{ id: "1", output: "This is a friendly message." }]
 * });
 * ```
 */
declare function toxicity(config: ToxicityConfig): Promise<MetricOutput[]>;
|
|
162
|
+
|
|
163
|
+
export { type FaithfulnessConfig, type HallucinationConfig, type RelevanceConfig, type ToxicityConfig, faithfulness, hallucination, relevance, toxicity };
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import { a as MetricConfig, b as MetricOutput } from '../../types-C71p0wzM.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Hallucination detection metric (LLM-based)
|
|
5
|
+
*
|
|
6
|
+
* Detects statements in the output that are not supported by the provided context.
|
|
7
|
+
* Uses LLM evaluation for accurate hallucination detection.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Configuration for hallucination metric
|
|
12
|
+
*/
|
|
13
|
+
interface HallucinationConfig extends MetricConfig {
|
|
14
|
+
/** Model outputs to evaluate */
|
|
15
|
+
outputs: Array<{
|
|
16
|
+
id: string;
|
|
17
|
+
output: string;
|
|
18
|
+
}>;
|
|
19
|
+
/** Context/source material that outputs should be faithful to */
|
|
20
|
+
context: string[];
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Detects potential hallucinations by checking if output content
|
|
24
|
+
* is supported by the provided context.
|
|
25
|
+
*
|
|
26
|
+
* This metric requires an LLM client. Set one globally with setLLMClient()
|
|
27
|
+
* or pass llmClient in the config.
|
|
28
|
+
*
|
|
29
|
+
* @example
|
|
30
|
+
* ```ts
|
|
31
|
+
* import { setLLMClient, hallucination } from "evalsense/metrics";
|
|
32
|
+
*
|
|
33
|
+
* // Configure LLM client once
|
|
34
|
+
* setLLMClient({
|
|
35
|
+
* async complete(prompt) {
|
|
36
|
+
* return await yourLLM.generate(prompt);
|
|
37
|
+
* }
|
|
38
|
+
* });
|
|
39
|
+
*
|
|
40
|
+
* // Use the metric
|
|
41
|
+
* const results = await hallucination({
|
|
42
|
+
* outputs: [{ id: "1", output: "The capital of France is Paris." }],
|
|
43
|
+
* context: ["France is a country in Europe. Its capital is Paris."]
|
|
44
|
+
* });
|
|
45
|
+
* ```
|
|
46
|
+
*/
|
|
47
|
+
declare function hallucination(config: HallucinationConfig): Promise<MetricOutput[]>;
|
|
48
|
+
|
|
49
|
+
/**
|
|
50
|
+
* Relevance metric (LLM-based)
|
|
51
|
+
*
|
|
52
|
+
* Measures how relevant the output is to the input query.
|
|
53
|
+
* Uses LLM evaluation for accurate relevance assessment.
|
|
54
|
+
*/
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Configuration for relevance metric
|
|
58
|
+
*/
|
|
59
|
+
interface RelevanceConfig extends MetricConfig {
|
|
60
|
+
/** Model outputs to evaluate */
|
|
61
|
+
outputs: Array<{
|
|
62
|
+
id: string;
|
|
63
|
+
output: string;
|
|
64
|
+
}>;
|
|
65
|
+
/** Queries that the outputs should be relevant to */
|
|
66
|
+
query: string[];
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Measures the relevance of outputs to their queries.
|
|
70
|
+
*
|
|
71
|
+
* This metric requires an LLM client. Set one globally with setLLMClient()
|
|
72
|
+
* or pass llmClient in the config.
|
|
73
|
+
*
|
|
74
|
+
* @example
|
|
75
|
+
* ```ts
|
|
76
|
+
* import { setLLMClient, relevance } from "evalsense/metrics";
|
|
77
|
+
*
|
|
78
|
+
* setLLMClient({ async complete(prompt) { ... } });
|
|
79
|
+
*
|
|
80
|
+
* const results = await relevance({
|
|
81
|
+
* outputs: [{ id: "1", output: "Paris is the capital of France." }],
|
|
82
|
+
* query: ["What is the capital of France?"]
|
|
83
|
+
* });
|
|
84
|
+
* ```
|
|
85
|
+
*/
|
|
86
|
+
declare function relevance(config: RelevanceConfig): Promise<MetricOutput[]>;
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Faithfulness metric (LLM-based)
|
|
90
|
+
*
|
|
91
|
+
* Measures how faithful the output is to the source material.
|
|
92
|
+
* Uses LLM evaluation to detect contradictions and misrepresentations.
|
|
93
|
+
*/
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Configuration for faithfulness metric
|
|
97
|
+
*/
|
|
98
|
+
interface FaithfulnessConfig extends MetricConfig {
|
|
99
|
+
/** Model outputs to evaluate */
|
|
100
|
+
outputs: Array<{
|
|
101
|
+
id: string;
|
|
102
|
+
output: string;
|
|
103
|
+
}>;
|
|
104
|
+
/** Source material that outputs should be faithful to */
|
|
105
|
+
source: string[];
|
|
106
|
+
}
|
|
107
|
+
/**
|
|
108
|
+
* Measures the faithfulness of outputs to their source material.
|
|
109
|
+
*
|
|
110
|
+
* This metric requires an LLM client. Set one globally with setLLMClient()
|
|
111
|
+
* or pass llmClient in the config.
|
|
112
|
+
*
|
|
113
|
+
* @example
|
|
114
|
+
* ```ts
|
|
115
|
+
* import { setLLMClient, faithfulness } from "evalsense/metrics";
|
|
116
|
+
*
|
|
117
|
+
* setLLMClient({ async complete(prompt) { ... } });
|
|
118
|
+
*
|
|
119
|
+
* const results = await faithfulness({
|
|
120
|
+
* outputs: [{ id: "1", output: "The document discusses climate change." }],
|
|
121
|
+
* source: ["This report covers the impacts of climate change on biodiversity."]
|
|
122
|
+
* });
|
|
123
|
+
* ```
|
|
124
|
+
*/
|
|
125
|
+
declare function faithfulness(config: FaithfulnessConfig): Promise<MetricOutput[]>;
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* Toxicity detection metric (LLM-based)
|
|
129
|
+
*
|
|
130
|
+
* Detects potentially toxic, harmful, or inappropriate content.
|
|
131
|
+
* Uses LLM evaluation for nuanced toxicity detection.
|
|
132
|
+
*/
|
|
133
|
+
|
|
134
|
+
/**
|
|
135
|
+
* Configuration for toxicity metric
|
|
136
|
+
*/
|
|
137
|
+
interface ToxicityConfig extends MetricConfig {
|
|
138
|
+
/** Model outputs to evaluate */
|
|
139
|
+
outputs: Array<{
|
|
140
|
+
id: string;
|
|
141
|
+
output: string;
|
|
142
|
+
}>;
|
|
143
|
+
}
|
|
144
|
+
/**
|
|
145
|
+
* Detects potential toxicity in outputs.
|
|
146
|
+
*
|
|
147
|
+
* This metric requires an LLM client. Set one globally with setLLMClient()
|
|
148
|
+
* or pass llmClient in the config.
|
|
149
|
+
*
|
|
150
|
+
* @example
|
|
151
|
+
* ```ts
|
|
152
|
+
* import { setLLMClient, toxicity } from "evalsense/metrics";
|
|
153
|
+
*
|
|
154
|
+
* setLLMClient({ async complete(prompt) { ... } });
|
|
155
|
+
*
|
|
156
|
+
* const results = await toxicity({
|
|
157
|
+
* outputs: [{ id: "1", output: "This is a friendly message." }]
|
|
158
|
+
* });
|
|
159
|
+
* ```
|
|
160
|
+
*/
|
|
161
|
+
declare function toxicity(config: ToxicityConfig): Promise<MetricOutput[]>;
|
|
162
|
+
|
|
163
|
+
export { type FaithfulnessConfig, type HallucinationConfig, type RelevanceConfig, type ToxicityConfig, faithfulness, hallucination, relevance, toxicity };
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"names":[],"mappings":"","file":"index.js"}
|