promptfoo 0.96.0 → 0.96.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/dist/package.json +10 -10
- package/dist/src/app/assets/{index-Dwt7E2K_.js → index-CL29fyye.js} +55 -55
- package/dist/src/app/assets/{index.es-CooNf3HB.js → index.es-CsYHA4xg.js} +1 -1
- package/dist/src/app/assets/{sync-Bj1WJrHQ.js → sync-B9AjROwZ.js} +1 -1
- package/dist/src/app/index.html +1 -1
- package/dist/src/assertions/answerRelevance.d.ts +3 -0
- package/dist/src/assertions/answerRelevance.d.ts.map +1 -0
- package/dist/src/assertions/answerRelevance.js +19 -0
- package/dist/src/assertions/answerRelevance.js.map +1 -0
- package/dist/src/assertions/bleu.d.ts +4 -14
- package/dist/src/assertions/bleu.d.ts.map +1 -1
- package/dist/src/assertions/bleu.js +19 -3
- package/dist/src/assertions/bleu.js.map +1 -1
- package/dist/src/assertions/classifier.d.ts +3 -0
- package/dist/src/assertions/classifier.d.ts.map +1 -0
- package/dist/src/assertions/classifier.js +22 -0
- package/dist/src/assertions/classifier.js.map +1 -0
- package/dist/src/assertions/contains.d.ts +8 -0
- package/dist/src/assertions/contains.d.ts.map +1 -0
- package/dist/src/assertions/contains.js +110 -0
- package/dist/src/assertions/contains.js.map +1 -0
- package/dist/src/assertions/contextFaithfulness.d.ts +3 -0
- package/dist/src/assertions/contextFaithfulness.d.ts.map +1 -0
- package/dist/src/assertions/contextFaithfulness.js +19 -0
- package/dist/src/assertions/contextFaithfulness.js.map +1 -0
- package/dist/src/assertions/contextRecall.d.ts +3 -0
- package/dist/src/assertions/contextRecall.d.ts.map +1 -0
- package/dist/src/assertions/contextRecall.js +18 -0
- package/dist/src/assertions/contextRecall.js.map +1 -0
- package/dist/src/assertions/contextRelevance.d.ts +3 -0
- package/dist/src/assertions/contextRelevance.d.ts.map +1 -0
- package/dist/src/assertions/contextRelevance.js +19 -0
- package/dist/src/assertions/contextRelevance.js.map +1 -0
- package/dist/src/assertions/cost.d.ts +3 -0
- package/dist/src/assertions/cost.d.ts.map +1 -0
- package/dist/src/assertions/cost.js +22 -0
- package/dist/src/assertions/cost.js.map +1 -0
- package/dist/src/assertions/equals.d.ts +4 -0
- package/dist/src/assertions/equals.d.ts.map +1 -0
- package/dist/src/assertions/equals.js +32 -0
- package/dist/src/assertions/equals.js.map +1 -0
- package/dist/src/assertions/factuality.d.ts +3 -0
- package/dist/src/assertions/factuality.d.ts.map +1 -0
- package/dist/src/assertions/factuality.js +25 -0
- package/dist/src/assertions/factuality.js.map +1 -0
- package/dist/src/assertions/index.d.ts.map +1 -1
- package/dist/src/assertions/index.js +89 -752
- package/dist/src/assertions/index.js.map +1 -1
- package/dist/src/assertions/javascript.d.ts +4 -0
- package/dist/src/assertions/javascript.d.ts.map +1 -0
- package/dist/src/assertions/javascript.js +94 -0
- package/dist/src/assertions/javascript.js.map +1 -0
- package/dist/src/assertions/json.d.ts +3 -4
- package/dist/src/assertions/json.d.ts.map +1 -1
- package/dist/src/assertions/json.js +2 -2
- package/dist/src/assertions/json.js.map +1 -1
- package/dist/src/assertions/latency.d.ts +3 -0
- package/dist/src/assertions/latency.d.ts.map +1 -0
- package/dist/src/assertions/latency.js +22 -0
- package/dist/src/assertions/latency.js.map +1 -0
- package/dist/src/assertions/levenshtein.d.ts +3 -0
- package/dist/src/assertions/levenshtein.d.ts.map +1 -0
- package/dist/src/assertions/levenshtein.js +22 -0
- package/dist/src/assertions/levenshtein.js.map +1 -0
- package/dist/src/assertions/llmRubric.d.ts +3 -0
- package/dist/src/assertions/llmRubric.d.ts.map +1 -0
- package/dist/src/assertions/llmRubric.js +22 -0
- package/dist/src/assertions/llmRubric.js.map +1 -0
- package/dist/src/assertions/modelGradedClosedQa.d.ts +3 -0
- package/dist/src/assertions/modelGradedClosedQa.d.ts.map +1 -0
- package/dist/src/assertions/modelGradedClosedQa.js +25 -0
- package/dist/src/assertions/modelGradedClosedQa.js.map +1 -0
- package/dist/src/assertions/moderation.d.ts +3 -0
- package/dist/src/assertions/moderation.d.ts.map +1 -0
- package/dist/src/assertions/moderation.js +41 -0
- package/dist/src/assertions/moderation.js.map +1 -0
- package/dist/src/assertions/openai.d.ts +5 -0
- package/dist/src/assertions/openai.d.ts.map +1 -0
- package/dist/src/assertions/openai.js +68 -0
- package/dist/src/assertions/openai.js.map +1 -0
- package/dist/src/assertions/perplexity.d.ts +3 -3
- package/dist/src/assertions/perplexity.d.ts.map +1 -1
- package/dist/src/assertions/perplexity.js +2 -2
- package/dist/src/assertions/perplexity.js.map +1 -1
- package/dist/src/assertions/python.d.ts +4 -0
- package/dist/src/assertions/python.d.ts.map +1 -0
- package/dist/src/assertions/python.js +107 -0
- package/dist/src/assertions/python.js.map +1 -0
- package/dist/src/assertions/redteam.d.ts +3 -0
- package/dist/src/assertions/redteam.d.ts.map +1 -0
- package/dist/src/assertions/redteam.js +29 -0
- package/dist/src/assertions/redteam.js.map +1 -0
- package/dist/src/assertions/regex.d.ts +3 -0
- package/dist/src/assertions/regex.d.ts.map +1 -0
- package/dist/src/assertions/regex.js +34 -0
- package/dist/src/assertions/regex.js.map +1 -0
- package/dist/src/assertions/rouge.d.ts +3 -0
- package/dist/src/assertions/rouge.d.ts.map +1 -0
- package/dist/src/assertions/rouge.js +47 -0
- package/dist/src/assertions/rouge.js.map +1 -0
- package/dist/src/assertions/similar.d.ts +3 -0
- package/dist/src/assertions/similar.d.ts.map +1 -0
- package/dist/src/assertions/similar.js +40 -0
- package/dist/src/assertions/similar.js.map +1 -0
- package/dist/src/assertions/sql.d.ts +3 -3
- package/dist/src/assertions/sql.d.ts.map +1 -1
- package/dist/src/assertions/sql.js +14 -3
- package/dist/src/assertions/sql.js.map +1 -1
- package/dist/src/assertions/startsWith.d.ts +3 -0
- package/dist/src/assertions/startsWith.d.ts.map +1 -0
- package/dist/src/assertions/startsWith.js +22 -0
- package/dist/src/assertions/startsWith.js.map +1 -0
- package/dist/src/assertions/utils.d.ts +1 -0
- package/dist/src/assertions/utils.d.ts.map +1 -1
- package/dist/src/assertions/utils.js +7 -0
- package/dist/src/assertions/utils.js.map +1 -1
- package/dist/src/assertions/webhook.d.ts +3 -0
- package/dist/src/assertions/webhook.d.ts.map +1 -0
- package/dist/src/assertions/webhook.js +55 -0
- package/dist/src/assertions/webhook.js.map +1 -0
- package/dist/src/assertions/xml.d.ts +2 -0
- package/dist/src/assertions/xml.d.ts.map +1 -1
- package/dist/src/assertions/xml.js +29 -0
- package/dist/src/assertions/xml.js.map +1 -1
- package/dist/src/fetch.d.ts.map +1 -1
- package/dist/src/fetch.js +8 -1
- package/dist/src/fetch.js.map +1 -1
- package/dist/src/providers/anthropic.d.ts.map +1 -1
- package/dist/src/providers/anthropic.js +12 -2
- package/dist/src/providers/anthropic.js.map +1 -1
- package/dist/src/providers/bedrock.js +1 -1
- package/dist/src/redteam/constants.d.ts +4 -3
- package/dist/src/redteam/constants.d.ts.map +1 -1
- package/dist/src/redteam/constants.js +43 -35
- package/dist/src/redteam/constants.js.map +1 -1
- package/dist/src/redteam/providers/goat.d.ts +2 -0
- package/dist/src/redteam/providers/goat.d.ts.map +1 -1
- package/dist/src/redteam/providers/goat.js +9 -2
- package/dist/src/redteam/providers/goat.js.map +1 -1
- package/dist/src/types/index.d.ts +108 -2
- package/dist/src/types/index.d.ts.map +1 -1
- package/dist/src/types/index.js +21 -26
- package/dist/src/types/index.js.map +1 -1
- package/dist/src/util/config/load.js +3 -2
- package/dist/src/util/config/load.js.map +1 -1
- package/dist/test/assertions/bleu.test.js +38 -13
- package/dist/test/assertions/bleu.test.js.map +1 -1
- package/dist/test/assertions/sql.test.js +268 -167
- package/dist/test/assertions/sql.test.js.map +1 -1
- package/dist/test/fetch.test.js +17 -0
- package/dist/test/fetch.test.js.map +1 -1
- package/dist/test/providers/anthropic.test.js +44 -5
- package/dist/test/providers/anthropic.test.js.map +1 -1
- package/dist/test/util/config/load.test.js +44 -1
- package/dist/test/util/config/load.test.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +10 -10
- package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240617.json +0 -10
- package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240618.json +0 -10
- package/dist/src/redteam/eval/harmful/llm_rubric-20240723.json +0 -10
- package/dist/src/redteam/eval/harmful/llm_rubric-20240724.json +0 -10
|
@@ -1,27 +1,4 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
-
if (mod && mod.__esModule) return mod;
|
|
20
|
-
var result = {};
|
|
21
|
-
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
-
__setModuleDefault(result, mod);
|
|
23
|
-
return result;
|
|
24
|
-
};
|
|
25
2
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
26
3
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
27
4
|
};
|
|
@@ -32,36 +9,50 @@ exports.runAssertions = runAssertions;
|
|
|
32
9
|
exports.runCompareAssertion = runCompareAssertion;
|
|
33
10
|
exports.readAssertions = readAssertions;
|
|
34
11
|
const async_1 = __importDefault(require("async"));
|
|
35
|
-
const fastest_levenshtein_1 = require("fastest-levenshtein");
|
|
36
12
|
const fs_1 = __importDefault(require("fs"));
|
|
37
|
-
const rouge = __importStar(require("js-rouge"));
|
|
38
13
|
const js_yaml_1 = __importDefault(require("js-yaml"));
|
|
39
|
-
const node_util_1 = __importDefault(require("node:util"));
|
|
40
14
|
const path_1 = __importDefault(require("path"));
|
|
41
15
|
const tiny_invariant_1 = __importDefault(require("tiny-invariant"));
|
|
42
16
|
const cliState_1 = __importDefault(require("../cliState"));
|
|
43
17
|
const envars_1 = require("../envars");
|
|
44
18
|
const esm_1 = require("../esm");
|
|
45
|
-
const fetch_1 = require("../fetch");
|
|
46
19
|
const logger_1 = __importDefault(require("../logger"));
|
|
47
20
|
const matchers_1 = require("../matchers");
|
|
48
|
-
const openaiUtil_1 = require("../providers/openaiUtil");
|
|
49
21
|
const packageParser_1 = require("../providers/packageParser");
|
|
50
|
-
const shared_1 = require("../providers/shared");
|
|
51
22
|
const pythonUtils_1 = require("../python/pythonUtils");
|
|
52
|
-
const wrapper_1 = require("../python/wrapper");
|
|
53
|
-
const graders_1 = require("../redteam/graders");
|
|
54
23
|
const telemetry_1 = __importDefault(require("../telemetry"));
|
|
55
|
-
const types_1 = require("../types");
|
|
56
24
|
const file_1 = require("../util/file");
|
|
57
25
|
const templates_1 = require("../util/templates");
|
|
58
26
|
const transform_1 = require("../util/transform");
|
|
59
27
|
const AssertionsResult_1 = require("./AssertionsResult");
|
|
28
|
+
const answerRelevance_1 = require("./answerRelevance");
|
|
60
29
|
const bleu_1 = require("./bleu");
|
|
30
|
+
const classifier_1 = require("./classifier");
|
|
31
|
+
const contains_1 = require("./contains");
|
|
32
|
+
const contextFaithfulness_1 = require("./contextFaithfulness");
|
|
33
|
+
const contextRecall_1 = require("./contextRecall");
|
|
34
|
+
const contextRelevance_1 = require("./contextRelevance");
|
|
35
|
+
const cost_1 = require("./cost");
|
|
36
|
+
const equals_1 = require("./equals");
|
|
37
|
+
const factuality_1 = require("./factuality");
|
|
38
|
+
const javascript_1 = require("./javascript");
|
|
61
39
|
const json_1 = require("./json");
|
|
40
|
+
const latency_1 = require("./latency");
|
|
41
|
+
const levenshtein_1 = require("./levenshtein");
|
|
42
|
+
const llmRubric_1 = require("./llmRubric");
|
|
43
|
+
const modelGradedClosedQa_1 = require("./modelGradedClosedQa");
|
|
44
|
+
const moderation_1 = require("./moderation");
|
|
45
|
+
const openai_1 = require("./openai");
|
|
62
46
|
const perplexity_1 = require("./perplexity");
|
|
47
|
+
const python_1 = require("./python");
|
|
48
|
+
const redteam_1 = require("./redteam");
|
|
49
|
+
const regex_1 = require("./regex");
|
|
50
|
+
const rouge_1 = require("./rouge");
|
|
51
|
+
const similar_1 = require("./similar");
|
|
63
52
|
const sql_1 = require("./sql");
|
|
53
|
+
const startsWith_1 = require("./startsWith");
|
|
64
54
|
const utils_1 = require("./utils");
|
|
55
|
+
const webhook_1 = require("./webhook");
|
|
65
56
|
const xml_1 = require("./xml");
|
|
66
57
|
const ASSERTIONS_MAX_CONCURRENCY = (0, envars_1.getEnvInt)('PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY', 3);
|
|
67
58
|
exports.MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
@@ -69,40 +60,20 @@ exports.MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
|
69
60
|
'context-faithfulness',
|
|
70
61
|
'context-recall',
|
|
71
62
|
'context-relevance',
|
|
63
|
+
'factuality',
|
|
72
64
|
'llm-rubric',
|
|
73
65
|
'model-graded-closedqa',
|
|
74
|
-
'factuality',
|
|
75
66
|
'model-graded-factuality',
|
|
76
67
|
]);
|
|
77
68
|
const nunjucks = (0, templates_1.getNunjucksEngine)();
|
|
78
|
-
function coerceString(value) {
|
|
79
|
-
if (typeof value === 'string') {
|
|
80
|
-
return value;
|
|
81
|
-
}
|
|
82
|
-
return JSON.stringify(value);
|
|
83
|
-
}
|
|
84
|
-
function handleRougeScore(baseType, assertion, expected, output, inverted) {
|
|
85
|
-
const fnName = baseType[baseType.length - 1];
|
|
86
|
-
const rougeMethod = rouge[fnName];
|
|
87
|
-
const score = rougeMethod(output, expected, {});
|
|
88
|
-
const pass = score >= (assertion.threshold || 0.75) != inverted;
|
|
89
|
-
return {
|
|
90
|
-
pass,
|
|
91
|
-
score: inverted ? 1 - score : score,
|
|
92
|
-
reason: pass
|
|
93
|
-
? `${baseType.toUpperCase()} score ${score.toFixed(2)} is greater than or equal to threshold ${assertion.threshold || 0.75}`
|
|
94
|
-
: `${baseType.toUpperCase()} score ${score.toFixed(2)} is less than threshold ${assertion.threshold || 0.75}`,
|
|
95
|
-
assertion,
|
|
96
|
-
};
|
|
97
|
-
}
|
|
98
69
|
async function runAssertion({ prompt, provider, assertion, test, latencyMs, providerResponse, }) {
|
|
99
70
|
const { cost, logProbs, output: originalOutput } = providerResponse;
|
|
100
71
|
let output = originalOutput;
|
|
101
|
-
let pass = false;
|
|
102
|
-
let score = 0.0;
|
|
103
72
|
(0, tiny_invariant_1.default)(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
|
|
104
73
|
const inverse = assertion.type.startsWith('not-');
|
|
105
|
-
const baseType = inverse
|
|
74
|
+
const baseType = inverse
|
|
75
|
+
? assertion.type.slice(4)
|
|
76
|
+
: assertion.type;
|
|
106
77
|
telemetry_1.default.record('assertion_used', {
|
|
107
78
|
type: baseType,
|
|
108
79
|
});
|
|
@@ -112,7 +83,6 @@ async function runAssertion({ prompt, provider, assertion, test, latencyMs, prov
|
|
|
112
83
|
prompt: { label: prompt },
|
|
113
84
|
});
|
|
114
85
|
}
|
|
115
|
-
const outputString = coerceString(output);
|
|
116
86
|
const context = {
|
|
117
87
|
prompt,
|
|
118
88
|
vars: test.vars || {},
|
|
@@ -186,703 +156,70 @@ async function runAssertion({ prompt, provider, assertion, test, latencyMs, prov
|
|
|
186
156
|
return v;
|
|
187
157
|
});
|
|
188
158
|
}
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
if (baseType === 'contains') {
|
|
253
|
-
(0, tiny_invariant_1.default)(renderedValue, '"contains" assertion type must have a string or number value');
|
|
254
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string' || typeof renderedValue === 'number', '"contains" assertion type must have a string or number value');
|
|
255
|
-
pass = outputString.includes(String(renderedValue)) !== inverse;
|
|
256
|
-
return {
|
|
257
|
-
pass,
|
|
258
|
-
score: pass ? 1 : 0,
|
|
259
|
-
reason: pass
|
|
260
|
-
? 'Assertion passed'
|
|
261
|
-
: `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`,
|
|
262
|
-
assertion,
|
|
263
|
-
};
|
|
264
|
-
}
|
|
265
|
-
if (baseType === 'contains-any') {
|
|
266
|
-
(0, tiny_invariant_1.default)(renderedValue, '"contains-any" assertion type must have a value');
|
|
267
|
-
if (typeof renderedValue === 'string') {
|
|
268
|
-
renderedValue = renderedValue.split(',').map((v) => v.trim());
|
|
269
|
-
}
|
|
270
|
-
(0, tiny_invariant_1.default)(Array.isArray(renderedValue), '"contains-any" assertion type must have an array value');
|
|
271
|
-
pass = renderedValue.some((value) => outputString.includes(String(value))) !== inverse;
|
|
272
|
-
return {
|
|
273
|
-
pass,
|
|
274
|
-
score: pass ? 1 : 0,
|
|
275
|
-
reason: pass
|
|
276
|
-
? 'Assertion passed'
|
|
277
|
-
: `Expected output to ${inverse ? 'not ' : ''}contain one of "${renderedValue.join(', ')}"`,
|
|
278
|
-
assertion,
|
|
279
|
-
};
|
|
280
|
-
}
|
|
281
|
-
if (baseType === 'icontains-any') {
|
|
282
|
-
(0, tiny_invariant_1.default)(renderedValue, '"icontains-any" assertion type must have a value');
|
|
283
|
-
if (typeof renderedValue === 'string') {
|
|
284
|
-
renderedValue = renderedValue.split(',').map((v) => v.trim());
|
|
285
|
-
}
|
|
286
|
-
(0, tiny_invariant_1.default)(Array.isArray(renderedValue), '"icontains-any" assertion type must have an array value');
|
|
287
|
-
pass =
|
|
288
|
-
renderedValue.some((value) => outputString.toLowerCase().includes(String(value).toLowerCase())) !== inverse;
|
|
289
|
-
return {
|
|
290
|
-
pass,
|
|
291
|
-
score: pass ? 1 : 0,
|
|
292
|
-
reason: pass
|
|
293
|
-
? 'Assertion passed'
|
|
294
|
-
: `Expected output to ${inverse ? 'not ' : ''}contain one of "${renderedValue.join(', ')}"`,
|
|
295
|
-
assertion,
|
|
296
|
-
};
|
|
297
|
-
}
|
|
298
|
-
if (baseType === 'contains-all') {
|
|
299
|
-
(0, tiny_invariant_1.default)(renderedValue, '"contains-all" assertion type must have a value');
|
|
300
|
-
if (typeof renderedValue === 'string') {
|
|
301
|
-
renderedValue = renderedValue.split(',').map((v) => v.trim());
|
|
302
|
-
}
|
|
303
|
-
(0, tiny_invariant_1.default)(Array.isArray(renderedValue), '"contains-all" assertion type must have an array value');
|
|
304
|
-
const missingStrings = renderedValue.filter((value) => !outputString.includes(String(value)));
|
|
305
|
-
pass = (missingStrings.length === 0) !== inverse;
|
|
306
|
-
return {
|
|
307
|
-
pass,
|
|
308
|
-
score: pass ? 1 : 0,
|
|
309
|
-
reason: pass
|
|
310
|
-
? 'Assertion passed'
|
|
311
|
-
: `Expected output to ${inverse ? 'not ' : ''}contain all of [${renderedValue.join(', ')}]. Missing: [${missingStrings.join(', ')}]`,
|
|
312
|
-
assertion,
|
|
313
|
-
};
|
|
314
|
-
}
|
|
315
|
-
if (baseType === 'icontains-all') {
|
|
316
|
-
(0, tiny_invariant_1.default)(renderedValue, '"icontains-all" assertion type must have a value');
|
|
317
|
-
if (typeof renderedValue === 'string') {
|
|
318
|
-
renderedValue = renderedValue.split(',').map((v) => v.trim());
|
|
319
|
-
}
|
|
320
|
-
(0, tiny_invariant_1.default)(Array.isArray(renderedValue), '"icontains-all" assertion type must have an array value');
|
|
321
|
-
const missingStrings = renderedValue.filter((value) => !outputString.toLowerCase().includes(String(value).toLowerCase()));
|
|
322
|
-
pass = (missingStrings.length === 0) !== inverse;
|
|
323
|
-
return {
|
|
324
|
-
pass,
|
|
325
|
-
score: pass ? 1 : 0,
|
|
326
|
-
reason: pass
|
|
327
|
-
? 'Assertion passed'
|
|
328
|
-
: `Expected output to ${inverse ? 'not ' : ''}contain all of [${renderedValue.join(', ')}]. Missing: [${missingStrings.join(', ')}]`,
|
|
329
|
-
assertion,
|
|
330
|
-
};
|
|
331
|
-
}
|
|
332
|
-
if (baseType === 'regex') {
|
|
333
|
-
(0, tiny_invariant_1.default)(renderedValue, '"regex" assertion type must have a string value');
|
|
334
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', '"regex" assertion type must have a string value');
|
|
335
|
-
const regex = new RegExp(renderedValue);
|
|
336
|
-
pass = regex.test(outputString) !== inverse;
|
|
337
|
-
return {
|
|
338
|
-
pass,
|
|
339
|
-
score: pass ? 1 : 0,
|
|
340
|
-
reason: pass
|
|
341
|
-
? 'Assertion passed'
|
|
342
|
-
: `Expected output to ${inverse ? 'not ' : ''}match regex "${renderedValue}"`,
|
|
343
|
-
assertion,
|
|
344
|
-
};
|
|
345
|
-
}
|
|
346
|
-
if (baseType === 'icontains') {
|
|
347
|
-
(0, tiny_invariant_1.default)(renderedValue, '"icontains" assertion type must have a string or number value');
|
|
348
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string' || typeof renderedValue === 'number', '"icontains" assertion type must have a string or number value');
|
|
349
|
-
pass = outputString.toLowerCase().includes(String(renderedValue).toLowerCase()) !== inverse;
|
|
350
|
-
return {
|
|
351
|
-
pass,
|
|
352
|
-
score: pass ? 1 : 0,
|
|
353
|
-
reason: pass
|
|
354
|
-
? 'Assertion passed'
|
|
355
|
-
: `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`,
|
|
356
|
-
assertion,
|
|
357
|
-
};
|
|
358
|
-
}
|
|
359
|
-
if (baseType === 'starts-with') {
|
|
360
|
-
(0, tiny_invariant_1.default)(renderedValue, '"starts-with" assertion type must have a string value');
|
|
361
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', '"starts-with" assertion type must have a string value');
|
|
362
|
-
pass = outputString.startsWith(String(renderedValue)) !== inverse;
|
|
363
|
-
return {
|
|
364
|
-
pass,
|
|
365
|
-
score: pass ? 1 : 0,
|
|
366
|
-
reason: pass
|
|
367
|
-
? 'Assertion passed'
|
|
368
|
-
: `Expected output to ${inverse ? 'not ' : ''}start with "${renderedValue}"`,
|
|
369
|
-
assertion,
|
|
370
|
-
};
|
|
371
|
-
}
|
|
372
|
-
if (baseType === 'is-valid-openai-tools-call') {
|
|
373
|
-
const toolsOutput = output;
|
|
374
|
-
if (!Array.isArray(toolsOutput) ||
|
|
375
|
-
toolsOutput.length === 0 ||
|
|
376
|
-
typeof toolsOutput[0].function.name !== 'string' ||
|
|
377
|
-
typeof toolsOutput[0].function.arguments !== 'string') {
|
|
378
|
-
return {
|
|
379
|
-
pass: false,
|
|
380
|
-
score: 0,
|
|
381
|
-
reason: `OpenAI did not return a valid-looking tools response: ${JSON.stringify(toolsOutput)}`,
|
|
382
|
-
assertion,
|
|
383
|
-
};
|
|
384
|
-
}
|
|
385
|
-
try {
|
|
386
|
-
toolsOutput.forEach((toolOutput) => (0, openaiUtil_1.validateFunctionCall)(toolOutput.function, provider.config.tools?.map((tool) => tool.function), test.vars));
|
|
387
|
-
return {
|
|
388
|
-
pass: true,
|
|
389
|
-
score: 1,
|
|
390
|
-
reason: 'Assertion passed',
|
|
391
|
-
assertion,
|
|
392
|
-
};
|
|
393
|
-
}
|
|
394
|
-
catch (err) {
|
|
395
|
-
return {
|
|
396
|
-
pass: false,
|
|
397
|
-
score: 0,
|
|
398
|
-
reason: err.message,
|
|
399
|
-
assertion,
|
|
400
|
-
};
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
if (baseType === 'is-valid-openai-function-call') {
|
|
404
|
-
const functionOutput = output;
|
|
405
|
-
if (typeof functionOutput !== 'object' ||
|
|
406
|
-
typeof functionOutput.name !== 'string' ||
|
|
407
|
-
typeof functionOutput.arguments !== 'string') {
|
|
408
|
-
return {
|
|
409
|
-
pass: false,
|
|
410
|
-
score: 0,
|
|
411
|
-
reason: `OpenAI did not return a valid-looking function call: ${JSON.stringify(functionOutput)}`,
|
|
412
|
-
assertion,
|
|
413
|
-
};
|
|
414
|
-
}
|
|
415
|
-
try {
|
|
416
|
-
(0, openaiUtil_1.validateFunctionCall)(functionOutput, provider.config.functions, test.vars);
|
|
417
|
-
return {
|
|
418
|
-
pass: true,
|
|
419
|
-
score: 1,
|
|
420
|
-
reason: 'Assertion passed',
|
|
421
|
-
assertion,
|
|
422
|
-
};
|
|
423
|
-
}
|
|
424
|
-
catch (err) {
|
|
425
|
-
return {
|
|
426
|
-
pass: false,
|
|
427
|
-
score: 0,
|
|
428
|
-
reason: err.message,
|
|
429
|
-
assertion,
|
|
430
|
-
};
|
|
431
|
-
}
|
|
432
|
-
}
|
|
433
|
-
if (baseType === 'javascript') {
|
|
434
|
-
try {
|
|
435
|
-
const validateResult = async (result) => {
|
|
436
|
-
result = await Promise.resolve(result);
|
|
437
|
-
if (typeof result === 'boolean' || typeof result === 'number' || (0, types_1.isGradingResult)(result)) {
|
|
438
|
-
return result;
|
|
439
|
-
}
|
|
440
|
-
else {
|
|
441
|
-
throw new Error(`Custom function must return a boolean, number, or GradingResult object. Got type ${typeof result}: ${JSON.stringify(result)}`);
|
|
442
|
-
}
|
|
443
|
-
};
|
|
444
|
-
if (typeof assertion.value === 'function') {
|
|
445
|
-
let ret = assertion.value(outputString, context);
|
|
446
|
-
ret = await validateResult(ret);
|
|
447
|
-
if (!ret.assertion) {
|
|
448
|
-
// Populate the assertion object if the custom function didn't return it.
|
|
449
|
-
const functionString = assertion.value.toString();
|
|
450
|
-
ret.assertion = {
|
|
451
|
-
type: 'javascript',
|
|
452
|
-
value: functionString.length > 50 ? functionString.slice(0, 50) + '...' : functionString,
|
|
453
|
-
};
|
|
454
|
-
}
|
|
455
|
-
return ret;
|
|
456
|
-
}
|
|
457
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', 'javascript assertion must have a string value');
|
|
458
|
-
/**
|
|
459
|
-
* Removes trailing newline from the rendered value.
|
|
460
|
-
* This is necessary for handling multi-line string literals in YAML
|
|
461
|
-
* that are defined on a single line in the YAML file.
|
|
462
|
-
*
|
|
463
|
-
* @example
|
|
464
|
-
* value: |
|
|
465
|
-
* output === 'true'
|
|
466
|
-
*/
|
|
467
|
-
renderedValue = renderedValue.trimEnd();
|
|
468
|
-
let result;
|
|
469
|
-
if (typeof valueFromScript === 'undefined') {
|
|
470
|
-
const functionBody = renderedValue.includes('\n')
|
|
471
|
-
? renderedValue
|
|
472
|
-
: `return ${renderedValue}`;
|
|
473
|
-
const customFunction = new Function('output', 'context', functionBody);
|
|
474
|
-
result = await validateResult(customFunction(output, context));
|
|
475
|
-
}
|
|
476
|
-
else {
|
|
477
|
-
(0, tiny_invariant_1.default)(typeof valueFromScript === 'boolean' ||
|
|
478
|
-
typeof valueFromScript === 'number' ||
|
|
479
|
-
typeof valueFromScript === 'object', `Javascript assertion script must return a boolean, number, or object (${assertion.value})`);
|
|
480
|
-
result = await validateResult(valueFromScript);
|
|
481
|
-
}
|
|
482
|
-
if (typeof result === 'boolean') {
|
|
483
|
-
pass = result !== inverse;
|
|
484
|
-
score = pass ? 1 : 0;
|
|
485
|
-
}
|
|
486
|
-
else if (typeof result === 'number') {
|
|
487
|
-
pass = assertion.threshold ? result >= assertion.threshold : result > 0;
|
|
488
|
-
score = result;
|
|
489
|
-
}
|
|
490
|
-
else if (typeof result === 'object') {
|
|
491
|
-
return result;
|
|
492
|
-
}
|
|
493
|
-
else {
|
|
494
|
-
throw new Error('Custom function must return a boolean or number');
|
|
495
|
-
}
|
|
496
|
-
}
|
|
497
|
-
catch (err) {
|
|
498
|
-
return {
|
|
499
|
-
pass: false,
|
|
500
|
-
score: 0,
|
|
501
|
-
reason: `Custom function threw error: ${err.message}
|
|
502
|
-
Stack Trace: ${err.stack}
|
|
503
|
-
${renderedValue}`,
|
|
504
|
-
assertion,
|
|
505
|
-
};
|
|
506
|
-
}
|
|
507
|
-
return {
|
|
508
|
-
pass,
|
|
509
|
-
score,
|
|
510
|
-
reason: pass
|
|
511
|
-
? 'Assertion passed'
|
|
512
|
-
: `Custom function returned ${inverse ? 'true' : 'false'}
|
|
513
|
-
${renderedValue}`,
|
|
514
|
-
assertion,
|
|
515
|
-
};
|
|
516
|
-
}
|
|
517
|
-
if (baseType === 'python') {
|
|
518
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', 'python assertion must have a string value');
|
|
519
|
-
try {
|
|
520
|
-
let result;
|
|
521
|
-
if (typeof valueFromScript === 'undefined') {
|
|
522
|
-
const isMultiline = renderedValue.includes('\n');
|
|
523
|
-
let indentStyle = ' ';
|
|
524
|
-
if (isMultiline) {
|
|
525
|
-
// Detect the indentation style of the first indented line
|
|
526
|
-
const match = renderedValue.match(/^(?!\s*$)\s+/m);
|
|
527
|
-
if (match) {
|
|
528
|
-
indentStyle = match[0];
|
|
529
|
-
}
|
|
530
|
-
}
|
|
531
|
-
const pythonScript = `import json
|
|
532
|
-
|
|
533
|
-
def main(output, context):
|
|
534
|
-
${isMultiline
|
|
535
|
-
? renderedValue
|
|
536
|
-
.split('\n')
|
|
537
|
-
.map((line) => `${indentStyle}${line}`)
|
|
538
|
-
.join('\n')
|
|
539
|
-
: ` return ${renderedValue}`}
|
|
540
|
-
`;
|
|
541
|
-
result = await (0, wrapper_1.runPythonCode)(pythonScript, 'main', [output, context]);
|
|
542
|
-
}
|
|
543
|
-
else {
|
|
544
|
-
result = valueFromScript;
|
|
545
|
-
}
|
|
546
|
-
if ((typeof result === 'boolean' && result) ||
|
|
547
|
-
(typeof result === 'string' && result.toLowerCase() === 'true')) {
|
|
548
|
-
pass = true;
|
|
549
|
-
score = 1.0;
|
|
550
|
-
}
|
|
551
|
-
else if ((typeof result === 'boolean' && !result) ||
|
|
552
|
-
(typeof result === 'string' && result.toLowerCase() === 'false')) {
|
|
553
|
-
pass = false;
|
|
554
|
-
score = 0.0;
|
|
555
|
-
}
|
|
556
|
-
else if (typeof result === 'string' && result.startsWith('{')) {
|
|
557
|
-
let parsed;
|
|
558
|
-
try {
|
|
559
|
-
parsed = JSON.parse(result);
|
|
560
|
-
}
|
|
561
|
-
catch (err) {
|
|
562
|
-
throw new Error(`Invalid JSON: ${err} when parsing result: ${result}`);
|
|
563
|
-
}
|
|
564
|
-
if (!(0, types_1.isGradingResult)(parsed)) {
|
|
565
|
-
throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Got instead: ${result}`);
|
|
566
|
-
}
|
|
567
|
-
return parsed;
|
|
568
|
-
}
|
|
569
|
-
else if (typeof result === 'object') {
|
|
570
|
-
if (!(0, types_1.isGradingResult)(result)) {
|
|
571
|
-
throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Got instead:\n${JSON.stringify(result, null, 2)}`);
|
|
572
|
-
}
|
|
573
|
-
const pythonGradingResult = result;
|
|
574
|
-
if (assertion.threshold && pythonGradingResult.score < assertion.threshold) {
|
|
575
|
-
pythonGradingResult.pass = false;
|
|
576
|
-
pythonGradingResult.reason = `Python score ${pythonGradingResult.score} is less than threshold ${assertion.threshold}`;
|
|
577
|
-
}
|
|
578
|
-
return {
|
|
579
|
-
...pythonGradingResult,
|
|
580
|
-
assertion,
|
|
581
|
-
};
|
|
582
|
-
}
|
|
583
|
-
else {
|
|
584
|
-
score = Number.parseFloat(String(result));
|
|
585
|
-
pass = assertion.threshold ? score >= assertion.threshold : score > 0;
|
|
586
|
-
if (Number.isNaN(score)) {
|
|
587
|
-
throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
588
|
-
}
|
|
589
|
-
if (typeof assertion.threshold !== 'undefined' && score < assertion.threshold) {
|
|
590
|
-
pass = false;
|
|
591
|
-
}
|
|
592
|
-
}
|
|
593
|
-
}
|
|
594
|
-
catch (err) {
|
|
595
|
-
return {
|
|
596
|
-
pass: false,
|
|
597
|
-
score: 0,
|
|
598
|
-
reason: `Python code execution failed: ${err.message}`,
|
|
599
|
-
assertion,
|
|
600
|
-
};
|
|
601
|
-
}
|
|
602
|
-
return {
|
|
603
|
-
pass,
|
|
604
|
-
score,
|
|
605
|
-
reason: pass
|
|
606
|
-
? 'Assertion passed'
|
|
607
|
-
: `Python code returned ${pass ? 'true' : 'false'}\n${assertion.value}`,
|
|
608
|
-
assertion,
|
|
609
|
-
};
|
|
610
|
-
}
|
|
611
|
-
if (baseType === 'similar') {
|
|
612
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string' || Array.isArray(renderedValue), 'Similarity assertion type must have a string or array of strings value');
|
|
613
|
-
if (Array.isArray(renderedValue)) {
|
|
614
|
-
let minScore = Infinity;
|
|
615
|
-
for (const value of renderedValue) {
|
|
616
|
-
const result = await (0, matchers_1.matchesSimilarity)(value, outputString, assertion.threshold || 0.75, inverse, test.options);
|
|
617
|
-
if (result.pass) {
|
|
618
|
-
return {
|
|
619
|
-
assertion,
|
|
620
|
-
...result,
|
|
621
|
-
};
|
|
622
|
-
}
|
|
623
|
-
if (result.score < minScore) {
|
|
624
|
-
minScore = result.score;
|
|
625
|
-
}
|
|
626
|
-
}
|
|
627
|
-
return {
|
|
628
|
-
assertion,
|
|
629
|
-
pass: false,
|
|
630
|
-
score: minScore,
|
|
631
|
-
reason: `None of the provided values met the similarity threshold`,
|
|
632
|
-
};
|
|
633
|
-
}
|
|
634
|
-
else {
|
|
635
|
-
return {
|
|
636
|
-
assertion,
|
|
637
|
-
...(await (0, matchers_1.matchesSimilarity)(renderedValue, outputString, assertion.threshold || 0.75, inverse, test.options)),
|
|
638
|
-
};
|
|
639
|
-
}
|
|
640
|
-
}
|
|
641
|
-
if (baseType === 'llm-rubric') {
|
|
642
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string' || typeof renderedValue === 'undefined', '"llm-rubric" assertion type must have a string value');
|
|
643
|
-
if (test.options?.rubricPrompt && typeof test.options.rubricPrompt === 'object') {
|
|
644
|
-
test.options.rubricPrompt = JSON.stringify(test.options.rubricPrompt);
|
|
645
|
-
}
|
|
646
|
-
// Update the assertion value. This allows the web view to display the prompt.
|
|
647
|
-
assertion.value = assertion.value || test.options?.rubricPrompt;
|
|
648
|
-
return {
|
|
649
|
-
assertion,
|
|
650
|
-
...(await (0, matchers_1.matchesLlmRubric)(renderedValue || '', outputString, test.options, test.vars)),
|
|
651
|
-
};
|
|
652
|
-
}
|
|
653
|
-
if (baseType === 'model-graded-factuality' || baseType === 'factuality') {
|
|
654
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', 'factuality assertion type must have a string value');
|
|
655
|
-
(0, tiny_invariant_1.default)(prompt, 'factuality assertion type must have a prompt');
|
|
656
|
-
if (test.options?.rubricPrompt) {
|
|
657
|
-
// Substitute vars in prompt
|
|
658
|
-
(0, tiny_invariant_1.default)(typeof test.options.rubricPrompt === 'string', 'rubricPrompt must be a string');
|
|
659
|
-
test.options.rubricPrompt = nunjucks.renderString(test.options.rubricPrompt, test.vars || {});
|
|
660
|
-
}
|
|
661
|
-
return {
|
|
662
|
-
assertion,
|
|
663
|
-
...(await (0, matchers_1.matchesFactuality)(prompt, renderedValue, outputString, test.options, test.vars)),
|
|
664
|
-
};
|
|
665
|
-
}
|
|
666
|
-
if (baseType === 'model-graded-closedqa') {
|
|
667
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', 'model-graded-closedqa assertion type must have a string value');
|
|
668
|
-
(0, tiny_invariant_1.default)(prompt, 'model-graded-closedqa assertion type must have a prompt');
|
|
669
|
-
if (test.options?.rubricPrompt) {
|
|
670
|
-
// Substitute vars in prompt
|
|
671
|
-
(0, tiny_invariant_1.default)(typeof test.options.rubricPrompt === 'string', 'rubricPrompt must be a string');
|
|
672
|
-
test.options.rubricPrompt = nunjucks.renderString(test.options.rubricPrompt, test.vars || {});
|
|
673
|
-
}
|
|
674
|
-
return {
|
|
675
|
-
assertion,
|
|
676
|
-
...(await (0, matchers_1.matchesClosedQa)(prompt, renderedValue, outputString, test.options, test.vars)),
|
|
677
|
-
};
|
|
678
|
-
}
|
|
679
|
-
if (baseType === 'answer-relevance') {
|
|
680
|
-
(0, tiny_invariant_1.default)(typeof output === 'string', 'answer-relevance assertion type must evaluate a string output');
|
|
681
|
-
(0, tiny_invariant_1.default)(prompt, 'answer-relevance assertion type must have a prompt');
|
|
682
|
-
const input = typeof test.vars?.query === 'string' ? test.vars.query : prompt;
|
|
683
|
-
return {
|
|
684
|
-
assertion,
|
|
685
|
-
...(await (0, matchers_1.matchesAnswerRelevance)(input, output, assertion.threshold || 0, test.options)),
|
|
686
|
-
};
|
|
687
|
-
}
|
|
688
|
-
if (baseType === 'context-recall') {
|
|
689
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', 'context-recall assertion type must have a string value');
|
|
690
|
-
(0, tiny_invariant_1.default)(prompt, 'context-recall assertion type must have a prompt');
|
|
691
|
-
return {
|
|
692
|
-
assertion,
|
|
693
|
-
...(await (0, matchers_1.matchesContextRecall)(typeof test.vars?.context === 'string' ? test.vars.context : prompt, renderedValue, assertion.threshold || 0, test.options, test.vars)),
|
|
694
|
-
};
|
|
695
|
-
}
|
|
696
|
-
if (baseType === 'context-relevance') {
|
|
697
|
-
(0, tiny_invariant_1.default)(test.vars, 'context-relevance assertion type must have a vars object');
|
|
698
|
-
(0, tiny_invariant_1.default)(typeof test.vars.query === 'string', 'context-relevance assertion type must have a query var');
|
|
699
|
-
(0, tiny_invariant_1.default)(typeof test.vars.context === 'string', 'context-relevance assertion type must have a context var');
|
|
700
|
-
return {
|
|
701
|
-
assertion,
|
|
702
|
-
...(await (0, matchers_1.matchesContextRelevance)(test.vars.query, test.vars.context, assertion.threshold || 0, test.options)),
|
|
703
|
-
};
|
|
704
|
-
}
|
|
705
|
-
if (baseType === 'context-faithfulness') {
|
|
706
|
-
(0, tiny_invariant_1.default)(test.vars, 'context-faithfulness assertion type must have a vars object');
|
|
707
|
-
(0, tiny_invariant_1.default)(typeof test.vars.query === 'string', 'context-faithfulness assertion type must have a query var');
|
|
708
|
-
(0, tiny_invariant_1.default)(typeof test.vars.context === 'string', 'context-faithfulness assertion type must have a context var');
|
|
709
|
-
(0, tiny_invariant_1.default)(typeof output === 'string', 'context-faithfulness assertion type must have a string output');
|
|
710
|
-
return {
|
|
711
|
-
assertion,
|
|
712
|
-
...(await (0, matchers_1.matchesContextFaithfulness)(test.vars.query, output, test.vars.context, assertion.threshold || 0, test.options)),
|
|
713
|
-
};
|
|
714
|
-
}
|
|
715
|
-
if (baseType === 'moderation') {
|
|
716
|
-
// Some redteam techniques override the actual prompt that is used, so we need to assess that prompt for moderation.
|
|
717
|
-
const promptToModerate = providerResponse.metadata?.redteamFinalPrompt || prompt;
|
|
718
|
-
const outputString = typeof output === 'string' ? output : JSON.stringify(output);
|
|
719
|
-
(0, tiny_invariant_1.default)(promptToModerate, 'moderation assertion type must have a prompt');
|
|
720
|
-
(0, tiny_invariant_1.default)(!assertion.value ||
|
|
721
|
-
(Array.isArray(assertion.value) && typeof assertion.value[0] === 'string'), 'moderation assertion value must be a string array if set');
|
|
722
|
-
if (promptToModerate[0] === '[' || promptToModerate[0] === '{') {
|
|
723
|
-
// Try to extract the last user message from OpenAI-style prompts.
|
|
724
|
-
try {
|
|
725
|
-
const parsedPrompt = (0, shared_1.parseChatPrompt)(promptToModerate, null);
|
|
726
|
-
if (parsedPrompt && parsedPrompt.length > 0) {
|
|
727
|
-
prompt = parsedPrompt[parsedPrompt.length - 1].content;
|
|
728
|
-
}
|
|
729
|
-
}
|
|
730
|
-
catch {
|
|
731
|
-
// Ignore error
|
|
732
|
-
}
|
|
733
|
-
}
|
|
734
|
-
const moderationResult = await (0, matchers_1.matchesModeration)({
|
|
735
|
-
userPrompt: promptToModerate,
|
|
736
|
-
assistantResponse: outputString,
|
|
737
|
-
categories: Array.isArray(assertion.value) ? assertion.value : [],
|
|
738
|
-
}, test.options);
|
|
739
|
-
pass = moderationResult.pass;
|
|
740
|
-
return {
|
|
741
|
-
pass,
|
|
742
|
-
score: moderationResult.score,
|
|
743
|
-
reason: moderationResult.reason,
|
|
744
|
-
assertion,
|
|
745
|
-
};
|
|
746
|
-
}
|
|
747
|
-
if (baseType === 'webhook') {
|
|
748
|
-
(0, tiny_invariant_1.default)(renderedValue, '"webhook" assertion type must have a URL value');
|
|
749
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', '"webhook" assertion type must have a URL value');
|
|
750
|
-
try {
|
|
751
|
-
const context = {
|
|
752
|
-
prompt,
|
|
753
|
-
vars: test.vars || {},
|
|
754
|
-
};
|
|
755
|
-
const response = await (0, fetch_1.fetchWithRetries)(renderedValue, {
|
|
756
|
-
method: 'POST',
|
|
757
|
-
headers: {
|
|
758
|
-
'Content-Type': 'application/json',
|
|
759
|
-
},
|
|
760
|
-
body: JSON.stringify({ output, context }),
|
|
761
|
-
}, (0, envars_1.getEnvInt)('WEBHOOK_TIMEOUT', 5000));
|
|
762
|
-
if (!response.ok) {
|
|
763
|
-
throw new Error(`Webhook response status: ${response.status}`);
|
|
764
|
-
}
|
|
765
|
-
const jsonResponse = await response.json();
|
|
766
|
-
pass = jsonResponse.pass !== inverse;
|
|
767
|
-
score =
|
|
768
|
-
typeof jsonResponse.score === 'undefined'
|
|
769
|
-
? pass
|
|
770
|
-
? 1
|
|
771
|
-
: 0
|
|
772
|
-
: inverse
|
|
773
|
-
? 1 - jsonResponse.score
|
|
774
|
-
: jsonResponse.score;
|
|
775
|
-
const reason = jsonResponse.reason ||
|
|
776
|
-
(pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`);
|
|
777
|
-
return {
|
|
778
|
-
pass,
|
|
779
|
-
score,
|
|
780
|
-
reason,
|
|
781
|
-
assertion,
|
|
782
|
-
};
|
|
783
|
-
}
|
|
784
|
-
catch (err) {
|
|
785
|
-
return {
|
|
786
|
-
pass: false,
|
|
787
|
-
score: 0,
|
|
788
|
-
reason: `Webhook error: ${err.message}`,
|
|
789
|
-
assertion,
|
|
790
|
-
};
|
|
791
|
-
}
|
|
792
|
-
}
|
|
793
|
-
if (baseType === 'rouge-n') {
|
|
794
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', '"rouge" assertion type must be a string value');
|
|
795
|
-
return handleRougeScore(baseType, assertion, renderedValue, outputString, inverse);
|
|
796
|
-
}
|
|
797
|
-
if (baseType === 'bleu') {
|
|
798
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string' ||
|
|
799
|
-
(Array.isArray(renderedValue) && renderedValue.every((v) => typeof v === 'string')), '"bleu" assertion type must have a string or array of strings value');
|
|
800
|
-
return (0, bleu_1.handleBleuScore)(assertion, renderedValue, outputString, inverse);
|
|
801
|
-
}
|
|
802
|
-
if (baseType === 'levenshtein') {
|
|
803
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string', '"levenshtein" assertion type must have a string value');
|
|
804
|
-
const levDistance = (0, fastest_levenshtein_1.distance)(outputString, renderedValue);
|
|
805
|
-
pass = levDistance <= (assertion.threshold || 5);
|
|
806
|
-
return {
|
|
807
|
-
pass,
|
|
808
|
-
score: pass ? 1 : 0,
|
|
809
|
-
reason: pass
|
|
810
|
-
? 'Assertion passed'
|
|
811
|
-
: `Levenshtein distance ${levDistance} is greater than threshold ${assertion.threshold || 5}`,
|
|
812
|
-
assertion,
|
|
813
|
-
};
|
|
814
|
-
}
|
|
815
|
-
if (baseType === 'classifier') {
|
|
816
|
-
(0, tiny_invariant_1.default)(typeof renderedValue === 'string' || typeof renderedValue === 'undefined', '"classifier" assertion type must have a string value or be undefined');
|
|
817
|
-
// Assertion provider overrides test provider
|
|
818
|
-
const classificationResult = await (0, matchers_1.matchesClassification)(renderedValue, outputString, assertion.threshold ?? 1, test.options);
|
|
819
|
-
if (inverse) {
|
|
820
|
-
classificationResult.pass = !classificationResult.pass;
|
|
821
|
-
classificationResult.score = 1 - classificationResult.score;
|
|
822
|
-
}
|
|
823
|
-
return {
|
|
824
|
-
assertion,
|
|
825
|
-
...classificationResult,
|
|
826
|
-
};
|
|
827
|
-
}
|
|
828
|
-
if (baseType === 'latency') {
|
|
829
|
-
if (!assertion.threshold) {
|
|
830
|
-
throw new Error('Latency assertion must have a threshold in milliseconds');
|
|
831
|
-
}
|
|
832
|
-
if (latencyMs === undefined) {
|
|
833
|
-
throw new Error('Latency assertion does not support cached results. Rerun the eval with --no-cache');
|
|
834
|
-
}
|
|
835
|
-
pass = latencyMs <= assertion.threshold;
|
|
836
|
-
return {
|
|
837
|
-
pass,
|
|
838
|
-
score: pass ? 1 : 0,
|
|
839
|
-
reason: pass
|
|
840
|
-
? 'Assertion passed'
|
|
841
|
-
: `Latency ${latencyMs}ms is greater than threshold ${assertion.threshold}ms`,
|
|
842
|
-
assertion,
|
|
843
|
-
};
|
|
844
|
-
}
|
|
845
|
-
if (baseType === 'perplexity') {
|
|
846
|
-
return (0, perplexity_1.handlePerplexity)(logProbs, assertion);
|
|
847
|
-
}
|
|
848
|
-
if (baseType === 'perplexity-score') {
|
|
849
|
-
return (0, perplexity_1.handlePerplexityScore)(logProbs, assertion);
|
|
850
|
-
}
|
|
851
|
-
if (baseType === 'cost') {
|
|
852
|
-
if (!assertion.threshold) {
|
|
853
|
-
throw new Error('Cost assertion must have a threshold');
|
|
854
|
-
}
|
|
855
|
-
if (typeof cost === 'undefined') {
|
|
856
|
-
throw new Error('Cost assertion does not support providers that do not return cost');
|
|
857
|
-
}
|
|
858
|
-
pass = cost <= assertion.threshold;
|
|
859
|
-
return {
|
|
860
|
-
pass,
|
|
861
|
-
score: pass ? 1 : 0,
|
|
862
|
-
reason: pass
|
|
863
|
-
? 'Assertion passed'
|
|
864
|
-
: `Cost ${cost.toPrecision(2)} is greater than threshold ${assertion.threshold}`,
|
|
865
|
-
assertion,
|
|
866
|
-
};
|
|
159
|
+
const assertionParams = {
|
|
160
|
+
assertion,
|
|
161
|
+
baseType,
|
|
162
|
+
context,
|
|
163
|
+
cost,
|
|
164
|
+
inverse,
|
|
165
|
+
latencyMs,
|
|
166
|
+
logProbs,
|
|
167
|
+
output,
|
|
168
|
+
outputString: (0, utils_1.coerceString)(output),
|
|
169
|
+
prompt,
|
|
170
|
+
provider,
|
|
171
|
+
providerResponse,
|
|
172
|
+
renderedValue,
|
|
173
|
+
test: (0, utils_1.getFinalTest)(test, assertion),
|
|
174
|
+
valueFromScript,
|
|
175
|
+
};
|
|
176
|
+
// Map assertion types to their handler functions>
|
|
177
|
+
const assertionHandlers = {
|
|
178
|
+
'answer-relevance': answerRelevance_1.handleAnswerRelevance,
|
|
179
|
+
bleu: bleu_1.handleBleuScore,
|
|
180
|
+
classifier: classifier_1.handleClassifier,
|
|
181
|
+
contains: contains_1.handleContains,
|
|
182
|
+
'contains-all': contains_1.handleContainsAll,
|
|
183
|
+
'contains-any': contains_1.handleContainsAny,
|
|
184
|
+
'contains-json': json_1.handleContainsJson,
|
|
185
|
+
'contains-sql': sql_1.handleContainsSql,
|
|
186
|
+
'contains-xml': xml_1.handleIsXml,
|
|
187
|
+
'context-faithfulness': contextFaithfulness_1.handleContextFaithfulness,
|
|
188
|
+
'context-recall': contextRecall_1.handleContextRecall,
|
|
189
|
+
'context-relevance': contextRelevance_1.handleContextRelevance,
|
|
190
|
+
cost: cost_1.handleCost,
|
|
191
|
+
equals: equals_1.handleEquals,
|
|
192
|
+
factuality: factuality_1.handleFactuality,
|
|
193
|
+
icontains: contains_1.handleIContains,
|
|
194
|
+
'icontains-all': contains_1.handleIContainsAll,
|
|
195
|
+
'icontains-any': contains_1.handleIContainsAny,
|
|
196
|
+
'is-json': json_1.handleIsJson,
|
|
197
|
+
'is-sql': sql_1.handleIsSql,
|
|
198
|
+
'is-valid-openai-function-call': openai_1.handleIsValidOpenAiFunctionCall,
|
|
199
|
+
'is-valid-openai-tools-call': openai_1.handleIsValidOpenAiToolsCall,
|
|
200
|
+
'is-xml': xml_1.handleIsXml,
|
|
201
|
+
javascript: javascript_1.handleJavascript,
|
|
202
|
+
latency: latency_1.handleLatency,
|
|
203
|
+
levenshtein: levenshtein_1.handleLevenshtein,
|
|
204
|
+
'llm-rubric': llmRubric_1.handleLlmRubric,
|
|
205
|
+
'model-graded-closedqa': modelGradedClosedQa_1.handleModelGradedClosedQa,
|
|
206
|
+
'model-graded-factuality': factuality_1.handleFactuality,
|
|
207
|
+
moderation: moderation_1.handleModeration,
|
|
208
|
+
perplexity: perplexity_1.handlePerplexity,
|
|
209
|
+
'perplexity-score': perplexity_1.handlePerplexityScore,
|
|
210
|
+
python: python_1.handlePython,
|
|
211
|
+
regex: regex_1.handleRegex,
|
|
212
|
+
'rouge-n': rouge_1.handleRougeScore,
|
|
213
|
+
similar: similar_1.handleSimilar,
|
|
214
|
+
'starts-with': startsWith_1.handleStartsWith,
|
|
215
|
+
webhook: webhook_1.handleWebhook,
|
|
216
|
+
};
|
|
217
|
+
const handler = assertionHandlers[baseType];
|
|
218
|
+
if (handler) {
|
|
219
|
+
return handler(assertionParams);
|
|
867
220
|
}
|
|
868
221
|
if (baseType.startsWith('promptfoo:redteam:')) {
|
|
869
|
-
|
|
870
|
-
(0, tiny_invariant_1.default)(grader, `Unknown promptfoo grader: ${baseType}`);
|
|
871
|
-
(0, tiny_invariant_1.default)(prompt, `Promptfoo grader ${baseType} must have a prompt`);
|
|
872
|
-
const { grade, rubric, suggestions } = await grader.getResult(prompt, outputString, test, provider, renderedValue);
|
|
873
|
-
return {
|
|
874
|
-
assertion: {
|
|
875
|
-
...assertion,
|
|
876
|
-
value: rubric,
|
|
877
|
-
},
|
|
878
|
-
...grade,
|
|
879
|
-
suggestions,
|
|
880
|
-
metadata: {
|
|
881
|
-
// Pass through all test metadata for redteam
|
|
882
|
-
...test.metadata,
|
|
883
|
-
...grade.metadata,
|
|
884
|
-
},
|
|
885
|
-
};
|
|
222
|
+
return (0, redteam_1.handleRedteam)(assertionParams);
|
|
886
223
|
}
|
|
887
224
|
throw new Error('Unknown assertion type: ' + assertion.type);
|
|
888
225
|
}
|