promptfoo 0.119.13 → 0.119.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +28 -26
- package/dist/src/app/assets/index-eJ2lMe94.js +51 -0
- package/dist/src/app/assets/{source-map-support-Bnh0UQ2S.js → source-map-support-1v4oeb7P.js} +1 -1
- package/dist/src/app/assets/sync-CtLQRuC1.js +1 -0
- package/dist/src/app/assets/{vendor-charts-T60Uk0Z3.js → vendor-charts-DnVv66VV.js} +1 -1
- package/dist/src/app/assets/{vendor-markdown-DLig-KJh.js → vendor-markdown-DCpQIyMA.js} +1 -1
- package/dist/src/app/assets/{vendor-mui-core-5BLaiG3c.js → vendor-mui-core-Boqnpf9f.js} +1 -1
- package/dist/src/app/assets/{vendor-mui-icons-fn39Fu2e.js → vendor-mui-icons-B8MqoVbj.js} +1 -1
- package/dist/src/app/assets/vendor-mui-x-CGSS6QHF.js +45 -0
- package/dist/src/app/assets/{vendor-utils-DYBMEuwX.js → vendor-utils-DdfHIEy8.js} +1 -1
- package/dist/src/app/index.html +7 -7
- package/dist/src/assertions/guardrails.d.ts +1 -1
- package/dist/src/assertions/guardrails.js +18 -9
- package/dist/src/assertions/index.d.ts +1 -1
- package/dist/src/assertions/index.js +9 -3
- package/dist/src/assertions/searchRubric.d.ts +3 -0
- package/dist/src/assertions/searchRubric.js +18 -0
- package/dist/src/commands/eval.js +1 -1
- package/dist/src/commands/modelScan.d.ts +7 -1
- package/dist/src/commands/modelScan.js +121 -59
- package/dist/src/database/index.d.ts +6 -0
- package/dist/src/database/index.js +11 -0
- package/dist/src/database/tables.d.ts +46 -24
- package/dist/src/envars.d.ts +17 -0
- package/dist/src/generated/constants.js +1 -1
- package/dist/src/logger.d.ts +5 -0
- package/dist/src/logger.js +28 -0
- package/dist/src/main.js +17 -6
- package/dist/src/matchers.d.ts +1 -0
- package/dist/src/matchers.js +80 -0
- package/dist/src/models/eval.d.ts +2 -1
- package/dist/src/models/eval.js +44 -2
- package/dist/src/prompts/grading.d.ts +1 -0
- package/dist/src/prompts/grading.js +26 -1
- package/dist/src/prompts/index.d.ts +1 -0
- package/dist/src/prompts/index.js +4 -1
- package/dist/src/providers/adaline.gateway.js +2 -2
- package/dist/src/providers/anthropic/defaults.d.ts +1 -1
- package/dist/src/providers/anthropic/defaults.js +15 -0
- package/dist/src/providers/azure/chat.d.ts +3 -1
- package/dist/src/providers/azure/chat.js +16 -3
- package/dist/src/providers/azure/defaults.js +660 -141
- package/dist/src/providers/azure/responses.d.ts +5 -0
- package/dist/src/providers/azure/responses.js +33 -4
- package/dist/src/providers/azure/types.d.ts +4 -0
- package/dist/src/providers/bedrock/agents.d.ts +1 -1
- package/dist/src/providers/bedrock/agents.js +2 -2
- package/dist/src/providers/bedrock/base.d.ts +40 -0
- package/dist/src/providers/bedrock/base.js +171 -0
- package/dist/src/providers/bedrock/converse.d.ts +146 -0
- package/dist/src/providers/bedrock/converse.js +1044 -0
- package/dist/src/providers/bedrock/index.d.ts +1 -34
- package/dist/src/providers/bedrock/index.js +4 -159
- package/dist/src/providers/bedrock/knowledgeBase.d.ts +1 -1
- package/dist/src/providers/bedrock/knowledgeBase.js +2 -2
- package/dist/src/providers/bedrock/nova-sonic.d.ts +2 -1
- package/dist/src/providers/bedrock/nova-sonic.js +2 -2
- package/dist/src/providers/claude-agent-sdk.d.ts +58 -1
- package/dist/src/providers/claude-agent-sdk.js +22 -1
- package/dist/src/providers/defaults.js +4 -0
- package/dist/src/providers/github/defaults.js +6 -6
- package/dist/src/providers/google/types.d.ts +25 -0
- package/dist/src/providers/google/util.d.ts +2 -0
- package/dist/src/providers/google/vertex.js +78 -22
- package/dist/src/providers/{groq.d.ts → groq/chat.d.ts} +26 -20
- package/dist/src/providers/groq/chat.js +79 -0
- package/dist/src/providers/groq/index.d.ts +5 -0
- package/dist/src/providers/groq/index.js +24 -0
- package/dist/src/providers/groq/responses.d.ts +106 -0
- package/dist/src/providers/groq/responses.js +64 -0
- package/dist/src/providers/groq/types.d.ts +44 -0
- package/dist/src/providers/groq/types.js +3 -0
- package/dist/src/providers/groq/util.d.ts +15 -0
- package/dist/src/providers/groq/util.js +28 -0
- package/dist/src/providers/mcp/client.d.ts +8 -0
- package/dist/src/providers/mcp/client.js +60 -10
- package/dist/src/providers/mcp/types.d.ts +21 -0
- package/dist/src/providers/openai/chatkit-pool.d.ts +114 -0
- package/dist/src/providers/openai/chatkit-pool.js +548 -0
- package/dist/src/providers/openai/chatkit-types.d.ts +73 -0
- package/dist/src/providers/openai/chatkit-types.js +3 -0
- package/dist/src/providers/openai/chatkit.d.ts +76 -0
- package/dist/src/providers/openai/chatkit.js +879 -0
- package/dist/src/providers/openai/codex-sdk.d.ts +109 -0
- package/dist/src/providers/openai/codex-sdk.js +346 -0
- package/dist/src/providers/openai/defaults.d.ts +2 -0
- package/dist/src/providers/openai/defaults.js +10 -4
- package/dist/src/providers/registry.js +48 -9
- package/dist/src/providers/responses/types.d.ts +1 -1
- package/dist/src/providers/sagemaker.d.ts +2 -2
- package/dist/src/providers/webSearchUtils.d.ts +17 -0
- package/dist/src/providers/webSearchUtils.js +169 -0
- package/dist/src/providers/xai/chat.d.ts +61 -0
- package/dist/src/providers/xai/chat.js +68 -3
- package/dist/src/providers/xai/responses.d.ts +189 -0
- package/dist/src/providers/xai/responses.js +268 -0
- package/dist/src/redteam/constants/plugins.d.ts +1 -1
- package/dist/src/redteam/constants/plugins.js +1 -1
- package/dist/src/redteam/constants/strategies.d.ts +1 -1
- package/dist/src/redteam/constants/strategies.js +1 -0
- package/dist/src/redteam/plugins/vlguard.d.ts +53 -4
- package/dist/src/redteam/plugins/vlguard.js +362 -46
- package/dist/src/redteam/providers/constants.d.ts +2 -2
- package/dist/src/redteam/providers/constants.js +2 -2
- package/dist/src/redteam/providers/crescendo/index.d.ts +1 -1
- package/dist/src/redteam/providers/crescendo/index.js +5 -3
- package/dist/src/redteam/providers/hydra/index.js +1 -1
- package/dist/src/server/routes/modelAudit.js +4 -4
- package/dist/src/share.js +4 -2
- package/dist/src/telemetry.js +44 -8
- package/dist/src/types/env.d.ts +3 -0
- package/dist/src/types/env.js +1 -0
- package/dist/src/types/index.d.ts +896 -615
- package/dist/src/types/index.js +1 -0
- package/dist/src/types/providers.d.ts +1 -0
- package/dist/src/types/tracing.d.ts +3 -0
- package/dist/src/util/database.d.ts +6 -4
- package/dist/src/util/file.js +6 -4
- package/dist/src/util/modelAuditCliParser.d.ts +4 -4
- package/dist/src/util/xlsx.js +52 -26
- package/dist/src/validators/providers.d.ts +142 -122
- package/dist/src/validators/providers.js +4 -6
- package/dist/src/validators/redteam.d.ts +36 -28
- package/dist/src/validators/redteam.js +9 -3
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +28 -26
- package/dist/drizzle/CLAUDE.md +0 -65
- package/dist/src/app/assets/index-DifT6VGT.js +0 -51
- package/dist/src/app/assets/sync-Oo-W_Rbj.js +0 -1
- package/dist/src/app/assets/vendor-mui-x-C2xF-yiO.js +0 -45
- package/dist/src/providers/groq.js +0 -48
package/dist/src/envars.d.ts
CHANGED
|
@@ -107,6 +107,22 @@ type EnvVars = {
|
|
|
107
107
|
REQUEST_TIMEOUT_MS?: number;
|
|
108
108
|
RESULT_HISTORY_LENGTH?: number;
|
|
109
109
|
WEBHOOK_TIMEOUT?: number;
|
|
110
|
+
/**
|
|
111
|
+
* Default timeout in milliseconds for MCP tool calls.
|
|
112
|
+
* This overrides the MCP SDK's default 60-second timeout.
|
|
113
|
+
* Can be overridden per-provider via config.mcp.timeout.
|
|
114
|
+
*/
|
|
115
|
+
MCP_REQUEST_TIMEOUT_MS?: number;
|
|
116
|
+
/**
|
|
117
|
+
* Enable debug logging for MCP connections.
|
|
118
|
+
* Can be overridden per-provider via config.mcp.debug.
|
|
119
|
+
*/
|
|
120
|
+
MCP_DEBUG?: boolean;
|
|
121
|
+
/**
|
|
122
|
+
* Enable verbose output for MCP connections.
|
|
123
|
+
* Can be overridden per-provider via config.mcp.verbose.
|
|
124
|
+
*/
|
|
125
|
+
MCP_VERBOSE?: boolean;
|
|
110
126
|
PROMPTFOO_POSTHOG_KEY?: string;
|
|
111
127
|
PROMPTFOO_POSTHOG_HOST?: string;
|
|
112
128
|
/**
|
|
@@ -208,6 +224,7 @@ type EnvVars = {
|
|
|
208
224
|
OPENAI_STOP?: string;
|
|
209
225
|
OPENAI_TEMPERATURE?: number;
|
|
210
226
|
OPENAI_TOP_P?: number;
|
|
227
|
+
CODEX_API_KEY?: string;
|
|
211
228
|
OPENROUTER_API_KEY?: string;
|
|
212
229
|
PORTKEY_API_BASE_URL?: string;
|
|
213
230
|
PORTKEY_API_KEY?: string;
|
|
@@ -2,6 +2,6 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.POSTHOG_KEY = void 0;
|
|
4
4
|
// This file is auto-generated during build. Do not edit manually.
|
|
5
|
-
// Generated at: 2025-
|
|
5
|
+
// Generated at: 2025-12-01T18:11:57.753Z
|
|
6
6
|
exports.POSTHOG_KEY = 'phc_E5n5uHnDo2eREJL1uqX1cIlbkoRby4yFWt3V94HqRRg';
|
|
7
7
|
//# sourceMappingURL=constants.js.map
|
package/dist/src/logger.d.ts
CHANGED
|
@@ -73,5 +73,10 @@ export declare function logRequestResponse(options: {
|
|
|
73
73
|
response?: Response | null;
|
|
74
74
|
error?: boolean;
|
|
75
75
|
}): Promise<void>;
|
|
76
|
+
/**
|
|
77
|
+
* Close all file transports and cleanup logger resources
|
|
78
|
+
* Should be called during graceful shutdown to prevent event loop hanging
|
|
79
|
+
*/
|
|
80
|
+
export declare function closeLogger(): void;
|
|
76
81
|
export default logger;
|
|
77
82
|
//# sourceMappingURL=logger.d.ts.map
|
package/dist/src/logger.js
CHANGED
|
@@ -47,6 +47,7 @@ exports.isDebugEnabled = isDebugEnabled;
|
|
|
47
47
|
exports.initializeRunLogging = initializeRunLogging;
|
|
48
48
|
exports.setLogger = setLogger;
|
|
49
49
|
exports.logRequestResponse = logRequestResponse;
|
|
50
|
+
exports.closeLogger = closeLogger;
|
|
50
51
|
const fs_1 = __importDefault(require("fs"));
|
|
51
52
|
const path_1 = __importDefault(require("path"));
|
|
52
53
|
const chalk_1 = __importDefault(require("chalk"));
|
|
@@ -389,6 +390,33 @@ async function logRequestResponse(options) {
|
|
|
389
390
|
logMethod(`Api Request`, logObject);
|
|
390
391
|
}
|
|
391
392
|
}
|
|
393
|
+
/**
|
|
394
|
+
* Close all file transports and cleanup logger resources
|
|
395
|
+
* Should be called during graceful shutdown to prevent event loop hanging
|
|
396
|
+
*/
|
|
397
|
+
function closeLogger() {
|
|
398
|
+
try {
|
|
399
|
+
// Close all file transports
|
|
400
|
+
const fileTransports = exports.winstonLogger.transports.filter((transport) => transport instanceof winston_1.default.transports.File);
|
|
401
|
+
for (const transport of fileTransports) {
|
|
402
|
+
const filename = transport.filename;
|
|
403
|
+
if (filename) {
|
|
404
|
+
logger.debug(`Closing log file: ${filename}`);
|
|
405
|
+
}
|
|
406
|
+
if (typeof transport.close === 'function') {
|
|
407
|
+
transport.close();
|
|
408
|
+
}
|
|
409
|
+
exports.winstonLogger.remove(transport);
|
|
410
|
+
}
|
|
411
|
+
if (fileTransports.length > 0) {
|
|
412
|
+
logger.debug('Logger cleanup complete');
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
catch (error) {
|
|
416
|
+
// Can't use logger here since we're shutting it down
|
|
417
|
+
console.error(`Error closing logger: ${error}`);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
392
420
|
// Initialize source maps if debug is enabled at startup
|
|
393
421
|
if ((0, envars_1.getEnvString)('LOG_LEVEL', 'info') === 'debug') {
|
|
394
422
|
initializeSourceMapSupport();
|
package/dist/src/main.js
CHANGED
|
@@ -39,12 +39,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
39
39
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
40
40
|
exports.addCommonOptionsRecursively = addCommonOptionsRecursively;
|
|
41
41
|
const commander_1 = require("commander");
|
|
42
|
+
const undici_1 = require("undici");
|
|
42
43
|
const package_json_1 = require("../package.json");
|
|
43
44
|
const checkNodeVersion_1 = require("./checkNodeVersion");
|
|
44
45
|
const cliState_1 = __importDefault(require("./cliState"));
|
|
46
|
+
const index_1 = require("./codeScan/index");
|
|
45
47
|
const auth_1 = require("./commands/auth");
|
|
46
48
|
const cache_1 = require("./commands/cache");
|
|
47
|
-
const index_1 = require("./codeScan/index");
|
|
48
49
|
const config_1 = require("./commands/config");
|
|
49
50
|
const debug_1 = require("./commands/debug");
|
|
50
51
|
const delete_1 = require("./commands/delete");
|
|
@@ -63,6 +64,7 @@ const share_1 = require("./commands/share");
|
|
|
63
64
|
const show_1 = require("./commands/show");
|
|
64
65
|
const validate_1 = require("./commands/validate");
|
|
65
66
|
const view_1 = require("./commands/view");
|
|
67
|
+
const index_3 = require("./database/index");
|
|
66
68
|
const logger_1 = __importStar(require("./logger"));
|
|
67
69
|
const migrate_1 = require("./migrate");
|
|
68
70
|
const discover_1 = require("./redteam/commands/discover");
|
|
@@ -76,8 +78,8 @@ const simba_1 = require("./redteam/commands/simba");
|
|
|
76
78
|
const telemetry_1 = __importDefault(require("./telemetry"));
|
|
77
79
|
const updates_1 = require("./updates");
|
|
78
80
|
const default_1 = require("./util/config/default");
|
|
79
|
-
const
|
|
80
|
-
const
|
|
81
|
+
const index_4 = require("./util/errors/index");
|
|
82
|
+
const index_5 = require("./util/index");
|
|
81
83
|
/**
|
|
82
84
|
* Adds verbose and env-file options to all commands recursively
|
|
83
85
|
*/
|
|
@@ -97,7 +99,7 @@ function addCommonOptionsRecursively(command) {
|
|
|
97
99
|
}
|
|
98
100
|
const envPath = thisCommand.opts().envFile || thisCommand.opts().envPath;
|
|
99
101
|
if (envPath) {
|
|
100
|
-
(0,
|
|
102
|
+
(0, index_5.setupEnv)(envPath);
|
|
101
103
|
logger_1.default.debug(`Loading environment from ${envPath}`);
|
|
102
104
|
}
|
|
103
105
|
});
|
|
@@ -159,12 +161,12 @@ async function main() {
|
|
|
159
161
|
// Add common options to all commands recursively
|
|
160
162
|
addCommonOptionsRecursively(program);
|
|
161
163
|
program.hook('postAction', async () => {
|
|
162
|
-
(0,
|
|
164
|
+
(0, index_4.printErrorInformation)(cliState_1.default.errorLogFile, cliState_1.default.debugLogFile);
|
|
163
165
|
if (cliState_1.default.postActionCallback) {
|
|
164
166
|
await cliState_1.default.postActionCallback();
|
|
165
167
|
}
|
|
166
168
|
});
|
|
167
|
-
program.
|
|
169
|
+
await program.parseAsync();
|
|
168
170
|
}
|
|
169
171
|
if (require.main === module) {
|
|
170
172
|
(0, checkNodeVersion_1.checkNodeVersion)();
|
|
@@ -172,6 +174,15 @@ if (require.main === module) {
|
|
|
172
174
|
logger_1.default.debug('Shutting down gracefully...');
|
|
173
175
|
await telemetry_1.default.shutdown();
|
|
174
176
|
logger_1.default.debug('Shutdown complete');
|
|
177
|
+
(0, logger_1.closeLogger)();
|
|
178
|
+
(0, index_3.closeDbIfOpen)();
|
|
179
|
+
try {
|
|
180
|
+
const dispatcher = (0, undici_1.getGlobalDispatcher)();
|
|
181
|
+
await dispatcher.destroy();
|
|
182
|
+
}
|
|
183
|
+
catch {
|
|
184
|
+
// Silently handle dispatcher destroy errors
|
|
185
|
+
}
|
|
175
186
|
});
|
|
176
187
|
}
|
|
177
188
|
//# sourceMappingURL=main.js.map
|
package/dist/src/matchers.d.ts
CHANGED
|
@@ -35,6 +35,7 @@ interface ModerationMatchOptions {
|
|
|
35
35
|
assistantResponse: string;
|
|
36
36
|
categories?: string[];
|
|
37
37
|
}
|
|
38
|
+
export declare function matchesSearchRubric(rubric: string, llmOutput: string, grading?: GradingConfig, vars?: Record<string, string | object>, assertion?: Assertion, _provider?: ApiProvider): Promise<GradingResult>;
|
|
38
39
|
export declare function matchesModeration({ userPrompt, assistantResponse, categories }: ModerationMatchOptions, grading?: GradingConfig): Promise<{
|
|
39
40
|
pass: boolean;
|
|
40
41
|
score: number;
|
package/dist/src/matchers.js
CHANGED
|
@@ -20,6 +20,7 @@ exports.matchesContextRelevance = matchesContextRelevance;
|
|
|
20
20
|
exports.matchesContextFaithfulness = matchesContextFaithfulness;
|
|
21
21
|
exports.matchesSelectBest = matchesSelectBest;
|
|
22
22
|
exports.selectMaxScore = selectMaxScore;
|
|
23
|
+
exports.matchesSearchRubric = matchesSearchRubric;
|
|
23
24
|
exports.matchesModeration = matchesModeration;
|
|
24
25
|
const path_1 = __importDefault(require("path"));
|
|
25
26
|
const utils_1 = require("./assertions/utils");
|
|
@@ -29,6 +30,8 @@ const logger_1 = __importDefault(require("./logger"));
|
|
|
29
30
|
const index_1 = require("./prompts/index");
|
|
30
31
|
const index_2 = require("./providers/index");
|
|
31
32
|
const defaults_1 = require("./providers/defaults");
|
|
33
|
+
const webSearchUtils_1 = require("./providers/webSearchUtils");
|
|
34
|
+
const grading_1 = require("./prompts/grading");
|
|
32
35
|
const constants_1 = require("./redteam/constants");
|
|
33
36
|
const remoteGeneration_1 = require("./redteam/remoteGeneration");
|
|
34
37
|
const remoteGrading_1 = require("./remoteGrading");
|
|
@@ -1224,6 +1227,83 @@ async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
|
|
|
1224
1227
|
};
|
|
1225
1228
|
});
|
|
1226
1229
|
}
|
|
1230
|
+
async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider) {
|
|
1231
|
+
if (!grading) {
|
|
1232
|
+
throw new Error('Cannot grade output without grading config. Specify --grader option or grading config.');
|
|
1233
|
+
}
|
|
1234
|
+
// Search rubric assertion is like llm-rubric but with web search capabilities
|
|
1235
|
+
const defaultProviders = await (0, defaults_1.getDefaultProviders)();
|
|
1236
|
+
// Get a provider with web search capabilities
|
|
1237
|
+
let searchProvider = grading.provider ||
|
|
1238
|
+
defaultProviders.webSearchProvider ||
|
|
1239
|
+
defaultProviders.llmRubricProvider ||
|
|
1240
|
+
defaultProviders.gradingProvider;
|
|
1241
|
+
// Check if current provider has web search, if not try to load one
|
|
1242
|
+
if (!(0, webSearchUtils_1.hasWebSearchCapability)(searchProvider)) {
|
|
1243
|
+
// Try to load a provider with web search capabilities
|
|
1244
|
+
// For search-rubric assertion, prefer Anthropic first (pass true)
|
|
1245
|
+
const webSearchProvider = await (0, webSearchUtils_1.loadWebSearchProvider)(true);
|
|
1246
|
+
if (webSearchProvider) {
|
|
1247
|
+
searchProvider = webSearchProvider;
|
|
1248
|
+
}
|
|
1249
|
+
}
|
|
1250
|
+
// Ensure we have a provider with web search capabilities
|
|
1251
|
+
if (!searchProvider || !(0, webSearchUtils_1.hasWebSearchCapability)(searchProvider)) {
|
|
1252
|
+
throw new Error('search-rubric assertion requires a grading provider with web search capabilities. ' +
|
|
1253
|
+
'Use --grader with a web search provider (e.g., anthropic:messages:claude-sonnet-4, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider');
|
|
1254
|
+
}
|
|
1255
|
+
// Load the web search rubric prompt
|
|
1256
|
+
const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, grading_1.DEFAULT_WEB_SEARCH_PROMPT);
|
|
1257
|
+
const prompt = await renderLlmRubricPrompt(rubricPrompt, {
|
|
1258
|
+
output: tryParse(llmOutput),
|
|
1259
|
+
rubric,
|
|
1260
|
+
...(vars || {}),
|
|
1261
|
+
});
|
|
1262
|
+
// Get the evaluation from the search provider
|
|
1263
|
+
const resp = await searchProvider.callApi(prompt);
|
|
1264
|
+
if (resp.error || !resp.output) {
|
|
1265
|
+
return {
|
|
1266
|
+
pass: false,
|
|
1267
|
+
score: 0,
|
|
1268
|
+
reason: `Search rubric evaluation failed: ${resp.error || 'No output'}`,
|
|
1269
|
+
tokensUsed: resp.tokenUsage,
|
|
1270
|
+
assertion,
|
|
1271
|
+
};
|
|
1272
|
+
}
|
|
1273
|
+
// Parse the response
|
|
1274
|
+
try {
|
|
1275
|
+
const result = (0, json_1.extractFirstJsonObject)(String(resp.output));
|
|
1276
|
+
// Apply threshold if specified
|
|
1277
|
+
let pass = result.pass ?? false;
|
|
1278
|
+
const score = typeof result.score === 'number' ? result.score : pass ? 1 : 0;
|
|
1279
|
+
if (assertion?.threshold !== undefined) {
|
|
1280
|
+
pass = pass && score >= assertion.threshold;
|
|
1281
|
+
}
|
|
1282
|
+
return {
|
|
1283
|
+
pass,
|
|
1284
|
+
score,
|
|
1285
|
+
reason: result.reason || 'No reason provided',
|
|
1286
|
+
tokensUsed: resp.tokenUsage,
|
|
1287
|
+
assertion,
|
|
1288
|
+
metadata: {
|
|
1289
|
+
searchResults: result.searchResults || [],
|
|
1290
|
+
searchProvider: searchProvider.id(),
|
|
1291
|
+
},
|
|
1292
|
+
};
|
|
1293
|
+
}
|
|
1294
|
+
catch {
|
|
1295
|
+
// Try to parse as a simple pass/fail
|
|
1296
|
+
const outputLower = String(resp.output).toLowerCase();
|
|
1297
|
+
const pass = outputLower.includes('"pass":true') || outputLower.includes('"pass": true');
|
|
1298
|
+
return {
|
|
1299
|
+
pass,
|
|
1300
|
+
score: pass ? 1 : 0,
|
|
1301
|
+
reason: resp.output,
|
|
1302
|
+
tokensUsed: resp.tokenUsage,
|
|
1303
|
+
assertion,
|
|
1304
|
+
};
|
|
1305
|
+
}
|
|
1306
|
+
}
|
|
1227
1307
|
async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
|
|
1228
1308
|
if (!assistantResponse) {
|
|
1229
1309
|
return {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { type CompletedPrompt, type EvalSummary, type EvaluateResult, type EvaluateStats, type EvaluateSummaryV2, type EvaluateSummaryV3, type EvaluateTable, type EvaluateTableRow, type Prompt, type ResultsFile, type UnifiedConfig } from '../types/index';
|
|
2
2
|
import EvalResult from './evalResult';
|
|
3
|
-
import type { EvalResultsFilterMode } from '../types/index';
|
|
3
|
+
import type { EvalResultsFilterMode, TraceData } from '../types/index';
|
|
4
4
|
export declare function createEvalId(createdAt?: Date): string;
|
|
5
5
|
/** Result from queries extracting variable keys with eval IDs */
|
|
6
6
|
export interface VarKeyWithEvalIdResult {
|
|
@@ -181,6 +181,7 @@ export default class Eval {
|
|
|
181
181
|
clearResults(): void;
|
|
182
182
|
getStats(): EvaluateStats;
|
|
183
183
|
toEvaluateSummary(): Promise<EvaluateSummaryV3 | EvaluateSummaryV2>;
|
|
184
|
+
getTraces(): Promise<TraceData[]>;
|
|
184
185
|
toResultsFile(): Promise<ResultsFile>;
|
|
185
186
|
delete(): Promise<void>;
|
|
186
187
|
/**
|
package/dist/src/models/eval.js
CHANGED
|
@@ -17,8 +17,11 @@ const accounts_1 = require("../globalConfig/accounts");
|
|
|
17
17
|
const logger_1 = __importDefault(require("../logger"));
|
|
18
18
|
const utils_1 = require("../prompts/utils");
|
|
19
19
|
const constants_2 = require("../redteam/constants");
|
|
20
|
+
const metrics_1 = require("../redteam/metrics");
|
|
20
21
|
const sharedFrontend_1 = require("../redteam/sharedFrontend");
|
|
22
|
+
const store_1 = require("../tracing/store");
|
|
21
23
|
const index_2 = require("../types/index");
|
|
24
|
+
const calculateFilteredMetrics_1 = require("../util/calculateFilteredMetrics");
|
|
22
25
|
const convertEvalResultsToTable_1 = require("../util/convertEvalResultsToTable");
|
|
23
26
|
const createHash_1 = require("../util/createHash");
|
|
24
27
|
const index_3 = require("../util/exportToFile/index");
|
|
@@ -27,8 +30,6 @@ const time_1 = require("../util/time");
|
|
|
27
30
|
const tokenUsageUtils_1 = require("../util/tokenUsageUtils");
|
|
28
31
|
const evalPerformance_1 = require("./evalPerformance");
|
|
29
32
|
const evalResult_1 = __importDefault(require("./evalResult"));
|
|
30
|
-
const calculateFilteredMetrics_1 = require("../util/calculateFilteredMetrics");
|
|
31
|
-
const metrics_1 = require("../redteam/metrics");
|
|
32
33
|
/**
|
|
33
34
|
* Sanitizes runtime options to ensure only JSON-serializable data is persisted.
|
|
34
35
|
* Removes non-serializable fields like AbortSignal, functions, and symbols.
|
|
@@ -813,7 +814,47 @@ class Eval {
|
|
|
813
814
|
stats,
|
|
814
815
|
};
|
|
815
816
|
}
|
|
817
|
+
async getTraces() {
|
|
818
|
+
try {
|
|
819
|
+
const traceStore = (0, store_1.getTraceStore)();
|
|
820
|
+
const tracesData = await traceStore.getTracesByEvaluation(this.id);
|
|
821
|
+
// Transform trace data to match the expected schema
|
|
822
|
+
return tracesData.map((trace) => ({
|
|
823
|
+
traceId: trace.traceId,
|
|
824
|
+
evaluationId: trace.evaluationId,
|
|
825
|
+
testCaseId: trace.testCaseId,
|
|
826
|
+
metadata: trace.metadata,
|
|
827
|
+
spans: (trace.spans || []).map((span) => {
|
|
828
|
+
// Calculate duration
|
|
829
|
+
const durationMs = span.endTime && span.startTime ? (span.endTime - span.startTime) / 1000000 : undefined;
|
|
830
|
+
// Map status code
|
|
831
|
+
const statusCode = span.statusCode === 1 ? 'ok' : span.statusCode === 2 ? 'error' : 'unset';
|
|
832
|
+
return {
|
|
833
|
+
spanId: span.spanId,
|
|
834
|
+
parentSpanId: span.parentSpanId,
|
|
835
|
+
name: span.name,
|
|
836
|
+
kind: span.kind || 'unspecified',
|
|
837
|
+
startTime: span.startTime,
|
|
838
|
+
endTime: span.endTime,
|
|
839
|
+
durationMs,
|
|
840
|
+
attributes: span.attributes || {},
|
|
841
|
+
status: {
|
|
842
|
+
code: statusCode,
|
|
843
|
+
message: span.statusMessage,
|
|
844
|
+
},
|
|
845
|
+
depth: 0, // Will be calculated on the server side when storing
|
|
846
|
+
events: span.events || [],
|
|
847
|
+
};
|
|
848
|
+
}),
|
|
849
|
+
}));
|
|
850
|
+
}
|
|
851
|
+
catch (error) {
|
|
852
|
+
logger_1.default.debug(`Failed to fetch traces for eval ${this.id}: ${error}`);
|
|
853
|
+
return [];
|
|
854
|
+
}
|
|
855
|
+
}
|
|
816
856
|
async toResultsFile() {
|
|
857
|
+
const traces = await this.getTraces();
|
|
817
858
|
const results = {
|
|
818
859
|
version: this.version(),
|
|
819
860
|
createdAt: new Date(this.createdAt).toISOString(),
|
|
@@ -822,6 +863,7 @@ class Eval {
|
|
|
822
863
|
author: this.author || null,
|
|
823
864
|
prompts: this.getPrompts(),
|
|
824
865
|
datasetId: this.datasetId || null,
|
|
866
|
+
...(traces.length > 0 && { traces }),
|
|
825
867
|
};
|
|
826
868
|
return results;
|
|
827
869
|
}
|
|
@@ -17,7 +17,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
17
17
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
18
18
|
};
|
|
19
19
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
20
|
-
exports.SELECT_BEST_PROMPT = exports.SUGGEST_PROMPTS_SYSTEM_MESSAGE = exports.OPENAI_CLOSED_QA_PROMPT = exports.PROMPTFOO_FACTUALITY_PROMPT = exports.DEFAULT_GRADING_PROMPT = void 0;
|
|
20
|
+
exports.DEFAULT_WEB_SEARCH_PROMPT = exports.SELECT_BEST_PROMPT = exports.SUGGEST_PROMPTS_SYSTEM_MESSAGE = exports.OPENAI_CLOSED_QA_PROMPT = exports.PROMPTFOO_FACTUALITY_PROMPT = exports.DEFAULT_GRADING_PROMPT = void 0;
|
|
21
21
|
const dedent_1 = __importDefault(require("dedent"));
|
|
22
22
|
/**
|
|
23
23
|
* This module exports various prompts used for model graded evaluation.
|
|
@@ -135,4 +135,29 @@ Here are the pieces of text:
|
|
|
135
135
|
Output the index of the text that best fits the criteria. You must output a single integer.`,
|
|
136
136
|
},
|
|
137
137
|
]);
|
|
138
|
+
exports.DEFAULT_WEB_SEARCH_PROMPT = JSON.stringify([
|
|
139
|
+
{
|
|
140
|
+
role: 'system',
|
|
141
|
+
content: (0, dedent_1.default) `You are grading output according to a user-specified rubric, with the ability to search the web for current information. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
142
|
+
|
|
143
|
+
You MUST search the web when:
|
|
144
|
+
- The rubric asks about current information (prices, weather, news, etc.)
|
|
145
|
+
- Facts need to be verified against recent data
|
|
146
|
+
- The rubric references time-sensitive information
|
|
147
|
+
|
|
148
|
+
Examples:
|
|
149
|
+
|
|
150
|
+
<Output>The current CEO of Microsoft is Satya Nadella</Output>
|
|
151
|
+
<Rubric>Contains accurate information about Microsoft's leadership</Rubric>
|
|
152
|
+
{"reason": "I searched and confirmed Satya Nadella is indeed the current CEO of Microsoft", "pass": true, "score": 1.0}
|
|
153
|
+
|
|
154
|
+
<Output>Bitcoin is trading at $45,000</Output>
|
|
155
|
+
<Rubric>Provides current Bitcoin price within 10% accuracy</Rubric>
|
|
156
|
+
{"reason": "Web search shows Bitcoin is currently trading at $98,000, not $45,000. The output is off by more than 50%", "pass": false, "score": 0.0}`,
|
|
157
|
+
},
|
|
158
|
+
{
|
|
159
|
+
role: 'user',
|
|
160
|
+
content: '<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>',
|
|
161
|
+
},
|
|
162
|
+
]);
|
|
138
163
|
//# sourceMappingURL=grading.js.map
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { EvaluateTestSuite, Prompt, TestSuite, UnifiedConfig } from '../types/index';
|
|
2
2
|
export * from './grading';
|
|
3
|
+
export { DEFAULT_WEB_SEARCH_PROMPT } from './grading';
|
|
3
4
|
/**
|
|
4
5
|
* Reads and maps provider prompts based on the configuration and parsed prompts.
|
|
5
6
|
* @param config - The configuration object.
|
|
@@ -17,7 +17,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
17
17
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
18
18
|
};
|
|
19
19
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
20
|
-
exports.GEVAL_PROMPT_EVALUATE = exports.GEVAL_PROMPT_STEPS = void 0;
|
|
20
|
+
exports.GEVAL_PROMPT_EVALUATE = exports.GEVAL_PROMPT_STEPS = exports.DEFAULT_WEB_SEARCH_PROMPT = void 0;
|
|
21
21
|
exports.readProviderPromptMap = readProviderPromptMap;
|
|
22
22
|
exports.readPrompts = readPrompts;
|
|
23
23
|
exports.processPrompts = processPrompts;
|
|
@@ -41,6 +41,8 @@ const text_1 = require("./processors/text");
|
|
|
41
41
|
const yaml_1 = require("./processors/yaml");
|
|
42
42
|
const utils_1 = require("./utils");
|
|
43
43
|
__exportStar(require("./grading"), exports);
|
|
44
|
+
var grading_1 = require("./grading");
|
|
45
|
+
Object.defineProperty(exports, "DEFAULT_WEB_SEARCH_PROMPT", { enumerable: true, get: function () { return grading_1.DEFAULT_WEB_SEARCH_PROMPT; } });
|
|
44
46
|
/**
|
|
45
47
|
* Reads and maps provider prompts based on the configuration and parsed prompts.
|
|
46
48
|
* @param config - The configuration object.
|
|
@@ -213,6 +215,7 @@ async function processPrompts(prompts) {
|
|
|
213
215
|
}
|
|
214
216
|
}))).flat();
|
|
215
217
|
}
|
|
218
|
+
// G-Eval prompts
|
|
216
219
|
exports.GEVAL_PROMPT_STEPS = `
|
|
217
220
|
Given an evaluation criteria which outlines how you should judge some text, generate 3-4 concise evaluation steps for any text based on the criteria below.
|
|
218
221
|
|
|
@@ -25,7 +25,7 @@ const embedding_1 = require("./azure/embedding");
|
|
|
25
25
|
const util_2 = require("./azure/util");
|
|
26
26
|
const ai_studio_1 = require("./google/ai.studio");
|
|
27
27
|
const vertex_2 = require("./google/vertex");
|
|
28
|
-
const
|
|
28
|
+
const index_2 = require("./groq/index");
|
|
29
29
|
const chat_2 = require("./openai/chat");
|
|
30
30
|
const embedding_2 = require("./openai/embedding");
|
|
31
31
|
const util_3 = require("./openai/util");
|
|
@@ -396,7 +396,7 @@ class AdalineGatewayChatProvider extends AdalineGatewayGenericProvider {
|
|
|
396
396
|
}
|
|
397
397
|
else if (this.providerName === 'groq') {
|
|
398
398
|
const provider = new groq_1.Groq();
|
|
399
|
-
const parentClass = new
|
|
399
|
+
const parentClass = new index_2.GroqProvider(this.modelName, this.providerOptions);
|
|
400
400
|
const apiKey = parentClass.getApiKey();
|
|
401
401
|
if (!apiKey) {
|
|
402
402
|
throw new Error('Groq API key is not set. Set the GROQ_API_KEY environment variable or add `apiKey` to the provider config.');
|
|
@@ -14,5 +14,5 @@ export declare class AnthropicLlmRubricProvider extends AnthropicMessagesProvide
|
|
|
14
14
|
* @param env - Optional environment overrides
|
|
15
15
|
* @returns Anthropic provider implementations for various functions
|
|
16
16
|
*/
|
|
17
|
-
export declare function getAnthropicProviders(env?: EnvOverrides): Pick<DefaultProviders, 'gradingJsonProvider' | 'gradingProvider' | 'llmRubricProvider' | 'suggestionsProvider' | 'synthesizeProvider'>;
|
|
17
|
+
export declare function getAnthropicProviders(env?: EnvOverrides): Pick<DefaultProviders, 'gradingJsonProvider' | 'gradingProvider' | 'llmRubricProvider' | 'suggestionsProvider' | 'synthesizeProvider' | 'webSearchProvider'>;
|
|
18
18
|
//# sourceMappingURL=defaults.d.ts.map
|
|
@@ -84,6 +84,19 @@ exports.AnthropicLlmRubricProvider = AnthropicLlmRubricProvider;
|
|
|
84
84
|
// Private provider factories with lazy loading
|
|
85
85
|
const gradingProviderFactory = createLazyProvider((env) => new messages_1.AnthropicMessagesProvider(exports.DEFAULT_ANTHROPIC_MODEL, { env }));
|
|
86
86
|
const llmRubricProviderFactory = createLazyProvider((env) => new AnthropicLlmRubricProvider(exports.DEFAULT_ANTHROPIC_MODEL, { env }));
|
|
87
|
+
// Web Search Provider with web_search tool
|
|
88
|
+
const webSearchProviderFactory = createLazyProvider((env) => new messages_1.AnthropicMessagesProvider(exports.DEFAULT_ANTHROPIC_MODEL, {
|
|
89
|
+
env,
|
|
90
|
+
config: {
|
|
91
|
+
tools: [
|
|
92
|
+
{
|
|
93
|
+
type: 'web_search_20250305',
|
|
94
|
+
name: 'web_search',
|
|
95
|
+
max_uses: 5,
|
|
96
|
+
},
|
|
97
|
+
],
|
|
98
|
+
},
|
|
99
|
+
}));
|
|
87
100
|
/**
|
|
88
101
|
* Gets all default Anthropic providers with the given environment overrides
|
|
89
102
|
* @param env - Optional environment overrides
|
|
@@ -93,12 +106,14 @@ function getAnthropicProviders(env) {
|
|
|
93
106
|
// Get providers with the provided environment variables
|
|
94
107
|
const gradingProvider = gradingProviderFactory.getInstance(env);
|
|
95
108
|
const llmRubricProvider = llmRubricProviderFactory.getInstance(env);
|
|
109
|
+
const webSearchProvider = webSearchProviderFactory.getInstance(env);
|
|
96
110
|
return {
|
|
97
111
|
gradingJsonProvider: gradingProvider,
|
|
98
112
|
gradingProvider,
|
|
99
113
|
llmRubricProvider,
|
|
100
114
|
suggestionsProvider: gradingProvider,
|
|
101
115
|
synthesizeProvider: gradingProvider,
|
|
116
|
+
webSearchProvider,
|
|
102
117
|
};
|
|
103
118
|
}
|
|
104
119
|
//# sourceMappingURL=defaults.js.map
|
|
@@ -7,7 +7,9 @@ export declare class AzureChatCompletionProvider extends AzureGenericProvider {
|
|
|
7
7
|
private initializeMCP;
|
|
8
8
|
cleanup(): Promise<void>;
|
|
9
9
|
/**
|
|
10
|
-
* Check if the current deployment is configured as a reasoning model
|
|
10
|
+
* Check if the current deployment is configured as a reasoning model.
|
|
11
|
+
* Reasoning models use max_completion_tokens instead of max_tokens,
|
|
12
|
+
* don't support temperature, and accept reasoning_effort parameter.
|
|
11
13
|
*/
|
|
12
14
|
protected isReasoningModel(): boolean;
|
|
13
15
|
getOpenAiBody(prompt: string, context?: CallApiContextParams, callApiOptions?: CallApiOptionsParams): Promise<Record<string, any>>;
|
|
@@ -43,7 +43,9 @@ class AzureChatCompletionProvider extends generic_1.AzureGenericProvider {
|
|
|
43
43
|
}
|
|
44
44
|
}
|
|
45
45
|
/**
|
|
46
|
-
* Check if the current deployment is configured as a reasoning model
|
|
46
|
+
* Check if the current deployment is configured as a reasoning model.
|
|
47
|
+
* Reasoning models use max_completion_tokens instead of max_tokens,
|
|
48
|
+
* don't support temperature, and accept reasoning_effort parameter.
|
|
47
49
|
*/
|
|
48
50
|
isReasoningModel() {
|
|
49
51
|
// Check explicit config flags first
|
|
@@ -53,14 +55,25 @@ class AzureChatCompletionProvider extends generic_1.AzureGenericProvider {
|
|
|
53
55
|
// Auto-detect reasoning models by deployment name (case-insensitive)
|
|
54
56
|
// Supports both direct names (o1-preview) and prefixed names (prod-o1-mini)
|
|
55
57
|
const lowerName = this.deploymentName.toLowerCase();
|
|
56
|
-
return (
|
|
58
|
+
return (
|
|
59
|
+
// OpenAI reasoning models
|
|
60
|
+
lowerName.startsWith('o1') ||
|
|
57
61
|
lowerName.includes('-o1') ||
|
|
58
62
|
lowerName.startsWith('o3') ||
|
|
59
63
|
lowerName.includes('-o3') ||
|
|
60
64
|
lowerName.startsWith('o4') ||
|
|
61
65
|
lowerName.includes('-o4') ||
|
|
66
|
+
// GPT-5 series (reasoning by default)
|
|
62
67
|
lowerName.startsWith('gpt-5') ||
|
|
63
|
-
lowerName.includes('-gpt-5')
|
|
68
|
+
lowerName.includes('-gpt-5') ||
|
|
69
|
+
// DeepSeek reasoning models
|
|
70
|
+
lowerName.includes('deepseek-r1') ||
|
|
71
|
+
lowerName.includes('deepseek_r1') ||
|
|
72
|
+
// Microsoft Phi reasoning models
|
|
73
|
+
lowerName.includes('phi-4-reasoning') ||
|
|
74
|
+
lowerName.includes('phi-4-mini-reasoning') ||
|
|
75
|
+
// xAI Grok reasoning models
|
|
76
|
+
(lowerName.includes('grok') && lowerName.includes('reasoning')));
|
|
64
77
|
}
|
|
65
78
|
async getOpenAiBody(prompt, context, callApiOptions) {
|
|
66
79
|
const config = {
|