promptfoo 0.119.13 → 0.119.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/dist/package.json +28 -26
  2. package/dist/src/app/assets/index-eJ2lMe94.js +51 -0
  3. package/dist/src/app/assets/{source-map-support-Bnh0UQ2S.js → source-map-support-1v4oeb7P.js} +1 -1
  4. package/dist/src/app/assets/sync-CtLQRuC1.js +1 -0
  5. package/dist/src/app/assets/{vendor-charts-T60Uk0Z3.js → vendor-charts-DnVv66VV.js} +1 -1
  6. package/dist/src/app/assets/{vendor-markdown-DLig-KJh.js → vendor-markdown-DCpQIyMA.js} +1 -1
  7. package/dist/src/app/assets/{vendor-mui-core-5BLaiG3c.js → vendor-mui-core-Boqnpf9f.js} +1 -1
  8. package/dist/src/app/assets/{vendor-mui-icons-fn39Fu2e.js → vendor-mui-icons-B8MqoVbj.js} +1 -1
  9. package/dist/src/app/assets/vendor-mui-x-CGSS6QHF.js +45 -0
  10. package/dist/src/app/assets/{vendor-utils-DYBMEuwX.js → vendor-utils-DdfHIEy8.js} +1 -1
  11. package/dist/src/app/index.html +7 -7
  12. package/dist/src/assertions/guardrails.d.ts +1 -1
  13. package/dist/src/assertions/guardrails.js +18 -9
  14. package/dist/src/assertions/index.d.ts +1 -1
  15. package/dist/src/assertions/index.js +9 -3
  16. package/dist/src/assertions/searchRubric.d.ts +3 -0
  17. package/dist/src/assertions/searchRubric.js +18 -0
  18. package/dist/src/commands/eval.js +1 -1
  19. package/dist/src/commands/modelScan.d.ts +7 -1
  20. package/dist/src/commands/modelScan.js +121 -59
  21. package/dist/src/database/index.d.ts +6 -0
  22. package/dist/src/database/index.js +11 -0
  23. package/dist/src/database/tables.d.ts +46 -24
  24. package/dist/src/envars.d.ts +17 -0
  25. package/dist/src/generated/constants.js +1 -1
  26. package/dist/src/logger.d.ts +5 -0
  27. package/dist/src/logger.js +28 -0
  28. package/dist/src/main.js +17 -6
  29. package/dist/src/matchers.d.ts +1 -0
  30. package/dist/src/matchers.js +80 -0
  31. package/dist/src/models/eval.d.ts +2 -1
  32. package/dist/src/models/eval.js +44 -2
  33. package/dist/src/prompts/grading.d.ts +1 -0
  34. package/dist/src/prompts/grading.js +26 -1
  35. package/dist/src/prompts/index.d.ts +1 -0
  36. package/dist/src/prompts/index.js +4 -1
  37. package/dist/src/providers/adaline.gateway.js +2 -2
  38. package/dist/src/providers/anthropic/defaults.d.ts +1 -1
  39. package/dist/src/providers/anthropic/defaults.js +15 -0
  40. package/dist/src/providers/azure/chat.d.ts +3 -1
  41. package/dist/src/providers/azure/chat.js +16 -3
  42. package/dist/src/providers/azure/defaults.js +660 -141
  43. package/dist/src/providers/azure/responses.d.ts +5 -0
  44. package/dist/src/providers/azure/responses.js +33 -4
  45. package/dist/src/providers/azure/types.d.ts +4 -0
  46. package/dist/src/providers/bedrock/agents.d.ts +1 -1
  47. package/dist/src/providers/bedrock/agents.js +2 -2
  48. package/dist/src/providers/bedrock/base.d.ts +40 -0
  49. package/dist/src/providers/bedrock/base.js +171 -0
  50. package/dist/src/providers/bedrock/converse.d.ts +146 -0
  51. package/dist/src/providers/bedrock/converse.js +1044 -0
  52. package/dist/src/providers/bedrock/index.d.ts +1 -34
  53. package/dist/src/providers/bedrock/index.js +4 -159
  54. package/dist/src/providers/bedrock/knowledgeBase.d.ts +1 -1
  55. package/dist/src/providers/bedrock/knowledgeBase.js +2 -2
  56. package/dist/src/providers/bedrock/nova-sonic.d.ts +2 -1
  57. package/dist/src/providers/bedrock/nova-sonic.js +2 -2
  58. package/dist/src/providers/claude-agent-sdk.d.ts +58 -1
  59. package/dist/src/providers/claude-agent-sdk.js +22 -1
  60. package/dist/src/providers/defaults.js +4 -0
  61. package/dist/src/providers/github/defaults.js +6 -6
  62. package/dist/src/providers/google/types.d.ts +25 -0
  63. package/dist/src/providers/google/util.d.ts +2 -0
  64. package/dist/src/providers/google/vertex.js +78 -22
  65. package/dist/src/providers/{groq.d.ts → groq/chat.d.ts} +26 -20
  66. package/dist/src/providers/groq/chat.js +79 -0
  67. package/dist/src/providers/groq/index.d.ts +5 -0
  68. package/dist/src/providers/groq/index.js +24 -0
  69. package/dist/src/providers/groq/responses.d.ts +106 -0
  70. package/dist/src/providers/groq/responses.js +64 -0
  71. package/dist/src/providers/groq/types.d.ts +44 -0
  72. package/dist/src/providers/groq/types.js +3 -0
  73. package/dist/src/providers/groq/util.d.ts +15 -0
  74. package/dist/src/providers/groq/util.js +28 -0
  75. package/dist/src/providers/mcp/client.d.ts +8 -0
  76. package/dist/src/providers/mcp/client.js +60 -10
  77. package/dist/src/providers/mcp/types.d.ts +21 -0
  78. package/dist/src/providers/openai/chatkit-pool.d.ts +114 -0
  79. package/dist/src/providers/openai/chatkit-pool.js +548 -0
  80. package/dist/src/providers/openai/chatkit-types.d.ts +73 -0
  81. package/dist/src/providers/openai/chatkit-types.js +3 -0
  82. package/dist/src/providers/openai/chatkit.d.ts +76 -0
  83. package/dist/src/providers/openai/chatkit.js +879 -0
  84. package/dist/src/providers/openai/codex-sdk.d.ts +109 -0
  85. package/dist/src/providers/openai/codex-sdk.js +346 -0
  86. package/dist/src/providers/openai/defaults.d.ts +2 -0
  87. package/dist/src/providers/openai/defaults.js +10 -4
  88. package/dist/src/providers/registry.js +48 -9
  89. package/dist/src/providers/responses/types.d.ts +1 -1
  90. package/dist/src/providers/sagemaker.d.ts +2 -2
  91. package/dist/src/providers/webSearchUtils.d.ts +17 -0
  92. package/dist/src/providers/webSearchUtils.js +169 -0
  93. package/dist/src/providers/xai/chat.d.ts +61 -0
  94. package/dist/src/providers/xai/chat.js +68 -3
  95. package/dist/src/providers/xai/responses.d.ts +189 -0
  96. package/dist/src/providers/xai/responses.js +268 -0
  97. package/dist/src/redteam/constants/plugins.d.ts +1 -1
  98. package/dist/src/redteam/constants/plugins.js +1 -1
  99. package/dist/src/redteam/constants/strategies.d.ts +1 -1
  100. package/dist/src/redteam/constants/strategies.js +1 -0
  101. package/dist/src/redteam/plugins/vlguard.d.ts +53 -4
  102. package/dist/src/redteam/plugins/vlguard.js +362 -46
  103. package/dist/src/redteam/providers/constants.d.ts +2 -2
  104. package/dist/src/redteam/providers/constants.js +2 -2
  105. package/dist/src/redteam/providers/crescendo/index.d.ts +1 -1
  106. package/dist/src/redteam/providers/crescendo/index.js +5 -3
  107. package/dist/src/redteam/providers/hydra/index.js +1 -1
  108. package/dist/src/server/routes/modelAudit.js +4 -4
  109. package/dist/src/share.js +4 -2
  110. package/dist/src/telemetry.js +44 -8
  111. package/dist/src/types/env.d.ts +3 -0
  112. package/dist/src/types/env.js +1 -0
  113. package/dist/src/types/index.d.ts +896 -615
  114. package/dist/src/types/index.js +1 -0
  115. package/dist/src/types/providers.d.ts +1 -0
  116. package/dist/src/types/tracing.d.ts +3 -0
  117. package/dist/src/util/database.d.ts +6 -4
  118. package/dist/src/util/file.js +6 -4
  119. package/dist/src/util/modelAuditCliParser.d.ts +4 -4
  120. package/dist/src/util/xlsx.js +52 -26
  121. package/dist/src/validators/providers.d.ts +142 -122
  122. package/dist/src/validators/providers.js +4 -6
  123. package/dist/src/validators/redteam.d.ts +36 -28
  124. package/dist/src/validators/redteam.js +9 -3
  125. package/dist/tsconfig.tsbuildinfo +1 -1
  126. package/package.json +28 -26
  127. package/dist/drizzle/CLAUDE.md +0 -65
  128. package/dist/src/app/assets/index-DifT6VGT.js +0 -51
  129. package/dist/src/app/assets/sync-Oo-W_Rbj.js +0 -1
  130. package/dist/src/app/assets/vendor-mui-x-C2xF-yiO.js +0 -45
  131. package/dist/src/providers/groq.js +0 -48
package/dist/src/envars.d.ts CHANGED
@@ -107,6 +107,22 @@ type EnvVars = {
107
107
  REQUEST_TIMEOUT_MS?: number;
108
108
  RESULT_HISTORY_LENGTH?: number;
109
109
  WEBHOOK_TIMEOUT?: number;
110
+ /**
111
+ * Default timeout in milliseconds for MCP tool calls.
112
+ * This overrides the MCP SDK's default 60-second timeout.
113
+ * Can be overridden per-provider via config.mcp.timeout.
114
+ */
115
+ MCP_REQUEST_TIMEOUT_MS?: number;
116
+ /**
117
+ * Enable debug logging for MCP connections.
118
+ * Can be overridden per-provider via config.mcp.debug.
119
+ */
120
+ MCP_DEBUG?: boolean;
121
+ /**
122
+ * Enable verbose output for MCP connections.
123
+ * Can be overridden per-provider via config.mcp.verbose.
124
+ */
125
+ MCP_VERBOSE?: boolean;
110
126
  PROMPTFOO_POSTHOG_KEY?: string;
111
127
  PROMPTFOO_POSTHOG_HOST?: string;
112
128
  /**
@@ -208,6 +224,7 @@ type EnvVars = {
208
224
  OPENAI_STOP?: string;
209
225
  OPENAI_TEMPERATURE?: number;
210
226
  OPENAI_TOP_P?: number;
227
+ CODEX_API_KEY?: string;
211
228
  OPENROUTER_API_KEY?: string;
212
229
  PORTKEY_API_BASE_URL?: string;
213
230
  PORTKEY_API_KEY?: string;
package/dist/src/generated/constants.js CHANGED
@@ -2,6 +2,6 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.POSTHOG_KEY = void 0;
4
4
  // This file is auto-generated during build. Do not edit manually.
5
- // Generated at: 2025-11-25T16:43:03.753Z
5
+ // Generated at: 2025-12-01T18:11:57.753Z
6
6
  exports.POSTHOG_KEY = 'phc_E5n5uHnDo2eREJL1uqX1cIlbkoRby4yFWt3V94HqRRg';
7
7
  //# sourceMappingURL=constants.js.map
package/dist/src/logger.d.ts CHANGED
@@ -73,5 +73,10 @@ export declare function logRequestResponse(options: {
73
73
  response?: Response | null;
74
74
  error?: boolean;
75
75
  }): Promise<void>;
76
+ /**
77
+ * Close all file transports and cleanup logger resources
78
+ * Should be called during graceful shutdown to prevent event loop hanging
79
+ */
80
+ export declare function closeLogger(): void;
76
81
  export default logger;
77
82
  //# sourceMappingURL=logger.d.ts.map
package/dist/src/logger.js CHANGED
@@ -47,6 +47,7 @@ exports.isDebugEnabled = isDebugEnabled;
47
47
  exports.initializeRunLogging = initializeRunLogging;
48
48
  exports.setLogger = setLogger;
49
49
  exports.logRequestResponse = logRequestResponse;
50
+ exports.closeLogger = closeLogger;
50
51
  const fs_1 = __importDefault(require("fs"));
51
52
  const path_1 = __importDefault(require("path"));
52
53
  const chalk_1 = __importDefault(require("chalk"));
@@ -389,6 +390,33 @@ async function logRequestResponse(options) {
389
390
  logMethod(`Api Request`, logObject);
390
391
  }
391
392
  }
393
+ /**
394
+ * Close all file transports and cleanup logger resources
395
+ * Should be called during graceful shutdown to prevent event loop hanging
396
+ */
397
+ function closeLogger() {
398
+ try {
399
+ // Close all file transports
400
+ const fileTransports = exports.winstonLogger.transports.filter((transport) => transport instanceof winston_1.default.transports.File);
401
+ for (const transport of fileTransports) {
402
+ const filename = transport.filename;
403
+ if (filename) {
404
+ logger.debug(`Closing log file: ${filename}`);
405
+ }
406
+ if (typeof transport.close === 'function') {
407
+ transport.close();
408
+ }
409
+ exports.winstonLogger.remove(transport);
410
+ }
411
+ if (fileTransports.length > 0) {
412
+ logger.debug('Logger cleanup complete');
413
+ }
414
+ }
415
+ catch (error) {
416
+ // Can't use logger here since we're shutting it down
417
+ console.error(`Error closing logger: ${error}`);
418
+ }
419
+ }
392
420
  // Initialize source maps if debug is enabled at startup
393
421
  if ((0, envars_1.getEnvString)('LOG_LEVEL', 'info') === 'debug') {
394
422
  initializeSourceMapSupport();
package/dist/src/main.js CHANGED
@@ -39,12 +39,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
39
39
  Object.defineProperty(exports, "__esModule", { value: true });
40
40
  exports.addCommonOptionsRecursively = addCommonOptionsRecursively;
41
41
  const commander_1 = require("commander");
42
+ const undici_1 = require("undici");
42
43
  const package_json_1 = require("../package.json");
43
44
  const checkNodeVersion_1 = require("./checkNodeVersion");
44
45
  const cliState_1 = __importDefault(require("./cliState"));
46
+ const index_1 = require("./codeScan/index");
45
47
  const auth_1 = require("./commands/auth");
46
48
  const cache_1 = require("./commands/cache");
47
- const index_1 = require("./codeScan/index");
48
49
  const config_1 = require("./commands/config");
49
50
  const debug_1 = require("./commands/debug");
50
51
  const delete_1 = require("./commands/delete");
@@ -63,6 +64,7 @@ const share_1 = require("./commands/share");
63
64
  const show_1 = require("./commands/show");
64
65
  const validate_1 = require("./commands/validate");
65
66
  const view_1 = require("./commands/view");
67
+ const index_3 = require("./database/index");
66
68
  const logger_1 = __importStar(require("./logger"));
67
69
  const migrate_1 = require("./migrate");
68
70
  const discover_1 = require("./redteam/commands/discover");
@@ -76,8 +78,8 @@ const simba_1 = require("./redteam/commands/simba");
76
78
  const telemetry_1 = __importDefault(require("./telemetry"));
77
79
  const updates_1 = require("./updates");
78
80
  const default_1 = require("./util/config/default");
79
- const index_3 = require("./util/errors/index");
80
- const index_4 = require("./util/index");
81
+ const index_4 = require("./util/errors/index");
82
+ const index_5 = require("./util/index");
81
83
  /**
82
84
  * Adds verbose and env-file options to all commands recursively
83
85
  */
@@ -97,7 +99,7 @@ function addCommonOptionsRecursively(command) {
97
99
  }
98
100
  const envPath = thisCommand.opts().envFile || thisCommand.opts().envPath;
99
101
  if (envPath) {
100
- (0, index_4.setupEnv)(envPath);
102
+ (0, index_5.setupEnv)(envPath);
101
103
  logger_1.default.debug(`Loading environment from ${envPath}`);
102
104
  }
103
105
  });
@@ -159,12 +161,12 @@ async function main() {
159
161
  // Add common options to all commands recursively
160
162
  addCommonOptionsRecursively(program);
161
163
  program.hook('postAction', async () => {
162
- (0, index_3.printErrorInformation)(cliState_1.default.errorLogFile, cliState_1.default.debugLogFile);
164
+ (0, index_4.printErrorInformation)(cliState_1.default.errorLogFile, cliState_1.default.debugLogFile);
163
165
  if (cliState_1.default.postActionCallback) {
164
166
  await cliState_1.default.postActionCallback();
165
167
  }
166
168
  });
167
- program.parse();
169
+ await program.parseAsync();
168
170
  }
169
171
  if (require.main === module) {
170
172
  (0, checkNodeVersion_1.checkNodeVersion)();
@@ -172,6 +174,15 @@ if (require.main === module) {
172
174
  logger_1.default.debug('Shutting down gracefully...');
173
175
  await telemetry_1.default.shutdown();
174
176
  logger_1.default.debug('Shutdown complete');
177
+ (0, logger_1.closeLogger)();
178
+ (0, index_3.closeDbIfOpen)();
179
+ try {
180
+ const dispatcher = (0, undici_1.getGlobalDispatcher)();
181
+ await dispatcher.destroy();
182
+ }
183
+ catch {
184
+ // Silently handle dispatcher destroy errors
185
+ }
175
186
  });
176
187
  }
177
188
  //# sourceMappingURL=main.js.map
package/dist/src/matchers.d.ts CHANGED
@@ -35,6 +35,7 @@ interface ModerationMatchOptions {
35
35
  assistantResponse: string;
36
36
  categories?: string[];
37
37
  }
38
+ export declare function matchesSearchRubric(rubric: string, llmOutput: string, grading?: GradingConfig, vars?: Record<string, string | object>, assertion?: Assertion, _provider?: ApiProvider): Promise<GradingResult>;
38
39
  export declare function matchesModeration({ userPrompt, assistantResponse, categories }: ModerationMatchOptions, grading?: GradingConfig): Promise<{
39
40
  pass: boolean;
40
41
  score: number;
package/dist/src/matchers.js CHANGED
@@ -20,6 +20,7 @@ exports.matchesContextRelevance = matchesContextRelevance;
20
20
  exports.matchesContextFaithfulness = matchesContextFaithfulness;
21
21
  exports.matchesSelectBest = matchesSelectBest;
22
22
  exports.selectMaxScore = selectMaxScore;
23
+ exports.matchesSearchRubric = matchesSearchRubric;
23
24
  exports.matchesModeration = matchesModeration;
24
25
  const path_1 = __importDefault(require("path"));
25
26
  const utils_1 = require("./assertions/utils");
@@ -29,6 +30,8 @@ const logger_1 = __importDefault(require("./logger"));
29
30
  const index_1 = require("./prompts/index");
30
31
  const index_2 = require("./providers/index");
31
32
  const defaults_1 = require("./providers/defaults");
33
+ const webSearchUtils_1 = require("./providers/webSearchUtils");
34
+ const grading_1 = require("./prompts/grading");
32
35
  const constants_1 = require("./redteam/constants");
33
36
  const remoteGeneration_1 = require("./redteam/remoteGeneration");
34
37
  const remoteGrading_1 = require("./remoteGrading");
@@ -1224,6 +1227,83 @@ async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
1224
1227
  };
1225
1228
  });
1226
1229
  }
1230
+ async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider) {
1231
+ if (!grading) {
1232
+ throw new Error('Cannot grade output without grading config. Specify --grader option or grading config.');
1233
+ }
1234
+ // Search rubric assertion is like llm-rubric but with web search capabilities
1235
+ const defaultProviders = await (0, defaults_1.getDefaultProviders)();
1236
+ // Get a provider with web search capabilities
1237
+ let searchProvider = grading.provider ||
1238
+ defaultProviders.webSearchProvider ||
1239
+ defaultProviders.llmRubricProvider ||
1240
+ defaultProviders.gradingProvider;
1241
+ // Check if current provider has web search, if not try to load one
1242
+ if (!(0, webSearchUtils_1.hasWebSearchCapability)(searchProvider)) {
1243
+ // Try to load a provider with web search capabilities
1244
+ // For search-rubric assertion, prefer Anthropic first (pass true)
1245
+ const webSearchProvider = await (0, webSearchUtils_1.loadWebSearchProvider)(true);
1246
+ if (webSearchProvider) {
1247
+ searchProvider = webSearchProvider;
1248
+ }
1249
+ }
1250
+ // Ensure we have a provider with web search capabilities
1251
+ if (!searchProvider || !(0, webSearchUtils_1.hasWebSearchCapability)(searchProvider)) {
1252
+ throw new Error('search-rubric assertion requires a grading provider with web search capabilities. ' +
1253
+ 'Use --grader with a web search provider (e.g., anthropic:messages:claude-sonnet-4, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider');
1254
+ }
1255
+ // Load the web search rubric prompt
1256
+ const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, grading_1.DEFAULT_WEB_SEARCH_PROMPT);
1257
+ const prompt = await renderLlmRubricPrompt(rubricPrompt, {
1258
+ output: tryParse(llmOutput),
1259
+ rubric,
1260
+ ...(vars || {}),
1261
+ });
1262
+ // Get the evaluation from the search provider
1263
+ const resp = await searchProvider.callApi(prompt);
1264
+ if (resp.error || !resp.output) {
1265
+ return {
1266
+ pass: false,
1267
+ score: 0,
1268
+ reason: `Search rubric evaluation failed: ${resp.error || 'No output'}`,
1269
+ tokensUsed: resp.tokenUsage,
1270
+ assertion,
1271
+ };
1272
+ }
1273
+ // Parse the response
1274
+ try {
1275
+ const result = (0, json_1.extractFirstJsonObject)(String(resp.output));
1276
+ // Apply threshold if specified
1277
+ let pass = result.pass ?? false;
1278
+ const score = typeof result.score === 'number' ? result.score : pass ? 1 : 0;
1279
+ if (assertion?.threshold !== undefined) {
1280
+ pass = pass && score >= assertion.threshold;
1281
+ }
1282
+ return {
1283
+ pass,
1284
+ score,
1285
+ reason: result.reason || 'No reason provided',
1286
+ tokensUsed: resp.tokenUsage,
1287
+ assertion,
1288
+ metadata: {
1289
+ searchResults: result.searchResults || [],
1290
+ searchProvider: searchProvider.id(),
1291
+ },
1292
+ };
1293
+ }
1294
+ catch {
1295
+ // Try to parse as a simple pass/fail
1296
+ const outputLower = String(resp.output).toLowerCase();
1297
+ const pass = outputLower.includes('"pass":true') || outputLower.includes('"pass": true');
1298
+ return {
1299
+ pass,
1300
+ score: pass ? 1 : 0,
1301
+ reason: resp.output,
1302
+ tokensUsed: resp.tokenUsage,
1303
+ assertion,
1304
+ };
1305
+ }
1306
+ }
1227
1307
  async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
1228
1308
  if (!assistantResponse) {
1229
1309
  return {
package/dist/src/models/eval.d.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  import { type CompletedPrompt, type EvalSummary, type EvaluateResult, type EvaluateStats, type EvaluateSummaryV2, type EvaluateSummaryV3, type EvaluateTable, type EvaluateTableRow, type Prompt, type ResultsFile, type UnifiedConfig } from '../types/index';
2
2
  import EvalResult from './evalResult';
3
- import type { EvalResultsFilterMode } from '../types/index';
3
+ import type { EvalResultsFilterMode, TraceData } from '../types/index';
4
4
  export declare function createEvalId(createdAt?: Date): string;
5
5
  /** Result from queries extracting variable keys with eval IDs */
6
6
  export interface VarKeyWithEvalIdResult {
@@ -181,6 +181,7 @@ export default class Eval {
181
181
  clearResults(): void;
182
182
  getStats(): EvaluateStats;
183
183
  toEvaluateSummary(): Promise<EvaluateSummaryV3 | EvaluateSummaryV2>;
184
+ getTraces(): Promise<TraceData[]>;
184
185
  toResultsFile(): Promise<ResultsFile>;
185
186
  delete(): Promise<void>;
186
187
  /**
package/dist/src/models/eval.js CHANGED
@@ -17,8 +17,11 @@ const accounts_1 = require("../globalConfig/accounts");
17
17
  const logger_1 = __importDefault(require("../logger"));
18
18
  const utils_1 = require("../prompts/utils");
19
19
  const constants_2 = require("../redteam/constants");
20
+ const metrics_1 = require("../redteam/metrics");
20
21
  const sharedFrontend_1 = require("../redteam/sharedFrontend");
22
+ const store_1 = require("../tracing/store");
21
23
  const index_2 = require("../types/index");
24
+ const calculateFilteredMetrics_1 = require("../util/calculateFilteredMetrics");
22
25
  const convertEvalResultsToTable_1 = require("../util/convertEvalResultsToTable");
23
26
  const createHash_1 = require("../util/createHash");
24
27
  const index_3 = require("../util/exportToFile/index");
@@ -27,8 +30,6 @@ const time_1 = require("../util/time");
27
30
  const tokenUsageUtils_1 = require("../util/tokenUsageUtils");
28
31
  const evalPerformance_1 = require("./evalPerformance");
29
32
  const evalResult_1 = __importDefault(require("./evalResult"));
30
- const calculateFilteredMetrics_1 = require("../util/calculateFilteredMetrics");
31
- const metrics_1 = require("../redteam/metrics");
32
33
  /**
33
34
  * Sanitizes runtime options to ensure only JSON-serializable data is persisted.
34
35
  * Removes non-serializable fields like AbortSignal, functions, and symbols.
@@ -813,7 +814,47 @@ class Eval {
813
814
  stats,
814
815
  };
815
816
  }
817
+ async getTraces() {
818
+ try {
819
+ const traceStore = (0, store_1.getTraceStore)();
820
+ const tracesData = await traceStore.getTracesByEvaluation(this.id);
821
+ // Transform trace data to match the expected schema
822
+ return tracesData.map((trace) => ({
823
+ traceId: trace.traceId,
824
+ evaluationId: trace.evaluationId,
825
+ testCaseId: trace.testCaseId,
826
+ metadata: trace.metadata,
827
+ spans: (trace.spans || []).map((span) => {
828
+ // Calculate duration
829
+ const durationMs = span.endTime && span.startTime ? (span.endTime - span.startTime) / 1000000 : undefined;
830
+ // Map status code
831
+ const statusCode = span.statusCode === 1 ? 'ok' : span.statusCode === 2 ? 'error' : 'unset';
832
+ return {
833
+ spanId: span.spanId,
834
+ parentSpanId: span.parentSpanId,
835
+ name: span.name,
836
+ kind: span.kind || 'unspecified',
837
+ startTime: span.startTime,
838
+ endTime: span.endTime,
839
+ durationMs,
840
+ attributes: span.attributes || {},
841
+ status: {
842
+ code: statusCode,
843
+ message: span.statusMessage,
844
+ },
845
+ depth: 0, // Will be calculated on the server side when storing
846
+ events: span.events || [],
847
+ };
848
+ }),
849
+ }));
850
+ }
851
+ catch (error) {
852
+ logger_1.default.debug(`Failed to fetch traces for eval ${this.id}: ${error}`);
853
+ return [];
854
+ }
855
+ }
816
856
  async toResultsFile() {
857
+ const traces = await this.getTraces();
817
858
  const results = {
818
859
  version: this.version(),
819
860
  createdAt: new Date(this.createdAt).toISOString(),
@@ -822,6 +863,7 @@ class Eval {
822
863
  author: this.author || null,
823
864
  prompts: this.getPrompts(),
824
865
  datasetId: this.datasetId || null,
866
+ ...(traces.length > 0 && { traces }),
825
867
  };
826
868
  return results;
827
869
  }
package/dist/src/prompts/grading.d.ts CHANGED
@@ -13,4 +13,5 @@ export declare const SUGGEST_PROMPTS_SYSTEM_MESSAGE: {
13
13
  content: string;
14
14
  };
15
15
  export declare const SELECT_BEST_PROMPT: string;
16
+ export declare const DEFAULT_WEB_SEARCH_PROMPT: string;
16
17
  //# sourceMappingURL=grading.d.ts.map
package/dist/src/prompts/grading.js CHANGED
@@ -17,7 +17,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
17
17
  return (mod && mod.__esModule) ? mod : { "default": mod };
18
18
  };
19
19
  Object.defineProperty(exports, "__esModule", { value: true });
20
- exports.SELECT_BEST_PROMPT = exports.SUGGEST_PROMPTS_SYSTEM_MESSAGE = exports.OPENAI_CLOSED_QA_PROMPT = exports.PROMPTFOO_FACTUALITY_PROMPT = exports.DEFAULT_GRADING_PROMPT = void 0;
20
+ exports.DEFAULT_WEB_SEARCH_PROMPT = exports.SELECT_BEST_PROMPT = exports.SUGGEST_PROMPTS_SYSTEM_MESSAGE = exports.OPENAI_CLOSED_QA_PROMPT = exports.PROMPTFOO_FACTUALITY_PROMPT = exports.DEFAULT_GRADING_PROMPT = void 0;
21
21
  const dedent_1 = __importDefault(require("dedent"));
22
22
  /**
23
23
  * This module exports various prompts used for model graded evaluation.
@@ -135,4 +135,29 @@ Here are the pieces of text:
135
135
  Output the index of the text that best fits the criteria. You must output a single integer.`,
136
136
  },
137
137
  ]);
138
+ exports.DEFAULT_WEB_SEARCH_PROMPT = JSON.stringify([
139
+ {
140
+ role: 'system',
141
+ content: (0, dedent_1.default) `You are grading output according to a user-specified rubric, with the ability to search the web for current information. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
142
+
143
+ You MUST search the web when:
144
+ - The rubric asks about current information (prices, weather, news, etc.)
145
+ - Facts need to be verified against recent data
146
+ - The rubric references time-sensitive information
147
+
148
+ Examples:
149
+
150
+ <Output>The current CEO of Microsoft is Satya Nadella</Output>
151
+ <Rubric>Contains accurate information about Microsoft's leadership</Rubric>
152
+ {"reason": "I searched and confirmed Satya Nadella is indeed the current CEO of Microsoft", "pass": true, "score": 1.0}
153
+
154
+ <Output>Bitcoin is trading at $45,000</Output>
155
+ <Rubric>Provides current Bitcoin price within 10% accuracy</Rubric>
156
+ {"reason": "Web search shows Bitcoin is currently trading at $98,000, not $45,000. The output is off by more than 50%", "pass": false, "score": 0.0}`,
157
+ },
158
+ {
159
+ role: 'user',
160
+ content: '<Output>\n{{ output }}\n</Output>\n<Rubric>\n{{ rubric }}\n</Rubric>',
161
+ },
162
+ ]);
138
163
  //# sourceMappingURL=grading.js.map
package/dist/src/prompts/index.d.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  import type { EvaluateTestSuite, Prompt, TestSuite, UnifiedConfig } from '../types/index';
2
2
  export * from './grading';
3
+ export { DEFAULT_WEB_SEARCH_PROMPT } from './grading';
3
4
  /**
4
5
  * Reads and maps provider prompts based on the configuration and parsed prompts.
5
6
  * @param config - The configuration object.
package/dist/src/prompts/index.js CHANGED
@@ -17,7 +17,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
17
17
  return (mod && mod.__esModule) ? mod : { "default": mod };
18
18
  };
19
19
  Object.defineProperty(exports, "__esModule", { value: true });
20
- exports.GEVAL_PROMPT_EVALUATE = exports.GEVAL_PROMPT_STEPS = void 0;
20
+ exports.GEVAL_PROMPT_EVALUATE = exports.GEVAL_PROMPT_STEPS = exports.DEFAULT_WEB_SEARCH_PROMPT = void 0;
21
21
  exports.readProviderPromptMap = readProviderPromptMap;
22
22
  exports.readPrompts = readPrompts;
23
23
  exports.processPrompts = processPrompts;
@@ -41,6 +41,8 @@ const text_1 = require("./processors/text");
41
41
  const yaml_1 = require("./processors/yaml");
42
42
  const utils_1 = require("./utils");
43
43
  __exportStar(require("./grading"), exports);
44
+ var grading_1 = require("./grading");
45
+ Object.defineProperty(exports, "DEFAULT_WEB_SEARCH_PROMPT", { enumerable: true, get: function () { return grading_1.DEFAULT_WEB_SEARCH_PROMPT; } });
44
46
  /**
45
47
  * Reads and maps provider prompts based on the configuration and parsed prompts.
46
48
  * @param config - The configuration object.
@@ -213,6 +215,7 @@ async function processPrompts(prompts) {
213
215
  }
214
216
  }))).flat();
215
217
  }
218
+ // G-Eval prompts
216
219
  exports.GEVAL_PROMPT_STEPS = `
217
220
  Given an evaluation criteria which outlines how you should judge some text, generate 3-4 concise evaluation steps for any text based on the criteria below.
218
221
 
package/dist/src/providers/adaline.gateway.js CHANGED
@@ -25,7 +25,7 @@ const embedding_1 = require("./azure/embedding");
25
25
  const util_2 = require("./azure/util");
26
26
  const ai_studio_1 = require("./google/ai.studio");
27
27
  const vertex_2 = require("./google/vertex");
28
- const groq_2 = require("./groq");
28
+ const index_2 = require("./groq/index");
29
29
  const chat_2 = require("./openai/chat");
30
30
  const embedding_2 = require("./openai/embedding");
31
31
  const util_3 = require("./openai/util");
@@ -396,7 +396,7 @@ class AdalineGatewayChatProvider extends AdalineGatewayGenericProvider {
396
396
  }
397
397
  else if (this.providerName === 'groq') {
398
398
  const provider = new groq_1.Groq();
399
- const parentClass = new groq_2.GroqProvider(this.modelName, this.providerOptions);
399
+ const parentClass = new index_2.GroqProvider(this.modelName, this.providerOptions);
400
400
  const apiKey = parentClass.getApiKey();
401
401
  if (!apiKey) {
402
402
  throw new Error('Groq API key is not set. Set the GROQ_API_KEY environment variable or add `apiKey` to the provider config.');
package/dist/src/providers/anthropic/defaults.d.ts CHANGED
@@ -14,5 +14,5 @@ export declare class AnthropicLlmRubricProvider extends AnthropicMessagesProvide
14
14
  * @param env - Optional environment overrides
15
15
  * @returns Anthropic provider implementations for various functions
16
16
  */
17
- export declare function getAnthropicProviders(env?: EnvOverrides): Pick<DefaultProviders, 'gradingJsonProvider' | 'gradingProvider' | 'llmRubricProvider' | 'suggestionsProvider' | 'synthesizeProvider'>;
17
+ export declare function getAnthropicProviders(env?: EnvOverrides): Pick<DefaultProviders, 'gradingJsonProvider' | 'gradingProvider' | 'llmRubricProvider' | 'suggestionsProvider' | 'synthesizeProvider' | 'webSearchProvider'>;
18
18
  //# sourceMappingURL=defaults.d.ts.map
package/dist/src/providers/anthropic/defaults.js CHANGED
@@ -84,6 +84,19 @@ exports.AnthropicLlmRubricProvider = AnthropicLlmRubricProvider;
84
84
  // Private provider factories with lazy loading
85
85
  const gradingProviderFactory = createLazyProvider((env) => new messages_1.AnthropicMessagesProvider(exports.DEFAULT_ANTHROPIC_MODEL, { env }));
86
86
  const llmRubricProviderFactory = createLazyProvider((env) => new AnthropicLlmRubricProvider(exports.DEFAULT_ANTHROPIC_MODEL, { env }));
87
+ // Web Search Provider with web_search tool
88
+ const webSearchProviderFactory = createLazyProvider((env) => new messages_1.AnthropicMessagesProvider(exports.DEFAULT_ANTHROPIC_MODEL, {
89
+ env,
90
+ config: {
91
+ tools: [
92
+ {
93
+ type: 'web_search_20250305',
94
+ name: 'web_search',
95
+ max_uses: 5,
96
+ },
97
+ ],
98
+ },
99
+ }));
87
100
  /**
88
101
  * Gets all default Anthropic providers with the given environment overrides
89
102
  * @param env - Optional environment overrides
@@ -93,12 +106,14 @@ function getAnthropicProviders(env) {
93
106
  // Get providers with the provided environment variables
94
107
  const gradingProvider = gradingProviderFactory.getInstance(env);
95
108
  const llmRubricProvider = llmRubricProviderFactory.getInstance(env);
109
+ const webSearchProvider = webSearchProviderFactory.getInstance(env);
96
110
  return {
97
111
  gradingJsonProvider: gradingProvider,
98
112
  gradingProvider,
99
113
  llmRubricProvider,
100
114
  suggestionsProvider: gradingProvider,
101
115
  synthesizeProvider: gradingProvider,
116
+ webSearchProvider,
102
117
  };
103
118
  }
104
119
  //# sourceMappingURL=defaults.js.map
package/dist/src/providers/azure/chat.d.ts CHANGED
@@ -7,7 +7,9 @@ export declare class AzureChatCompletionProvider extends AzureGenericProvider {
7
7
  private initializeMCP;
8
8
  cleanup(): Promise<void>;
9
9
  /**
10
- * Check if the current deployment is configured as a reasoning model
10
+ * Check if the current deployment is configured as a reasoning model.
11
+ * Reasoning models use max_completion_tokens instead of max_tokens,
12
+ * don't support temperature, and accept reasoning_effort parameter.
11
13
  */
12
14
  protected isReasoningModel(): boolean;
13
15
  getOpenAiBody(prompt: string, context?: CallApiContextParams, callApiOptions?: CallApiOptionsParams): Promise<Record<string, any>>;
package/dist/src/providers/azure/chat.js CHANGED
@@ -43,7 +43,9 @@ class AzureChatCompletionProvider extends generic_1.AzureGenericProvider {
43
43
  }
44
44
  }
45
45
  /**
46
- * Check if the current deployment is configured as a reasoning model
46
+ * Check if the current deployment is configured as a reasoning model.
47
+ * Reasoning models use max_completion_tokens instead of max_tokens,
48
+ * don't support temperature, and accept reasoning_effort parameter.
47
49
  */
48
50
  isReasoningModel() {
49
51
  // Check explicit config flags first
@@ -53,14 +55,25 @@ class AzureChatCompletionProvider extends generic_1.AzureGenericProvider {
53
55
  // Auto-detect reasoning models by deployment name (case-insensitive)
54
56
  // Supports both direct names (o1-preview) and prefixed names (prod-o1-mini)
55
57
  const lowerName = this.deploymentName.toLowerCase();
56
- return (lowerName.startsWith('o1') ||
58
+ return (
59
+ // OpenAI reasoning models
60
+ lowerName.startsWith('o1') ||
57
61
  lowerName.includes('-o1') ||
58
62
  lowerName.startsWith('o3') ||
59
63
  lowerName.includes('-o3') ||
60
64
  lowerName.startsWith('o4') ||
61
65
  lowerName.includes('-o4') ||
66
+ // GPT-5 series (reasoning by default)
62
67
  lowerName.startsWith('gpt-5') ||
63
- lowerName.includes('-gpt-5'));
68
+ lowerName.includes('-gpt-5') ||
69
+ // DeepSeek reasoning models
70
+ lowerName.includes('deepseek-r1') ||
71
+ lowerName.includes('deepseek_r1') ||
72
+ // Microsoft Phi reasoning models
73
+ lowerName.includes('phi-4-reasoning') ||
74
+ lowerName.includes('phi-4-mini-reasoning') ||
75
+ // xAI Grok reasoning models
76
+ (lowerName.includes('grok') && lowerName.includes('reasoning')));
64
77
  }
65
78
  async getOpenAiBody(prompt, context, callApiOptions) {
66
79
  const config = {