promptfoo 0.115.1 → 0.115.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215)
  1. package/dist/package.json +1 -1
  2. package/dist/src/app/assets/{index-BguflfND.css → index-BPc2R353.css} +1 -1
  3. package/dist/src/app/assets/index-BXsrbFYA.js +839 -0
  4. package/dist/src/app/assets/{source-map-support-yC3CWAhX.js → source-map-support-CNXJ1-7r.js} +1 -1
  5. package/dist/src/app/assets/{sync-Dow9n6Ue.js → sync-CYtWtmxM.js} +1 -1
  6. package/dist/src/app/index.html +2 -2
  7. package/dist/src/assertions/openai.d.ts.map +1 -1
  8. package/dist/src/assertions/openai.js +3 -1
  9. package/dist/src/assertions/openai.js.map +1 -1
  10. package/dist/src/assertions/synthesis.d.ts +15 -0
  11. package/dist/src/assertions/synthesis.d.ts.map +1 -0
  12. package/dist/src/assertions/synthesis.js +504 -0
  13. package/dist/src/assertions/synthesis.js.map +1 -0
  14. package/dist/src/commands/delete.d.ts +2 -0
  15. package/dist/src/commands/delete.d.ts.map +1 -1
  16. package/dist/src/commands/delete.js +3 -0
  17. package/dist/src/commands/delete.js.map +1 -1
  18. package/dist/src/commands/eval.d.ts.map +1 -1
  19. package/dist/src/commands/eval.js +15 -14
  20. package/dist/src/commands/eval.js.map +1 -1
  21. package/dist/src/commands/generate/assertions.d.ts +19 -0
  22. package/dist/src/commands/generate/assertions.d.ts.map +1 -0
  23. package/dist/src/commands/generate/assertions.js +150 -0
  24. package/dist/src/commands/generate/assertions.js.map +1 -0
  25. package/dist/src/commands/modelScan.d.ts +1 -0
  26. package/dist/src/commands/modelScan.d.ts.map +1 -1
  27. package/dist/src/commands/modelScan.js +4 -5
  28. package/dist/src/commands/modelScan.js.map +1 -1
  29. package/dist/src/commands/show.d.ts +3 -0
  30. package/dist/src/commands/show.d.ts.map +1 -1
  31. package/dist/src/commands/show.js +7 -0
  32. package/dist/src/commands/show.js.map +1 -1
  33. package/dist/src/database/tables.d.ts +22 -0
  34. package/dist/src/database/tables.d.ts.map +1 -1
  35. package/dist/src/envars.d.ts +1 -1
  36. package/dist/src/envars.d.ts.map +1 -1
  37. package/dist/src/envars.js.map +1 -1
  38. package/dist/src/evaluator.d.ts +9 -0
  39. package/dist/src/evaluator.d.ts.map +1 -1
  40. package/dist/src/evaluator.js +48 -24
  41. package/dist/src/evaluator.js.map +1 -1
  42. package/dist/src/evaluatorHelpers.d.ts +33 -5
  43. package/dist/src/evaluatorHelpers.d.ts.map +1 -1
  44. package/dist/src/evaluatorHelpers.js +41 -5
  45. package/dist/src/evaluatorHelpers.js.map +1 -1
  46. package/dist/src/main.d.ts.map +1 -1
  47. package/dist/src/main.js +2 -0
  48. package/dist/src/main.js.map +1 -1
  49. package/dist/src/models/eval.d.ts +1 -0
  50. package/dist/src/models/eval.d.ts.map +1 -1
  51. package/dist/src/models/eval.js +2 -1
  52. package/dist/src/models/eval.js.map +1 -1
  53. package/dist/src/providers/azure/chat.d.ts.map +1 -1
  54. package/dist/src/providers/azure/chat.js +1 -1
  55. package/dist/src/providers/azure/chat.js.map +1 -1
  56. package/dist/src/providers/azure/completion.d.ts.map +1 -1
  57. package/dist/src/providers/azure/completion.js +1 -1
  58. package/dist/src/providers/azure/completion.js.map +1 -1
  59. package/dist/src/providers/azure/types.d.ts +1 -1
  60. package/dist/src/providers/azure/types.d.ts.map +1 -1
  61. package/dist/src/providers/golangCompletion.d.ts.map +1 -1
  62. package/dist/src/providers/golangCompletion.js +0 -1
  63. package/dist/src/providers/golangCompletion.js.map +1 -1
  64. package/dist/src/providers/google/ai.studio.js +1 -1
  65. package/dist/src/providers/google/ai.studio.js.map +1 -1
  66. package/dist/src/providers/google/live.d.ts +11 -0
  67. package/dist/src/providers/google/live.d.ts.map +1 -1
  68. package/dist/src/providers/google/live.js +84 -2
  69. package/dist/src/providers/google/live.js.map +1 -1
  70. package/dist/src/providers/google/shared.d.ts.map +1 -1
  71. package/dist/src/providers/google/shared.js +15 -0
  72. package/dist/src/providers/google/shared.js.map +1 -1
  73. package/dist/src/providers/google/types.d.ts +2 -2
  74. package/dist/src/providers/google/types.d.ts.map +1 -1
  75. package/dist/src/providers/google/vertex.d.ts +11 -0
  76. package/dist/src/providers/google/vertex.d.ts.map +1 -1
  77. package/dist/src/providers/google/vertex.js +82 -3
  78. package/dist/src/providers/google/vertex.js.map +1 -1
  79. package/dist/src/providers/groq.d.ts +3 -1
  80. package/dist/src/providers/groq.d.ts.map +1 -1
  81. package/dist/src/providers/http.js +1 -1
  82. package/dist/src/providers/http.js.map +1 -1
  83. package/dist/src/providers/hyperbolic/audio.d.ts +40 -0
  84. package/dist/src/providers/hyperbolic/audio.d.ts.map +1 -0
  85. package/dist/src/providers/hyperbolic/audio.js +141 -0
  86. package/dist/src/providers/hyperbolic/audio.js.map +1 -0
  87. package/dist/src/providers/hyperbolic/chat.d.ts +110 -0
  88. package/dist/src/providers/hyperbolic/chat.d.ts.map +1 -0
  89. package/dist/src/providers/hyperbolic/chat.js +318 -0
  90. package/dist/src/providers/hyperbolic/chat.js.map +1 -0
  91. package/dist/src/providers/hyperbolic/image.d.ts +57 -0
  92. package/dist/src/providers/hyperbolic/image.d.ts.map +1 -0
  93. package/dist/src/providers/hyperbolic/image.js +231 -0
  94. package/dist/src/providers/hyperbolic/image.js.map +1 -0
  95. package/dist/src/providers/litellm.d.ts +3 -1
  96. package/dist/src/providers/litellm.d.ts.map +1 -1
  97. package/dist/src/providers/ollama.d.ts.map +1 -1
  98. package/dist/src/providers/ollama.js +1 -1
  99. package/dist/src/providers/ollama.js.map +1 -1
  100. package/dist/src/providers/openai/chat.d.ts +11 -0
  101. package/dist/src/providers/openai/chat.d.ts.map +1 -1
  102. package/dist/src/providers/openai/chat.js +132 -4
  103. package/dist/src/providers/openai/chat.js.map +1 -1
  104. package/dist/src/providers/openai/completion.d.ts.map +1 -1
  105. package/dist/src/providers/openai/completion.js +1 -1
  106. package/dist/src/providers/openai/completion.js.map +1 -1
  107. package/dist/src/providers/openai/responses.d.ts.map +1 -1
  108. package/dist/src/providers/openai/responses.js +21 -1
  109. package/dist/src/providers/openai/responses.js.map +1 -1
  110. package/dist/src/providers/openai/types.d.ts +27 -2
  111. package/dist/src/providers/openai/types.d.ts.map +1 -1
  112. package/dist/src/providers/openai/util.d.ts.map +1 -1
  113. package/dist/src/providers/openai/util.js +24 -1
  114. package/dist/src/providers/openai/util.js.map +1 -1
  115. package/dist/src/providers/promptfoo.d.ts +1 -1
  116. package/dist/src/providers/promptfoo.d.ts.map +1 -1
  117. package/dist/src/providers/promptfoo.js.map +1 -1
  118. package/dist/src/providers/pythonCompletion.d.ts.map +1 -1
  119. package/dist/src/providers/pythonCompletion.js +0 -1
  120. package/dist/src/providers/pythonCompletion.js.map +1 -1
  121. package/dist/src/providers/registry.d.ts.map +1 -1
  122. package/dist/src/providers/registry.js +21 -12
  123. package/dist/src/providers/registry.js.map +1 -1
  124. package/dist/src/providers/sagemaker.js +1 -1
  125. package/dist/src/providers/sagemaker.js.map +1 -1
  126. package/dist/src/providers/scriptCompletion.d.ts.map +1 -1
  127. package/dist/src/providers/scriptCompletion.js +0 -1
  128. package/dist/src/providers/scriptCompletion.js.map +1 -1
  129. package/dist/src/providers/simulatedUser.d.ts.map +1 -1
  130. package/dist/src/providers/simulatedUser.js +5 -3
  131. package/dist/src/providers/simulatedUser.js.map +1 -1
  132. package/dist/src/providers/xai/chat.d.ts +3 -1
  133. package/dist/src/providers/xai/chat.d.ts.map +1 -1
  134. package/dist/src/python/pythonUtils.d.ts +1 -0
  135. package/dist/src/python/pythonUtils.d.ts.map +1 -1
  136. package/dist/src/python/pythonUtils.js +9 -7
  137. package/dist/src/python/pythonUtils.js.map +1 -1
  138. package/dist/src/redteam/commands/discover.d.ts +2 -8
  139. package/dist/src/redteam/commands/discover.d.ts.map +1 -1
  140. package/dist/src/redteam/commands/discover.js +31 -100
  141. package/dist/src/redteam/commands/discover.js.map +1 -1
  142. package/dist/src/redteam/commands/generate.d.ts.map +1 -1
  143. package/dist/src/redteam/commands/generate.js +10 -26
  144. package/dist/src/redteam/commands/generate.js.map +1 -1
  145. package/dist/src/redteam/index.d.ts.map +1 -1
  146. package/dist/src/redteam/index.js +24 -5
  147. package/dist/src/redteam/index.js.map +1 -1
  148. package/dist/src/redteam/plugins/base.d.ts +6 -0
  149. package/dist/src/redteam/plugins/base.d.ts.map +1 -1
  150. package/dist/src/redteam/plugins/base.js +19 -0
  151. package/dist/src/redteam/plugins/base.js.map +1 -1
  152. package/dist/src/redteam/plugins/crossSessionLeak.d.ts +2 -1
  153. package/dist/src/redteam/plugins/crossSessionLeak.d.ts.map +1 -1
  154. package/dist/src/redteam/plugins/crossSessionLeak.js +11 -0
  155. package/dist/src/redteam/plugins/crossSessionLeak.js.map +1 -1
  156. package/dist/src/redteam/providers/crescendo/index.d.ts +15 -3
  157. package/dist/src/redteam/providers/crescendo/index.d.ts.map +1 -1
  158. package/dist/src/redteam/providers/crescendo/index.js +152 -16
  159. package/dist/src/redteam/providers/crescendo/index.js.map +1 -1
  160. package/dist/src/redteam/providers/crescendo/prompts.js +4 -4
  161. package/dist/src/redteam/providers/goat.d.ts +9 -0
  162. package/dist/src/redteam/providers/goat.d.ts.map +1 -1
  163. package/dist/src/redteam/providers/goat.js +23 -3
  164. package/dist/src/redteam/providers/goat.js.map +1 -1
  165. package/dist/src/redteam/remoteGeneration.d.ts +5 -0
  166. package/dist/src/redteam/remoteGeneration.d.ts.map +1 -1
  167. package/dist/src/redteam/remoteGeneration.js +21 -7
  168. package/dist/src/redteam/remoteGeneration.js.map +1 -1
  169. package/dist/src/redteam/types.d.ts +1 -0
  170. package/dist/src/redteam/types.d.ts.map +1 -1
  171. package/dist/src/redteam/util.d.ts.map +1 -1
  172. package/dist/src/redteam/util.js +4 -0
  173. package/dist/src/redteam/util.js.map +1 -1
  174. package/dist/src/server/routes/eval.js +1 -0
  175. package/dist/src/server/routes/eval.js.map +1 -1
  176. package/dist/src/server/routes/providers.d.ts.map +1 -1
  177. package/dist/src/server/routes/providers.js +41 -0
  178. package/dist/src/server/routes/providers.js.map +1 -1
  179. package/dist/src/server/server.js +2 -2
  180. package/dist/src/server/server.js.map +1 -1
  181. package/dist/src/tableOutput.html +2 -2
  182. package/dist/src/types/env.d.ts +3 -0
  183. package/dist/src/types/env.d.ts.map +1 -1
  184. package/dist/src/types/env.js +1 -0
  185. package/dist/src/types/env.js.map +1 -1
  186. package/dist/src/types/index.d.ts +281 -0
  187. package/dist/src/types/index.d.ts.map +1 -1
  188. package/dist/src/types/providers.d.ts +1 -1
  189. package/dist/src/types/providers.d.ts.map +1 -1
  190. package/dist/src/util/config/load.d.ts +1 -1
  191. package/dist/src/util/config/load.d.ts.map +1 -1
  192. package/dist/src/util/config/load.js +1 -0
  193. package/dist/src/util/config/load.js.map +1 -1
  194. package/dist/src/util/database.d.ts +2 -0
  195. package/dist/src/util/database.d.ts.map +1 -1
  196. package/dist/src/util/server.d.ts +12 -0
  197. package/dist/src/util/server.d.ts.map +1 -1
  198. package/dist/src/util/server.js +59 -0
  199. package/dist/src/util/server.js.map +1 -1
  200. package/dist/src/util/templates.d.ts.map +1 -1
  201. package/dist/src/util/templates.js +9 -7
  202. package/dist/src/util/templates.js.map +1 -1
  203. package/dist/src/util/transform.d.ts +3 -8
  204. package/dist/src/util/transform.d.ts.map +1 -1
  205. package/dist/src/util/transform.js +1 -1
  206. package/dist/src/util/transform.js.map +1 -1
  207. package/dist/src/validators/providers.d.ts +20 -0
  208. package/dist/src/validators/providers.d.ts.map +1 -1
  209. package/dist/src/validators/redteam.d.ts +8 -0
  210. package/dist/src/validators/redteam.d.ts.map +1 -1
  211. package/dist/tsconfig.tsbuildinfo +1 -1
  212. package/package.json +1 -1
  213. package/dist/src/app/assets/index-DLhvLf11.js +0 -1051
  214. package/dist/src/app/assets/index.es-BFIXrhjI.js +0 -18
  215. package/dist/src/app/assets/purify.es-CQJ0hv7W.js +0 -2
@@ -0,0 +1,504 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __importDefault = (this && this.__importDefault) || function (mod) {
36
+ return (mod && mod.__esModule) ? mod : { "default": mod };
37
+ };
38
+ Object.defineProperty(exports, "__esModule", { value: true });
39
+ exports.generateNewQuestionsPrompt = generateNewQuestionsPrompt;
40
+ exports.convertQuestionToPythonPrompt = convertQuestionToPythonPrompt;
41
+ exports.synthesize = synthesize;
42
+ exports.synthesizeFromTestSuite = synthesizeFromTestSuite;
43
+ const dedent_1 = __importDefault(require("dedent"));
44
+ const logger_1 = __importDefault(require("../logger"));
45
+ const providers_1 = require("../providers");
46
+ const defaults_1 = require("../providers/defaults");
47
+ const generation_1 = require("../util/generation");
48
+ const invariant_1 = __importDefault(require("../util/invariant"));
49
+ const json_1 = require("../util/json");
50
+ function generateNewQuestionsPrompt(prompts, testCases, numQuestions) {
51
+ const allAssertions = testCases.flatMap((c) => c.assert || []);
52
+ return (0, dedent_1.default) `
53
+ Role: You are a senior data scientist specializing in metric design for stochastic AI systems. You will be given
54
+ an series of system prompts and existing assertions being tested in an evaluation, your task is to create objective evaluation questions that assess
55
+ individual AI responses—not the application holistically—based on input-output pairs.
56
+
57
+ Make sure to generate questions that are different from ones that already exist.
58
+
59
+ Clarification: Some applications (like scam detection, content moderation, or classification tasks) ask the AI to evaluate an input artifact.
60
+ Your task is **NOT** to evaluate the artifact (input) directly, but to assess the AI's response — i.e., how well the assistant performed the requested evaluation.
61
+ For example, don’t ask: “Does the message contain suspicious links?”
62
+ Instead, ask: “Did the response correctly identify suspicious links in the message?” or “Are the ratings in the output aligned with the rubric?”
63
+
64
+ Core Requirements
65
+ 1. Question Types:
66
+ Questions may use one of the following scoring formats: binary (Yes/No), 5-point Likert scale, or 0–1 continuous scale.
67
+ Design each question to naturally align with its scale—for example, use binary for clear-cut presence/absence traits, Likert for subjective gradations, and continuous for measurable properties.
68
+ Binary questions can still be scored on a Likert scale by mapping “Yes = 5” and “No = 1” if needed.
69
+
70
+ IMPORTANT: Questions should be phrased so that a 'Yes' answer or higher score **always** indicates compliance with the desired metric or requirement.
71
+
72
+ 2. Focus:
73
+ Questions can evaluate:
74
+ i. Input-output relationships (e.g., "Does the output address all parts of the input query?").
75
+ ii. Response attributes (e.g., structure, clarity, safety).
76
+ Avoid holistic/system-level judgments (e.g., "Is the AI helpful?").
77
+
78
+ 3. Objectivity:
79
+ Be as objective as possible. Replace ambiguous terms (e.g., "inspiring," "too long") with quantifiable criteria (e.g., "Is the output > 100 words?").
80
+ Allowed subjectivity: Verbs/adjectives are fine if they describe inherent properties of language (e.g., "Does the response contain abusive language?").
81
+ Rationale: "Abusive" is a property of language, even if borderline cases exist.
82
+ Avoid unbounded subjectivity (e.g., "Is the output extremely concise?" → replace with "Is the output ≤ 50 words?").
83
+ In general, think of ways to replace subjective ideas with objective ones.
84
+
85
+ 4. Atomicity:
86
+ Each question should test one attribute or relationship (e.g., split "Is the response clear and concise?" into two questions).
87
+
88
+ 5. Independence:
89
+ Questions should avoid overlap to prevent double-counting issues in evaluation. They should not overlap with any assertions either.
90
+
91
+ 6. Self-Containment:
92
+ Permitted: Derive answers from the input/output text (e.g., "Does the output cite a verbatim quote from the input?").
93
+ Forbidden: Reliance on external knowledge (e.g., "Is the cited source reputable?" → replace with "Does the citation include a DOI?").
94
+
95
+ 7. Special Cases:
96
+ For creative tasks: Focus on technical execution (e.g., "Does each stanza have 4 lines?").
97
+ For list outputs: Evaluate per item (e.g., "Does each bullet point contain a complete sentence?").
98
+
99
+ Each question must be preceded by a label in Title Case, no longer than three words, that serves as a concise and descriptive title for the question.
100
+
101
+ After writing each question, **always** set 'is_lower_score_desirable' to false because if the answer to the question is “Yes” (or higher score in case of likert/0-1 scales),
102
+ it always indicates a good response. You are only generating such type of questions.
103
+
104
+ Each question should have a question_source. If the question is implied in the input application_description, use
105
+ IMPLIED_IN_INSTRUCTIONS; otherwise if you are generating it from scratch, use FULLY_NEWLY_GENERATED.
106
+
107
+ Each question should have a question_type. If the question is core for this specific application, use
108
+ CORE_FOR_APPLICATION. If the question is a generic check which applies to many other applications like check for
109
+ abusive content or toxic language, use HORIZONTAL. If the question is regarding output format or some structure
110
+ in the response of the application, use FORMAT_CHECK.
111
+
112
+ Anti-Patterns to Avoid
113
+ 1. Reasoning Dependencies:
114
+ Bad: "Is the argument persuasive?"
115
+ Fixed: "Does the response list at least 2 supporting facts?"
116
+
117
+ 2. World Knowledge:
118
+ Bad: "Is the cited author an expert?"
119
+ Fixed: "Does the citation include the author’s institutional affiliation?"
120
+
121
+ 3. Unbounded Subjectivity:
122
+ Bad: "Is the output extremely concise?"
123
+ Fixed: "Is the output ≤ 3 sentences?"
124
+
125
+ Process
126
+ 1. Classify the Application:
127
+ First classify the application into appropriate categories such as information extraction, information summarization, creative task, analysis task.
128
+ Note that an application can belong to multiple categories.
129
+ Define key attributes (e.g., accuracy, structure, safety).
130
+
131
+ 2. Extract Implied Questions (Mandatory):
132
+ Scan the application_description for any *implied requirements*—expectations stated or suggested in the instructions.
133
+ For each implied requirement, generate an evaluation question marked with:
134
+ - 'question_source = implied_in_instructions'
135
+ These must be generated **before** any newly inferred or generic questions.
136
+
137
+ 3. Generate Deep Criteria (for new questions):
138
+ For each key attribute not already covered by an implied question:
139
+ - Identify subtle failure modes
140
+ - Design objectively measurable, atomic, and independent evaluation criteria
141
+ - Use quantifiable standards and avoid vague constructs
142
+ - Generate questions with 'question_source = fully_newly_generated'
143
+
144
+ 4. Generate Questions:
145
+ ${numQuestions > 0
146
+ ? `Create total ${numQuestions} questions with:`
147
+ : `Create a comprehensive set of evaluation questions with:`}
148
+ Binary (if absolute criteria exist) or Likert/continuous scales.
149
+ Concrete thresholds for quantifiable traits (e.g., word/line counts).
150
+
151
+ **IMPORTANT**: You must prioritize and fully exhaust all questions implied by the application description before generating any new questions.
152
+ ${numQuestions > 0
153
+ ? `Do not generate any 'fully_newly_generated' questions if the implied questions alone fulfill the requested ${numQuestions}.`
154
+ : ``}
155
+
156
+
157
+ # OUTPUT FORMAT
158
+
159
+ Only respond in JSON with no extra content.
160
+
161
+ # EXAMPLES
162
+
163
+ <application>
164
+ Describe a recipe for an input dish in bulleted list format.
165
+ </application>
166
+ <existing_assertions>
167
+ [
168
+ {
169
+ "type" : "llm-rubric",
170
+ "value": "Does the output list all necessary ingredients for the dish?",
171
+ "metric": "Ingredient Inclusion"
172
+ },
173
+ {
174
+ "type" : "g-eval",
175
+ "value": "Does each step in the recipe provide clear and complete instructions for preparation?"
176
+ }
177
+ ]
178
+ </existing_assertions>
179
+ ${'```'}json
180
+
181
+ {
182
+ "questions": [
183
+ {
184
+ "label": "Sequential Order",
185
+ "question": "Are the preparation steps listed in a logical and sequential order?",
186
+ "question_source": "implied_in_instructions",
187
+ "question_type": "core_for_application"
188
+ },
189
+ {
190
+ "label": "Bullet Format",
191
+ "question": "Is each item in the recipe presented as a distinct bullet point?",
192
+ "question_source": "implied_in_instructions",
193
+ "question_type": "format_check"
194
+ },
195
+ {
196
+ "label": "Cooking Times",
197
+ "question": "Are the cooking and preparation times mentioned in the recipe?",
198
+ "question_source": "fully_newly_generated",
199
+ "question_type": "core_for_application"
200
+ },
201
+ {
202
+ "label": "Ingredient Quantities",
203
+ "question": "Are the quantities for each ingredient specified in the recipe?",
204
+ "question_source": "fully_newly_generated",
205
+ "question_type": "core_for_application"
206
+ },
207
+ {
208
+ "label": "Serving Size",
209
+ "question": "Does the recipe specify the number of servings it makes?",
210
+ "question_source": "fully_newly_generated",
211
+ "question_type": "core_for_application"
212
+ },
213
+ {
214
+ "label": "Filler Words",
215
+ "question": "Does the recipe avoid including unnecessary details?",
216
+ "question_source": "fully_newly_generated",
217
+ "question_type": "horizontal"
218
+ }
219
+ ]
220
+ }
221
+
222
+
223
+ Consider the following prompt${prompts.length > 1 ? 's' : ''} and assertions for an LLM application:
224
+
225
+ <Prompts>
226
+ ${prompts
227
+ .map((prompt, i) => ` ${i > 0 ? ' ' : ''}<Prompt>
228
+ ${prompt}
229
+ </Prompt>`)
230
+ .join('\n')}
231
+ </Prompts>
232
+
233
+ <existing_assertions>
234
+ ${JSON.stringify(allAssertions, null, 2)
235
+ .split('\n')
236
+ .map((line, i) => (i > 0 ? ` ${line}` : line))
237
+ .join('\n')}
238
+ </existing_assertions>`;
239
+ }
240
+ function convertQuestionToPythonPrompt(prompts, question) {
241
+ return (0, dedent_1.default) `
242
+ You are a specialized system that analyzes an LLM evaluation question and generates a Python function to automatically check LLM responses against the specific criterion.
243
+ Your task is to determine if the given evaluation question can be reliably answered using a deterministic Python function.
244
+
245
+ ## Input Format
246
+ You will be provided with:
247
+ 1. A description of the LLM application (string)
248
+ 2. A single evaluation question used to assess LLM responses (string)
249
+
250
+ ## Output Format
251
+ For the evaluation question, you must:
252
+
253
+ - Determine if the question can be reliably answered with a deterministic Python function using ONLY the LLM response
254
+ - If YES: Return only the Python function body (without the function signature) that:
255
+ - Assumes the LLM's response text is available as a string variable named \`output\`
256
+ - Returns a dictionary with two keys:
257
+ - \`'pass'\`: boolean value (True if criterion is met, False if not)
258
+ - \`'score'\`: float value (1.0 if criterion is met, 0.0 if not)
259
+ - The Answer "Yes" to the question should correspond to \`{'pass': True, 'score': 1.0}\`
260
+ - The answer "No" to the question should correspond to \`{'pass': False, 'score': 0.0}\`
261
+ - Includes clear comments
262
+ - Handles edge cases gracefully (e.g., empty responses, invalid formats)
263
+ - Performs any necessary parsing of the response string (JSON parsing, text extraction, etc.)
264
+ - If NO: Return the string "None" (when the question requires semantic understanding, subjective judgment, domain expertise, or requires examining the original prompt/input)
265
+
266
+ ## Critical Requirements
267
+ - The function must evaluate ONLY the LLM response itself, which will always be provided as a string
268
+ - The evaluation question might refer to the LLM output by domain-specific terms (e.g., "story", "recipe", "code", "answer") based on the application description, rather than generic terms like "response" or "output"
269
+ - Regardless of terminology used in the question, the variable name in your code must be "output".
270
+ - If evaluation requires comparing the response to the original prompt/input, return "None"
271
+ - If evaluation requires external knowledge, context, or resources, return "None"
272
+ - When in doubt, return "None" rather than an unreliable function
273
+ - Any required parsing (JSON, XML, etc.) must be handled within the function
274
+
275
+ ## IMPORTANT
276
+ - Return "None" for any evaluation that requires semantic understanding or could have multiple valid expressions
277
+ - For questions about greetings, politeness, tone, style, or other subjective language features, return "None"
278
+ - Avoid creating functions that rely on hardcoded lists of phrases, expressions, or patterns when the concept being evaluated could be expressed in many different ways
279
+ - Only create functions for criteria that can be evaluated through standardized, unambiguous patterns or clear structural properties
280
+
281
+ ## Guidelines for Domain-Specific References
282
+ - When the question refers to the output by a domain-specific term (e.g., "Is the story less than 2 lines long?", "Does the recipe include four or more spices?"), understand that it's referring to the same content that will be available as the \`output\` variable
283
+ - The application description often provides context for what type of output to expect (story, recipe, etc.)
284
+
285
+ ## Guidelines for Function Generation
286
+
287
+ ### Questions Suitable for Functions (return a function):
288
+ - Counting elements (words, sentences, lines, items)
289
+ - Checking for presence of specific strings, patterns, or structures within the response
290
+ - Validating formats (JSON, dates, emails, etc.)
291
+ - Measuring response length in characters/bytes etc
292
+ - Checking for code syntax, structure, or presence of specific elements
293
+ - Verifying mathematical properties or numerical ranges
294
+
295
+ ### Questions NOT Suitable for Functions (return "None"):
296
+ - Any evaluation requiring comparison to the original prompt
297
+ - Evaluating relevance, accuracy, or helpfulness
298
+ - Assessing tone, intent, style, sentiment or semantics
299
+ - Checking factual correctness
300
+ - Determining completeness of explanations
301
+ - Evaluating creativity or originality
302
+ - Assessing logical coherence or reasoning quality
303
+ - Any judgment requiring domain expertise
304
+ - Any evaluation that would require an exhaustive list of possible expressions (like apologies, call-to-action etc.)
305
+
306
+ Please provide only the Python function body without markdown formatting or function signature.
307
+ The function body should assume the LLM's response is available as a variable named \`output\`.
308
+ Also include the necessary import statements within the function body itself.
309
+
310
+ ## Example Input/Output Pairs
311
+
312
+ ### Example 1:
313
+ **Application Description:** A JSON API documentation system
314
+ **Evaluation Question:** "Does the response contain valid JSON?"
315
+
316
+ **Output:**
317
+ \`\`\`python
318
+ import json
319
+ import re
320
+
321
+ # Try to find JSON blocks in the output
322
+ # Look for content within code blocks with \`\`\`json
323
+ json_block_pattern = r'\`\`\`(?:json)?\\s*([\\s\\S]*?)\\s*\`\`\`'
324
+ json_blocks = re.findall(json_block_pattern, output)
325
+
326
+ # Also look for content within curly braces that might be JSON
327
+ potential_json = re.findall(r'(\\{[\\s\\S]*?\\})', output)
328
+
329
+ # Combine all potential JSON content
330
+ all_potential_json = json_blocks + potential_json
331
+
332
+ # If we don't find any potential JSON patterns, return False
333
+ if not all_potential_json:
334
+ return {'pass': False, 'score': 0.0}
335
+
336
+ # Try to parse each potential JSON block
337
+ for json_str in all_potential_json:
338
+ try:
339
+ json.loads(json_str)
340
+ return {'pass': True, 'score': 1.0} # Valid JSON found
341
+ except json.JSONDecodeError:
342
+ continue
343
+
344
+ return {'pass': False, 'score': 0.0} # No valid JSON found
345
+ \`\`\`
346
+
347
+ ### Example 2:
348
+ **Application Description:** A customer service chatbot
349
+ **Evaluation Question:** "Does the response address the customer's initial query?"
350
+
351
+ **Output:**
352
+ None
353
+
354
+ ### Example 3:
355
+ **Application Description:** A code assistant that generates SQL queries.
356
+ **Evaluation Question:** "Does the SQL query use a JOIN statement?"
357
+
358
+ **Output:**
359
+ \`\`\`python
360
+ import re
361
+
362
+ # Convert to lowercase for case-insensitive matching
363
+ output_lower = output.lower()
364
+
365
+ # Extract code blocks if present
366
+ code_blocks = re.findall(r'\`\`\`(?:sql)?([^\`]+)\`\`\`', output_lower)
367
+
368
+ # If code blocks are found, check them first
369
+ if code_blocks:
370
+ for block in code_blocks:
371
+ # Check for JOIN keyword with word boundaries
372
+ if re.search(r'\\b(join|inner\\s+join|left\\s+join|right\\s+join|full\\s+join|cross\\s+join)\\b', block):
373
+ return {'pass': True, 'score': 1.0}
374
+
375
+ # If no code blocks or no JOIN found in code blocks, check the entire output
376
+ join_patterns = [
377
+ r'\\b(join)\\b',
378
+ r'\\b(inner\\s+join)\\b',
379
+ r'\\b(left\\s+join)\\b',
380
+ r'\\b(right\\s+join)\\b',
381
+ r'\\b(full\\s+join)\\b',
382
+ r'\\b(cross\\s+join)\\b'
383
+ ]
384
+
385
+ for pattern in join_patterns:
386
+ if re.search(pattern, output_lower):
387
+ return {'pass': True, 'score': 1.0}
388
+
389
+ return {'pass': False, 'score': 0.0}
390
+ \`\`\`
391
+
392
+ ### Example 4:
393
+ **Application Description:** An eval agent that can plan weekend trips.
394
+ **Evaluation Question:** "Does the response exceed 1500 words?"
395
+
396
+ **Output:**
397
+ \`\`\`python
398
+ # Split the output into words
399
+ words = output.split()
400
+
401
+ # Count the number of words
402
+ word_count = len(words)
403
+
404
+ # Check if the word count exceeds 1500
405
+ if word_count > 1500:
406
+ return {'pass': True, 'score': 1.0}
407
+ return {'pass': False, 'score': 0.0}
408
+ \`\`\`
409
+
410
+ ### Example 5:
411
+ **Application Description:** A customer service chatbot
412
+ **Evaluation Question:** "Does the response start with a greeting?"
413
+
414
+ **Output:**
415
+ None
416
+
417
+ Remember: When in doubt, return "None". It's better to use some other evaluation mechanism than to generate an unreliable function.
418
+
419
+ <application_description>
420
+ <Prompts>
421
+ ${prompts
422
+ .map((prompt, i) => ` ${i > 0 ? ' ' : ''}<Prompt>
423
+ ${prompt}
424
+ </Prompt>`)
425
+ .join('\n')}
426
+ </Prompts>
427
+ </application_description>
428
+ <question>
429
+ ${question}
430
+ </question>
431
+ `;
432
+ }
433
/**
 * Synthesizes assertions for a set of prompts.
 *
 * First asks the provider for evaluation questions, then asks it, once per
 * sampled question, for a Python check. Questions the provider answers with
 * "None" (or with no usable text) become assertions of the given `type` whose
 * value is the question itself; the others become `python` assertions whose
 * value is the provider's output.
 *
 * @param {object} params
 * @param {string[]} params.prompts - Prompt texts; at least one is required.
 * @param {string} [params.instructions] - Extra text appended to the question-generation prompt.
 * @param {number} [params.numQuestions=5] - Number of questions to sample from the provider's response.
 * @param {Array} params.tests - Existing tests, forwarded to generateNewQuestionsPrompt.
 * @param {*} [params.provider] - Provider spec handed to loadApiProvider; when undefined the default synthesizeProvider is used.
 * @param {string} [params.type='pi'] - Assertion type for questions that are not converted to Python.
 * @returns {Promise<Array<{type: string, metric: string, value: string}>>} One assertion per sampled question.
 * @throws {Error} When `prompts` is empty, or when an invariant on the questions response fails.
 */
async function synthesize({ prompts, instructions, numQuestions = 5, tests, provider, type = 'pi', }) {
    if (prompts.length < 1) {
        throw new Error('Assertion synthesis requires at least one prompt.');
    }
    let progressBar;
    if (logger_1.default.level !== 'debug') {
        const cliProgress = await Promise.resolve().then(() => __importStar(require('cli-progress')));
        progressBar = new cliProgress.SingleBar({ gracefulExit: true }, cliProgress.Presets.shades_classic);
        progressBar.start(1 + numQuestions, 0);
    }
    try {
        logger_1.default.debug(`Starting assertion synthesis. We'll begin by generating a set of questions`);
        logger_1.default.debug(`Generating assertion questions from ${prompts.length} prompt${prompts.length > 1 ? 's' : ''}...`);
        const providerModel = typeof provider === 'undefined'
            ? (await (0, defaults_1.getDefaultProviders)()).synthesizeProvider
            : await (0, providers_1.loadApiProvider)(provider);
        let newQuestionsPrompt = generateNewQuestionsPrompt(prompts, tests, numQuestions);
        if (instructions) {
            newQuestionsPrompt = `${newQuestionsPrompt}\n${instructions}`;
        }
        logger_1.default.debug(`Generated questions prompt:\n${newQuestionsPrompt}`);
        const resp = await providerModel.callApi(newQuestionsPrompt);
        logger_1.default.debug(`Received questions response:\n${resp.output}`);
        (0, invariant_1.default)(typeof resp.output !== 'undefined', 'resp.output must be defined');
        const output = typeof resp.output === 'string' ? resp.output : JSON.stringify(resp.output);
        const respObjects = (0, json_1.extractJsonObjects)(output);
        (0, invariant_1.default)(respObjects.length >= 1, `Expected at least one JSON object in the response for questions, got ${respObjects.length}`);
        const questionsWrapper = respObjects[0];
        // Validate the shape before sampling so a malformed response yields a clear message.
        (0, invariant_1.default)(Array.isArray(questionsWrapper.questions), 'Expected the questions response to contain a "questions" array');
        const questions = (0, generation_1.sampleArray)(questionsWrapper.questions, numQuestions);
        logger_1.default.debug(`Generated ${questions.length} question${questions.length === 1 ? '' : 's'}:\n${questions.map((p) => ` - ${p.question}`).join('\n')}`);
        if (progressBar) {
            progressBar.increment();
        }
        // Determine which dimensions should be objectively answered with python. If so, generate the python.
        // Merge rather than replace, so the provider keeps its other settings (credentials, temperature, ...).
        providerModel.config = {
            ...providerModel.config,
            maxTokens: 3000,
        };
        const assertions = await Promise.all(questions.map(async (q) => {
            const pythonConvertPrompt = convertQuestionToPythonPrompt(prompts, q.question);
            const pythonResp = await providerModel.callApi(pythonConvertPrompt);
            if (progressBar) {
                progressBar.increment();
            }
            const pythonOutput = pythonResp.output;
            // A missing, non-string or blank output cannot be used as Python source;
            // treat it like an explicit "None" instead of crashing on toLowerCase().
            const normalized = typeof pythonOutput === 'string' ? pythonOutput.trim().toLowerCase() : '';
            if (normalized === '' || normalized === 'none') {
                return { type, metric: q.label, value: q.question };
            }
            return {
                type: 'python',
                metric: q.label,
                value: pythonOutput,
            };
        }));
        logger_1.default.debug(`Generated ${assertions.length} new assertions`);
        return assertions;
    }
    finally {
        // Always release the terminal, including when a provider call or invariant throws.
        if (progressBar) {
            progressBar.stop();
        }
    }
}
497
/**
 * Runs assertion synthesis for a test suite.
 *
 * Forwards `options` to `synthesize`, overriding `prompts` with the raw text
 * of each prompt in the suite and `tests` with the suite's tests (an empty
 * array when the suite has none).
 *
 * @param {object} testSuite - Suite providing `prompts` (objects with a `raw` field) and optionally `tests`.
 * @param {object} options - Remaining `synthesize` options.
 * @returns {Promise<Array>} Whatever `synthesize` resolves to.
 */
async function synthesizeFromTestSuite(testSuite, options) {
    const synthesisArgs = { ...options };
    synthesisArgs.prompts = testSuite.prompts.map(({ raw }) => raw);
    synthesisArgs.tests = testSuite.tests || [];
    return synthesize(synthesisArgs);
}
504
+ //# sourceMappingURL=synthesis.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"synthesis.js","sourceRoot":"","sources":["../../../src/assertions/synthesis.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAmBA,gEAyMC;AAED,sEAmMC;AASD,gCAuFC;AAED,0DASC;AA3gBD,oDAA4B;AAC5B,uDAA+B;AAC/B,4CAA+C;AAC/C,oDAA4D;AAE5D,mDAAiD;AACjD,kEAA0C;AAC1C,uCAAkD;AAWlD,SAAgB,0BAA0B,CACxC,OAAiB,EACjB,SAAqB,EACrB,YAAoB;IAEpB,MAAM,aAAa,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC;IAE/D,OAAO,IAAA,gBAAM,EAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;QA8FP,YAAY,GAAG,CAAC;QACd,CAAC,CAAC,gBAAgB,YAAY,kBAAkB;QAChD,CAAC,CAAC,0DACN;;;;;QAME,YAAY,GAAG,CAAC;QACd,CAAC,CAAC,8GAA8G,YAAY,GAAG;QAC/H,CAAC,CAAC,EACN;;;;;;;;;;;;;;;;;;;;;;;;;QAyBE,KAAK;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;qCA4CwB,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE;;;IAG9D,OAAO;SACN,GAAG,CACF,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CACZ,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE;YACtB,MAAM;kBACA,CACb;SACA,IAAI,CAAC,IAAI,CAAC;;;;UAIL,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;SACrC,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;SACpD,IAAI,CAAC,IAAI,CAAC;6BACQ,CAAC;AAC9B,CAAC;AAED,SAAgB,6BAA6B,CAAC,OAAiB,EAAE,QAAgB;IAC/E,OAAO,IAAA,gBAAM,EAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAoLX,OAAO;SACN,GAAG,CACF,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CACZ,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE;YACtB,MAAM;kBACA,CACb;SACA,IAAI,CAAC,IAAI,CAAC;;;;MAIT,QAAQ;;CAEb,CAAC;AACF,CAAC;AASM,KAAK,UAAU,UAAU,CAAC,EAC/B,OAAO,EACP,YAAY,EACZ,YAAY,GAAG,CAAC,EAChB,KAAK,EACL,QAAQ,EACR,IAAI,GAAG,IAAI,GACO;IAClB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,mDAAmD,CAAC,CAAC;IACvE,CAAC;IAED,IAAI,WAAkC,CAAC;IA
CvC,IAAI,gBAAM,CAAC,KAAK,KAAK,OAAO,EAAE,CAAC;QAC7B,MAAM,WAAW,GAAG,wDAAa,cAAc,GAAC,CAAC;QACjD,WAAW,GAAG,IAAI,WAAW,CAAC,SAAS,CACrC,EAAE,YAAY,EAAE,IAAI,EAAE,EACtB,WAAW,CAAC,OAAO,CAAC,cAAc,CACnC,CAAC;QACF,WAAW,CAAC,KAAK,CAAC,CAAC,GAAG,YAAY,EAAE,CAAC,CAAC,CAAC;IACzC,CAAC;IAED,gBAAM,CAAC,KAAK,CAAC,4EAA4E,CAAC,CAAC;IAE3F,gBAAM,CAAC,KAAK,CACV,iCAAiC,OAAO,CAAC,MAAM,UAAU,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,KAAK,CAC5F,CAAC;IAEF,IAAI,aAA0B,CAAC;IAC/B,IAAI,OAAO,QAAQ,KAAK,WAAW,EAAE,CAAC;QACpC,aAAa,GAAG,CAAC,MAAM,IAAA,8BAAmB,GAAE,CAAC,CAAC,kBAAkB,CAAC;IACnE,CAAC;SAAM,CAAC;QACN,aAAa,GAAG,MAAM,IAAA,2BAAe,EAAC,QAAQ,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,kBAAkB,GAAG,0BAA0B,CAAC,OAAO,EAAE,KAAK,EAAE,YAAY,CAAC,CAAC;IAClF,IAAI,YAAY,EAAE,CAAC;QACjB,kBAAkB,GAAG,GAAG,kBAAkB,KAAK,YAAY,EAAE,CAAC;IAChE,CAAC;IACD,gBAAM,CAAC,KAAK,CAAC,gCAAgC,kBAAkB,EAAE,CAAC,CAAC;IACnE,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;IAC7D,gBAAM,CAAC,KAAK,CAAC,iCAAiC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;IAC7D,IAAA,mBAAS,EAAC,OAAO,IAAI,CAAC,MAAM,KAAK,WAAW,EAAE,6BAA6B,CAAC,CAAC;IAC7E,MAAM,MAAM,GAAG,OAAO,IAAI,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC3F,MAAM,WAAW,GAAG,IAAA,yBAAkB,EAAC,MAAM,CAAC,CAAC;IAC/C,IAAA,mBAAS,EACP,WAAW,CAAC,MAAM,IAAI,CAAC,EACvB,wEAAwE,WAAW,CAAC,MAAM,EAAE,CAC7F,CAAC;IACF,MAAM,gBAAgB,GAAG,WAAW,CAAC,CAAC,CAAuC,CAAC;IAC9E,MAAM,SAAS,GAAG,IAAA,wBAAW,EAAC,gBAAgB,CAAC,SAAS,EAAE,YAAY,CAAC,CAAC;IAExE,gBAAM,CAAC,KAAK,CACV,aAAa,SAAS,CAAC,MAAM,YAAY,SAAS,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,MAAM,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACvI,CAAC;IAEF,IAAI,WAAW,EAAE,CAAC;QAChB,WAAW,CAAC,SAAS,EAAE,CAAC;IAC1B,CAAC;IACD,qGAAqG;IACrG,aAAa,CAAC,MAAM,GAAG;QACrB,SAAS,EAAE,IAAI;KAChB,CAAC;IACF,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,GAAG,CAClC,SAAS,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE;QACxB,MAAM,mBAAmB,GAAG,6BAA6B,CAAC,OAAO,EAAE,CAAC,CAAC,QAAQ,
CAAC,CAAC;QAC/E,MAAM,IAAI,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,mBAAmB,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAW,IAAI,CAAC,MAAM,CAAC;QACnC,IAAI,WAAW,EAAE,CAAC;YAChB,WAAW,CAAC,SAAS,EAAE,CAAC;QAC1B,CAAC;QAED,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,IAAI,MAAM,EAAE,CAAC;YAC1C,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC;QACtD,CAAC;aAAM,CAAC;YACN,OAAO;gBACL,IAAI,EAAE,QAA6B;gBACnC,MAAM,EAAE,CAAC,CAAC,KAAK;gBACf,KAAK,EAAE,MAAM;aACd,CAAC;QACJ,CAAC;IACH,CAAC,CAAC,CACH,CAAC;IACF,gBAAM,CAAC,KAAK,CAAC,aAAa,UAAU,CAAC,MAAM,iBAAiB,CAAC,CAAC;IAC9D,IAAI,WAAW,EAAE,CAAC;QAChB,WAAW,CAAC,IAAI,EAAE,CAAC;IACrB,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAEM,KAAK,UAAU,uBAAuB,CAC3C,SAAoB,EACpB,OAAmC;IAEnC,OAAO,UAAU,CAAC;QAChB,GAAG,OAAO;QACV,OAAO,EAAE,SAAS,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC;QACtD,KAAK,EAAE,SAAS,CAAC,KAAK,IAAI,EAAE;KAC7B,CAAC,CAAC;AACL,CAAC"}
@@ -1,3 +1,5 @@
1
1
  import type { Command } from 'commander';
2
+ export declare function handleEvalDelete(evalId: string, envPath?: string): Promise<void>;
3
+ export declare function handleEvalDeleteAll(): Promise<void>;
2
4
  export declare function deleteCommand(program: Command): void;
3
5
  //# sourceMappingURL=delete.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"delete.d.ts","sourceRoot":"","sources":["../../../src/commands/delete.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AA6BzC,wBAAgB,aAAa,CAAC,OAAO,EAAE,OAAO,QA+C7C"}
1
+ {"version":3,"file":"delete.d.ts","sourceRoot":"","sources":["../../../src/commands/delete.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAOzC,wBAAsB,gBAAgB,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,iBAQtE;AAED,wBAAsB,mBAAmB,kBAUxC;AAED,wBAAgB,aAAa,CAAC,OAAO,EAAE,OAAO,QAgD7C"}
@@ -3,6 +3,8 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
3
3
  return (mod && mod.__esModule) ? mod : { "default": mod };
4
4
  };
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.handleEvalDelete = handleEvalDelete;
7
+ exports.handleEvalDeleteAll = handleEvalDeleteAll;
6
8
  exports.deleteCommand = deleteCommand;
7
9
  const confirm_1 = __importDefault(require("@inquirer/confirm"));
8
10
  const logger_1 = __importDefault(require("../logger"));
@@ -45,6 +47,7 @@ function deleteCommand(program) {
45
47
  return handleEvalDelete(id, cmdObj.envPath);
46
48
  }
47
49
  logger_1.default.error(`No resource found with ID ${id}`);
50
+ process.exitCode = 1;
48
51
  });
49
52
  deleteCommand
50
53
  .command('eval <id>')
@@ -1 +1 @@
1
- {"version":3,"file":"delete.js","sourceRoot":"","sources":["../../../src/commands/delete.ts"],"names":[],"mappings":";;;;;AA8BA,sCA+CC;AA7ED,gEAAwC;AAExC,uDAA+B;AAC/B,0DAAkC;AAClC,6DAAqC;AACrC,kCAAmC;AACnC,+CAA6E;AAE7E,KAAK,UAAU,gBAAgB,CAAC,MAAc,EAAE,OAAgB;IAC9D,IAAI,CAAC;QACH,MAAM,IAAA,qBAAU,EAAC,MAAM,CAAC,CAAC;QACzB,gBAAM,CAAC,IAAI,CAAC,sBAAsB,MAAM,iCAAiC,CAAC,CAAC;IAC7E,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,gBAAM,CAAC,KAAK,CAAC,uCAAuC,MAAM,MAAM,KAAK,EAAE,CAAC,CAAC;QACzE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAED,KAAK,UAAU,mBAAmB;IAChC,MAAM,SAAS,GAAG,MAAM,IAAA,iBAAO,EAAC;QAC9B,OAAO,EACL,uFAAuF;KAC1F,CAAC,CAAC;IACH,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO;IACT,CAAC;IACD,MAAM,IAAA,yBAAc,GAAE,CAAC;IACvB,gBAAM,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC;AACpD,CAAC;AAED,SAAgB,aAAa,CAAC,OAAgB;IAC5C,MAAM,aAAa,GAAG,OAAO;SAC1B,OAAO,CAAC,aAAa,CAAC;SACtB,WAAW,CAAC,0BAA0B,CAAC;SACvC,MAAM,CAAC,+BAA+B,EAAE,mBAAmB,CAAC;SAC5D,MAAM,CAAC,KAAK,EAAE,EAAU,EAAE,MAA4B,EAAE,EAAE;QACzD,IAAA,eAAQ,EAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzB,mBAAS,CAAC,MAAM,CAAC,cAAc,EAAE;YAC/B,IAAI,EAAE,QAAQ;SACf,CAAC,CAAC;QAEH,MAAM,GAAG,GAAG,MAAM,IAAA,wBAAa,EAAC,EAAE,CAAC,CAAC;QACpC,IAAI,GAAG,EAAE,CAAC;YACR,OAAO,gBAAgB,CAAC,EAAE,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;QAC9C,CAAC;QAED,gBAAM,CAAC,KAAK,CAAC,6BAA6B,EAAE,EAAE,CAAC,CAAC;IAClD,CAAC,CAAC,CAAC;IAEL,aAAa;SACV,OAAO,CAAC,WAAW,CAAC;SACpB,WAAW,CACV,oHAAoH,CACrH;SACA,MAAM,CAAC,+BAA+B,EAAE,mBAAmB,CAAC;SAC5D,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE;QAC/B,IAAA,eAAQ,EAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzB,mBAAS,CAAC,MAAM,CAAC,cAAc,EAAE;YAC/B,IAAI,EAAE,aAAa;YACnB,MAAM;SACP,CAAC,CAAC;QACH,MAAM,mBAAS,CAAC,IAAI,EAAE,CAAC;QAEvB,IAAI,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxB,MAAM,aAAa,GAAG,MAAM,cAAI,CAAC,MAAM,EAAE,CAAC;YAC1C,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,gBAAgB,CAAC,aAAa,CAAC,EAAE,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;YAC3D,CAAC;iBAAM,CAAC;gBACN,gBAAM,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;gBAC/B,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAC;YACvB,CAAC;QACH,CAAC;aAAM,IAAI,MAAM,KAAK,KAAK,EAAE,CAAC;YAC5B,MAAM,mBAAmB,EAAE,CA
AC;QAC9B,CAAC;aAAM,CAAC;YACN,MAAM,gBAAgB,CAAC,MAAM,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;QACjD,CAAC;IACH,CAAC,CAAC,CAAC;AACP,CAAC"}
1
+ {"version":3,"file":"delete.js","sourceRoot":"","sources":["../../../src/commands/delete.ts"],"names":[],"mappings":";;;;;AAQA,4CAQC;AAED,kDAUC;AAED,sCAgDC;AA9ED,gEAAwC;AAExC,uDAA+B;AAC/B,0DAAkC;AAClC,6DAAqC;AACrC,kCAAmC;AACnC,+CAA6E;AAEtE,KAAK,UAAU,gBAAgB,CAAC,MAAc,EAAE,OAAgB;IACrE,IAAI,CAAC;QACH,MAAM,IAAA,qBAAU,EAAC,MAAM,CAAC,CAAC;QACzB,gBAAM,CAAC,IAAI,CAAC,sBAAsB,MAAM,iCAAiC,CAAC,CAAC;IAC7E,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,gBAAM,CAAC,KAAK,CAAC,uCAAuC,MAAM,MAAM,KAAK,EAAE,CAAC,CAAC;QACzE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC;AAEM,KAAK,UAAU,mBAAmB;IACvC,MAAM,SAAS,GAAG,MAAM,IAAA,iBAAO,EAAC;QAC9B,OAAO,EACL,uFAAuF;KAC1F,CAAC,CAAC;IACH,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO;IACT,CAAC;IACD,MAAM,IAAA,yBAAc,GAAE,CAAC;IACvB,gBAAM,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC;AACpD,CAAC;AAED,SAAgB,aAAa,CAAC,OAAgB;IAC5C,MAAM,aAAa,GAAG,OAAO;SAC1B,OAAO,CAAC,aAAa,CAAC;SACtB,WAAW,CAAC,0BAA0B,CAAC;SACvC,MAAM,CAAC,+BAA+B,EAAE,mBAAmB,CAAC;SAC5D,MAAM,CAAC,KAAK,EAAE,EAAU,EAAE,MAA4B,EAAE,EAAE;QACzD,IAAA,eAAQ,EAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzB,mBAAS,CAAC,MAAM,CAAC,cAAc,EAAE;YAC/B,IAAI,EAAE,QAAQ;SACf,CAAC,CAAC;QAEH,MAAM,GAAG,GAAG,MAAM,IAAA,wBAAa,EAAC,EAAE,CAAC,CAAC;QACpC,IAAI,GAAG,EAAE,CAAC;YACR,OAAO,gBAAgB,CAAC,EAAE,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;QAC9C,CAAC;QAED,gBAAM,CAAC,KAAK,CAAC,6BAA6B,EAAE,EAAE,CAAC,CAAC;QAChD,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAC;IACvB,CAAC,CAAC,CAAC;IAEL,aAAa;SACV,OAAO,CAAC,WAAW,CAAC;SACpB,WAAW,CACV,oHAAoH,CACrH;SACA,MAAM,CAAC,+BAA+B,EAAE,mBAAmB,CAAC;SAC5D,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE;QAC/B,IAAA,eAAQ,EAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzB,mBAAS,CAAC,MAAM,CAAC,cAAc,EAAE;YAC/B,IAAI,EAAE,aAAa;YACnB,MAAM;SACP,CAAC,CAAC;QACH,MAAM,mBAAS,CAAC,IAAI,EAAE,CAAC;QAEvB,IAAI,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxB,MAAM,aAAa,GAAG,MAAM,cAAI,CAAC,MAAM,EAAE,CAAC;YAC1C,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,gBAAgB,CAAC,aAAa,CAAC,EAAE,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;YAC3D,CAAC;iBAAM,CAAC;gBACN,gBAAM,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;gBAC/B,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAC;YACvB,CAAC;QACH,CAAC;aAA
M,IAAI,MAAM,KAAK,KAAK,EAAE,CAAC;YAC5B,MAAM,mBAAmB,EAAE,CAAC;QAC9B,CAAC;aAAM,CAAC;YACN,MAAM,gBAAgB,CAAC,MAAM,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;QACjD,CAAC;IACH,CAAC,CAAC,CAAC;AACP,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"eval.d.ts","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAczC,OAAO,IAAI,MAAM,gBAAgB,CAAC;AAKlC,OAAO,KAAK,EACV,kBAAkB,EAClB,eAAe,EAEf,SAAS,EACT,UAAU,EACV,aAAa,EACd,MAAM,UAAU,CAAC;AAyBlB,wBAAgB,sCAAsC,CAAC,SAAS,EAAE,SAAS,QAa1E;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,OAAO,CAAC,UAAU,CAAC,GAAG,MAAM,CAwBnE;AAED,wBAAsB,MAAM,CAC1B,MAAM,EAAE,OAAO,CAAC,kBAAkB,GAAG,OAAO,CAAC,EAC7C,aAAa,EAAE,OAAO,CAAC,aAAa,CAAC,EACrC,iBAAiB,EAAE,MAAM,GAAG,SAAS,EACrC,eAAe,EAAE,eAAe,GAC/B,OAAO,CAAC,IAAI,CAAC,CAmjBf;AAED,wBAAgB,WAAW,CACzB,OAAO,EAAE,OAAO,EAChB,aAAa,EAAE,OAAO,CAAC,aAAa,CAAC,EACrC,iBAAiB,EAAE,MAAM,GAAG,SAAS,WAwMtC"}
1
+ {"version":3,"file":"eval.d.ts","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAczC,OAAO,IAAI,MAAM,gBAAgB,CAAC;AAKlC,OAAO,KAAK,EACV,kBAAkB,EAClB,eAAe,EAEf,SAAS,EACT,UAAU,EACV,aAAa,EACd,MAAM,UAAU,CAAC;AAyBlB,wBAAgB,sCAAsC,CAAC,SAAS,EAAE,SAAS,QAa1E;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,OAAO,CAAC,UAAU,CAAC,GAAG,MAAM,CAwBnE;AAED,wBAAsB,MAAM,CAC1B,MAAM,EAAE,OAAO,CAAC,kBAAkB,GAAG,OAAO,CAAC,EAC7C,aAAa,EAAE,OAAO,CAAC,aAAa,CAAC,EACrC,iBAAiB,EAAE,MAAM,GAAG,SAAS,EACrC,eAAe,EAAE,eAAe,GAC/B,OAAO,CAAC,IAAI,CAAC,CAojBf;AAED,wBAAgB,WAAW,CACzB,OAAO,EAAE,OAAO,EAChB,aAAa,EAAE,OAAO,CAAC,aAAa,CAAC,EACrC,iBAAiB,EAAE,MAAM,GAAG,SAAS,WAwMtC"}
@@ -359,7 +359,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
359
359
  else {
360
360
  logger_1.default.info(`» Do you want to share this with your team? Sign up for free at ${chalk_1.default.greenBright.bold('https://promptfoo.app')}`);
361
361
  }
362
- logger_1.default.info(`» This project needs your feedback. What's one thing we can improve? ${chalk_1.default.greenBright.bold('https://forms.gle/YFLgTe1dKJKNSCsU7')}`);
362
+ logger_1.default.info(`» This project needs your feedback. What's one thing we can improve? ${chalk_1.default.greenBright.bold('https://promptfoo.dev/feedback')}`);
363
363
  }
364
364
  }
365
365
  else {
@@ -370,22 +370,13 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
370
370
  const duration = Math.round((Date.now() - startTime) / 1000);
371
371
  const durationDisplay = (0, formatDuration_1.formatDuration)(duration);
372
372
  const isRedteam = Boolean(config.redteam);
373
- logger_1.default.info(chalk_1.default.green.bold(`Successes: ${successes}`));
374
- logger_1.default.info(chalk_1.default.red.bold(`Failures: ${failures}`));
375
- if (!Number.isNaN(errors)) {
376
- logger_1.default.info(chalk_1.default.red.bold(`Errors: ${errors}`));
377
- }
378
- if (!Number.isNaN(passRate)) {
379
- logger_1.default.info(chalk_1.default.blue.bold(`Pass Rate: ${passRate.toFixed(2)}%`));
380
- }
381
- logger_1.default.info(chalk_1.default.blue.bold(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
382
373
  // Handle token usage display
383
374
  if (tokenUsage.total > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0) {
384
375
  const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
385
376
  const evalTokens = {
386
377
  prompt: tokenUsage.prompt || 0,
387
378
  completion: tokenUsage.completion || 0,
388
- total: combinedTotal,
379
+ total: tokenUsage.total || combinedTotal,
389
380
  cached: tokenUsage.cached || 0,
390
381
  completionDetails: tokenUsage.completionDetails || {
391
382
  reasoning: 0,
@@ -393,7 +384,6 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
393
384
  rejectedPrediction: 0,
394
385
  },
395
386
  };
396
- (0, util_2.printBorder)();
397
387
  logger_1.default.info(chalk_1.default.bold('Token Usage Summary:'));
398
388
  if (isRedteam) {
399
389
  logger_1.default.info(` ${chalk_1.default.cyan('Probes:')} ${chalk_1.default.white.bold(tokenUsage.numRequests.toLocaleString())}`);
@@ -412,7 +402,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
412
402
  // Provider breakdown
413
403
  const tracker = tokenUsage_1.TokenUsageTracker.getInstance();
414
404
  const providerIds = tracker.getProviderIds();
415
- if (providerIds.length > 0) {
405
+ if (providerIds.length > 1) {
416
406
  logger_1.default.info(`\n ${chalk_1.default.cyan.bold('Provider Breakdown:')}`);
417
407
  // Sort providers by total token usage (descending)
418
408
  const sortedProviders = providerIds
@@ -460,9 +450,20 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
460
450
  }
461
451
  }
462
452
  // Grand total
463
- const grandTotal = evalTokens.total + tokenUsage.assertions.total;
453
+ const grandTotal = evalTokens.total + (tokenUsage.assertions.total || 0);
464
454
  logger_1.default.info(`\n ${chalk_1.default.blue.bold('Grand Total:')} ${chalk_1.default.white.bold(grandTotal.toLocaleString())} tokens`);
455
+ (0, util_2.printBorder)();
465
456
  }
457
+ logger_1.default.info(chalk_1.default.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
458
+ logger_1.default.info(chalk_1.default.green.bold(`Successes: ${successes}`));
459
+ logger_1.default.info(chalk_1.default.red.bold(`Failures: ${failures}`));
460
+ if (!Number.isNaN(errors)) {
461
+ logger_1.default.info(chalk_1.default.red.bold(`Errors: ${errors}`));
462
+ }
463
+ if (!Number.isNaN(passRate)) {
464
+ logger_1.default.info(chalk_1.default.blue.bold(`Pass Rate: ${passRate.toFixed(2)}%`));
465
+ }
466
+ (0, util_2.printBorder)();
466
467
  telemetry_1.default.record('command_used', {
467
468
  name: 'eval',
468
469
  watch: Boolean(cmdObj.watch),