promptfoo 0.20.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/README.md +1 -1
  2. package/dist/package.json +4 -4
  3. package/dist/src/assertions.d.ts.map +1 -1
  4. package/dist/src/assertions.js +5 -0
  5. package/dist/src/assertions.js.map +1 -1
  6. package/dist/src/evaluator.js +1 -1
  7. package/dist/src/evaluator.js.map +1 -1
  8. package/dist/src/index.d.ts +1 -5
  9. package/dist/src/index.d.ts.map +1 -1
  10. package/dist/src/index.js +1 -1
  11. package/dist/src/index.js.map +1 -1
  12. package/dist/src/matchers.d.ts +3 -2
  13. package/dist/src/matchers.d.ts.map +1 -1
  14. package/dist/src/matchers.js +37 -9
  15. package/dist/src/matchers.js.map +1 -1
  16. package/dist/src/providers/anthropic.d.ts +5 -3
  17. package/dist/src/providers/anthropic.d.ts.map +1 -1
  18. package/dist/src/providers/anthropic.js +8 -10
  19. package/dist/src/providers/anthropic.js.map +1 -1
  20. package/dist/src/providers/azureopenai.d.ts +9 -8
  21. package/dist/src/providers/azureopenai.d.ts.map +1 -1
  22. package/dist/src/providers/azureopenai.js +33 -36
  23. package/dist/src/providers/azureopenai.js.map +1 -1
  24. package/dist/src/providers/openai.d.ts +12 -12
  25. package/dist/src/providers/openai.d.ts.map +1 -1
  26. package/dist/src/providers/openai.js +54 -65
  27. package/dist/src/providers/openai.js.map +1 -1
  28. package/dist/src/providers/replicate.d.ts +4 -2
  29. package/dist/src/providers/replicate.d.ts.map +1 -1
  30. package/dist/src/providers/replicate.js +10 -8
  31. package/dist/src/providers/replicate.js.map +1 -1
  32. package/dist/src/providers/webhook.d.ts +9 -0
  33. package/dist/src/providers/webhook.d.ts.map +1 -0
  34. package/dist/src/providers/webhook.js +54 -0
  35. package/dist/src/providers/webhook.js.map +1 -0
  36. package/dist/src/providers.d.ts +1 -1
  37. package/dist/src/providers.d.ts.map +1 -1
  38. package/dist/src/providers.js +36 -28
  39. package/dist/src/providers.js.map +1 -1
  40. package/dist/src/suggestions.d.ts.map +1 -1
  41. package/dist/src/suggestions.js +1 -3
  42. package/dist/src/suggestions.js.map +1 -1
  43. package/dist/src/types.d.ts +7 -1
  44. package/dist/src/types.d.ts.map +1 -1
  45. package/dist/src/util.js +1 -1
  46. package/dist/src/util.js.map +1 -1
  47. package/dist/src/web/nextui/404/index.html +1 -1
  48. package/dist/src/web/nextui/404.html +1 -1
  49. package/dist/src/web/nextui/_next/static/Bl3o5lF4ON7Fjki46lPhr/_buildManifest.js +1 -0
  50. package/dist/src/web/nextui/_next/static/chunks/226-7bbb6c98a19542fd.js +37 -0
  51. package/dist/src/web/nextui/_next/static/chunks/249-ea9c0f034888ccff.js +125 -0
  52. package/dist/src/web/nextui/_next/static/chunks/339-501c32916b785ef1.js +1 -0
  53. package/dist/src/web/nextui/_next/static/chunks/365-e426ea5bc7e815fc.js +8 -0
  54. package/dist/src/web/nextui/_next/static/chunks/396-0a51429a01e24cdd.js +1 -0
  55. package/dist/src/web/nextui/_next/static/chunks/596-297f7ff4a0436e87.js +25 -0
  56. package/dist/src/web/nextui/_next/static/chunks/613-572c22424de64659.js +1 -0
  57. package/dist/src/web/nextui/_next/static/chunks/706-ae1d3352d28419e9.js +9 -0
  58. package/dist/src/web/nextui/_next/static/chunks/891-7035926a62c1c4e0.js +1 -0
  59. package/dist/src/web/nextui/_next/static/chunks/app/eval/[id]/not-found-366629541fd598e9.js +1 -0
  60. package/dist/src/web/nextui/_next/static/chunks/app/eval/[id]/page-319d2ee38d37574e.js +1 -0
  61. package/dist/src/web/nextui/_next/static/chunks/app/eval/page-a6b1ff91723b7beb.js +1 -0
  62. package/dist/src/web/nextui/_next/static/chunks/app/layout-024c4adc71c9feb0.js +1 -0
  63. package/dist/src/web/nextui/_next/static/chunks/app/page-1ae60660130041b2.js +1 -0
  64. package/dist/src/web/nextui/_next/static/chunks/app/setup/page-6ef16148040bf4f4.js +1 -0
  65. package/dist/src/web/nextui/_next/static/chunks/{ca377847-cb6ae6a6a073aebb.js → ca377847-26b462611379a4f7.js} +3 -3
  66. package/dist/src/web/nextui/_next/static/chunks/{fd9d1056-ac777be631f5a9e9.js → fd9d1056-fba4b53a2f01213b.js} +1 -1
  67. package/dist/src/web/nextui/_next/static/chunks/framework-8883d1e9be70c3da.js +25 -0
  68. package/dist/src/web/nextui/_next/static/chunks/main-8ea85465d428ecfe.js +1 -0
  69. package/dist/src/web/nextui/_next/static/chunks/main-app-581ccf0003955b21.js +1 -0
  70. package/dist/src/web/nextui/_next/static/chunks/pages/_app-52924524f99094ab.js +1 -0
  71. package/dist/src/web/nextui/_next/static/chunks/pages/_error-c92d5c4bb2b49926.js +1 -0
  72. package/dist/src/web/nextui/_next/static/chunks/webpack-55c264ce2fd85eb7.js +1 -0
  73. package/dist/src/web/nextui/_next/static/css/4d399fceacd06992.css +1 -0
  74. package/dist/src/web/nextui/eval/index.html +1 -1
  75. package/dist/src/web/nextui/eval/index.txt +6 -6
  76. package/dist/src/web/nextui/index.html +1 -1
  77. package/dist/src/web/nextui/index.txt +5 -5
  78. package/dist/src/web/nextui/setup/index.html +27 -1
  79. package/dist/src/web/nextui/setup/index.txt +9 -9
  80. package/dist/src/web/server.d.ts.map +1 -1
  81. package/dist/src/web/server.js +9 -5
  82. package/dist/src/web/server.js.map +1 -1
  83. package/package.json +4 -4
  84. package/dist/src/web/nextui/_next/static/US6gOx8LHTX_Hzm9aYNrC/_buildManifest.js +0 -1
  85. package/dist/src/web/nextui/_next/static/chunks/339-4fc8a80fa840e771.js +0 -1
  86. package/dist/src/web/nextui/_next/static/chunks/373-8a280796c0f2d1af.js +0 -1
  87. package/dist/src/web/nextui/_next/static/chunks/583-125d32af505e9bc4.js +0 -1
  88. package/dist/src/web/nextui/_next/static/chunks/596-07e4a23a5c6cdf04.js +0 -25
  89. package/dist/src/web/nextui/_next/static/chunks/658-a62210d07dc4dcb6.js +0 -15
  90. package/dist/src/web/nextui/_next/static/chunks/707-699cbd84b259c37b.js +0 -37
  91. package/dist/src/web/nextui/_next/static/chunks/858-ceb6fa22e614492b.js +0 -125
  92. package/dist/src/web/nextui/_next/static/chunks/891-3000ea7c0a292558.js +0 -1
  93. package/dist/src/web/nextui/_next/static/chunks/app/eval/[id]/not-found-50e40614fa05600e.js +0 -1
  94. package/dist/src/web/nextui/_next/static/chunks/app/eval/[id]/page-c19c44ed1b2dfb58.js +0 -1
  95. package/dist/src/web/nextui/_next/static/chunks/app/eval/page-d4a1813b2f8c4532.js +0 -1
  96. package/dist/src/web/nextui/_next/static/chunks/app/layout-664a8d716d2d24b1.js +0 -1
  97. package/dist/src/web/nextui/_next/static/chunks/app/page-1f8ef6a00a2355f0.js +0 -1
  98. package/dist/src/web/nextui/_next/static/chunks/app/setup/page-182018a3c6397345.js +0 -1
  99. package/dist/src/web/nextui/_next/static/chunks/framework-43665103d101a22d.js +0 -25
  100. package/dist/src/web/nextui/_next/static/chunks/main-50cc0a98559591ce.js +0 -1
  101. package/dist/src/web/nextui/_next/static/chunks/main-app-c9dc13756d166550.js +0 -1
  102. package/dist/src/web/nextui/_next/static/chunks/pages/_app-6b79a29ad0d63b21.js +0 -1
  103. package/dist/src/web/nextui/_next/static/chunks/pages/_error-9aeb3e4d490fe4b8.js +0 -1
  104. package/dist/src/web/nextui/_next/static/chunks/webpack-6e474e42be502dd7.js +0 -1
  105. package/dist/src/web/nextui/_next/static/css/a35c840ac696f161.css +0 -1
  106. package/dist/src/web/nextui/api +0 -1
  107. package/src/__mocks__/esm.ts +0 -3
  108. package/src/assertions.ts +0 -580
  109. package/src/cache.ts +0 -109
  110. package/src/esm.ts +0 -13
  111. package/src/evaluator.ts +0 -500
  112. package/src/index.ts +0 -52
  113. package/src/logger.ts +0 -46
  114. package/src/main.ts +0 -442
  115. package/src/matchers.ts +0 -120
  116. package/src/onboarding.ts +0 -69
  117. package/src/prompts.ts +0 -39
  118. package/src/providers/anthropic.ts +0 -88
  119. package/src/providers/azureopenai.ts +0 -299
  120. package/src/providers/llama.ts +0 -95
  121. package/src/providers/localai.ts +0 -111
  122. package/src/providers/ollama.ts +0 -89
  123. package/src/providers/openai.ts +0 -337
  124. package/src/providers/replicate.ts +0 -99
  125. package/src/providers/scriptCompletion.ts +0 -35
  126. package/src/providers/shared.ts +0 -34
  127. package/src/providers.ts +0 -192
  128. package/src/share.ts +0 -27
  129. package/src/suggestions.ts +0 -63
  130. package/src/table.ts +0 -43
  131. package/src/tableOutput.html +0 -52
  132. package/src/telemetry.ts +0 -70
  133. package/src/types.ts +0 -299
  134. package/src/updates.ts +0 -46
  135. package/src/util.ts +0 -543
  136. package/src/web/nextui/.eslintrc.json +0 -3
  137. package/src/web/nextui/next.config.js +0 -14
  138. package/src/web/nextui/package-lock.json +0 -4644
  139. package/src/web/nextui/package.json +0 -47
  140. package/src/web/nextui/public/favicon.ico +0 -0
  141. package/src/web/nextui/public/logo.svg +0 -30
  142. package/src/web/nextui/src/app/Home.css +0 -3
  143. package/src/web/nextui/src/app/api/route.ts +0 -6
  144. package/src/web/nextui/src/app/components/DarkMode.css +0 -22
  145. package/src/web/nextui/src/app/components/DarkMode.tsx +0 -17
  146. package/src/web/nextui/src/app/components/Logo.css +0 -32
  147. package/src/web/nextui/src/app/components/Logo.tsx +0 -11
  148. package/src/web/nextui/src/app/components/PageShell.css +0 -33
  149. package/src/web/nextui/src/app/components/PageShell.tsx +0 -87
  150. package/src/web/nextui/src/app/eval/ConfigModal.tsx +0 -84
  151. package/src/web/nextui/src/app/eval/Eval.css +0 -13
  152. package/src/web/nextui/src/app/eval/Eval.tsx +0 -79
  153. package/src/web/nextui/src/app/eval/EvalOutputPromptDialog.tsx +0 -127
  154. package/src/web/nextui/src/app/eval/ResultsCharts.tsx +0 -355
  155. package/src/web/nextui/src/app/eval/ResultsTable.css +0 -179
  156. package/src/web/nextui/src/app/eval/ResultsTable.tsx +0 -503
  157. package/src/web/nextui/src/app/eval/ResultsView.tsx +0 -301
  158. package/src/web/nextui/src/app/eval/ShareModal.tsx +0 -70
  159. package/src/web/nextui/src/app/eval/[id]/not-found.tsx +0 -5
  160. package/src/web/nextui/src/app/eval/[id]/page.css +0 -9
  161. package/src/web/nextui/src/app/eval/[id]/page.tsx +0 -20
  162. package/src/web/nextui/src/app/eval/index.css +0 -0
  163. package/src/web/nextui/src/app/eval/page.tsx +0 -8
  164. package/src/web/nextui/src/app/eval/store.ts +0 -18
  165. package/src/web/nextui/src/app/eval/types.ts +0 -20
  166. package/src/web/nextui/src/app/globals.css +0 -58
  167. package/src/web/nextui/src/app/layout.tsx +0 -25
  168. package/src/web/nextui/src/app/page.tsx +0 -7
  169. package/src/web/nextui/src/app/setup/AssertsForm.tsx +0 -118
  170. package/src/web/nextui/src/app/setup/PromptDialog.tsx +0 -77
  171. package/src/web/nextui/src/app/setup/PromptsSection.tsx +0 -190
  172. package/src/web/nextui/src/app/setup/ProviderConfigDialog.tsx +0 -99
  173. package/src/web/nextui/src/app/setup/ProviderSelector.tsx +0 -149
  174. package/src/web/nextui/src/app/setup/RunTestSuiteButton.tsx +0 -88
  175. package/src/web/nextui/src/app/setup/TestCaseDialog.tsx +0 -108
  176. package/src/web/nextui/src/app/setup/TestCasesSection.tsx +0 -154
  177. package/src/web/nextui/src/app/setup/VarsForm.tsx +0 -57
  178. package/src/web/nextui/src/app/setup/page.css +0 -3
  179. package/src/web/nextui/src/app/setup/page.tsx +0 -160
  180. package/src/web/nextui/src/util/api.ts +0 -1
  181. package/src/web/nextui/src/util/store.ts +0 -53
  182. package/src/web/nextui/tsconfig.json +0 -28
  183. package/src/web/server.ts +0 -151
  184. /package/dist/src/web/nextui/_next/static/{US6gOx8LHTX_Hzm9aYNrC → Bl3o5lF4ON7Fjki46lPhr}/_ssgManifest.js +0 -0
package/src/evaluator.ts DELETED
@@ -1,500 +0,0 @@
1
- import readline from 'readline';
2
-
3
- import async from 'async';
4
- import chalk from 'chalk';
5
- import invariant from 'tiny-invariant';
6
-
7
- import logger from './logger';
8
- import telemetry from './telemetry';
9
- import { runAssertions } from './assertions';
10
- import { generatePrompts } from './suggestions';
11
- import { getNunjucksEngine } from './util';
12
-
13
- import type { SingleBar } from 'cli-progress';
14
- import type {
15
- ApiProvider,
16
- EvaluateOptions,
17
- EvaluateResult,
18
- EvaluateStats,
19
- EvaluateSummary,
20
- EvaluateTable,
21
- TestSuite,
22
- Prompt,
23
- TestCase,
24
- AtomicTestCase,
25
- } from './types';
26
-
27
- interface RunEvalOptions {
28
- provider: ApiProvider;
29
- prompt: Prompt;
30
-
31
- test: AtomicTestCase;
32
-
33
- includeProviderId?: boolean;
34
-
35
- rowIndex: number;
36
- colIndex: number;
37
- repeatIndex: number;
38
- }
39
-
40
- const DEFAULT_MAX_CONCURRENCY = 4;
41
-
42
- const nunjucks = getNunjucksEngine();
43
-
44
- function generateVarCombinations(
45
- vars: Record<string, string | string[] | any>,
46
- ): Record<string, string | any[]>[] {
47
- const keys = Object.keys(vars);
48
- const combinations: Record<string, string | any[]>[] = [{}];
49
-
50
- for (const key of keys) {
51
- let values: any[] = Array.isArray(vars[key]) ? vars[key] : [vars[key]];
52
-
53
- // Check if it's an array but not a string array
54
- if (Array.isArray(vars[key]) && typeof vars[key][0] !== 'string') {
55
- values = [vars[key]];
56
- }
57
-
58
- const newCombinations: Record<string, any>[] = [];
59
-
60
- for (const combination of combinations) {
61
- for (const value of values) {
62
- newCombinations.push({ ...combination, [key]: value });
63
- }
64
- }
65
-
66
- combinations.length = 0;
67
- combinations.push(...newCombinations);
68
- }
69
-
70
- return combinations;
71
- }
72
-
73
- class Evaluator {
74
- testSuite: TestSuite;
75
- options: EvaluateOptions;
76
- stats: EvaluateStats;
77
-
78
- constructor(testSuite: TestSuite, options: EvaluateOptions) {
79
- this.testSuite = testSuite;
80
- this.options = options;
81
- this.stats = {
82
- successes: 0,
83
- failures: 0,
84
- tokenUsage: {
85
- total: 0,
86
- prompt: 0,
87
- completion: 0,
88
- cached: 0,
89
- },
90
- };
91
- }
92
-
93
- async runEval({
94
- provider,
95
- prompt,
96
- test,
97
- includeProviderId,
98
- }: RunEvalOptions): Promise<EvaluateResult> {
99
- const vars = test.vars || {};
100
- const renderedPrompt = nunjucks.renderString(prompt.raw, vars);
101
-
102
- // Note that we're using original prompt, not renderedPrompt
103
- let promptDisplay = prompt.display;
104
- if (includeProviderId) {
105
- promptDisplay = `[${provider.id()}] ${promptDisplay}`;
106
- }
107
-
108
- const setup = {
109
- prompt: {
110
- raw: renderedPrompt,
111
- display: promptDisplay,
112
- },
113
- vars,
114
- };
115
-
116
- let latencyMs = 0;
117
- try {
118
- const startTime = Date.now();
119
- const response = await provider.callApi(renderedPrompt);
120
- const endTime = Date.now();
121
- latencyMs = endTime - startTime;
122
-
123
- const ret: EvaluateResult = {
124
- ...setup,
125
- response,
126
- success: false,
127
- score: 0,
128
- latencyMs,
129
- };
130
- if (response.error) {
131
- ret.error = response.error;
132
- } else if (response.output) {
133
- // Create a copy of response so we can potentially mutate it.
134
- let processedResponse = { ...response };
135
- if (test.options?.postprocess) {
136
- const { postprocess } = test.options;
137
- const postprocessFn = new Function(
138
- 'output',
139
- 'context',
140
- postprocess.includes('\n') ? postprocess : `return ${postprocess}`,
141
- );
142
- processedResponse.output = postprocessFn(processedResponse.output);
143
- if (processedResponse.output == null) {
144
- throw new Error('Postprocess function did not return a value');
145
- }
146
- }
147
-
148
- invariant(processedResponse.output != null, 'Response output should not be null');
149
- const checkResult = await runAssertions(test, processedResponse.output);
150
- if (!checkResult.pass) {
151
- ret.error = checkResult.reason;
152
- }
153
- ret.success = checkResult.pass;
154
- ret.score = checkResult.score;
155
- if (checkResult.tokensUsed) {
156
- this.stats.tokenUsage.total += checkResult.tokensUsed.total;
157
- this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
158
- this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
159
- }
160
- ret.response = processedResponse;
161
- ret.gradingResult = checkResult;
162
- } else {
163
- ret.success = false;
164
- ret.score = 0;
165
- ret.error = 'No output';
166
- }
167
-
168
- // Update token usage stats
169
- if (response.tokenUsage) {
170
- this.stats.tokenUsage.total += response.tokenUsage.total || 0;
171
- this.stats.tokenUsage.prompt += response.tokenUsage.prompt || 0;
172
- this.stats.tokenUsage.completion += response.tokenUsage.completion || 0;
173
- this.stats.tokenUsage.cached += response.tokenUsage.cached || 0;
174
- }
175
-
176
- if (ret.success) {
177
- this.stats.successes++;
178
- } else {
179
- this.stats.failures++;
180
- }
181
-
182
- return ret;
183
- } catch (err) {
184
- return {
185
- ...setup,
186
- error: String(err) + '\n\n' + (err as Error).stack,
187
- success: false,
188
- score: 0,
189
- latencyMs,
190
- };
191
- }
192
- }
193
-
194
- async evaluate(): Promise<EvaluateSummary> {
195
- const { testSuite, options } = this;
196
- const prompts: Prompt[] = [];
197
-
198
- if (options.generateSuggestions) {
199
- // TODO(ian): Move this into its own command/file
200
- logger.info(`Generating prompt variations...`);
201
- const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
202
- if (error || !newPrompts) {
203
- throw new Error(`Failed to generate prompts: ${error}`);
204
- }
205
-
206
- logger.info(chalk.blue('Generated prompts:'));
207
- let numAdded = 0;
208
- for (const prompt of newPrompts) {
209
- logger.info('--------------------------------------------------------');
210
- logger.info(`${prompt}`);
211
- logger.info('--------------------------------------------------------');
212
-
213
- // Ask the user if they want to continue
214
- await new Promise((resolve) => {
215
- const rl = readline.createInterface({
216
- input: process.stdin,
217
- output: process.stdout,
218
- });
219
- rl.question(
220
- `${chalk.blue('Do you want to test this prompt?')} (y/N): `,
221
- async (answer) => {
222
- rl.close();
223
- if (answer.toLowerCase().startsWith('y')) {
224
- testSuite.prompts.push({ raw: prompt, display: prompt });
225
- numAdded++;
226
- } else {
227
- logger.info('Skipping this prompt.');
228
- }
229
- resolve(true);
230
- },
231
- );
232
- });
233
- }
234
-
235
- if (numAdded < 1) {
236
- logger.info(chalk.red('No prompts selected. Aborting.'));
237
- process.exit(1);
238
- }
239
- }
240
-
241
- // Split prompts by provider
242
- for (const prompt of testSuite.prompts) {
243
- for (const provider of testSuite.providers) {
244
- // Check if providerPromptMap exists and if it contains the current prompt's display
245
- if (testSuite.providerPromptMap) {
246
- const allowedPrompts = testSuite.providerPromptMap[provider.id()];
247
- if (allowedPrompts && !allowedPrompts.includes(prompt.display)) {
248
- continue;
249
- }
250
- }
251
- const updatedDisplay =
252
- testSuite.providers.length > 1 ? `[${provider.id()}] ${prompt.display}` : prompt.display;
253
- prompts.push({
254
- ...prompt,
255
- display: updatedDisplay,
256
- });
257
- }
258
- }
259
-
260
- // Aggregate all vars across test cases
261
- let tests = (
262
- testSuite.tests && testSuite.tests.length > 0
263
- ? testSuite.tests
264
- : testSuite.scenarios
265
- ? []
266
- : [
267
- {
268
- // Dummy test for cases when we're only comparing raw prompts.
269
- },
270
- ]
271
- ).map((test) => {
272
- const finalTestCase: TestCase = Object.assign({}, testSuite.defaultTest);
273
- return Object.assign(finalTestCase, test);
274
- });
275
-
276
- // Build scenarios and add to tests
277
- if (testSuite.scenarios && testSuite.scenarios.length > 0) {
278
- for (const scenario of testSuite.scenarios) {
279
- for (const data of scenario.config) {
280
- // Merge defaultTest with scenario config
281
- const scenarioTests = (
282
- scenario.tests || [
283
- {
284
- // Dummy test for cases when we're only comparing raw prompts.
285
- },
286
- ]
287
- ).map((test) => {
288
- return {
289
- ...testSuite.defaultTest,
290
- ...data,
291
- ...test,
292
- vars: {
293
- ...testSuite.defaultTest?.vars,
294
- ...data.vars,
295
- ...test.vars,
296
- },
297
- options: {
298
- ...testSuite.defaultTest?.options,
299
- ...test.options,
300
- },
301
- };
302
- });
303
- // Add scenario tests to tests
304
- tests = tests.concat(scenarioTests);
305
- }
306
- }
307
- }
308
-
309
- const varNames: Set<string> = new Set();
310
- const varsWithSpecialColsRemoved: Record<string, string | string[] | object>[] = [];
311
- for (const testCase of tests) {
312
- if (testCase.vars) {
313
- const varWithSpecialColsRemoved: Record<string, string | string[] | object> = {};
314
- for (const varName of Object.keys(testCase.vars)) {
315
- varNames.add(varName);
316
- varWithSpecialColsRemoved[varName] = testCase.vars[varName];
317
- }
318
- varsWithSpecialColsRemoved.push(varWithSpecialColsRemoved);
319
- }
320
- }
321
-
322
- // Set up eval cases
323
- const runEvalOptions: RunEvalOptions[] = [];
324
- let totalVarCombinations = 0;
325
- let rowIndex = 0;
326
- for (const testCase of tests) {
327
- // Handle default properties
328
- testCase.vars = Object.assign({}, testSuite.defaultTest?.vars, testCase.vars);
329
- testCase.assert = [...(testSuite.defaultTest?.assert || []), ...(testCase.assert || [])];
330
- testCase.options = testCase.options || {};
331
- testCase.options.provider =
332
- testCase.options.provider || testSuite.defaultTest?.options?.provider;
333
- const prependToPrompt =
334
- testCase.options?.prefix || testSuite.defaultTest?.options?.prefix || '';
335
- const appendToPrompt =
336
- testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
337
- testCase.options.postprocess =
338
- testCase.options.postprocess || testSuite.defaultTest?.options?.postprocess;
339
-
340
- // Finalize test case eval
341
- const varCombinations = generateVarCombinations(testCase.vars || {});
342
- totalVarCombinations += varCombinations.length;
343
-
344
- const numRepeat = this.options.repeat || 1;
345
- for (let repeatIndex = 0; repeatIndex < numRepeat; repeatIndex++) {
346
- for (const vars of varCombinations) {
347
- let colIndex = 0;
348
- for (const prompt of testSuite.prompts) {
349
- for (const provider of testSuite.providers) {
350
- if (testSuite.providerPromptMap) {
351
- const allowedPrompts = testSuite.providerPromptMap[provider.id()];
352
- if (allowedPrompts && !allowedPrompts.includes(prompt.display)) {
353
- // This prompt should not be used with this provider.
354
- continue;
355
- }
356
- }
357
- runEvalOptions.push({
358
- provider,
359
- prompt: {
360
- ...prompt,
361
- raw: prependToPrompt + prompt.raw + appendToPrompt,
362
- },
363
- test: { ...testCase, vars, options: testCase.options },
364
- includeProviderId: testSuite.providers.length > 1,
365
- rowIndex,
366
- colIndex,
367
- repeatIndex,
368
- });
369
- colIndex++;
370
- }
371
- }
372
- rowIndex++;
373
- }
374
- }
375
- }
376
-
377
- // Set up table...
378
- const isTest = tests.some((t) => !!t.assert);
379
-
380
- const table: EvaluateTable = {
381
- head: {
382
- prompts,
383
- vars: Array.from(varNames).sort(),
384
- // TODO(ian): add assertions to table?
385
- },
386
- body: [],
387
- };
388
-
389
- // Set up progress bar...
390
- let progressbar: SingleBar | undefined;
391
- if (options.showProgressBar) {
392
- const cliProgress = await import('cli-progress');
393
- progressbar = new cliProgress.SingleBar(
394
- {
395
- format:
396
- 'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
397
- },
398
- cliProgress.Presets.shades_classic,
399
- );
400
- progressbar.start(runEvalOptions.length, 0, {
401
- provider: '',
402
- prompt: '',
403
- vars: '',
404
- });
405
- }
406
- if (options.progressCallback) {
407
- options.progressCallback(0, runEvalOptions.length);
408
- }
409
-
410
- // Actually run the eval
411
- const results: EvaluateResult[] = [];
412
- await async.forEachOfLimit(
413
- runEvalOptions,
414
- options.maxConcurrency || DEFAULT_MAX_CONCURRENCY,
415
- async (evalStep: RunEvalOptions, index: number | string) => {
416
- const row = await this.runEval(evalStep);
417
-
418
- results.push(row);
419
-
420
- if (progressbar) {
421
- progressbar.increment({
422
- provider: evalStep.provider.id(),
423
- prompt: evalStep.prompt.raw.slice(0, 10).replace(/\n/g, ' '),
424
- vars: Object.entries(evalStep.test.vars || {})
425
- .map(([k, v]) => `${k}=${v}`)
426
- .join(' ')
427
- .slice(0, 10)
428
- .replace(/\n/g, ' '),
429
- });
430
- }
431
- if (options.progressCallback) {
432
- options.progressCallback(results.length, runEvalOptions.length);
433
- }
434
-
435
- // Bookkeeping for table
436
- if (typeof index !== 'number') {
437
- throw new Error('Expected index to be a number');
438
- }
439
-
440
- let resultText: string | undefined;
441
- if (isTest) {
442
- if (row.success) {
443
- resultText = `${row.response?.output || row.error || ''}`;
444
- } else {
445
- resultText = `${row.error}\n---\n${row.response?.output || row.error || ''}`;
446
- }
447
- } else if (row.error) {
448
- resultText = `${row.error}`;
449
- } else {
450
- resultText = row.response?.output || row.error || '';
451
- }
452
-
453
- const { rowIndex, colIndex } = evalStep;
454
- if (!table.body[rowIndex]) {
455
- table.body[rowIndex] = {
456
- outputs: [],
457
- vars: table.head.vars
458
- .map((varName) => {
459
- const varValue = evalStep.test.vars?.[varName] || '';
460
- if (typeof varValue === 'string') {
461
- return varValue;
462
- }
463
- if (Array.isArray(varValue)) {
464
- // Only flatten string arrays
465
- return typeof varValue[0] === 'string' ? varValue : JSON.stringify(varValue);
466
- }
467
- return JSON.stringify(varValue);
468
- })
469
- .flat(),
470
- };
471
- }
472
- table.body[rowIndex].outputs[colIndex] = {
473
- pass: row.success,
474
- score: row.score,
475
- text: resultText,
476
- prompt: row.prompt.raw,
477
- latencyMs: row.latencyMs,
478
- tokenUsage: row.response?.tokenUsage,
479
- gradingResult: row.gradingResult,
480
- };
481
- },
482
- );
483
-
484
- if (progressbar) {
485
- progressbar.stop();
486
- }
487
- if (options.progressCallback) {
488
- options.progressCallback(runEvalOptions.length, runEvalOptions.length);
489
- }
490
-
491
- telemetry.record('eval_ran', {});
492
-
493
- return { version: 2, results, stats: this.stats, table };
494
- }
495
- }
496
-
497
- export function evaluate(testSuite: TestSuite, options: EvaluateOptions) {
498
- const ev = new Evaluator(testSuite, options);
499
- return ev.evaluate();
500
- }
package/src/index.ts DELETED
@@ -1,52 +0,0 @@
1
- import assertions from './assertions';
2
- import providers from './providers';
3
- import telemetry from './telemetry';
4
- import { evaluate as doEvaluate } from './evaluator';
5
- import { loadApiProviders } from './providers';
6
- import { readTests, writeLatestResults, writeOutput } from './util';
7
- import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
8
-
9
- export * from './types';
10
-
11
- export { generateTable } from './table';
12
-
13
- interface EvaluateTestSuite extends TestSuiteConfig {
14
- prompts: string[];
15
- writeLatestResults?: boolean;
16
- }
17
-
18
- async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions = {}) {
19
- const constructedTestSuite: TestSuite = {
20
- ...testSuite,
21
- providers: await loadApiProviders(testSuite.providers),
22
- tests: await readTests(testSuite.tests),
23
-
24
- // Full prompts expected (not filepaths)
25
- prompts: testSuite.prompts.map((promptContent) => ({
26
- raw: promptContent,
27
- display: promptContent,
28
- })),
29
- };
30
- telemetry.maybeShowNotice();
31
-
32
- const ret = await doEvaluate(constructedTestSuite, options);
33
-
34
- if (testSuite.outputPath) {
35
- writeOutput(testSuite.outputPath, ret, testSuite, null);
36
- }
37
-
38
- if (testSuite.writeLatestResults) {
39
- writeLatestResults(ret, {});
40
- }
41
-
42
- await telemetry.send();
43
- return ret;
44
- }
45
-
46
- export { evaluate, assertions, providers };
47
-
48
- export default {
49
- evaluate,
50
- assertions,
51
- providers,
52
- };
package/src/logger.ts DELETED
@@ -1,46 +0,0 @@
1
- import chalk from 'chalk';
2
- import winston from 'winston';
3
-
4
- export const LOG_LEVELS = {
5
- error: 0,
6
- warn: 1,
7
- info: 2,
8
- debug: 3,
9
- };
10
-
11
- const customFormatter = winston.format.printf(({ level, message, ...args }) => {
12
- if (level === 'error') {
13
- return chalk.red(message);
14
- } else if (level === 'warn') {
15
- return chalk.yellow(message);
16
- } else if (level === 'info') {
17
- return message;
18
- } else if (level === 'debug') {
19
- return chalk.cyan(message);
20
- }
21
- throw new Error(`Invalid log level: ${level}`);
22
- });
23
-
24
- const logger = winston.createLogger({
25
- levels: LOG_LEVELS,
26
- format: winston.format.combine(winston.format.simple(), customFormatter),
27
- transports: [
28
- new winston.transports.Console({
29
- level: process.env.LOG_LEVEL || 'info',
30
- }),
31
- ],
32
- });
33
-
34
- export function getLogLevel() {
35
- return logger.transports[0].level;
36
- }
37
-
38
- export function setLogLevel(level: keyof typeof LOG_LEVELS) {
39
- if (LOG_LEVELS.hasOwnProperty(level)) {
40
- logger.transports[0].level = level;
41
- } else {
42
- throw new Error(`Invalid log level: ${level}`);
43
- }
44
- }
45
-
46
- export default logger;