promptfoo 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/package.json +4 -4
- package/dist/src/assertions.d.ts.map +1 -1
- package/dist/src/assertions.js +5 -0
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/evaluator.js +1 -1
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/index.d.ts +1 -5
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +1 -1
- package/dist/src/index.js.map +1 -1
- package/dist/src/matchers.d.ts +3 -2
- package/dist/src/matchers.d.ts.map +1 -1
- package/dist/src/matchers.js +37 -9
- package/dist/src/matchers.js.map +1 -1
- package/dist/src/providers/anthropic.d.ts +5 -3
- package/dist/src/providers/anthropic.d.ts.map +1 -1
- package/dist/src/providers/anthropic.js +8 -10
- package/dist/src/providers/anthropic.js.map +1 -1
- package/dist/src/providers/azureopenai.d.ts +9 -8
- package/dist/src/providers/azureopenai.d.ts.map +1 -1
- package/dist/src/providers/azureopenai.js +33 -36
- package/dist/src/providers/azureopenai.js.map +1 -1
- package/dist/src/providers/openai.d.ts +12 -12
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +54 -65
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers/replicate.d.ts +4 -2
- package/dist/src/providers/replicate.d.ts.map +1 -1
- package/dist/src/providers/replicate.js +10 -8
- package/dist/src/providers/replicate.js.map +1 -1
- package/dist/src/providers/webhook.d.ts +9 -0
- package/dist/src/providers/webhook.d.ts.map +1 -0
- package/dist/src/providers/webhook.js +54 -0
- package/dist/src/providers/webhook.js.map +1 -0
- package/dist/src/providers.d.ts +1 -1
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +36 -28
- package/dist/src/providers.js.map +1 -1
- package/dist/src/suggestions.d.ts.map +1 -1
- package/dist/src/suggestions.js +1 -3
- package/dist/src/suggestions.js.map +1 -1
- package/dist/src/types.d.ts +7 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.js +1 -1
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/nextui/404/index.html +1 -1
- package/dist/src/web/nextui/404.html +1 -1
- package/dist/src/web/nextui/_next/static/Bl3o5lF4ON7Fjki46lPhr/_buildManifest.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/226-7bbb6c98a19542fd.js +37 -0
- package/dist/src/web/nextui/_next/static/chunks/249-ea9c0f034888ccff.js +125 -0
- package/dist/src/web/nextui/_next/static/chunks/339-501c32916b785ef1.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/365-e426ea5bc7e815fc.js +8 -0
- package/dist/src/web/nextui/_next/static/chunks/396-0a51429a01e24cdd.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/596-297f7ff4a0436e87.js +25 -0
- package/dist/src/web/nextui/_next/static/chunks/613-572c22424de64659.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/706-ae1d3352d28419e9.js +9 -0
- package/dist/src/web/nextui/_next/static/chunks/891-7035926a62c1c4e0.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/eval/[id]/not-found-366629541fd598e9.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/eval/[id]/page-319d2ee38d37574e.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/eval/page-a6b1ff91723b7beb.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/layout-024c4adc71c9feb0.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/page-1ae60660130041b2.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/setup/page-6ef16148040bf4f4.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/{ca377847-cb6ae6a6a073aebb.js → ca377847-26b462611379a4f7.js} +3 -3
- package/dist/src/web/nextui/_next/static/chunks/{fd9d1056-ac777be631f5a9e9.js → fd9d1056-fba4b53a2f01213b.js} +1 -1
- package/dist/src/web/nextui/_next/static/chunks/framework-8883d1e9be70c3da.js +25 -0
- package/dist/src/web/nextui/_next/static/chunks/main-8ea85465d428ecfe.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/main-app-581ccf0003955b21.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/pages/_app-52924524f99094ab.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/pages/_error-c92d5c4bb2b49926.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/webpack-55c264ce2fd85eb7.js +1 -0
- package/dist/src/web/nextui/_next/static/css/4d399fceacd06992.css +1 -0
- package/dist/src/web/nextui/eval/index.html +1 -1
- package/dist/src/web/nextui/eval/index.txt +6 -6
- package/dist/src/web/nextui/index.html +1 -1
- package/dist/src/web/nextui/index.txt +5 -5
- package/dist/src/web/nextui/setup/index.html +27 -1
- package/dist/src/web/nextui/setup/index.txt +9 -9
- package/dist/src/web/server.d.ts.map +1 -1
- package/dist/src/web/server.js +9 -5
- package/dist/src/web/server.js.map +1 -1
- package/package.json +4 -4
- package/dist/src/web/nextui/_next/static/US6gOx8LHTX_Hzm9aYNrC/_buildManifest.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/339-4fc8a80fa840e771.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/373-8a280796c0f2d1af.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/583-125d32af505e9bc4.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/596-07e4a23a5c6cdf04.js +0 -25
- package/dist/src/web/nextui/_next/static/chunks/658-a62210d07dc4dcb6.js +0 -15
- package/dist/src/web/nextui/_next/static/chunks/707-699cbd84b259c37b.js +0 -37
- package/dist/src/web/nextui/_next/static/chunks/858-ceb6fa22e614492b.js +0 -125
- package/dist/src/web/nextui/_next/static/chunks/891-3000ea7c0a292558.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/eval/[id]/not-found-50e40614fa05600e.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/eval/[id]/page-c19c44ed1b2dfb58.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/eval/page-d4a1813b2f8c4532.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/layout-664a8d716d2d24b1.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/page-1f8ef6a00a2355f0.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/setup/page-182018a3c6397345.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/framework-43665103d101a22d.js +0 -25
- package/dist/src/web/nextui/_next/static/chunks/main-50cc0a98559591ce.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/main-app-c9dc13756d166550.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/pages/_app-6b79a29ad0d63b21.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/pages/_error-9aeb3e4d490fe4b8.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/webpack-6e474e42be502dd7.js +0 -1
- package/dist/src/web/nextui/_next/static/css/a35c840ac696f161.css +0 -1
- package/dist/src/web/nextui/api +0 -1
- package/src/__mocks__/esm.ts +0 -3
- package/src/assertions.ts +0 -580
- package/src/cache.ts +0 -109
- package/src/esm.ts +0 -13
- package/src/evaluator.ts +0 -500
- package/src/index.ts +0 -52
- package/src/logger.ts +0 -46
- package/src/main.ts +0 -442
- package/src/matchers.ts +0 -120
- package/src/onboarding.ts +0 -69
- package/src/prompts.ts +0 -39
- package/src/providers/anthropic.ts +0 -88
- package/src/providers/azureopenai.ts +0 -299
- package/src/providers/llama.ts +0 -95
- package/src/providers/localai.ts +0 -111
- package/src/providers/ollama.ts +0 -89
- package/src/providers/openai.ts +0 -337
- package/src/providers/replicate.ts +0 -99
- package/src/providers/scriptCompletion.ts +0 -35
- package/src/providers/shared.ts +0 -34
- package/src/providers.ts +0 -192
- package/src/share.ts +0 -27
- package/src/suggestions.ts +0 -63
- package/src/table.ts +0 -43
- package/src/tableOutput.html +0 -52
- package/src/telemetry.ts +0 -70
- package/src/types.ts +0 -299
- package/src/updates.ts +0 -46
- package/src/util.ts +0 -543
- package/src/web/nextui/.eslintrc.json +0 -3
- package/src/web/nextui/next.config.js +0 -14
- package/src/web/nextui/package-lock.json +0 -4644
- package/src/web/nextui/package.json +0 -47
- package/src/web/nextui/public/favicon.ico +0 -0
- package/src/web/nextui/public/logo.svg +0 -30
- package/src/web/nextui/src/app/Home.css +0 -3
- package/src/web/nextui/src/app/api/route.ts +0 -6
- package/src/web/nextui/src/app/components/DarkMode.css +0 -22
- package/src/web/nextui/src/app/components/DarkMode.tsx +0 -17
- package/src/web/nextui/src/app/components/Logo.css +0 -32
- package/src/web/nextui/src/app/components/Logo.tsx +0 -11
- package/src/web/nextui/src/app/components/PageShell.css +0 -33
- package/src/web/nextui/src/app/components/PageShell.tsx +0 -87
- package/src/web/nextui/src/app/eval/ConfigModal.tsx +0 -84
- package/src/web/nextui/src/app/eval/Eval.css +0 -13
- package/src/web/nextui/src/app/eval/Eval.tsx +0 -79
- package/src/web/nextui/src/app/eval/EvalOutputPromptDialog.tsx +0 -127
- package/src/web/nextui/src/app/eval/ResultsCharts.tsx +0 -355
- package/src/web/nextui/src/app/eval/ResultsTable.css +0 -179
- package/src/web/nextui/src/app/eval/ResultsTable.tsx +0 -503
- package/src/web/nextui/src/app/eval/ResultsView.tsx +0 -301
- package/src/web/nextui/src/app/eval/ShareModal.tsx +0 -70
- package/src/web/nextui/src/app/eval/[id]/not-found.tsx +0 -5
- package/src/web/nextui/src/app/eval/[id]/page.css +0 -9
- package/src/web/nextui/src/app/eval/[id]/page.tsx +0 -20
- package/src/web/nextui/src/app/eval/index.css +0 -0
- package/src/web/nextui/src/app/eval/page.tsx +0 -8
- package/src/web/nextui/src/app/eval/store.ts +0 -18
- package/src/web/nextui/src/app/eval/types.ts +0 -20
- package/src/web/nextui/src/app/globals.css +0 -58
- package/src/web/nextui/src/app/layout.tsx +0 -25
- package/src/web/nextui/src/app/page.tsx +0 -7
- package/src/web/nextui/src/app/setup/AssertsForm.tsx +0 -118
- package/src/web/nextui/src/app/setup/PromptDialog.tsx +0 -77
- package/src/web/nextui/src/app/setup/PromptsSection.tsx +0 -190
- package/src/web/nextui/src/app/setup/ProviderConfigDialog.tsx +0 -99
- package/src/web/nextui/src/app/setup/ProviderSelector.tsx +0 -149
- package/src/web/nextui/src/app/setup/RunTestSuiteButton.tsx +0 -88
- package/src/web/nextui/src/app/setup/TestCaseDialog.tsx +0 -108
- package/src/web/nextui/src/app/setup/TestCasesSection.tsx +0 -154
- package/src/web/nextui/src/app/setup/VarsForm.tsx +0 -57
- package/src/web/nextui/src/app/setup/page.css +0 -3
- package/src/web/nextui/src/app/setup/page.tsx +0 -160
- package/src/web/nextui/src/util/api.ts +0 -1
- package/src/web/nextui/src/util/store.ts +0 -53
- package/src/web/nextui/tsconfig.json +0 -28
- package/src/web/server.ts +0 -151
- /package/dist/src/web/nextui/_next/static/{US6gOx8LHTX_Hzm9aYNrC → Bl3o5lF4ON7Fjki46lPhr}/_ssgManifest.js +0 -0
package/src/evaluator.ts
DELETED
|
@@ -1,500 +0,0 @@
|
|
|
1
|
-
import readline from 'readline';
|
|
2
|
-
|
|
3
|
-
import async from 'async';
|
|
4
|
-
import chalk from 'chalk';
|
|
5
|
-
import invariant from 'tiny-invariant';
|
|
6
|
-
|
|
7
|
-
import logger from './logger';
|
|
8
|
-
import telemetry from './telemetry';
|
|
9
|
-
import { runAssertions } from './assertions';
|
|
10
|
-
import { generatePrompts } from './suggestions';
|
|
11
|
-
import { getNunjucksEngine } from './util';
|
|
12
|
-
|
|
13
|
-
import type { SingleBar } from 'cli-progress';
|
|
14
|
-
import type {
|
|
15
|
-
ApiProvider,
|
|
16
|
-
EvaluateOptions,
|
|
17
|
-
EvaluateResult,
|
|
18
|
-
EvaluateStats,
|
|
19
|
-
EvaluateSummary,
|
|
20
|
-
EvaluateTable,
|
|
21
|
-
TestSuite,
|
|
22
|
-
Prompt,
|
|
23
|
-
TestCase,
|
|
24
|
-
AtomicTestCase,
|
|
25
|
-
} from './types';
|
|
26
|
-
|
|
27
|
-
interface RunEvalOptions {
|
|
28
|
-
provider: ApiProvider;
|
|
29
|
-
prompt: Prompt;
|
|
30
|
-
|
|
31
|
-
test: AtomicTestCase;
|
|
32
|
-
|
|
33
|
-
includeProviderId?: boolean;
|
|
34
|
-
|
|
35
|
-
rowIndex: number;
|
|
36
|
-
colIndex: number;
|
|
37
|
-
repeatIndex: number;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
41
|
-
|
|
42
|
-
const nunjucks = getNunjucksEngine();
|
|
43
|
-
|
|
44
|
-
function generateVarCombinations(
|
|
45
|
-
vars: Record<string, string | string[] | any>,
|
|
46
|
-
): Record<string, string | any[]>[] {
|
|
47
|
-
const keys = Object.keys(vars);
|
|
48
|
-
const combinations: Record<string, string | any[]>[] = [{}];
|
|
49
|
-
|
|
50
|
-
for (const key of keys) {
|
|
51
|
-
let values: any[] = Array.isArray(vars[key]) ? vars[key] : [vars[key]];
|
|
52
|
-
|
|
53
|
-
// Check if it's an array but not a string array
|
|
54
|
-
if (Array.isArray(vars[key]) && typeof vars[key][0] !== 'string') {
|
|
55
|
-
values = [vars[key]];
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
const newCombinations: Record<string, any>[] = [];
|
|
59
|
-
|
|
60
|
-
for (const combination of combinations) {
|
|
61
|
-
for (const value of values) {
|
|
62
|
-
newCombinations.push({ ...combination, [key]: value });
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
combinations.length = 0;
|
|
67
|
-
combinations.push(...newCombinations);
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
return combinations;
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
class Evaluator {
|
|
74
|
-
testSuite: TestSuite;
|
|
75
|
-
options: EvaluateOptions;
|
|
76
|
-
stats: EvaluateStats;
|
|
77
|
-
|
|
78
|
-
constructor(testSuite: TestSuite, options: EvaluateOptions) {
|
|
79
|
-
this.testSuite = testSuite;
|
|
80
|
-
this.options = options;
|
|
81
|
-
this.stats = {
|
|
82
|
-
successes: 0,
|
|
83
|
-
failures: 0,
|
|
84
|
-
tokenUsage: {
|
|
85
|
-
total: 0,
|
|
86
|
-
prompt: 0,
|
|
87
|
-
completion: 0,
|
|
88
|
-
cached: 0,
|
|
89
|
-
},
|
|
90
|
-
};
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
async runEval({
|
|
94
|
-
provider,
|
|
95
|
-
prompt,
|
|
96
|
-
test,
|
|
97
|
-
includeProviderId,
|
|
98
|
-
}: RunEvalOptions): Promise<EvaluateResult> {
|
|
99
|
-
const vars = test.vars || {};
|
|
100
|
-
const renderedPrompt = nunjucks.renderString(prompt.raw, vars);
|
|
101
|
-
|
|
102
|
-
// Note that we're using original prompt, not renderedPrompt
|
|
103
|
-
let promptDisplay = prompt.display;
|
|
104
|
-
if (includeProviderId) {
|
|
105
|
-
promptDisplay = `[${provider.id()}] ${promptDisplay}`;
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
const setup = {
|
|
109
|
-
prompt: {
|
|
110
|
-
raw: renderedPrompt,
|
|
111
|
-
display: promptDisplay,
|
|
112
|
-
},
|
|
113
|
-
vars,
|
|
114
|
-
};
|
|
115
|
-
|
|
116
|
-
let latencyMs = 0;
|
|
117
|
-
try {
|
|
118
|
-
const startTime = Date.now();
|
|
119
|
-
const response = await provider.callApi(renderedPrompt);
|
|
120
|
-
const endTime = Date.now();
|
|
121
|
-
latencyMs = endTime - startTime;
|
|
122
|
-
|
|
123
|
-
const ret: EvaluateResult = {
|
|
124
|
-
...setup,
|
|
125
|
-
response,
|
|
126
|
-
success: false,
|
|
127
|
-
score: 0,
|
|
128
|
-
latencyMs,
|
|
129
|
-
};
|
|
130
|
-
if (response.error) {
|
|
131
|
-
ret.error = response.error;
|
|
132
|
-
} else if (response.output) {
|
|
133
|
-
// Create a copy of response so we can potentially mutate it.
|
|
134
|
-
let processedResponse = { ...response };
|
|
135
|
-
if (test.options?.postprocess) {
|
|
136
|
-
const { postprocess } = test.options;
|
|
137
|
-
const postprocessFn = new Function(
|
|
138
|
-
'output',
|
|
139
|
-
'context',
|
|
140
|
-
postprocess.includes('\n') ? postprocess : `return ${postprocess}`,
|
|
141
|
-
);
|
|
142
|
-
processedResponse.output = postprocessFn(processedResponse.output);
|
|
143
|
-
if (processedResponse.output == null) {
|
|
144
|
-
throw new Error('Postprocess function did not return a value');
|
|
145
|
-
}
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
invariant(processedResponse.output != null, 'Response output should not be null');
|
|
149
|
-
const checkResult = await runAssertions(test, processedResponse.output);
|
|
150
|
-
if (!checkResult.pass) {
|
|
151
|
-
ret.error = checkResult.reason;
|
|
152
|
-
}
|
|
153
|
-
ret.success = checkResult.pass;
|
|
154
|
-
ret.score = checkResult.score;
|
|
155
|
-
if (checkResult.tokensUsed) {
|
|
156
|
-
this.stats.tokenUsage.total += checkResult.tokensUsed.total;
|
|
157
|
-
this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
|
|
158
|
-
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
|
|
159
|
-
}
|
|
160
|
-
ret.response = processedResponse;
|
|
161
|
-
ret.gradingResult = checkResult;
|
|
162
|
-
} else {
|
|
163
|
-
ret.success = false;
|
|
164
|
-
ret.score = 0;
|
|
165
|
-
ret.error = 'No output';
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
// Update token usage stats
|
|
169
|
-
if (response.tokenUsage) {
|
|
170
|
-
this.stats.tokenUsage.total += response.tokenUsage.total || 0;
|
|
171
|
-
this.stats.tokenUsage.prompt += response.tokenUsage.prompt || 0;
|
|
172
|
-
this.stats.tokenUsage.completion += response.tokenUsage.completion || 0;
|
|
173
|
-
this.stats.tokenUsage.cached += response.tokenUsage.cached || 0;
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
if (ret.success) {
|
|
177
|
-
this.stats.successes++;
|
|
178
|
-
} else {
|
|
179
|
-
this.stats.failures++;
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
return ret;
|
|
183
|
-
} catch (err) {
|
|
184
|
-
return {
|
|
185
|
-
...setup,
|
|
186
|
-
error: String(err) + '\n\n' + (err as Error).stack,
|
|
187
|
-
success: false,
|
|
188
|
-
score: 0,
|
|
189
|
-
latencyMs,
|
|
190
|
-
};
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
async evaluate(): Promise<EvaluateSummary> {
|
|
195
|
-
const { testSuite, options } = this;
|
|
196
|
-
const prompts: Prompt[] = [];
|
|
197
|
-
|
|
198
|
-
if (options.generateSuggestions) {
|
|
199
|
-
// TODO(ian): Move this into its own command/file
|
|
200
|
-
logger.info(`Generating prompt variations...`);
|
|
201
|
-
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
202
|
-
if (error || !newPrompts) {
|
|
203
|
-
throw new Error(`Failed to generate prompts: ${error}`);
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
logger.info(chalk.blue('Generated prompts:'));
|
|
207
|
-
let numAdded = 0;
|
|
208
|
-
for (const prompt of newPrompts) {
|
|
209
|
-
logger.info('--------------------------------------------------------');
|
|
210
|
-
logger.info(`${prompt}`);
|
|
211
|
-
logger.info('--------------------------------------------------------');
|
|
212
|
-
|
|
213
|
-
// Ask the user if they want to continue
|
|
214
|
-
await new Promise((resolve) => {
|
|
215
|
-
const rl = readline.createInterface({
|
|
216
|
-
input: process.stdin,
|
|
217
|
-
output: process.stdout,
|
|
218
|
-
});
|
|
219
|
-
rl.question(
|
|
220
|
-
`${chalk.blue('Do you want to test this prompt?')} (y/N): `,
|
|
221
|
-
async (answer) => {
|
|
222
|
-
rl.close();
|
|
223
|
-
if (answer.toLowerCase().startsWith('y')) {
|
|
224
|
-
testSuite.prompts.push({ raw: prompt, display: prompt });
|
|
225
|
-
numAdded++;
|
|
226
|
-
} else {
|
|
227
|
-
logger.info('Skipping this prompt.');
|
|
228
|
-
}
|
|
229
|
-
resolve(true);
|
|
230
|
-
},
|
|
231
|
-
);
|
|
232
|
-
});
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
if (numAdded < 1) {
|
|
236
|
-
logger.info(chalk.red('No prompts selected. Aborting.'));
|
|
237
|
-
process.exit(1);
|
|
238
|
-
}
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
// Split prompts by provider
|
|
242
|
-
for (const prompt of testSuite.prompts) {
|
|
243
|
-
for (const provider of testSuite.providers) {
|
|
244
|
-
// Check if providerPromptMap exists and if it contains the current prompt's display
|
|
245
|
-
if (testSuite.providerPromptMap) {
|
|
246
|
-
const allowedPrompts = testSuite.providerPromptMap[provider.id()];
|
|
247
|
-
if (allowedPrompts && !allowedPrompts.includes(prompt.display)) {
|
|
248
|
-
continue;
|
|
249
|
-
}
|
|
250
|
-
}
|
|
251
|
-
const updatedDisplay =
|
|
252
|
-
testSuite.providers.length > 1 ? `[${provider.id()}] ${prompt.display}` : prompt.display;
|
|
253
|
-
prompts.push({
|
|
254
|
-
...prompt,
|
|
255
|
-
display: updatedDisplay,
|
|
256
|
-
});
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
// Aggregate all vars across test cases
|
|
261
|
-
let tests = (
|
|
262
|
-
testSuite.tests && testSuite.tests.length > 0
|
|
263
|
-
? testSuite.tests
|
|
264
|
-
: testSuite.scenarios
|
|
265
|
-
? []
|
|
266
|
-
: [
|
|
267
|
-
{
|
|
268
|
-
// Dummy test for cases when we're only comparing raw prompts.
|
|
269
|
-
},
|
|
270
|
-
]
|
|
271
|
-
).map((test) => {
|
|
272
|
-
const finalTestCase: TestCase = Object.assign({}, testSuite.defaultTest);
|
|
273
|
-
return Object.assign(finalTestCase, test);
|
|
274
|
-
});
|
|
275
|
-
|
|
276
|
-
// Build scenarios and add to tests
|
|
277
|
-
if (testSuite.scenarios && testSuite.scenarios.length > 0) {
|
|
278
|
-
for (const scenario of testSuite.scenarios) {
|
|
279
|
-
for (const data of scenario.config) {
|
|
280
|
-
// Merge defaultTest with scenario config
|
|
281
|
-
const scenarioTests = (
|
|
282
|
-
scenario.tests || [
|
|
283
|
-
{
|
|
284
|
-
// Dummy test for cases when we're only comparing raw prompts.
|
|
285
|
-
},
|
|
286
|
-
]
|
|
287
|
-
).map((test) => {
|
|
288
|
-
return {
|
|
289
|
-
...testSuite.defaultTest,
|
|
290
|
-
...data,
|
|
291
|
-
...test,
|
|
292
|
-
vars: {
|
|
293
|
-
...testSuite.defaultTest?.vars,
|
|
294
|
-
...data.vars,
|
|
295
|
-
...test.vars,
|
|
296
|
-
},
|
|
297
|
-
options: {
|
|
298
|
-
...testSuite.defaultTest?.options,
|
|
299
|
-
...test.options,
|
|
300
|
-
},
|
|
301
|
-
};
|
|
302
|
-
});
|
|
303
|
-
// Add scenario tests to tests
|
|
304
|
-
tests = tests.concat(scenarioTests);
|
|
305
|
-
}
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
const varNames: Set<string> = new Set();
|
|
310
|
-
const varsWithSpecialColsRemoved: Record<string, string | string[] | object>[] = [];
|
|
311
|
-
for (const testCase of tests) {
|
|
312
|
-
if (testCase.vars) {
|
|
313
|
-
const varWithSpecialColsRemoved: Record<string, string | string[] | object> = {};
|
|
314
|
-
for (const varName of Object.keys(testCase.vars)) {
|
|
315
|
-
varNames.add(varName);
|
|
316
|
-
varWithSpecialColsRemoved[varName] = testCase.vars[varName];
|
|
317
|
-
}
|
|
318
|
-
varsWithSpecialColsRemoved.push(varWithSpecialColsRemoved);
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
// Set up eval cases
|
|
323
|
-
const runEvalOptions: RunEvalOptions[] = [];
|
|
324
|
-
let totalVarCombinations = 0;
|
|
325
|
-
let rowIndex = 0;
|
|
326
|
-
for (const testCase of tests) {
|
|
327
|
-
// Handle default properties
|
|
328
|
-
testCase.vars = Object.assign({}, testSuite.defaultTest?.vars, testCase.vars);
|
|
329
|
-
testCase.assert = [...(testSuite.defaultTest?.assert || []), ...(testCase.assert || [])];
|
|
330
|
-
testCase.options = testCase.options || {};
|
|
331
|
-
testCase.options.provider =
|
|
332
|
-
testCase.options.provider || testSuite.defaultTest?.options?.provider;
|
|
333
|
-
const prependToPrompt =
|
|
334
|
-
testCase.options?.prefix || testSuite.defaultTest?.options?.prefix || '';
|
|
335
|
-
const appendToPrompt =
|
|
336
|
-
testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
|
|
337
|
-
testCase.options.postprocess =
|
|
338
|
-
testCase.options.postprocess || testSuite.defaultTest?.options?.postprocess;
|
|
339
|
-
|
|
340
|
-
// Finalize test case eval
|
|
341
|
-
const varCombinations = generateVarCombinations(testCase.vars || {});
|
|
342
|
-
totalVarCombinations += varCombinations.length;
|
|
343
|
-
|
|
344
|
-
const numRepeat = this.options.repeat || 1;
|
|
345
|
-
for (let repeatIndex = 0; repeatIndex < numRepeat; repeatIndex++) {
|
|
346
|
-
for (const vars of varCombinations) {
|
|
347
|
-
let colIndex = 0;
|
|
348
|
-
for (const prompt of testSuite.prompts) {
|
|
349
|
-
for (const provider of testSuite.providers) {
|
|
350
|
-
if (testSuite.providerPromptMap) {
|
|
351
|
-
const allowedPrompts = testSuite.providerPromptMap[provider.id()];
|
|
352
|
-
if (allowedPrompts && !allowedPrompts.includes(prompt.display)) {
|
|
353
|
-
// This prompt should not be used with this provider.
|
|
354
|
-
continue;
|
|
355
|
-
}
|
|
356
|
-
}
|
|
357
|
-
runEvalOptions.push({
|
|
358
|
-
provider,
|
|
359
|
-
prompt: {
|
|
360
|
-
...prompt,
|
|
361
|
-
raw: prependToPrompt + prompt.raw + appendToPrompt,
|
|
362
|
-
},
|
|
363
|
-
test: { ...testCase, vars, options: testCase.options },
|
|
364
|
-
includeProviderId: testSuite.providers.length > 1,
|
|
365
|
-
rowIndex,
|
|
366
|
-
colIndex,
|
|
367
|
-
repeatIndex,
|
|
368
|
-
});
|
|
369
|
-
colIndex++;
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
rowIndex++;
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
// Set up table...
|
|
378
|
-
const isTest = tests.some((t) => !!t.assert);
|
|
379
|
-
|
|
380
|
-
const table: EvaluateTable = {
|
|
381
|
-
head: {
|
|
382
|
-
prompts,
|
|
383
|
-
vars: Array.from(varNames).sort(),
|
|
384
|
-
// TODO(ian): add assertions to table?
|
|
385
|
-
},
|
|
386
|
-
body: [],
|
|
387
|
-
};
|
|
388
|
-
|
|
389
|
-
// Set up progress bar...
|
|
390
|
-
let progressbar: SingleBar | undefined;
|
|
391
|
-
if (options.showProgressBar) {
|
|
392
|
-
const cliProgress = await import('cli-progress');
|
|
393
|
-
progressbar = new cliProgress.SingleBar(
|
|
394
|
-
{
|
|
395
|
-
format:
|
|
396
|
-
'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
|
|
397
|
-
},
|
|
398
|
-
cliProgress.Presets.shades_classic,
|
|
399
|
-
);
|
|
400
|
-
progressbar.start(runEvalOptions.length, 0, {
|
|
401
|
-
provider: '',
|
|
402
|
-
prompt: '',
|
|
403
|
-
vars: '',
|
|
404
|
-
});
|
|
405
|
-
}
|
|
406
|
-
if (options.progressCallback) {
|
|
407
|
-
options.progressCallback(0, runEvalOptions.length);
|
|
408
|
-
}
|
|
409
|
-
|
|
410
|
-
// Actually run the eval
|
|
411
|
-
const results: EvaluateResult[] = [];
|
|
412
|
-
await async.forEachOfLimit(
|
|
413
|
-
runEvalOptions,
|
|
414
|
-
options.maxConcurrency || DEFAULT_MAX_CONCURRENCY,
|
|
415
|
-
async (evalStep: RunEvalOptions, index: number | string) => {
|
|
416
|
-
const row = await this.runEval(evalStep);
|
|
417
|
-
|
|
418
|
-
results.push(row);
|
|
419
|
-
|
|
420
|
-
if (progressbar) {
|
|
421
|
-
progressbar.increment({
|
|
422
|
-
provider: evalStep.provider.id(),
|
|
423
|
-
prompt: evalStep.prompt.raw.slice(0, 10).replace(/\n/g, ' '),
|
|
424
|
-
vars: Object.entries(evalStep.test.vars || {})
|
|
425
|
-
.map(([k, v]) => `${k}=${v}`)
|
|
426
|
-
.join(' ')
|
|
427
|
-
.slice(0, 10)
|
|
428
|
-
.replace(/\n/g, ' '),
|
|
429
|
-
});
|
|
430
|
-
}
|
|
431
|
-
if (options.progressCallback) {
|
|
432
|
-
options.progressCallback(results.length, runEvalOptions.length);
|
|
433
|
-
}
|
|
434
|
-
|
|
435
|
-
// Bookkeeping for table
|
|
436
|
-
if (typeof index !== 'number') {
|
|
437
|
-
throw new Error('Expected index to be a number');
|
|
438
|
-
}
|
|
439
|
-
|
|
440
|
-
let resultText: string | undefined;
|
|
441
|
-
if (isTest) {
|
|
442
|
-
if (row.success) {
|
|
443
|
-
resultText = `${row.response?.output || row.error || ''}`;
|
|
444
|
-
} else {
|
|
445
|
-
resultText = `${row.error}\n---\n${row.response?.output || row.error || ''}`;
|
|
446
|
-
}
|
|
447
|
-
} else if (row.error) {
|
|
448
|
-
resultText = `${row.error}`;
|
|
449
|
-
} else {
|
|
450
|
-
resultText = row.response?.output || row.error || '';
|
|
451
|
-
}
|
|
452
|
-
|
|
453
|
-
const { rowIndex, colIndex } = evalStep;
|
|
454
|
-
if (!table.body[rowIndex]) {
|
|
455
|
-
table.body[rowIndex] = {
|
|
456
|
-
outputs: [],
|
|
457
|
-
vars: table.head.vars
|
|
458
|
-
.map((varName) => {
|
|
459
|
-
const varValue = evalStep.test.vars?.[varName] || '';
|
|
460
|
-
if (typeof varValue === 'string') {
|
|
461
|
-
return varValue;
|
|
462
|
-
}
|
|
463
|
-
if (Array.isArray(varValue)) {
|
|
464
|
-
// Only flatten string arrays
|
|
465
|
-
return typeof varValue[0] === 'string' ? varValue : JSON.stringify(varValue);
|
|
466
|
-
}
|
|
467
|
-
return JSON.stringify(varValue);
|
|
468
|
-
})
|
|
469
|
-
.flat(),
|
|
470
|
-
};
|
|
471
|
-
}
|
|
472
|
-
table.body[rowIndex].outputs[colIndex] = {
|
|
473
|
-
pass: row.success,
|
|
474
|
-
score: row.score,
|
|
475
|
-
text: resultText,
|
|
476
|
-
prompt: row.prompt.raw,
|
|
477
|
-
latencyMs: row.latencyMs,
|
|
478
|
-
tokenUsage: row.response?.tokenUsage,
|
|
479
|
-
gradingResult: row.gradingResult,
|
|
480
|
-
};
|
|
481
|
-
},
|
|
482
|
-
);
|
|
483
|
-
|
|
484
|
-
if (progressbar) {
|
|
485
|
-
progressbar.stop();
|
|
486
|
-
}
|
|
487
|
-
if (options.progressCallback) {
|
|
488
|
-
options.progressCallback(runEvalOptions.length, runEvalOptions.length);
|
|
489
|
-
}
|
|
490
|
-
|
|
491
|
-
telemetry.record('eval_ran', {});
|
|
492
|
-
|
|
493
|
-
return { version: 2, results, stats: this.stats, table };
|
|
494
|
-
}
|
|
495
|
-
}
|
|
496
|
-
|
|
497
|
-
export function evaluate(testSuite: TestSuite, options: EvaluateOptions) {
|
|
498
|
-
const ev = new Evaluator(testSuite, options);
|
|
499
|
-
return ev.evaluate();
|
|
500
|
-
}
|
package/src/index.ts
DELETED
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
import assertions from './assertions';
|
|
2
|
-
import providers from './providers';
|
|
3
|
-
import telemetry from './telemetry';
|
|
4
|
-
import { evaluate as doEvaluate } from './evaluator';
|
|
5
|
-
import { loadApiProviders } from './providers';
|
|
6
|
-
import { readTests, writeLatestResults, writeOutput } from './util';
|
|
7
|
-
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
|
|
8
|
-
|
|
9
|
-
export * from './types';
|
|
10
|
-
|
|
11
|
-
export { generateTable } from './table';
|
|
12
|
-
|
|
13
|
-
interface EvaluateTestSuite extends TestSuiteConfig {
|
|
14
|
-
prompts: string[];
|
|
15
|
-
writeLatestResults?: boolean;
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions = {}) {
|
|
19
|
-
const constructedTestSuite: TestSuite = {
|
|
20
|
-
...testSuite,
|
|
21
|
-
providers: await loadApiProviders(testSuite.providers),
|
|
22
|
-
tests: await readTests(testSuite.tests),
|
|
23
|
-
|
|
24
|
-
// Full prompts expected (not filepaths)
|
|
25
|
-
prompts: testSuite.prompts.map((promptContent) => ({
|
|
26
|
-
raw: promptContent,
|
|
27
|
-
display: promptContent,
|
|
28
|
-
})),
|
|
29
|
-
};
|
|
30
|
-
telemetry.maybeShowNotice();
|
|
31
|
-
|
|
32
|
-
const ret = await doEvaluate(constructedTestSuite, options);
|
|
33
|
-
|
|
34
|
-
if (testSuite.outputPath) {
|
|
35
|
-
writeOutput(testSuite.outputPath, ret, testSuite, null);
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
if (testSuite.writeLatestResults) {
|
|
39
|
-
writeLatestResults(ret, {});
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
await telemetry.send();
|
|
43
|
-
return ret;
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
export { evaluate, assertions, providers };
|
|
47
|
-
|
|
48
|
-
export default {
|
|
49
|
-
evaluate,
|
|
50
|
-
assertions,
|
|
51
|
-
providers,
|
|
52
|
-
};
|
package/src/logger.ts
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
import chalk from 'chalk';
|
|
2
|
-
import winston from 'winston';
|
|
3
|
-
|
|
4
|
-
export const LOG_LEVELS = {
|
|
5
|
-
error: 0,
|
|
6
|
-
warn: 1,
|
|
7
|
-
info: 2,
|
|
8
|
-
debug: 3,
|
|
9
|
-
};
|
|
10
|
-
|
|
11
|
-
const customFormatter = winston.format.printf(({ level, message, ...args }) => {
|
|
12
|
-
if (level === 'error') {
|
|
13
|
-
return chalk.red(message);
|
|
14
|
-
} else if (level === 'warn') {
|
|
15
|
-
return chalk.yellow(message);
|
|
16
|
-
} else if (level === 'info') {
|
|
17
|
-
return message;
|
|
18
|
-
} else if (level === 'debug') {
|
|
19
|
-
return chalk.cyan(message);
|
|
20
|
-
}
|
|
21
|
-
throw new Error(`Invalid log level: ${level}`);
|
|
22
|
-
});
|
|
23
|
-
|
|
24
|
-
const logger = winston.createLogger({
|
|
25
|
-
levels: LOG_LEVELS,
|
|
26
|
-
format: winston.format.combine(winston.format.simple(), customFormatter),
|
|
27
|
-
transports: [
|
|
28
|
-
new winston.transports.Console({
|
|
29
|
-
level: process.env.LOG_LEVEL || 'info',
|
|
30
|
-
}),
|
|
31
|
-
],
|
|
32
|
-
});
|
|
33
|
-
|
|
34
|
-
export function getLogLevel() {
|
|
35
|
-
return logger.transports[0].level;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
export function setLogLevel(level: keyof typeof LOG_LEVELS) {
|
|
39
|
-
if (LOG_LEVELS.hasOwnProperty(level)) {
|
|
40
|
-
logger.transports[0].level = level;
|
|
41
|
-
} else {
|
|
42
|
-
throw new Error(`Invalid log level: ${level}`);
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
export default logger;
|