promptfoo 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -250
- package/dist/__mocks__/esm.js +5 -1
- package/dist/__mocks__/esm.js.map +1 -1
- package/dist/assertions.d.ts +18 -0
- package/dist/assertions.d.ts.map +1 -0
- package/dist/assertions.js +128 -0
- package/dist/assertions.js.map +1 -0
- package/dist/cache.d.ts +8 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +78 -0
- package/dist/cache.js.map +1 -0
- package/dist/esm.d.ts.map +1 -1
- package/dist/esm.js +10 -3
- package/dist/esm.js.map +1 -1
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +90 -117
- package/dist/evaluator.js.map +1 -1
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +34 -5
- package/dist/index.js.map +1 -1
- package/dist/logger.js +18 -11
- package/dist/logger.js.map +1 -1
- package/dist/main.js +103 -56
- package/dist/main.js.map +1 -1
- package/dist/prompts.d.ts +4 -0
- package/dist/prompts.d.ts.map +1 -1
- package/dist/prompts.js +12 -1
- package/dist/prompts.js.map +1 -1
- package/dist/providers/localai.d.ts.map +1 -1
- package/dist/providers/localai.js +23 -17
- package/dist/providers/localai.js.map +1 -1
- package/dist/providers/openai.d.ts +9 -4
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +61 -58
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers/shared.d.ts.map +1 -1
- package/dist/providers/shared.js +5 -2
- package/dist/providers/shared.js.map +1 -1
- package/dist/providers.d.ts +10 -0
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +51 -14
- package/dist/providers.js.map +1 -1
- package/dist/suggestions.d.ts +9 -0
- package/dist/suggestions.d.ts.map +1 -0
- package/dist/suggestions.js +54 -0
- package/dist/suggestions.js.map +1 -0
- package/dist/types.d.ts +17 -6
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -1
- package/dist/util.d.ts +1 -1
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +85 -31
- package/dist/util.js.map +1 -1
- package/dist/web/client/assets/index-207192fc.css +1 -0
- package/dist/web/client/assets/index-8751749f.js +172 -0
- package/dist/web/client/index.html +2 -2
- package/dist/web/server.js +38 -31
- package/dist/web/server.js.map +1 -1
- package/package.json +18 -5
- package/src/assertions.ts +154 -0
- package/src/cache.ts +91 -0
- package/src/esm.ts +5 -2
- package/src/evaluator.ts +63 -139
- package/src/index.ts +12 -0
- package/src/main.ts +39 -9
- package/src/prompts.ts +9 -0
- package/src/providers/localai.ts +9 -11
- package/src/providers/openai.ts +49 -50
- package/src/providers/shared.ts +1 -1
- package/src/providers.ts +8 -0
- package/src/suggestions.ts +63 -0
- package/src/types.ts +20 -6
- package/src/util.ts +24 -4
- package/src/web/client/package.json +1 -0
- package/src/web/client/src/App.css +4 -0
- package/src/web/client/src/App.tsx +29 -5
- package/src/web/client/src/Logo.css +5 -0
- package/src/web/client/src/NavBar.css +18 -0
- package/src/web/client/src/NavBar.tsx +12 -1
- package/src/web/client/src/index.css +10 -0
- package/src/web/server.ts +2 -2
- package/dist/web/client/assets/index-710f1308.css +0 -1
- package/dist/web/client/assets/index-900b20c0.js +0 -172
package/src/evaluator.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
import readline from 'node:readline';
|
|
2
|
+
|
|
1
3
|
import async from 'async';
|
|
4
|
+
import chalk from 'chalk';
|
|
2
5
|
import nunjucks from 'nunjucks';
|
|
3
6
|
|
|
4
|
-
import
|
|
5
|
-
import {
|
|
7
|
+
import logger from './logger.js';
|
|
8
|
+
import { matchesExpectedValue } from './assertions.js';
|
|
6
9
|
|
|
7
10
|
import type { SingleBar } from 'cli-progress';
|
|
8
11
|
import type {
|
|
@@ -13,9 +16,8 @@ import type {
|
|
|
13
16
|
EvaluateSummary,
|
|
14
17
|
EvaluateTable,
|
|
15
18
|
Prompt,
|
|
16
|
-
TokenUsage,
|
|
17
19
|
} from './types.js';
|
|
18
|
-
import {
|
|
20
|
+
import { generatePrompts } from './suggestions.js';
|
|
19
21
|
|
|
20
22
|
interface RunEvalOptions {
|
|
21
23
|
provider: ApiProvider;
|
|
@@ -27,16 +29,8 @@ interface RunEvalOptions {
|
|
|
27
29
|
colIndex: number;
|
|
28
30
|
}
|
|
29
31
|
|
|
30
|
-
interface GradingResult {
|
|
31
|
-
pass: boolean;
|
|
32
|
-
reason: string;
|
|
33
|
-
tokensUsed: TokenUsage;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
32
|
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
37
33
|
|
|
38
|
-
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
|
|
39
|
-
|
|
40
34
|
class Evaluator {
|
|
41
35
|
options: EvaluateOptions;
|
|
42
36
|
stats: EvaluateStats;
|
|
@@ -50,132 +44,11 @@ class Evaluator {
|
|
|
50
44
|
total: 0,
|
|
51
45
|
prompt: 0,
|
|
52
46
|
completion: 0,
|
|
47
|
+
cached: 0,
|
|
53
48
|
},
|
|
54
49
|
};
|
|
55
50
|
}
|
|
56
51
|
|
|
57
|
-
async gradeOutput(expected: string, output: string): Promise<GradingResult> {
|
|
58
|
-
const { grading } = this.options;
|
|
59
|
-
|
|
60
|
-
if (!grading) {
|
|
61
|
-
throw new Error(
|
|
62
|
-
'Cannot grade output without grading config. Specify --grader option or grading config.',
|
|
63
|
-
);
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
const prompt = nunjucks.renderString(grading.prompt || DEFAULT_GRADING_PROMPT, {
|
|
67
|
-
content: output,
|
|
68
|
-
rubric: expected,
|
|
69
|
-
});
|
|
70
|
-
|
|
71
|
-
const resp = await grading.provider.callApi(prompt);
|
|
72
|
-
if (resp.error || !resp.output) {
|
|
73
|
-
return {
|
|
74
|
-
pass: false,
|
|
75
|
-
reason: resp.error || 'No output',
|
|
76
|
-
tokensUsed: {
|
|
77
|
-
total: resp.tokenUsage?.total || 0,
|
|
78
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
79
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
80
|
-
},
|
|
81
|
-
};
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
try {
|
|
85
|
-
const parsed = JSON.parse(resp.output) as GradingResult;
|
|
86
|
-
parsed.tokensUsed = {
|
|
87
|
-
total: resp.tokenUsage?.total || 0,
|
|
88
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
89
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
90
|
-
};
|
|
91
|
-
return parsed;
|
|
92
|
-
} catch (err) {
|
|
93
|
-
return {
|
|
94
|
-
pass: false,
|
|
95
|
-
reason: `Output is not valid JSON: ${resp.output}`,
|
|
96
|
-
tokensUsed: {
|
|
97
|
-
total: resp.tokenUsage?.total || 0,
|
|
98
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
99
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
100
|
-
},
|
|
101
|
-
};
|
|
102
|
-
}
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
async checkSimilarity(
|
|
106
|
-
expected: string,
|
|
107
|
-
output: string,
|
|
108
|
-
threshold: number,
|
|
109
|
-
): Promise<GradingResult> {
|
|
110
|
-
const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
|
|
111
|
-
const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
|
|
112
|
-
|
|
113
|
-
const tokensUsed = {
|
|
114
|
-
total: (expectedEmbedding.tokenUsage?.total || 0) + (outputEmbedding.tokenUsage?.total || 0),
|
|
115
|
-
prompt:
|
|
116
|
-
(expectedEmbedding.tokenUsage?.prompt || 0) + (outputEmbedding.tokenUsage?.prompt || 0),
|
|
117
|
-
completion:
|
|
118
|
-
(expectedEmbedding.tokenUsage?.completion || 0) +
|
|
119
|
-
(outputEmbedding.tokenUsage?.completion || 0),
|
|
120
|
-
};
|
|
121
|
-
|
|
122
|
-
if (expectedEmbedding.error || outputEmbedding.error) {
|
|
123
|
-
return {
|
|
124
|
-
pass: false,
|
|
125
|
-
reason:
|
|
126
|
-
expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
|
|
127
|
-
tokensUsed,
|
|
128
|
-
};
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
|
|
132
|
-
return {
|
|
133
|
-
pass: false,
|
|
134
|
-
reason: 'Embedding not found',
|
|
135
|
-
tokensUsed,
|
|
136
|
-
};
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
|
|
140
|
-
if (similarity < threshold) {
|
|
141
|
-
return {
|
|
142
|
-
pass: false,
|
|
143
|
-
reason: `Similarity ${similarity} is less than threshold ${threshold}`,
|
|
144
|
-
tokensUsed,
|
|
145
|
-
};
|
|
146
|
-
}
|
|
147
|
-
return {
|
|
148
|
-
pass: true,
|
|
149
|
-
reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
|
|
150
|
-
tokensUsed,
|
|
151
|
-
};
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
async checkExpectedValue(
|
|
155
|
-
expected: string,
|
|
156
|
-
output: string,
|
|
157
|
-
): Promise<{ pass: boolean; reason?: string }> {
|
|
158
|
-
const match = expected.match(SIMILAR_REGEX);
|
|
159
|
-
|
|
160
|
-
if (match) {
|
|
161
|
-
const threshold = parseFloat(match[1]) || 0.8;
|
|
162
|
-
const rest = expected.replace(SIMILAR_REGEX, '').trim();
|
|
163
|
-
return this.checkSimilarity(rest, output, threshold);
|
|
164
|
-
} else if (expected.startsWith('eval:')) {
|
|
165
|
-
const evalBody = expected.slice(5);
|
|
166
|
-
const evalFunction = new Function('output', `return ${evalBody}`);
|
|
167
|
-
return { pass: evalFunction(output) };
|
|
168
|
-
} else if (expected.startsWith('grade:')) {
|
|
169
|
-
return this.gradeOutput(expected.slice(6), output);
|
|
170
|
-
} else {
|
|
171
|
-
const pass = expected === output;
|
|
172
|
-
return {
|
|
173
|
-
pass,
|
|
174
|
-
reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
|
|
175
|
-
};
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
|
|
179
52
|
async runEval({
|
|
180
53
|
provider,
|
|
181
54
|
prompt,
|
|
@@ -207,7 +80,7 @@ class Evaluator {
|
|
|
207
80
|
ret.error = response.error;
|
|
208
81
|
} else if (response.output) {
|
|
209
82
|
const checkResult = vars.__expected
|
|
210
|
-
? await
|
|
83
|
+
? await matchesExpectedValue(vars.__expected, response.output, this.options)
|
|
211
84
|
: { pass: true };
|
|
212
85
|
if (!checkResult.pass) {
|
|
213
86
|
ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
|
|
@@ -222,6 +95,7 @@ class Evaluator {
|
|
|
222
95
|
this.stats.tokenUsage.total += response.tokenUsage?.total || 0;
|
|
223
96
|
this.stats.tokenUsage.prompt += response.tokenUsage?.prompt || 0;
|
|
224
97
|
this.stats.tokenUsage.completion += response.tokenUsage?.completion || 0;
|
|
98
|
+
this.stats.tokenUsage.cached += response.tokenUsage?.cached || 0;
|
|
225
99
|
|
|
226
100
|
if (ret.success) {
|
|
227
101
|
this.stats.successes++;
|
|
@@ -243,6 +117,48 @@ class Evaluator {
|
|
|
243
117
|
const options = this.options;
|
|
244
118
|
const prompts: Prompt[] = [];
|
|
245
119
|
|
|
120
|
+
if (options.prompt?.generateSuggestions) {
|
|
121
|
+
logger.info(`Generating prompt variations...`);
|
|
122
|
+
const { prompts: newPrompts, error } = await generatePrompts(options.prompts[0], 1);
|
|
123
|
+
if (error || !newPrompts) {
|
|
124
|
+
throw new Error(`Failed to generate prompts: ${error}`);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
logger.info(chalk.blue('Generated prompts:'));
|
|
128
|
+
let numAdded = 0;
|
|
129
|
+
for (const prompt of newPrompts) {
|
|
130
|
+
logger.info('--------------------------------------------------------');
|
|
131
|
+
logger.info(`${prompt}`);
|
|
132
|
+
logger.info('--------------------------------------------------------');
|
|
133
|
+
|
|
134
|
+
// Ask the user if they want to continue
|
|
135
|
+
await new Promise((resolve) => {
|
|
136
|
+
const rl = readline.createInterface({
|
|
137
|
+
input: process.stdin,
|
|
138
|
+
output: process.stdout,
|
|
139
|
+
});
|
|
140
|
+
rl.question(
|
|
141
|
+
`${chalk.blue('Do you want to test this prompt?')} (y/N): `,
|
|
142
|
+
async (answer) => {
|
|
143
|
+
rl.close();
|
|
144
|
+
if (answer.toLowerCase().startsWith('y')) {
|
|
145
|
+
options.prompts.push(prompt);
|
|
146
|
+
numAdded++;
|
|
147
|
+
} else {
|
|
148
|
+
logger.info('Skipping this prompt.');
|
|
149
|
+
}
|
|
150
|
+
resolve(true);
|
|
151
|
+
},
|
|
152
|
+
);
|
|
153
|
+
});
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
if (numAdded < 1) {
|
|
157
|
+
logger.info(chalk.red('No prompts selected. Aborting.'));
|
|
158
|
+
process.exit(1);
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
246
162
|
for (const promptContent of options.prompts) {
|
|
247
163
|
for (const provider of options.providers) {
|
|
248
164
|
const display =
|
|
@@ -255,16 +171,20 @@ class Evaluator {
|
|
|
255
171
|
}
|
|
256
172
|
|
|
257
173
|
const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
|
|
258
|
-
const
|
|
174
|
+
const varsWithSpecialColsRemoved = vars.map((v) => {
|
|
259
175
|
const ret = { ...v };
|
|
260
|
-
|
|
176
|
+
Object.keys(ret).forEach((key) => {
|
|
177
|
+
if (key.startsWith('__')) {
|
|
178
|
+
delete ret[key];
|
|
179
|
+
}
|
|
180
|
+
});
|
|
261
181
|
return ret;
|
|
262
182
|
});
|
|
263
183
|
const isTest = vars[0].__expected;
|
|
264
184
|
const table: EvaluateTable = {
|
|
265
185
|
head: {
|
|
266
186
|
prompts: prompts.map((p) => p.display),
|
|
267
|
-
vars: Object.keys(
|
|
187
|
+
vars: Object.keys(varsWithSpecialColsRemoved[0]),
|
|
268
188
|
},
|
|
269
189
|
body: [],
|
|
270
190
|
};
|
|
@@ -292,11 +212,15 @@ class Evaluator {
|
|
|
292
212
|
let rowIndex = 0;
|
|
293
213
|
for (const row of vars) {
|
|
294
214
|
let colIndex = 0;
|
|
215
|
+
|
|
216
|
+
const prependToPrompt = row.__prefix || options.prompt?.prefix || '';
|
|
217
|
+
const appendToPrompt = row.__suffix || options.prompt?.suffix || '';
|
|
218
|
+
|
|
295
219
|
for (const promptContent of options.prompts) {
|
|
296
220
|
for (const provider of options.providers) {
|
|
297
221
|
runEvalOptions.push({
|
|
298
222
|
provider,
|
|
299
|
-
prompt: promptContent,
|
|
223
|
+
prompt: prependToPrompt + promptContent + appendToPrompt,
|
|
300
224
|
vars: row,
|
|
301
225
|
includeProviderId: options.providers.length > 1,
|
|
302
226
|
rowIndex,
|
package/src/index.ts
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import { evaluate as doEvaluate } from './evaluator.js';
|
|
2
2
|
import { loadApiProvider } from './providers.js';
|
|
3
|
+
import assertions from './assertions.js';
|
|
4
|
+
import providers from './providers.js';
|
|
3
5
|
|
|
4
6
|
import type { ApiProvider, EvaluateOptions, EvaluateSummary } from './types.js';
|
|
5
7
|
|
|
8
|
+
export * from './types.js';
|
|
9
|
+
|
|
6
10
|
async function evaluate(
|
|
7
11
|
providers: (string | ApiProvider)[] | (string | ApiProvider),
|
|
8
12
|
options: Omit<EvaluateOptions, 'providers'>,
|
|
@@ -30,6 +34,14 @@ async function evaluate(
|
|
|
30
34
|
});
|
|
31
35
|
}
|
|
32
36
|
|
|
37
|
+
module.exports = {
|
|
38
|
+
evaluate,
|
|
39
|
+
assertions,
|
|
40
|
+
providers,
|
|
41
|
+
};
|
|
42
|
+
|
|
33
43
|
export default {
|
|
34
44
|
evaluate,
|
|
45
|
+
assertions,
|
|
46
|
+
providers,
|
|
35
47
|
};
|
package/src/main.ts
CHANGED
|
@@ -14,6 +14,7 @@ import { getDirectory } from './esm.js';
|
|
|
14
14
|
import { init } from './web/server.js';
|
|
15
15
|
|
|
16
16
|
import type { CommandLineOptions, EvaluateOptions, VarMapping } from './types.js';
|
|
17
|
+
import { disableCache } from './cache.js';
|
|
17
18
|
|
|
18
19
|
function createDummyFiles(directory: string | null) {
|
|
19
20
|
if (directory) {
|
|
@@ -34,7 +35,7 @@ These prompts are nunjucks templates, so you can use logic like this:
|
|
|
34
35
|
{% endif %}`;
|
|
35
36
|
const dummyVars =
|
|
36
37
|
'var1,var2,var3\nvalue1,value2,value3\nanother value1,another value2,another value3';
|
|
37
|
-
const dummyConfig = `
|
|
38
|
+
const dummyConfig = `module.exports = {
|
|
38
39
|
prompts: ['prompts.txt'],
|
|
39
40
|
providers: ['openai:gpt-3.5-turbo'],
|
|
40
41
|
vars: 'vars.csv',
|
|
@@ -79,6 +80,10 @@ async function main() {
|
|
|
79
80
|
defaultConfig = (await import(pathJoin(process.cwd(), './promptfooconfig.js'))).default;
|
|
80
81
|
logger.info('Loaded default config from promptfooconfig.js');
|
|
81
82
|
}
|
|
83
|
+
if (existsSync('promptfooconfig.json')) {
|
|
84
|
+
defaultConfig = JSON.parse(readFileSync('promptfooconfig.json', 'utf-8'));
|
|
85
|
+
logger.info('Loaded default config from promptfooconfig.json');
|
|
86
|
+
}
|
|
82
87
|
|
|
83
88
|
const program = new Command();
|
|
84
89
|
|
|
@@ -143,15 +148,26 @@ async function main() {
|
|
|
143
148
|
'Truncate console table cells to this length',
|
|
144
149
|
'250',
|
|
145
150
|
)
|
|
151
|
+
.option(
|
|
152
|
+
'--suggest-prompts <number>',
|
|
153
|
+
'Generate N new prompts and append them to the prompt list',
|
|
154
|
+
)
|
|
155
|
+
.option(
|
|
156
|
+
'--prompt-prefix <path>',
|
|
157
|
+
'This prefix is prepended to every prompt',
|
|
158
|
+
defaultConfig.promptPrefix,
|
|
159
|
+
)
|
|
160
|
+
.option(
|
|
161
|
+
'--prompt-suffix <path>',
|
|
162
|
+
'This suffix is append to every prompt',
|
|
163
|
+
defaultConfig.promptSuffix,
|
|
164
|
+
)
|
|
146
165
|
.option('--no-write', 'Do not write results to promptfoo directory')
|
|
166
|
+
.option('--no-cache', 'Do not read or write results to disk cache')
|
|
147
167
|
.option('--grader', 'Model that will grade outputs', defaultConfig.grader)
|
|
148
168
|
.option('--verbose', 'Show debug logs', defaultConfig.verbose)
|
|
149
|
-
.option('--view', 'View in browser ui')
|
|
169
|
+
.option('--view [port]', 'View in browser ui')
|
|
150
170
|
.action(async (cmdObj: CommandLineOptions & Command) => {
|
|
151
|
-
if (cmdObj.verbose) {
|
|
152
|
-
setLogLevel('debug');
|
|
153
|
-
}
|
|
154
|
-
|
|
155
171
|
const configPath = cmdObj.config;
|
|
156
172
|
let config = {};
|
|
157
173
|
if (configPath) {
|
|
@@ -169,6 +185,13 @@ async function main() {
|
|
|
169
185
|
}
|
|
170
186
|
}
|
|
171
187
|
|
|
188
|
+
if (cmdObj.verbose) {
|
|
189
|
+
setLogLevel('debug');
|
|
190
|
+
}
|
|
191
|
+
if (!cmdObj.cache) {
|
|
192
|
+
disableCache();
|
|
193
|
+
}
|
|
194
|
+
|
|
172
195
|
let vars: VarMapping[] = [];
|
|
173
196
|
if (cmdObj.vars) {
|
|
174
197
|
vars = readVars(cmdObj.vars);
|
|
@@ -184,6 +207,10 @@ async function main() {
|
|
|
184
207
|
providers,
|
|
185
208
|
showProgressBar: true,
|
|
186
209
|
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
|
|
210
|
+
prompt: {
|
|
211
|
+
prefix: cmdObj.promptPrefix,
|
|
212
|
+
suffix: cmdObj.promptSuffix,
|
|
213
|
+
},
|
|
187
214
|
...config,
|
|
188
215
|
};
|
|
189
216
|
|
|
@@ -192,6 +219,9 @@ async function main() {
|
|
|
192
219
|
provider: await loadApiProvider(cmdObj.grader),
|
|
193
220
|
};
|
|
194
221
|
}
|
|
222
|
+
if (cmdObj.generateSuggestions) {
|
|
223
|
+
options.prompt!.generateSuggestions = true;
|
|
224
|
+
}
|
|
195
225
|
|
|
196
226
|
const summary = await evaluate(options);
|
|
197
227
|
|
|
@@ -238,7 +268,7 @@ async function main() {
|
|
|
238
268
|
|
|
239
269
|
logger.info('\n' + table.toString());
|
|
240
270
|
}
|
|
241
|
-
if (cmdObj.
|
|
271
|
+
if (cmdObj.view || !cmdObj.write) {
|
|
242
272
|
logger.info('Evaluation complete');
|
|
243
273
|
} else {
|
|
244
274
|
writeLatestResults(summary);
|
|
@@ -247,12 +277,12 @@ async function main() {
|
|
|
247
277
|
logger.info(chalk.green.bold(`Successes: ${summary.stats.successes}`));
|
|
248
278
|
logger.info(chalk.red.bold(`Failures: ${summary.stats.failures}`));
|
|
249
279
|
logger.info(
|
|
250
|
-
`Token usage: Total ${summary.stats.tokenUsage.total} Prompt ${summary.stats.tokenUsage.prompt} Completion ${summary.stats.tokenUsage.completion}`,
|
|
280
|
+
`Token usage: Total ${summary.stats.tokenUsage.total}, Prompt ${summary.stats.tokenUsage.prompt}, Completion ${summary.stats.tokenUsage.completion}, Cached ${summary.stats.tokenUsage.cached}`,
|
|
251
281
|
);
|
|
252
282
|
logger.info('Done.');
|
|
253
283
|
|
|
254
284
|
if (cmdObj.view) {
|
|
255
|
-
init(15500);
|
|
285
|
+
init(parseInt(cmdObj.view, 10) || 15500);
|
|
256
286
|
}
|
|
257
287
|
});
|
|
258
288
|
|
package/src/prompts.ts
CHANGED
|
@@ -18,3 +18,12 @@ Rubric: Does not speak like a pirate
|
|
|
18
18
|
content: 'Content: {{ content }}\nRubric: {{ rubric }}',
|
|
19
19
|
},
|
|
20
20
|
]);
|
|
21
|
+
|
|
22
|
+
export const SUGGEST_PROMPTS_SYSTEM_MESSAGE = {
|
|
23
|
+
role: 'system',
|
|
24
|
+
content: `You're helping a scientist who is tuning a prompt for a large language model. You will receive messages, and each message is a full prompt. Generate a candidate variation of the given prompt. This variation will be tested for quality in order to select a winner.
|
|
25
|
+
|
|
26
|
+
Substantially revise the prompt, revising its structure and content however necessary to make it perform better, while preserving the original intent and including important details.
|
|
27
|
+
|
|
28
|
+
Your output is going to be copied directly into the program. It should contain the prompt ONLY`,
|
|
29
|
+
};
|
package/src/providers/localai.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logger from '../logger.js';
|
|
2
|
-
import {
|
|
2
|
+
import { fetchJsonWithCache } from '../cache.js';
|
|
3
3
|
import { REQUEST_TIMEOUT_MS } from './shared.js';
|
|
4
4
|
|
|
5
5
|
import type { ApiProvider, ProviderResponse } from '../types.js';
|
|
@@ -36,9 +36,10 @@ export class LocalAiChatProvider extends LocalAiGenericProvider {
|
|
|
36
36
|
};
|
|
37
37
|
logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
|
|
38
38
|
|
|
39
|
-
let
|
|
39
|
+
let data,
|
|
40
|
+
cached = false;
|
|
40
41
|
try {
|
|
41
|
-
|
|
42
|
+
({ data, cached } = (await fetchJsonWithCache(
|
|
42
43
|
`${this.apiBaseUrl}/chat/completions`,
|
|
43
44
|
{
|
|
44
45
|
method: 'POST',
|
|
@@ -48,9 +49,7 @@ export class LocalAiChatProvider extends LocalAiGenericProvider {
|
|
|
48
49
|
body: JSON.stringify(body),
|
|
49
50
|
},
|
|
50
51
|
REQUEST_TIMEOUT_MS,
|
|
51
|
-
);
|
|
52
|
-
|
|
53
|
-
data = (await response.json()) as unknown as any;
|
|
52
|
+
)) as unknown as any);
|
|
54
53
|
} catch (err) {
|
|
55
54
|
return {
|
|
56
55
|
error: `API call error: ${String(err)}`,
|
|
@@ -78,9 +77,10 @@ export class LocalAiCompletionProvider extends LocalAiGenericProvider {
|
|
|
78
77
|
};
|
|
79
78
|
logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
|
|
80
79
|
|
|
81
|
-
let
|
|
80
|
+
let data,
|
|
81
|
+
cached = false;
|
|
82
82
|
try {
|
|
83
|
-
|
|
83
|
+
({ data, cached } = (await fetchJsonWithCache(
|
|
84
84
|
`${this.apiBaseUrl}/completions`,
|
|
85
85
|
{
|
|
86
86
|
method: 'POST',
|
|
@@ -90,9 +90,7 @@ export class LocalAiCompletionProvider extends LocalAiGenericProvider {
|
|
|
90
90
|
body: JSON.stringify(body),
|
|
91
91
|
},
|
|
92
92
|
REQUEST_TIMEOUT_MS,
|
|
93
|
-
);
|
|
94
|
-
|
|
95
|
-
data = (await response.json()) as unknown as any;
|
|
93
|
+
)) as unknown as any);
|
|
96
94
|
} catch (err) {
|
|
97
95
|
return {
|
|
98
96
|
error: `API call error: ${String(err)}`,
|