npm - promptfoo - Versions diffs - 0.2.1 → 0.3.0 - Mend

promptfoo 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

package/src/evaluator.ts CHANGED

@@ -1,9 +1,18 @@
 import async from 'async';
 import nunjucks from 'nunjucks';
-import type { SingleBar } from 'cli-progress';
+import { DEFAULT_GRADING_PROMPT } from './prompts.js';
-import { EvaluateOptions, EvaluateSummary, EvaluateResult, ApiProvider, Prompt } from './types.js';
+import type { SingleBar } from 'cli-progress';
+import type {
+  ApiProvider,
+  EvaluateOptions,
+  EvaluateResult,
+  EvaluateStats,
+  EvaluateSummary,
+  Prompt,
+  TokenUsage,
+} from './types.js';
 interface RunEvalOptions {
   provider: ApiProvider;
@@ -12,193 +21,297 @@ interface RunEvalOptions {
   includeProviderId?: boolean;
 }
-const DEFAULT_MAX_CONCURRENCY = 3;
-function checkExpectedValue(expected: string, output: string): boolean {
-  if (expected.startsWith('eval:')) {
-    const evalBody = expected.slice(5);
-    const evalFunction = new Function('output', `return ${evalBody}`);
-    return evalFunction(output);
-  } else if (expected.startsWith('grade:')) {
-    // NYI
-    return false;
-  } else {
-    return expected === output;
-  }
+interface GradingResult {
+  pass: boolean;
+  reason: string;
+  tokensUsed: TokenUsage;
 }
-async function runEval({
-  provider,
-  prompt,
-  vars,
-  includeProviderId,
-}: RunEvalOptions): Promise<EvaluateResult> {
-  vars = vars || {};
-  const renderedPrompt = nunjucks.renderString(prompt, vars);
-  // Note that we're using original prompt, not renderedPrompt
-  const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
-  const setup = {
-    prompt: {
-      raw: renderedPrompt,
-      display: promptDisplay,
-    },
-    vars,
-  };
-  try {
-    const response = await provider.callApi(renderedPrompt);
-    const success = vars.__expected ? checkExpectedValue(vars.__expected, response.output) : true;
-    const ret: EvaluateResult = {
-      ...setup,
-      response,
-      success,
-    };
-    if (!success) {
-      ret.error = `Expected ${vars.__expected}, got "${response.output}"`;
-    }
-    return ret;
-  } catch (err) {
-    return {
-      ...setup,
-      error: String(err),
-      success: false,
+const DEFAULT_MAX_CONCURRENCY = 4;
+class Evaluator {
+  options: EvaluateOptions;
+  stats: EvaluateStats;
+  constructor(options: EvaluateOptions) {
+    this.options = options;
+    this.stats = {
+      successes: 0,
+      failures: 0,
+      tokenUsage: {
+        total: 0,
+        prompt: 0,
+        completion: 0,
+      },
     };
   }
-}
-export async function evaluate(options: EvaluateOptions): Promise<EvaluateSummary> {
-  const prompts: Prompt[] = [];
-  const results: EvaluateResult[] = [];
+  async gradeOutput(expected: string, output: string): Promise<GradingResult> {
+    const { grading } = this.options;
-  for (const promptContent of options.prompts) {
-    for (const provider of options.providers) {
-      prompts.push({
-        raw: promptContent,
-        display:
-          options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent,
-      });
+    if (!grading) {
+      throw new Error(
+        'Cannot grade output without grading config. Specify --grader option or grading config.',
+      );
     }
-  }
-  const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
-  const varsWithExpectedKeyRemoved = vars.map((v) => {
-    const ret = { ...v };
-    delete ret.__expected;
-    return ret;
-  });
-  const isTest = vars[0].__expected;
-  const table: string[][] = [
-    isTest
-      ? [
-          'RESULT',
-          [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
-        ].flat()
-      : [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
-  ];
-  const stats = {
-    successes: 0,
-    failures: 0,
-    tokenUsage: {
-      total: 0,
-      prompt: 0,
-      completion: 0,
-    },
-  };
-  let progressbar: SingleBar | undefined;
-  if (options.showProgressBar) {
-    const totalNumRuns =
-      options.prompts.length * options.providers.length * (options.vars?.length || 1);
-    const cliProgress = await import('cli-progress');
-    progressbar = new cliProgress.SingleBar(
-      {
-        format:
-          'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
-      },
-      cliProgress.Presets.shades_classic,
-    );
-    progressbar.start(totalNumRuns, 0, {
-      provider: '',
-      prompt: '',
-      vars: '',
+    const prompt = nunjucks.renderString(grading.prompt || DEFAULT_GRADING_PROMPT, {
+      content: output,
+      rubric: expected,
     });
+    const resp = await grading.provider.callApi(prompt);
+    if (resp.error || !resp.output) {
+      return {
+        pass: false,
+        reason: resp.error || 'No output',
+        tokensUsed: {
+          total: resp.tokenUsage?.total || 0,
+          prompt: resp.tokenUsage?.prompt || 0,
+          completion: resp.tokenUsage?.completion || 0,
+        },
+      };
+    }
+    try {
+      const parsed = JSON.parse(resp.output) as GradingResult;
+      parsed.tokensUsed = {
+        total: resp.tokenUsage?.total || 0,
+        prompt: resp.tokenUsage?.prompt || 0,
+        completion: resp.tokenUsage?.completion || 0,
+      };
+      return parsed;
+    } catch (err) {
+      return {
+        pass: false,
+        reason: `Output is not valid JSON: ${resp.output}`,
+        tokensUsed: {
+          total: resp.tokenUsage?.total || 0,
+          prompt: resp.tokenUsage?.prompt || 0,
+          completion: resp.tokenUsage?.completion || 0,
+        },
+      };
+    }
   }
-  const runEvalOptions: RunEvalOptions[] = [];
-  for (const row of vars) {
-    for (const promptContent of options.prompts) {
-      for (const provider of options.providers) {
-        runEvalOptions.push({
-          provider,
-          prompt: promptContent,
-          vars: row,
-          includeProviderId: options.providers.length > 1,
-        });
-      }
+  async checkExpectedValue(
+    expected: string,
+    output: string,
+  ): Promise<{ pass: boolean; reason?: string }> {
+    if (expected.startsWith('eval:')) {
+      const evalBody = expected.slice(5);
+      const evalFunction = new Function('output', `return ${evalBody}`);
+      return { pass: evalFunction(output) };
+    } else if (expected.startsWith('grade:')) {
+      const gradingResult = await this.gradeOutput(expected.slice(6), output);
+      return {
+        pass: gradingResult.pass,
+        reason: gradingResult.reason,
+      };
+    } else {
+      const pass = expected === output;
+      return {
+        pass,
+        reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
+      };
     }
   }
-  const combinedOutputs: string[][] = new Array(vars.length).fill(null).map(() => []);
-  await async.forEachOfLimit(
-    runEvalOptions,
-    options.maxConcurrency || DEFAULT_MAX_CONCURRENCY,
-    async (options: RunEvalOptions, index: number | string) => {
-      const row = await runEval(options);
-      results.push(row);
-      if (row.error) {
-        stats.failures++;
-      } else {
-        if (row.success) {
-          stats.successes++;
-        } else {
-          stats.failures++;
+  async runEval({
+    provider,
+    prompt,
+    vars,
+    includeProviderId,
+  }: RunEvalOptions): Promise<EvaluateResult> {
+    vars = vars || {};
+    const renderedPrompt = nunjucks.renderString(prompt, vars);
+    // Note that we're using original prompt, not renderedPrompt
+    const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
+    const setup = {
+      prompt: {
+        raw: renderedPrompt,
+        display: promptDisplay,
+      },
+      vars,
+    };
+    try {
+      const response = await provider.callApi(renderedPrompt);
+      const ret: EvaluateResult = {
+        ...setup,
+        response,
+        success: false,
+      };
+      if (response.error) {
+        ret.error = response.error;
+      } else if (response.output) {
+        const checkResult = vars.__expected
+          ? await this.checkExpectedValue(vars.__expected, response.output)
+          : { pass: true };
+        if (!checkResult.pass) {
+          ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
         }
-        stats.tokenUsage.total += row.response?.tokenUsage?.total || 0;
-        stats.tokenUsage.prompt += row.response?.tokenUsage?.prompt || 0;
-        stats.tokenUsage.completion += row.response?.tokenUsage?.completion || 0;
+        ret.success = checkResult.pass;
+      } else {
+        ret.success = false;
+        ret.error = 'No output';
+      }
+      // Update token usage stats
+      this.stats.tokenUsage.total += response.tokenUsage?.total || 0;
+      this.stats.tokenUsage.prompt += response.tokenUsage?.prompt || 0;
+      this.stats.tokenUsage.completion += response.tokenUsage?.completion || 0;
+      if (ret.success) {
+        this.stats.successes++;
+      } else {
+        this.stats.failures++;
       }
-      if (progressbar) {
-        progressbar.increment({
-          provider: options.provider.id(),
-          prompt: options.prompt.slice(0, 10),
-          vars: Object.entries(options.vars || {})
-            .map(([k, v]) => `${k}=${v}`)
-            .join(' ')
-            .slice(0, 10),
+      return ret;
+    } catch (err) {
+      return {
+        ...setup,
+        error: String(err),
+        success: false,
+      };
+    }
+  }
+  async evaluate(): Promise<EvaluateSummary> {
+    const options = this.options;
+    const prompts: Prompt[] = [];
+    for (const promptContent of options.prompts) {
+      for (const provider of options.providers) {
+        const display =
+          options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
+        prompts.push({
+          raw: promptContent,
+          display,
         });
       }
+    }
-      // Bookkeeping for table
-      if (typeof index !== 'number') {
-        throw new Error('Expected index to be a number');
+    const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
+    const varsWithExpectedKeyRemoved = vars.map((v) => {
+      const ret = { ...v };
+      delete ret.__expected;
+      return ret;
+    });
+    const isTest = vars[0].__expected;
+    const table: string[][] = [
+      [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
+    ];
+    let progressbar: SingleBar | undefined;
+    if (options.showProgressBar) {
+      const totalNumRuns =
+        options.prompts.length * options.providers.length * (options.vars?.length || 1);
+      const cliProgress = await import('cli-progress');
+      progressbar = new cliProgress.SingleBar(
+        {
+          format:
+            'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
+        },
+        cliProgress.Presets.shades_classic,
+      );
+      progressbar.start(totalNumRuns, 0, {
+        provider: '',
+        prompt: '',
+        vars: '',
+      });
+    }
+    const runEvalOptions: RunEvalOptions[] = [];
+    for (const row of vars) {
+      for (const promptContent of options.prompts) {
+        for (const provider of options.providers) {
+          runEvalOptions.push({
+            provider,
+            prompt: promptContent,
+            vars: row,
+            includeProviderId: options.providers.length > 1,
+          });
+        }
       }
-      const combinedOutputIndex = Math.floor(index / prompts.length);
-      combinedOutputs[combinedOutputIndex].push(row.response?.output || '');
-    },
-  );
+    }
-  if (progressbar) {
-    progressbar.stop();
-  }
+    const tempResults: { index: number; row: EvaluateResult }[] = [];
+    const combinedOutputs: string[][] = new Array(vars.length).fill(null).map(() => []);
+    await async.forEachOfLimit(
+      runEvalOptions,
+      options.maxConcurrency || DEFAULT_MAX_CONCURRENCY,
+      async (options: RunEvalOptions, index: number | string) => {
+        const row = await this.runEval(options);
+        //results[index as number] = row;
+        tempResults.push({ index: index as number, row });
-  // TODO(ian): Display errors in table UI.
-  if (isTest) {
-    table.push(
-      ...combinedOutputs.map((output, index) => [
-        results[index].success ? 'PASS' : `FAIL: ${results[index].error}`,
-        ...output,
-        ...Object.values(varsWithExpectedKeyRemoved[index]),
-      ]),
-    );
-  } else {
-    table.push(
-      ...combinedOutputs.map((output, index) => [...output, ...Object.values(vars[index])]),
+        if (progressbar) {
+          progressbar.increment({
+            provider: options.provider.id(),
+            prompt: options.prompt.slice(0, 10),
+            vars: Object.entries(options.vars || {})
+              .map(([k, v]) => `${k}=${v}`)
+              .join(' ')
+              .slice(0, 10),
+          });
+        }
+        // Bookkeeping for table
+        if (typeof index !== 'number') {
+          throw new Error('Expected index to be a number');
+        }
+        const combinedOutputIndex = Math.floor(index / prompts.length);
+        combinedOutputs[combinedOutputIndex].push(row.response?.output || row.error || '');
+      },
     );
+    if (progressbar) {
+      progressbar.stop();
+    }
+    const results: EvaluateResult[] = [];
+    tempResults
+      .sort((a, b) => a.index - b.index)
+      .forEach(({ index, row }) => {
+        results[index] = row;
+      });
+    // TODO(ian): Provide full context in table cells, and have the caller
+    // construct the table contents itself.
+    if (isTest) {
+      // Iterate through each combined output
+      combinedOutputs.forEach((output, index) => {
+        // Create a new array to store the modified output with [PASS] or [FAIL] prepended
+        const modifiedOutput: string[] = [];
+        // Iterate through each output value and prepend [PASS] or [FAIL] based on the success status
+        output.forEach((o, outputIndex) => {
+          const resultIndex = index * prompts.length + outputIndex;
+          const result = results[resultIndex];
+          // TODO(ian): sometimes output and result.error can be identical (in the case of exception)
+          const resultStatus = result.success ? `[PASS] ${o}` : `[FAIL] ${result.error}\n---\n${o}`;
+          modifiedOutput.push(resultStatus);
+        });
+        // Add the modified output and the corresponding values from varsWithExpectedKeyRemoved to the table
+        const tableRow = [...modifiedOutput, ...Object.values(varsWithExpectedKeyRemoved[index])];
+        table.push(tableRow);
+      });
+    } else {
+      table.push(
+        ...combinedOutputs.map((output, index) => [...output, ...Object.values(vars[index])]),
+      );
+    }
+    return { results, stats: this.stats, table };
   }
+}
-  return { results, stats, table };
+export function evaluate(options: EvaluateOptions) {
+  const ev = new Evaluator(options);
+  return ev.evaluate();
 }

package/src/main.ts CHANGED

@@ -10,6 +10,7 @@ import logger, { setLogLevel } from './logger.js';
 import { loadApiProvider } from './providers.js';
 import { evaluate } from './evaluator.js';
 import { readPrompts, readVars, writeOutput } from './util.js';
+import { getDirectory } from './esm.js';
 import type { CommandLineOptions, EvaluateOptions, VarMapping } from './types.js';
@@ -36,7 +37,7 @@ These prompts are nunjucks templates, so you can use logic like this:
   prompts: ['prompts.txt'],
   providers: ['openai:gpt-3.5-turbo'],
   vars: 'vars.csv',
-  maxConcurrency: 3,
+  maxConcurrency: 4,
 };`;
   const readme = `To get started, set your OPENAI_API_KEY environment variable. Then run:
 \`\`\`
@@ -80,6 +81,14 @@ async function main() {
   const program = new Command();
+  program.option('--version', 'Print version', () => {
+    const packageJson = JSON.parse(
+      readFileSync(pathJoin(getDirectory(), '../package.json'), 'utf8'),
+    );
+    console.log(packageJson.version);
+    process.exit(0);
+  });
   program
     .command('init [directory]')
     .description('Initialize project with dummy files')
@@ -120,6 +129,7 @@ async function main() {
       'Maximum number of concurrent API calls',
       String(defaultConfig.maxConcurrency),
     )
+    .option('--grader', 'Model that will grade outputs', defaultConfig.grader)
     .option('--verbose', 'Show debug logs', defaultConfig.verbose)
     .action(async (cmdObj: CommandLineOptions & Command) => {
       if (cmdObj.verbose) {
@@ -161,6 +171,12 @@ async function main() {
         ...config,
       };
+      if (cmdObj.grader) {
+        options.grading = {
+          provider: await loadApiProvider(cmdObj.grader),
+        };
+      }
       const summary = await evaluate(options);
       if (cmdObj.output) {
@@ -179,10 +195,23 @@ async function main() {
             head: ['blue', 'bold'],
           },
         });
-        // Skip first row (header) and add the rest. Color the first column green if it's a success, red if it's a failure.
+        // Skip first row (header) and add the rest. Color PASS/FAIL
         for (const row of summary.table.slice(1)) {
-          const color = row[0] === 'PASS' ? 'green' : row[0].startsWith('FAIL') ? 'red' : undefined;
-          table.push(row.map((col, i) => (i === 0 && color ? chalk[color](col) : col)));
+          table.push(
+            row.map((col) => {
+              if (col.startsWith('[PASS]')) {
+                // color '[PASS]' green
+                return chalk.green.bold(col.slice(0, 6)) + col.slice(6);
+              } else if (col.startsWith('[FAIL]')) {
+                // color everything red up until '---'
+                return col
+                  .split('---')
+                  .map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
+                  .join('---');
+              }
+              return col;
+            }),
+          );
         }
         logger.info('\n' + table.toString());

package/src/prompts.ts ADDED

@@ -0,0 +1,20 @@
+export const DEFAULT_GRADING_PROMPT = JSON.stringify([
+  {
+    role: 'system',
+    content: `You are grading content according to a user-specified rubric. If the statement in the rubric is true, then the content passes the test. You respond with a JSON object with this structure: {pass: boolean; reason: string;}.
+Examples:
+Content: Hello world
+Rubric: Contains a greeting
+{"pass": true, "reason": "the content contains the word 'world'"}
+Content: Avast ye swabs, repel the invaders!
+Rubric: Does not speak like a pirate
+{"pass": false, "reason": "'avast ye' is a common pirate term"}`,
+  },
+  {
+    role: 'user',
+    content: 'Content: {{ content }}\nRubric: {{ rubric }}',
+  },
+]);