promptfoo 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +123 -73
  2. package/dist/assertions.d.ts +4 -10
  3. package/dist/assertions.d.ts.map +1 -1
  4. package/dist/assertions.js +126 -20
  5. package/dist/assertions.js.map +1 -1
  6. package/dist/cache.d.ts.map +1 -1
  7. package/dist/cache.js.map +1 -1
  8. package/dist/evaluator.d.ts +2 -2
  9. package/dist/evaluator.d.ts.map +1 -1
  10. package/dist/evaluator.js +72 -41
  11. package/dist/evaluator.js.map +1 -1
  12. package/dist/index.d.ts +6 -4
  13. package/dist/index.d.ts.map +1 -1
  14. package/dist/index.js +8 -21
  15. package/dist/index.js.map +1 -1
  16. package/dist/main.js +88 -81
  17. package/dist/main.js.map +1 -1
  18. package/dist/onboarding.d.ts +4 -0
  19. package/dist/onboarding.d.ts.map +1 -0
  20. package/dist/onboarding.js +63 -0
  21. package/dist/onboarding.js.map +1 -0
  22. package/dist/providers.d.ts +1 -0
  23. package/dist/providers.d.ts.map +1 -1
  24. package/dist/providers.js +11 -1
  25. package/dist/providers.js.map +1 -1
  26. package/dist/types.d.ts +40 -9
  27. package/dist/types.d.ts.map +1 -1
  28. package/dist/util.d.ts +6 -3
  29. package/dist/util.d.ts.map +1 -1
  30. package/dist/util.js +73 -1
  31. package/dist/util.js.map +1 -1
  32. package/dist/web/server.d.ts.map +1 -1
  33. package/dist/web/server.js +0 -11
  34. package/dist/web/server.js.map +1 -1
  35. package/package.json +2 -1
  36. package/src/assertions.ts +141 -28
  37. package/src/cache.ts +0 -1
  38. package/src/evaluator.ts +88 -44
  39. package/src/index.ts +14 -26
  40. package/src/main.ts +115 -102
  41. package/src/onboarding.ts +61 -0
  42. package/src/providers.ts +9 -0
  43. package/src/types.ts +89 -12
  44. package/src/util.ts +90 -3
  45. package/src/web/server.ts +0 -18
package/src/onboarding.ts ADDED
@@ -0,0 +1,61 @@
1
+ export const DEFAULT_PROMPTS = `Your first prompt goes here
2
+ ---
3
+ Next prompt goes here. You can substitute variables like this: {{var1}} {{var2}} {{var3}}
4
+ ---
5
+ This is the next prompt.
6
+
7
+ These prompts are nunjucks templates, so you can use logic like this:
8
+ {% if var1 %}
9
+ {{ var1 }}
10
+ {% endif %}
11
+ ---
12
+ If you prefer, you can break prompts into multiple files (make sure to edit promptfooconfig.yaml accordingly)
13
+ `;
14
+
15
+ export const DEFAULT_YAML_CONFIG = `# This configuration runs each prompt through a series of example inputs and checks if they meet requirements.
16
+
17
+ prompts: [prompts.txt]
18
+ providers: [openai:gpt-3.5-turbo]
19
+ tests:
20
+ - description: First test case - automatic review
21
+ vars:
22
+ var1: first variable's value
23
+ var2: another value
24
+ var3: some other value
25
+ assert:
26
+ - type: equality
27
+ value: expected LLM output goes here
28
+ - type: function
29
+ value: output.includes('some text')
30
+
31
+ - description: Second test case - manual review
32
+ # Test cases don't need assertions if you prefer to manually review the output
33
+ vars:
34
+ var1: new value
35
+ var2: another value
36
+ var3: third value
37
+
38
+ - description: Third test case - other types of automatic review
39
+ vars:
40
+ var1: yet another value
41
+ var2: and another
42
+ var3: dear llm, please output your response in json format
43
+ assert:
44
+ - type: contains-json
45
+ - type: similarity
46
+ value: ensures that output is semantically similar to this text
47
+ - type: llm-rubric
48
+ value: ensure that output contains a reference to X
49
+ `;
50
+
51
+ export const DEFAULT_README = `To get started, set your OPENAI_API_KEY environment variable.
52
+
53
+ Next, change a few of the prompts in prompts.txt and edit promptfooconfig.yaml.
54
+
55
+ Then run:
56
+ \`\`\`
57
+ promptfoo eval
58
+ \`\`\`
59
+
60
+ Afterwards, you can view the results by running \`promptfoo view\`
61
+ `;
package/src/providers.ts CHANGED
@@ -5,6 +5,15 @@ import { ApiProvider } from './types.js';
5
5
  import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai.js';
6
6
  import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai.js';
7
7
 
8
+ export async function loadApiProviders(providerPaths: string | string[]): Promise<ApiProvider[]> {
9
+ if (typeof providerPaths === 'string') {
10
+ return [await loadApiProvider(providerPaths)];
11
+ } else if (Array.isArray(providerPaths)) {
12
+ return Promise.all(providerPaths.map((provider) => loadApiProvider(provider)));
13
+ }
14
+ throw new Error('Invalid providers list');
15
+ }
16
+
8
17
  export async function loadApiProvider(providerPath: string): Promise<ApiProvider> {
9
18
  if (providerPath?.startsWith('openai:')) {
10
19
  // Load OpenAI module
package/src/types.ts CHANGED
@@ -1,11 +1,16 @@
1
1
  export interface CommandLineOptions {
2
+ // Shared with TestSuite
2
3
  prompts: string[];
3
4
  providers: string[];
4
- output?: string;
5
+ output: string;
6
+
7
+ // Shared with EvaluateOptions
8
+ maxConcurrency: string;
9
+
10
+ // Command line only
5
11
  vars?: string;
6
12
  config?: string;
7
13
  verbose?: boolean;
8
- maxConcurrency?: string;
9
14
  grader?: string;
10
15
  view?: string;
11
16
  tableCellMaxLength?: string;
@@ -48,27 +53,19 @@ export interface CsvRow {
48
53
  export type VarMapping = Record<string, string>;
49
54
 
50
55
  export interface GradingConfig {
51
- prompt?: string;
56
+ rubricPrompt?: string;
52
57
  provider?: string | ApiProvider;
53
58
  }
54
59
 
55
60
  export interface PromptConfig {
56
61
  prefix?: string;
57
62
  suffix?: string;
58
- generateSuggestions?: boolean;
59
63
  }
60
64
 
61
65
  export interface EvaluateOptions {
62
- providers: ApiProvider[];
63
- prompts: string[];
64
- vars?: VarMapping[];
65
-
66
66
  maxConcurrency?: number;
67
67
  showProgressBar?: boolean;
68
-
69
- grading?: GradingConfig;
70
-
71
- prompt?: PromptConfig;
68
+ generateSuggestions?: boolean;
72
69
  }
73
70
 
74
71
  export interface Prompt {
@@ -108,3 +105,83 @@ export interface EvaluateSummary {
108
105
  table: EvaluateTable;
109
106
  stats: EvaluateStats;
110
107
  }
108
+
109
+ export interface GradingResult {
110
+ pass: boolean;
111
+ reason: string;
112
+ tokensUsed?: TokenUsage;
113
+ }
114
+
115
+ // TODO(ian): maybe Assertion should support {type: config} to make the yaml cleaner
116
+ export interface Assertion {
117
+ // Type of assertion
118
+ type: 'equals' | 'is-json' | 'contains-json' | 'javascript' | 'similar' | 'llm-rubric';
119
+
120
+ // The expected value, if applicable
121
+ value?: string;
122
+
123
+ // The threshold value, only applicable for similarity (cosine distance)
124
+ threshold?: number;
125
+
126
+ // Some assertions (similarity, llm-rubric) require an LLM provider
127
+ provider?: ApiProvider;
128
+ }
129
+
130
+ // Each test case is graded pass/fail. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
131
+ export interface TestCase {
132
+ // Optional description of what you're testing
133
+ description?: string;
134
+
135
+ // Key-value pairs to substitute in the prompt
136
+ vars?: Record<string, string>;
137
+
138
+ // Optional list of automatic checks to run on the LLM output
139
+ assert?: Assertion[];
140
+
141
+ // Additional configuration settings for the prompt
142
+ options?: PromptConfig & GradingConfig;
143
+ }
144
+
145
+ // The test suite defines the "knobs" that we are tuning in prompt engineering: providers and prompts
146
+ export interface TestSuite {
147
+ // Optional description of what your LLM is trying to do
148
+ description?: string;
149
+
150
+ // One or more LLM APIs to use
151
+ providers: ApiProvider[];
152
+
153
+ // One or more prompt strings
154
+ prompts: string[];
155
+
156
+ // Test cases
157
+ tests?: TestCase[];
158
+
159
+ // Default test case config
160
+ defaultTest?: Partial<TestCase>;
161
+ }
162
+
163
+ // TestSuiteConfig = Test Suite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, tests can be filepath or inline.
164
+ export interface TestSuiteConfig {
165
+ // Optional description of what your LLM is trying to do
166
+ description?: string;
167
+
168
+ // One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
169
+ providers: string | string[];
170
+
171
+ // One or more prompt files to load
172
+ prompts: string | string[];
173
+
174
+ // Path to a test file, OR list of LLM prompt variations (aka "test case")
175
+ tests: string | TestCase[];
176
+
177
+ // Sets the default properties for each test case. Useful for setting an assertion, on all test cases, for example.
178
+ defaultTest?: Omit<TestCase, 'description'>;
179
+
180
+ // Path to write output. Writes to console/web viewer if not set.
181
+ outputPath?: string;
182
+ }
183
+
184
+ export type UnifiedConfig = TestSuiteConfig & {
185
+ evaluateOptions: EvaluateOptions;
186
+ commandLineOptions: Partial<CommandLineOptions>;
187
+ };
package/src/util.ts CHANGED
@@ -7,7 +7,6 @@ import yaml from 'js-yaml';
7
7
  import nunjucks from 'nunjucks';
8
8
  import { globSync } from 'glob';
9
9
  import { parse as parsePath } from 'path';
10
- import { CsvRow } from './types.js';
11
10
  import { parse as parseCsv } from 'csv-parse/sync';
12
11
  import { stringify } from 'csv-stringify/sync';
13
12
 
@@ -16,7 +15,16 @@ import { getDirectory } from './esm.js';
16
15
 
17
16
  import type { RequestInfo, RequestInit, Response } from 'node-fetch';
18
17
 
19
- import type { EvaluateSummary } from './types.js';
18
+ import type {
19
+ Assertion,
20
+ CsvRow,
21
+ EvaluateSummary,
22
+ CommandLineOptions,
23
+ TestSuite,
24
+ UnifiedConfig,
25
+ TestCase,
26
+ } from './types.js';
27
+ import { assertionFromString } from './assertions.js';
20
28
 
21
29
  const PROMPT_DELIMITER = '---';
22
30
 
@@ -28,7 +36,35 @@ function parseJson(json: string): any | undefined {
28
36
  }
29
37
  }
30
38
 
31
- export function readPrompts(promptPathsOrGlobs: string[]): string[] {
39
+ export function maybeReadConfig(configPath: string): UnifiedConfig | undefined {
40
+ try {
41
+ return readConfig(configPath);
42
+ } catch {
43
+ return undefined;
44
+ }
45
+ }
46
+
47
+ export function readConfig(configPath: string): UnifiedConfig {
48
+ if (!fs.existsSync(configPath)) {
49
+ throw new Error(`Config file not found: ${configPath}`);
50
+ }
51
+ const ext = path.parse(configPath).ext;
52
+ switch (ext) {
53
+ case '.json':
54
+ const content = fs.readFileSync(configPath, 'utf-8');
55
+ return JSON.parse(content) as UnifiedConfig;
56
+ case '.js':
57
+ return require(configPath) as UnifiedConfig;
58
+ case '.yaml':
59
+ return yaml.load(fs.readFileSync(configPath, 'utf-8')) as UnifiedConfig;
60
+ default:
61
+ throw new Error(`Unsupported configuration file format: ${ext}`);
62
+ }
63
+ }
64
+
65
+ export function readPrompts(promptPathsOrGlobs: string | string[]): string[] {
66
+ promptPathsOrGlobs =
67
+ typeof promptPathsOrGlobs === 'string' ? [promptPathsOrGlobs] : promptPathsOrGlobs;
32
68
  const promptPaths = promptPathsOrGlobs.flatMap((pathOrGlob) => globSync(pathOrGlob));
33
69
  let promptContents: string[] = [];
34
70
 
@@ -49,6 +85,9 @@ export function readPrompts(promptPathsOrGlobs: string[]): string[] {
49
85
  if (promptContents.length === 1) {
50
86
  promptContents = promptContents[0].split(PROMPT_DELIMITER).map((p) => p.trim());
51
87
  }
88
+ if (promptContents.length === 0) {
89
+ throw new Error(`There are no prompts in ${promptPathsOrGlobs.join(', ')}`);
90
+ }
52
91
  return promptContents;
53
92
  }
54
93
 
@@ -67,6 +106,37 @@ export function readVars(varsPath: string): CsvRow[] {
67
106
  return rows;
68
107
  }
69
108
 
109
+ export function readTests(tests: string | TestCase[] | undefined): TestCase[] {
110
+ if (!tests) {
111
+ return [];
112
+ }
113
+
114
+ if (typeof tests === 'string') {
115
+ // It's a filepath, load from CSV
116
+ const vars = readVars(tests);
117
+ return vars.map((row, idx) => {
118
+ const test = testCaseFromCsvRow(row);
119
+ test.description = `Row #${idx + 1}`;
120
+ return test;
121
+ });
122
+ }
123
+
124
+ // Some validation of the shape of tests
125
+ for (const test of tests) {
126
+ if (!test.assert && !test.vars) {
127
+ throw new Error(
128
+ `Test case must have either "assert" or "vars" property. Instead got ${JSON.stringify(
129
+ test,
130
+ null,
131
+ 2,
132
+ )}`,
133
+ );
134
+ }
135
+ }
136
+
137
+ return tests;
138
+ }
139
+
70
140
  export function writeOutput(outputPath: string, summary: EvaluateSummary): void {
71
141
  const outputExtension = outputPath.split('.').pop()?.toLowerCase();
72
142
 
@@ -153,3 +223,20 @@ export function cosineSimilarity(vecA: number[], vecB: number[]) {
153
223
  const vecBMagnitude = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
154
224
  return dotProduct / (vecAMagnitude * vecBMagnitude);
155
225
  }
226
+
227
+ export function testCaseFromCsvRow(row: CsvRow): TestCase {
228
+ const vars: Record<string, string> = {};
229
+ const asserts: Assertion[] = [];
230
+ for (const [key, value] of Object.entries(row)) {
231
+ if (key === '__expected') {
232
+ asserts.push(assertionFromString(value));
233
+ } else {
234
+ vars[key] = value;
235
+ }
236
+ }
237
+
238
+ return {
239
+ vars,
240
+ assert: asserts,
241
+ };
242
+ }
package/src/web/server.ts CHANGED
@@ -32,24 +32,6 @@ export function init(port = 15500) {
32
32
  },
33
33
  });
34
34
 
35
- interface EvaluateRequestBody {
36
- provider: string;
37
- options: {
38
- prompts: string[];
39
- vars: Record<string, string>[];
40
- };
41
- }
42
-
43
- app.post('/evaluate', async (req: Request, res: Response) => {
44
- try {
45
- const { provider, options } = req.body as EvaluateRequestBody;
46
- const summary = await promptfoo.evaluate(provider, options);
47
- res.json(summary);
48
- } catch (error) {
49
- res.status(500).json({ message: 'Error evaluating prompts' });
50
- }
51
- });
52
-
53
35
  const latestJsonPath = getLatestResultsPath();
54
36
  const readLatestJson = () => {
55
37
  const data = fs.readFileSync(latestJsonPath, 'utf8');