npm - promptfoo - Versions diffs - 0.72.1 → 0.73.0 - Mend

promptfoo 0.72.1 → 0.73.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (470) hide show

package/dist/test/matchers.test.js ADDED Viewed

@@ -0,0 +1,807 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+const matchers_1 = require("../src/matchers");
+const matchers_2 = require("../src/matchers");
+const prompts_1 = require("../src/prompts");
+const huggingface_1 = require("../src/providers/huggingface");
+const openai_1 = require("../src/providers/openai");
+const openai_2 = require("../src/providers/openai");
+const utils_1 = require("./utils");
+jest.mock('../src/esm');
+const Grader = new utils_1.TestGrader();
+describe('matchesSimilarity', () => {
+    beforeEach(() => {
+        jest.spyOn(openai_2.DefaultEmbeddingProvider, 'callEmbeddingApi').mockImplementation((text) => {
+            if (text === 'Expected output' || text === 'Sample output') {
+                return Promise.resolve({
+                    embedding: [1, 0, 0],
+                    tokenUsage: { total: 5, prompt: 2, completion: 3 },
+                });
+            }
+            else if (text === 'Different output') {
+                return Promise.resolve({
+                    embedding: [0, 1, 0],
+                    tokenUsage: { total: 5, prompt: 2, completion: 3 },
+                });
+            }
+            return Promise.reject(new Error('Unexpected input'));
+        });
+    });
+    afterEach(() => {
+        jest.restoreAllMocks();
+    });
+    it('should pass when similarity is above the threshold', async () => {
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const threshold = 0.5;
+        await expect((0, matchers_2.matchesSimilarity)(expected, output, threshold)).resolves.toEqual({
+            pass: true,
+            reason: 'Similarity 1.00 is greater than threshold 0.5',
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should fail when similarity is below the threshold', async () => {
+        const expected = 'Expected output';
+        const output = 'Different output';
+        const threshold = 0.9;
+        await expect((0, matchers_2.matchesSimilarity)(expected, output, threshold)).resolves.toEqual({
+            pass: false,
+            reason: 'Similarity 0.00 is less than threshold 0.9',
+            score: 0,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should fail when inverted similarity is above the threshold', async () => {
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const threshold = 0.5;
+        await expect((0, matchers_2.matchesSimilarity)(expected, output, threshold, true /* invert */)).resolves.toEqual({
+            pass: false,
+            reason: 'Similarity 1.00 is greater than threshold 0.5',
+            score: 0,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should pass when inverted similarity is below the threshold', async () => {
+        const expected = 'Expected output';
+        const output = 'Different output';
+        const threshold = 0.9;
+        await expect((0, matchers_2.matchesSimilarity)(expected, output, threshold, true /* invert */)).resolves.toEqual({
+            pass: true,
+            reason: 'Similarity 0.00 is less than threshold 0.9',
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should use the overridden similarity grading config', async () => {
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const threshold = 0.5;
+        const grading = {
+            provider: {
+                id: 'openai:embedding:text-embedding-ada-9999999',
+                config: {
+                    apiKey: 'abc123',
+                    temperature: 3.1415926,
+                },
+            },
+        };
+        const mockCallApi = jest.spyOn(openai_1.OpenAiEmbeddingProvider.prototype, 'callEmbeddingApi');
+        mockCallApi.mockImplementation(function () {
+            expect(this.config.temperature).toBe(3.1415926);
+            expect(this.getApiKey()).toBe('abc123');
+            return Promise.resolve({
+                embedding: [1, 0, 0],
+                tokenUsage: { total: 5, prompt: 2, completion: 3 },
+            });
+        });
+        await expect((0, matchers_2.matchesSimilarity)(expected, output, threshold, false, grading)).resolves.toEqual({
+            pass: true,
+            reason: 'Similarity 1.00 is greater than threshold 0.5',
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledWith('Expected output');
+        mockCallApi.mockRestore();
+    });
+    it('should throw an error when API call fails', async () => {
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const threshold = 0.5;
+        const grading = {
+            provider: {
+                id: 'openai:embedding:text-embedding-ada-9999999',
+                config: {
+                    apiKey: 'abc123',
+                    temperature: 3.1415926,
+                },
+            },
+        };
+        jest
+            .spyOn(openai_1.OpenAiEmbeddingProvider.prototype, 'callEmbeddingApi')
+            .mockRejectedValueOnce(new Error('API call failed'));
+        await expect(async () => {
+            await (0, matchers_2.matchesSimilarity)(expected, output, threshold, false, grading);
+        }).rejects.toThrow('API call failed');
+    });
+});
+describe('matchesLlmRubric', () => {
+    it('should pass when the grading provider returns a passing result', async () => {
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const options = {
+            rubricPrompt: 'Grading prompt',
+            provider: Grader,
+        };
+        await expect((0, matchers_2.matchesLlmRubric)(expected, output, options)).resolves.toEqual({
+            pass: true,
+            reason: 'Test grading output',
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should fail when the grading provider returns a failing result', async () => {
+        const expected = 'Expected output';
+        const output = 'Different output';
+        const options = {
+            rubricPrompt: 'Grading prompt',
+            provider: Grader,
+        };
+        jest.spyOn(Grader, 'callApi').mockResolvedValueOnce({
+            output: JSON.stringify({ pass: false, reason: 'Grading failed' }),
+            tokenUsage: { total: 10, prompt: 5, completion: 5 },
+        });
+        await expect((0, matchers_2.matchesLlmRubric)(expected, output, options)).resolves.toEqual({
+            pass: false,
+            reason: 'Grading failed',
+            score: 0,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should use the overridden llm rubric grading config', async () => {
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const options = {
+            rubricPrompt: 'Grading prompt',
+            provider: {
+                id: 'openai:gpt-4o-mini',
+                config: {
+                    apiKey: 'abc123',
+                    temperature: 3.1415926,
+                },
+            },
+        };
+        const mockCallApi = jest.spyOn(openai_1.OpenAiChatCompletionProvider.prototype, 'callApi');
+        mockCallApi.mockImplementation(function () {
+            expect(this.config.temperature).toBe(3.1415926);
+            expect(this.getApiKey()).toBe('abc123');
+            return Promise.resolve({
+                output: JSON.stringify({ pass: true, reason: 'Grading passed' }),
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        await expect((0, matchers_2.matchesLlmRubric)(expected, output, options)).resolves.toEqual({
+            reason: 'Grading passed',
+            pass: true,
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledWith('Grading prompt');
+        mockCallApi.mockRestore();
+    });
+});
+describe('matchesFactuality', () => {
+    it('should pass when the factuality check passes', async () => {
+        const input = 'Input text';
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const grading = {};
+        jest.spyOn(openai_2.DefaultGradingProvider, 'callApi').mockResolvedValueOnce({
+            output: '(A) The submitted answer is a subset of the expert answer and is fully consistent with it.',
+            tokenUsage: { total: 10, prompt: 5, completion: 5 },
+        });
+        await expect((0, matchers_2.matchesFactuality)(input, expected, output, grading)).resolves.toEqual({
+            pass: true,
+            reason: 'The submitted answer is a subset of the expert answer and is fully consistent with it.',
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should fail when the factuality check fails', async () => {
+        const input = 'Input text';
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const grading = {};
+        jest.spyOn(openai_2.DefaultGradingProvider, 'callApi').mockResolvedValueOnce({
+            output: '(D) There is a disagreement between the submitted answer and the expert answer.',
+            tokenUsage: { total: 10, prompt: 5, completion: 5 },
+        });
+        await expect((0, matchers_2.matchesFactuality)(input, expected, output, grading)).resolves.toEqual({
+            pass: false,
+            reason: 'There is a disagreement between the submitted answer and the expert answer.',
+            score: 0,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should use the overridden factuality grading config', async () => {
+        const input = 'Input text';
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const grading = {
+            factuality: {
+                subset: 0.8,
+                superset: 0.9,
+                agree: 1,
+                disagree: 0,
+                differButFactual: 0.7,
+            },
+        };
+        jest.spyOn(openai_2.DefaultGradingProvider, 'callApi').mockResolvedValueOnce({
+            output: '(A) The submitted answer is a subset of the expert answer and is fully consistent with it.',
+            tokenUsage: { total: 10, prompt: 5, completion: 5 },
+        });
+        await expect((0, matchers_2.matchesFactuality)(input, expected, output, grading)).resolves.toEqual({
+            pass: true,
+            reason: 'The submitted answer is a subset of the expert answer and is fully consistent with it.',
+            score: 0.8,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should throw an error when an error occurs', async () => {
+        const input = 'Input text';
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const grading = {};
+        jest.spyOn(openai_2.DefaultGradingProvider, 'callApi').mockImplementation(() => {
+            throw new Error('An error occurred');
+        });
+        await expect((0, matchers_2.matchesFactuality)(input, expected, output, grading)).rejects.toThrow('An error occurred');
+    });
+});
+describe('matchesClosedQa', () => {
+    it('should pass when the closed QA check passes', async () => {
+        const input = 'Input text';
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const grading = {};
+        jest.spyOn(openai_2.DefaultGradingProvider, 'callApi').mockResolvedValueOnce({
+            output: 'foo \n \n bar\n Y Y',
+            tokenUsage: { total: 10, prompt: 5, completion: 5 },
+        });
+        await expect((0, matchers_2.matchesClosedQa)(input, expected, output, grading)).resolves.toEqual({
+            pass: true,
+            reason: 'The submission meets the criterion',
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should fail when the closed QA check fails', async () => {
+        const input = 'Input text';
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const grading = {};
+        jest.spyOn(openai_2.DefaultGradingProvider, 'callApi').mockResolvedValueOnce({
+            output: 'foo bar N',
+            tokenUsage: { total: 10, prompt: 5, completion: 5 },
+        });
+        await expect((0, matchers_2.matchesClosedQa)(input, expected, output, grading)).resolves.toEqual({
+            pass: false,
+            reason: 'The submission does not meet the criterion:\nfoo bar N',
+            score: 0,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+    });
+    it('should throw an error when an error occurs', async () => {
+        const input = 'Input text';
+        const expected = 'Expected output';
+        const output = 'Sample output';
+        const grading = {};
+        jest.spyOn(openai_2.DefaultGradingProvider, 'callApi').mockImplementation(() => {
+            throw new Error('An error occurred');
+        });
+        await expect((0, matchers_2.matchesClosedQa)(input, expected, output, grading)).rejects.toThrow('An error occurred');
+    });
+    it('should handle input, criteria, and completion that need escaping', async () => {
+        const input = 'Input "text" with \\ escape characters and \\"nested\\" escapes';
+        const expected = 'Expected "output" with \\\\ escape characters and \\"nested\\" escapes';
+        const output = 'Sample "output" with \\\\ escape characters and \\"nested\\" escapes';
+        const grading = {};
+        let isJson = false;
+        jest.spyOn(openai_2.DefaultGradingProvider, 'callApi').mockImplementation((prompt) => {
+            try {
+                JSON.parse(prompt);
+                isJson = true;
+            }
+            catch (err) {
+                isJson = false;
+            }
+            return Promise.resolve({
+                output: 'foo \n \n bar\n Y Y',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        await expect((0, matchers_2.matchesClosedQa)(input, expected, output, grading)).resolves.toEqual({
+            pass: true,
+            reason: 'The submission meets the criterion',
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(isJson).toBeTruthy();
+    });
+});
+describe('getGradingProvider', () => {
+    it('should return the correct provider when provider is a string', async () => {
+        const provider = await (0, matchers_1.getGradingProvider)('text', 'openai:chat:gpt-3.5-turbo-foobar', openai_2.DefaultGradingProvider);
+        // ok for this not to match exactly when the string is parsed
+        expect(provider?.id()).toBe('openai:gpt-3.5-turbo-foobar');
+    });
+    it('should return the correct provider when provider is an ApiProvider', async () => {
+        const provider = await (0, matchers_1.getGradingProvider)('embedding', openai_2.DefaultEmbeddingProvider, openai_2.DefaultGradingProvider);
+        expect(provider).toBe(openai_2.DefaultEmbeddingProvider);
+    });
+    it('should return the correct provider when provider is ProviderOptions', async () => {
+        const providerOptions = {
+            id: 'openai:chat:gpt-3.5-turbo-foobar',
+            config: {
+                apiKey: 'abc123',
+                temperature: 3.1415926,
+            },
+        };
+        const provider = await (0, matchers_1.getGradingProvider)('text', providerOptions, openai_2.DefaultGradingProvider);
+        expect(provider?.id()).toBe('openai:chat:gpt-3.5-turbo-foobar');
+    });
+    it('should return the default provider when provider is not provided', async () => {
+        const provider = await (0, matchers_1.getGradingProvider)('text', undefined, openai_2.DefaultGradingProvider);
+        expect(provider).toBe(openai_2.DefaultGradingProvider);
+    });
+});
+describe('getAndCheckProvider', () => {
+    it('should return the default provider when provider is not defined', async () => {
+        await expect((0, matchers_1.getAndCheckProvider)('text', undefined, openai_2.DefaultGradingProvider, 'test check')).resolves.toBe(openai_2.DefaultGradingProvider);
+    });
+    it('should return the default provider when provider does not support type', async () => {
+        const provider = {
+            id: () => 'test-provider',
+            callApi: () => Promise.resolve({ output: 'test' }),
+        };
+        await expect((0, matchers_1.getAndCheckProvider)('embedding', provider, openai_2.DefaultEmbeddingProvider, 'test check')).resolves.toBe(openai_2.DefaultEmbeddingProvider);
+    });
+    it('should return the provider if it implements the required method', async () => {
+        const provider = {
+            id: () => 'test-provider',
+            callApi: () => Promise.resolve({ output: 'test' }),
+            callEmbeddingApi: () => Promise.resolve({ embedding: [] }),
+        };
+        const result = await (0, matchers_1.getAndCheckProvider)('embedding', provider, openai_2.DefaultEmbeddingProvider, 'test check');
+        expect(result).toBe(provider);
+    });
+    it('should return the default provider when no provider is specified', async () => {
+        const provider = await (0, matchers_1.getGradingProvider)('text', undefined, openai_2.DefaultGradingProvider);
+        expect(provider).toBe(openai_2.DefaultGradingProvider);
+    });
+    it('should return a specific provider when a provider id is specified', async () => {
+        const provider = await (0, matchers_1.getGradingProvider)('text', 'openai:chat:foo', openai_2.DefaultGradingProvider);
+        // loadApiProvider removes `chat` from the id
+        expect(provider?.id()).toBe('openai:foo');
+    });
+    it('should return a provider from ApiProvider when specified', async () => {
+        const providerOptions = {
+            id: () => 'custom-provider',
+            callApi: async () => ({}),
+        };
+        const provider = await (0, matchers_1.getGradingProvider)('text', providerOptions, openai_2.DefaultGradingProvider);
+        expect(provider?.id()).toBe('custom-provider');
+    });
+    it('should return a provider from ProviderTypeMap when specified', async () => {
+        const providerTypeMap = {
+            text: {
+                id: 'openai:chat:foo',
+            },
+            embedding: {
+                id: 'openai:embedding:bar',
+            },
+        };
+        const provider = await (0, matchers_1.getGradingProvider)('text', providerTypeMap, openai_2.DefaultGradingProvider);
+        expect(provider?.id()).toBe('openai:chat:foo');
+    });
+    it('should return a provider from ProviderTypeMap with basic strings', async () => {
+        const providerTypeMap = {
+            text: 'openai:chat:foo',
+            embedding: 'openai:embedding:bar',
+        };
+        const provider = await (0, matchers_1.getGradingProvider)('text', providerTypeMap, openai_2.DefaultGradingProvider);
+        expect(provider?.id()).toBe('openai:foo');
+    });
+    it('should throw an error when the provider does not match the type', async () => {
+        const providerTypeMap = {
+            embedding: {
+                id: 'openai:embedding:foo',
+            },
+        };
+        await expect((0, matchers_1.getGradingProvider)('text', providerTypeMap, openai_2.DefaultGradingProvider)).rejects.toThrow(new Error(`Invalid provider definition for output type 'text': ${JSON.stringify(providerTypeMap, null, 2)}`));
+    });
+});
+describe('matchesAnswerRelevance', () => {
+    it('should pass when the relevance score is above the threshold', async () => {
+        const input = 'Input text';
+        const output = 'Sample output';
+        const threshold = 0.5;
+        const mockCallApi = jest.spyOn(openai_2.DefaultGradingProvider, 'callApi');
+        mockCallApi.mockImplementation(() => {
+            return Promise.resolve({
+                output: 'foobar',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        const mockCallEmbeddingApi = jest.spyOn(openai_2.DefaultEmbeddingProvider, 'callEmbeddingApi');
+        mockCallEmbeddingApi.mockImplementation(function () {
+            return Promise.resolve({
+                embedding: [1, 0, 0],
+                tokenUsage: { total: 5, prompt: 2, completion: 3 },
+            });
+        });
+        await expect((0, matchers_2.matchesAnswerRelevance)(input, output, threshold)).resolves.toEqual({
+            pass: true,
+            reason: 'Relevance 1.00 is greater than threshold 0.5',
+            score: 1,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledWith(expect.stringContaining(prompts_1.ANSWER_RELEVANCY_GENERATE.slice(0, 50)));
+        expect(mockCallEmbeddingApi).toHaveBeenCalledWith('Input text');
+        mockCallApi.mockRestore();
+        mockCallEmbeddingApi.mockRestore();
+    });
+    it('should fail when the relevance score is below the threshold', async () => {
+        const input = 'Input text';
+        const output = 'Different output';
+        const threshold = 0.5;
+        const mockCallApi = jest.spyOn(openai_2.DefaultGradingProvider, 'callApi');
+        mockCallApi.mockImplementation((text) => {
+            return Promise.resolve({
+                output: text,
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        const mockCallEmbeddingApi = jest.spyOn(openai_2.DefaultEmbeddingProvider, 'callEmbeddingApi');
+        mockCallEmbeddingApi.mockImplementation((text) => {
+            if (text.includes('Input text')) {
+                return Promise.resolve({
+                    embedding: [1, 0, 0],
+                    tokenUsage: { total: 5, prompt: 2, completion: 3 },
+                });
+            }
+            else if (text.includes('Different output')) {
+                return Promise.resolve({
+                    embedding: [0, 1, 0],
+                    tokenUsage: { total: 5, prompt: 2, completion: 3 },
+                });
+            }
+            return Promise.reject(new Error('Unexpected input ' + text));
+        });
+        await expect((0, matchers_2.matchesAnswerRelevance)(input, output, threshold)).resolves.toEqual({
+            pass: false,
+            reason: 'Relevance 0.00 is less than threshold 0.5',
+            score: 0,
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledWith(expect.stringContaining(prompts_1.ANSWER_RELEVANCY_GENERATE.slice(0, 50)));
+        expect(mockCallEmbeddingApi).toHaveBeenCalledWith(expect.stringContaining(prompts_1.ANSWER_RELEVANCY_GENERATE.slice(0, 50)));
+        mockCallApi.mockRestore();
+        mockCallEmbeddingApi.mockRestore();
+    });
+});
+describe('matchesClassification', () => {
+    class TestGrader {
+        async callApi() {
+            throw new Error('Not implemented');
+        }
+        async callClassificationApi() {
+            return {
+                classification: {
+                    classA: 0.6,
+                    classB: 0.5,
+                },
+            };
+        }
+        id() {
+            return 'TestClassificationProvider';
+        }
+    }
+    it('should pass when the classification score is above the threshold', async () => {
+        const expected = 'classA';
+        const output = 'Sample output';
+        const threshold = 0.5;
+        const grader = new TestGrader();
+        const grading = {
+            provider: grader,
+        };
+        await expect((0, matchers_1.matchesClassification)(expected, output, threshold, grading)).resolves.toEqual({
+            pass: true,
+            reason: `Classification ${expected} has score 0.60 >= ${threshold}`,
+            score: 0.6,
+        });
+    });
+    it('should fail when the classification score is below the threshold', async () => {
+        const expected = 'classA';
+        const output = 'Different output';
+        const threshold = 0.9;
+        const grader = new TestGrader();
+        const grading = {
+            provider: grader,
+        };
+        await expect((0, matchers_1.matchesClassification)(expected, output, threshold, grading)).resolves.toEqual({
+            pass: false,
+            reason: `Classification ${expected} has score 0.60 < ${threshold}`,
+            score: 0.6,
+        });
+    });
+    it('should pass when the maximum classification score is above the threshold with undefined expected', async () => {
+        const expected = undefined;
+        const output = 'Sample output';
+        const threshold = 0.55;
+        const grader = new TestGrader();
+        const grading = {
+            provider: grader,
+        };
+        await expect((0, matchers_1.matchesClassification)(expected, output, threshold, grading)).resolves.toEqual({
+            pass: true,
+            reason: `Maximum classification score 0.60 >= ${threshold}`,
+            score: 0.6,
+        });
+    });
+    it('should use the overridden classification grading config', async () => {
+        const expected = 'classA';
+        const output = 'Sample output';
+        const threshold = 0.5;
+        const grading = {
+            provider: {
+                id: 'hf:text-classification:foobar',
+            },
+        };
+        const mockCallApi = jest.spyOn(huggingface_1.HuggingfaceTextClassificationProvider.prototype, 'callClassificationApi');
+        mockCallApi.mockImplementation(function () {
+            return Promise.resolve({
+                classification: { [expected]: 0.6 },
+            });
+        });
+        await expect((0, matchers_1.matchesClassification)(expected, output, threshold, grading)).resolves.toEqual({
+            pass: true,
+            reason: `Classification ${expected} has score 0.60 >= ${threshold}`,
+            score: 0.6,
+        });
+        expect(mockCallApi).toHaveBeenCalledWith('Sample output');
+        mockCallApi.mockRestore();
+    });
+});
+describe('matchesContextRelevance', () => {
+    it('should pass when the relevance score is above the threshold', async () => {
+        const input = 'Input text';
+        const context = 'Context text';
+        const threshold = 0.5;
+        const mockCallApi = jest.spyOn(openai_2.DefaultGradingProvider, 'callApi');
+        mockCallApi.mockImplementation(() => {
+            return Promise.resolve({
+                output: 'foo\nbar\nbaz Insufficient Information',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        await expect((0, matchers_2.matchesContextRelevance)(input, context, threshold)).resolves.toEqual({
+            pass: true,
+            reason: 'Relevance 0.67 is >= 0.5',
+            score: expect.closeTo(0.67, 0.01),
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledWith(expect.stringContaining(prompts_1.CONTEXT_RELEVANCE.slice(0, 100)));
+        mockCallApi.mockRestore();
+    });
+    it('should fail when the relevance score is below the threshold', async () => {
+        const input = 'Input text';
+        const context = 'Context text';
+        const threshold = 0.9;
+        const mockCallApi = jest.spyOn(openai_2.DefaultGradingProvider, 'callApi');
+        mockCallApi.mockImplementation(() => {
+            return Promise.resolve({
+                output: 'foo\nbar\nbaz Insufficient Information',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        await expect((0, matchers_2.matchesContextRelevance)(input, context, threshold)).resolves.toEqual({
+            pass: false,
+            reason: 'Relevance 0.67 is < 0.9',
+            score: expect.closeTo(0.67, 0.01),
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledWith(expect.stringContaining(prompts_1.CONTEXT_RELEVANCE.slice(0, 100)));
+        mockCallApi.mockRestore();
+    });
+});
+describe('matchesContextFaithfulness', () => {
+    it('should pass when the faithfulness score is above the threshold', async () => {
+        const query = 'Query text';
+        const output = 'Output text';
+        const context = 'Context text';
+        const threshold = 0.5;
+        const mockCallApi = jest.spyOn(openai_2.DefaultGradingProvider, 'callApi');
+        mockCallApi
+            .mockImplementationOnce(() => {
+            return Promise.resolve({
+                output: 'Statement 1\nStatement 2\nStatement 3',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        })
+            .mockImplementationOnce(() => {
+            return Promise.resolve({
+                output: 'Final verdict for each statement in order: Yes. No. Yes.',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        await expect((0, matchers_2.matchesContextFaithfulness)(query, output, context, threshold)).resolves.toEqual({
+            pass: true,
+            reason: 'Faithfulness 0.67 is >= 0.5',
+            score: expect.closeTo(0.67, 0.01),
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledTimes(2);
+        mockCallApi.mockRestore();
+    });
+    it('should fail when the faithfulness score is below the threshold', async () => {
+        const query = 'Query text';
+        const output = 'Output text';
+        const context = 'Context text';
+        const threshold = 0.7;
+        const mockCallApi = jest.spyOn(openai_2.DefaultGradingProvider, 'callApi');
+        mockCallApi
+            .mockImplementationOnce(() => {
+            return Promise.resolve({
+                output: 'Statement 1\nStatement 2\nStatement 3',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        })
+            .mockImplementationOnce(() => {
+            return Promise.resolve({
+                output: 'Final verdict for each statement in order: Yes. Yes. No.',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        await expect((0, matchers_2.matchesContextFaithfulness)(query, output, context, threshold)).resolves.toEqual({
+            pass: false,
+            reason: 'Faithfulness 0.67 is < 0.7',
+            score: expect.closeTo(0.67, 0.01),
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledTimes(2);
+        mockCallApi.mockRestore();
+    });
+});
+describe('matchesContextRecall', () => {
+    it('should pass when the recall score is above the threshold', async () => {
+        const context = 'Context text';
+        const groundTruth = 'Ground truth text';
+        const threshold = 0.5;
+        const mockCallApi = jest.spyOn(openai_2.DefaultGradingProvider, 'callApi');
+        mockCallApi.mockImplementation(() => {
+            return Promise.resolve({
+                output: 'foo [Attributed]\nbar [Not attributed]\nbaz [Attributed]',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        await expect((0, matchers_2.matchesContextRecall)(context, groundTruth, threshold)).resolves.toEqual({
+            pass: true,
+            reason: 'Recall 0.67 is >= 0.5',
+            score: expect.closeTo(0.67, 0.01),
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledWith(expect.stringContaining(prompts_1.CONTEXT_RECALL.slice(0, 100)));
+        mockCallApi.mockRestore();
+    });
+    it('should fail when the recall score is below the threshold', async () => {
+        const context = 'Context text';
+        const groundTruth = 'Ground truth text';
+        const threshold = 0.9;
+        const mockCallApi = jest.spyOn(openai_2.DefaultGradingProvider, 'callApi');
+        mockCallApi.mockImplementation(() => {
+            return Promise.resolve({
+                output: 'foo [Attributed]\nbar [Not attributed]\nbaz [Attributed]',
+                tokenUsage: { total: 10, prompt: 5, completion: 5 },
+            });
+        });
+        await expect((0, matchers_2.matchesContextRecall)(context, groundTruth, threshold)).resolves.toEqual({
+            pass: false,
+            reason: 'Recall 0.67 is < 0.9',
+            score: expect.closeTo(0.67, 0.01),
+            tokensUsed: {
+                total: expect.any(Number),
+                prompt: expect.any(Number),
+                completion: expect.any(Number),
+            },
+        });
+        expect(mockCallApi).toHaveBeenCalledWith(expect.stringContaining(prompts_1.CONTEXT_RECALL.slice(0, 100)));
+        mockCallApi.mockRestore();
+    });
+});
+//# sourceMappingURL=matchers.test.js.map