npm - tuneprompt - Versions diffs - 1.0.7 → 1.1.2 - Mend

tuneprompt 1.0.7 → 1.1.2

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (43)

package/README.md +18 -9
package/dist/cli.js +5 -2
package/dist/commands/fix.d.ts +3 -1
package/dist/commands/fix.js +45 -25
package/dist/commands/generate.d.ts +2 -0
package/dist/commands/generate.js +11 -0
package/dist/engine/__tests__/optimizer.test.d.ts +1 -0
package/dist/engine/__tests__/optimizer.test.js +9 -0
package/dist/engine/loader.js +6 -2
package/dist/engine/metaPrompt.d.ts +5 -0
package/dist/engine/metaPrompt.js +55 -55
package/dist/engine/optimizer.d.ts +7 -21
package/dist/engine/optimizer.js +141 -252
package/dist/engine/runner.d.ts +2 -0
package/dist/engine/runner.js +56 -69
package/dist/engine/shadowTester.d.ts +17 -2
package/dist/engine/shadowTester.js +86 -128
package/dist/providers/__tests__/custom.test.d.ts +1 -0
package/dist/providers/__tests__/custom.test.js +9 -0
package/dist/providers/custom.d.ts +6 -0
package/dist/providers/custom.js +10 -0
package/dist/providers/factory.d.ts +6 -0
package/dist/providers/factory.js +38 -0
package/dist/providers/gemini.d.ts +11 -0
package/dist/providers/gemini.js +46 -0
package/dist/scoring/__tests__/rag.test.d.ts +1 -0
package/dist/scoring/__tests__/rag.test.js +10 -0
package/dist/scoring/rag.d.ts +9 -0
package/dist/scoring/rag.js +9 -0
package/dist/services/cloud.service.js +1 -1
package/dist/storage/database.js +1 -1
package/dist/types/fix.d.ts +11 -0
package/dist/types/index.d.ts +2 -1
package/dist/types/test.d.ts +8 -0
package/dist/types/test.js +2 -0
package/dist/utils/config.js +11 -5
package/dist/utils/interpolation.d.ts +4 -0
package/dist/utils/interpolation.js +16 -0
package/dist/utils/storage.d.ts +4 -0
package/dist/utils/storage.js +26 -5
package/dist/utils/validator.d.ts +2 -0
package/dist/utils/validator.js +10 -0
package/package.json +3 -2

package/dist/engine/optimizer.js CHANGED Viewed

@@ -1,118 +1,107 @@
 "use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-      desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-    Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
-    o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.PromptOptimizer = void 0;
-const sdk_1 = __importDefault(require("@anthropic-ai/sdk"));
-const openai_1 = __importDefault(require("openai"));
 const metaPrompt_1 = require("./metaPrompt");
 const constraintExtractor_1 = require("./constraintExtractor");
+const shadowTester_1 = require("./shadowTester");
+const factory_1 = require("../providers/factory");
 class PromptOptimizer {
-    anthropic;
-    openai;
-    openrouter;
-    constructor() {
-        const anthropicKey = process.env.ANTHROPIC_API_KEY;
-        if (anthropicKey &&
-            !anthropicKey.includes('your_key') &&
-            !anthropicKey.startsWith('api_key') &&
-            anthropicKey !== 'phc_xxxxx') {
-            this.anthropic = new sdk_1.default({
-                apiKey: anthropicKey
-            });
-        }
-        const openaiKey = process.env.OPENAI_API_KEY;
-        if (openaiKey && !openaiKey.includes('your_key')) {
-            this.openai = new openai_1.default({
-                apiKey: openaiKey
-            });
-        }
-        const openrouterKey = process.env.OPENROUTER_API_KEY;
-        if (openrouterKey && !openrouterKey.includes('your_key')) {
-            this.openrouter = new openai_1.default({
-                baseURL: 'https://openrouter.ai/api/v1',
-                apiKey: openrouterKey,
-                defaultHeaders: {
-                    'HTTP-Referer': 'https://tuneprompt.xyz',
-                    'X-Title': 'TunePrompt CLI',
-                },
-            });
-        }
+    maxIterations;
+    constructor(options = {}) {
+        this.maxIterations = options.maxIterations || 3;
     }
     /**
-     * Main optimization method
+     * Main optimization method with Anti-Regression and Iterative Refinement
      */
-    async optimize(failedTest) {
+    async optimize(failedTest, suite) {
         console.log(`\n🧠 Analyzing failure: "${failedTest.description}"`);
-        // Step 1: Extract constraints and build context
+        console.log(`📈 Full test suite size: ${suite.length}`);
+        const initialAggregateScore = suite.reduce((sum, t) => sum + t.score, 0) / suite.length;
+        console.log(`📊 Current aggregate score: ${initialAggregateScore.toFixed(2)}`);
         const errorContext = (0, constraintExtractor_1.generateErrorContext)(failedTest);
-        // Step 2: Choose the right meta-prompt based on error type
-        const metaPrompt = this.selectMetaPrompt(failedTest, errorContext);
-        // Step 3: Generate fix candidates using Claude
-        console.log('⚡ Generating optimized prompt candidates...');
-        const candidates = await this.generateCandidates(metaPrompt, failedTest);
-        // Step 4: Shadow test each candidate
-        console.log('🧪 Shadow testing candidates...');
-        const bestCandidate = await this.selectBestCandidate(candidates, failedTest);
-        return {
-            originalPrompt: failedTest.prompt,
-            optimizedPrompt: bestCandidate.prompt,
-            reasoning: bestCandidate.reasoning,
-            confidence: bestCandidate.score,
-            testResults: {
-                score: bestCandidate.score,
-                passed: bestCandidate.score >= failedTest.threshold,
-                output: '' // Will be filled by shadow tester
+        const passingExamples = suite
+            .filter(t => t.score >= t.threshold)
+            .slice(0, 3)
+            .map(t => ({ input: t.input, output: t.expectedOutput }));
+        let iterations = 0;
+        let lastFailureReason = undefined;
+        let bestResult = null;
+        let bestAggregateScore = initialAggregateScore;
+        let conversation = [];
+        while (iterations < this.maxIterations) {
+            iterations++;
+            console.log(`🚀 Optimization Attempt #${iterations}...`);
+            if (iterations === 1) {
+                const input = {
+                    originalPrompt: failedTest.prompt,
+                    testInput: failedTest.input,
+                    expectedOutput: failedTest.expectedOutput,
+                    actualOutput: failedTest.actualOutput,
+                    errorType: failedTest.errorType,
+                    errorMessage: errorContext,
+                    passingExamples,
+                };
+                conversation.push({ role: 'user', content: this.getMetaPrompt(input) });
             }
-        };
+            else {
+                conversation.push({ role: 'user', content: lastFailureReason || 'Please try again.' });
+            }
+            const generationResult = await this.generateCandidates(conversation, failedTest);
+            const candidates = generationResult.candidates;
+            if (generationResult.rawResponse) {
+                conversation.push({ role: 'assistant', content: generationResult.rawResponse });
+            }
+            for (const candidate of candidates) {
+                try {
+                    console.log(`🧪 Testing candidate...`);
+                    const primaryResult = await (0, shadowTester_1.runShadowTest)(candidate.prompt, failedTest);
+                    if (primaryResult.score < failedTest.threshold) {
+                        console.log(`   ❌ Candidate failed to resolve primary error (score: ${primaryResult.score.toFixed(2)})`);
+                        const specificReason = primaryResult.failureReason || `the output was: "${primaryResult.output.substring(0, 100)}..."`;
+                        lastFailureReason = `Candidate failed. Reason: ${specificReason}. Previous reasoning: ${candidate.reasoning}`;
+                        continue;
+                    }
+                    console.log(`   ✅ Resolved primary error. Running anti-regression...`);
+                    const suiteResult = await (0, shadowTester_1.runSuiteShadowTest)(candidate.prompt, suite);
+                    console.log(`   📊 Suite aggregate score: ${suiteResult.aggregateScore.toFixed(2)}`);
+                    if (suiteResult.aggregateScore > bestAggregateScore) {
+                        bestAggregateScore = suiteResult.aggregateScore;
+                        bestResult = {
+                            originalPrompt: failedTest.prompt,
+                            optimizedPrompt: candidate.prompt,
+                            reasoning: candidate.reasoning,
+                            confidence: suiteResult.aggregateScore,
+                            testResults: {
+                                score: primaryResult.score,
+                                passed: true,
+                                output: primaryResult.output,
+                                aggregateScore: suiteResult.aggregateScore
+                            },
+                            iterations
+                        };
+                    }
+                    else if (suiteResult.aggregateScore <= bestAggregateScore) {
+                        console.log(`   📉 Candidate regression: aggregate score dropped (Current: ${bestAggregateScore.toFixed(2)} VS New: ${suiteResult.aggregateScore.toFixed(2)})`);
+                        const regressions = suiteResult.results.filter(r => !r.passed).map(r => r.failureReason).filter(Boolean);
+                        const regressionText = regressions.length > 0 ? ` Required features broke: ${regressions.slice(0, 2).join('; ')}.` : '';
+                        lastFailureReason = `The fix resolved the failure but introduced regressions in other cases.${regressionText} Maintain all successful patterns while fixing the failure.`;
+                    }
+                }
+                catch (error) {
+                    console.error(`   ⚠️ Validation error for candidate: ${error.message}`);
+                }
+            }
+            if (bestResult)
+                break;
+            console.log(`♻️ No candidate was net-positive. Retrying with refinement feedback...`);
+        }
+        if (!bestResult) {
+            throw new Error(`All fix attempts failed to resolve the regression or improve the aggregate score after ${this.maxIterations} iterations.`);
+        }
+        return bestResult;
     }
-    /**
-     * Select appropriate meta-prompt based on error type
-     */
-    selectMetaPrompt(test, errorContext) {
-        const input = {
-            originalPrompt: test.prompt,
-            testInput: test.input,
-            expectedOutput: test.expectedOutput,
-            actualOutput: test.actualOutput,
-            errorType: test.errorType,
-            errorMessage: errorContext
-        };
-        switch (test.errorType) {
+    getMetaPrompt(input) {
+        switch (input.errorType) {
             case 'json':
                 return (0, metaPrompt_1.generateJSONFixPrompt)(input);
             case 'semantic':
@@ -121,161 +110,61 @@ class PromptOptimizer {
                 return (0, metaPrompt_1.generateOptimizationPrompt)(input);
         }
     }
-    /**
-     * Generate multiple fix candidates using available LLMs with fallback
-     */
-    async generateCandidates(metaPrompt, failedTest) {
-        // Define provider priority order for candidate generation
-        const providers = ['anthropic', 'openai', 'openrouter'];
-        for (const provider of providers) {
+    async generateCandidates(messages, failedTest) {
+        if (process.env.TUNEPROMPT_MOCK_OPTIMIZER === 'true') {
+            return {
+                candidates: [
+                    { prompt: 'Optimized candidate A', reasoning: 'Mock reasoning A', score: 0 },
+                    { prompt: 'Optimized candidate B', reasoning: 'Mock reasoning B', score: 0 }
+                ],
+                rawResponse: '{"candidateA": {"prompt": "Optimized candidate A", "reasoning": "Mock reasoning A"}, "candidateB": {"prompt": "Optimized candidate B", "reasoning": "Mock reasoning B"}}'
+            };
+        }
+        const providerPool = ['anthropic', 'openai', 'gemini', 'openrouter'];
+        const systemPrompt = "You are a prompt optimizer. Output exclusively JSON. You suggest a candidateA and candidateB. You MUST format output as: {\"candidateA\": {\"prompt\": \"...\", \"reasoning\": \"...\"}, \"candidateB\": {\"prompt\": \"...\", \"reasoning\": \"...\"}}";
+        for (const providerName of providerPool) {
             try {
-                // Check if we have the required client for this provider
-                if (provider === 'anthropic' && this.anthropic) {
-                    console.log(`⚡ Using Anthropic for candidate generation...`);
-                    const response = await this.anthropic.messages.create({
-                        model: 'claude-3-5-sonnet-20240620',
-                        max_tokens: 4000,
-                        temperature: 0.7, // Some creativity for prompt rewriting
-                        messages: [{
-                                role: 'user',
-                                content: metaPrompt
-                            }]
-                    });
-                    const content = response.content[0];
-                    if (content.type !== 'text') {
-                        throw new Error('Unexpected response type from Claude');
-                    }
-                    // Parse the JSON response
-                    const parsed = JSON.parse(content.text);
-                    return [
-                        {
-                            prompt: parsed.candidateA.prompt,
-                            reasoning: parsed.candidateA.reasoning,
-                            score: 0 // Will be filled by shadow testing
-                        },
-                        {
-                            prompt: parsed.candidateB.prompt,
-                            reasoning: parsed.candidateB.reasoning,
-                            score: 0
-                        }
-                    ];
-                }
-                else if (provider === 'openai' && this.openai) {
-                    console.log(`⚡ Using OpenAI for candidate generation...`);
-                    const response = await this.openai.chat.completions.create({
-                        model: 'gpt-4o',
-                        messages: [{
-                                role: 'user',
-                                content: metaPrompt
-                            }],
-                        response_format: { type: 'json_object' }
-                    });
-                    const content = response.choices[0]?.message?.content;
-                    if (!content) {
-                        throw new Error('No content returned from OpenAI');
-                    }
-                    // Parse the JSON response
-                    const parsed = JSON.parse(content);
-                    return [
-                        {
-                            prompt: parsed.candidateA.prompt,
-                            reasoning: parsed.candidateA.reasoning,
-                            score: 0 // Will be filled by shadow testing
-                        },
-                        {
-                            prompt: parsed.candidateB.prompt,
-                            reasoning: parsed.candidateB.reasoning,
-                            score: 0
-                        }
-                    ];
-                }
-                else if (provider === 'openrouter' && this.openrouter) {
-                    console.log(`⚡ Using OpenRouter for candidate generation...`);
-                    const response = await this.openrouter.chat.completions.create({
-                        model: 'anthropic/claude-3-sonnet', // Default robust model on OpenRouter
-                        messages: [{
-                                role: 'user',
-                                content: metaPrompt
-                            }],
-                        response_format: { type: 'json_object' }
-                    });
-                    const content = response.choices[0]?.message?.content;
-                    if (!content) {
-                        // Fallback if model doesn't support JSON mode or returns empty
-                        throw new Error('No content returned from OpenRouter');
-                    }
-                    const parsed = JSON.parse(content);
-                    return [
-                        {
-                            prompt: parsed.candidateA.prompt,
-                            reasoning: parsed.candidateA.reasoning,
-                            score: 0
-                        },
-                        {
-                            prompt: parsed.candidateB.prompt,
-                            reasoning: parsed.candidateB.reasoning,
-                            score: 0
-                        }
-                    ];
-                }
+                const apiKey = factory_1.ProviderFactory.getApiKey(providerName);
+                if (!apiKey)
+                    continue;
+                // Pick a strong model for optimization if not defined
+                const model = providerName === 'anthropic' ? 'claude-3-5-sonnet-latest' :
+                    providerName === 'openai' ? 'gpt-4o' :
+                        providerName === 'gemini' ? 'gemini-2.0-flash' : undefined;
+                if (!model)
+                    continue;
+                const provider = factory_1.ProviderFactory.create(providerName, {
+                    apiKey,
+                    model,
+                    maxTokens: 4000
+                });
+                // Convert conversation to a format the provider understands
+                const userContent = messages.map(m => `${m.role.toUpperCase()}: ${m.content}`).join('\n\n');
+                const response = await provider.complete({
+                    system: systemPrompt,
+                    user: userContent
+                });
+                const content = response.content;
+                if (!content)
+                    throw new Error('No content returned');
+                const parsed = JSON.parse(content);
+                return {
+                    candidates: [
+                        { prompt: parsed.candidateA.prompt, reasoning: parsed.candidateA.reasoning, score: 0 },
+                        { prompt: parsed.candidateB.prompt, reasoning: parsed.candidateB.reasoning, score: 0 }
+                    ],
+                    rawResponse: content
+                };
             }
             catch (error) {
-                console.log(`⚠️  ${provider} provider failed for candidate generation: ${error.message}`);
-                continue; // Try next provider
+                console.log(`⚠️ Candidate generation failed for ${providerName}: ${error.message}`);
+                continue;
             }
         }
-        // All providers failed
-        console.error('All providers failed for candidate generation');
-        return [{
-                prompt: this.createFallbackPrompt(failedTest),
-                reasoning: 'Generated using fallback method',
-                score: 0
-            }];
-    }
-    /**
-     * Shadow test each candidate and return the best one
-     */
-    async selectBestCandidate(candidates, originalTest) {
-        const { runShadowTest } = await Promise.resolve().then(() => __importStar(require('./shadowTester')));
-        const testedCandidates = await Promise.all(candidates.map(async (candidate) => {
-            const result = await runShadowTest(candidate.prompt, originalTest);
-            return {
-                ...candidate,
-                score: result.score
-            };
-        }));
-        // Sort by score (highest first)
-        testedCandidates.sort((a, b) => b.score - a.score);
-        return testedCandidates[0];
-    }
-    /**
-     * Fallback prompt improvement - generates a clean rewritten prompt
-     */
-    createFallbackPrompt(test) {
-        // Extract the core intent from the original prompt
-        // Remove any existing "fix" instructions we might have added previously
-        let corePrompt = test.prompt
-            .replace(/\n\nYour response must match this exactly: "[\s\S]*?$/g, '')
-            .replace(/\n\nIMPORTANT: You must respond with valid JSON only[\s\S]*?$/g, '')
-            .replace(/\n\nBe concise and match the expected output format exactly[\s\S]*?$/g, '')
-            .trim();
-        // For JSON errors, create a structured prompt
-        if (test.errorType === 'json') {
-            return `${corePrompt}
-IMPORTANT: You must respond with valid JSON only. No explanations, no markdown, just the raw JSON object.`;
-        }
-        // For semantic errors, be more specific about expected output
-        if (test.errorType === 'semantic') {
-            return `${corePrompt}
-Your response must match this exactly: "${test.expectedOutput}"
-Do not add any extra text, greetings, or explanations. Output only what is requested.`;
-        }
-        // Default: add clarity
-        return `${corePrompt}
-Be concise and match the expected output format exactly.`;
+        return {
+            candidates: [{ prompt: failedTest.prompt, reasoning: 'Fallback - optimization failed', score: 0 }],
+            rawResponse: ''
+        };
     }
 }
 exports.PromptOptimizer = PromptOptimizer;

package/dist/engine/runner.d.ts CHANGED Viewed

@@ -6,4 +6,6 @@ export declare class TestRunner {
     private initializeProviders;
     runTests(testCases: TestCase[]): Promise<TestRun>;
     private runSingleTest;
+    private scoreResult;
+    private runSemanticScoring;
 }

package/dist/engine/runner.js CHANGED Viewed

@@ -2,12 +2,11 @@
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.TestRunner = void 0;
 const uuid_1 = require("uuid");
-const openai_1 = require("../providers/openai");
-const anthropic_1 = require("../providers/anthropic");
-const openrouter_1 = require("../providers/openrouter");
+const factory_1 = require("../providers/factory");
 const exact_match_1 = require("../scoring/exact-match");
 const json_validator_1 = require("../scoring/json-validator");
 const semantic_1 = require("../scoring/semantic");
+const interpolation_1 = require("../utils/interpolation");
 class TestRunner {
     config;
     providers = new Map();
@@ -16,16 +15,12 @@ class TestRunner {
         this.initializeProviders();
     }
     initializeProviders() {
-        if (this.config.providers.openai) {
-            const provider = new openai_1.OpenAIProvider(this.config.providers.openai);
-            this.providers.set("openai", provider);
-        }
-        if (this.config.providers.anthropic) {
-            this.providers.set("anthropic", new anthropic_1.AnthropicProvider(this.config.providers.anthropic));
-        }
-        if (this.config.providers.openrouter) {
-            const provider = new openrouter_1.OpenRouterProvider(this.config.providers.openrouter);
-            this.providers.set("openrouter", provider);
+        const providerNames = ["openai", "anthropic", "openrouter", "gemini"];
+        for (const name of providerNames) {
+            const providerConfig = this.config.providers[name];
+            if (providerConfig && providerConfig.apiKey) {
+                this.providers.set(name, factory_1.ProviderFactory.create(name, providerConfig));
+            }
         }
     }
     async runTests(testCases) {
@@ -52,14 +47,10 @@ class TestRunner {
     async runSingleTest(testCase) {
         const testId = (0, uuid_1.v4)();
         const startTime = Date.now();
-        // Define fallback order: Primary -> Fallbacks
-        const fallbackChain = ["openai", "anthropic", "openrouter"];
-        // Determine starting provider
+        const fallbackChain = ["openai", "anthropic", "gemini", "openrouter"];
         const initialProvider = testCase.config?.provider || "openai";
-        // Build the sequence of providers to try
         let providersToTry;
         if (testCase.config?.provider) {
-            // If provider is explicitly set, only try that one
             providersToTry = [testCase.config.provider];
         }
         else {
@@ -75,56 +66,13 @@ class TestRunner {
             if (!provider)
                 continue;
             try {
-                // Execute prompt
-                const response = await provider.complete(testCase.prompt);
-                // Score result
-                const scoringMethod = testCase.config?.method || "semantic";
+                // Interpolate variables if present
+                const finalPrompt = typeof testCase.prompt === 'string'
+                    ? (0, interpolation_1.interpolateVariables)(testCase.prompt, testCase.variables)
+                    : testCase.prompt;
+                const response = await provider.complete(finalPrompt);
+                const { score, error: scoringError } = await this.scoreResult(testCase, response.content, providerName);
                 const threshold = testCase.config?.threshold || this.config.threshold || 0.8;
-                let score;
-                let error;
-                if (scoringMethod === "exact") {
-                    score = (0, exact_match_1.exactMatch)(String(testCase.expect), response.content);
-                }
-                else if (scoringMethod === "json") {
-                    const result = (0, json_validator_1.validateJSON)(testCase.expect, response.content);
-                    score = result.score;
-                    error = result.error;
-                }
-                else if (scoringMethod === "semantic") {
-                    let calculatedScore;
-                    let lastScoringError;
-                    // potential embedding providers
-                    const embeddingCapable = ["openai", "openrouter"];
-                    // Order: Current provider (if capable) -> OpenAI -> OpenRouter -> others
-                    const scoringProvidersToTry = [
-                        ...(embeddingCapable.includes(providerName) ? [providerName] : []),
-                        ...embeddingCapable.filter((p) => p !== providerName),
-                    ].filter((p) => this.providers.has(p));
-                    if (scoringProvidersToTry.length === 0) {
-                        throw new Error("No embedding-capable providers available for semantic scoring");
-                    }
-                    for (const scoreProviderName of scoringProvidersToTry) {
-                        try {
-                            const scoreProvider = this.providers.get(scoreProviderName);
-                            if (!scoreProvider)
-                                continue;
-                            const scorer = new semantic_1.SemanticScorer(scoreProvider);
-                            calculatedScore = await scorer.score(String(testCase.expect), response.content);
-                            break;
-                        }
-                        catch (err) {
-                            lastScoringError = err;
-                            continue;
-                        }
-                    }
-                    if (calculatedScore === undefined) {
-                        throw new Error(`Semantic scoring failed. Last error: ${lastScoringError?.message || "Unknown error"}`);
-                    }
-                    score = calculatedScore;
-                }
-                else {
-                    throw new Error(`Unknown scoring method: ${scoringMethod}`);
-                }
                 const status = score >= threshold ? "pass" : "fail";
                 const duration = Date.now() - startTime;
                 return {
@@ -134,7 +82,7 @@ class TestRunner {
                     score,
                     actualOutput: response.content,
                     expectedOutput: String(testCase.expect),
-                    error,
+                    error: scoringError,
                     metadata: {
                         duration,
                         timestamp: new Date(),
@@ -150,7 +98,6 @@ class TestRunner {
                 continue;
             }
         }
-        // If all attempts failed
         return {
             id: testId,
             testCase,
@@ -165,5 +112,45 @@ class TestRunner {
             },
         };
     }
+    async scoreResult(testCase, actualOutput, providerName) {
+        const scoringMethod = testCase.config?.method || "semantic";
+        if (scoringMethod === "exact") {
+            return { score: (0, exact_match_1.exactMatch)(String(testCase.expect), actualOutput) };
+        }
+        if (scoringMethod === "json") {
+            const result = (0, json_validator_1.validateJSON)(testCase.expect, actualOutput);
+            return { score: result.score, error: result.error };
+        }
+        if (scoringMethod === "semantic") {
+            return this.runSemanticScoring(testCase, actualOutput, providerName);
+        }
+        throw new Error(`Unknown scoring method: ${scoringMethod}`);
+    }
+    async runSemanticScoring(testCase, actualOutput, currentProviderName) {
+        const embeddingCapable = ["openai", "openrouter"];
+        const scoringProvidersToTry = [
+            ...(embeddingCapable.includes(currentProviderName) ? [currentProviderName] : []),
+            ...embeddingCapable.filter((p) => p !== currentProviderName),
+        ].filter((p) => this.providers.has(p));
+        if (scoringProvidersToTry.length === 0) {
+            throw new Error("No embedding-capable providers available for semantic scoring");
+        }
+        let lastScoringError;
+        for (const scoreProviderName of scoringProvidersToTry) {
+            try {
+                const scoreProvider = this.providers.get(scoreProviderName);
+                if (!scoreProvider)
+                    continue;
+                const scorer = new semantic_1.SemanticScorer(scoreProvider);
+                const score = await scorer.score(String(testCase.expect), actualOutput);
+                return { score };
+            }
+            catch (err) {
+                lastScoringError = err;
+                continue;
+            }
+        }
+        throw new Error(`Semantic scoring failed. Last error: ${lastScoringError?.message || "Unknown error"}`);
+    }
 }
 exports.TestRunner = TestRunner;

package/dist/engine/shadowTester.d.ts CHANGED Viewed

@@ -3,9 +3,24 @@ export interface ShadowTestResult {
     score: number;
     output: string;
     passed: boolean;
+    failureReason?: string;
+}
+export interface ShadowSuiteResult {
+    aggregateScore: number;
+    results: {
+        testId: string;
+        score: number;
+        passed: boolean;
+        output: string;
+        failureReason?: string;
+    }[];
 }
 /**
  * Test a candidate prompt against the original test case
- * Tries providers in sequence until one succeeds
+ * Uses specified provider/model or falls back to priority sequence
+ */
+export declare function runShadowTest(candidatePrompt: string, test: FailedTest): Promise<ShadowTestResult>;
+/**
+ * Run a candidate prompt against multiple tests and return aggregate results
  */
-export declare function runShadowTest(candidatePrompt: string, originalTest: FailedTest): Promise<ShadowTestResult>;
+export declare function runSuiteShadowTest(candidatePrompt: string, tests: FailedTest[]): Promise<ShadowSuiteResult>;