tuneprompt 1.0.7 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/README.md +18 -9
  2. package/dist/cli.js +5 -2
  3. package/dist/commands/fix.d.ts +3 -1
  4. package/dist/commands/fix.js +45 -25
  5. package/dist/commands/generate.d.ts +2 -0
  6. package/dist/commands/generate.js +11 -0
  7. package/dist/engine/__tests__/optimizer.test.d.ts +1 -0
  8. package/dist/engine/__tests__/optimizer.test.js +9 -0
  9. package/dist/engine/loader.js +6 -2
  10. package/dist/engine/metaPrompt.d.ts +5 -0
  11. package/dist/engine/metaPrompt.js +55 -55
  12. package/dist/engine/optimizer.d.ts +7 -21
  13. package/dist/engine/optimizer.js +141 -252
  14. package/dist/engine/runner.d.ts +2 -0
  15. package/dist/engine/runner.js +56 -69
  16. package/dist/engine/shadowTester.d.ts +17 -2
  17. package/dist/engine/shadowTester.js +86 -128
  18. package/dist/providers/__tests__/custom.test.d.ts +1 -0
  19. package/dist/providers/__tests__/custom.test.js +9 -0
  20. package/dist/providers/custom.d.ts +6 -0
  21. package/dist/providers/custom.js +10 -0
  22. package/dist/providers/factory.d.ts +6 -0
  23. package/dist/providers/factory.js +38 -0
  24. package/dist/providers/gemini.d.ts +11 -0
  25. package/dist/providers/gemini.js +46 -0
  26. package/dist/scoring/__tests__/rag.test.d.ts +1 -0
  27. package/dist/scoring/__tests__/rag.test.js +10 -0
  28. package/dist/scoring/rag.d.ts +9 -0
  29. package/dist/scoring/rag.js +9 -0
  30. package/dist/services/cloud.service.js +1 -1
  31. package/dist/storage/database.js +1 -1
  32. package/dist/types/fix.d.ts +11 -0
  33. package/dist/types/index.d.ts +2 -1
  34. package/dist/types/test.d.ts +8 -0
  35. package/dist/types/test.js +2 -0
  36. package/dist/utils/config.js +11 -5
  37. package/dist/utils/interpolation.d.ts +4 -0
  38. package/dist/utils/interpolation.js +16 -0
  39. package/dist/utils/storage.d.ts +4 -0
  40. package/dist/utils/storage.js +26 -5
  41. package/dist/utils/validator.d.ts +2 -0
  42. package/dist/utils/validator.js +10 -0
  43. package/package.json +3 -2
@@ -1,118 +1,107 @@
1
1
  "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
- Object.defineProperty(o, "default", { enumerable: true, value: v });
15
- }) : function(o, v) {
16
- o["default"] = v;
17
- });
18
- var __importStar = (this && this.__importStar) || (function () {
19
- var ownKeys = function(o) {
20
- ownKeys = Object.getOwnPropertyNames || function (o) {
21
- var ar = [];
22
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
- return ar;
24
- };
25
- return ownKeys(o);
26
- };
27
- return function (mod) {
28
- if (mod && mod.__esModule) return mod;
29
- var result = {};
30
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
- __setModuleDefault(result, mod);
32
- return result;
33
- };
34
- })();
35
- var __importDefault = (this && this.__importDefault) || function (mod) {
36
- return (mod && mod.__esModule) ? mod : { "default": mod };
37
- };
38
2
  Object.defineProperty(exports, "__esModule", { value: true });
39
3
  exports.PromptOptimizer = void 0;
40
- const sdk_1 = __importDefault(require("@anthropic-ai/sdk"));
41
- const openai_1 = __importDefault(require("openai"));
42
4
  const metaPrompt_1 = require("./metaPrompt");
43
5
  const constraintExtractor_1 = require("./constraintExtractor");
6
+ const shadowTester_1 = require("./shadowTester");
7
+ const factory_1 = require("../providers/factory");
44
8
  class PromptOptimizer {
45
- anthropic;
46
- openai;
47
- openrouter;
48
- constructor() {
49
- const anthropicKey = process.env.ANTHROPIC_API_KEY;
50
- if (anthropicKey &&
51
- !anthropicKey.includes('your_key') &&
52
- !anthropicKey.startsWith('api_key') &&
53
- anthropicKey !== 'phc_xxxxx') {
54
- this.anthropic = new sdk_1.default({
55
- apiKey: anthropicKey
56
- });
57
- }
58
- const openaiKey = process.env.OPENAI_API_KEY;
59
- if (openaiKey && !openaiKey.includes('your_key')) {
60
- this.openai = new openai_1.default({
61
- apiKey: openaiKey
62
- });
63
- }
64
- const openrouterKey = process.env.OPENROUTER_API_KEY;
65
- if (openrouterKey && !openrouterKey.includes('your_key')) {
66
- this.openrouter = new openai_1.default({
67
- baseURL: 'https://openrouter.ai/api/v1',
68
- apiKey: openrouterKey,
69
- defaultHeaders: {
70
- 'HTTP-Referer': 'https://tuneprompt.xyz',
71
- 'X-Title': 'TunePrompt CLI',
72
- },
73
- });
74
- }
9
+ maxIterations;
10
+ constructor(options = {}) {
11
+ this.maxIterations = options.maxIterations || 3;
75
12
  }
76
13
  /**
77
- * Main optimization method
14
+ * Main optimization method with Anti-Regression and Iterative Refinement
78
15
  */
79
- async optimize(failedTest) {
16
+ async optimize(failedTest, suite) {
80
17
  console.log(`\n🧠 Analyzing failure: "${failedTest.description}"`);
81
- // Step 1: Extract constraints and build context
18
+ console.log(`📈 Full test suite size: ${suite.length}`);
19
+ const initialAggregateScore = suite.reduce((sum, t) => sum + t.score, 0) / suite.length;
20
+ console.log(`📊 Current aggregate score: ${initialAggregateScore.toFixed(2)}`);
82
21
  const errorContext = (0, constraintExtractor_1.generateErrorContext)(failedTest);
83
- // Step 2: Choose the right meta-prompt based on error type
84
- const metaPrompt = this.selectMetaPrompt(failedTest, errorContext);
85
- // Step 3: Generate fix candidates using Claude
86
- console.log('⚡ Generating optimized prompt candidates...');
87
- const candidates = await this.generateCandidates(metaPrompt, failedTest);
88
- // Step 4: Shadow test each candidate
89
- console.log('🧪 Shadow testing candidates...');
90
- const bestCandidate = await this.selectBestCandidate(candidates, failedTest);
91
- return {
92
- originalPrompt: failedTest.prompt,
93
- optimizedPrompt: bestCandidate.prompt,
94
- reasoning: bestCandidate.reasoning,
95
- confidence: bestCandidate.score,
96
- testResults: {
97
- score: bestCandidate.score,
98
- passed: bestCandidate.score >= failedTest.threshold,
99
- output: '' // Will be filled by shadow tester
22
+ const passingExamples = suite
23
+ .filter(t => t.score >= t.threshold)
24
+ .slice(0, 3)
25
+ .map(t => ({ input: t.input, output: t.expectedOutput }));
26
+ let iterations = 0;
27
+ let lastFailureReason = undefined;
28
+ let bestResult = null;
29
+ let bestAggregateScore = initialAggregateScore;
30
+ let conversation = [];
31
+ while (iterations < this.maxIterations) {
32
+ iterations++;
33
+ console.log(`🚀 Optimization Attempt #${iterations}...`);
34
+ if (iterations === 1) {
35
+ const input = {
36
+ originalPrompt: failedTest.prompt,
37
+ testInput: failedTest.input,
38
+ expectedOutput: failedTest.expectedOutput,
39
+ actualOutput: failedTest.actualOutput,
40
+ errorType: failedTest.errorType,
41
+ errorMessage: errorContext,
42
+ passingExamples,
43
+ };
44
+ conversation.push({ role: 'user', content: this.getMetaPrompt(input) });
100
45
  }
101
- };
46
+ else {
47
+ conversation.push({ role: 'user', content: lastFailureReason || 'Please try again.' });
48
+ }
49
+ const generationResult = await this.generateCandidates(conversation, failedTest);
50
+ const candidates = generationResult.candidates;
51
+ if (generationResult.rawResponse) {
52
+ conversation.push({ role: 'assistant', content: generationResult.rawResponse });
53
+ }
54
+ for (const candidate of candidates) {
55
+ try {
56
+ console.log(`🧪 Testing candidate...`);
57
+ const primaryResult = await (0, shadowTester_1.runShadowTest)(candidate.prompt, failedTest);
58
+ if (primaryResult.score < failedTest.threshold) {
59
+ console.log(` ❌ Candidate failed to resolve primary error (score: ${primaryResult.score.toFixed(2)})`);
60
+ const specificReason = primaryResult.failureReason || `the output was: "${primaryResult.output.substring(0, 100)}..."`;
61
+ lastFailureReason = `Candidate failed. Reason: ${specificReason}. Previous reasoning: ${candidate.reasoning}`;
62
+ continue;
63
+ }
64
+ console.log(` ✅ Resolved primary error. Running anti-regression...`);
65
+ const suiteResult = await (0, shadowTester_1.runSuiteShadowTest)(candidate.prompt, suite);
66
+ console.log(` 📊 Suite aggregate score: ${suiteResult.aggregateScore.toFixed(2)}`);
67
+ if (suiteResult.aggregateScore > bestAggregateScore) {
68
+ bestAggregateScore = suiteResult.aggregateScore;
69
+ bestResult = {
70
+ originalPrompt: failedTest.prompt,
71
+ optimizedPrompt: candidate.prompt,
72
+ reasoning: candidate.reasoning,
73
+ confidence: suiteResult.aggregateScore,
74
+ testResults: {
75
+ score: primaryResult.score,
76
+ passed: true,
77
+ output: primaryResult.output,
78
+ aggregateScore: suiteResult.aggregateScore
79
+ },
80
+ iterations
81
+ };
82
+ }
83
+ else if (suiteResult.aggregateScore <= bestAggregateScore) {
84
+ console.log(` 📉 Candidate regression: aggregate score dropped (Current: ${bestAggregateScore.toFixed(2)} VS New: ${suiteResult.aggregateScore.toFixed(2)})`);
85
+ const regressions = suiteResult.results.filter(r => !r.passed).map(r => r.failureReason).filter(Boolean);
86
+ const regressionText = regressions.length > 0 ? ` Required features broke: ${regressions.slice(0, 2).join('; ')}.` : '';
87
+ lastFailureReason = `The fix resolved the failure but introduced regressions in other cases.${regressionText} Maintain all successful patterns while fixing the failure.`;
88
+ }
89
+ }
90
+ catch (error) {
91
+ console.error(` ⚠️ Validation error for candidate: ${error.message}`);
92
+ }
93
+ }
94
+ if (bestResult)
95
+ break;
96
+ console.log(`♻️ No candidate was net-positive. Retrying with refinement feedback...`);
97
+ }
98
+ if (!bestResult) {
99
+ throw new Error(`All fix attempts failed to resolve the regression or improve the aggregate score after ${this.maxIterations} iterations.`);
100
+ }
101
+ return bestResult;
102
102
  }
103
- /**
104
- * Select appropriate meta-prompt based on error type
105
- */
106
- selectMetaPrompt(test, errorContext) {
107
- const input = {
108
- originalPrompt: test.prompt,
109
- testInput: test.input,
110
- expectedOutput: test.expectedOutput,
111
- actualOutput: test.actualOutput,
112
- errorType: test.errorType,
113
- errorMessage: errorContext
114
- };
115
- switch (test.errorType) {
103
+ getMetaPrompt(input) {
104
+ switch (input.errorType) {
116
105
  case 'json':
117
106
  return (0, metaPrompt_1.generateJSONFixPrompt)(input);
118
107
  case 'semantic':
@@ -121,161 +110,61 @@ class PromptOptimizer {
121
110
  return (0, metaPrompt_1.generateOptimizationPrompt)(input);
122
111
  }
123
112
  }
124
- /**
125
- * Generate multiple fix candidates using available LLMs with fallback
126
- */
127
- async generateCandidates(metaPrompt, failedTest) {
128
- // Define provider priority order for candidate generation
129
- const providers = ['anthropic', 'openai', 'openrouter'];
130
- for (const provider of providers) {
113
+ async generateCandidates(messages, failedTest) {
114
+ if (process.env.TUNEPROMPT_MOCK_OPTIMIZER === 'true') {
115
+ return {
116
+ candidates: [
117
+ { prompt: 'Optimized candidate A', reasoning: 'Mock reasoning A', score: 0 },
118
+ { prompt: 'Optimized candidate B', reasoning: 'Mock reasoning B', score: 0 }
119
+ ],
120
+ rawResponse: '{"candidateA": {"prompt": "Optimized candidate A", "reasoning": "Mock reasoning A"}, "candidateB": {"prompt": "Optimized candidate B", "reasoning": "Mock reasoning B"}}'
121
+ };
122
+ }
123
+ const providerPool = ['anthropic', 'openai', 'gemini', 'openrouter'];
124
+ const systemPrompt = "You are a prompt optimizer. Output exclusively JSON. You suggest a candidateA and candidateB. You MUST format output as: {\"candidateA\": {\"prompt\": \"...\", \"reasoning\": \"...\"}, \"candidateB\": {\"prompt\": \"...\", \"reasoning\": \"...\"}}";
125
+ for (const providerName of providerPool) {
131
126
  try {
132
- // Check if we have the required client for this provider
133
- if (provider === 'anthropic' && this.anthropic) {
134
- console.log(`⚡ Using Anthropic for candidate generation...`);
135
- const response = await this.anthropic.messages.create({
136
- model: 'claude-3-5-sonnet-20240620',
137
- max_tokens: 4000,
138
- temperature: 0.7, // Some creativity for prompt rewriting
139
- messages: [{
140
- role: 'user',
141
- content: metaPrompt
142
- }]
143
- });
144
- const content = response.content[0];
145
- if (content.type !== 'text') {
146
- throw new Error('Unexpected response type from Claude');
147
- }
148
- // Parse the JSON response
149
- const parsed = JSON.parse(content.text);
150
- return [
151
- {
152
- prompt: parsed.candidateA.prompt,
153
- reasoning: parsed.candidateA.reasoning,
154
- score: 0 // Will be filled by shadow testing
155
- },
156
- {
157
- prompt: parsed.candidateB.prompt,
158
- reasoning: parsed.candidateB.reasoning,
159
- score: 0
160
- }
161
- ];
162
- }
163
- else if (provider === 'openai' && this.openai) {
164
- console.log(`⚡ Using OpenAI for candidate generation...`);
165
- const response = await this.openai.chat.completions.create({
166
- model: 'gpt-4o',
167
- messages: [{
168
- role: 'user',
169
- content: metaPrompt
170
- }],
171
- response_format: { type: 'json_object' }
172
- });
173
- const content = response.choices[0]?.message?.content;
174
- if (!content) {
175
- throw new Error('No content returned from OpenAI');
176
- }
177
- // Parse the JSON response
178
- const parsed = JSON.parse(content);
179
- return [
180
- {
181
- prompt: parsed.candidateA.prompt,
182
- reasoning: parsed.candidateA.reasoning,
183
- score: 0 // Will be filled by shadow testing
184
- },
185
- {
186
- prompt: parsed.candidateB.prompt,
187
- reasoning: parsed.candidateB.reasoning,
188
- score: 0
189
- }
190
- ];
191
- }
192
- else if (provider === 'openrouter' && this.openrouter) {
193
- console.log(`⚡ Using OpenRouter for candidate generation...`);
194
- const response = await this.openrouter.chat.completions.create({
195
- model: 'anthropic/claude-3-sonnet', // Default robust model on OpenRouter
196
- messages: [{
197
- role: 'user',
198
- content: metaPrompt
199
- }],
200
- response_format: { type: 'json_object' }
201
- });
202
- const content = response.choices[0]?.message?.content;
203
- if (!content) {
204
- // Fallback if model doesn't support JSON mode or returns empty
205
- throw new Error('No content returned from OpenRouter');
206
- }
207
- const parsed = JSON.parse(content);
208
- return [
209
- {
210
- prompt: parsed.candidateA.prompt,
211
- reasoning: parsed.candidateA.reasoning,
212
- score: 0
213
- },
214
- {
215
- prompt: parsed.candidateB.prompt,
216
- reasoning: parsed.candidateB.reasoning,
217
- score: 0
218
- }
219
- ];
220
- }
127
+ const apiKey = factory_1.ProviderFactory.getApiKey(providerName);
128
+ if (!apiKey)
129
+ continue;
130
+ // Pick a strong model for optimization if not defined
131
+ const model = providerName === 'anthropic' ? 'claude-3-5-sonnet-latest' :
132
+ providerName === 'openai' ? 'gpt-4o' :
133
+ providerName === 'gemini' ? 'gemini-2.0-flash' : undefined;
134
+ if (!model)
135
+ continue;
136
+ const provider = factory_1.ProviderFactory.create(providerName, {
137
+ apiKey,
138
+ model,
139
+ maxTokens: 4000
140
+ });
141
+ // Convert conversation to a format the provider understands
142
+ const userContent = messages.map(m => `${m.role.toUpperCase()}: ${m.content}`).join('\n\n');
143
+ const response = await provider.complete({
144
+ system: systemPrompt,
145
+ user: userContent
146
+ });
147
+ const content = response.content;
148
+ if (!content)
149
+ throw new Error('No content returned');
150
+ const parsed = JSON.parse(content);
151
+ return {
152
+ candidates: [
153
+ { prompt: parsed.candidateA.prompt, reasoning: parsed.candidateA.reasoning, score: 0 },
154
+ { prompt: parsed.candidateB.prompt, reasoning: parsed.candidateB.reasoning, score: 0 }
155
+ ],
156
+ rawResponse: content
157
+ };
221
158
  }
222
159
  catch (error) {
223
- console.log(`⚠️ ${provider} provider failed for candidate generation: ${error.message}`);
224
- continue; // Try next provider
160
+ console.log(`⚠️ Candidate generation failed for ${providerName}: ${error.message}`);
161
+ continue;
225
162
  }
226
163
  }
227
- // All providers failed
228
- console.error('All providers failed for candidate generation');
229
- return [{
230
- prompt: this.createFallbackPrompt(failedTest),
231
- reasoning: 'Generated using fallback method',
232
- score: 0
233
- }];
234
- }
235
- /**
236
- * Shadow test each candidate and return the best one
237
- */
238
- async selectBestCandidate(candidates, originalTest) {
239
- const { runShadowTest } = await Promise.resolve().then(() => __importStar(require('./shadowTester')));
240
- const testedCandidates = await Promise.all(candidates.map(async (candidate) => {
241
- const result = await runShadowTest(candidate.prompt, originalTest);
242
- return {
243
- ...candidate,
244
- score: result.score
245
- };
246
- }));
247
- // Sort by score (highest first)
248
- testedCandidates.sort((a, b) => b.score - a.score);
249
- return testedCandidates[0];
250
- }
251
- /**
252
- * Fallback prompt improvement - generates a clean rewritten prompt
253
- */
254
- createFallbackPrompt(test) {
255
- // Extract the core intent from the original prompt
256
- // Remove any existing "fix" instructions we might have added previously
257
- let corePrompt = test.prompt
258
- .replace(/\n\nYour response must match this exactly: "[\s\S]*?$/g, '')
259
- .replace(/\n\nIMPORTANT: You must respond with valid JSON only[\s\S]*?$/g, '')
260
- .replace(/\n\nBe concise and match the expected output format exactly[\s\S]*?$/g, '')
261
- .trim();
262
- // For JSON errors, create a structured prompt
263
- if (test.errorType === 'json') {
264
- return `${corePrompt}
265
-
266
- IMPORTANT: You must respond with valid JSON only. No explanations, no markdown, just the raw JSON object.`;
267
- }
268
- // For semantic errors, be more specific about expected output
269
- if (test.errorType === 'semantic') {
270
- return `${corePrompt}
271
-
272
- Your response must match this exactly: "${test.expectedOutput}"
273
- Do not add any extra text, greetings, or explanations. Output only what is requested.`;
274
- }
275
- // Default: add clarity
276
- return `${corePrompt}
277
-
278
- Be concise and match the expected output format exactly.`;
164
+ return {
165
+ candidates: [{ prompt: failedTest.prompt, reasoning: 'Fallback - optimization failed', score: 0 }],
166
+ rawResponse: ''
167
+ };
279
168
  }
280
169
  }
281
170
  exports.PromptOptimizer = PromptOptimizer;
@@ -6,4 +6,6 @@ export declare class TestRunner {
6
6
  private initializeProviders;
7
7
  runTests(testCases: TestCase[]): Promise<TestRun>;
8
8
  private runSingleTest;
9
+ private scoreResult;
10
+ private runSemanticScoring;
9
11
  }
@@ -2,12 +2,11 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.TestRunner = void 0;
4
4
  const uuid_1 = require("uuid");
5
- const openai_1 = require("../providers/openai");
6
- const anthropic_1 = require("../providers/anthropic");
7
- const openrouter_1 = require("../providers/openrouter");
5
+ const factory_1 = require("../providers/factory");
8
6
  const exact_match_1 = require("../scoring/exact-match");
9
7
  const json_validator_1 = require("../scoring/json-validator");
10
8
  const semantic_1 = require("../scoring/semantic");
9
+ const interpolation_1 = require("../utils/interpolation");
11
10
  class TestRunner {
12
11
  config;
13
12
  providers = new Map();
@@ -16,16 +15,12 @@ class TestRunner {
16
15
  this.initializeProviders();
17
16
  }
18
17
  initializeProviders() {
19
- if (this.config.providers.openai) {
20
- const provider = new openai_1.OpenAIProvider(this.config.providers.openai);
21
- this.providers.set("openai", provider);
22
- }
23
- if (this.config.providers.anthropic) {
24
- this.providers.set("anthropic", new anthropic_1.AnthropicProvider(this.config.providers.anthropic));
25
- }
26
- if (this.config.providers.openrouter) {
27
- const provider = new openrouter_1.OpenRouterProvider(this.config.providers.openrouter);
28
- this.providers.set("openrouter", provider);
18
+ const providerNames = ["openai", "anthropic", "openrouter", "gemini"];
19
+ for (const name of providerNames) {
20
+ const providerConfig = this.config.providers[name];
21
+ if (providerConfig && providerConfig.apiKey) {
22
+ this.providers.set(name, factory_1.ProviderFactory.create(name, providerConfig));
23
+ }
29
24
  }
30
25
  }
31
26
  async runTests(testCases) {
@@ -52,14 +47,10 @@ class TestRunner {
52
47
  async runSingleTest(testCase) {
53
48
  const testId = (0, uuid_1.v4)();
54
49
  const startTime = Date.now();
55
- // Define fallback order: Primary -> Fallbacks
56
- const fallbackChain = ["openai", "anthropic", "openrouter"];
57
- // Determine starting provider
50
+ const fallbackChain = ["openai", "anthropic", "gemini", "openrouter"];
58
51
  const initialProvider = testCase.config?.provider || "openai";
59
- // Build the sequence of providers to try
60
52
  let providersToTry;
61
53
  if (testCase.config?.provider) {
62
- // If provider is explicitly set, only try that one
63
54
  providersToTry = [testCase.config.provider];
64
55
  }
65
56
  else {
@@ -75,56 +66,13 @@ class TestRunner {
75
66
  if (!provider)
76
67
  continue;
77
68
  try {
78
- // Execute prompt
79
- const response = await provider.complete(testCase.prompt);
80
- // Score result
81
- const scoringMethod = testCase.config?.method || "semantic";
69
+ // Interpolate variables if present
70
+ const finalPrompt = typeof testCase.prompt === 'string'
71
+ ? (0, interpolation_1.interpolateVariables)(testCase.prompt, testCase.variables)
72
+ : testCase.prompt;
73
+ const response = await provider.complete(finalPrompt);
74
+ const { score, error: scoringError } = await this.scoreResult(testCase, response.content, providerName);
82
75
  const threshold = testCase.config?.threshold || this.config.threshold || 0.8;
83
- let score;
84
- let error;
85
- if (scoringMethod === "exact") {
86
- score = (0, exact_match_1.exactMatch)(String(testCase.expect), response.content);
87
- }
88
- else if (scoringMethod === "json") {
89
- const result = (0, json_validator_1.validateJSON)(testCase.expect, response.content);
90
- score = result.score;
91
- error = result.error;
92
- }
93
- else if (scoringMethod === "semantic") {
94
- let calculatedScore;
95
- let lastScoringError;
96
- // potential embedding providers
97
- const embeddingCapable = ["openai", "openrouter"];
98
- // Order: Current provider (if capable) -> OpenAI -> OpenRouter -> others
99
- const scoringProvidersToTry = [
100
- ...(embeddingCapable.includes(providerName) ? [providerName] : []),
101
- ...embeddingCapable.filter((p) => p !== providerName),
102
- ].filter((p) => this.providers.has(p));
103
- if (scoringProvidersToTry.length === 0) {
104
- throw new Error("No embedding-capable providers available for semantic scoring");
105
- }
106
- for (const scoreProviderName of scoringProvidersToTry) {
107
- try {
108
- const scoreProvider = this.providers.get(scoreProviderName);
109
- if (!scoreProvider)
110
- continue;
111
- const scorer = new semantic_1.SemanticScorer(scoreProvider);
112
- calculatedScore = await scorer.score(String(testCase.expect), response.content);
113
- break;
114
- }
115
- catch (err) {
116
- lastScoringError = err;
117
- continue;
118
- }
119
- }
120
- if (calculatedScore === undefined) {
121
- throw new Error(`Semantic scoring failed. Last error: ${lastScoringError?.message || "Unknown error"}`);
122
- }
123
- score = calculatedScore;
124
- }
125
- else {
126
- throw new Error(`Unknown scoring method: ${scoringMethod}`);
127
- }
128
76
  const status = score >= threshold ? "pass" : "fail";
129
77
  const duration = Date.now() - startTime;
130
78
  return {
@@ -134,7 +82,7 @@ class TestRunner {
134
82
  score,
135
83
  actualOutput: response.content,
136
84
  expectedOutput: String(testCase.expect),
137
- error,
85
+ error: scoringError,
138
86
  metadata: {
139
87
  duration,
140
88
  timestamp: new Date(),
@@ -150,7 +98,6 @@ class TestRunner {
150
98
  continue;
151
99
  }
152
100
  }
153
- // If all attempts failed
154
101
  return {
155
102
  id: testId,
156
103
  testCase,
@@ -165,5 +112,45 @@ class TestRunner {
165
112
  },
166
113
  };
167
114
  }
115
+ async scoreResult(testCase, actualOutput, providerName) {
116
+ const scoringMethod = testCase.config?.method || "semantic";
117
+ if (scoringMethod === "exact") {
118
+ return { score: (0, exact_match_1.exactMatch)(String(testCase.expect), actualOutput) };
119
+ }
120
+ if (scoringMethod === "json") {
121
+ const result = (0, json_validator_1.validateJSON)(testCase.expect, actualOutput);
122
+ return { score: result.score, error: result.error };
123
+ }
124
+ if (scoringMethod === "semantic") {
125
+ return this.runSemanticScoring(testCase, actualOutput, providerName);
126
+ }
127
+ throw new Error(`Unknown scoring method: ${scoringMethod}`);
128
+ }
129
+ async runSemanticScoring(testCase, actualOutput, currentProviderName) {
130
+ const embeddingCapable = ["openai", "openrouter"];
131
+ const scoringProvidersToTry = [
132
+ ...(embeddingCapable.includes(currentProviderName) ? [currentProviderName] : []),
133
+ ...embeddingCapable.filter((p) => p !== currentProviderName),
134
+ ].filter((p) => this.providers.has(p));
135
+ if (scoringProvidersToTry.length === 0) {
136
+ throw new Error("No embedding-capable providers available for semantic scoring");
137
+ }
138
+ let lastScoringError;
139
+ for (const scoreProviderName of scoringProvidersToTry) {
140
+ try {
141
+ const scoreProvider = this.providers.get(scoreProviderName);
142
+ if (!scoreProvider)
143
+ continue;
144
+ const scorer = new semantic_1.SemanticScorer(scoreProvider);
145
+ const score = await scorer.score(String(testCase.expect), actualOutput);
146
+ return { score };
147
+ }
148
+ catch (err) {
149
+ lastScoringError = err;
150
+ continue;
151
+ }
152
+ }
153
+ throw new Error(`Semantic scoring failed. Last error: ${lastScoringError?.message || "Unknown error"}`);
154
+ }
168
155
  }
169
156
  exports.TestRunner = TestRunner;
@@ -3,9 +3,24 @@ export interface ShadowTestResult {
3
3
  score: number;
4
4
  output: string;
5
5
  passed: boolean;
6
+ failureReason?: string;
7
+ }
8
+ export interface ShadowSuiteResult {
9
+ aggregateScore: number;
10
+ results: {
11
+ testId: string;
12
+ score: number;
13
+ passed: boolean;
14
+ output: string;
15
+ failureReason?: string;
16
+ }[];
6
17
  }
7
18
  /**
8
19
  * Test a candidate prompt against the original test case
9
- * Tries providers in sequence until one succeeds
20
+ * Uses specified provider/model or falls back to priority sequence
21
+ */
22
+ export declare function runShadowTest(candidatePrompt: string, test: FailedTest): Promise<ShadowTestResult>;
23
+ /**
24
+ * Run a candidate prompt against multiple tests and return aggregate results
10
25
  */
11
- export declare function runShadowTest(candidatePrompt: string, originalTest: FailedTest): Promise<ShadowTestResult>;
26
+ export declare function runSuiteShadowTest(candidatePrompt: string, tests: FailedTest[]): Promise<ShadowSuiteResult>;