tuneprompt 1.0.7 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -9
- package/dist/cli.js +5 -2
- package/dist/commands/fix.d.ts +3 -1
- package/dist/commands/fix.js +45 -25
- package/dist/commands/generate.d.ts +2 -0
- package/dist/commands/generate.js +11 -0
- package/dist/engine/__tests__/optimizer.test.d.ts +1 -0
- package/dist/engine/__tests__/optimizer.test.js +9 -0
- package/dist/engine/loader.js +6 -2
- package/dist/engine/metaPrompt.d.ts +5 -0
- package/dist/engine/metaPrompt.js +55 -55
- package/dist/engine/optimizer.d.ts +7 -21
- package/dist/engine/optimizer.js +141 -252
- package/dist/engine/runner.d.ts +2 -0
- package/dist/engine/runner.js +56 -69
- package/dist/engine/shadowTester.d.ts +17 -2
- package/dist/engine/shadowTester.js +86 -128
- package/dist/providers/__tests__/custom.test.d.ts +1 -0
- package/dist/providers/__tests__/custom.test.js +9 -0
- package/dist/providers/custom.d.ts +6 -0
- package/dist/providers/custom.js +10 -0
- package/dist/providers/factory.d.ts +6 -0
- package/dist/providers/factory.js +38 -0
- package/dist/providers/gemini.d.ts +11 -0
- package/dist/providers/gemini.js +46 -0
- package/dist/scoring/__tests__/rag.test.d.ts +1 -0
- package/dist/scoring/__tests__/rag.test.js +10 -0
- package/dist/scoring/rag.d.ts +9 -0
- package/dist/scoring/rag.js +9 -0
- package/dist/services/cloud.service.js +1 -1
- package/dist/storage/database.js +1 -1
- package/dist/types/fix.d.ts +11 -0
- package/dist/types/index.d.ts +2 -1
- package/dist/types/test.d.ts +8 -0
- package/dist/types/test.js +2 -0
- package/dist/utils/config.js +11 -5
- package/dist/utils/interpolation.d.ts +4 -0
- package/dist/utils/interpolation.js +16 -0
- package/dist/utils/storage.d.ts +4 -0
- package/dist/utils/storage.js +26 -5
- package/dist/utils/validator.d.ts +2 -0
- package/dist/utils/validator.js +10 -0
- package/package.json +3 -2
package/dist/engine/optimizer.js
CHANGED
|
@@ -1,118 +1,107 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
36
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
37
|
-
};
|
|
38
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
3
|
exports.PromptOptimizer = void 0;
|
|
40
|
-
const sdk_1 = __importDefault(require("@anthropic-ai/sdk"));
|
|
41
|
-
const openai_1 = __importDefault(require("openai"));
|
|
42
4
|
const metaPrompt_1 = require("./metaPrompt");
|
|
43
5
|
const constraintExtractor_1 = require("./constraintExtractor");
|
|
6
|
+
const shadowTester_1 = require("./shadowTester");
|
|
7
|
+
const factory_1 = require("../providers/factory");
|
|
44
8
|
class PromptOptimizer {
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
constructor() {
|
|
49
|
-
const anthropicKey = process.env.ANTHROPIC_API_KEY;
|
|
50
|
-
if (anthropicKey &&
|
|
51
|
-
!anthropicKey.includes('your_key') &&
|
|
52
|
-
!anthropicKey.startsWith('api_key') &&
|
|
53
|
-
anthropicKey !== 'phc_xxxxx') {
|
|
54
|
-
this.anthropic = new sdk_1.default({
|
|
55
|
-
apiKey: anthropicKey
|
|
56
|
-
});
|
|
57
|
-
}
|
|
58
|
-
const openaiKey = process.env.OPENAI_API_KEY;
|
|
59
|
-
if (openaiKey && !openaiKey.includes('your_key')) {
|
|
60
|
-
this.openai = new openai_1.default({
|
|
61
|
-
apiKey: openaiKey
|
|
62
|
-
});
|
|
63
|
-
}
|
|
64
|
-
const openrouterKey = process.env.OPENROUTER_API_KEY;
|
|
65
|
-
if (openrouterKey && !openrouterKey.includes('your_key')) {
|
|
66
|
-
this.openrouter = new openai_1.default({
|
|
67
|
-
baseURL: 'https://openrouter.ai/api/v1',
|
|
68
|
-
apiKey: openrouterKey,
|
|
69
|
-
defaultHeaders: {
|
|
70
|
-
'HTTP-Referer': 'https://tuneprompt.xyz',
|
|
71
|
-
'X-Title': 'TunePrompt CLI',
|
|
72
|
-
},
|
|
73
|
-
});
|
|
74
|
-
}
|
|
9
|
+
maxIterations;
|
|
10
|
+
constructor(options = {}) {
|
|
11
|
+
this.maxIterations = options.maxIterations || 3;
|
|
75
12
|
}
|
|
76
13
|
/**
|
|
77
|
-
* Main optimization method
|
|
14
|
+
* Main optimization method with Anti-Regression and Iterative Refinement
|
|
78
15
|
*/
|
|
79
|
-
async optimize(failedTest) {
|
|
16
|
+
async optimize(failedTest, suite) {
|
|
80
17
|
console.log(`\n🧠 Analyzing failure: "${failedTest.description}"`);
|
|
81
|
-
|
|
18
|
+
console.log(`📈 Full test suite size: ${suite.length}`);
|
|
19
|
+
const initialAggregateScore = suite.reduce((sum, t) => sum + t.score, 0) / suite.length;
|
|
20
|
+
console.log(`📊 Current aggregate score: ${initialAggregateScore.toFixed(2)}`);
|
|
82
21
|
const errorContext = (0, constraintExtractor_1.generateErrorContext)(failedTest);
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
22
|
+
const passingExamples = suite
|
|
23
|
+
.filter(t => t.score >= t.threshold)
|
|
24
|
+
.slice(0, 3)
|
|
25
|
+
.map(t => ({ input: t.input, output: t.expectedOutput }));
|
|
26
|
+
let iterations = 0;
|
|
27
|
+
let lastFailureReason = undefined;
|
|
28
|
+
let bestResult = null;
|
|
29
|
+
let bestAggregateScore = initialAggregateScore;
|
|
30
|
+
let conversation = [];
|
|
31
|
+
while (iterations < this.maxIterations) {
|
|
32
|
+
iterations++;
|
|
33
|
+
console.log(`🚀 Optimization Attempt #${iterations}...`);
|
|
34
|
+
if (iterations === 1) {
|
|
35
|
+
const input = {
|
|
36
|
+
originalPrompt: failedTest.prompt,
|
|
37
|
+
testInput: failedTest.input,
|
|
38
|
+
expectedOutput: failedTest.expectedOutput,
|
|
39
|
+
actualOutput: failedTest.actualOutput,
|
|
40
|
+
errorType: failedTest.errorType,
|
|
41
|
+
errorMessage: errorContext,
|
|
42
|
+
passingExamples,
|
|
43
|
+
};
|
|
44
|
+
conversation.push({ role: 'user', content: this.getMetaPrompt(input) });
|
|
100
45
|
}
|
|
101
|
-
|
|
46
|
+
else {
|
|
47
|
+
conversation.push({ role: 'user', content: lastFailureReason || 'Please try again.' });
|
|
48
|
+
}
|
|
49
|
+
const generationResult = await this.generateCandidates(conversation, failedTest);
|
|
50
|
+
const candidates = generationResult.candidates;
|
|
51
|
+
if (generationResult.rawResponse) {
|
|
52
|
+
conversation.push({ role: 'assistant', content: generationResult.rawResponse });
|
|
53
|
+
}
|
|
54
|
+
for (const candidate of candidates) {
|
|
55
|
+
try {
|
|
56
|
+
console.log(`🧪 Testing candidate...`);
|
|
57
|
+
const primaryResult = await (0, shadowTester_1.runShadowTest)(candidate.prompt, failedTest);
|
|
58
|
+
if (primaryResult.score < failedTest.threshold) {
|
|
59
|
+
console.log(` ❌ Candidate failed to resolve primary error (score: ${primaryResult.score.toFixed(2)})`);
|
|
60
|
+
const specificReason = primaryResult.failureReason || `the output was: "${primaryResult.output.substring(0, 100)}..."`;
|
|
61
|
+
lastFailureReason = `Candidate failed. Reason: ${specificReason}. Previous reasoning: ${candidate.reasoning}`;
|
|
62
|
+
continue;
|
|
63
|
+
}
|
|
64
|
+
console.log(` ✅ Resolved primary error. Running anti-regression...`);
|
|
65
|
+
const suiteResult = await (0, shadowTester_1.runSuiteShadowTest)(candidate.prompt, suite);
|
|
66
|
+
console.log(` 📊 Suite aggregate score: ${suiteResult.aggregateScore.toFixed(2)}`);
|
|
67
|
+
if (suiteResult.aggregateScore > bestAggregateScore) {
|
|
68
|
+
bestAggregateScore = suiteResult.aggregateScore;
|
|
69
|
+
bestResult = {
|
|
70
|
+
originalPrompt: failedTest.prompt,
|
|
71
|
+
optimizedPrompt: candidate.prompt,
|
|
72
|
+
reasoning: candidate.reasoning,
|
|
73
|
+
confidence: suiteResult.aggregateScore,
|
|
74
|
+
testResults: {
|
|
75
|
+
score: primaryResult.score,
|
|
76
|
+
passed: true,
|
|
77
|
+
output: primaryResult.output,
|
|
78
|
+
aggregateScore: suiteResult.aggregateScore
|
|
79
|
+
},
|
|
80
|
+
iterations
|
|
81
|
+
};
|
|
82
|
+
}
|
|
83
|
+
else if (suiteResult.aggregateScore <= bestAggregateScore) {
|
|
84
|
+
console.log(` 📉 Candidate regression: aggregate score dropped (Current: ${bestAggregateScore.toFixed(2)} VS New: ${suiteResult.aggregateScore.toFixed(2)})`);
|
|
85
|
+
const regressions = suiteResult.results.filter(r => !r.passed).map(r => r.failureReason).filter(Boolean);
|
|
86
|
+
const regressionText = regressions.length > 0 ? ` Required features broke: ${regressions.slice(0, 2).join('; ')}.` : '';
|
|
87
|
+
lastFailureReason = `The fix resolved the failure but introduced regressions in other cases.${regressionText} Maintain all successful patterns while fixing the failure.`;
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
catch (error) {
|
|
91
|
+
console.error(` ⚠️ Validation error for candidate: ${error.message}`);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
if (bestResult)
|
|
95
|
+
break;
|
|
96
|
+
console.log(`♻️ No candidate was net-positive. Retrying with refinement feedback...`);
|
|
97
|
+
}
|
|
98
|
+
if (!bestResult) {
|
|
99
|
+
throw new Error(`All fix attempts failed to resolve the regression or improve the aggregate score after ${this.maxIterations} iterations.`);
|
|
100
|
+
}
|
|
101
|
+
return bestResult;
|
|
102
102
|
}
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
*/
|
|
106
|
-
selectMetaPrompt(test, errorContext) {
|
|
107
|
-
const input = {
|
|
108
|
-
originalPrompt: test.prompt,
|
|
109
|
-
testInput: test.input,
|
|
110
|
-
expectedOutput: test.expectedOutput,
|
|
111
|
-
actualOutput: test.actualOutput,
|
|
112
|
-
errorType: test.errorType,
|
|
113
|
-
errorMessage: errorContext
|
|
114
|
-
};
|
|
115
|
-
switch (test.errorType) {
|
|
103
|
+
getMetaPrompt(input) {
|
|
104
|
+
switch (input.errorType) {
|
|
116
105
|
case 'json':
|
|
117
106
|
return (0, metaPrompt_1.generateJSONFixPrompt)(input);
|
|
118
107
|
case 'semantic':
|
|
@@ -121,161 +110,61 @@ class PromptOptimizer {
|
|
|
121
110
|
return (0, metaPrompt_1.generateOptimizationPrompt)(input);
|
|
122
111
|
}
|
|
123
112
|
}
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
113
|
+
async generateCandidates(messages, failedTest) {
|
|
114
|
+
if (process.env.TUNEPROMPT_MOCK_OPTIMIZER === 'true') {
|
|
115
|
+
return {
|
|
116
|
+
candidates: [
|
|
117
|
+
{ prompt: 'Optimized candidate A', reasoning: 'Mock reasoning A', score: 0 },
|
|
118
|
+
{ prompt: 'Optimized candidate B', reasoning: 'Mock reasoning B', score: 0 }
|
|
119
|
+
],
|
|
120
|
+
rawResponse: '{"candidateA": {"prompt": "Optimized candidate A", "reasoning": "Mock reasoning A"}, "candidateB": {"prompt": "Optimized candidate B", "reasoning": "Mock reasoning B"}}'
|
|
121
|
+
};
|
|
122
|
+
}
|
|
123
|
+
const providerPool = ['anthropic', 'openai', 'gemini', 'openrouter'];
|
|
124
|
+
const systemPrompt = "You are a prompt optimizer. Output exclusively JSON. You suggest a candidateA and candidateB. You MUST format output as: {\"candidateA\": {\"prompt\": \"...\", \"reasoning\": \"...\"}, \"candidateB\": {\"prompt\": \"...\", \"reasoning\": \"...\"}}";
|
|
125
|
+
for (const providerName of providerPool) {
|
|
131
126
|
try {
|
|
132
|
-
|
|
133
|
-
if (
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
}
|
|
163
|
-
else if (provider === 'openai' && this.openai) {
|
|
164
|
-
console.log(`⚡ Using OpenAI for candidate generation...`);
|
|
165
|
-
const response = await this.openai.chat.completions.create({
|
|
166
|
-
model: 'gpt-4o',
|
|
167
|
-
messages: [{
|
|
168
|
-
role: 'user',
|
|
169
|
-
content: metaPrompt
|
|
170
|
-
}],
|
|
171
|
-
response_format: { type: 'json_object' }
|
|
172
|
-
});
|
|
173
|
-
const content = response.choices[0]?.message?.content;
|
|
174
|
-
if (!content) {
|
|
175
|
-
throw new Error('No content returned from OpenAI');
|
|
176
|
-
}
|
|
177
|
-
// Parse the JSON response
|
|
178
|
-
const parsed = JSON.parse(content);
|
|
179
|
-
return [
|
|
180
|
-
{
|
|
181
|
-
prompt: parsed.candidateA.prompt,
|
|
182
|
-
reasoning: parsed.candidateA.reasoning,
|
|
183
|
-
score: 0 // Will be filled by shadow testing
|
|
184
|
-
},
|
|
185
|
-
{
|
|
186
|
-
prompt: parsed.candidateB.prompt,
|
|
187
|
-
reasoning: parsed.candidateB.reasoning,
|
|
188
|
-
score: 0
|
|
189
|
-
}
|
|
190
|
-
];
|
|
191
|
-
}
|
|
192
|
-
else if (provider === 'openrouter' && this.openrouter) {
|
|
193
|
-
console.log(`⚡ Using OpenRouter for candidate generation...`);
|
|
194
|
-
const response = await this.openrouter.chat.completions.create({
|
|
195
|
-
model: 'anthropic/claude-3-sonnet', // Default robust model on OpenRouter
|
|
196
|
-
messages: [{
|
|
197
|
-
role: 'user',
|
|
198
|
-
content: metaPrompt
|
|
199
|
-
}],
|
|
200
|
-
response_format: { type: 'json_object' }
|
|
201
|
-
});
|
|
202
|
-
const content = response.choices[0]?.message?.content;
|
|
203
|
-
if (!content) {
|
|
204
|
-
// Fallback if model doesn't support JSON mode or returns empty
|
|
205
|
-
throw new Error('No content returned from OpenRouter');
|
|
206
|
-
}
|
|
207
|
-
const parsed = JSON.parse(content);
|
|
208
|
-
return [
|
|
209
|
-
{
|
|
210
|
-
prompt: parsed.candidateA.prompt,
|
|
211
|
-
reasoning: parsed.candidateA.reasoning,
|
|
212
|
-
score: 0
|
|
213
|
-
},
|
|
214
|
-
{
|
|
215
|
-
prompt: parsed.candidateB.prompt,
|
|
216
|
-
reasoning: parsed.candidateB.reasoning,
|
|
217
|
-
score: 0
|
|
218
|
-
}
|
|
219
|
-
];
|
|
220
|
-
}
|
|
127
|
+
const apiKey = factory_1.ProviderFactory.getApiKey(providerName);
|
|
128
|
+
if (!apiKey)
|
|
129
|
+
continue;
|
|
130
|
+
// Pick a strong model for optimization if not defined
|
|
131
|
+
const model = providerName === 'anthropic' ? 'claude-3-5-sonnet-latest' :
|
|
132
|
+
providerName === 'openai' ? 'gpt-4o' :
|
|
133
|
+
providerName === 'gemini' ? 'gemini-2.0-flash' : undefined;
|
|
134
|
+
if (!model)
|
|
135
|
+
continue;
|
|
136
|
+
const provider = factory_1.ProviderFactory.create(providerName, {
|
|
137
|
+
apiKey,
|
|
138
|
+
model,
|
|
139
|
+
maxTokens: 4000
|
|
140
|
+
});
|
|
141
|
+
// Convert conversation to a format the provider understands
|
|
142
|
+
const userContent = messages.map(m => `${m.role.toUpperCase()}: ${m.content}`).join('\n\n');
|
|
143
|
+
const response = await provider.complete({
|
|
144
|
+
system: systemPrompt,
|
|
145
|
+
user: userContent
|
|
146
|
+
});
|
|
147
|
+
const content = response.content;
|
|
148
|
+
if (!content)
|
|
149
|
+
throw new Error('No content returned');
|
|
150
|
+
const parsed = JSON.parse(content);
|
|
151
|
+
return {
|
|
152
|
+
candidates: [
|
|
153
|
+
{ prompt: parsed.candidateA.prompt, reasoning: parsed.candidateA.reasoning, score: 0 },
|
|
154
|
+
{ prompt: parsed.candidateB.prompt, reasoning: parsed.candidateB.reasoning, score: 0 }
|
|
155
|
+
],
|
|
156
|
+
rawResponse: content
|
|
157
|
+
};
|
|
221
158
|
}
|
|
222
159
|
catch (error) {
|
|
223
|
-
console.log(`⚠️
|
|
224
|
-
continue;
|
|
160
|
+
console.log(`⚠️ Candidate generation failed for ${providerName}: ${error.message}`);
|
|
161
|
+
continue;
|
|
225
162
|
}
|
|
226
163
|
}
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
reasoning: 'Generated using fallback method',
|
|
232
|
-
score: 0
|
|
233
|
-
}];
|
|
234
|
-
}
|
|
235
|
-
/**
|
|
236
|
-
* Shadow test each candidate and return the best one
|
|
237
|
-
*/
|
|
238
|
-
async selectBestCandidate(candidates, originalTest) {
|
|
239
|
-
const { runShadowTest } = await Promise.resolve().then(() => __importStar(require('./shadowTester')));
|
|
240
|
-
const testedCandidates = await Promise.all(candidates.map(async (candidate) => {
|
|
241
|
-
const result = await runShadowTest(candidate.prompt, originalTest);
|
|
242
|
-
return {
|
|
243
|
-
...candidate,
|
|
244
|
-
score: result.score
|
|
245
|
-
};
|
|
246
|
-
}));
|
|
247
|
-
// Sort by score (highest first)
|
|
248
|
-
testedCandidates.sort((a, b) => b.score - a.score);
|
|
249
|
-
return testedCandidates[0];
|
|
250
|
-
}
|
|
251
|
-
/**
|
|
252
|
-
* Fallback prompt improvement - generates a clean rewritten prompt
|
|
253
|
-
*/
|
|
254
|
-
createFallbackPrompt(test) {
|
|
255
|
-
// Extract the core intent from the original prompt
|
|
256
|
-
// Remove any existing "fix" instructions we might have added previously
|
|
257
|
-
let corePrompt = test.prompt
|
|
258
|
-
.replace(/\n\nYour response must match this exactly: "[\s\S]*?$/g, '')
|
|
259
|
-
.replace(/\n\nIMPORTANT: You must respond with valid JSON only[\s\S]*?$/g, '')
|
|
260
|
-
.replace(/\n\nBe concise and match the expected output format exactly[\s\S]*?$/g, '')
|
|
261
|
-
.trim();
|
|
262
|
-
// For JSON errors, create a structured prompt
|
|
263
|
-
if (test.errorType === 'json') {
|
|
264
|
-
return `${corePrompt}
|
|
265
|
-
|
|
266
|
-
IMPORTANT: You must respond with valid JSON only. No explanations, no markdown, just the raw JSON object.`;
|
|
267
|
-
}
|
|
268
|
-
// For semantic errors, be more specific about expected output
|
|
269
|
-
if (test.errorType === 'semantic') {
|
|
270
|
-
return `${corePrompt}
|
|
271
|
-
|
|
272
|
-
Your response must match this exactly: "${test.expectedOutput}"
|
|
273
|
-
Do not add any extra text, greetings, or explanations. Output only what is requested.`;
|
|
274
|
-
}
|
|
275
|
-
// Default: add clarity
|
|
276
|
-
return `${corePrompt}
|
|
277
|
-
|
|
278
|
-
Be concise and match the expected output format exactly.`;
|
|
164
|
+
return {
|
|
165
|
+
candidates: [{ prompt: failedTest.prompt, reasoning: 'Fallback - optimization failed', score: 0 }],
|
|
166
|
+
rawResponse: ''
|
|
167
|
+
};
|
|
279
168
|
}
|
|
280
169
|
}
|
|
281
170
|
exports.PromptOptimizer = PromptOptimizer;
|
package/dist/engine/runner.d.ts
CHANGED
package/dist/engine/runner.js
CHANGED
|
@@ -2,12 +2,11 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.TestRunner = void 0;
|
|
4
4
|
const uuid_1 = require("uuid");
|
|
5
|
-
const
|
|
6
|
-
const anthropic_1 = require("../providers/anthropic");
|
|
7
|
-
const openrouter_1 = require("../providers/openrouter");
|
|
5
|
+
const factory_1 = require("../providers/factory");
|
|
8
6
|
const exact_match_1 = require("../scoring/exact-match");
|
|
9
7
|
const json_validator_1 = require("../scoring/json-validator");
|
|
10
8
|
const semantic_1 = require("../scoring/semantic");
|
|
9
|
+
const interpolation_1 = require("../utils/interpolation");
|
|
11
10
|
class TestRunner {
|
|
12
11
|
config;
|
|
13
12
|
providers = new Map();
|
|
@@ -16,16 +15,12 @@ class TestRunner {
|
|
|
16
15
|
this.initializeProviders();
|
|
17
16
|
}
|
|
18
17
|
initializeProviders() {
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
this.providers
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
}
|
|
26
|
-
if (this.config.providers.openrouter) {
|
|
27
|
-
const provider = new openrouter_1.OpenRouterProvider(this.config.providers.openrouter);
|
|
28
|
-
this.providers.set("openrouter", provider);
|
|
18
|
+
const providerNames = ["openai", "anthropic", "openrouter", "gemini"];
|
|
19
|
+
for (const name of providerNames) {
|
|
20
|
+
const providerConfig = this.config.providers[name];
|
|
21
|
+
if (providerConfig && providerConfig.apiKey) {
|
|
22
|
+
this.providers.set(name, factory_1.ProviderFactory.create(name, providerConfig));
|
|
23
|
+
}
|
|
29
24
|
}
|
|
30
25
|
}
|
|
31
26
|
async runTests(testCases) {
|
|
@@ -52,14 +47,10 @@ class TestRunner {
|
|
|
52
47
|
async runSingleTest(testCase) {
|
|
53
48
|
const testId = (0, uuid_1.v4)();
|
|
54
49
|
const startTime = Date.now();
|
|
55
|
-
|
|
56
|
-
const fallbackChain = ["openai", "anthropic", "openrouter"];
|
|
57
|
-
// Determine starting provider
|
|
50
|
+
const fallbackChain = ["openai", "anthropic", "gemini", "openrouter"];
|
|
58
51
|
const initialProvider = testCase.config?.provider || "openai";
|
|
59
|
-
// Build the sequence of providers to try
|
|
60
52
|
let providersToTry;
|
|
61
53
|
if (testCase.config?.provider) {
|
|
62
|
-
// If provider is explicitly set, only try that one
|
|
63
54
|
providersToTry = [testCase.config.provider];
|
|
64
55
|
}
|
|
65
56
|
else {
|
|
@@ -75,56 +66,13 @@ class TestRunner {
|
|
|
75
66
|
if (!provider)
|
|
76
67
|
continue;
|
|
77
68
|
try {
|
|
78
|
-
//
|
|
79
|
-
const
|
|
80
|
-
|
|
81
|
-
|
|
69
|
+
// Interpolate variables if present
|
|
70
|
+
const finalPrompt = typeof testCase.prompt === 'string'
|
|
71
|
+
? (0, interpolation_1.interpolateVariables)(testCase.prompt, testCase.variables)
|
|
72
|
+
: testCase.prompt;
|
|
73
|
+
const response = await provider.complete(finalPrompt);
|
|
74
|
+
const { score, error: scoringError } = await this.scoreResult(testCase, response.content, providerName);
|
|
82
75
|
const threshold = testCase.config?.threshold || this.config.threshold || 0.8;
|
|
83
|
-
let score;
|
|
84
|
-
let error;
|
|
85
|
-
if (scoringMethod === "exact") {
|
|
86
|
-
score = (0, exact_match_1.exactMatch)(String(testCase.expect), response.content);
|
|
87
|
-
}
|
|
88
|
-
else if (scoringMethod === "json") {
|
|
89
|
-
const result = (0, json_validator_1.validateJSON)(testCase.expect, response.content);
|
|
90
|
-
score = result.score;
|
|
91
|
-
error = result.error;
|
|
92
|
-
}
|
|
93
|
-
else if (scoringMethod === "semantic") {
|
|
94
|
-
let calculatedScore;
|
|
95
|
-
let lastScoringError;
|
|
96
|
-
// potential embedding providers
|
|
97
|
-
const embeddingCapable = ["openai", "openrouter"];
|
|
98
|
-
// Order: Current provider (if capable) -> OpenAI -> OpenRouter -> others
|
|
99
|
-
const scoringProvidersToTry = [
|
|
100
|
-
...(embeddingCapable.includes(providerName) ? [providerName] : []),
|
|
101
|
-
...embeddingCapable.filter((p) => p !== providerName),
|
|
102
|
-
].filter((p) => this.providers.has(p));
|
|
103
|
-
if (scoringProvidersToTry.length === 0) {
|
|
104
|
-
throw new Error("No embedding-capable providers available for semantic scoring");
|
|
105
|
-
}
|
|
106
|
-
for (const scoreProviderName of scoringProvidersToTry) {
|
|
107
|
-
try {
|
|
108
|
-
const scoreProvider = this.providers.get(scoreProviderName);
|
|
109
|
-
if (!scoreProvider)
|
|
110
|
-
continue;
|
|
111
|
-
const scorer = new semantic_1.SemanticScorer(scoreProvider);
|
|
112
|
-
calculatedScore = await scorer.score(String(testCase.expect), response.content);
|
|
113
|
-
break;
|
|
114
|
-
}
|
|
115
|
-
catch (err) {
|
|
116
|
-
lastScoringError = err;
|
|
117
|
-
continue;
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
if (calculatedScore === undefined) {
|
|
121
|
-
throw new Error(`Semantic scoring failed. Last error: ${lastScoringError?.message || "Unknown error"}`);
|
|
122
|
-
}
|
|
123
|
-
score = calculatedScore;
|
|
124
|
-
}
|
|
125
|
-
else {
|
|
126
|
-
throw new Error(`Unknown scoring method: ${scoringMethod}`);
|
|
127
|
-
}
|
|
128
76
|
const status = score >= threshold ? "pass" : "fail";
|
|
129
77
|
const duration = Date.now() - startTime;
|
|
130
78
|
return {
|
|
@@ -134,7 +82,7 @@ class TestRunner {
|
|
|
134
82
|
score,
|
|
135
83
|
actualOutput: response.content,
|
|
136
84
|
expectedOutput: String(testCase.expect),
|
|
137
|
-
error,
|
|
85
|
+
error: scoringError,
|
|
138
86
|
metadata: {
|
|
139
87
|
duration,
|
|
140
88
|
timestamp: new Date(),
|
|
@@ -150,7 +98,6 @@ class TestRunner {
|
|
|
150
98
|
continue;
|
|
151
99
|
}
|
|
152
100
|
}
|
|
153
|
-
// If all attempts failed
|
|
154
101
|
return {
|
|
155
102
|
id: testId,
|
|
156
103
|
testCase,
|
|
@@ -165,5 +112,45 @@ class TestRunner {
|
|
|
165
112
|
},
|
|
166
113
|
};
|
|
167
114
|
}
|
|
115
|
+
async scoreResult(testCase, actualOutput, providerName) {
|
|
116
|
+
const scoringMethod = testCase.config?.method || "semantic";
|
|
117
|
+
if (scoringMethod === "exact") {
|
|
118
|
+
return { score: (0, exact_match_1.exactMatch)(String(testCase.expect), actualOutput) };
|
|
119
|
+
}
|
|
120
|
+
if (scoringMethod === "json") {
|
|
121
|
+
const result = (0, json_validator_1.validateJSON)(testCase.expect, actualOutput);
|
|
122
|
+
return { score: result.score, error: result.error };
|
|
123
|
+
}
|
|
124
|
+
if (scoringMethod === "semantic") {
|
|
125
|
+
return this.runSemanticScoring(testCase, actualOutput, providerName);
|
|
126
|
+
}
|
|
127
|
+
throw new Error(`Unknown scoring method: ${scoringMethod}`);
|
|
128
|
+
}
|
|
129
|
+
async runSemanticScoring(testCase, actualOutput, currentProviderName) {
|
|
130
|
+
const embeddingCapable = ["openai", "openrouter"];
|
|
131
|
+
const scoringProvidersToTry = [
|
|
132
|
+
...(embeddingCapable.includes(currentProviderName) ? [currentProviderName] : []),
|
|
133
|
+
...embeddingCapable.filter((p) => p !== currentProviderName),
|
|
134
|
+
].filter((p) => this.providers.has(p));
|
|
135
|
+
if (scoringProvidersToTry.length === 0) {
|
|
136
|
+
throw new Error("No embedding-capable providers available for semantic scoring");
|
|
137
|
+
}
|
|
138
|
+
let lastScoringError;
|
|
139
|
+
for (const scoreProviderName of scoringProvidersToTry) {
|
|
140
|
+
try {
|
|
141
|
+
const scoreProvider = this.providers.get(scoreProviderName);
|
|
142
|
+
if (!scoreProvider)
|
|
143
|
+
continue;
|
|
144
|
+
const scorer = new semantic_1.SemanticScorer(scoreProvider);
|
|
145
|
+
const score = await scorer.score(String(testCase.expect), actualOutput);
|
|
146
|
+
return { score };
|
|
147
|
+
}
|
|
148
|
+
catch (err) {
|
|
149
|
+
lastScoringError = err;
|
|
150
|
+
continue;
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
throw new Error(`Semantic scoring failed. Last error: ${lastScoringError?.message || "Unknown error"}`);
|
|
154
|
+
}
|
|
168
155
|
}
|
|
169
156
|
exports.TestRunner = TestRunner;
|
|
@@ -3,9 +3,24 @@ export interface ShadowTestResult {
|
|
|
3
3
|
score: number;
|
|
4
4
|
output: string;
|
|
5
5
|
passed: boolean;
|
|
6
|
+
failureReason?: string;
|
|
7
|
+
}
|
|
8
|
+
export interface ShadowSuiteResult {
|
|
9
|
+
aggregateScore: number;
|
|
10
|
+
results: {
|
|
11
|
+
testId: string;
|
|
12
|
+
score: number;
|
|
13
|
+
passed: boolean;
|
|
14
|
+
output: string;
|
|
15
|
+
failureReason?: string;
|
|
16
|
+
}[];
|
|
6
17
|
}
|
|
7
18
|
/**
|
|
8
19
|
* Test a candidate prompt against the original test case
|
|
9
|
-
*
|
|
20
|
+
* Uses specified provider/model or falls back to priority sequence
|
|
21
|
+
*/
|
|
22
|
+
export declare function runShadowTest(candidatePrompt: string, test: FailedTest): Promise<ShadowTestResult>;
|
|
23
|
+
/**
|
|
24
|
+
* Run a candidate prompt against multiple tests and return aggregate results
|
|
10
25
|
*/
|
|
11
|
-
export declare function
|
|
26
|
+
export declare function runSuiteShadowTest(candidatePrompt: string, tests: FailedTest[]): Promise<ShadowSuiteResult>;
|