tuneprompt 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +151 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +146 -0
- package/dist/commands/activate.d.ts +1 -0
- package/dist/commands/activate.js +91 -0
- package/dist/commands/fix.d.ts +1 -0
- package/dist/commands/fix.js +187 -0
- package/dist/commands/history.d.ts +5 -0
- package/dist/commands/history.js +63 -0
- package/dist/commands/init.d.ts +1 -0
- package/dist/commands/init.js +96 -0
- package/dist/commands/run.d.ts +9 -0
- package/dist/commands/run.js +216 -0
- package/dist/db/migrate.d.ts +2 -0
- package/dist/db/migrate.js +8 -0
- package/dist/engine/constraintExtractor.d.ts +8 -0
- package/dist/engine/constraintExtractor.js +54 -0
- package/dist/engine/loader.d.ts +5 -0
- package/dist/engine/loader.js +74 -0
- package/dist/engine/metaPrompt.d.ts +11 -0
- package/dist/engine/metaPrompt.js +129 -0
- package/dist/engine/optimizer.d.ts +26 -0
- package/dist/engine/optimizer.js +246 -0
- package/dist/engine/reporter.d.ts +7 -0
- package/dist/engine/reporter.js +58 -0
- package/dist/engine/runner.d.ts +9 -0
- package/dist/engine/runner.js +169 -0
- package/dist/engine/shadowTester.d.ts +11 -0
- package/dist/engine/shadowTester.js +156 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +26 -0
- package/dist/providers/anthropic.d.ts +12 -0
- package/dist/providers/anthropic.js +51 -0
- package/dist/providers/base.d.ts +15 -0
- package/dist/providers/base.js +10 -0
- package/dist/providers/openai.d.ts +12 -0
- package/dist/providers/openai.js +58 -0
- package/dist/providers/openrouter.d.ts +11 -0
- package/dist/providers/openrouter.js +83 -0
- package/dist/scoring/exact-match.d.ts +1 -0
- package/dist/scoring/exact-match.js +8 -0
- package/dist/scoring/json-validator.d.ts +4 -0
- package/dist/scoring/json-validator.js +29 -0
- package/dist/scoring/semantic.d.ts +8 -0
- package/dist/scoring/semantic.js +107 -0
- package/dist/services/cloud.service.d.ts +49 -0
- package/dist/services/cloud.service.js +82 -0
- package/dist/storage/database.d.ts +10 -0
- package/dist/storage/database.js +179 -0
- package/dist/types/fix.d.ts +28 -0
- package/dist/types/fix.js +2 -0
- package/dist/types/index.d.ts +58 -0
- package/dist/types/index.js +2 -0
- package/dist/utils/analytics.d.ts +2 -0
- package/dist/utils/analytics.js +22 -0
- package/dist/utils/config.d.ts +3 -0
- package/dist/utils/config.js +70 -0
- package/dist/utils/errorHandler.d.ts +14 -0
- package/dist/utils/errorHandler.js +40 -0
- package/dist/utils/license.d.ts +40 -0
- package/dist/utils/license.js +207 -0
- package/dist/utils/storage.d.ts +2 -0
- package/dist/utils/storage.js +25 -0
- package/package.json +76 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
36
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
37
|
+
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.PromptOptimizer = void 0;
|
|
40
|
+
const sdk_1 = __importDefault(require("@anthropic-ai/sdk"));
|
|
41
|
+
const openai_1 = __importDefault(require("openai"));
|
|
42
|
+
const metaPrompt_1 = require("./metaPrompt");
|
|
43
|
+
const constraintExtractor_1 = require("./constraintExtractor");
|
|
44
|
+
/**
 * Rewrites failing prompts by generating candidate fixes with an LLM,
 * shadow-testing each candidate, and returning the best one.
 */
class PromptOptimizer {
    // SDK clients; each stays undefined when its API key is missing or
    // looks like an unconfigured placeholder value.
    anthropic;
    openai;
    constructor() {
        const anthropicKey = process.env.ANTHROPIC_API_KEY;
        // Skip placeholder values such as "api_key..." or "phc_xxxxx".
        if (anthropicKey && !anthropicKey.startsWith('api_key') && anthropicKey !== 'phc_xxxxx') {
            this.anthropic = new sdk_1.default({
                apiKey: anthropicKey
            });
        }
        const openaiKey = process.env.OPENAI_API_KEY;
        if (openaiKey && !openaiKey.startsWith('api_key')) {
            this.openai = new openai_1.default({
                apiKey: openaiKey
            });
        }
    }
    /**
     * Main optimization method.
     *
     * @param failedTest - The failing test (prompt, input, expected/actual output,
     *   errorType, threshold) to repair.
     * @returns Fix result with the original prompt, best rewritten prompt,
     *   the model's reasoning, and shadow-test results.
     */
    async optimize(failedTest) {
        console.log(`\n🧠 Analyzing failure: "${failedTest.description}"`);
        // Step 1: Extract constraints and build context
        const errorContext = (0, constraintExtractor_1.generateErrorContext)(failedTest);
        // Step 2: Choose the right meta-prompt based on error type
        const metaPrompt = this.selectMetaPrompt(failedTest, errorContext);
        // Step 3: Generate fix candidates using Claude
        console.log('⚡ Generating optimized prompt candidates...');
        const candidates = await this.generateCandidates(metaPrompt, failedTest);
        // Step 4: Shadow test each candidate
        console.log('🧪 Shadow testing candidates...');
        const bestCandidate = await this.selectBestCandidate(candidates, failedTest);
        return {
            originalPrompt: failedTest.prompt,
            optimizedPrompt: bestCandidate.prompt,
            reasoning: bestCandidate.reasoning,
            confidence: bestCandidate.score,
            testResults: {
                score: bestCandidate.score,
                passed: bestCandidate.score >= failedTest.threshold,
                output: '' // Will be filled by shadow tester
            }
        };
    }
    /**
     * Select appropriate meta-prompt based on error type.
     */
    selectMetaPrompt(test, errorContext) {
        const input = {
            originalPrompt: test.prompt,
            testInput: test.input,
            expectedOutput: test.expectedOutput,
            actualOutput: test.actualOutput,
            errorType: test.errorType,
            errorMessage: errorContext
        };
        switch (test.errorType) {
            case 'json':
                return (0, metaPrompt_1.generateJSONFixPrompt)(input);
            case 'semantic':
                return (0, metaPrompt_1.generateSemanticFixPrompt)(input);
            default:
                return (0, metaPrompt_1.generateOptimizationPrompt)(input);
        }
    }
    /**
     * Parse an LLM response containing `candidateA` / `candidateB` JSON into
     * candidate objects. Strips markdown code fences first: Claude has no
     * structured-output mode here and sometimes wraps JSON in ```json fences,
     * which previously made JSON.parse throw and skipped a working provider.
     *
     * @param {string} text - Raw model response expected to contain JSON.
     * @returns Two candidates with score 0 (filled in by shadow testing).
     * @throws {SyntaxError} If the cleaned text is still not valid JSON.
     */
    parseCandidates(text) {
        const cleaned = text.trim()
            .replace(/^```(?:json)?\s*/i, '')
            .replace(/\s*```$/, '');
        const parsed = JSON.parse(cleaned);
        return [
            {
                prompt: parsed.candidateA.prompt,
                reasoning: parsed.candidateA.reasoning,
                score: 0 // Will be filled by shadow testing
            },
            {
                prompt: parsed.candidateB.prompt,
                reasoning: parsed.candidateB.reasoning,
                score: 0
            }
        ];
    }
    /**
     * Generate multiple fix candidates using available LLMs with fallback.
     * Tries Anthropic, then OpenAI, then OpenRouter; on any provider error it
     * moves on to the next, and finally falls back to a heuristic rewrite.
     */
    async generateCandidates(metaPrompt, failedTest) {
        // Define provider priority order for candidate generation
        const providers = ['anthropic', 'openai', 'openrouter'];
        for (const provider of providers) {
            try {
                // Check if we have the required client for this provider
                if (provider === 'anthropic' && this.anthropic) {
                    console.log(`⚡ Using Anthropic for candidate generation...`);
                    const response = await this.anthropic.messages.create({
                        model: 'claude-sonnet-4-20250514',
                        max_tokens: 4000,
                        temperature: 0.7, // Some creativity for prompt rewriting
                        messages: [{
                                role: 'user',
                                content: metaPrompt
                            }]
                    });
                    const content = response.content[0];
                    if (content.type !== 'text') {
                        throw new Error('Unexpected response type from Claude');
                    }
                    return this.parseCandidates(content.text);
                }
                else if (provider === 'openai' && this.openai) {
                    console.log(`⚡ Using OpenAI for candidate generation...`);
                    const response = await this.openai.chat.completions.create({
                        model: 'gpt-4o',
                        messages: [{
                                role: 'user',
                                content: metaPrompt
                            }],
                        response_format: { type: 'json_object' }
                    });
                    const content = response.choices[0]?.message?.content;
                    if (!content) {
                        throw new Error('No content returned from OpenAI');
                    }
                    return this.parseCandidates(content);
                }
                else if (provider === 'openrouter') {
                    // For OpenRouter, we'll use the shadowTester to get a response
                    console.log(`⚡ Using OpenRouter for candidate generation...`);
                    // Since OpenRouter is used in shadow testing, we'll use a different approach
                    // For now, we'll return a basic fallback since OpenRouter doesn't support structured outputs as well
                    return [{
                            prompt: this.createFallbackPrompt(failedTest),
                            reasoning: 'Generated using fallback method',
                            score: 0
                        }];
                }
            }
            catch (error) {
                console.log(`⚠️ ${provider} provider failed for candidate generation: ${error.message}`);
                continue; // Try next provider
            }
        }
        // All providers failed
        console.error('All providers failed for candidate generation');
        return [{
                prompt: this.createFallbackPrompt(failedTest),
                reasoning: 'Fallback prompt with basic improvements',
                score: 0
            }];
    }
    /**
     * Shadow test each candidate and return the best one.
     */
    async selectBestCandidate(candidates, originalTest) {
        const { runShadowTest } = await Promise.resolve().then(() => __importStar(require('./shadowTester')));
        // Score every candidate in parallel.
        const testedCandidates = await Promise.all(candidates.map(async (candidate) => {
            const result = await runShadowTest(candidate.prompt, originalTest);
            return {
                ...candidate,
                score: result.score
            };
        }));
        // Sort by score (highest first)
        testedCandidates.sort((a, b) => b.score - a.score);
        return testedCandidates[0];
    }
    /**
     * Fallback prompt improvement - generates a clean rewritten prompt.
     */
    createFallbackPrompt(test) {
        // Extract the core intent from the original prompt
        // Remove any existing "You must provide..." prefixes to avoid duplication
        let corePrompt = test.prompt
            .replace(/You must provide a response that includes the following key information:\n[^\n]*\n\n/g, '')
            .trim();
        // For JSON errors, create a structured prompt
        if (test.errorType === 'json') {
            return `${corePrompt}

IMPORTANT: You must respond with valid JSON only. No explanations, no markdown, just the raw JSON object.`;
        }
        // For semantic errors, be more specific about expected output
        if (test.errorType === 'semantic') {
            return `${corePrompt}

Your response must match this exactly: "${test.expectedOutput}"
Do not add any extra text, greetings, or explanations. Output only what is requested.`;
        }
        // Default: add clarity
        return `${corePrompt}

Be concise and match the expected output format exactly.`;
    }
}
exports.PromptOptimizer = PromptOptimizer;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.TestReporter = void 0;
|
|
7
|
+
const chalk_1 = __importDefault(require("chalk"));
|
|
8
|
+
const cli_table3_1 = __importDefault(require("cli-table3"));
|
|
9
|
+
/**
 * Renders a TestRun to the console as JSON, a colored table, and a summary.
 */
class TestReporter {
    /**
     * Print the run in the requested format(s); the summary always follows.
     * @param run - Aggregated test run to display.
     * @param format - 'json', 'table', or 'both' (default).
     */
    printResults(run, format = 'both') {
        const wantsJson = format === 'json' || format === 'both';
        const wantsTable = format === 'table' || format === 'both';
        if (wantsJson) {
            this.printJSON(run);
        }
        if (wantsTable) {
            this.printTable(run);
        }
        this.printSummary(run);
    }
    // Dump the entire run object as pretty-printed JSON.
    printJSON(run) {
        console.log(JSON.stringify(run, null, 2));
    }
    // Render one table row per test result, color-coded by status and score.
    printTable(run) {
        const table = new cli_table3_1.default({
            head: ['Status', 'Test', 'Score', 'Method', 'Duration'],
            colWidths: [10, 40, 10, 15, 12]
        });
        for (const result of run.results) {
            let statusIcon;
            if (result.status === 'pass') {
                statusIcon = chalk_1.default.green('✓ PASS');
            }
            else if (result.status === 'fail') {
                statusIcon = chalk_1.default.red('✗ FAIL');
            }
            else {
                statusIcon = chalk_1.default.yellow('⚠ ERROR');
            }
            // Green >= 0.8, yellow >= 0.5, red otherwise.
            let scoreColor;
            if (result.score >= 0.8) {
                scoreColor = chalk_1.default.green;
            }
            else if (result.score >= 0.5) {
                scoreColor = chalk_1.default.yellow;
            }
            else {
                scoreColor = chalk_1.default.red;
            }
            const row = [
                statusIcon,
                result.testCase.description,
                scoreColor(result.score.toFixed(2)),
                result.testCase.config?.method || 'semantic',
                `${result.metadata.duration}ms`
            ];
            table.push(row);
        }
        console.log(table.toString());
    }
    // Totals, pass/fail counts, wall-clock duration, and summed provider cost.
    printSummary(run) {
        console.log('\n' + chalk_1.default.bold('Summary:'));
        console.log(`  Total: ${run.totalTests}`);
        console.log(chalk_1.default.green(`  Passed: ${run.passed}`));
        console.log(chalk_1.default.red(`  Failed: ${run.failed}`));
        console.log(`  Duration: ${run.duration}ms`);
        let totalCost = 0;
        for (const r of run.results) {
            totalCost += r.metadata.cost || 0;
        }
        console.log(`  Cost: $${totalCost.toFixed(4)}`);
    }
}
exports.TestReporter = TestReporter;
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { TestCase, TestRun, TunePromptConfig } from "../types";
/**
 * Declaration for the engine test runner (see runner.js): executes test cases
 * against configured LLM providers and scores the responses.
 */
export declare class TestRunner {
    private config;
    private providers;
    constructor(config: TunePromptConfig);
    private initializeProviders;
    runTests(testCases: TestCase[]): Promise<TestRun>;
    private runSingleTest;
}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.TestRunner = void 0;
|
|
4
|
+
const uuid_1 = require("uuid");
|
|
5
|
+
const openai_1 = require("../providers/openai");
|
|
6
|
+
const anthropic_1 = require("../providers/anthropic");
|
|
7
|
+
const openrouter_1 = require("../providers/openrouter");
|
|
8
|
+
const exact_match_1 = require("../scoring/exact-match");
|
|
9
|
+
const json_validator_1 = require("../scoring/json-validator");
|
|
10
|
+
const semantic_1 = require("../scoring/semantic");
|
|
11
|
+
/**
 * Executes test cases against configured LLM providers with automatic
 * fallback, then scores each response with the configured scoring method.
 */
class TestRunner {
    config;
    // Provider name -> instantiated provider client.
    providers = new Map();
    /**
     * @param config - TunePromptConfig with per-provider credentials and an
     *   optional global score threshold.
     */
    constructor(config) {
        this.config = config;
        this.initializeProviders();
    }
    // Instantiate one provider client per configured credentials entry.
    initializeProviders() {
        if (this.config.providers.openai) {
            const provider = new openai_1.OpenAIProvider(this.config.providers.openai);
            this.providers.set("openai", provider);
        }
        if (this.config.providers.anthropic) {
            this.providers.set("anthropic", new anthropic_1.AnthropicProvider(this.config.providers.anthropic));
        }
        if (this.config.providers.openrouter) {
            const provider = new openrouter_1.OpenRouterProvider(this.config.providers.openrouter);
            this.providers.set("openrouter", provider);
        }
    }
    /**
     * Run all test cases sequentially and aggregate into a TestRun.
     * Note: results with status "error" are counted in totalTests but appear
     * in neither `passed` nor `failed`.
     */
    async runTests(testCases) {
        const runId = (0, uuid_1.v4)();
        const startTime = Date.now();
        const results = [];
        for (const testCase of testCases) {
            const result = await this.runSingleTest(testCase);
            results.push(result);
        }
        const duration = Date.now() - startTime;
        const passed = results.filter((r) => r.status === "pass").length;
        const failed = results.filter((r) => r.status === "fail").length;
        return {
            id: runId,
            timestamp: new Date(),
            totalTests: testCases.length,
            passed,
            failed,
            duration,
            results,
        };
    }
    /**
     * Execute one test case. Tries providers along a fallback chain (unless
     * the test pins a provider), scores the first successful response, and
     * returns an "error" result when every attempt fails.
     */
    async runSingleTest(testCase) {
        const testId = (0, uuid_1.v4)();
        const startTime = Date.now();
        // Define fallback order: Primary -> Fallbacks
        const fallbackChain = ["openai", "anthropic", "openrouter"];
        // Determine starting provider
        const initialProvider = testCase.config?.provider || "openai";
        // Build the sequence of providers to try
        let providersToTry;
        if (testCase.config?.provider) {
            // If provider is explicitly set, only try that one
            providersToTry = [testCase.config.provider];
        }
        else {
            providersToTry = [
                initialProvider,
                ...fallbackChain.filter((p) => p !== initialProvider),
            ];
        }
        let lastError;
        let errors = [];
        for (const providerName of providersToTry) {
            const provider = this.providers.get(providerName);
            if (!provider)
                continue;
            try {
                // Execute prompt
                const response = await provider.complete(testCase.prompt);
                // Score result
                const scoringMethod = testCase.config?.method || "semantic";
                // BUGFIX: use ?? instead of || so an explicit threshold of 0
                // is honored rather than silently replaced by the default.
                const threshold = testCase.config?.threshold ?? this.config.threshold ?? 0.8;
                let score;
                let error;
                if (scoringMethod === "exact") {
                    score = (0, exact_match_1.exactMatch)(String(testCase.expect), response.content);
                }
                else if (scoringMethod === "json") {
                    const result = (0, json_validator_1.validateJSON)(testCase.expect, response.content);
                    score = result.score;
                    error = result.error;
                }
                else if (scoringMethod === "semantic") {
                    let calculatedScore;
                    let lastScoringError;
                    // potential embedding providers
                    const embeddingCapable = ["openai", "openrouter"];
                    // Order: Current provider (if capable) -> OpenAI -> OpenRouter -> others
                    const scoringProvidersToTry = [
                        ...(embeddingCapable.includes(providerName) ? [providerName] : []),
                        ...embeddingCapable.filter((p) => p !== providerName),
                    ].filter((p) => this.providers.has(p));
                    if (scoringProvidersToTry.length === 0) {
                        throw new Error("No embedding-capable providers available for semantic scoring");
                    }
                    for (const scoreProviderName of scoringProvidersToTry) {
                        try {
                            const scoreProvider = this.providers.get(scoreProviderName);
                            if (!scoreProvider)
                                continue;
                            const scorer = new semantic_1.SemanticScorer(scoreProvider);
                            calculatedScore = await scorer.score(String(testCase.expect), response.content);
                            break;
                        }
                        catch (err) {
                            lastScoringError = err;
                            continue;
                        }
                    }
                    if (calculatedScore === undefined) {
                        throw new Error(`Semantic scoring failed. Last error: ${lastScoringError?.message || "Unknown error"}`);
                    }
                    score = calculatedScore;
                }
                else {
                    throw new Error(`Unknown scoring method: ${scoringMethod}`);
                }
                const status = score >= threshold ? "pass" : "fail";
                const duration = Date.now() - startTime;
                return {
                    id: testId,
                    testCase,
                    status,
                    score,
                    actualOutput: response.content,
                    expectedOutput: String(testCase.expect),
                    error,
                    metadata: {
                        duration,
                        timestamp: new Date(),
                        tokens: response.tokens,
                        cost: response.cost,
                        provider: providerName,
                    },
                };
            }
            catch (error) {
                lastError = error;
                errors.push(`${providerName.toUpperCase()}: ${error.message}`);
                continue;
            }
        }
        // If all attempts failed
        return {
            id: testId,
            testCase,
            status: "error",
            score: 0,
            actualOutput: "",
            expectedOutput: String(testCase.expect),
            error: errors.join(" | ") || lastError?.message || "All providers failed",
            metadata: {
                duration: Date.now() - startTime,
                timestamp: new Date(),
            },
        };
    }
}
exports.TestRunner = TestRunner;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { FailedTest } from '../types/fix';
/** Outcome of shadow-testing one candidate prompt against a failed test. */
export interface ShadowTestResult {
    score: number;
    output: string;
    passed: boolean;
}
/**
 * Test a candidate prompt against the original test case
 * Tries providers in sequence until one succeeds
 */
export declare function runShadowTest(candidatePrompt: string, originalTest: FailedTest): Promise<ShadowTestResult>;
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.runShadowTest = runShadowTest;
|
|
7
|
+
const sdk_1 = __importDefault(require("@anthropic-ai/sdk"));
|
|
8
|
+
const openai_1 = __importDefault(require("openai"));
|
|
9
|
+
const semantic_1 = require("../scoring/semantic"); // From Phase 1
|
|
10
|
+
/**
 * Test a candidate prompt against the original test case.
 * Tries providers in sequence until one succeeds; returns a zero-score
 * result when every provider fails or no usable API key is configured.
 */
async function runShadowTest(candidatePrompt, originalTest) {
    // Provider priority order mapped to its execution function.
    const runnersByProvider = {
        anthropic: runAnthropicTest,
        openai: runOpenAITest,
        openrouter: runOpenRouterTest
    };
    for (const provider of ['anthropic', 'openai', 'openrouter']) {
        try {
            const apiKey = getApiKeyForProvider(provider);
            // Silently skip placeholders or missing keys.
            const keyUnusable = !apiKey || apiKey.startsWith('api_key') || apiKey === 'phc_xxxxx';
            if (keyUnusable) {
                continue;
            }
            const runner = runnersByProvider[provider];
            if (!runner) {
                continue; // Unsupported provider
            }
            const output = await runner(candidatePrompt, originalTest.input);
            // Score the output using the same method as Phase 1.
            const score = await scoreOutput(output, originalTest.expectedOutput, originalTest.errorType);
            return {
                score,
                output,
                passed: score >= originalTest.threshold
            };
        }
        catch (error) {
            console.log(`⚠️ ${provider} provider failed: ${error.message}`);
            continue; // Try next provider
        }
    }
    // All providers failed
    console.error('All providers failed for shadow test');
    return {
        score: 0,
        output: '',
        passed: false
    };
}
|
|
60
|
+
/**
 * Look up the environment-variable API key for a named provider.
 * @param {string} provider - 'anthropic' | 'openai' | 'openrouter'.
 * @returns {string|undefined} The key, or undefined for unknown providers
 *   or unset environment variables.
 */
function getApiKeyForProvider(provider) {
    const envName = new Map([
        ['anthropic', 'ANTHROPIC_API_KEY'],
        ['openai', 'OPENAI_API_KEY'],
        ['openrouter', 'OPENROUTER_API_KEY']
    ]).get(provider);
    return envName === undefined ? undefined : process.env[envName];
}
|
|
72
|
+
/**
 * Execute a prompt (with {{var}} interpolation) against Anthropic and
 * return the model's text output, or '' for non-text content.
 */
async function runAnthropicTest(prompt, input) {
    // Interpolate variables if present.
    const finalPrompt = interpolateVariables(prompt, input);
    const client = new sdk_1.default({
        apiKey: process.env.ANTHROPIC_API_KEY
    });
    const response = await client.messages.create({
        model: 'claude-sonnet-4-20250514',
        max_tokens: 2000,
        messages: [{ role: 'user', content: finalPrompt }]
    });
    const firstBlock = response.content[0];
    if (firstBlock.type === 'text') {
        return firstBlock.text;
    }
    return '';
}
|
|
89
|
+
/**
 * Execute a prompt (with {{var}} interpolation) against OpenAI gpt-4o and
 * return the completion text, or '' when the response has no content.
 */
async function runOpenAITest(prompt, input) {
    const finalPrompt = interpolateVariables(prompt, input);
    const client = new openai_1.default({
        apiKey: process.env.OPENAI_API_KEY
    });
    const response = await client.chat.completions.create({
        model: 'gpt-4o',
        messages: [{ role: 'user', content: finalPrompt }]
    });
    const text = response.choices[0]?.message?.content;
    return text || '';
}
|
|
103
|
+
/**
 * Execute a prompt against OpenRouter via the OpenAI-compatible endpoint.
 *
 * OPENAI_API_KEY is temporarily removed from the environment so the OpenAI
 * client cannot pick it up implicitly while talking to OpenRouter, and is
 * restored afterwards.
 *
 * Fix: restoration previously used `if (originalOpenAIKey)`, which failed to
 * restore a key whose original value was the empty string (falsy but
 * present); the check is now an explicit `!== undefined`.
 *
 * @param {string} prompt - Prompt template, may contain {{var}} placeholders.
 * @param {object|undefined} input - Variables interpolated into the prompt.
 * @returns {Promise<string>} The completion text, or '' when empty.
 */
async function runOpenRouterTest(prompt, input) {
    const key = process.env.OPENROUTER_API_KEY;
    // Save original key and temporarily remove it to prevent OpenAI client confusion
    const originalOpenAIKey = process.env.OPENAI_API_KEY;
    delete process.env.OPENAI_API_KEY;
    try {
        const openai = new openai_1.default({
            baseURL: 'https://openrouter.ai/api/v1',
            apiKey: key
        });
        const finalPrompt = interpolateVariables(prompt, input);
        const response = await openai.chat.completions.create({
            model: 'nvidia/nemotron-3-nano-30b-a3b:free',
            messages: [{
                    role: 'user',
                    content: finalPrompt
                }]
        });
        return response.choices[0]?.message?.content || '';
    }
    finally {
        // Restore original key; explicit undefined check so that an
        // empty-string value is also put back.
        if (originalOpenAIKey !== undefined) {
            process.env.OPENAI_API_KEY = originalOpenAIKey;
        }
    }
}
|
|
130
|
+
/**
 * Replace {{key}} placeholders in a prompt with the corresponding values.
 *
 * Fix: the original built `new RegExp(`{{${key}}}`, 'g')` from the raw key,
 * so keys containing regex metacharacters (`+`, `.`, `(`, `[` ...) were
 * interpreted as patterns — matching the wrong text or throwing a
 * SyntaxError. split/join performs a literal global replacement for any key.
 *
 * @param {string} prompt - Prompt template possibly containing {{key}} tokens.
 * @param {object|undefined} variables - Placeholder name -> value map.
 * @returns {string} The prompt with all placeholders substituted.
 */
function interpolateVariables(prompt, variables) {
    if (!variables)
        return prompt;
    let result = prompt;
    for (const [key, value] of Object.entries(variables)) {
        result = result.split(`{{${key}}}`).join(String(value));
    }
    return result;
}
|
|
139
|
+
/**
 * Score an actual output against the expected output using the scoring
 * method associated with the failed test's error type.
 * - 'semantic': embedding similarity via the Phase-1 scorer.
 * - 'exact': 1.0 on trimmed string equality, else 0.0.
 * - 'json': 1.0 when the output parses as JSON, else 0.0.
 * - anything else: neutral 0.5.
 */
async function scoreOutput(actual, expected, method) {
    if (method === 'semantic') {
        return await (0, semantic_1.calculateSemanticSimilarity)(actual, expected);
    }
    if (method === 'exact') {
        return actual.trim() === expected.trim() ? 1.0 : 0.0;
    }
    if (method === 'json') {
        try {
            JSON.parse(actual);
            return 1.0;
        }
        catch {
            return 0.0;
        }
    }
    // Unknown scoring method: neutral score.
    return 0.5;
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
// Public entry point: re-export the engine, cloud service, shared types,
// and the two top-level convenience functions.
export * from './engine/runner';
export * from './engine/loader';
export * from './engine/optimizer';
export * from './services/cloud.service';
export * from './types';
export { runTests } from './commands/run';
export { loadConfig } from './utils/config';
|