tuneprompt 1.1.1 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -9
- package/dist/commands/fix.js +15 -35
- package/dist/commands/run.js +9 -45
- package/dist/engine/loader.js +6 -2
- package/dist/engine/optimizer.js +17 -16
- package/dist/engine/reporter.js +1 -1
- package/dist/engine/runner.js +1 -1
- package/dist/engine/shadowTester.js +26 -9
- package/dist/storage/database.js +1 -1
- package/dist/utils/config.js +7 -7
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -10,17 +10,26 @@ Industrial-grade testing framework for LLM prompts
|
|
|
10
10
|
|
|
11
11
|
TunePrompt is a comprehensive testing framework designed specifically for Large Language Model (LLM) prompts. It helps developers validate, test, and optimize their prompts with industrial-grade reliability and accuracy.
|
|
12
12
|
|
|
13
|
+
## What's New in v1.1.1
|
|
14
|
+
|
|
15
|
+
The first production-ready release of **TunePrompt**, the industrial-grade testing framework for the modern LLM stack.
|
|
16
|
+
|
|
17
|
+
- **Multi-Provider Support**: Seamlessly test across **OpenAI**, **Anthropic**, **Gemini**, and **OpenRouter**.
|
|
18
|
+
- **Semantic Evaluation**: Advanced vector-based scoring to detect logic drift and nuance shifts.
|
|
19
|
+
- **Auto-Fix Engine (Premium)**: AI-powered prompt optimization for failing tests.
|
|
20
|
+
- **Cloud Orchestration**: Unified synchronization with the [TunePrompt Dashboard](https://www.tuneprompt.xyz).
|
|
21
|
+
- **Industrial CLI**: Built-in watch mode, CI/CD integration, and historical analytics.
|
|
22
|
+
|
|
13
23
|
## Features
|
|
14
24
|
|
|
15
|
-
- **Multi-provider Support**:
|
|
16
|
-
- **Semantic Testing**: Compare outputs using
|
|
17
|
-
- **JSON Validation**: Validate structured
|
|
18
|
-
- **LLM-based Judging**:
|
|
19
|
-
- **Watch Mode**:
|
|
20
|
-
- **CI/CD
|
|
21
|
-
- **Cloud Sync**:
|
|
22
|
-
- **Auto-fix Engine**:
|
|
23
|
-
- **Detailed Reporting**: Comprehensive test reports with scores, methods, and durations
|
|
25
|
+
- **Multi-provider Support**: Native integration with Google Gemini, OpenAI, Anthropic, and OpenRouter.
|
|
26
|
+
- **Semantic Testing**: Compare outputs using high-precision embedding similarity.
|
|
27
|
+
- **JSON Validation**: Validate structured outputs with schema-aware checks.
|
|
28
|
+
- **LLM-based Judging**: Utilize advanced providers as evaluators for qualitative metrics.
|
|
29
|
+
- **Watch Mode**: Immediate feedback loop with automatic re-runs on file changes.
|
|
30
|
+
- **CI/CD Ready**: Native integration patterns for industrial deployment pipelines.
|
|
31
|
+
- **Cloud Sync**: Global telemetry and result storage via the dashboard.
|
|
32
|
+
- **Auto-fix Engine**: Iterative refinement loop for intelligent prompt repair.
|
|
24
33
|
|
|
25
34
|
## Installation
|
|
26
35
|
|
package/dist/commands/fix.js
CHANGED
|
@@ -48,8 +48,7 @@ const fs = __importStar(require("fs"));
|
|
|
48
48
|
const errorHandler_1 = require("../utils/errorHandler");
|
|
49
49
|
async function fixCommand(options = {}) {
|
|
50
50
|
try {
|
|
51
|
-
console.log(
|
|
52
|
-
// License check with better error
|
|
51
|
+
console.log('');
|
|
53
52
|
const spinner = (0, ora_1.default)('Checking license...').start();
|
|
54
53
|
const licenseValid = await (0, license_1.checkLicense)();
|
|
55
54
|
if (!licenseValid) {
|
|
@@ -58,36 +57,29 @@ async function fixCommand(options = {}) {
|
|
|
58
57
|
throw errorHandler_1.Errors.NO_LICENSE;
|
|
59
58
|
}
|
|
60
59
|
spinner.succeed('License validated');
|
|
61
|
-
// Load failed tests with error handling
|
|
62
60
|
const failedTests = await (0, storage_1.getFailedTests)();
|
|
63
61
|
if (failedTests.length === 0) {
|
|
64
62
|
throw errorHandler_1.Errors.NO_FAILED_TESTS;
|
|
65
63
|
}
|
|
66
|
-
console.log(chalk_1.default.yellow(`\
|
|
64
|
+
console.log(chalk_1.default.yellow(`\n${failedTests.length} failed test(s):`));
|
|
67
65
|
failedTests.forEach((test, index) => {
|
|
68
|
-
|
|
69
|
-
console.log(`${index + 1}. ${chalk_1.default.bold(test.description)}${chalk_1.default.cyan(modelInfo)}`);
|
|
70
|
-
console.log(` Score: ${chalk_1.default.red(test.score.toFixed(2))} (threshold: ${test.threshold})`);
|
|
66
|
+
console.log(chalk_1.default.gray(` ${index + 1}. ${test.description} ā score: ${chalk_1.default.red(test.score.toFixed(2))} / ${test.threshold}`));
|
|
71
67
|
});
|
|
72
68
|
// Step 3: Ask which tests to fix
|
|
73
69
|
let selectedIndexes = [];
|
|
74
70
|
if (options.yes) {
|
|
75
71
|
selectedIndexes = failedTests.map((_, i) => i);
|
|
76
|
-
console.log(chalk_1.default.gray(`\nNon-interactive mode: Automatic selection of all ${failedTests.length} tests.`));
|
|
77
72
|
}
|
|
78
73
|
else {
|
|
79
74
|
const response = await inquirer_1.default.prompt([{
|
|
80
75
|
type: 'checkbox',
|
|
81
76
|
name: 'selectedIndexes',
|
|
82
77
|
message: 'Which tests would you like to fix?',
|
|
83
|
-
choices: failedTests.map((test, index) => {
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
checked: true
|
|
89
|
-
};
|
|
90
|
-
})
|
|
78
|
+
choices: failedTests.map((test, index) => ({
|
|
79
|
+
name: `${test.description} (${test.score.toFixed(2)})`,
|
|
80
|
+
value: index,
|
|
81
|
+
checked: true
|
|
82
|
+
}))
|
|
91
83
|
}]);
|
|
92
84
|
selectedIndexes = response.selectedIndexes;
|
|
93
85
|
}
|
|
@@ -102,8 +94,8 @@ async function fixCommand(options = {}) {
|
|
|
102
94
|
for (const index of selectedIndexes) {
|
|
103
95
|
const test = failedTests[index];
|
|
104
96
|
const suite = await getSuiteTests(test.id);
|
|
105
|
-
const modelInfo = test.config?.model ? ` (
|
|
106
|
-
console.log(chalk_1.default.bold(`\n
|
|
97
|
+
const modelInfo = test.config?.model ? ` (${test.config.model})` : '';
|
|
98
|
+
console.log(chalk_1.default.bold(`\nāāā ${test.description}${modelInfo} āāā\n`));
|
|
107
99
|
try {
|
|
108
100
|
const result = await optimizer.optimize(test, suite);
|
|
109
101
|
await showDiff(result.originalPrompt, result.optimizedPrompt, result.reasoning);
|
|
@@ -125,8 +117,7 @@ async function fixCommand(options = {}) {
|
|
|
125
117
|
}
|
|
126
118
|
if (action === 'apply') {
|
|
127
119
|
await applyFix(test, result.optimizedPrompt);
|
|
128
|
-
console.log(
|
|
129
|
-
console.log(chalk_1.default.gray('The next run will use this new prompt.\n'));
|
|
120
|
+
console.log(chalk_1.default.green(` ā Updated: ${test.id}`));
|
|
130
121
|
}
|
|
131
122
|
else if (action === 'edit') {
|
|
132
123
|
console.log(chalk_1.default.gray('\nOpening editor... (Save and close to apply)\n'));
|
|
@@ -149,8 +140,7 @@ async function fixCommand(options = {}) {
|
|
|
149
140
|
continue; // Skip to next test
|
|
150
141
|
}
|
|
151
142
|
}
|
|
152
|
-
console.log(chalk_1.default.bold.green('\n
|
|
153
|
-
console.log(chalk_1.default.gray('Run `tuneprompt run` to verify your fixes.\n'));
|
|
143
|
+
console.log(chalk_1.default.bold.green('\n⨠Done. Run `tuneprompt run` to verify.\n'));
|
|
154
144
|
// After fix completes
|
|
155
145
|
const license = (0, license_1.getLicenseInfo)();
|
|
156
146
|
if (license) {
|
|
@@ -162,19 +152,9 @@ async function fixCommand(options = {}) {
|
|
|
162
152
|
}
|
|
163
153
|
}
|
|
164
154
|
function showUpgradePrompt() {
|
|
165
|
-
console.log(chalk_1.default.yellow('\n
|
|
166
|
-
console.log(chalk_1.default.
|
|
167
|
-
console.log(
|
|
168
|
-
console.log('repair your failing prompts.\n');
|
|
169
|
-
console.log(chalk_1.default.bold('What you get:'));
|
|
170
|
-
console.log(' ā
AI-powered prompt optimization');
|
|
171
|
-
console.log(' ā
Shadow testing before applying fixes');
|
|
172
|
-
console.log(' ā
Interactive diff viewer');
|
|
173
|
-
console.log(' ā
Unlimited fix attempts\n');
|
|
174
|
-
console.log(chalk_1.default.bold('Get Premium:'));
|
|
175
|
-
console.log(` 1. Buy a license: ${chalk_1.default.blue.underline('https://www.tuneprompt.xyz/pricing')}`);
|
|
176
|
-
console.log(` 2. Activate: ${chalk_1.default.gray('tuneprompt activate <your-key>')}\n`);
|
|
177
|
-
console.log(chalk_1.default.yellow('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n'));
|
|
155
|
+
console.log(chalk_1.default.yellow('\nš Premium feature. Get access:'));
|
|
156
|
+
console.log(chalk_1.default.gray(` ${chalk_1.default.blue.underline('https://www.tuneprompt.xyz/pricing')}`));
|
|
157
|
+
console.log(chalk_1.default.gray(` Then: ${chalk_1.default.white('tuneprompt activate <key>')}\n`));
|
|
178
158
|
}
|
|
179
159
|
async function showDiff(original, optimized, reasoning) {
|
|
180
160
|
const diffLib = await Promise.resolve().then(() => __importStar(require('diff')));
|
package/dist/commands/run.js
CHANGED
|
@@ -15,62 +15,29 @@ const loader_1 = require("../engine/loader");
|
|
|
15
15
|
const runner_1 = require("../engine/runner");
|
|
16
16
|
const reporter_1 = require("../engine/reporter");
|
|
17
17
|
const database_1 = require("../storage/database");
|
|
18
|
-
const license_1 = require("../utils/license");
|
|
19
|
-
// At the end of your test run reporter
|
|
20
|
-
function displayRunSummary(results) {
|
|
21
|
-
const failed = results.filter(r => r.status === 'fail');
|
|
22
|
-
const passed = results.filter(r => r.status === 'pass');
|
|
23
|
-
console.log(chalk_1.default.bold.white('\n' + '='.repeat(60)));
|
|
24
|
-
console.log(chalk_1.default.bold.white('Test Summary'));
|
|
25
|
-
console.log(chalk_1.default.bold.white('='.repeat(60)));
|
|
26
|
-
console.log(chalk_1.default.green(`ā Passed: ${passed.length}`));
|
|
27
|
-
console.log(chalk_1.default.red(`ā Failed: ${failed.length}`));
|
|
28
|
-
console.log(chalk_1.default.gray(`Total: ${results.length}`));
|
|
29
|
-
console.log(chalk_1.default.bold.white('='.repeat(60) + '\n'));
|
|
30
|
-
// UPSELL MESSAGE (NEW)
|
|
31
|
-
if (failed.length > 0) {
|
|
32
|
-
console.log(chalk_1.default.yellow('ā ļø ' + failed.length + ' test(s) failed'));
|
|
33
|
-
console.log(chalk_1.default.gray('\nDon\'t waste time debugging manually.'));
|
|
34
|
-
console.log(chalk_1.default.cyan('Run ') + chalk_1.default.bold.white('tuneprompt fix') + chalk_1.default.cyan(' to let AI repair these prompts instantly.\n'));
|
|
35
|
-
// Check license status
|
|
36
|
-
const licenseManager = new license_1.LicenseManager();
|
|
37
|
-
licenseManager.hasFeature('fix').then((hasAccess) => {
|
|
38
|
-
if (!hasAccess) {
|
|
39
|
-
console.log(chalk_1.default.gray('Unlock fix with: ') + chalk_1.default.white('https://www.tuneprompt.xyz/pricing'));
|
|
40
|
-
console.log(chalk_1.default.gray('Already have a key? ') + chalk_1.default.white('tuneprompt activate <key>\n'));
|
|
41
|
-
}
|
|
42
|
-
});
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
18
|
// Extract the core run functionality to a separate function
|
|
46
19
|
async function runTests(options = {}) {
|
|
47
20
|
const startTime = Date.now();
|
|
48
|
-
const spinner = (0, ora_1.default)('Loading
|
|
21
|
+
const spinner = (0, ora_1.default)('Loading...').start();
|
|
49
22
|
try {
|
|
50
|
-
// Load config
|
|
51
23
|
const config = await (0, config_1.loadConfig)(options.config);
|
|
52
24
|
spinner.succeed('Configuration loaded');
|
|
53
|
-
|
|
54
|
-
spinner.start('Loading test cases...');
|
|
25
|
+
spinner.start('Loading tests...');
|
|
55
26
|
const loader = new loader_1.TestLoader();
|
|
56
27
|
const testCases = loader.loadTestDir(config.testDir || './tests');
|
|
57
28
|
if (testCases.length === 0) {
|
|
58
29
|
spinner.fail('No test cases found');
|
|
59
30
|
process.exit(1);
|
|
60
31
|
}
|
|
61
|
-
spinner.succeed(`Loaded ${testCases.length} test
|
|
62
|
-
// Run tests
|
|
32
|
+
spinner.succeed(`Loaded ${testCases.length} test(s)`);
|
|
63
33
|
spinner.start('Running tests...');
|
|
64
34
|
const runner = new runner_1.TestRunner(config);
|
|
65
35
|
const results = await runner.runTests(testCases);
|
|
66
36
|
spinner.stop();
|
|
67
|
-
// Save to database
|
|
68
|
-
// Save to database
|
|
69
37
|
const db = new database_1.TestDatabase();
|
|
70
38
|
db.saveRun(results);
|
|
71
|
-
// Calculate results for cloud upload
|
|
72
|
-
const currentRunId = results.id;
|
|
73
|
-
// Report results
|
|
39
|
+
// Calculate results for cloud upload
|
|
40
|
+
const currentRunId = results.id;
|
|
74
41
|
const reporter = new reporter_1.TestReporter();
|
|
75
42
|
reporter.printResults(results, config.outputFormat);
|
|
76
43
|
const isCI = options.ci ||
|
|
@@ -104,11 +71,11 @@ async function syncPendingRuns(db, options) {
|
|
|
104
71
|
const pendingRuns = db.getPendingUploads();
|
|
105
72
|
if (pendingRuns.length === 0)
|
|
106
73
|
return;
|
|
107
|
-
|
|
74
|
+
const syncSpinner = (0, ora_1.default)(`Syncing ${pendingRuns.length} run(s) to Cloud...`).start();
|
|
108
75
|
const cloudService = new cloud_service_1.CloudService();
|
|
109
76
|
await cloudService.init();
|
|
110
77
|
if (!(await cloudService.isAuthenticated())) {
|
|
111
|
-
|
|
78
|
+
syncSpinner.warn('Not authenticated. Run `tuneprompt activate` first.');
|
|
112
79
|
return;
|
|
113
80
|
}
|
|
114
81
|
// Get project ID once
|
|
@@ -124,7 +91,7 @@ async function syncPendingRuns(db, options) {
|
|
|
124
91
|
}
|
|
125
92
|
}
|
|
126
93
|
catch (err) {
|
|
127
|
-
|
|
94
|
+
syncSpinner.warn('Failed to get project info');
|
|
128
95
|
return;
|
|
129
96
|
}
|
|
130
97
|
// Common Git/Env context
|
|
@@ -179,10 +146,7 @@ async function syncPendingRuns(db, options) {
|
|
|
179
146
|
const uploadResult = await cloudService.uploadRun(runData);
|
|
180
147
|
if (uploadResult.success) {
|
|
181
148
|
db.markAsUploaded(run.id);
|
|
182
|
-
console.log(chalk_1.default.green(` ā Uploaded run from ${run.timestamp.toLocaleTimeString()}`));
|
|
183
|
-
}
|
|
184
|
-
else {
|
|
185
|
-
console.log(chalk_1.default.red(` ā Failed to upload run ${run.id}: ${uploadResult.error}`));
|
|
186
149
|
}
|
|
187
150
|
}
|
|
151
|
+
syncSpinner.succeed(`Synced ${pendingRuns.length} run(s) to Cloud`);
|
|
188
152
|
}
|
package/dist/engine/loader.js
CHANGED
|
@@ -44,12 +44,16 @@ class TestLoader {
|
|
|
44
44
|
if (ext === '.json') {
|
|
45
45
|
const data = JSON.parse(content);
|
|
46
46
|
const tests = Array.isArray(data) ? data : [data];
|
|
47
|
-
return tests
|
|
47
|
+
return tests
|
|
48
|
+
.filter((t) => t && typeof t === 'object' && t.prompt)
|
|
49
|
+
.map(t => ({ ...t, filePath: path.resolve(filePath) }));
|
|
48
50
|
}
|
|
49
51
|
else if (ext === '.yaml' || ext === '.yml') {
|
|
50
52
|
const data = yaml.load(content);
|
|
51
53
|
const tests = Array.isArray(data) ? data : [data];
|
|
52
|
-
return tests
|
|
54
|
+
return tests
|
|
55
|
+
.filter((t) => t && typeof t === 'object' && t.prompt)
|
|
56
|
+
.map(t => ({ ...t, filePath: path.resolve(filePath) }));
|
|
53
57
|
}
|
|
54
58
|
else {
|
|
55
59
|
throw new Error(`Unsupported file format: ${ext}`);
|
package/dist/engine/optimizer.js
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
2
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
6
|
exports.PromptOptimizer = void 0;
|
|
7
|
+
const ora_1 = __importDefault(require("ora"));
|
|
4
8
|
const metaPrompt_1 = require("./metaPrompt");
|
|
5
9
|
const constraintExtractor_1 = require("./constraintExtractor");
|
|
6
10
|
const shadowTester_1 = require("./shadowTester");
|
|
@@ -14,10 +18,8 @@ class PromptOptimizer {
|
|
|
14
18
|
* Main optimization method with Anti-Regression and Iterative Refinement
|
|
15
19
|
*/
|
|
16
20
|
async optimize(failedTest, suite) {
|
|
17
|
-
|
|
18
|
-
console.log(`š Full test suite size: ${suite.length}`);
|
|
21
|
+
const spinner = (0, ora_1.default)(`Analyzing failure: "${failedTest.description}"`).start();
|
|
19
22
|
const initialAggregateScore = suite.reduce((sum, t) => sum + t.score, 0) / suite.length;
|
|
20
|
-
console.log(`š Current aggregate score: ${initialAggregateScore.toFixed(2)}`);
|
|
21
23
|
const errorContext = (0, constraintExtractor_1.generateErrorContext)(failedTest);
|
|
22
24
|
const passingExamples = suite
|
|
23
25
|
.filter(t => t.score >= t.threshold)
|
|
@@ -30,7 +32,7 @@ class PromptOptimizer {
|
|
|
30
32
|
let conversation = [];
|
|
31
33
|
while (iterations < this.maxIterations) {
|
|
32
34
|
iterations++;
|
|
33
|
-
|
|
35
|
+
spinner.text = `Optimization Attempt #${iterations}/${this.maxIterations}...`;
|
|
34
36
|
if (iterations === 1) {
|
|
35
37
|
const input = {
|
|
36
38
|
originalPrompt: failedTest.prompt,
|
|
@@ -53,17 +55,15 @@ class PromptOptimizer {
|
|
|
53
55
|
}
|
|
54
56
|
for (const candidate of candidates) {
|
|
55
57
|
try {
|
|
56
|
-
|
|
58
|
+
spinner.text = `Attempt #${iterations}: Testing candidate...`;
|
|
57
59
|
const primaryResult = await (0, shadowTester_1.runShadowTest)(candidate.prompt, failedTest);
|
|
58
60
|
if (primaryResult.score < failedTest.threshold) {
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
lastFailureReason = `Candidate failed. Reason: ${specificReason}. Previous reasoning: ${candidate.reasoning}`;
|
|
61
|
+
const specificReason = primaryResult.failureReason || `the output was: "${primaryResult.output.substring(0, 50)}..."`;
|
|
62
|
+
lastFailureReason = `Candidate failed. Reason: ${specificReason}.`;
|
|
62
63
|
continue;
|
|
63
64
|
}
|
|
64
|
-
|
|
65
|
+
spinner.text = `Attempt #${iterations}: Verifying anti-regression...`;
|
|
65
66
|
const suiteResult = await (0, shadowTester_1.runSuiteShadowTest)(candidate.prompt, suite);
|
|
66
|
-
console.log(` š Suite aggregate score: ${suiteResult.aggregateScore.toFixed(2)}`);
|
|
67
67
|
if (suiteResult.aggregateScore > bestAggregateScore) {
|
|
68
68
|
bestAggregateScore = suiteResult.aggregateScore;
|
|
69
69
|
bestResult = {
|
|
@@ -81,23 +81,24 @@ class PromptOptimizer {
|
|
|
81
81
|
};
|
|
82
82
|
}
|
|
83
83
|
else if (suiteResult.aggregateScore <= bestAggregateScore) {
|
|
84
|
-
console.log(` š Candidate regression: aggregate score dropped (Current: ${bestAggregateScore.toFixed(2)} VS New: ${suiteResult.aggregateScore.toFixed(2)})`);
|
|
85
84
|
const regressions = suiteResult.results.filter(r => !r.passed).map(r => r.failureReason).filter(Boolean);
|
|
86
85
|
const regressionText = regressions.length > 0 ? ` Required features broke: ${regressions.slice(0, 2).join('; ')}.` : '';
|
|
87
|
-
lastFailureReason = `
|
|
86
|
+
lastFailureReason = `Fix introduced regressions.${regressionText}`;
|
|
88
87
|
}
|
|
89
88
|
}
|
|
90
89
|
catch (error) {
|
|
91
|
-
|
|
90
|
+
spinner.text = `Attempt #${iterations}: ā ļø ${error.message?.substring(0, 80)}`;
|
|
91
|
+
lastFailureReason = error.message;
|
|
92
92
|
}
|
|
93
93
|
}
|
|
94
94
|
if (bestResult)
|
|
95
95
|
break;
|
|
96
|
-
console.log(`ā»ļø No candidate was net-positive. Retrying with refinement feedback...`);
|
|
97
96
|
}
|
|
98
97
|
if (!bestResult) {
|
|
99
|
-
|
|
98
|
+
spinner.fail(`Optimization failed`);
|
|
99
|
+
throw new Error(`Failed to improve score after ${this.maxIterations} attempts. ${lastFailureReason || ''}`);
|
|
100
100
|
}
|
|
101
|
+
spinner.succeed(`Optimization successful!`);
|
|
101
102
|
return bestResult;
|
|
102
103
|
}
|
|
103
104
|
getMetaPrompt(input) {
|
|
@@ -130,7 +131,7 @@ class PromptOptimizer {
|
|
|
130
131
|
// Pick a strong model for optimization if not defined
|
|
131
132
|
const model = providerName === 'anthropic' ? 'claude-3-5-sonnet-latest' :
|
|
132
133
|
providerName === 'openai' ? 'gpt-4o' :
|
|
133
|
-
providerName === 'gemini' ? 'gemini-
|
|
134
|
+
providerName === 'gemini' ? 'gemini-2.0-flash' : undefined;
|
|
134
135
|
if (!model)
|
|
135
136
|
continue;
|
|
136
137
|
const provider = factory_1.ProviderFactory.create(providerName, {
|
package/dist/engine/reporter.js
CHANGED
|
@@ -7,7 +7,7 @@ exports.TestReporter = void 0;
|
|
|
7
7
|
const chalk_1 = __importDefault(require("chalk"));
|
|
8
8
|
const cli_table3_1 = __importDefault(require("cli-table3"));
|
|
9
9
|
class TestReporter {
|
|
10
|
-
printResults(run, format = '
|
|
10
|
+
printResults(run, format = 'table') {
|
|
11
11
|
if (format === 'json' || format === 'both') {
|
|
12
12
|
this.printJSON(run);
|
|
13
13
|
}
|
package/dist/engine/runner.js
CHANGED
|
@@ -18,7 +18,7 @@ class TestRunner {
|
|
|
18
18
|
const providerNames = ["openai", "anthropic", "openrouter", "gemini"];
|
|
19
19
|
for (const name of providerNames) {
|
|
20
20
|
const providerConfig = this.config.providers[name];
|
|
21
|
-
if (providerConfig) {
|
|
21
|
+
if (providerConfig && providerConfig.apiKey) {
|
|
22
22
|
this.providers.set(name, factory_1.ProviderFactory.create(name, providerConfig));
|
|
23
23
|
}
|
|
24
24
|
}
|
package/dist/engine/shadowTester.js
CHANGED
|
@@ -20,16 +20,34 @@ async function runShadowTest(candidatePrompt, test) {
|
|
|
20
20
|
}
|
|
21
21
|
const providerName = test.config?.provider;
|
|
22
22
|
const model = test.config?.model;
|
|
23
|
-
//
|
|
23
|
+
// Determine providers to try
|
|
24
|
+
let providersToTry = [];
|
|
24
25
|
if (providerName && model) {
|
|
26
|
+
providersToTry.push({ name: providerName, model });
|
|
27
|
+
}
|
|
28
|
+
// Fallback queue
|
|
29
|
+
const fallbackQueue = [
|
|
30
|
+
{ name: 'anthropic', model: 'claude-3-5-sonnet-latest' },
|
|
31
|
+
{ name: 'openai', model: 'gpt-4o' },
|
|
32
|
+
{ name: 'gemini', model: 'gemini-2.0-flash' },
|
|
33
|
+
{ name: 'openrouter', model: 'nvidia/nemotron-3-nano-30b-a3b:free' }
|
|
34
|
+
];
|
|
35
|
+
for (const entry of fallbackQueue) {
|
|
36
|
+
if (entry.name !== providerName) {
|
|
37
|
+
providersToTry.push(entry);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
let errors = [];
|
|
41
|
+
for (const target of providersToTry) {
|
|
25
42
|
try {
|
|
26
|
-
const apiKey = factory_1.ProviderFactory.getApiKey(
|
|
43
|
+
const apiKey = factory_1.ProviderFactory.getApiKey(target.name);
|
|
27
44
|
if (!apiKey) {
|
|
28
|
-
|
|
45
|
+
errors.push(`${target.name}: no API key (set ${target.name.toUpperCase()}_API_KEY)`);
|
|
46
|
+
continue;
|
|
29
47
|
}
|
|
30
|
-
const provider = factory_1.ProviderFactory.create(
|
|
48
|
+
const provider = factory_1.ProviderFactory.create(target.name, {
|
|
31
49
|
apiKey,
|
|
32
|
-
model,
|
|
50
|
+
model: target.model || 'latest',
|
|
33
51
|
maxTokens: 2000
|
|
34
52
|
});
|
|
35
53
|
const finalPrompt = (0, interpolation_1.interpolateVariables)(candidatePrompt, test.input);
|
|
@@ -44,12 +62,11 @@ async function runShadowTest(candidatePrompt, test) {
|
|
|
44
62
|
};
|
|
45
63
|
}
|
|
46
64
|
catch (error) {
|
|
47
|
-
|
|
48
|
-
|
|
65
|
+
errors.push(`${target.name}: ${error.message}`);
|
|
66
|
+
continue;
|
|
49
67
|
}
|
|
50
68
|
}
|
|
51
|
-
|
|
52
|
-
throw new Error(`Test "${test.description}" lacks provider/model configuration. Validation aborted.`);
|
|
69
|
+
throw new Error(`Shadow test failed for all providers: ${errors.join(' | ')}`);
|
|
53
70
|
}
|
|
54
71
|
/**
|
|
55
72
|
* Run a candidate prompt against multiple tests and return aggregate results
|
package/dist/storage/database.js
CHANGED
|
@@ -118,7 +118,7 @@ class TestDatabase {
|
|
|
118
118
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
119
119
|
`);
|
|
120
120
|
for (const result of run.results) {
|
|
121
|
-
insertResult.run(result.id, run.id, result.testCase.description, typeof result.testCase.prompt === 'string' ? result.testCase.prompt : JSON.stringify(result.testCase.prompt), result.testCase.variables ? JSON.stringify(result.testCase.variables) : null, typeof result.testCase.expect === 'string' ? result.testCase.expect : JSON.stringify(result.testCase.expect), result.testCase.config ? JSON.stringify(result.testCase.config) : null, result.testCase.filePath || null, result.status, result.score, result.actualOutput, result.expectedOutput, result.error || null, result.metadata.duration, result.metadata.tokens || null, result.metadata.cost || null, result.metadata.provider || null);
|
|
121
|
+
insertResult.run(result.id, run.id, result.testCase.description || 'No description', typeof result.testCase.prompt === 'string' ? result.testCase.prompt : JSON.stringify(result.testCase.prompt), result.testCase.variables ? JSON.stringify(result.testCase.variables) : null, typeof result.testCase.expect === 'string' ? result.testCase.expect : JSON.stringify(result.testCase.expect), result.testCase.config ? JSON.stringify(result.testCase.config) : null, result.testCase.filePath || null, result.status, result.score, result.actualOutput, result.expectedOutput, result.error || null, result.metadata.duration, result.metadata.tokens || null, result.metadata.cost || null, result.metadata.provider || null);
|
|
122
122
|
}
|
|
123
123
|
}
|
|
124
124
|
getRecentRuns(limit = 10) {
|
package/dist/utils/config.js
CHANGED
|
@@ -21,16 +21,16 @@ function validateConfig(config) {
|
|
|
21
21
|
if (!config.providers || Object.keys(config.providers).length === 0) {
|
|
22
22
|
throw new Error('At least one provider must be configured');
|
|
23
23
|
}
|
|
24
|
-
// Validate API keys
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
24
|
+
// Validate API keys - ensure at least one provider is valid
|
|
25
|
+
const validProviders = Object.entries(config.providers)
|
|
26
|
+
.filter(([_, cfg]) => !!cfg.apiKey);
|
|
27
|
+
if (validProviders.length === 0) {
|
|
28
|
+
throw new Error('No valid API keys found. Please provide at least one API key in your .env file.');
|
|
29
29
|
}
|
|
30
30
|
return {
|
|
31
31
|
threshold: config.threshold || 0.8,
|
|
32
32
|
testDir: config.testDir || './tests',
|
|
33
|
-
outputFormat: config.outputFormat || '
|
|
33
|
+
outputFormat: config.outputFormat || 'table',
|
|
34
34
|
...config
|
|
35
35
|
};
|
|
36
36
|
}
|
|
@@ -70,7 +70,7 @@ function getDefaultConfigTemplate() {
|
|
|
70
70
|
testDir: './tests',
|
|
71
71
|
|
|
72
72
|
// Output format: 'json', 'table', or 'both'
|
|
73
|
-
outputFormat: '
|
|
73
|
+
outputFormat: 'table'
|
|
74
74
|
};
|
|
75
75
|
`;
|
|
76
76
|
}
|