@artemiskit/cli 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/dist/index.js +751 -342
- package/dist/src/commands/history.d.ts.map +1 -1
- package/dist/src/commands/redteam.d.ts.map +1 -1
- package/dist/src/commands/run.d.ts.map +1 -1
- package/dist/src/commands/stress.d.ts.map +1 -1
- package/package.json +6 -6
- package/src/commands/history.ts +58 -9
- package/src/commands/redteam.ts +19 -1
- package/src/commands/run.ts +113 -3
- package/src/commands/stress.ts +28 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"history.d.ts","sourceRoot":"","sources":["../../../src/commands/history.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"history.d.ts","sourceRoot":"","sources":["../../../src/commands/history.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AA4IpC,wBAAgB,cAAc,IAAI,OAAO,CAmFxC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"redteam.d.ts","sourceRoot":"","sources":["../../../src/commands/redteam.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"redteam.d.ts","sourceRoot":"","sources":["../../../src/commands/redteam.ts"],"names":[],"mappings":"AAAA;;GAEG;AAsCH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAoCpC,wBAAgB,cAAc,IAAI,OAAO,CAycxC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/commands/run.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/commands/run.ts"],"names":[],"mappings":"AAAA;;GAEG;AAiBH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAyiBpC,wBAAgB,UAAU,IAAI,OAAO,CAggBpC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stress.d.ts","sourceRoot":"","sources":["../../../src/commands/stress.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"stress.d.ts","sourceRoot":"","sources":["../../../src/commands/stress.ts"],"names":[],"mappings":"AAAA;;GAEG;AAoBH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAsCpC,wBAAgB,aAAa,IAAI,OAAO,CA+SvC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@artemiskit/cli",
|
|
3
|
-
"version": "0.2.2",
|
|
3
|
+
"version": "0.2.3",
|
|
4
4
|
"description": "Command-line interface for ArtemisKit LLM evaluation toolkit",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "Apache-2.0",
|
|
@@ -45,11 +45,11 @@
|
|
|
45
45
|
"test": "bun test"
|
|
46
46
|
},
|
|
47
47
|
"dependencies": {
|
|
48
|
-
"@artemiskit/adapter-openai": "
|
|
49
|
-
"@artemiskit/adapter-vercel-ai": "
|
|
50
|
-
"@artemiskit/core": "
|
|
51
|
-
"@artemiskit/redteam": "
|
|
52
|
-
"@artemiskit/reports": "
|
|
48
|
+
"@artemiskit/adapter-openai": "0.1.10",
|
|
49
|
+
"@artemiskit/adapter-vercel-ai": "0.1.10",
|
|
50
|
+
"@artemiskit/core": "0.2.3",
|
|
51
|
+
"@artemiskit/redteam": "0.2.3",
|
|
52
|
+
"@artemiskit/reports": "0.2.3",
|
|
53
53
|
"chalk": "^5.3.0",
|
|
54
54
|
"cli-table3": "^0.6.3",
|
|
55
55
|
"commander": "^12.0.0",
|
package/src/commands/history.ts
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* History command - View run history
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
+
import { formatCost } from '@artemiskit/core';
|
|
5
6
|
import chalk from 'chalk';
|
|
6
7
|
import { Command } from 'commander';
|
|
7
8
|
import { loadConfig } from '../config/loader.js';
|
|
@@ -13,6 +14,7 @@ interface HistoryOptions {
|
|
|
13
14
|
scenario?: string;
|
|
14
15
|
limit?: number;
|
|
15
16
|
config?: string;
|
|
17
|
+
showCost?: boolean;
|
|
16
18
|
}
|
|
17
19
|
|
|
18
20
|
function renderHistoryTable(
|
|
@@ -21,16 +23,20 @@ function renderHistoryTable(
|
|
|
21
23
|
scenario: string;
|
|
22
24
|
successRate: number;
|
|
23
25
|
createdAt: string;
|
|
24
|
-
|
|
26
|
+
estimatedCostUsd?: number;
|
|
27
|
+
}>,
|
|
28
|
+
showCost = false
|
|
25
29
|
): string {
|
|
26
30
|
// Column widths
|
|
27
31
|
const runIdWidth = 16;
|
|
28
|
-
const scenarioWidth = 30;
|
|
32
|
+
const scenarioWidth = showCost ? 25 : 30;
|
|
29
33
|
const rateWidth = 12;
|
|
30
34
|
const dateWidth = 20;
|
|
35
|
+
const costWidth = 10;
|
|
31
36
|
|
|
32
|
-
// Total width = borders(4) + columns + spacing
|
|
33
|
-
const width = 2 + runIdWidth + 1 + scenarioWidth + 1 + rateWidth + 1 + dateWidth + 2;
|
|
37
|
+
// Total width = borders(4) + columns + spacing
|
|
38
|
+
const baseWidth = 2 + runIdWidth + 1 + scenarioWidth + 1 + rateWidth + 1 + dateWidth + 2;
|
|
39
|
+
const width = showCost ? baseWidth + costWidth + 1 : baseWidth;
|
|
34
40
|
const border = '═'.repeat(width - 2);
|
|
35
41
|
|
|
36
42
|
const formatHeaderRow = () => {
|
|
@@ -38,6 +44,10 @@ function renderHistoryTable(
|
|
|
38
44
|
const scenarioPad = padText('Scenario', scenarioWidth);
|
|
39
45
|
const ratePad = padText('Success Rate', rateWidth, 'right');
|
|
40
46
|
const datePad = padText('Date', dateWidth, 'right');
|
|
47
|
+
if (showCost) {
|
|
48
|
+
const costPad = padText('Cost', costWidth, 'right');
|
|
49
|
+
return `║ ${runIdPad} ${scenarioPad} ${ratePad} ${costPad} ${datePad} ║`;
|
|
50
|
+
}
|
|
41
51
|
return `║ ${runIdPad} ${scenarioPad} ${ratePad} ${datePad} ║`;
|
|
42
52
|
};
|
|
43
53
|
|
|
@@ -49,6 +59,8 @@ function renderHistoryTable(
|
|
|
49
59
|
`╟${'─'.repeat(width - 2)}╢`,
|
|
50
60
|
];
|
|
51
61
|
|
|
62
|
+
let totalCost = 0;
|
|
63
|
+
|
|
52
64
|
for (const run of runs) {
|
|
53
65
|
const rateColor =
|
|
54
66
|
run.successRate >= 0.9 ? chalk.green : run.successRate >= 0.7 ? chalk.yellow : chalk.red;
|
|
@@ -70,7 +82,25 @@ function renderHistoryTable(
|
|
|
70
82
|
const dateStr = `${dateObj.toLocaleDateString()} ${dateObj.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })}`;
|
|
71
83
|
const datePad = padText(dateStr, dateWidth, 'right');
|
|
72
84
|
|
|
73
|
-
|
|
85
|
+
if (showCost) {
|
|
86
|
+
const costValue = run.estimatedCostUsd !== undefined ? formatCost(run.estimatedCostUsd) : '-';
|
|
87
|
+
const costPad = padText(costValue, costWidth, 'right');
|
|
88
|
+
if (run.estimatedCostUsd !== undefined) {
|
|
89
|
+
totalCost += run.estimatedCostUsd;
|
|
90
|
+
}
|
|
91
|
+
lines.push(`║ ${runIdPad} ${scenarioPad} ${rateColored} ${chalk.dim(costPad)} ${datePad} ║`);
|
|
92
|
+
} else {
|
|
93
|
+
lines.push(`║ ${runIdPad} ${scenarioPad} ${rateColored} ${datePad} ║`);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// Add total cost row if showing costs
|
|
98
|
+
if (showCost) {
|
|
99
|
+
lines.push(`╟${'─'.repeat(width - 2)}╢`);
|
|
100
|
+
const totalLabel = padText('Total', runIdWidth + 1 + scenarioWidth + 1 + rateWidth, 'right');
|
|
101
|
+
const totalCostStr = padText(formatCost(totalCost), costWidth, 'right');
|
|
102
|
+
const emptyDate = padText('', dateWidth, 'right');
|
|
103
|
+
lines.push(`║ ${totalLabel} ${chalk.bold(totalCostStr)} ${emptyDate} ║`);
|
|
74
104
|
}
|
|
75
105
|
|
|
76
106
|
lines.push(`╚${border}╝`);
|
|
@@ -84,14 +114,31 @@ function renderPlainHistory(
|
|
|
84
114
|
scenario: string;
|
|
85
115
|
successRate: number;
|
|
86
116
|
createdAt: string;
|
|
87
|
-
|
|
117
|
+
estimatedCostUsd?: number;
|
|
118
|
+
}>,
|
|
119
|
+
showCost = false
|
|
88
120
|
): string {
|
|
89
121
|
const lines = ['=== RUN HISTORY ===', ''];
|
|
90
122
|
|
|
123
|
+
let totalCost = 0;
|
|
124
|
+
|
|
91
125
|
for (const run of runs) {
|
|
92
126
|
const rate = `${(run.successRate * 100).toFixed(1)}%`;
|
|
93
127
|
const date = new Date(run.createdAt).toLocaleString();
|
|
94
|
-
|
|
128
|
+
if (showCost) {
|
|
129
|
+
const cost = run.estimatedCostUsd !== undefined ? formatCost(run.estimatedCostUsd) : '-';
|
|
130
|
+
if (run.estimatedCostUsd !== undefined) {
|
|
131
|
+
totalCost += run.estimatedCostUsd;
|
|
132
|
+
}
|
|
133
|
+
lines.push(`${run.runId} ${run.scenario} ${rate} ${cost} ${date}`);
|
|
134
|
+
} else {
|
|
135
|
+
lines.push(`${run.runId} ${run.scenario} ${rate} ${date}`);
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
if (showCost) {
|
|
140
|
+
lines.push('');
|
|
141
|
+
lines.push(`Total: ${formatCost(totalCost)}`);
|
|
95
142
|
}
|
|
96
143
|
|
|
97
144
|
return lines.join('\n');
|
|
@@ -106,6 +153,7 @@ export function historyCommand(): Command {
|
|
|
106
153
|
.option('-s, --scenario <scenario>', 'Filter by scenario')
|
|
107
154
|
.option('-l, --limit <number>', 'Limit number of results', '20')
|
|
108
155
|
.option('--config <path>', 'Path to config file')
|
|
156
|
+
.option('--show-cost', 'Show cost column and total')
|
|
109
157
|
.action(async (options: HistoryOptions) => {
|
|
110
158
|
const spinner = createSpinner('Loading history...');
|
|
111
159
|
spinner.start();
|
|
@@ -119,6 +167,7 @@ export function historyCommand(): Command {
|
|
|
119
167
|
project: options.project,
|
|
120
168
|
scenario: options.scenario,
|
|
121
169
|
limit,
|
|
170
|
+
includeCost: options.showCost,
|
|
122
171
|
});
|
|
123
172
|
|
|
124
173
|
spinner.succeed('Loaded history');
|
|
@@ -140,9 +189,9 @@ export function historyCommand(): Command {
|
|
|
140
189
|
|
|
141
190
|
// Show history table
|
|
142
191
|
if (isTTY) {
|
|
143
|
-
console.log(renderHistoryTable(runs));
|
|
192
|
+
console.log(renderHistoryTable(runs, options.showCost));
|
|
144
193
|
} else {
|
|
145
|
-
console.log(renderPlainHistory(runs));
|
|
194
|
+
console.log(renderPlainHistory(runs, options.showCost));
|
|
146
195
|
}
|
|
147
196
|
|
|
148
197
|
console.log();
|
package/src/commands/redteam.ts
CHANGED
|
@@ -32,7 +32,11 @@ import {
|
|
|
32
32
|
UnsafeResponseDetector,
|
|
33
33
|
loadCustomAttacks,
|
|
34
34
|
} from '@artemiskit/redteam';
|
|
35
|
-
import { generateJSONReport, generateRedTeamHTMLReport } from '@artemiskit/reports';
|
|
35
|
+
import {
|
|
36
|
+
generateJSONReport,
|
|
37
|
+
generateRedTeamHTMLReport,
|
|
38
|
+
generateRedTeamMarkdownReport,
|
|
39
|
+
} from '@artemiskit/reports';
|
|
36
40
|
import chalk from 'chalk';
|
|
37
41
|
import { Command } from 'commander';
|
|
38
42
|
import { nanoid } from 'nanoid';
|
|
@@ -66,6 +70,8 @@ interface RedteamOptions {
|
|
|
66
70
|
config?: string;
|
|
67
71
|
redact?: boolean;
|
|
68
72
|
redactPatterns?: string[];
|
|
73
|
+
export?: 'markdown';
|
|
74
|
+
exportOutput?: string;
|
|
69
75
|
}
|
|
70
76
|
|
|
71
77
|
export function redteamCommand(): Command {
|
|
@@ -91,6 +97,8 @@ export function redteamCommand(): Command {
|
|
|
91
97
|
'--redact-patterns <patterns...>',
|
|
92
98
|
'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
|
|
93
99
|
)
|
|
100
|
+
.option('--export <format>', 'Export results to format (markdown)')
|
|
101
|
+
.option('--export-output <dir>', 'Output directory for exports (default: ./artemis-exports)')
|
|
94
102
|
.action(async (scenarioPath: string, options: RedteamOptions) => {
|
|
95
103
|
const spinner = createSpinner('Loading configuration...');
|
|
96
104
|
spinner.start();
|
|
@@ -495,6 +503,16 @@ export function redteamCommand(): Command {
|
|
|
495
503
|
console.log(chalk.dim(` JSON: ${jsonPath}`));
|
|
496
504
|
}
|
|
497
505
|
|
|
506
|
+
// Export to markdown if requested
|
|
507
|
+
if (options.export === 'markdown') {
|
|
508
|
+
const exportDir = options.exportOutput || './artemis-exports';
|
|
509
|
+
await mkdir(exportDir, { recursive: true });
|
|
510
|
+
const markdown = generateRedTeamMarkdownReport(manifest);
|
|
511
|
+
const mdPath = join(exportDir, `${runId}.md`);
|
|
512
|
+
await writeFile(mdPath, markdown);
|
|
513
|
+
console.log(chalk.dim(`Exported: ${mdPath}`));
|
|
514
|
+
}
|
|
515
|
+
|
|
498
516
|
// Exit with error if there were unsafe responses
|
|
499
517
|
if (metrics.unsafe_responses > 0) {
|
|
500
518
|
process.exit(1);
|
package/src/commands/run.ts
CHANGED
|
@@ -2,16 +2,20 @@
|
|
|
2
2
|
* Run command - Execute test scenarios
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
+
import { mkdir, writeFile } from 'node:fs/promises';
|
|
5
6
|
import { basename } from 'node:path';
|
|
7
|
+
import { join } from 'node:path';
|
|
6
8
|
import {
|
|
7
9
|
type BaselineStorageAdapter,
|
|
8
10
|
type RedactionConfig,
|
|
9
11
|
type RunManifest,
|
|
10
12
|
createAdapter,
|
|
13
|
+
formatCost,
|
|
11
14
|
parseScenarioFile,
|
|
12
15
|
resolveScenarioPaths,
|
|
13
16
|
runScenario,
|
|
14
17
|
} from '@artemiskit/core';
|
|
18
|
+
import { generateMarkdownReport } from '@artemiskit/reports';
|
|
15
19
|
import chalk from 'chalk';
|
|
16
20
|
import { Command } from 'commander';
|
|
17
21
|
import { loadConfig } from '../config/loader.js';
|
|
@@ -62,6 +66,12 @@ interface RunOptions {
|
|
|
62
66
|
baseline?: boolean;
|
|
63
67
|
/** Regression threshold (0-1), default 0.05 (5%) */
|
|
64
68
|
threshold?: number;
|
|
69
|
+
/** Budget limit in USD - fail if cost exceeds this */
|
|
70
|
+
budget?: number;
|
|
71
|
+
/** Export format: markdown */
|
|
72
|
+
export?: 'markdown';
|
|
73
|
+
/** Output directory for exports */
|
|
74
|
+
exportOutput?: string;
|
|
65
75
|
}
|
|
66
76
|
|
|
67
77
|
interface ScenarioRunResult {
|
|
@@ -103,6 +113,15 @@ interface CISummary {
|
|
|
103
113
|
totalMs: number;
|
|
104
114
|
formatted: string;
|
|
105
115
|
};
|
|
116
|
+
tokens: {
|
|
117
|
+
prompt: number;
|
|
118
|
+
completion: number;
|
|
119
|
+
total: number;
|
|
120
|
+
};
|
|
121
|
+
cost: {
|
|
122
|
+
estimatedUsd: number;
|
|
123
|
+
formatted: string;
|
|
124
|
+
};
|
|
106
125
|
runs: Array<{
|
|
107
126
|
runId: string;
|
|
108
127
|
scenario: string;
|
|
@@ -112,6 +131,7 @@ interface CISummary {
|
|
|
112
131
|
failedCases: number;
|
|
113
132
|
totalCases: number;
|
|
114
133
|
durationMs: number;
|
|
134
|
+
estimatedCostUsd?: number;
|
|
115
135
|
}>;
|
|
116
136
|
baseline?: {
|
|
117
137
|
compared: boolean;
|
|
@@ -123,6 +143,11 @@ interface CISummary {
|
|
|
123
143
|
tokens: number;
|
|
124
144
|
};
|
|
125
145
|
};
|
|
146
|
+
budget?: {
|
|
147
|
+
limit: number;
|
|
148
|
+
exceeded: boolean;
|
|
149
|
+
overBy: number;
|
|
150
|
+
};
|
|
126
151
|
}
|
|
127
152
|
|
|
128
153
|
/**
|
|
@@ -167,6 +192,21 @@ function buildCISummary(results: ScenarioRunResult[]): CISummary {
|
|
|
167
192
|
const failedCases = results.reduce((sum, r) => sum + (r.manifest.metrics?.failed_cases || 0), 0);
|
|
168
193
|
const totalDuration = results.reduce((sum, r) => sum + (r.manifest.duration_ms || 0), 0);
|
|
169
194
|
|
|
195
|
+
// Aggregate token and cost metrics
|
|
196
|
+
const totalPromptTokens = results.reduce(
|
|
197
|
+
(sum, r) => sum + (r.manifest.metrics?.total_prompt_tokens || 0),
|
|
198
|
+
0
|
|
199
|
+
);
|
|
200
|
+
const totalCompletionTokens = results.reduce(
|
|
201
|
+
(sum, r) => sum + (r.manifest.metrics?.total_completion_tokens || 0),
|
|
202
|
+
0
|
|
203
|
+
);
|
|
204
|
+
const totalTokens = results.reduce((sum, r) => sum + (r.manifest.metrics?.total_tokens || 0), 0);
|
|
205
|
+
const totalCostUsd = results.reduce(
|
|
206
|
+
(sum, r) => sum + (r.manifest.metrics?.cost?.total_usd || 0),
|
|
207
|
+
0
|
|
208
|
+
);
|
|
209
|
+
|
|
170
210
|
return {
|
|
171
211
|
success: failedScenarios === 0,
|
|
172
212
|
scenarios: {
|
|
@@ -184,6 +224,15 @@ function buildCISummary(results: ScenarioRunResult[]): CISummary {
|
|
|
184
224
|
totalMs: totalDuration,
|
|
185
225
|
formatted: formatDuration(totalDuration),
|
|
186
226
|
},
|
|
227
|
+
tokens: {
|
|
228
|
+
prompt: totalPromptTokens,
|
|
229
|
+
completion: totalCompletionTokens,
|
|
230
|
+
total: totalTokens,
|
|
231
|
+
},
|
|
232
|
+
cost: {
|
|
233
|
+
estimatedUsd: totalCostUsd,
|
|
234
|
+
formatted: formatCost(totalCostUsd),
|
|
235
|
+
},
|
|
187
236
|
runs: results.map((r) => ({
|
|
188
237
|
runId: r.manifest.run_id || '',
|
|
189
238
|
scenario: r.scenarioName,
|
|
@@ -193,6 +242,7 @@ function buildCISummary(results: ScenarioRunResult[]): CISummary {
|
|
|
193
242
|
failedCases: r.manifest.metrics?.failed_cases || 0,
|
|
194
243
|
totalCases: r.manifest.metrics?.total_cases || 0,
|
|
195
244
|
durationMs: r.manifest.duration_ms || 0,
|
|
245
|
+
estimatedCostUsd: r.manifest.metrics?.cost?.total_usd,
|
|
196
246
|
})),
|
|
197
247
|
};
|
|
198
248
|
}
|
|
@@ -556,6 +606,9 @@ export function runCommand(): Command {
|
|
|
556
606
|
)
|
|
557
607
|
.option('--baseline', 'Compare against baseline and detect regression')
|
|
558
608
|
.option('--threshold <number>', 'Regression threshold (0-1), e.g., 0.05 for 5%', '0.05')
|
|
609
|
+
.option('--budget <amount>', 'Maximum budget in USD - fail if estimated cost exceeds this')
|
|
610
|
+
.option('--export <format>', 'Export format: markdown')
|
|
611
|
+
.option('--export-output <dir>', 'Output directory for exports (default: ./artemis-exports)')
|
|
559
612
|
.action(async (scenarioPath: string | undefined, options: RunOptions) => {
|
|
560
613
|
// Determine CI mode: explicit flag, environment variable, or summary format that implies CI
|
|
561
614
|
const isCIMode =
|
|
@@ -741,9 +794,12 @@ export function runCommand(): Command {
|
|
|
741
794
|
|
|
742
795
|
// Show additional metrics
|
|
743
796
|
console.log();
|
|
797
|
+
const costInfo = result.manifest.metrics.cost
|
|
798
|
+
? ` | Est. Cost: ${formatCost(result.manifest.metrics.cost.total_usd)}`
|
|
799
|
+
: '';
|
|
744
800
|
console.log(
|
|
745
801
|
chalk.dim(
|
|
746
|
-
`Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}`
|
|
802
|
+
`Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}${costInfo}`
|
|
747
803
|
)
|
|
748
804
|
);
|
|
749
805
|
|
|
@@ -762,6 +818,16 @@ export function runCommand(): Command {
|
|
|
762
818
|
const savedPath = await storage.save(result.manifest);
|
|
763
819
|
console.log(chalk.dim(`Saved: ${savedPath}`));
|
|
764
820
|
}
|
|
821
|
+
|
|
822
|
+
// Export to markdown if requested
|
|
823
|
+
if (options.export === 'markdown') {
|
|
824
|
+
const exportDir = options.exportOutput || './artemis-exports';
|
|
825
|
+
await mkdir(exportDir, { recursive: true });
|
|
826
|
+
const markdown = generateMarkdownReport(result.manifest);
|
|
827
|
+
const mdPath = join(exportDir, `${result.manifest.run_id}.md`);
|
|
828
|
+
await writeFile(mdPath, markdown);
|
|
829
|
+
console.log(chalk.dim(`Exported: ${mdPath}`));
|
|
830
|
+
}
|
|
765
831
|
} catch (error) {
|
|
766
832
|
// Record failed scenario
|
|
767
833
|
console.log();
|
|
@@ -860,6 +926,8 @@ export function runCommand(): Command {
|
|
|
860
926
|
console.log(`ARTEMISKIT_CASES_FAILED=${failedCases}`);
|
|
861
927
|
console.log(`ARTEMISKIT_SUCCESS_RATE=${successRate}`);
|
|
862
928
|
console.log(`ARTEMISKIT_DURATION_MS=${ciSummary.duration.totalMs}`);
|
|
929
|
+
console.log(`ARTEMISKIT_TOKENS_TOTAL=${ciSummary.tokens.total}`);
|
|
930
|
+
console.log(`ARTEMISKIT_COST_USD=${ciSummary.cost.estimatedUsd.toFixed(4)}`);
|
|
863
931
|
|
|
864
932
|
if (baselineResult) {
|
|
865
933
|
console.log('ARTEMISKIT_BASELINE_COMPARED=true');
|
|
@@ -945,11 +1013,53 @@ export function runCommand(): Command {
|
|
|
945
1013
|
}
|
|
946
1014
|
}
|
|
947
1015
|
|
|
948
|
-
// Exit with error if any scenarios failed or regression detected
|
|
1016
|
+
// Check budget if specified
|
|
1017
|
+
let budgetExceeded = false;
|
|
1018
|
+
if (options.budget !== undefined) {
|
|
1019
|
+
const budgetLimit = Number.parseFloat(String(options.budget));
|
|
1020
|
+
const totalCost = ciSummary.cost.estimatedUsd;
|
|
1021
|
+
|
|
1022
|
+
if (totalCost > budgetLimit) {
|
|
1023
|
+
budgetExceeded = true;
|
|
1024
|
+
const overBy = totalCost - budgetLimit;
|
|
1025
|
+
|
|
1026
|
+
// Add budget info to CI summary
|
|
1027
|
+
ciSummary.budget = {
|
|
1028
|
+
limit: budgetLimit,
|
|
1029
|
+
exceeded: true,
|
|
1030
|
+
overBy,
|
|
1031
|
+
};
|
|
1032
|
+
|
|
1033
|
+
if (isCIMode) {
|
|
1034
|
+
if (options.summary === 'json') {
|
|
1035
|
+
// Budget info already in ciSummary, will be output above
|
|
1036
|
+
} else {
|
|
1037
|
+
console.log(`ARTEMISKIT_BUDGET_LIMIT=${budgetLimit.toFixed(2)}`);
|
|
1038
|
+
console.log('ARTEMISKIT_BUDGET_EXCEEDED=true');
|
|
1039
|
+
console.log(`ARTEMISKIT_BUDGET_OVER_BY=${overBy.toFixed(4)}`);
|
|
1040
|
+
}
|
|
1041
|
+
} else {
|
|
1042
|
+
console.log();
|
|
1043
|
+
console.log(chalk.red(`${icons.failed} BUDGET EXCEEDED`));
|
|
1044
|
+
console.log(
|
|
1045
|
+
chalk.red(
|
|
1046
|
+
` Budget: $${budgetLimit.toFixed(2)} | Actual: ${formatCost(totalCost)} | Over by: ${formatCost(overBy)}`
|
|
1047
|
+
)
|
|
1048
|
+
);
|
|
1049
|
+
console.log();
|
|
1050
|
+
}
|
|
1051
|
+
} else if (!isCIMode) {
|
|
1052
|
+
console.log(
|
|
1053
|
+
`${icons.passed} ${chalk.green('Within budget')} ${chalk.dim(`($${budgetLimit.toFixed(2)} limit, ${formatCost(totalCost)} used)`)}`
|
|
1054
|
+
);
|
|
1055
|
+
}
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
// Exit with error if any scenarios failed, regression detected, or budget exceeded
|
|
949
1059
|
const hasFailures = results.some((r) => !r.success);
|
|
950
1060
|
const hasRegression = baselineResult?.hasRegression || false;
|
|
951
1061
|
|
|
952
|
-
if (hasFailures || hasRegression) {
|
|
1062
|
+
if (hasFailures || hasRegression || budgetExceeded) {
|
|
953
1063
|
process.exit(1);
|
|
954
1064
|
}
|
|
955
1065
|
} catch (error) {
|
package/src/commands/stress.ts
CHANGED
|
@@ -13,6 +13,7 @@ import {
|
|
|
13
13
|
type StressRequestResult,
|
|
14
14
|
createAdapter,
|
|
15
15
|
estimateCost,
|
|
16
|
+
formatCost,
|
|
16
17
|
getGitInfo,
|
|
17
18
|
getModelPricing,
|
|
18
19
|
parseScenarioFile,
|
|
@@ -26,6 +27,7 @@ import {
|
|
|
26
27
|
colors,
|
|
27
28
|
createSpinner,
|
|
28
29
|
getProviderErrorContext,
|
|
30
|
+
icons,
|
|
29
31
|
isTTY,
|
|
30
32
|
renderError,
|
|
31
33
|
renderInfoBox,
|
|
@@ -52,6 +54,8 @@ interface StressOptions {
|
|
|
52
54
|
config?: string;
|
|
53
55
|
redact?: boolean;
|
|
54
56
|
redactPatterns?: string[];
|
|
57
|
+
/** Budget limit in USD - fail if cost exceeds this */
|
|
58
|
+
budget?: number;
|
|
55
59
|
}
|
|
56
60
|
|
|
57
61
|
export function stressCommand(): Command {
|
|
@@ -75,6 +79,7 @@ export function stressCommand(): Command {
|
|
|
75
79
|
'--redact-patterns <patterns...>',
|
|
76
80
|
'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
|
|
77
81
|
)
|
|
82
|
+
.option('--budget <amount>', 'Maximum budget in USD - fail if estimated cost exceeds this')
|
|
78
83
|
.action(async (scenarioPath: string, options: StressOptions) => {
|
|
79
84
|
const spinner = createSpinner('Loading configuration...');
|
|
80
85
|
spinner.start();
|
|
@@ -319,6 +324,29 @@ export function stressCommand(): Command {
|
|
|
319
324
|
console.log(chalk.dim(` HTML: ${htmlPath}`));
|
|
320
325
|
console.log(chalk.dim(` JSON: ${jsonPath}`));
|
|
321
326
|
}
|
|
327
|
+
|
|
328
|
+
// Check budget if specified
|
|
329
|
+
if (options.budget !== undefined && metrics.cost) {
|
|
330
|
+
const budgetLimit = Number.parseFloat(String(options.budget));
|
|
331
|
+
const totalCost = metrics.cost.estimated_total_usd;
|
|
332
|
+
|
|
333
|
+
if (totalCost > budgetLimit) {
|
|
334
|
+
const overBy = totalCost - budgetLimit;
|
|
335
|
+
console.log();
|
|
336
|
+
console.log(chalk.red(`${icons.failed} BUDGET EXCEEDED`));
|
|
337
|
+
console.log(
|
|
338
|
+
chalk.red(
|
|
339
|
+
` Budget: $${budgetLimit.toFixed(2)} | Actual: ${formatCost(totalCost)} | Over by: ${formatCost(overBy)}`
|
|
340
|
+
)
|
|
341
|
+
);
|
|
342
|
+
process.exit(1);
|
|
343
|
+
} else {
|
|
344
|
+
console.log();
|
|
345
|
+
console.log(
|
|
346
|
+
`${icons.passed} ${chalk.green('Within budget')} ${chalk.dim(`($${budgetLimit.toFixed(2)} limit, ${formatCost(totalCost)} used)`)}`
|
|
347
|
+
);
|
|
348
|
+
}
|
|
349
|
+
}
|
|
322
350
|
} catch (error) {
|
|
323
351
|
spinner.fail('Error');
|
|
324
352
|
|