trickle-cli 0.1.203 → 0.1.205
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/benchmark.d.ts +14 -0
- package/dist/commands/benchmark.js +177 -0
- package/dist/commands/cost-report.js +2 -0
- package/dist/index.js +11 -0
- package/package.json +1 -1
- package/src/commands/benchmark.ts +173 -0
- package/src/commands/cost-report.ts +2 -0
- package/src/index.ts +12 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
3
|
+
*
|
|
4
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
5
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
6
|
+
*
|
|
7
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
8
|
+
* whether your agent gives consistent results across identical inputs.
|
|
9
|
+
*/
|
|
10
|
+
export declare function benchmarkCommand(command: string | undefined, opts: {
|
|
11
|
+
runs?: string;
|
|
12
|
+
json?: boolean;
|
|
13
|
+
failUnderConsistency?: string;
|
|
14
|
+
}): Promise<void>;
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
4
|
+
*
|
|
5
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
6
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
7
|
+
*
|
|
8
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
9
|
+
* whether your agent gives consistent results across identical inputs.
|
|
10
|
+
*/
|
|
11
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
12
|
+
if (k2 === undefined) k2 = k;
|
|
13
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
14
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
15
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
16
|
+
}
|
|
17
|
+
Object.defineProperty(o, k2, desc);
|
|
18
|
+
}) : (function(o, m, k, k2) {
|
|
19
|
+
if (k2 === undefined) k2 = k;
|
|
20
|
+
o[k2] = m[k];
|
|
21
|
+
}));
|
|
22
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
23
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
24
|
+
}) : function(o, v) {
|
|
25
|
+
o["default"] = v;
|
|
26
|
+
});
|
|
27
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
28
|
+
var ownKeys = function(o) {
|
|
29
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
30
|
+
var ar = [];
|
|
31
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
32
|
+
return ar;
|
|
33
|
+
};
|
|
34
|
+
return ownKeys(o);
|
|
35
|
+
};
|
|
36
|
+
return function (mod) {
|
|
37
|
+
if (mod && mod.__esModule) return mod;
|
|
38
|
+
var result = {};
|
|
39
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
40
|
+
__setModuleDefault(result, mod);
|
|
41
|
+
return result;
|
|
42
|
+
};
|
|
43
|
+
})();
|
|
44
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
45
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
46
|
+
};
|
|
47
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
48
|
+
exports.benchmarkCommand = benchmarkCommand;
|
|
49
|
+
const fs = __importStar(require("fs"));
|
|
50
|
+
const path = __importStar(require("path"));
|
|
51
|
+
const chalk_1 = __importDefault(require("chalk"));
|
|
52
|
+
const child_process_1 = require("child_process");
|
|
53
|
+
// Count the non-empty lines of a file; returns 0 when the file is absent.
function countLines(fp) {
    if (!fs.existsSync(fp))
        return 0;
    const nonEmpty = fs.readFileSync(fp, 'utf-8').trim().split('\n').filter(Boolean);
    return nonEmpty.length;
}
|
|
58
|
+
// Sum a numeric field across every line of a JSONL file.
// Returns 0 when the file does not exist. Unparseable lines are skipped,
// and field values that are not finite numbers are ignored — the previous
// `|| 0` guard let truthy non-numbers (e.g. strings) corrupt the running
// sum via string concatenation.
function sumField(fp, field) {
    if (!fs.existsSync(fp))
        return 0;
    return fs.readFileSync(fp, 'utf-8').split('\n').filter(Boolean)
        .reduce((s, l) => {
        try {
            const v = JSON.parse(l)[field];
            return (typeof v === 'number' && Number.isFinite(v)) ? s + v : s;
        }
        catch {
            return s;
        }
    }, 0);
}
|
|
69
|
+
// Execute `command` once through the shell, pointing trickle capture at
// `trialDir` via TRICKLE_LOCAL env vars. Never rejects: spawn errors and
// null exit codes both resolve with exitCode 1.
async function runTrial(command, trialDir) {
    return new Promise((resolve) => {
        const startedAt = Date.now();
        const finish = (exitCode) => resolve({ exitCode, durationMs: Date.now() - startedAt });
        const env = { ...process.env, TRICKLE_LOCAL: '1', TRICKLE_LOCAL_DIR: trialDir };
        const child = (0, child_process_1.spawn)(command, [], { shell: true, env, stdio: 'pipe' });
        child.on('exit', (code) => finish(code ?? 1));
        child.on('error', () => finish(1));
    });
}
|
|
78
|
+
/**
 * Multi-trial benchmark driver: runs `command` numRuns times, collects
 * trickle capture counts for each run, and reports pass-rate, latency,
 * cost, and eval-score statistics. Exits the process with code 1 when
 * --fail-under-consistency is breached.
 *
 * Fixes vs. previous revision:
 *  - a non-numeric or non-positive --runs no longer yields an empty trial
 *    loop and a divide-by-zero NaN consistency; it falls back to 5 with a
 *    warning.
 *  - a NaN --fail-under-consistency threshold no longer silently passes
 *    every comparison; it is ignored as "no threshold".
 */
async function benchmarkCommand(command, opts) {
    if (!command) {
        console.log(chalk_1.default.yellow(' Usage: trickle benchmark "python my_agent.py" --runs 5'));
        return;
    }
    // Validate --runs before using it as a loop bound and divisor.
    const parsedRuns = parseInt(opts.runs || '5', 10);
    const numRuns = Number.isFinite(parsedRuns) && parsedRuns > 0 ? parsedRuns : 5;
    if (numRuns !== parsedRuns) {
        console.log(chalk_1.default.yellow(` Invalid --runs value "${opts.runs}"; using 5.`));
    }
    // Resolve the CI threshold once; NaN means the option is ignored.
    let threshold;
    if (opts.failUnderConsistency) {
        const t = parseInt(opts.failUnderConsistency, 10);
        if (Number.isFinite(t))
            threshold = t;
    }
    const baseDir = path.join(process.cwd(), '.trickle', 'benchmark');
    fs.mkdirSync(baseDir, { recursive: true });
    console.log('');
    console.log(chalk_1.default.bold(' trickle benchmark'));
    console.log(chalk_1.default.gray(' ' + '─'.repeat(60)));
    console.log(` Command: ${chalk_1.default.cyan(command)}`);
    console.log(` Runs: ${numRuns}`);
    console.log('');
    const results = [];
    for (let i = 1; i <= numRuns; i++) {
        const trialDir = path.join(baseDir, `run-${i}`);
        fs.mkdirSync(trialDir, { recursive: true });
        // Clear previous data so counts reflect only this trial.
        for (const f of fs.readdirSync(trialDir)) {
            if (f.endsWith('.jsonl') || f.endsWith('.json'))
                fs.unlinkSync(path.join(trialDir, f));
        }
        process.stdout.write(chalk_1.default.gray(` Run ${i}/${numRuns}... `));
        const { exitCode, durationMs } = await runTrial(command, trialDir);
        const functions = countLines(path.join(trialDir, 'observations.jsonl'));
        const variables = countLines(path.join(trialDir, 'variables.jsonl'));
        const errors = countLines(path.join(trialDir, 'errors.jsonl'));
        const llmCalls = countLines(path.join(trialDir, 'llm.jsonl'));
        const llmCost = Math.round(sumField(path.join(trialDir, 'llm.jsonl'), 'estimatedCostUsd') * 10000) / 10000;
        const llmTokens = sumField(path.join(trialDir, 'llm.jsonl'), 'totalTokens');
        const agentEvents = countLines(path.join(trialDir, 'agents.jsonl'));
        // Simple eval score: 100 if exit 0 and no errors, minus penalties
        const evalScore = Math.max(0, (exitCode === 0 ? 100 : 30) - errors * 15);
        results.push({ run: i, exitCode, durationMs, functions, variables, errors, llmCalls, llmCost, llmTokens, agentEvents, evalScore });
        const icon = exitCode === 0 ? chalk_1.default.green('✓') : chalk_1.default.red('✗');
        console.log(`${icon} ${durationMs}ms | ${functions} fn | ${errors} err | ${llmCalls} llm ($${llmCost})`);
    }
    // Compute statistics
    const passes = results.filter(r => r.exitCode === 0).length;
    const passAtK = passes > 0 ? 1 : 0; // At least 1 succeeds
    const passAllK = passes === numRuns ? 1 : 0; // All succeed
    const consistency = Math.round((passes / numRuns) * 100);
    const durations = results.map(r => r.durationMs);
    const costs = results.map(r => r.llmCost);
    const tokens = results.map(r => r.llmTokens);
    const scores = results.map(r => r.evalScore);
    const avg = (arr) => arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
    const stddev = (arr) => {
        const m = avg(arr);
        return Math.sqrt(arr.reduce((s, v) => s + (v - m) ** 2, 0) / Math.max(1, arr.length));
    };
    const min = (arr) => arr.length ? Math.min(...arr) : 0;
    const max = (arr) => arr.length ? Math.max(...arr) : 0;
    const report = {
        command, runs: numRuns,
        passRate: consistency,
        passAtK, passAllK,
        latency: { avg: Math.round(avg(durations)), stddev: Math.round(stddev(durations)), min: min(durations), max: max(durations) },
        cost: { total: Math.round(costs.reduce((a, b) => a + b, 0) * 10000) / 10000, avg: Math.round(avg(costs) * 10000) / 10000, stddev: Math.round(stddev(costs) * 10000) / 10000 },
        tokens: { total: tokens.reduce((a, b) => a + b, 0), avg: Math.round(avg(tokens)) },
        evalScore: { avg: Math.round(avg(scores)), min: min(scores), max: max(scores) },
        trials: results,
    };
    if (opts.json) {
        console.log(JSON.stringify(report, null, 2));
        if (threshold !== undefined && consistency < threshold)
            process.exit(1);
        return;
    }
    // Pretty print results
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    console.log(chalk_1.default.bold(' Results'));
    const grade = consistency >= 90 ? chalk_1.default.green('A') : consistency >= 70 ? chalk_1.default.yellow('B') :
        consistency >= 50 ? chalk_1.default.yellow('C') : chalk_1.default.red('F');
    console.log(` Consistency: ${grade} ${consistency}% (${passes}/${numRuns} passed)`);
    console.log(` pass@k: ${passAtK ? chalk_1.default.green('YES') : chalk_1.default.red('NO')} (at least 1 succeeds)`);
    console.log(` pass^k: ${passAllK ? chalk_1.default.green('YES') : chalk_1.default.red('NO')} (all succeed)`);
    console.log(chalk_1.default.gray('\n Latency'));
    console.log(` avg ${avg(durations).toFixed(0)}ms | stddev ${stddev(durations).toFixed(0)}ms | min ${min(durations)}ms | max ${max(durations)}ms`);
    if (costs.some(c => c > 0)) {
        console.log(chalk_1.default.gray('\n Cost'));
        console.log(` total $${report.cost.total} | avg $${report.cost.avg}/run | stddev $${report.cost.stddev}`);
        console.log(` tokens: ${report.tokens.total} total | ${report.tokens.avg} avg/run`);
    }
    console.log(chalk_1.default.gray('\n Eval Score'));
    console.log(` avg ${report.evalScore.avg}/100 | min ${report.evalScore.min} | max ${report.evalScore.max}`);
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    if (threshold !== undefined && consistency < threshold) {
        console.log(chalk_1.default.red(` FAIL: Consistency ${consistency}% below threshold ${threshold}%`));
        process.exit(1);
    }
    console.log('');
}
|
|
@@ -156,6 +156,8 @@ function costReportCommand(opts) {
|
|
|
156
156
|
['claude-opus', 'frontier'], ['claude-sonnet', 'standard'], ['claude-haiku', 'mini'],
|
|
157
157
|
['gemini-2.5-flash-lite', 'mini'], ['gemini-2.5-flash', 'standard'], ['gemini-2.5-pro', 'frontier'],
|
|
158
158
|
['gemini-2.0-flash', 'mini'], ['gemini-1.5-pro', 'frontier'], ['gemini-1.5-flash', 'mini'],
|
|
159
|
+
['mistral-large', 'frontier'], ['mistral-medium', 'standard'], ['mistral-small', 'mini'], ['codestral', 'standard'],
|
|
160
|
+
['command-r-plus', 'frontier'], ['command-r', 'standard'], ['command-light', 'mini'],
|
|
159
161
|
];
|
|
160
162
|
function classifyTier(model) {
|
|
161
163
|
for (const [pattern, tier] of TIER_RULES) {
|
package/dist/index.js
CHANGED
|
@@ -920,6 +920,17 @@ program
|
|
|
920
920
|
const { whyCommand } = await Promise.resolve().then(() => __importStar(require("./commands/why")));
|
|
921
921
|
whyCommand(query, opts);
|
|
922
922
|
});
|
|
923
|
+
// trickle benchmark
|
|
924
|
+
program
|
|
925
|
+
.command("benchmark [command...]")
|
|
926
|
+
.description("Multi-trial reliability testing — run N times, measure consistency, cost variance, pass@k")
|
|
927
|
+
.option("--runs <n>", "Number of trial runs (default: 5)")
|
|
928
|
+
.option("--json", "Output structured JSON")
|
|
929
|
+
.option("--fail-under-consistency <pct>", "Fail if consistency below threshold (0-100, for CI)")
|
|
930
|
+
.action(async (commandParts, opts) => {
|
|
931
|
+
const { benchmarkCommand } = await Promise.resolve().then(() => __importStar(require("./commands/benchmark")));
|
|
932
|
+
await benchmarkCommand(commandParts.length > 0 ? commandParts.join(' ') : undefined, opts);
|
|
933
|
+
});
|
|
923
934
|
// trickle playback
|
|
924
935
|
program
|
|
925
936
|
.command("playback")
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "trickle-cli",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.205",
|
|
4
4
|
"description": "Zero-code runtime observability for JS/Python + AI agent debugging. Traces LangChain, CrewAI, OpenAI, Anthropic, Gemini. Eval, security, compliance, cost tracking. Free, local-first.",
|
|
5
5
|
"keywords": ["observability", "tracing", "llm", "openai", "anthropic", "langchain", "crewai", "agent", "mcp", "debugging", "typescript", "python", "security", "eval", "compliance"],
|
|
6
6
|
"bin": {
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
3
|
+
*
|
|
4
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
5
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
6
|
+
*
|
|
7
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
8
|
+
* whether your agent gives consistent results across identical inputs.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as fs from 'fs';
|
|
12
|
+
import * as path from 'path';
|
|
13
|
+
import chalk from 'chalk';
|
|
14
|
+
import { spawn } from 'child_process';
|
|
15
|
+
|
|
16
|
+
interface TrialResult {
|
|
17
|
+
run: number;
|
|
18
|
+
exitCode: number;
|
|
19
|
+
durationMs: number;
|
|
20
|
+
functions: number;
|
|
21
|
+
variables: number;
|
|
22
|
+
errors: number;
|
|
23
|
+
llmCalls: number;
|
|
24
|
+
llmCost: number;
|
|
25
|
+
llmTokens: number;
|
|
26
|
+
agentEvents: number;
|
|
27
|
+
evalScore: number;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
function countLines(fp: string): number {
|
|
31
|
+
if (!fs.existsSync(fp)) return 0;
|
|
32
|
+
return fs.readFileSync(fp, 'utf-8').trim().split('\n').filter(Boolean).length;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
function sumField(fp: string, field: string): number {
|
|
36
|
+
if (!fs.existsSync(fp)) return 0;
|
|
37
|
+
return fs.readFileSync(fp, 'utf-8').split('\n').filter(Boolean)
|
|
38
|
+
.reduce((s, l) => { try { return s + (JSON.parse(l)[field] || 0); } catch { return s; } }, 0);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
async function runTrial(command: string, trialDir: string): Promise<{ exitCode: number; durationMs: number }> {
|
|
42
|
+
return new Promise((resolve) => {
|
|
43
|
+
const start = Date.now();
|
|
44
|
+
const env = { ...process.env, TRICKLE_LOCAL: '1', TRICKLE_LOCAL_DIR: trialDir };
|
|
45
|
+
const proc = spawn(command, [], { shell: true, env, stdio: 'pipe' });
|
|
46
|
+
proc.on('exit', (code) => resolve({ exitCode: code ?? 1, durationMs: Date.now() - start }));
|
|
47
|
+
proc.on('error', () => resolve({ exitCode: 1, durationMs: Date.now() - start }));
|
|
48
|
+
});
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
export async function benchmarkCommand(
|
|
52
|
+
command: string | undefined,
|
|
53
|
+
opts: { runs?: string; json?: boolean; failUnderConsistency?: string },
|
|
54
|
+
): Promise<void> {
|
|
55
|
+
if (!command) {
|
|
56
|
+
console.log(chalk.yellow(' Usage: trickle benchmark "python my_agent.py" --runs 5'));
|
|
57
|
+
return;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
const numRuns = parseInt(opts.runs || '5', 10);
|
|
61
|
+
const baseDir = path.join(process.cwd(), '.trickle', 'benchmark');
|
|
62
|
+
fs.mkdirSync(baseDir, { recursive: true });
|
|
63
|
+
|
|
64
|
+
console.log('');
|
|
65
|
+
console.log(chalk.bold(' trickle benchmark'));
|
|
66
|
+
console.log(chalk.gray(' ' + '─'.repeat(60)));
|
|
67
|
+
console.log(` Command: ${chalk.cyan(command)}`);
|
|
68
|
+
console.log(` Runs: ${numRuns}`);
|
|
69
|
+
console.log('');
|
|
70
|
+
|
|
71
|
+
const results: TrialResult[] = [];
|
|
72
|
+
|
|
73
|
+
for (let i = 1; i <= numRuns; i++) {
|
|
74
|
+
const trialDir = path.join(baseDir, `run-${i}`);
|
|
75
|
+
fs.mkdirSync(trialDir, { recursive: true });
|
|
76
|
+
// Clear previous data
|
|
77
|
+
for (const f of fs.readdirSync(trialDir)) {
|
|
78
|
+
if (f.endsWith('.jsonl') || f.endsWith('.json')) fs.unlinkSync(path.join(trialDir, f));
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
process.stdout.write(chalk.gray(` Run ${i}/${numRuns}... `));
|
|
82
|
+
const { exitCode, durationMs } = await runTrial(command, trialDir);
|
|
83
|
+
|
|
84
|
+
const functions = countLines(path.join(trialDir, 'observations.jsonl'));
|
|
85
|
+
const variables = countLines(path.join(trialDir, 'variables.jsonl'));
|
|
86
|
+
const errors = countLines(path.join(trialDir, 'errors.jsonl'));
|
|
87
|
+
const llmCalls = countLines(path.join(trialDir, 'llm.jsonl'));
|
|
88
|
+
const llmCost = Math.round(sumField(path.join(trialDir, 'llm.jsonl'), 'estimatedCostUsd') * 10000) / 10000;
|
|
89
|
+
const llmTokens = sumField(path.join(trialDir, 'llm.jsonl'), 'totalTokens');
|
|
90
|
+
const agentEvents = countLines(path.join(trialDir, 'agents.jsonl'));
|
|
91
|
+
|
|
92
|
+
// Simple eval score: 100 if exit 0 and no errors, minus penalties
|
|
93
|
+
const evalScore = Math.max(0, (exitCode === 0 ? 100 : 30) - errors * 15);
|
|
94
|
+
|
|
95
|
+
results.push({ run: i, exitCode, durationMs, functions, variables, errors, llmCalls, llmCost, llmTokens, agentEvents, evalScore });
|
|
96
|
+
|
|
97
|
+
const icon = exitCode === 0 ? chalk.green('✓') : chalk.red('✗');
|
|
98
|
+
console.log(`${icon} ${durationMs}ms | ${functions} fn | ${errors} err | ${llmCalls} llm ($${llmCost})`);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// Compute statistics
|
|
102
|
+
const passes = results.filter(r => r.exitCode === 0).length;
|
|
103
|
+
const passAtK = passes > 0 ? 1 : 0; // At least 1 succeeds
|
|
104
|
+
const passAllK = passes === numRuns ? 1 : 0; // All succeed
|
|
105
|
+
const consistency = Math.round((passes / numRuns) * 100);
|
|
106
|
+
|
|
107
|
+
const durations = results.map(r => r.durationMs);
|
|
108
|
+
const costs = results.map(r => r.llmCost);
|
|
109
|
+
const tokens = results.map(r => r.llmTokens);
|
|
110
|
+
const scores = results.map(r => r.evalScore);
|
|
111
|
+
|
|
112
|
+
const avg = (arr: number[]) => arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
|
|
113
|
+
const stddev = (arr: number[]) => {
|
|
114
|
+
const m = avg(arr);
|
|
115
|
+
return Math.sqrt(arr.reduce((s, v) => s + (v - m) ** 2, 0) / Math.max(1, arr.length));
|
|
116
|
+
};
|
|
117
|
+
const min = (arr: number[]) => arr.length ? Math.min(...arr) : 0;
|
|
118
|
+
const max = (arr: number[]) => arr.length ? Math.max(...arr) : 0;
|
|
119
|
+
|
|
120
|
+
const report = {
|
|
121
|
+
command, runs: numRuns,
|
|
122
|
+
passRate: consistency,
|
|
123
|
+
passAtK, passAllK,
|
|
124
|
+
latency: { avg: Math.round(avg(durations)), stddev: Math.round(stddev(durations)), min: min(durations), max: max(durations) },
|
|
125
|
+
cost: { total: Math.round(costs.reduce((a, b) => a + b, 0) * 10000) / 10000, avg: Math.round(avg(costs) * 10000) / 10000, stddev: Math.round(stddev(costs) * 10000) / 10000 },
|
|
126
|
+
tokens: { total: tokens.reduce((a, b) => a + b, 0), avg: Math.round(avg(tokens)) },
|
|
127
|
+
evalScore: { avg: Math.round(avg(scores)), min: min(scores), max: max(scores) },
|
|
128
|
+
trials: results,
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
if (opts.json) {
|
|
132
|
+
console.log(JSON.stringify(report, null, 2));
|
|
133
|
+
if (opts.failUnderConsistency) {
|
|
134
|
+
const threshold = parseInt(opts.failUnderConsistency, 10);
|
|
135
|
+
if (consistency < threshold) process.exit(1);
|
|
136
|
+
}
|
|
137
|
+
return;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Pretty print results
|
|
141
|
+
console.log(chalk.gray('\n ' + '─'.repeat(60)));
|
|
142
|
+
console.log(chalk.bold(' Results'));
|
|
143
|
+
|
|
144
|
+
const grade = consistency >= 90 ? chalk.green('A') : consistency >= 70 ? chalk.yellow('B') :
|
|
145
|
+
consistency >= 50 ? chalk.yellow('C') : chalk.red('F');
|
|
146
|
+
console.log(` Consistency: ${grade} ${consistency}% (${passes}/${numRuns} passed)`);
|
|
147
|
+
console.log(` pass@k: ${passAtK ? chalk.green('YES') : chalk.red('NO')} (at least 1 succeeds)`);
|
|
148
|
+
console.log(` pass^k: ${passAllK ? chalk.green('YES') : chalk.red('NO')} (all succeed)`);
|
|
149
|
+
|
|
150
|
+
console.log(chalk.gray('\n Latency'));
|
|
151
|
+
console.log(` avg ${avg(durations).toFixed(0)}ms | stddev ${stddev(durations).toFixed(0)}ms | min ${min(durations)}ms | max ${max(durations)}ms`);
|
|
152
|
+
|
|
153
|
+
if (costs.some(c => c > 0)) {
|
|
154
|
+
console.log(chalk.gray('\n Cost'));
|
|
155
|
+
console.log(` total $${report.cost.total} | avg $${report.cost.avg}/run | stddev $${report.cost.stddev}`);
|
|
156
|
+
console.log(` tokens: ${report.tokens.total} total | ${report.tokens.avg} avg/run`);
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
console.log(chalk.gray('\n Eval Score'));
|
|
160
|
+
console.log(` avg ${report.evalScore.avg}/100 | min ${report.evalScore.min} | max ${report.evalScore.max}`);
|
|
161
|
+
|
|
162
|
+
console.log(chalk.gray('\n ' + '─'.repeat(60)));
|
|
163
|
+
|
|
164
|
+
if (opts.failUnderConsistency) {
|
|
165
|
+
const threshold = parseInt(opts.failUnderConsistency, 10);
|
|
166
|
+
if (consistency < threshold) {
|
|
167
|
+
console.log(chalk.red(` FAIL: Consistency ${consistency}% below threshold ${threshold}%`));
|
|
168
|
+
process.exit(1);
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
console.log('');
|
|
173
|
+
}
|
|
@@ -131,6 +131,8 @@ export function costReportCommand(opts: { json?: boolean; budget?: string }): vo
|
|
|
131
131
|
['claude-opus', 'frontier'], ['claude-sonnet', 'standard'], ['claude-haiku', 'mini'],
|
|
132
132
|
['gemini-2.5-flash-lite', 'mini'], ['gemini-2.5-flash', 'standard'], ['gemini-2.5-pro', 'frontier'],
|
|
133
133
|
['gemini-2.0-flash', 'mini'], ['gemini-1.5-pro', 'frontier'], ['gemini-1.5-flash', 'mini'],
|
|
134
|
+
['mistral-large', 'frontier'], ['mistral-medium', 'standard'], ['mistral-small', 'mini'], ['codestral', 'standard'],
|
|
135
|
+
['command-r-plus', 'frontier'], ['command-r', 'standard'], ['command-light', 'mini'],
|
|
134
136
|
];
|
|
135
137
|
|
|
136
138
|
function classifyTier(model: string): string {
|
package/src/index.ts
CHANGED
|
@@ -953,6 +953,18 @@ program
|
|
|
953
953
|
whyCommand(query, opts);
|
|
954
954
|
});
|
|
955
955
|
|
|
956
|
+
// trickle benchmark
|
|
957
|
+
program
|
|
958
|
+
.command("benchmark [command...]")
|
|
959
|
+
.description("Multi-trial reliability testing — run N times, measure consistency, cost variance, pass@k")
|
|
960
|
+
.option("--runs <n>", "Number of trial runs (default: 5)")
|
|
961
|
+
.option("--json", "Output structured JSON")
|
|
962
|
+
.option("--fail-under-consistency <pct>", "Fail if consistency below threshold (0-100, for CI)")
|
|
963
|
+
.action(async (commandParts: string[], opts) => {
|
|
964
|
+
const { benchmarkCommand } = await import("./commands/benchmark");
|
|
965
|
+
await benchmarkCommand(commandParts.length > 0 ? commandParts.join(' ') : undefined, opts);
|
|
966
|
+
});
|
|
967
|
+
|
|
956
968
|
// trickle playback
|
|
957
969
|
program
|
|
958
970
|
.command("playback")
|