trickle-cli 0.1.204 → 0.1.206
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/benchmark.d.ts +14 -0
- package/dist/commands/benchmark.js +177 -0
- package/dist/commands/cost-report.js +26 -5
- package/dist/index.js +11 -0
- package/package.json +1 -1
- package/src/commands/benchmark.ts +173 -0
- package/src/commands/cost-report.ts +29 -5
- package/src/index.ts +12 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
3
|
+
*
|
|
4
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
5
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
6
|
+
*
|
|
7
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
8
|
+
* whether your agent gives consistent results across identical inputs.
|
|
9
|
+
*/
|
|
10
|
+
export declare function benchmarkCommand(
    command: string | undefined,
    opts: {
        /** Number of trials as the raw CLI string; parsed base-10, defaults to 5 when omitted. */
        runs?: string;
        /** When true, the report is printed as a JSON document instead of the formatted summary. */
        json?: boolean;
        /** Minimum pass-rate percentage; the process exits with code 1 when consistency falls below it. */
        failUnderConsistency?: string;
    }
): Promise<void>;
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
4
|
+
*
|
|
5
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
6
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
7
|
+
*
|
|
8
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
9
|
+
* whether your agent gives consistent results across identical inputs.
|
|
10
|
+
*/
|
|
11
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
12
|
+
if (k2 === undefined) k2 = k;
|
|
13
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
14
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
15
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
16
|
+
}
|
|
17
|
+
Object.defineProperty(o, k2, desc);
|
|
18
|
+
}) : (function(o, m, k, k2) {
|
|
19
|
+
if (k2 === undefined) k2 = k;
|
|
20
|
+
o[k2] = m[k];
|
|
21
|
+
}));
|
|
22
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
23
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
24
|
+
}) : function(o, v) {
|
|
25
|
+
o["default"] = v;
|
|
26
|
+
});
|
|
27
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
28
|
+
var ownKeys = function(o) {
|
|
29
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
30
|
+
var ar = [];
|
|
31
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
32
|
+
return ar;
|
|
33
|
+
};
|
|
34
|
+
return ownKeys(o);
|
|
35
|
+
};
|
|
36
|
+
return function (mod) {
|
|
37
|
+
if (mod && mod.__esModule) return mod;
|
|
38
|
+
var result = {};
|
|
39
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
40
|
+
__setModuleDefault(result, mod);
|
|
41
|
+
return result;
|
|
42
|
+
};
|
|
43
|
+
})();
|
|
44
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
45
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
46
|
+
};
|
|
47
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
48
|
+
exports.benchmarkCommand = benchmarkCommand;
|
|
49
|
+
const fs = __importStar(require("fs"));
|
|
50
|
+
const path = __importStar(require("path"));
|
|
51
|
+
const chalk_1 = __importDefault(require("chalk"));
|
|
52
|
+
const child_process_1 = require("child_process");
|
|
53
|
+
/**
 * Count the non-empty lines of a text file.
 *
 * @param fp Path of the file to inspect.
 * @returns 0 when the file does not exist; otherwise the number of
 *   non-empty lines left after trimming the file's surrounding whitespace.
 */
function countLines(fp) {
    if (!fs.existsSync(fp)) {
        return 0;
    }
    const text = fs.readFileSync(fp, 'utf-8').trim();
    let count = 0;
    for (const line of text.split('\n')) {
        // Empty strings are blank lines (or the whole of an empty file).
        if (line) {
            count += 1;
        }
    }
    return count;
}
|
|
58
|
+
/**
 * Sum one numeric field across the records of a JSON-lines file.
 *
 * Lines that are empty, are not valid JSON, or do not parse to an object
 * are skipped. A record contributes only when its `field` value is a
 * finite number, so a stray string value can no longer turn the running
 * total into a concatenated string.
 *
 * @param fp Path of the JSON-lines file.
 * @param field Property name to add up.
 * @returns The numeric total; 0 when the file does not exist.
 */
function sumField(fp, field) {
    if (!fs.existsSync(fp))
        return 0;
    let total = 0;
    for (const line of fs.readFileSync(fp, 'utf-8').split('\n')) {
        if (!line)
            continue;
        let record;
        try {
            record = JSON.parse(line);
        }
        catch {
            // Best effort: a corrupt line must not abort the whole tally.
            continue;
        }
        const value = record !== null && typeof record === 'object' ? record[field] : undefined;
        if (typeof value === 'number' && Number.isFinite(value))
            total += value;
    }
    return total;
}
|
|
69
|
+
/**
 * Run one benchmark trial: execute `command` through the shell and time it.
 *
 * The child inherits the current environment plus TRICKLE_LOCAL=1 and
 * TRICKLE_LOCAL_DIR=<trialDir>.
 *
 * The child's stdio is ignored rather than piped. Nothing here ever reads
 * the child's output, and an unread pipe fills up and blocks a chatty child
 * indefinitely, which would hang the whole benchmark.
 *
 * @param command Shell command line to execute.
 * @param trialDir Directory exported to the child as TRICKLE_LOCAL_DIR.
 * @returns Resolves (never rejects) with the exit code and wall-clock
 *   duration in milliseconds. The exit code is 1 when the process could not
 *   be spawned or ended without a numeric code (e.g. killed by a signal).
 */
async function runTrial(command, trialDir) {
    return new Promise((resolve) => {
        const start = Date.now();
        const env = { ...process.env, TRICKLE_LOCAL: '1', TRICKLE_LOCAL_DIR: trialDir };
        const proc = (0, child_process_1.spawn)(command, [], { shell: true, env, stdio: 'ignore' });
        // Both events may fire for one child; only the first resolve() counts.
        proc.on('exit', (code) => resolve({ exitCode: code ?? 1, durationMs: Date.now() - start }));
        proc.on('error', () => resolve({ exitCode: 1, durationMs: Date.now() - start }));
    });
}
|
|
78
|
+
/**
 * Run `command` several times and report how consistently it succeeds.
 *
 * Each trial gets its own `.trickle/benchmark/run-<i>` directory under the
 * current working directory; leftover `.json`/`.jsonl` files in it are
 * removed before the trial starts. After each trial the trickle data files
 * written there are tallied (line counts, summed LLM cost and tokens).
 *
 * Differences from a naive implementation, all deliberate:
 *  - `--runs` must parse to an integer >= 1, otherwise the pass rate would
 *    be a division by zero / NaN.
 *  - `--fail-under-consistency` must parse to a number, otherwise the CI
 *    gate would silently never trigger. Both options are validated before
 *    any trial runs.
 *  - With `opts.json` only the JSON report is written to stdout, so the
 *    output can be piped straight into a JSON parser.
 *
 * @param command Shell command to benchmark. When missing, a usage hint is
 *   printed and nothing is run.
 * @param opts.runs Number of trials as a string (default "5").
 * @param opts.json Emit the report as JSON instead of the formatted summary.
 * @param opts.failUnderConsistency Minimum pass-rate percentage; the process
 *   exits with code 1 when the measured consistency is lower.
 */
async function benchmarkCommand(command, opts) {
    if (!command) {
        console.log(chalk_1.default.yellow(' Usage: trickle benchmark "python my_agent.py" --runs 5'));
        return;
    }
    const numRuns = parseInt(opts.runs || '5', 10);
    if (Number.isNaN(numRuns) || numRuns < 1) {
        console.error(chalk_1.default.red(` Invalid --runs value "${opts.runs}": expected a positive integer`));
        process.exitCode = 1;
        return;
    }
    let threshold;
    if (opts.failUnderConsistency) {
        threshold = parseInt(opts.failUnderConsistency, 10);
        if (Number.isNaN(threshold)) {
            console.error(chalk_1.default.red(` Invalid --fail-under-consistency value "${opts.failUnderConsistency}": expected a number (0-100)`));
            process.exitCode = 1;
            return;
        }
    }
    // In JSON mode stdout must contain nothing but the report.
    const humanOutput = !opts.json;
    const baseDir = path.join(process.cwd(), '.trickle', 'benchmark');
    fs.mkdirSync(baseDir, { recursive: true });
    if (humanOutput) {
        console.log('');
        console.log(chalk_1.default.bold(' trickle benchmark'));
        console.log(chalk_1.default.gray(' ' + '─'.repeat(60)));
        console.log(` Command: ${chalk_1.default.cyan(command)}`);
        console.log(` Runs: ${numRuns}`);
        console.log('');
    }
    const results = [];
    for (let i = 1; i <= numRuns; i++) {
        const trialDir = path.join(baseDir, `run-${i}`);
        fs.mkdirSync(trialDir, { recursive: true });
        // Clear data left behind by a previous benchmark so counts are per-run.
        for (const f of fs.readdirSync(trialDir)) {
            if (f.endsWith('.jsonl') || f.endsWith('.json'))
                fs.unlinkSync(path.join(trialDir, f));
        }
        if (humanOutput)
            process.stdout.write(chalk_1.default.gray(` Run ${i}/${numRuns}... `));
        const { exitCode, durationMs } = await runTrial(command, trialDir);
        const llmFile = path.join(trialDir, 'llm.jsonl');
        const functions = countLines(path.join(trialDir, 'observations.jsonl'));
        const variables = countLines(path.join(trialDir, 'variables.jsonl'));
        const errors = countLines(path.join(trialDir, 'errors.jsonl'));
        const llmCalls = countLines(llmFile);
        // Round the cost to 4 decimal places.
        const llmCost = Math.round(sumField(llmFile, 'estimatedCostUsd') * 10000) / 10000;
        const llmTokens = sumField(llmFile, 'totalTokens');
        const agentEvents = countLines(path.join(trialDir, 'agents.jsonl'));
        // Simple eval score: 100 for exit 0 (30 otherwise), minus 15 per recorded error, floored at 0.
        const evalScore = Math.max(0, (exitCode === 0 ? 100 : 30) - errors * 15);
        results.push({ run: i, exitCode, durationMs, functions, variables, errors, llmCalls, llmCost, llmTokens, agentEvents, evalScore });
        if (humanOutput) {
            const icon = exitCode === 0 ? chalk_1.default.green('✓') : chalk_1.default.red('✗');
            console.log(`${icon} ${durationMs}ms | ${functions} fn | ${errors} err | ${llmCalls} llm ($${llmCost})`);
        }
    }
    // Compute statistics
    const passes = results.filter(r => r.exitCode === 0).length;
    const passAtK = passes > 0 ? 1 : 0; // At least 1 succeeds
    const passAllK = passes === numRuns ? 1 : 0; // All succeed
    const consistency = Math.round((passes / numRuns) * 100);
    const durations = results.map(r => r.durationMs);
    const costs = results.map(r => r.llmCost);
    const tokens = results.map(r => r.llmTokens);
    const scores = results.map(r => r.evalScore);
    const avg = (arr) => arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
    // Population standard deviation (divides by n).
    const stddev = (arr) => {
        const m = avg(arr);
        return Math.sqrt(arr.reduce((s, v) => s + (v - m) ** 2, 0) / Math.max(1, arr.length));
    };
    const min = (arr) => arr.length ? Math.min(...arr) : 0;
    const max = (arr) => arr.length ? Math.max(...arr) : 0;
    const report = {
        command, runs: numRuns,
        passRate: consistency,
        passAtK, passAllK,
        latency: { avg: Math.round(avg(durations)), stddev: Math.round(stddev(durations)), min: min(durations), max: max(durations) },
        cost: { total: Math.round(costs.reduce((a, b) => a + b, 0) * 10000) / 10000, avg: Math.round(avg(costs) * 10000) / 10000, stddev: Math.round(stddev(costs) * 10000) / 10000 },
        tokens: { total: tokens.reduce((a, b) => a + b, 0), avg: Math.round(avg(tokens)) },
        evalScore: { avg: Math.round(avg(scores)), min: min(scores), max: max(scores) },
        trials: results,
    };
    if (opts.json) {
        console.log(JSON.stringify(report, null, 2));
        if (threshold !== undefined && consistency < threshold)
            process.exit(1);
        return;
    }
    // Pretty print results
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    console.log(chalk_1.default.bold(' Results'));
    const grade = consistency >= 90 ? chalk_1.default.green('A') : consistency >= 70 ? chalk_1.default.yellow('B') :
        consistency >= 50 ? chalk_1.default.yellow('C') : chalk_1.default.red('F');
    console.log(` Consistency: ${grade} ${consistency}% (${passes}/${numRuns} passed)`);
    console.log(` pass@k: ${passAtK ? chalk_1.default.green('YES') : chalk_1.default.red('NO')} (at least 1 succeeds)`);
    console.log(` pass^k: ${passAllK ? chalk_1.default.green('YES') : chalk_1.default.red('NO')} (all succeed)`);
    console.log(chalk_1.default.gray('\n Latency'));
    console.log(` avg ${avg(durations).toFixed(0)}ms | stddev ${stddev(durations).toFixed(0)}ms | min ${min(durations)}ms | max ${max(durations)}ms`);
    // The cost section is only shown when at least one run reported a cost.
    if (costs.some(c => c > 0)) {
        console.log(chalk_1.default.gray('\n Cost'));
        console.log(` total $${report.cost.total} | avg $${report.cost.avg}/run | stddev $${report.cost.stddev}`);
        console.log(` tokens: ${report.tokens.total} total | ${report.tokens.avg} avg/run`);
    }
    console.log(chalk_1.default.gray('\n Eval Score'));
    console.log(` avg ${report.evalScore.avg}/100 | min ${report.evalScore.min} | max ${report.evalScore.max}`);
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    if (threshold !== undefined && consistency < threshold) {
        console.log(chalk_1.default.red(` FAIL: Consistency ${consistency}% below threshold ${threshold}%`));
        process.exit(1);
    }
    console.log('');
}
|
|
@@ -301,12 +301,33 @@ function costReportCommand(opts) {
|
|
|
301
301
|
}
|
|
302
302
|
}
|
|
303
303
|
}
|
|
304
|
-
|
|
304
|
+
// Provider-reported cache tokens (Anthropic cache_read/cache_creation)
|
|
305
|
+
const cacheReadTotal = calls.reduce((s, c) => s + (c.cacheReadTokens || 0), 0);
|
|
306
|
+
const cacheWriteTotal = calls.reduce((s, c) => s + (c.cacheWriteTokens || 0), 0);
|
|
307
|
+
const callsWithCache = calls.filter((c) => c.cacheReadTokens > 0 || c.cacheWriteTokens > 0);
|
|
308
|
+
if (callsWithCache.length > 0 || cacheDetected) {
|
|
305
309
|
console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
|
|
306
|
-
console.log(chalk_1.default.bold(' Cache Analysis')
|
|
307
|
-
|
|
308
|
-
const
|
|
309
|
-
|
|
310
|
+
console.log(chalk_1.default.bold(' Cache Analysis'));
|
|
311
|
+
if (callsWithCache.length > 0) {
|
|
312
|
+
const cacheHitCalls = calls.filter((c) => c.cacheReadTokens > 0);
|
|
313
|
+
const hitRate = calls.length > 0 ? Math.round((cacheHitCalls.length / calls.length) * 100) : 0;
|
|
314
|
+
// Estimate savings: cached tokens cost ~90% less
|
|
315
|
+
const savedTokens = cacheReadTotal;
|
|
316
|
+
const avgInputPrice = totalCost > 0 && totalTokens > 0 ? (totalCost / totalTokens) : 0.000003;
|
|
317
|
+
const estimatedSavings = savedTokens * avgInputPrice * 0.9;
|
|
318
|
+
console.log(chalk_1.default.gray(' Provider-reported cache tokens:'));
|
|
319
|
+
console.log(` Hit rate: ${chalk_1.default.green(hitRate + '%')} (${cacheHitCalls.length}/${calls.length} calls used cache)`);
|
|
320
|
+
console.log(` Cache read: ${formatTokens(cacheReadTotal)} tokens | Cache write: ${formatTokens(cacheWriteTotal)} tokens`);
|
|
321
|
+
if (estimatedSavings > 0) {
|
|
322
|
+
console.log(` Estimated savings: ${chalk_1.default.green('~$' + estimatedSavings.toFixed(4))} from cached tokens`);
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
if (cacheDetected) {
|
|
326
|
+
console.log(chalk_1.default.gray(' Latency-based detection:'));
|
|
327
|
+
for (const ca of cacheAnalysis) {
|
|
328
|
+
const speedup = (ca.slowAvg / Math.max(1, ca.fastAvg)).toFixed(0);
|
|
329
|
+
console.log(` ${chalk_1.default.cyan(ca.model.padEnd(25))} hit rate: ${chalk_1.default.green(ca.hitRate + '%')} (${ca.fastCalls} fast, ${ca.slowCalls} slow) ${speedup}x speedup`);
|
|
330
|
+
}
|
|
310
331
|
}
|
|
311
332
|
}
|
|
312
333
|
}
|
package/dist/index.js
CHANGED
|
@@ -920,6 +920,17 @@ program
|
|
|
920
920
|
const { whyCommand } = await Promise.resolve().then(() => __importStar(require("./commands/why")));
|
|
921
921
|
whyCommand(query, opts);
|
|
922
922
|
});
|
|
923
|
+
// trickle benchmark
|
|
924
|
+
program
    .command("benchmark [command...]")
    .description("Multi-trial reliability testing — run N times, measure consistency, cost variance, pass@k")
    .option("--runs <n>", "Number of trial runs (default: 5)")
    .option("--json", "Output structured JSON")
    .option("--fail-under-consistency <pct>", "Fail if consistency below threshold (0-100, for CI)")
    .action(async (commandParts, opts) => {
    // The command module is loaded only when this subcommand actually runs.
    const { benchmarkCommand } = await Promise.resolve().then(() => __importStar(require("./commands/benchmark")));
    // The variadic words are re-joined into one shell command line; no words means "no command".
    const shellCommand = commandParts.length === 0 ? undefined : commandParts.join(' ');
    await benchmarkCommand(shellCommand, opts);
});
|
|
923
934
|
// trickle playback
|
|
924
935
|
program
|
|
925
936
|
.command("playback")
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "trickle-cli",
|
|
3
|
-
"version": "0.1.204",
|
|
3
|
+
"version": "0.1.206",
|
|
4
4
|
"description": "Zero-code runtime observability for JS/Python + AI agent debugging. Traces LangChain, CrewAI, OpenAI, Anthropic, Gemini. Eval, security, compliance, cost tracking. Free, local-first.",
|
|
5
5
|
"keywords": ["observability", "tracing", "llm", "openai", "anthropic", "langchain", "crewai", "agent", "mcp", "debugging", "typescript", "python", "security", "eval", "compliance"],
|
|
6
6
|
"bin": {
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
3
|
+
*
|
|
4
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
5
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
6
|
+
*
|
|
7
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
8
|
+
* whether your agent gives consistent results across identical inputs.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as fs from 'fs';
|
|
12
|
+
import * as path from 'path';
|
|
13
|
+
import chalk from 'chalk';
|
|
14
|
+
import { spawn } from 'child_process';
|
|
15
|
+
|
|
16
|
+
/** Measurements recorded for a single benchmark trial. */
interface TrialResult {
  /** 1-based index of the trial. */
  run: number;
  /** Exit code of the benchmarked command (1 when it could not be determined). */
  exitCode: number;
  /** Wall-clock duration of the trial in milliseconds. */
  durationMs: number;

  /** Non-empty line count of the trial's `observations.jsonl`. */
  functions: number;
  /** Non-empty line count of the trial's `variables.jsonl`. */
  variables: number;
  /** Non-empty line count of the trial's `errors.jsonl`. */
  errors: number;
  /** Non-empty line count of the trial's `agents.jsonl`. */
  agentEvents: number;

  /** Non-empty line count of the trial's `llm.jsonl`. */
  llmCalls: number;
  /** Sum of `estimatedCostUsd` over `llm.jsonl`, rounded to 4 decimal places. */
  llmCost: number;
  /** Sum of `totalTokens` over `llm.jsonl`. */
  llmTokens: number;

  /** Heuristic 0-100 score derived from the exit code and the error count. */
  evalScore: number;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Count the non-empty lines of a text file.
 *
 * @param fp Path of the file to inspect.
 * @returns 0 when the file does not exist; otherwise the number of
 *   non-empty lines left after trimming the file's surrounding whitespace.
 */
function countLines(fp: string): number {
  if (!fs.existsSync(fp)) {
    return 0;
  }
  const text = fs.readFileSync(fp, 'utf-8').trim();
  let count = 0;
  for (const line of text.split('\n')) {
    // Empty strings are blank lines (or the whole of an empty file).
    if (line) {
      count += 1;
    }
  }
  return count;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Sum one numeric field across the records of a JSON-lines file.
 *
 * Lines that are empty, are not valid JSON, or do not parse to an object
 * are skipped. A record contributes only when its `field` value is a
 * finite number, so a stray string value can no longer turn the running
 * total into a concatenated string.
 *
 * @param fp Path of the JSON-lines file.
 * @param field Property name to add up.
 * @returns The numeric total; 0 when the file does not exist.
 */
function sumField(fp: string, field: string): number {
  if (!fs.existsSync(fp)) return 0;

  let total = 0;
  for (const line of fs.readFileSync(fp, 'utf-8').split('\n')) {
    if (!line) continue;

    let record: unknown;
    try {
      record = JSON.parse(line);
    } catch {
      // Best effort: a corrupt line must not abort the whole tally.
      continue;
    }

    const value =
      typeof record === 'object' && record !== null
        ? (record as Record<string, unknown>)[field]
        : undefined;
    if (typeof value === 'number' && Number.isFinite(value)) {
      total += value;
    }
  }
  return total;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Run one benchmark trial: execute `command` through the shell and time it.
 *
 * The child inherits the current environment plus TRICKLE_LOCAL=1 and
 * TRICKLE_LOCAL_DIR=<trialDir>.
 *
 * The child's stdio is ignored rather than piped. Nothing here ever reads
 * the child's output, and an unread pipe fills up and blocks a chatty child
 * indefinitely, which would hang the whole benchmark.
 *
 * @param command Shell command line to execute.
 * @param trialDir Directory exported to the child as TRICKLE_LOCAL_DIR.
 * @returns Resolves (never rejects) with the exit code and wall-clock
 *   duration in milliseconds. The exit code is 1 when the process could not
 *   be spawned or ended without a numeric code (e.g. killed by a signal).
 */
async function runTrial(command: string, trialDir: string): Promise<{ exitCode: number; durationMs: number }> {
  return new Promise((resolve) => {
    const start = Date.now();
    const env = { ...process.env, TRICKLE_LOCAL: '1', TRICKLE_LOCAL_DIR: trialDir };
    const proc = spawn(command, [], { shell: true, env, stdio: 'ignore' });
    // Both events may fire for one child; only the first resolve() counts.
    proc.on('exit', (code) => resolve({ exitCode: code ?? 1, durationMs: Date.now() - start }));
    proc.on('error', () => resolve({ exitCode: 1, durationMs: Date.now() - start }));
  });
}
|
|
50
|
+
|
|
51
|
+
/**
 * Run `command` several times and report how consistently it succeeds.
 *
 * Each trial gets its own `.trickle/benchmark/run-<i>` directory under the
 * current working directory; leftover `.json`/`.jsonl` files in it are
 * removed before the trial starts. After each trial the trickle data files
 * written there are tallied (line counts, summed LLM cost and tokens).
 *
 * Differences from a naive implementation, all deliberate:
 *  - `--runs` must parse to an integer >= 1, otherwise the pass rate would
 *    be a division by zero / NaN.
 *  - `--fail-under-consistency` must parse to a number, otherwise the CI
 *    gate would silently never trigger. Both options are validated before
 *    any trial runs.
 *  - With `opts.json` only the JSON report is written to stdout, so the
 *    output can be piped straight into a JSON parser.
 *
 * @param command Shell command to benchmark. When missing, a usage hint is
 *   printed and nothing is run.
 * @param opts.runs Number of trials as a string (default "5").
 * @param opts.json Emit the report as JSON instead of the formatted summary.
 * @param opts.failUnderConsistency Minimum pass-rate percentage; the process
 *   exits with code 1 when the measured consistency is lower.
 */
export async function benchmarkCommand(
  command: string | undefined,
  opts: { runs?: string; json?: boolean; failUnderConsistency?: string },
): Promise<void> {
  if (!command) {
    console.log(chalk.yellow(' Usage: trickle benchmark "python my_agent.py" --runs 5'));
    return;
  }

  const numRuns = parseInt(opts.runs || '5', 10);
  if (Number.isNaN(numRuns) || numRuns < 1) {
    console.error(chalk.red(` Invalid --runs value "${opts.runs}": expected a positive integer`));
    process.exitCode = 1;
    return;
  }

  let threshold: number | undefined;
  if (opts.failUnderConsistency) {
    threshold = parseInt(opts.failUnderConsistency, 10);
    if (Number.isNaN(threshold)) {
      console.error(chalk.red(` Invalid --fail-under-consistency value "${opts.failUnderConsistency}": expected a number (0-100)`));
      process.exitCode = 1;
      return;
    }
  }

  // In JSON mode stdout must contain nothing but the report.
  const humanOutput = !opts.json;

  const baseDir = path.join(process.cwd(), '.trickle', 'benchmark');
  fs.mkdirSync(baseDir, { recursive: true });

  if (humanOutput) {
    console.log('');
    console.log(chalk.bold(' trickle benchmark'));
    console.log(chalk.gray(' ' + '─'.repeat(60)));
    console.log(` Command: ${chalk.cyan(command)}`);
    console.log(` Runs: ${numRuns}`);
    console.log('');
  }

  const results: TrialResult[] = [];

  for (let i = 1; i <= numRuns; i++) {
    const trialDir = path.join(baseDir, `run-${i}`);
    fs.mkdirSync(trialDir, { recursive: true });
    // Clear data left behind by a previous benchmark so counts are per-run.
    for (const f of fs.readdirSync(trialDir)) {
      if (f.endsWith('.jsonl') || f.endsWith('.json')) fs.unlinkSync(path.join(trialDir, f));
    }

    if (humanOutput) process.stdout.write(chalk.gray(` Run ${i}/${numRuns}... `));
    const { exitCode, durationMs } = await runTrial(command, trialDir);

    const llmFile = path.join(trialDir, 'llm.jsonl');
    const functions = countLines(path.join(trialDir, 'observations.jsonl'));
    const variables = countLines(path.join(trialDir, 'variables.jsonl'));
    const errors = countLines(path.join(trialDir, 'errors.jsonl'));
    const llmCalls = countLines(llmFile);
    // Round the cost to 4 decimal places.
    const llmCost = Math.round(sumField(llmFile, 'estimatedCostUsd') * 10000) / 10000;
    const llmTokens = sumField(llmFile, 'totalTokens');
    const agentEvents = countLines(path.join(trialDir, 'agents.jsonl'));

    // Simple eval score: 100 for exit 0 (30 otherwise), minus 15 per recorded error, floored at 0.
    const evalScore = Math.max(0, (exitCode === 0 ? 100 : 30) - errors * 15);

    results.push({ run: i, exitCode, durationMs, functions, variables, errors, llmCalls, llmCost, llmTokens, agentEvents, evalScore });

    if (humanOutput) {
      const icon = exitCode === 0 ? chalk.green('✓') : chalk.red('✗');
      console.log(`${icon} ${durationMs}ms | ${functions} fn | ${errors} err | ${llmCalls} llm ($${llmCost})`);
    }
  }

  // Compute statistics
  const passes = results.filter(r => r.exitCode === 0).length;
  const passAtK = passes > 0 ? 1 : 0; // At least 1 succeeds
  const passAllK = passes === numRuns ? 1 : 0; // All succeed
  const consistency = Math.round((passes / numRuns) * 100);

  const durations = results.map(r => r.durationMs);
  const costs = results.map(r => r.llmCost);
  const tokens = results.map(r => r.llmTokens);
  const scores = results.map(r => r.evalScore);

  const avg = (arr: number[]) => arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
  // Population standard deviation (divides by n).
  const stddev = (arr: number[]) => {
    const m = avg(arr);
    return Math.sqrt(arr.reduce((s, v) => s + (v - m) ** 2, 0) / Math.max(1, arr.length));
  };
  const min = (arr: number[]) => arr.length ? Math.min(...arr) : 0;
  const max = (arr: number[]) => arr.length ? Math.max(...arr) : 0;

  const report = {
    command, runs: numRuns,
    passRate: consistency,
    passAtK, passAllK,
    latency: { avg: Math.round(avg(durations)), stddev: Math.round(stddev(durations)), min: min(durations), max: max(durations) },
    cost: { total: Math.round(costs.reduce((a, b) => a + b, 0) * 10000) / 10000, avg: Math.round(avg(costs) * 10000) / 10000, stddev: Math.round(stddev(costs) * 10000) / 10000 },
    tokens: { total: tokens.reduce((a, b) => a + b, 0), avg: Math.round(avg(tokens)) },
    evalScore: { avg: Math.round(avg(scores)), min: min(scores), max: max(scores) },
    trials: results,
  };

  if (opts.json) {
    console.log(JSON.stringify(report, null, 2));
    if (threshold !== undefined && consistency < threshold) process.exit(1);
    return;
  }

  // Pretty print results
  console.log(chalk.gray('\n ' + '─'.repeat(60)));
  console.log(chalk.bold(' Results'));

  const grade = consistency >= 90 ? chalk.green('A') : consistency >= 70 ? chalk.yellow('B') :
    consistency >= 50 ? chalk.yellow('C') : chalk.red('F');
  console.log(` Consistency: ${grade} ${consistency}% (${passes}/${numRuns} passed)`);
  console.log(` pass@k: ${passAtK ? chalk.green('YES') : chalk.red('NO')} (at least 1 succeeds)`);
  console.log(` pass^k: ${passAllK ? chalk.green('YES') : chalk.red('NO')} (all succeed)`);

  console.log(chalk.gray('\n Latency'));
  console.log(` avg ${avg(durations).toFixed(0)}ms | stddev ${stddev(durations).toFixed(0)}ms | min ${min(durations)}ms | max ${max(durations)}ms`);

  // The cost section is only shown when at least one run reported a cost.
  if (costs.some(c => c > 0)) {
    console.log(chalk.gray('\n Cost'));
    console.log(` total $${report.cost.total} | avg $${report.cost.avg}/run | stddev $${report.cost.stddev}`);
    console.log(` tokens: ${report.tokens.total} total | ${report.tokens.avg} avg/run`);
  }

  console.log(chalk.gray('\n Eval Score'));
  console.log(` avg ${report.evalScore.avg}/100 | min ${report.evalScore.min} | max ${report.evalScore.max}`);

  console.log(chalk.gray('\n ' + '─'.repeat(60)));

  if (threshold !== undefined && consistency < threshold) {
    console.log(chalk.red(` FAIL: Consistency ${consistency}% below threshold ${threshold}%`));
    process.exit(1);
  }

  console.log('');
}
|
|
@@ -281,12 +281,36 @@ export function costReportCommand(opts: { json?: boolean; budget?: string }): vo
|
|
|
281
281
|
}
|
|
282
282
|
}
|
|
283
283
|
|
|
284
|
-
|
|
284
|
+
// Provider-reported cache tokens (Anthropic cache_read/cache_creation)
|
|
285
|
+
const cacheReadTotal = calls.reduce((s: number, c: any) => s + (c.cacheReadTokens || 0), 0);
|
|
286
|
+
const cacheWriteTotal = calls.reduce((s: number, c: any) => s + (c.cacheWriteTokens || 0), 0);
|
|
287
|
+
const callsWithCache = calls.filter((c: any) => c.cacheReadTokens > 0 || c.cacheWriteTokens > 0);
|
|
288
|
+
|
|
289
|
+
if (callsWithCache.length > 0 || cacheDetected) {
|
|
285
290
|
console.log(chalk.gray('\n ' + '─'.repeat(60)));
|
|
286
|
-
console.log(chalk.bold(' Cache Analysis')
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
291
|
+
console.log(chalk.bold(' Cache Analysis'));
|
|
292
|
+
|
|
293
|
+
if (callsWithCache.length > 0) {
|
|
294
|
+
const cacheHitCalls = calls.filter((c: any) => c.cacheReadTokens > 0);
|
|
295
|
+
const hitRate = calls.length > 0 ? Math.round((cacheHitCalls.length / calls.length) * 100) : 0;
|
|
296
|
+
// Estimate savings: cached tokens cost ~90% less
|
|
297
|
+
const savedTokens = cacheReadTotal;
|
|
298
|
+
const avgInputPrice = totalCost > 0 && totalTokens > 0 ? (totalCost / totalTokens) : 0.000003;
|
|
299
|
+
const estimatedSavings = savedTokens * avgInputPrice * 0.9;
|
|
300
|
+
console.log(chalk.gray(' Provider-reported cache tokens:'));
|
|
301
|
+
console.log(` Hit rate: ${chalk.green(hitRate + '%')} (${cacheHitCalls.length}/${calls.length} calls used cache)`);
|
|
302
|
+
console.log(` Cache read: ${formatTokens(cacheReadTotal)} tokens | Cache write: ${formatTokens(cacheWriteTotal)} tokens`);
|
|
303
|
+
if (estimatedSavings > 0) {
|
|
304
|
+
console.log(` Estimated savings: ${chalk.green('~$' + estimatedSavings.toFixed(4))} from cached tokens`);
|
|
305
|
+
}
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
if (cacheDetected) {
|
|
309
|
+
console.log(chalk.gray(' Latency-based detection:'));
|
|
310
|
+
for (const ca of cacheAnalysis) {
|
|
311
|
+
const speedup = (ca.slowAvg / Math.max(1, ca.fastAvg)).toFixed(0);
|
|
312
|
+
console.log(` ${chalk.cyan(ca.model.padEnd(25))} hit rate: ${chalk.green(ca.hitRate + '%')} (${ca.fastCalls} fast, ${ca.slowCalls} slow) ${speedup}x speedup`);
|
|
313
|
+
}
|
|
290
314
|
}
|
|
291
315
|
}
|
|
292
316
|
}
|
package/src/index.ts
CHANGED
|
@@ -953,6 +953,18 @@ program
|
|
|
953
953
|
whyCommand(query, opts);
|
|
954
954
|
});
|
|
955
955
|
|
|
956
|
+
// trickle benchmark
|
|
957
|
+
program
  .command("benchmark [command...]")
  .description("Multi-trial reliability testing — run N times, measure consistency, cost variance, pass@k")
  .option("--runs <n>", "Number of trial runs (default: 5)")
  .option("--json", "Output structured JSON")
  .option("--fail-under-consistency <pct>", "Fail if consistency below threshold (0-100, for CI)")
  .action(async (commandParts: string[], opts) => {
    // The command module is loaded only when this subcommand actually runs.
    const { benchmarkCommand } = await import("./commands/benchmark");
    // The variadic words are re-joined into one shell command line; no words means "no command".
    const shellCommand = commandParts.length === 0 ? undefined : commandParts.join(' ');
    await benchmarkCommand(shellCommand, opts);
  });
|
|
967
|
+
|
|
956
968
|
// trickle playback
|
|
957
969
|
program
|
|
958
970
|
.command("playback")
|