trickle-cli 0.1.203 → 0.1.205

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ /**
2
+ * trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
3
+ *
4
+ * Runs the same command N times, captures trickle data for each run,
5
+ * and reports variance: pass@k, consistency, cost/latency distribution.
6
+ *
7
+ * 85% per-step accuracy compounds to 20% on 10 steps — this measures
8
+ * whether your agent gives consistent results across identical inputs.
9
+ */
10
+ export declare function benchmarkCommand(command: string | undefined, opts: {
11
+ runs?: string;
12
+ json?: boolean;
13
+ failUnderConsistency?: string;
14
+ }): Promise<void>;
@@ -0,0 +1,177 @@
1
+ "use strict";
2
+ /**
3
+ * trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
4
+ *
5
+ * Runs the same command N times, captures trickle data for each run,
6
+ * and reports variance: pass@k, consistency, cost/latency distribution.
7
+ *
8
+ * 85% per-step accuracy compounds to 20% on 10 steps — this measures
9
+ * whether your agent gives consistent results across identical inputs.
10
+ */
11
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
12
+ if (k2 === undefined) k2 = k;
13
+ var desc = Object.getOwnPropertyDescriptor(m, k);
14
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
15
+ desc = { enumerable: true, get: function() { return m[k]; } };
16
+ }
17
+ Object.defineProperty(o, k2, desc);
18
+ }) : (function(o, m, k, k2) {
19
+ if (k2 === undefined) k2 = k;
20
+ o[k2] = m[k];
21
+ }));
22
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
23
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
24
+ }) : function(o, v) {
25
+ o["default"] = v;
26
+ });
27
+ var __importStar = (this && this.__importStar) || (function () {
28
+ var ownKeys = function(o) {
29
+ ownKeys = Object.getOwnPropertyNames || function (o) {
30
+ var ar = [];
31
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
32
+ return ar;
33
+ };
34
+ return ownKeys(o);
35
+ };
36
+ return function (mod) {
37
+ if (mod && mod.__esModule) return mod;
38
+ var result = {};
39
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
40
+ __setModuleDefault(result, mod);
41
+ return result;
42
+ };
43
+ })();
44
+ var __importDefault = (this && this.__importDefault) || function (mod) {
45
+ return (mod && mod.__esModule) ? mod : { "default": mod };
46
+ };
47
+ Object.defineProperty(exports, "__esModule", { value: true });
48
+ exports.benchmarkCommand = benchmarkCommand;
49
+ const fs = __importStar(require("fs"));
50
+ const path = __importStar(require("path"));
51
+ const chalk_1 = __importDefault(require("chalk"));
52
+ const child_process_1 = require("child_process");
53
/**
 * Counts the non-blank lines (records) in a JSONL file.
 *
 * A line is a record only if it contains something other than whitespace, so
 * CRLF line endings, trailing newlines and stray blank lines do not inflate
 * the count.
 *
 * @param fp Path of the file to inspect.
 * @returns Number of non-blank lines, or 0 when the file does not exist.
 */
function countLines(fp) {
    if (!fs.existsSync(fp))
        return 0;
    return fs.readFileSync(fp, 'utf-8')
        .split('\n')
        // trim() also strips the '\r' that CRLF files leave on otherwise empty lines.
        .filter((line) => line.trim() !== '')
        .length;
}
58
/**
 * Sums one numeric field across every record of a JSONL file.
 *
 * Lines that are blank, are not valid JSON, do not parse to an object, or whose
 * field is not a finite number contribute nothing. Restricting the sum to real
 * numbers keeps the result a number: adding a string value with `+` would turn
 * the running total into a concatenated string.
 *
 * @param fp Path of the JSONL file.
 * @param field Name of the property to add up.
 * @returns The total, or 0 when the file does not exist.
 */
function sumField(fp, field) {
    if (!fs.existsSync(fp))
        return 0;
    let total = 0;
    for (const line of fs.readFileSync(fp, 'utf-8').split('\n')) {
        if (line.trim() === '')
            continue;
        let record;
        try {
            record = JSON.parse(line);
        }
        catch {
            continue; // Malformed line: skip it rather than abort the whole sum.
        }
        if (typeof record !== 'object' || record === null)
            continue;
        const value = record[field];
        if (typeof value === 'number' && Number.isFinite(value))
            total += value;
    }
    return total;
}
69
/**
 * Runs one benchmark trial: executes `command` through the shell with
 * TRICKLE_LOCAL=1 and TRICKLE_LOCAL_DIR=`trialDir` added to the environment.
 *
 * The child's stdio is ignored rather than piped. Nothing here reads the
 * child's output, and an unread pipe makes the child block as soon as it has
 * written more than the OS pipe buffer holds, which would hang the benchmark.
 *
 * @param command Shell command line to execute.
 * @param trialDir Directory exported to the child as TRICKLE_LOCAL_DIR.
 * @returns The exit code (1 when the process could not be spawned or ended
 *          without an exit code, e.g. killed by a signal) and the wall-clock
 *          duration in milliseconds. The promise never rejects.
 */
function runTrial(command, trialDir) {
    return new Promise((resolve) => {
        const start = Date.now();
        const env = { ...process.env, TRICKLE_LOCAL: '1', TRICKLE_LOCAL_DIR: trialDir };
        let settled = false;
        // 'error' and 'exit' may both fire; only the first one decides the result.
        const finish = (exitCode) => {
            if (settled)
                return;
            settled = true;
            resolve({ exitCode, durationMs: Date.now() - start });
        };
        const proc = (0, child_process_1.spawn)(command, { shell: true, env, stdio: 'ignore' });
        proc.on('exit', (code) => finish(code ?? 1));
        proc.on('error', () => finish(1));
    });
}
78
/**
 * `trickle benchmark <command> --runs N`: runs the same command N times and
 * reports pass rate, latency, LLM cost/tokens and a simple eval score.
 *
 * Each trial writes into `.trickle/benchmark/run-<i>` under the current working
 * directory; `.json`/`.jsonl` files left there by an earlier benchmark are
 * removed before the trial starts.
 *
 * With `opts.json` the report is printed to stdout as a single JSON document
 * and all progress output goes to stderr, so stdout stays machine-readable.
 *
 * @param command Shell command to benchmark; when missing, usage is printed.
 * @param opts.runs Number of trials as a string (default "5"); must parse to a
 *        positive integer.
 * @param opts.json Emit the JSON report instead of the formatted summary.
 * @param opts.failUnderConsistency Pass-rate percentage below which the process
 *        exits with status 1.
 */
async function benchmarkCommand(command, opts) {
    if (!command) {
        console.log(chalk_1.default.yellow(' Usage: trickle benchmark "python my_agent.py" --runs 5'));
        return;
    }
    // Validate numeric options before any trial runs: a NaN run count would yield
    // NaN statistics, and a NaN threshold would silently disable the CI gate.
    const numRuns = Number.parseInt(opts.runs || '5', 10);
    if (Number.isNaN(numRuns) || numRuns < 1) {
        console.error(chalk_1.default.red(` Invalid --runs value "${opts.runs}": expected a positive integer`));
        process.exit(1);
    }
    let threshold;
    if (opts.failUnderConsistency) {
        threshold = Number.parseInt(opts.failUnderConsistency, 10);
        if (Number.isNaN(threshold)) {
            console.error(chalk_1.default.red(` Invalid --fail-under-consistency value "${opts.failUnderConsistency}": expected a number`));
            process.exit(1);
        }
    }
    // Progress goes to stderr in --json mode so stdout carries only the report.
    const progress = opts.json ? process.stderr : process.stdout;
    const say = (line) => { progress.write(`${line}\n`); };
    const baseDir = path.join(process.cwd(), '.trickle', 'benchmark');
    fs.mkdirSync(baseDir, { recursive: true });
    say('');
    say(chalk_1.default.bold(' trickle benchmark'));
    say(chalk_1.default.gray(' ' + '─'.repeat(60)));
    say(` Command: ${chalk_1.default.cyan(command)}`);
    say(` Runs: ${numRuns}`);
    say('');
    const results = [];
    for (let i = 1; i <= numRuns; i++) {
        const trialDir = path.join(baseDir, `run-${i}`);
        fs.mkdirSync(trialDir, { recursive: true });
        // Clear data left behind by a previous benchmark of the same run index.
        for (const f of fs.readdirSync(trialDir)) {
            if (f.endsWith('.jsonl') || f.endsWith('.json'))
                fs.unlinkSync(path.join(trialDir, f));
        }
        progress.write(chalk_1.default.gray(` Run ${i}/${numRuns}... `));
        const { exitCode, durationMs } = await runTrial(command, trialDir);
        const llmFile = path.join(trialDir, 'llm.jsonl');
        const functions = countLines(path.join(trialDir, 'observations.jsonl'));
        const variables = countLines(path.join(trialDir, 'variables.jsonl'));
        const errors = countLines(path.join(trialDir, 'errors.jsonl'));
        const llmCalls = countLines(llmFile);
        // Cost is rounded to 4 decimal places.
        const llmCost = Math.round(sumField(llmFile, 'estimatedCostUsd') * 10000) / 10000;
        const llmTokens = sumField(llmFile, 'totalTokens');
        const agentEvents = countLines(path.join(trialDir, 'agents.jsonl'));
        // Eval score: 100 for exit code 0, otherwise 30; minus 15 per error record; floored at 0.
        const evalScore = Math.max(0, (exitCode === 0 ? 100 : 30) - errors * 15);
        results.push({ run: i, exitCode, durationMs, functions, variables, errors, llmCalls, llmCost, llmTokens, agentEvents, evalScore });
        const icon = exitCode === 0 ? chalk_1.default.green('✓') : chalk_1.default.red('✗');
        say(`${icon} ${durationMs}ms | ${functions} fn | ${errors} err | ${llmCalls} llm ($${llmCost})`);
    }
    // Aggregate statistics over all trials.
    const passes = results.filter(r => r.exitCode === 0).length;
    const passAtK = passes > 0 ? 1 : 0; // At least 1 succeeds
    const passAllK = passes === numRuns ? 1 : 0; // All succeed
    const consistency = Math.round((passes / numRuns) * 100);
    const durations = results.map(r => r.durationMs);
    const costs = results.map(r => r.llmCost);
    const tokens = results.map(r => r.llmTokens);
    const scores = results.map(r => r.evalScore);
    const avg = (arr) => arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
    // Population standard deviation (divides by n).
    const stddev = (arr) => {
        const m = avg(arr);
        return Math.sqrt(arr.reduce((s, v) => s + (v - m) ** 2, 0) / Math.max(1, arr.length));
    };
    const min = (arr) => arr.length ? Math.min(...arr) : 0;
    const max = (arr) => arr.length ? Math.max(...arr) : 0;
    const report = {
        command, runs: numRuns,
        passRate: consistency,
        passAtK, passAllK,
        latency: { avg: Math.round(avg(durations)), stddev: Math.round(stddev(durations)), min: min(durations), max: max(durations) },
        cost: { total: Math.round(costs.reduce((a, b) => a + b, 0) * 10000) / 10000, avg: Math.round(avg(costs) * 10000) / 10000, stddev: Math.round(stddev(costs) * 10000) / 10000 },
        tokens: { total: tokens.reduce((a, b) => a + b, 0), avg: Math.round(avg(tokens)) },
        evalScore: { avg: Math.round(avg(scores)), min: min(scores), max: max(scores) },
        trials: results,
    };
    const belowThreshold = threshold !== undefined && consistency < threshold;
    if (opts.json) {
        console.log(JSON.stringify(report, null, 2));
        if (belowThreshold)
            process.exit(1);
        return;
    }
    // Pretty print results
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    console.log(chalk_1.default.bold(' Results'));
    const grade = consistency >= 90 ? chalk_1.default.green('A') : consistency >= 70 ? chalk_1.default.yellow('B') :
        consistency >= 50 ? chalk_1.default.yellow('C') : chalk_1.default.red('F');
    console.log(` Consistency: ${grade} ${consistency}% (${passes}/${numRuns} passed)`);
    console.log(` pass@k: ${passAtK ? chalk_1.default.green('YES') : chalk_1.default.red('NO')} (at least 1 succeeds)`);
    console.log(` pass^k: ${passAllK ? chalk_1.default.green('YES') : chalk_1.default.red('NO')} (all succeed)`);
    console.log(chalk_1.default.gray('\n Latency'));
    console.log(` avg ${avg(durations).toFixed(0)}ms | stddev ${stddev(durations).toFixed(0)}ms | min ${min(durations)}ms | max ${max(durations)}ms`);
    // The cost section is shown only when at least one trial recorded LLM spend.
    if (costs.some(c => c > 0)) {
        console.log(chalk_1.default.gray('\n Cost'));
        console.log(` total $${report.cost.total} | avg $${report.cost.avg}/run | stddev $${report.cost.stddev}`);
        console.log(` tokens: ${report.tokens.total} total | ${report.tokens.avg} avg/run`);
    }
    console.log(chalk_1.default.gray('\n Eval Score'));
    console.log(` avg ${report.evalScore.avg}/100 | min ${report.evalScore.min} | max ${report.evalScore.max}`);
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    if (belowThreshold) {
        console.log(chalk_1.default.red(` FAIL: Consistency ${consistency}% below threshold ${threshold}%`));
        process.exit(1);
    }
    console.log('');
}
@@ -156,6 +156,8 @@ function costReportCommand(opts) {
156
156
  ['claude-opus', 'frontier'], ['claude-sonnet', 'standard'], ['claude-haiku', 'mini'],
157
157
  ['gemini-2.5-flash-lite', 'mini'], ['gemini-2.5-flash', 'standard'], ['gemini-2.5-pro', 'frontier'],
158
158
  ['gemini-2.0-flash', 'mini'], ['gemini-1.5-pro', 'frontier'], ['gemini-1.5-flash', 'mini'],
159
+ ['mistral-large', 'frontier'], ['mistral-medium', 'standard'], ['mistral-small', 'mini'], ['codestral', 'standard'],
160
+ ['command-r-plus', 'frontier'], ['command-r', 'standard'], ['command-light', 'mini'],
159
161
  ];
160
162
  function classifyTier(model) {
161
163
  for (const [pattern, tier] of TIER_RULES) {
package/dist/index.js CHANGED
@@ -920,6 +920,17 @@ program
920
920
  const { whyCommand } = await Promise.resolve().then(() => __importStar(require("./commands/why")));
921
921
  whyCommand(query, opts);
922
922
  });
923
+ // trickle benchmark
924
+ program
925
+ .command("benchmark [command...]")
926
+ .description("Multi-trial reliability testing — run N times, measure consistency, cost variance, pass@k")
927
+ .option("--runs <n>", "Number of trial runs (default: 5)")
928
+ .option("--json", "Output structured JSON")
929
+ .option("--fail-under-consistency <pct>", "Fail if consistency below threshold (0-100, for CI)")
930
+ .action(async (commandParts, opts) => {
931
+ const { benchmarkCommand } = await Promise.resolve().then(() => __importStar(require("./commands/benchmark")));
932
+ await benchmarkCommand(commandParts.length > 0 ? commandParts.join(' ') : undefined, opts);
933
+ });
923
934
  // trickle playback
924
935
  program
925
936
  .command("playback")
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "trickle-cli",
3
- "version": "0.1.203",
3
+ "version": "0.1.205",
4
4
  "description": "Zero-code runtime observability for JS/Python + AI agent debugging. Traces LangChain, CrewAI, OpenAI, Anthropic, Gemini. Eval, security, compliance, cost tracking. Free, local-first.",
5
5
  "keywords": ["observability", "tracing", "llm", "openai", "anthropic", "langchain", "crewai", "agent", "mcp", "debugging", "typescript", "python", "security", "eval", "compliance"],
6
6
  "bin": {
@@ -0,0 +1,173 @@
1
+ /**
2
+ * trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
3
+ *
4
+ * Runs the same command N times, captures trickle data for each run,
5
+ * and reports variance: pass@k, consistency, cost/latency distribution.
6
+ *
7
+ * 85% per-step accuracy compounds to 20% on 10 steps — this measures
8
+ * whether your agent gives consistent results across identical inputs.
9
+ */
10
+
11
+ import * as fs from 'fs';
12
+ import * as path from 'path';
13
+ import chalk from 'chalk';
14
+ import { spawn } from 'child_process';
15
+
16
+ interface TrialResult {
17
+ run: number;
18
+ exitCode: number;
19
+ durationMs: number;
20
+ functions: number;
21
+ variables: number;
22
+ errors: number;
23
+ llmCalls: number;
24
+ llmCost: number;
25
+ llmTokens: number;
26
+ agentEvents: number;
27
+ evalScore: number;
28
+ }
29
+
30
/**
 * Counts the non-blank lines (records) in a JSONL file.
 *
 * A line is a record only if it contains something other than whitespace, so
 * CRLF line endings, trailing newlines and stray blank lines do not inflate
 * the count.
 *
 * @param fp Path of the file to inspect.
 * @returns Number of non-blank lines, or 0 when the file does not exist.
 */
function countLines(fp: string): number {
  if (!fs.existsSync(fp)) return 0;
  return fs
    .readFileSync(fp, 'utf-8')
    .split('\n')
    // trim() also strips the '\r' that CRLF files leave on otherwise empty lines.
    .filter((line) => line.trim() !== '').length;
}
34
+
35
/**
 * Sums one numeric field across every record of a JSONL file.
 *
 * Lines that are blank, are not valid JSON, do not parse to an object, or whose
 * field is not a finite number contribute nothing. Restricting the sum to real
 * numbers keeps the result a number: adding a string value with `+` would turn
 * the running total into a concatenated string.
 *
 * @param fp Path of the JSONL file.
 * @param field Name of the property to add up.
 * @returns The total, or 0 when the file does not exist.
 */
function sumField(fp: string, field: string): number {
  if (!fs.existsSync(fp)) return 0;

  let total = 0;
  for (const line of fs.readFileSync(fp, 'utf-8').split('\n')) {
    if (line.trim() === '') continue;

    let record: unknown;
    try {
      record = JSON.parse(line);
    } catch {
      continue; // Malformed line: skip it rather than abort the whole sum.
    }
    if (typeof record !== 'object' || record === null) continue;

    const value = (record as Record<string, unknown>)[field];
    if (typeof value === 'number' && Number.isFinite(value)) total += value;
  }
  return total;
}
40
+
41
/**
 * Runs one benchmark trial: executes `command` through the shell with
 * TRICKLE_LOCAL=1 and TRICKLE_LOCAL_DIR=`trialDir` added to the environment.
 *
 * The child's stdio is ignored rather than piped. Nothing here reads the
 * child's output, and an unread pipe makes the child block as soon as it has
 * written more than the OS pipe buffer holds, which would hang the benchmark.
 *
 * @param command Shell command line to execute.
 * @param trialDir Directory exported to the child as TRICKLE_LOCAL_DIR.
 * @returns The exit code (1 when the process could not be spawned or ended
 *          without an exit code, e.g. killed by a signal) and the wall-clock
 *          duration in milliseconds. The promise never rejects.
 */
function runTrial(command: string, trialDir: string): Promise<{ exitCode: number; durationMs: number }> {
  return new Promise((resolve) => {
    const start = Date.now();
    const env = { ...process.env, TRICKLE_LOCAL: '1', TRICKLE_LOCAL_DIR: trialDir };

    let settled = false;
    // 'error' and 'exit' may both fire; only the first one decides the result.
    const finish = (exitCode: number): void => {
      if (settled) return;
      settled = true;
      resolve({ exitCode, durationMs: Date.now() - start });
    };

    const proc = spawn(command, { shell: true, env, stdio: 'ignore' });
    proc.on('exit', (code) => finish(code ?? 1));
    proc.on('error', () => finish(1));
  });
}
50
+
51
/**
 * `trickle benchmark <command> --runs N`: runs the same command N times and
 * reports pass rate, latency, LLM cost/tokens and a simple eval score.
 *
 * Each trial writes into `.trickle/benchmark/run-<i>` under the current working
 * directory; `.json`/`.jsonl` files left there by an earlier benchmark are
 * removed before the trial starts.
 *
 * With `opts.json` the report is printed to stdout as a single JSON document
 * and all progress output goes to stderr, so stdout stays machine-readable.
 *
 * @param command Shell command to benchmark; when missing, usage is printed.
 * @param opts.runs Number of trials as a string (default "5"); must parse to a
 *        positive integer.
 * @param opts.json Emit the JSON report instead of the formatted summary.
 * @param opts.failUnderConsistency Pass-rate percentage below which the process
 *        exits with status 1.
 */
export async function benchmarkCommand(
  command: string | undefined,
  opts: { runs?: string; json?: boolean; failUnderConsistency?: string },
): Promise<void> {
  if (!command) {
    console.log(chalk.yellow(' Usage: trickle benchmark "python my_agent.py" --runs 5'));
    return;
  }

  // Validate numeric options before any trial runs: a NaN run count would yield
  // NaN statistics, and a NaN threshold would silently disable the CI gate.
  const numRuns = Number.parseInt(opts.runs || '5', 10);
  if (Number.isNaN(numRuns) || numRuns < 1) {
    console.error(chalk.red(` Invalid --runs value "${opts.runs}": expected a positive integer`));
    process.exit(1);
  }

  let threshold: number | undefined;
  if (opts.failUnderConsistency) {
    threshold = Number.parseInt(opts.failUnderConsistency, 10);
    if (Number.isNaN(threshold)) {
      console.error(
        chalk.red(` Invalid --fail-under-consistency value "${opts.failUnderConsistency}": expected a number`),
      );
      process.exit(1);
    }
  }

  // Progress goes to stderr in --json mode so stdout carries only the report.
  const progress = opts.json ? process.stderr : process.stdout;
  const say = (line: string): void => {
    progress.write(`${line}\n`);
  };

  const baseDir = path.join(process.cwd(), '.trickle', 'benchmark');
  fs.mkdirSync(baseDir, { recursive: true });

  say('');
  say(chalk.bold(' trickle benchmark'));
  say(chalk.gray(' ' + '─'.repeat(60)));
  say(` Command: ${chalk.cyan(command)}`);
  say(` Runs: ${numRuns}`);
  say('');

  const results: TrialResult[] = [];

  for (let i = 1; i <= numRuns; i++) {
    const trialDir = path.join(baseDir, `run-${i}`);
    fs.mkdirSync(trialDir, { recursive: true });
    // Clear data left behind by a previous benchmark of the same run index.
    for (const f of fs.readdirSync(trialDir)) {
      if (f.endsWith('.jsonl') || f.endsWith('.json')) fs.unlinkSync(path.join(trialDir, f));
    }

    progress.write(chalk.gray(` Run ${i}/${numRuns}... `));
    const { exitCode, durationMs } = await runTrial(command, trialDir);

    const llmFile = path.join(trialDir, 'llm.jsonl');
    const functions = countLines(path.join(trialDir, 'observations.jsonl'));
    const variables = countLines(path.join(trialDir, 'variables.jsonl'));
    const errors = countLines(path.join(trialDir, 'errors.jsonl'));
    const llmCalls = countLines(llmFile);
    // Cost is rounded to 4 decimal places.
    const llmCost = Math.round(sumField(llmFile, 'estimatedCostUsd') * 10000) / 10000;
    const llmTokens = sumField(llmFile, 'totalTokens');
    const agentEvents = countLines(path.join(trialDir, 'agents.jsonl'));

    // Eval score: 100 for exit code 0, otherwise 30; minus 15 per error record; floored at 0.
    const evalScore = Math.max(0, (exitCode === 0 ? 100 : 30) - errors * 15);

    results.push({ run: i, exitCode, durationMs, functions, variables, errors, llmCalls, llmCost, llmTokens, agentEvents, evalScore });

    const icon = exitCode === 0 ? chalk.green('✓') : chalk.red('✗');
    say(`${icon} ${durationMs}ms | ${functions} fn | ${errors} err | ${llmCalls} llm ($${llmCost})`);
  }

  // Aggregate statistics over all trials.
  const passes = results.filter((r) => r.exitCode === 0).length;
  const passAtK = passes > 0 ? 1 : 0; // At least 1 succeeds
  const passAllK = passes === numRuns ? 1 : 0; // All succeed
  const consistency = Math.round((passes / numRuns) * 100);

  const durations = results.map((r) => r.durationMs);
  const costs = results.map((r) => r.llmCost);
  const tokens = results.map((r) => r.llmTokens);
  const scores = results.map((r) => r.evalScore);

  const avg = (arr: number[]): number => (arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0);
  // Population standard deviation (divides by n).
  const stddev = (arr: number[]): number => {
    const m = avg(arr);
    return Math.sqrt(arr.reduce((s, v) => s + (v - m) ** 2, 0) / Math.max(1, arr.length));
  };
  const min = (arr: number[]): number => (arr.length ? Math.min(...arr) : 0);
  const max = (arr: number[]): number => (arr.length ? Math.max(...arr) : 0);

  const report = {
    command, runs: numRuns,
    passRate: consistency,
    passAtK, passAllK,
    latency: { avg: Math.round(avg(durations)), stddev: Math.round(stddev(durations)), min: min(durations), max: max(durations) },
    cost: { total: Math.round(costs.reduce((a, b) => a + b, 0) * 10000) / 10000, avg: Math.round(avg(costs) * 10000) / 10000, stddev: Math.round(stddev(costs) * 10000) / 10000 },
    tokens: { total: tokens.reduce((a, b) => a + b, 0), avg: Math.round(avg(tokens)) },
    evalScore: { avg: Math.round(avg(scores)), min: min(scores), max: max(scores) },
    trials: results,
  };

  const belowThreshold = threshold !== undefined && consistency < threshold;

  if (opts.json) {
    console.log(JSON.stringify(report, null, 2));
    if (belowThreshold) process.exit(1);
    return;
  }

  // Pretty print results
  console.log(chalk.gray('\n ' + '─'.repeat(60)));
  console.log(chalk.bold(' Results'));

  const grade = consistency >= 90 ? chalk.green('A') : consistency >= 70 ? chalk.yellow('B') :
    consistency >= 50 ? chalk.yellow('C') : chalk.red('F');
  console.log(` Consistency: ${grade} ${consistency}% (${passes}/${numRuns} passed)`);
  console.log(` pass@k: ${passAtK ? chalk.green('YES') : chalk.red('NO')} (at least 1 succeeds)`);
  console.log(` pass^k: ${passAllK ? chalk.green('YES') : chalk.red('NO')} (all succeed)`);

  console.log(chalk.gray('\n Latency'));
  console.log(` avg ${avg(durations).toFixed(0)}ms | stddev ${stddev(durations).toFixed(0)}ms | min ${min(durations)}ms | max ${max(durations)}ms`);

  // The cost section is shown only when at least one trial recorded LLM spend.
  if (costs.some((c) => c > 0)) {
    console.log(chalk.gray('\n Cost'));
    console.log(` total $${report.cost.total} | avg $${report.cost.avg}/run | stddev $${report.cost.stddev}`);
    console.log(` tokens: ${report.tokens.total} total | ${report.tokens.avg} avg/run`);
  }

  console.log(chalk.gray('\n Eval Score'));
  console.log(` avg ${report.evalScore.avg}/100 | min ${report.evalScore.min} | max ${report.evalScore.max}`);

  console.log(chalk.gray('\n ' + '─'.repeat(60)));

  if (belowThreshold) {
    console.log(chalk.red(` FAIL: Consistency ${consistency}% below threshold ${threshold}%`));
    process.exit(1);
  }

  console.log('');
}
@@ -131,6 +131,8 @@ export function costReportCommand(opts: { json?: boolean; budget?: string }): vo
131
131
  ['claude-opus', 'frontier'], ['claude-sonnet', 'standard'], ['claude-haiku', 'mini'],
132
132
  ['gemini-2.5-flash-lite', 'mini'], ['gemini-2.5-flash', 'standard'], ['gemini-2.5-pro', 'frontier'],
133
133
  ['gemini-2.0-flash', 'mini'], ['gemini-1.5-pro', 'frontier'], ['gemini-1.5-flash', 'mini'],
134
+ ['mistral-large', 'frontier'], ['mistral-medium', 'standard'], ['mistral-small', 'mini'], ['codestral', 'standard'],
135
+ ['command-r-plus', 'frontier'], ['command-r', 'standard'], ['command-light', 'mini'],
134
136
  ];
135
137
 
136
138
  function classifyTier(model: string): string {
package/src/index.ts CHANGED
@@ -953,6 +953,18 @@ program
953
953
  whyCommand(query, opts);
954
954
  });
955
955
 
956
+ // trickle benchmark
957
+ program
958
+ .command("benchmark [command...]")
959
+ .description("Multi-trial reliability testing — run N times, measure consistency, cost variance, pass@k")
960
+ .option("--runs <n>", "Number of trial runs (default: 5)")
961
+ .option("--json", "Output structured JSON")
962
+ .option("--fail-under-consistency <pct>", "Fail if consistency below threshold (0-100, for CI)")
963
+ .action(async (commandParts: string[], opts) => {
964
+ const { benchmarkCommand } = await import("./commands/benchmark");
965
+ await benchmarkCommand(commandParts.length > 0 ? commandParts.join(' ') : undefined, opts);
966
+ });
967
+
956
968
  // trickle playback
957
969
  program
958
970
  .command("playback")