trickle-cli 0.1.203 → 0.1.205
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/benchmark.d.ts +14 -0
- package/dist/commands/benchmark.js +177 -0
- package/dist/commands/cost-report.js +2 -0
- package/dist/index.js +11 -0
- package/package.json +1 -1
- package/src/commands/benchmark.ts +173 -0
- package/src/commands/cost-report.ts +2 -0
- package/src/index.ts +12 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
3
|
+
*
|
|
4
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
5
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
6
|
+
*
|
|
7
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
8
|
+
* whether your agent gives consistent results across identical inputs.
|
|
9
|
+
*/
|
|
10
|
+
export declare function benchmarkCommand(command: string | undefined, opts: {
|
|
11
|
+
runs?: string;
|
|
12
|
+
json?: boolean;
|
|
13
|
+
failUnderConsistency?: string;
|
|
14
|
+
}): Promise<void>;
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
4
|
+
*
|
|
5
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
6
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
7
|
+
*
|
|
8
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
9
|
+
* whether your agent gives consistent results across identical inputs.
|
|
10
|
+
*/
|
|
11
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
12
|
+
if (k2 === undefined) k2 = k;
|
|
13
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
14
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
15
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
16
|
+
}
|
|
17
|
+
Object.defineProperty(o, k2, desc);
|
|
18
|
+
}) : (function(o, m, k, k2) {
|
|
19
|
+
if (k2 === undefined) k2 = k;
|
|
20
|
+
o[k2] = m[k];
|
|
21
|
+
}));
|
|
22
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
23
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
24
|
+
}) : function(o, v) {
|
|
25
|
+
o["default"] = v;
|
|
26
|
+
});
|
|
27
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
28
|
+
var ownKeys = function(o) {
|
|
29
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
30
|
+
var ar = [];
|
|
31
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
32
|
+
return ar;
|
|
33
|
+
};
|
|
34
|
+
return ownKeys(o);
|
|
35
|
+
};
|
|
36
|
+
return function (mod) {
|
|
37
|
+
if (mod && mod.__esModule) return mod;
|
|
38
|
+
var result = {};
|
|
39
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
40
|
+
__setModuleDefault(result, mod);
|
|
41
|
+
return result;
|
|
42
|
+
};
|
|
43
|
+
})();
|
|
44
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
45
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
46
|
+
};
|
|
47
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
48
|
+
exports.benchmarkCommand = benchmarkCommand;
|
|
49
|
+
const fs = __importStar(require("fs"));
|
|
50
|
+
const path = __importStar(require("path"));
|
|
51
|
+
const chalk_1 = __importDefault(require("chalk"));
|
|
52
|
+
const child_process_1 = require("child_process");
|
|
53
|
+
// Count the non-empty lines of a file; returns 0 when the file is absent.
function countLines(fp) {
    if (!fs.existsSync(fp))
        return 0;
    const nonEmpty = fs.readFileSync(fp, 'utf-8').trim().split('\n').filter(Boolean);
    return nonEmpty.length;
}
|
|
58
|
+
// Sum a numeric field across every line of a JSONL file.
// Returns 0 when the file does not exist. Unparseable lines are skipped,
// and field values that are not finite numbers are ignored — the previous
// `|| 0` guard let truthy non-numbers (e.g. strings) corrupt the running
// sum via string concatenation.
function sumField(fp, field) {
    if (!fs.existsSync(fp))
        return 0;
    return fs.readFileSync(fp, 'utf-8').split('\n').filter(Boolean)
        .reduce((s, l) => {
        try {
            const v = JSON.parse(l)[field];
            return (typeof v === 'number' && Number.isFinite(v)) ? s + v : s;
        }
        catch {
            return s;
        }
    }, 0);
}
|
|
69
|
+
// Execute `command` once through the shell, pointing trickle capture at
// `trialDir` via TRICKLE_LOCAL env vars. Never rejects: spawn errors and
// null exit codes both resolve with exitCode 1.
async function runTrial(command, trialDir) {
    return new Promise((resolve) => {
        const startedAt = Date.now();
        const finish = (exitCode) => resolve({ exitCode, durationMs: Date.now() - startedAt });
        const env = { ...process.env, TRICKLE_LOCAL: '1', TRICKLE_LOCAL_DIR: trialDir };
        const child = (0, child_process_1.spawn)(command, [], { shell: true, env, stdio: 'pipe' });
        child.on('exit', (code) => finish(code ?? 1));
        child.on('error', () => finish(1));
    });
}
|
|
78
|
+
/**
 * Multi-trial benchmark driver: runs `command` numRuns times, collects
 * trickle capture counts for each run, and reports pass-rate, latency,
 * cost, and eval-score statistics. Exits the process with code 1 when
 * --fail-under-consistency is breached.
 *
 * Fixes vs. previous revision:
 *  - a non-numeric or non-positive --runs no longer yields an empty trial
 *    loop and a divide-by-zero NaN consistency; it falls back to 5 with a
 *    warning.
 *  - a NaN --fail-under-consistency threshold no longer silently passes
 *    every comparison; it is ignored as "no threshold".
 */
async function benchmarkCommand(command, opts) {
    if (!command) {
        console.log(chalk_1.default.yellow(' Usage: trickle benchmark "python my_agent.py" --runs 5'));
        return;
    }
    // Validate --runs before using it as a loop bound and divisor.
    const parsedRuns = parseInt(opts.runs || '5', 10);
    const numRuns = Number.isFinite(parsedRuns) && parsedRuns > 0 ? parsedRuns : 5;
    if (numRuns !== parsedRuns) {
        console.log(chalk_1.default.yellow(` Invalid --runs value "${opts.runs}"; using 5.`));
    }
    // Resolve the CI threshold once; NaN means the option is ignored.
    let threshold;
    if (opts.failUnderConsistency) {
        const t = parseInt(opts.failUnderConsistency, 10);
        if (Number.isFinite(t))
            threshold = t;
    }
    const baseDir = path.join(process.cwd(), '.trickle', 'benchmark');
    fs.mkdirSync(baseDir, { recursive: true });
    console.log('');
    console.log(chalk_1.default.bold(' trickle benchmark'));
    console.log(chalk_1.default.gray(' ' + '─'.repeat(60)));
    console.log(` Command: ${chalk_1.default.cyan(command)}`);
    console.log(` Runs: ${numRuns}`);
    console.log('');
    const results = [];
    for (let i = 1; i <= numRuns; i++) {
        const trialDir = path.join(baseDir, `run-${i}`);
        fs.mkdirSync(trialDir, { recursive: true });
        // Clear previous data so counts reflect only this trial.
        for (const f of fs.readdirSync(trialDir)) {
            if (f.endsWith('.jsonl') || f.endsWith('.json'))
                fs.unlinkSync(path.join(trialDir, f));
        }
        process.stdout.write(chalk_1.default.gray(` Run ${i}/${numRuns}... `));
        const { exitCode, durationMs } = await runTrial(command, trialDir);
        const functions = countLines(path.join(trialDir, 'observations.jsonl'));
        const variables = countLines(path.join(trialDir, 'variables.jsonl'));
        const errors = countLines(path.join(trialDir, 'errors.jsonl'));
        const llmCalls = countLines(path.join(trialDir, 'llm.jsonl'));
        const llmCost = Math.round(sumField(path.join(trialDir, 'llm.jsonl'), 'estimatedCostUsd') * 10000) / 10000;
        const llmTokens = sumField(path.join(trialDir, 'llm.jsonl'), 'totalTokens');
        const agentEvents = countLines(path.join(trialDir, 'agents.jsonl'));
        // Simple eval score: 100 if exit 0 and no errors, minus penalties
        const evalScore = Math.max(0, (exitCode === 0 ? 100 : 30) - errors * 15);
        results.push({ run: i, exitCode, durationMs, functions, variables, errors, llmCalls, llmCost, llmTokens, agentEvents, evalScore });
        const icon = exitCode === 0 ? chalk_1.default.green('✓') : chalk_1.default.red('✗');
        console.log(`${icon} ${durationMs}ms | ${functions} fn | ${errors} err | ${llmCalls} llm ($${llmCost})`);
    }
    // Compute statistics
    const passes = results.filter(r => r.exitCode === 0).length;
    const passAtK = passes > 0 ? 1 : 0; // At least 1 succeeds
    const passAllK = passes === numRuns ? 1 : 0; // All succeed
    const consistency = Math.round((passes / numRuns) * 100);
    const durations = results.map(r => r.durationMs);
    const costs = results.map(r => r.llmCost);
    const tokens = results.map(r => r.llmTokens);
    const scores = results.map(r => r.evalScore);
    const avg = (arr) => arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
    const stddev = (arr) => {
        const m = avg(arr);
        return Math.sqrt(arr.reduce((s, v) => s + (v - m) ** 2, 0) / Math.max(1, arr.length));
    };
    const min = (arr) => arr.length ? Math.min(...arr) : 0;
    const max = (arr) => arr.length ? Math.max(...arr) : 0;
    const report = {
        command, runs: numRuns,
        passRate: consistency,
        passAtK, passAllK,
        latency: { avg: Math.round(avg(durations)), stddev: Math.round(stddev(durations)), min: min(durations), max: max(durations) },
        cost: { total: Math.round(costs.reduce((a, b) => a + b, 0) * 10000) / 10000, avg: Math.round(avg(costs) * 10000) / 10000, stddev: Math.round(stddev(costs) * 10000) / 10000 },
        tokens: { total: tokens.reduce((a, b) => a + b, 0), avg: Math.round(avg(tokens)) },
        evalScore: { avg: Math.round(avg(scores)), min: min(scores), max: max(scores) },
        trials: results,
    };
    if (opts.json) {
        console.log(JSON.stringify(report, null, 2));
        if (threshold !== undefined && consistency < threshold)
            process.exit(1);
        return;
    }
    // Pretty print results
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    console.log(chalk_1.default.bold(' Results'));
    const grade = consistency >= 90 ? chalk_1.default.green('A') : consistency >= 70 ? chalk_1.default.yellow('B') :
        consistency >= 50 ? chalk_1.default.yellow('C') : chalk_1.default.red('F');
    console.log(` Consistency: ${grade} ${consistency}% (${passes}/${numRuns} passed)`);
    console.log(` pass@k: ${passAtK ? chalk_1.default.green('YES') : chalk_1.default.red('NO')} (at least 1 succeeds)`);
    console.log(` pass^k: ${passAllK ? chalk_1.default.green('YES') : chalk_1.default.red('NO')} (all succeed)`);
    console.log(chalk_1.default.gray('\n Latency'));
    console.log(` avg ${avg(durations).toFixed(0)}ms | stddev ${stddev(durations).toFixed(0)}ms | min ${min(durations)}ms | max ${max(durations)}ms`);
    if (costs.some(c => c > 0)) {
        console.log(chalk_1.default.gray('\n Cost'));
        console.log(` total $${report.cost.total} | avg $${report.cost.avg}/run | stddev $${report.cost.stddev}`);
        console.log(` tokens: ${report.tokens.total} total | ${report.tokens.avg} avg/run`);
    }
    console.log(chalk_1.default.gray('\n Eval Score'));
    console.log(` avg ${report.evalScore.avg}/100 | min ${report.evalScore.min} | max ${report.evalScore.max}`);
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    if (threshold !== undefined && consistency < threshold) {
        console.log(chalk_1.default.red(` FAIL: Consistency ${consistency}% below threshold ${threshold}%`));
        process.exit(1);
    }
    console.log('');
}
|
|
@@ -156,6 +156,8 @@ function costReportCommand(opts) {
|
|
|
156
156
|
['claude-opus', 'frontier'], ['claude-sonnet', 'standard'], ['claude-haiku', 'mini'],
|
|
157
157
|
['gemini-2.5-flash-lite', 'mini'], ['gemini-2.5-flash', 'standard'], ['gemini-2.5-pro', 'frontier'],
|
|
158
158
|
['gemini-2.0-flash', 'mini'], ['gemini-1.5-pro', 'frontier'], ['gemini-1.5-flash', 'mini'],
|
|
159
|
+
['mistral-large', 'frontier'], ['mistral-medium', 'standard'], ['mistral-small', 'mini'], ['codestral', 'standard'],
|
|
160
|
+
['command-r-plus', 'frontier'], ['command-r', 'standard'], ['command-light', 'mini'],
|
|
159
161
|
];
|
|
160
162
|
function classifyTier(model) {
|
|
161
163
|
for (const [pattern, tier] of TIER_RULES) {
|
package/dist/index.js
CHANGED
|
@@ -920,6 +920,17 @@ program
|
|
|
920
920
|
const { whyCommand } = await Promise.resolve().then(() => __importStar(require("./commands/why")));
|
|
921
921
|
whyCommand(query, opts);
|
|
922
922
|
});
|
|
923
|
+
// trickle benchmark
|
|
924
|
+
program
|
|
925
|
+
.command("benchmark [command...]")
|
|
926
|
+
.description("Multi-trial reliability testing — run N times, measure consistency, cost variance, pass@k")
|
|
927
|
+
.option("--runs <n>", "Number of trial runs (default: 5)")
|
|
928
|
+
.option("--json", "Output structured JSON")
|
|
929
|
+
.option("--fail-under-consistency <pct>", "Fail if consistency below threshold (0-100, for CI)")
|
|
930
|
+
.action(async (commandParts, opts) => {
|
|
931
|
+
const { benchmarkCommand } = await Promise.resolve().then(() => __importStar(require("./commands/benchmark")));
|
|
932
|
+
await benchmarkCommand(commandParts.length > 0 ? commandParts.join(' ') : undefined, opts);
|
|
933
|
+
});
|
|
923
934
|
// trickle playback
|
|
924
935
|
program
|
|
925
936
|
.command("playback")
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "trickle-cli",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.205",
|
|
4
4
|
"description": "Zero-code runtime observability for JS/Python + AI agent debugging. Traces LangChain, CrewAI, OpenAI, Anthropic, Gemini. Eval, security, compliance, cost tracking. Free, local-first.",
|
|
5
5
|
"keywords": ["observability", "tracing", "llm", "openai", "anthropic", "langchain", "crewai", "agent", "mcp", "debugging", "typescript", "python", "security", "eval", "compliance"],
|
|
6
6
|
"bin": {
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trickle benchmark <command> --runs N — Multi-trial agent reliability testing.
|
|
3
|
+
*
|
|
4
|
+
* Runs the same command N times, captures trickle data for each run,
|
|
5
|
+
* and reports variance: pass@k, consistency, cost/latency distribution.
|
|
6
|
+
*
|
|
7
|
+
* 85% per-step accuracy compounds to 20% on 10 steps — this measures
|
|
8
|
+
* whether your agent gives consistent results across identical inputs.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import * as fs from 'fs';
|
|
12
|
+
import * as path from 'path';
|
|
13
|
+
import chalk from 'chalk';
|
|
14
|
+
import { spawn } from 'child_process';
|
|
15
|
+
|
|
16
|
+
interface TrialResult {
|
|
17
|
+
run: number;
|
|
18
|
+
exitCode: number;
|
|
19
|
+
durationMs: number;
|
|
20
|
+
functions: number;
|
|
21
|
+
variables: number;
|
|
22
|
+
errors: number;
|
|
23
|
+
llmCalls: number;
|
|
24
|
+
llmCost: number;
|
|
25
|
+
llmTokens: number;
|
|
26
|
+
agentEvents: number;
|
|
27
|
+
evalScore: number;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
function countLines(fp: string): number {
|
|
31
|
+
if (!fs.existsSync(fp)) return 0;
|
|
32
|
+
return fs.readFileSync(fp, 'utf-8').trim().split('\n').filter(Boolean).length;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
function sumField(fp: string, field: string): number {
|
|
36
|
+
if (!fs.existsSync(fp)) return 0;
|
|
37
|
+
return fs.readFileSync(fp, 'utf-8').split('\n').filter(Boolean)
|
|
38
|
+
.reduce((s, l) => { try { return s + (JSON.parse(l)[field] || 0); } catch { return s; } }, 0);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
async function runTrial(command: string, trialDir: string): Promise<{ exitCode: number; durationMs: number }> {
|
|
42
|
+
return new Promise((resolve) => {
|
|
43
|
+
const start = Date.now();
|
|
44
|
+
const env = { ...process.env, TRICKLE_LOCAL: '1', TRICKLE_LOCAL_DIR: trialDir };
|
|
45
|
+
const proc = spawn(command, [], { shell: true, env, stdio: 'pipe' });
|
|
46
|
+
proc.on('exit', (code) => resolve({ exitCode: code ?? 1, durationMs: Date.now() - start }));
|
|
47
|
+
proc.on('error', () => resolve({ exitCode: 1, durationMs: Date.now() - start }));
|
|
48
|
+
});
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
export async function benchmarkCommand(
|
|
52
|
+
command: string | undefined,
|
|
53
|
+
opts: { runs?: string; json?: boolean; failUnderConsistency?: string },
|
|
54
|
+
): Promise<void> {
|
|
55
|
+
if (!command) {
|
|
56
|
+
console.log(chalk.yellow(' Usage: trickle benchmark "python my_agent.py" --runs 5'));
|
|
57
|
+
return;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
const numRuns = parseInt(opts.runs || '5', 10);
|
|
61
|
+
const baseDir = path.join(process.cwd(), '.trickle', 'benchmark');
|
|
62
|
+
fs.mkdirSync(baseDir, { recursive: true });
|
|
63
|
+
|
|
64
|
+
console.log('');
|
|
65
|
+
console.log(chalk.bold(' trickle benchmark'));
|
|
66
|
+
console.log(chalk.gray(' ' + '─'.repeat(60)));
|
|
67
|
+
console.log(` Command: ${chalk.cyan(command)}`);
|
|
68
|
+
console.log(` Runs: ${numRuns}`);
|
|
69
|
+
console.log('');
|
|
70
|
+
|
|
71
|
+
const results: TrialResult[] = [];
|
|
72
|
+
|
|
73
|
+
for (let i = 1; i <= numRuns; i++) {
|
|
74
|
+
const trialDir = path.join(baseDir, `run-${i}`);
|
|
75
|
+
fs.mkdirSync(trialDir, { recursive: true });
|
|
76
|
+
// Clear previous data
|
|
77
|
+
for (const f of fs.readdirSync(trialDir)) {
|
|
78
|
+
if (f.endsWith('.jsonl') || f.endsWith('.json')) fs.unlinkSync(path.join(trialDir, f));
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
process.stdout.write(chalk.gray(` Run ${i}/${numRuns}... `));
|
|
82
|
+
const { exitCode, durationMs } = await runTrial(command, trialDir);
|
|
83
|
+
|
|
84
|
+
const functions = countLines(path.join(trialDir, 'observations.jsonl'));
|
|
85
|
+
const variables = countLines(path.join(trialDir, 'variables.jsonl'));
|
|
86
|
+
const errors = countLines(path.join(trialDir, 'errors.jsonl'));
|
|
87
|
+
const llmCalls = countLines(path.join(trialDir, 'llm.jsonl'));
|
|
88
|
+
const llmCost = Math.round(sumField(path.join(trialDir, 'llm.jsonl'), 'estimatedCostUsd') * 10000) / 10000;
|
|
89
|
+
const llmTokens = sumField(path.join(trialDir, 'llm.jsonl'), 'totalTokens');
|
|
90
|
+
const agentEvents = countLines(path.join(trialDir, 'agents.jsonl'));
|
|
91
|
+
|
|
92
|
+
// Simple eval score: 100 if exit 0 and no errors, minus penalties
|
|
93
|
+
const evalScore = Math.max(0, (exitCode === 0 ? 100 : 30) - errors * 15);
|
|
94
|
+
|
|
95
|
+
results.push({ run: i, exitCode, durationMs, functions, variables, errors, llmCalls, llmCost, llmTokens, agentEvents, evalScore });
|
|
96
|
+
|
|
97
|
+
const icon = exitCode === 0 ? chalk.green('✓') : chalk.red('✗');
|
|
98
|
+
console.log(`${icon} ${durationMs}ms | ${functions} fn | ${errors} err | ${llmCalls} llm ($${llmCost})`);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// Compute statistics
|
|
102
|
+
const passes = results.filter(r => r.exitCode === 0).length;
|
|
103
|
+
const passAtK = passes > 0 ? 1 : 0; // At least 1 succeeds
|
|
104
|
+
const passAllK = passes === numRuns ? 1 : 0; // All succeed
|
|
105
|
+
const consistency = Math.round((passes / numRuns) * 100);
|
|
106
|
+
|
|
107
|
+
const durations = results.map(r => r.durationMs);
|
|
108
|
+
const costs = results.map(r => r.llmCost);
|
|
109
|
+
const tokens = results.map(r => r.llmTokens);
|
|
110
|
+
const scores = results.map(r => r.evalScore);
|
|
111
|
+
|
|
112
|
+
const avg = (arr: number[]) => arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
|
|
113
|
+
const stddev = (arr: number[]) => {
|
|
114
|
+
const m = avg(arr);
|
|
115
|
+
return Math.sqrt(arr.reduce((s, v) => s + (v - m) ** 2, 0) / Math.max(1, arr.length));
|
|
116
|
+
};
|
|
117
|
+
const min = (arr: number[]) => arr.length ? Math.min(...arr) : 0;
|
|
118
|
+
const max = (arr: number[]) => arr.length ? Math.max(...arr) : 0;
|
|
119
|
+
|
|
120
|
+
const report = {
|
|
121
|
+
command, runs: numRuns,
|
|
122
|
+
passRate: consistency,
|
|
123
|
+
passAtK, passAllK,
|
|
124
|
+
latency: { avg: Math.round(avg(durations)), stddev: Math.round(stddev(durations)), min: min(durations), max: max(durations) },
|
|
125
|
+
cost: { total: Math.round(costs.reduce((a, b) => a + b, 0) * 10000) / 10000, avg: Math.round(avg(costs) * 10000) / 10000, stddev: Math.round(stddev(costs) * 10000) / 10000 },
|
|
126
|
+
tokens: { total: tokens.reduce((a, b) => a + b, 0), avg: Math.round(avg(tokens)) },
|
|
127
|
+
evalScore: { avg: Math.round(avg(scores)), min: min(scores), max: max(scores) },
|
|
128
|
+
trials: results,
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
if (opts.json) {
|
|
132
|
+
console.log(JSON.stringify(report, null, 2));
|
|
133
|
+
if (opts.failUnderConsistency) {
|
|
134
|
+
const threshold = parseInt(opts.failUnderConsistency, 10);
|
|
135
|
+
if (consistency < threshold) process.exit(1);
|
|
136
|
+
}
|
|
137
|
+
return;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Pretty print results
|
|
141
|
+
console.log(chalk.gray('\n ' + '─'.repeat(60)));
|
|
142
|
+
console.log(chalk.bold(' Results'));
|
|
143
|
+
|
|
144
|
+
const grade = consistency >= 90 ? chalk.green('A') : consistency >= 70 ? chalk.yellow('B') :
|
|
145
|
+
consistency >= 50 ? chalk.yellow('C') : chalk.red('F');
|
|
146
|
+
console.log(` Consistency: ${grade} ${consistency}% (${passes}/${numRuns} passed)`);
|
|
147
|
+
console.log(` pass@k: ${passAtK ? chalk.green('YES') : chalk.red('NO')} (at least 1 succeeds)`);
|
|
148
|
+
console.log(` pass^k: ${passAllK ? chalk.green('YES') : chalk.red('NO')} (all succeed)`);
|
|
149
|
+
|
|
150
|
+
console.log(chalk.gray('\n Latency'));
|
|
151
|
+
console.log(` avg ${avg(durations).toFixed(0)}ms | stddev ${stddev(durations).toFixed(0)}ms | min ${min(durations)}ms | max ${max(durations)}ms`);
|
|
152
|
+
|
|
153
|
+
if (costs.some(c => c > 0)) {
|
|
154
|
+
console.log(chalk.gray('\n Cost'));
|
|
155
|
+
console.log(` total $${report.cost.total} | avg $${report.cost.avg}/run | stddev $${report.cost.stddev}`);
|
|
156
|
+
console.log(` tokens: ${report.tokens.total} total | ${report.tokens.avg} avg/run`);
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
console.log(chalk.gray('\n Eval Score'));
|
|
160
|
+
console.log(` avg ${report.evalScore.avg}/100 | min ${report.evalScore.min} | max ${report.evalScore.max}`);
|
|
161
|
+
|
|
162
|
+
console.log(chalk.gray('\n ' + '─'.repeat(60)));
|
|
163
|
+
|
|
164
|
+
if (opts.failUnderConsistency) {
|
|
165
|
+
const threshold = parseInt(opts.failUnderConsistency, 10);
|
|
166
|
+
if (consistency < threshold) {
|
|
167
|
+
console.log(chalk.red(` FAIL: Consistency ${consistency}% below threshold ${threshold}%`));
|
|
168
|
+
process.exit(1);
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
console.log('');
|
|
173
|
+
}
|
|
@@ -131,6 +131,8 @@ export function costReportCommand(opts: { json?: boolean; budget?: string }): vo
|
|
|
131
131
|
['claude-opus', 'frontier'], ['claude-sonnet', 'standard'], ['claude-haiku', 'mini'],
|
|
132
132
|
['gemini-2.5-flash-lite', 'mini'], ['gemini-2.5-flash', 'standard'], ['gemini-2.5-pro', 'frontier'],
|
|
133
133
|
['gemini-2.0-flash', 'mini'], ['gemini-1.5-pro', 'frontier'], ['gemini-1.5-flash', 'mini'],
|
|
134
|
+
['mistral-large', 'frontier'], ['mistral-medium', 'standard'], ['mistral-small', 'mini'], ['codestral', 'standard'],
|
|
135
|
+
['command-r-plus', 'frontier'], ['command-r', 'standard'], ['command-light', 'mini'],
|
|
134
136
|
];
|
|
135
137
|
|
|
136
138
|
function classifyTier(model: string): string {
|
package/src/index.ts
CHANGED
|
@@ -953,6 +953,18 @@ program
|
|
|
953
953
|
whyCommand(query, opts);
|
|
954
954
|
});
|
|
955
955
|
|
|
956
|
+
// trickle benchmark
|
|
957
|
+
program
|
|
958
|
+
.command("benchmark [command...]")
|
|
959
|
+
.description("Multi-trial reliability testing — run N times, measure consistency, cost variance, pass@k")
|
|
960
|
+
.option("--runs <n>", "Number of trial runs (default: 5)")
|
|
961
|
+
.option("--json", "Output structured JSON")
|
|
962
|
+
.option("--fail-under-consistency <pct>", "Fail if consistency below threshold (0-100, for CI)")
|
|
963
|
+
.action(async (commandParts: string[], opts) => {
|
|
964
|
+
const { benchmarkCommand } = await import("./commands/benchmark");
|
|
965
|
+
await benchmarkCommand(commandParts.length > 0 ? commandParts.join(' ') : undefined, opts);
|
|
966
|
+
});
|
|
967
|
+
|
|
956
968
|
// trickle playback
|
|
957
969
|
program
|
|
958
970
|
.command("playback")
|