trickle-cli 0.1.187 → 0.1.189
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/eval.d.ts +16 -0
- package/dist/commands/eval.js +250 -0
- package/dist/commands/run-diff.d.ts +20 -0
- package/dist/commands/run-diff.js +72 -4
- package/dist/index.js +9 -0
- package/package.json +1 -1
- package/src/commands/eval.ts +231 -0
- package/src/commands/run-diff.ts +91 -4
- package/src/index.ts +10 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * trickle eval — Score agent runs using traces already captured.
 *
 * Analyzes agents.jsonl, llm.jsonl, errors.jsonl to produce reliability
 * scores without needing an LLM-as-judge. Zero cost, zero API keys.
 *
 * Scoring dimensions:
 * - Completion: Did the agent finish successfully?
 * - Error rate: How many errors during execution?
 * - Cost efficiency: Tokens per meaningful output
 * - Tool reliability: Success rate of tool calls
 * - Latency: Was execution time reasonable?
 */
export declare function evalCommand(opts: {
    /** Output the raw result as JSON (for CI integration) instead of the pretty report. */
    json?: boolean;
}): void;
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* trickle eval — Score agent runs using traces already captured.
|
|
4
|
+
*
|
|
5
|
+
* Analyzes agents.jsonl, llm.jsonl, errors.jsonl to produce reliability
|
|
6
|
+
* scores without needing an LLM-as-judge. Zero cost, zero API keys.
|
|
7
|
+
*
|
|
8
|
+
* Scoring dimensions:
|
|
9
|
+
* - Completion: Did the agent finish successfully?
|
|
10
|
+
* - Error rate: How many errors during execution?
|
|
11
|
+
* - Cost efficiency: Tokens per meaningful output
|
|
12
|
+
* - Tool reliability: Success rate of tool calls
|
|
13
|
+
* - Latency: Was execution time reasonable?
|
|
14
|
+
*/
|
|
15
|
+
// ---------------------------------------------------------------------------
// CommonJS interop helpers emitted by the TypeScript compiler for this file's
// `import * as ...` / `import default` statements. Auto-generated boilerplate;
// do not edit by hand.
// ---------------------------------------------------------------------------
// Re-export property `k` of module `m` as `k2` on `o`. Uses a getter so the
// binding stays live when Object.create is available; plain copy otherwise.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// Attach a CommonJS module as the `default` export of a namespace object.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// Implement `import * as ns from "mod"` for CommonJS modules: copy every own
// key except "default" onto a fresh namespace object, then set its default.
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
// Implement `import x from "mod"`: wrap plain CommonJS exports as { default }.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
|
|
51
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
52
|
+
exports.evalCommand = evalCommand;
|
|
53
|
+
const fs = __importStar(require("fs"));
|
|
54
|
+
const path = __importStar(require("path"));
|
|
55
|
+
const chalk_1 = __importDefault(require("chalk"));
|
|
56
|
+
/**
 * Read a JSONL file and return one parsed value per usable line.
 * A missing file yields an empty array; blank lines, unparseable lines,
 * and lines whose parsed value is falsy are all skipped (matching the
 * original map/filter(Boolean) pipeline).
 */
function readJsonl(fp) {
    if (!fs.existsSync(fp)) {
        return [];
    }
    const parsed = [];
    for (const line of fs.readFileSync(fp, 'utf-8').split('\n')) {
        if (!line)
            continue; // blank line
        try {
            const value = JSON.parse(line);
            if (value)
                parsed.push(value);
        }
        catch {
            // Tolerate corrupt/truncated lines (e.g. interrupted writes).
        }
    }
    return parsed;
}
|
|
67
|
+
/**
 * Entry point for `trickle eval`: load the captured trace files from the
 * local .trickle directory (or TRICKLE_LOCAL_DIR), score the run, and either
 * dump the raw result as JSON or render the pretty report.
 */
function evalCommand(opts) {
    const dir = process.env.TRICKLE_LOCAL_DIR || path.join(process.cwd(), '.trickle');
    const [agentEvents, llmCalls, errors, mcpCalls] = ['agents.jsonl', 'llm.jsonl', 'errors.jsonl', 'mcp.jsonl']
        .map(f => readJsonl(path.join(dir, f)));
    if (agentEvents.length === 0 && llmCalls.length === 0) {
        console.log(chalk_1.default.yellow(' No agent or LLM data to evaluate. Run an agent with trickle first.'));
        return;
    }
    const result = scoreRun(agentEvents, llmCalls, errors, mcpCalls);
    if (opts.json) {
        // Machine-readable output for CI pipelines.
        console.log(JSON.stringify(result, null, 2));
        return;
    }
    // Pretty print
    console.log('');
    console.log(chalk_1.default.bold(' trickle eval'));
    console.log(chalk_1.default.gray(' ' + '─'.repeat(60)));
    let gradeColor = chalk_1.default.red;
    if (result.overallScore >= 80)
        gradeColor = chalk_1.default.green;
    else if (result.overallScore >= 60)
        gradeColor = chalk_1.default.yellow;
    console.log(` Overall: ${gradeColor(`${result.grade} (${result.overallScore}/100)`)}`);
    console.log('');
    // Dimension scores, in fixed display order.
    const dims = result.dimensions;
    const rows = [
        ['Completion', dims.completion],
        ['Errors', dims.errors],
        ['Cost Efficiency', dims.costEfficiency],
        ['Tool Reliability', dims.toolReliability],
        ['Latency', dims.latency],
    ];
    for (const [label, dim] of rows)
        printDimension(label, dim);
    console.log(chalk_1.default.gray('\n ' + '─'.repeat(60)));
    console.log(chalk_1.default.bold(' Summary'));
    console.log(` ${result.summary}`);
    if (result.recommendations.length > 0) {
        console.log(chalk_1.default.bold('\n Recommendations'));
        for (const rec of result.recommendations)
            console.log(` ${chalk_1.default.yellow('→')} ${rec}`);
    }
    console.log('');
}
|
|
108
|
+
// Print one scored dimension as: "<name> <bar> <score>/100 <detail>".
function printDimension(name, dim) {
    let color = chalk_1.default.red;
    if (dim.score >= 80)
        color = chalk_1.default.green;
    else if (dim.score >= 60)
        color = chalk_1.default.yellow;
    const scoreText = color(String(dim.score).padStart(3));
    console.log(` ${name.padEnd(18)} ${renderBar(dim.score)} ${scoreText}/100 ${chalk_1.default.gray(dim.detail)}`);
}
|
|
113
|
+
// Render a 20-cell progress bar (~5 points per cell); filled cells are
// colored by the same score bands used for the text (80/60 thresholds).
function renderBar(score) {
    const filledCells = Math.round(score / 5);
    let band = chalk_1.default.red;
    if (score >= 80)
        band = chalk_1.default.green;
    else if (score >= 60)
        band = chalk_1.default.yellow;
    return band('█'.repeat(filledCells)) + chalk_1.default.gray('░'.repeat(20 - filledCells));
}
|
|
119
|
+
/**
 * Heuristically score a run from its captured traces. Pure function.
 *
 * Derives five 0-100 dimension scores (completion, errors, cost efficiency,
 * tool reliability, latency) from the raw JSONL records, combines them with
 * fixed weights into an overall score and letter grade, and collects
 * actionable recommendations for any low-scoring dimension.
 *
 * Changes vs. previous revision: removed unused locals `toolEnds` and
 * `totalTokens` (computed but never read) and corrected the ratio comment.
 */
function scoreRun(agentEvents, llmCalls, errors, mcpCalls) {
    const recommendations = [];
    // 1. Completion score (0-100)
    const crewStarts = agentEvents.filter(e => e.event === 'crew_start' || e.event === 'chain_start');
    const crewEnds = agentEvents.filter(e => e.event === 'crew_end' || e.event === 'chain_end');
    const crewErrors = agentEvents.filter(e => e.event === 'crew_error' || e.event === 'chain_error');
    // Prefer workflow start/end pairing; fall back to LLM call success when
    // no workflows were recorded. No data at all counts as fully complete.
    const completionRate = crewStarts.length > 0
        ? Math.min(1, crewEnds.length / crewStarts.length)
        : (llmCalls.length > 0 ? (llmCalls.filter(c => !c.error).length / llmCalls.length) : 1);
    const completionScore = Math.round(completionRate * 100);
    let completionDetail = '';
    if (crewStarts.length > 0) {
        completionDetail = `${crewEnds.length}/${crewStarts.length} workflows completed`;
        if (crewErrors.length > 0)
            completionDetail += `, ${crewErrors.length} failed`;
    }
    else {
        completionDetail = `${llmCalls.filter(c => !c.error).length}/${llmCalls.length} LLM calls succeeded`;
    }
    if (completionScore < 80)
        recommendations.push('Improve completion rate — check agent error handling and tool reliability');
    // 2. Error score (0-100, inverse of error rate)
    const totalSteps = agentEvents.length + llmCalls.length + mcpCalls.length;
    const errorEvents = [
        ...agentEvents.filter(e => e.event?.includes('error')),
        ...llmCalls.filter(c => c.error),
        ...mcpCalls.filter(c => c.isError),
        ...errors,
    ];
    const errorRate = totalSteps > 0 ? errorEvents.length / totalSteps : 0;
    const errorScore = Math.round(Math.max(0, (1 - errorRate * 5)) * 100); // 20% errors = 0 score
    const errorDetail = `${errorEvents.length} errors in ${totalSteps} steps (${(errorRate * 100).toFixed(1)}%)`;
    if (errorScore < 80)
        recommendations.push(`Reduce error rate — ${errorEvents.length} errors detected. Use \`trickle why\` to investigate`);
    // 3. Cost efficiency (0-100)
    const totalCost = llmCalls.reduce((s, c) => s + (c.estimatedCostUsd || 0), 0);
    const outputTokens = llmCalls.reduce((s, c) => s + (c.outputTokens || 0), 0);
    const inputTokens = llmCalls.reduce((s, c) => s + (c.inputTokens || 0), 0);
    // Efficiency: ratio of output tokens to input tokens (higher = more efficient)
    const ioRatio = inputTokens > 0 ? outputTokens / inputTokens : 1;
    // Score: 1:1 ratio (or better) = 100, 1:2 = 50, floor of 10 at <= 1:10
    const costScore = llmCalls.length === 0 ? 100 : Math.round(Math.min(100, Math.max(10, ioRatio * 100)));
    const costDetail = llmCalls.length > 0
        ? `$${totalCost.toFixed(4)} total, ${formatTokens(inputTokens)} in → ${formatTokens(outputTokens)} out (${ioRatio.toFixed(2)} ratio)`
        : 'No LLM calls';
    if (costScore < 60 && llmCalls.length > 0)
        recommendations.push('Reduce prompt size — input tokens far exceed output. Consider summarizing context before sending');
    // 4. Tool reliability (0-100)
    const toolStarts = agentEvents.filter(e => e.event === 'tool_start');
    const toolErrors = agentEvents.filter(e => e.event === 'tool_error');
    const mcpErrors = mcpCalls.filter(c => c.isError);
    // '__list_tools' is MCP housekeeping, not a real tool invocation.
    const totalToolCalls = toolStarts.length + mcpCalls.filter(c => c.tool !== '__list_tools').length;
    const totalToolErrors = toolErrors.length + mcpErrors.length;
    const toolSuccessRate = totalToolCalls > 0 ? 1 - (totalToolErrors / totalToolCalls) : 1;
    const toolScore = Math.round(toolSuccessRate * 100);
    const toolDetail = totalToolCalls > 0
        ? `${totalToolCalls - totalToolErrors}/${totalToolCalls} tool calls succeeded`
        : 'No tool calls';
    if (toolScore < 80)
        recommendations.push(`Fix failing tools — ${totalToolErrors} tool errors detected. Check tool implementations`);
    // Check for retry loops: the same (non-empty) tool name called 3+ times
    // in a row suggests the agent is stuck.
    const toolNames = toolStarts.map(e => e.tool || '');
    let maxConsecutive = 1;
    let current = 1;
    for (let i = 1; i < toolNames.length; i++) {
        if (toolNames[i] === toolNames[i - 1] && toolNames[i]) {
            current++;
            maxConsecutive = Math.max(maxConsecutive, current);
        }
        else
            current = 1;
    }
    if (maxConsecutive >= 3)
        recommendations.push(`Tool retry loop detected (${maxConsecutive} consecutive calls). Agent may be stuck`);
    // 5. Latency score (0-100)
    const durations = [
        ...agentEvents.filter(e => e.durationMs).map(e => e.durationMs),
        ...llmCalls.filter(c => c.durationMs).map(c => c.durationMs),
    ];
    const avgLatency = durations.length > 0 ? durations.reduce((s, d) => s + d, 0) / durations.length : 0;
    const maxLatency = durations.length > 0 ? Math.max(...durations) : 0;
    // Score: <= 500ms avg = 100, then -1 point per 50ms over, floored at 20.
    const latencyScore = durations.length === 0 ? 100 :
        Math.round(Math.min(100, Math.max(20, 100 - (avgLatency - 500) / 50)));
    const latencyDetail = durations.length > 0
        ? `avg ${avgLatency.toFixed(0)}ms, max ${maxLatency.toFixed(0)}ms across ${durations.length} steps`
        : 'No timing data';
    if (latencyScore < 60)
        recommendations.push(`High latency — avg ${avgLatency.toFixed(0)}ms. Consider faster models or reducing prompt size`);
    // Overall score (weighted average)
    const weights = { completion: 0.3, errors: 0.25, costEfficiency: 0.15, toolReliability: 0.2, latency: 0.1 };
    const overallScore = Math.round(completionScore * weights.completion +
        errorScore * weights.errors +
        costScore * weights.costEfficiency +
        toolScore * weights.toolReliability +
        latencyScore * weights.latency);
    const grade = overallScore >= 90 ? 'A' : overallScore >= 80 ? 'B' : overallScore >= 70 ? 'C' :
        overallScore >= 60 ? 'D' : 'F';
    // Summary
    const parts = [];
    if (crewStarts.length > 0)
        parts.push(`${crewStarts.length} workflow(s)`);
    if (llmCalls.length > 0)
        parts.push(`${llmCalls.length} LLM calls ($${totalCost.toFixed(4)})`);
    if (totalToolCalls > 0)
        parts.push(`${totalToolCalls} tool calls`);
    if (errorEvents.length > 0)
        parts.push(`${errorEvents.length} errors`);
    const summary = parts.join(', ') || 'No agent activity detected';
    return {
        overallScore,
        grade,
        dimensions: {
            completion: { score: completionScore, detail: completionDetail },
            errors: { score: errorScore, detail: errorDetail },
            costEfficiency: { score: costScore, detail: costDetail },
            toolReliability: { score: toolScore, detail: toolDetail },
            latency: { score: latencyScore, detail: latencyDetail },
        },
        summary,
        recommendations,
    };
}
|
|
244
|
+
// Compact, human-readable token counts: 950 → "950", 1500 → "1.5K",
// 2500000 → "2.5M" (one decimal place for K/M).
function formatTokens(n) {
    if (n >= 1_000_000)
        return `${(n / 1_000_000).toFixed(1)}M`;
    if (n >= 1_000)
        return `${(n / 1_000).toFixed(1)}K`;
    return `${n}`;
}
|
|
@@ -46,6 +46,26 @@ export interface RunDiff {
|
|
|
46
46
|
newAlerts: string[];
|
|
47
47
|
resolvedAlerts: string[];
|
|
48
48
|
};
|
|
49
|
+
llm: {
|
|
50
|
+
beforeCalls: number;
|
|
51
|
+
afterCalls: number;
|
|
52
|
+
beforeCost: number;
|
|
53
|
+
afterCost: number;
|
|
54
|
+
costDelta: number;
|
|
55
|
+
beforeTokens: number;
|
|
56
|
+
afterTokens: number;
|
|
57
|
+
modelChanges: string[];
|
|
58
|
+
};
|
|
59
|
+
agents: {
|
|
60
|
+
beforeSteps: number;
|
|
61
|
+
afterSteps: number;
|
|
62
|
+
beforeTools: string[];
|
|
63
|
+
afterTools: string[];
|
|
64
|
+
newTools: string[];
|
|
65
|
+
removedTools: string[];
|
|
66
|
+
beforeErrors: number;
|
|
67
|
+
afterErrors: number;
|
|
68
|
+
};
|
|
49
69
|
verdict: 'improved' | 'regressed' | 'unchanged' | 'mixed';
|
|
50
70
|
}
|
|
51
71
|
export declare function diffRuns(beforeDir: string, afterDir: string): RunDiff;
|
|
@@ -81,7 +81,21 @@ function collectRunData(dir) {
|
|
|
81
81
|
}
|
|
82
82
|
const errorMessages = new Set(errors.map((e) => (e.message || '').substring(0, 100)));
|
|
83
83
|
const alertMessages = new Set(alerts.map((a) => (a.message || '').substring(0, 100)));
|
|
84
|
-
|
|
84
|
+
// LLM data
|
|
85
|
+
const llmCalls = readJsonl(path.join(dir, 'llm.jsonl'));
|
|
86
|
+
const llmCost = llmCalls.reduce((s, c) => s + (c.estimatedCostUsd || 0), 0);
|
|
87
|
+
const llmTokens = llmCalls.reduce((s, c) => s + (c.totalTokens || 0), 0);
|
|
88
|
+
const llmModels = new Set(llmCalls.map((c) => `${c.provider}/${c.model}`));
|
|
89
|
+
// Agent data
|
|
90
|
+
const agentEvents = readJsonl(path.join(dir, 'agents.jsonl'));
|
|
91
|
+
const agentTools = new Set(agentEvents.filter((e) => e.event === 'tool_start' || e.event === 'tool_end').map((e) => e.tool || ''));
|
|
92
|
+
const agentErrors = agentEvents.filter((e) => e.event?.includes('error'));
|
|
93
|
+
return {
|
|
94
|
+
funcMap, queryPatterns, errorMessages, alertMessages,
|
|
95
|
+
queryCount: queries.length, errorCount: errors.length, alertCount: alerts.length,
|
|
96
|
+
llmCalls: llmCalls.length, llmCost, llmTokens, llmModels,
|
|
97
|
+
agentEvents: agentEvents.length, agentTools, agentErrors: agentErrors.length,
|
|
98
|
+
};
|
|
85
99
|
}
|
|
86
100
|
function diffRuns(beforeDir, afterDir) {
|
|
87
101
|
const before = collectRunData(beforeDir);
|
|
@@ -116,9 +130,29 @@ function diffRuns(beforeDir, afterDir) {
|
|
|
116
130
|
// Alerts
|
|
117
131
|
const newAlerts = [...after.alertMessages].filter(a => !before.alertMessages.has(a));
|
|
118
132
|
const resolvedAlerts = [...before.alertMessages].filter(a => !after.alertMessages.has(a));
|
|
133
|
+
// LLM comparison
|
|
134
|
+
const costDelta = after.llmCost - before.llmCost;
|
|
135
|
+
const afterModels = [...after.llmModels];
|
|
136
|
+
const beforeModels = [...before.llmModels];
|
|
137
|
+
const modelChanges = [];
|
|
138
|
+
for (const m of afterModels)
|
|
139
|
+
if (!before.llmModels.has(m))
|
|
140
|
+
modelChanges.push(`+ ${m}`);
|
|
141
|
+
for (const m of beforeModels)
|
|
142
|
+
if (!after.llmModels.has(m))
|
|
143
|
+
modelChanges.push(`- ${m}`);
|
|
144
|
+
// Agent comparison
|
|
145
|
+
const afterTools = [...after.agentTools];
|
|
146
|
+
const beforeTools = [...before.agentTools];
|
|
147
|
+
const newAgentTools = afterTools.filter(t => !before.agentTools.has(t));
|
|
148
|
+
const removedAgentTools = beforeTools.filter(t => !after.agentTools.has(t));
|
|
119
149
|
// Verdict
|
|
120
|
-
const improvements = resolvedErrors.length + resolvedAlerts.length + fasterBy.length +
|
|
121
|
-
|
|
150
|
+
const improvements = resolvedErrors.length + resolvedAlerts.length + fasterBy.length +
|
|
151
|
+
(nPlusOneAfter < nPlusOneBefore ? 1 : 0) + (costDelta < -0.001 ? 1 : 0) +
|
|
152
|
+
(after.agentErrors < before.agentErrors ? 1 : 0);
|
|
153
|
+
const regressions = newErrors.length + newAlerts.length + slowerBy.length +
|
|
154
|
+
(nPlusOneAfter > nPlusOneBefore ? 1 : 0) + (costDelta > before.llmCost * 0.2 ? 1 : 0) +
|
|
155
|
+
(after.agentErrors > before.agentErrors ? 1 : 0);
|
|
122
156
|
const verdict = improvements > 0 && regressions === 0 ? 'improved' :
|
|
123
157
|
regressions > 0 && improvements === 0 ? 'regressed' :
|
|
124
158
|
improvements > 0 && regressions > 0 ? 'mixed' : 'unchanged';
|
|
@@ -127,6 +161,19 @@ function diffRuns(beforeDir, afterDir) {
|
|
|
127
161
|
queries: { beforeTotal: before.queryCount, afterTotal: after.queryCount, newPatterns: newPatterns.slice(0, 5), removedPatterns: removedPatterns.slice(0, 5), nPlusOneBefore, nPlusOneAfter },
|
|
128
162
|
errors: { beforeCount: before.errorCount, afterCount: after.errorCount, newErrors, resolvedErrors },
|
|
129
163
|
alerts: { beforeCount: before.alertCount, afterCount: after.alertCount, newAlerts, resolvedAlerts },
|
|
164
|
+
llm: {
|
|
165
|
+
beforeCalls: before.llmCalls, afterCalls: after.llmCalls,
|
|
166
|
+
beforeCost: Math.round(before.llmCost * 10000) / 10000, afterCost: Math.round(after.llmCost * 10000) / 10000,
|
|
167
|
+
costDelta: Math.round(costDelta * 10000) / 10000,
|
|
168
|
+
beforeTokens: before.llmTokens, afterTokens: after.llmTokens,
|
|
169
|
+
modelChanges,
|
|
170
|
+
},
|
|
171
|
+
agents: {
|
|
172
|
+
beforeSteps: before.agentEvents, afterSteps: after.agentEvents,
|
|
173
|
+
beforeTools, afterTools,
|
|
174
|
+
newTools: newAgentTools, removedTools: removedAgentTools,
|
|
175
|
+
beforeErrors: before.agentErrors, afterErrors: after.agentErrors,
|
|
176
|
+
},
|
|
130
177
|
verdict,
|
|
131
178
|
};
|
|
132
179
|
}
|
|
@@ -141,7 +188,7 @@ function runDiffCommand(opts) {
|
|
|
141
188
|
}
|
|
142
189
|
if (!fs.existsSync(snapshotDir))
|
|
143
190
|
fs.mkdirSync(snapshotDir, { recursive: true });
|
|
144
|
-
for (const f of ['observations.jsonl', 'queries.jsonl', 'errors.jsonl', 'alerts.jsonl', 'calltrace.jsonl']) {
|
|
191
|
+
for (const f of ['observations.jsonl', 'queries.jsonl', 'errors.jsonl', 'alerts.jsonl', 'calltrace.jsonl', 'llm.jsonl', 'agents.jsonl', 'mcp.jsonl']) {
|
|
145
192
|
const src = path.join(trickleDir, f);
|
|
146
193
|
if (fs.existsSync(src))
|
|
147
194
|
fs.copyFileSync(src, path.join(snapshotDir, f));
|
|
@@ -188,6 +235,27 @@ function runDiffCommand(opts) {
|
|
|
188
235
|
console.log(chalk_1.default.red(` New errors: ${diff.errors.newErrors.join(', ').substring(0, 80)}`));
|
|
189
236
|
if (diff.errors.resolvedErrors.length > 0)
|
|
190
237
|
console.log(chalk_1.default.green(` Resolved: ${diff.errors.resolvedErrors.join(', ').substring(0, 80)}`));
|
|
238
|
+
// LLM diff
|
|
239
|
+
if (diff.llm.beforeCalls > 0 || diff.llm.afterCalls > 0) {
|
|
240
|
+
console.log(` LLM calls: ${diff.llm.beforeCalls} → ${diff.llm.afterCalls}`);
|
|
241
|
+
const costColor = diff.llm.costDelta > 0 ? chalk_1.default.red : diff.llm.costDelta < 0 ? chalk_1.default.green : chalk_1.default.gray;
|
|
242
|
+
const costSign = diff.llm.costDelta > 0 ? '+' : '';
|
|
243
|
+
console.log(` LLM cost: $${diff.llm.beforeCost} → $${diff.llm.afterCost} (${costColor(costSign + '$' + diff.llm.costDelta.toFixed(4))})`);
|
|
244
|
+
if (diff.llm.modelChanges.length > 0)
|
|
245
|
+
console.log(chalk_1.default.cyan(` Model changes: ${diff.llm.modelChanges.join(', ')}`));
|
|
246
|
+
}
|
|
247
|
+
// Agent diff
|
|
248
|
+
if (diff.agents.beforeSteps > 0 || diff.agents.afterSteps > 0) {
|
|
249
|
+
console.log(` Agent steps: ${diff.agents.beforeSteps} → ${diff.agents.afterSteps}`);
|
|
250
|
+
if (diff.agents.newTools.length > 0)
|
|
251
|
+
console.log(chalk_1.default.green(` + New tools: ${diff.agents.newTools.join(', ')}`));
|
|
252
|
+
if (diff.agents.removedTools.length > 0)
|
|
253
|
+
console.log(chalk_1.default.red(` - Removed tools: ${diff.agents.removedTools.join(', ')}`));
|
|
254
|
+
if (diff.agents.beforeErrors !== diff.agents.afterErrors) {
|
|
255
|
+
const errColor = diff.agents.afterErrors > diff.agents.beforeErrors ? chalk_1.default.red : chalk_1.default.green;
|
|
256
|
+
console.log(errColor(` Agent errors: ${diff.agents.beforeErrors} → ${diff.agents.afterErrors}`));
|
|
257
|
+
}
|
|
258
|
+
}
|
|
191
259
|
console.log(chalk_1.default.gray(' ' + '─'.repeat(50)));
|
|
192
260
|
console.log('');
|
|
193
261
|
}
|
package/dist/index.js
CHANGED
|
@@ -913,6 +913,15 @@ program
|
|
|
913
913
|
const { whyCommand } = await Promise.resolve().then(() => __importStar(require("./commands/why")));
|
|
914
914
|
whyCommand(query, opts);
|
|
915
915
|
});
|
|
916
|
+
// trickle eval
|
|
917
|
+
program
|
|
918
|
+
.command("eval")
|
|
919
|
+
.description("Score agent runs on reliability — completion, errors, cost efficiency, tool reliability, latency")
|
|
920
|
+
.option("--json", "Output raw JSON for CI integration")
|
|
921
|
+
.action(async (opts) => {
|
|
922
|
+
const { evalCommand } = await Promise.resolve().then(() => __importStar(require("./commands/eval")));
|
|
923
|
+
evalCommand(opts);
|
|
924
|
+
});
|
|
916
925
|
// trickle cost-report
|
|
917
926
|
program
|
|
918
927
|
.command("cost-report")
|
package/package.json
CHANGED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* trickle eval — Score agent runs using traces already captured.
|
|
3
|
+
*
|
|
4
|
+
* Analyzes agents.jsonl, llm.jsonl, errors.jsonl to produce reliability
|
|
5
|
+
* scores without needing an LLM-as-judge. Zero cost, zero API keys.
|
|
6
|
+
*
|
|
7
|
+
* Scoring dimensions:
|
|
8
|
+
* - Completion: Did the agent finish successfully?
|
|
9
|
+
* - Error rate: How many errors during execution?
|
|
10
|
+
* - Cost efficiency: Tokens per meaningful output
|
|
11
|
+
* - Tool reliability: Success rate of tool calls
|
|
12
|
+
* - Latency: Was execution time reasonable?
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import * as fs from 'fs';
|
|
16
|
+
import * as path from 'path';
|
|
17
|
+
import chalk from 'chalk';
|
|
18
|
+
|
|
19
|
+
/**
 * Aggregate reliability report produced by `scoreRun` and emitted verbatim
 * by `trickle eval --json`.
 */
interface EvalResult {
  overallScore: number; // weighted 0-100 composite of the five dimensions
  grade: string; // letter grade A-F derived from overallScore
  dimensions: {
    // Each dimension: 0-100 score plus a short human-readable explanation.
    completion: { score: number; detail: string };
    errors: { score: number; detail: string };
    costEfficiency: { score: number; detail: string };
    toolReliability: { score: number; detail: string };
    latency: { score: number; detail: string };
  };
  summary: string; // one-line description of the run (workflows, calls, errors)
  recommendations: string[]; // actionable follow-ups for low-scoring dimensions
}
|
|
32
|
+
|
|
33
|
+
function readJsonl(fp: string): any[] {
|
|
34
|
+
if (!fs.existsSync(fp)) return [];
|
|
35
|
+
return fs.readFileSync(fp, 'utf-8').split('\n').filter(Boolean)
|
|
36
|
+
.map(l => { try { return JSON.parse(l); } catch { return null; } }).filter(Boolean);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
export function evalCommand(opts: { json?: boolean }): void {
|
|
40
|
+
const dir = process.env.TRICKLE_LOCAL_DIR || path.join(process.cwd(), '.trickle');
|
|
41
|
+
const agentEvents = readJsonl(path.join(dir, 'agents.jsonl'));
|
|
42
|
+
const llmCalls = readJsonl(path.join(dir, 'llm.jsonl'));
|
|
43
|
+
const errors = readJsonl(path.join(dir, 'errors.jsonl'));
|
|
44
|
+
const mcpCalls = readJsonl(path.join(dir, 'mcp.jsonl'));
|
|
45
|
+
|
|
46
|
+
if (agentEvents.length === 0 && llmCalls.length === 0) {
|
|
47
|
+
console.log(chalk.yellow(' No agent or LLM data to evaluate. Run an agent with trickle first.'));
|
|
48
|
+
return;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
const result = scoreRun(agentEvents, llmCalls, errors, mcpCalls);
|
|
52
|
+
|
|
53
|
+
if (opts.json) {
|
|
54
|
+
console.log(JSON.stringify(result, null, 2));
|
|
55
|
+
return;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// Pretty print
|
|
59
|
+
console.log('');
|
|
60
|
+
console.log(chalk.bold(' trickle eval'));
|
|
61
|
+
console.log(chalk.gray(' ' + '─'.repeat(60)));
|
|
62
|
+
|
|
63
|
+
const gradeColor = result.overallScore >= 80 ? chalk.green :
|
|
64
|
+
result.overallScore >= 60 ? chalk.yellow : chalk.red;
|
|
65
|
+
console.log(` Overall: ${gradeColor(result.grade + ' (' + result.overallScore + '/100)')}`);
|
|
66
|
+
console.log('');
|
|
67
|
+
|
|
68
|
+
// Dimension scores
|
|
69
|
+
const dims = result.dimensions;
|
|
70
|
+
printDimension('Completion', dims.completion);
|
|
71
|
+
printDimension('Errors', dims.errors);
|
|
72
|
+
printDimension('Cost Efficiency', dims.costEfficiency);
|
|
73
|
+
printDimension('Tool Reliability', dims.toolReliability);
|
|
74
|
+
printDimension('Latency', dims.latency);
|
|
75
|
+
|
|
76
|
+
console.log(chalk.gray('\n ' + '─'.repeat(60)));
|
|
77
|
+
console.log(chalk.bold(' Summary'));
|
|
78
|
+
console.log(` ${result.summary}`);
|
|
79
|
+
|
|
80
|
+
if (result.recommendations.length > 0) {
|
|
81
|
+
console.log(chalk.bold('\n Recommendations'));
|
|
82
|
+
for (const rec of result.recommendations) {
|
|
83
|
+
console.log(` ${chalk.yellow('→')} ${rec}`);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
console.log('');
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
function printDimension(name: string, dim: { score: number; detail: string }): void {
|
|
91
|
+
const bar = renderBar(dim.score);
|
|
92
|
+
const color = dim.score >= 80 ? chalk.green : dim.score >= 60 ? chalk.yellow : chalk.red;
|
|
93
|
+
console.log(` ${name.padEnd(18)} ${bar} ${color(String(dim.score).padStart(3))}/100 ${chalk.gray(dim.detail)}`);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
function renderBar(score: number): string {
|
|
97
|
+
const filled = Math.round(score / 5);
|
|
98
|
+
const empty = 20 - filled;
|
|
99
|
+
const color = score >= 80 ? chalk.green : score >= 60 ? chalk.yellow : chalk.red;
|
|
100
|
+
return color('█'.repeat(filled)) + chalk.gray('░'.repeat(empty));
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
function scoreRun(
|
|
104
|
+
agentEvents: any[], llmCalls: any[], errors: any[], mcpCalls: any[],
|
|
105
|
+
): EvalResult {
|
|
106
|
+
const recommendations: string[] = [];
|
|
107
|
+
|
|
108
|
+
// 1. Completion score (0-100)
|
|
109
|
+
const crewStarts = agentEvents.filter(e => e.event === 'crew_start' || e.event === 'chain_start');
|
|
110
|
+
const crewEnds = agentEvents.filter(e => e.event === 'crew_end' || e.event === 'chain_end');
|
|
111
|
+
const crewErrors = agentEvents.filter(e => e.event === 'crew_error' || e.event === 'chain_error');
|
|
112
|
+
const completionRate = crewStarts.length > 0
|
|
113
|
+
? Math.min(1, crewEnds.length / crewStarts.length)
|
|
114
|
+
: (llmCalls.length > 0 ? (llmCalls.filter(c => !c.error).length / llmCalls.length) : 1);
|
|
115
|
+
const completionScore = Math.round(completionRate * 100);
|
|
116
|
+
let completionDetail = '';
|
|
117
|
+
if (crewStarts.length > 0) {
|
|
118
|
+
completionDetail = `${crewEnds.length}/${crewStarts.length} workflows completed`;
|
|
119
|
+
if (crewErrors.length > 0) completionDetail += `, ${crewErrors.length} failed`;
|
|
120
|
+
} else {
|
|
121
|
+
completionDetail = `${llmCalls.filter(c => !c.error).length}/${llmCalls.length} LLM calls succeeded`;
|
|
122
|
+
}
|
|
123
|
+
if (completionScore < 80) recommendations.push('Improve completion rate — check agent error handling and tool reliability');
|
|
124
|
+
|
|
125
|
+
// 2. Error score (0-100, inverse of error rate)
|
|
126
|
+
const totalSteps = agentEvents.length + llmCalls.length + mcpCalls.length;
|
|
127
|
+
const errorEvents = [
|
|
128
|
+
...agentEvents.filter(e => e.event?.includes('error')),
|
|
129
|
+
...llmCalls.filter(c => c.error),
|
|
130
|
+
...mcpCalls.filter(c => c.isError),
|
|
131
|
+
...errors,
|
|
132
|
+
];
|
|
133
|
+
const errorRate = totalSteps > 0 ? errorEvents.length / totalSteps : 0;
|
|
134
|
+
const errorScore = Math.round(Math.max(0, (1 - errorRate * 5)) * 100); // 20% errors = 0 score
|
|
135
|
+
const errorDetail = `${errorEvents.length} errors in ${totalSteps} steps (${(errorRate * 100).toFixed(1)}%)`;
|
|
136
|
+
if (errorScore < 80) recommendations.push(`Reduce error rate — ${errorEvents.length} errors detected. Use \`trickle why\` to investigate`);
|
|
137
|
+
|
|
138
|
+
// 3. Cost efficiency (0-100)
|
|
139
|
+
const totalCost = llmCalls.reduce((s: number, c: any) => s + (c.estimatedCostUsd || 0), 0);
|
|
140
|
+
const totalTokens = llmCalls.reduce((s: number, c: any) => s + (c.totalTokens || 0), 0);
|
|
141
|
+
const outputTokens = llmCalls.reduce((s: number, c: any) => s + (c.outputTokens || 0), 0);
|
|
142
|
+
const inputTokens = llmCalls.reduce((s: number, c: any) => s + (c.inputTokens || 0), 0);
|
|
143
|
+
// Efficiency: ratio of output tokens to input tokens (higher = more efficient)
|
|
144
|
+
const ioRatio = inputTokens > 0 ? outputTokens / inputTokens : 1;
|
|
145
|
+
// Score: 1:1 ratio = 100, 1:10 ratio = 50, 1:100 = 10
|
|
146
|
+
const costScore = llmCalls.length === 0 ? 100 : Math.round(Math.min(100, Math.max(10, ioRatio * 100)));
|
|
147
|
+
const costDetail = llmCalls.length > 0
|
|
148
|
+
? `$${totalCost.toFixed(4)} total, ${formatTokens(inputTokens)} in → ${formatTokens(outputTokens)} out (${ioRatio.toFixed(2)} ratio)`
|
|
149
|
+
: 'No LLM calls';
|
|
150
|
+
if (costScore < 60 && llmCalls.length > 0) recommendations.push('Reduce prompt size — input tokens far exceed output. Consider summarizing context before sending');
|
|
151
|
+
|
|
152
|
+
// 4. Tool reliability (0-100)
|
|
153
|
+
const toolStarts = agentEvents.filter(e => e.event === 'tool_start');
|
|
154
|
+
const toolEnds = agentEvents.filter(e => e.event === 'tool_end');
|
|
155
|
+
const toolErrors = agentEvents.filter(e => e.event === 'tool_error');
|
|
156
|
+
const mcpErrors = mcpCalls.filter(c => c.isError);
|
|
157
|
+
const totalToolCalls = toolStarts.length + mcpCalls.filter(c => c.tool !== '__list_tools').length;
|
|
158
|
+
const totalToolErrors = toolErrors.length + mcpErrors.length;
|
|
159
|
+
const toolSuccessRate = totalToolCalls > 0 ? 1 - (totalToolErrors / totalToolCalls) : 1;
|
|
160
|
+
const toolScore = Math.round(toolSuccessRate * 100);
|
|
161
|
+
const toolDetail = totalToolCalls > 0
|
|
162
|
+
? `${totalToolCalls - totalToolErrors}/${totalToolCalls} tool calls succeeded`
|
|
163
|
+
: 'No tool calls';
|
|
164
|
+
if (toolScore < 80) recommendations.push(`Fix failing tools — ${totalToolErrors} tool errors detected. Check tool implementations`);
|
|
165
|
+
|
|
166
|
+
// Check for retry loops
|
|
167
|
+
const toolNames = toolStarts.map(e => e.tool || '');
|
|
168
|
+
let maxConsecutive = 1;
|
|
169
|
+
let current = 1;
|
|
170
|
+
for (let i = 1; i < toolNames.length; i++) {
|
|
171
|
+
if (toolNames[i] === toolNames[i - 1] && toolNames[i]) { current++; maxConsecutive = Math.max(maxConsecutive, current); }
|
|
172
|
+
else current = 1;
|
|
173
|
+
}
|
|
174
|
+
if (maxConsecutive >= 3) recommendations.push(`Tool retry loop detected (${maxConsecutive} consecutive calls). Agent may be stuck`);
|
|
175
|
+
|
|
176
|
+
// 5. Latency score (0-100)
|
|
177
|
+
const durations = [
|
|
178
|
+
...agentEvents.filter(e => e.durationMs).map(e => e.durationMs),
|
|
179
|
+
...llmCalls.filter(c => c.durationMs).map(c => c.durationMs),
|
|
180
|
+
];
|
|
181
|
+
const avgLatency = durations.length > 0 ? durations.reduce((s: number, d: number) => s + d, 0) / durations.length : 0;
|
|
182
|
+
const maxLatency = durations.length > 0 ? Math.max(...durations) : 0;
|
|
183
|
+
// Score: < 500ms avg = 100, 500-2000 = linear, > 5000ms = 20
|
|
184
|
+
const latencyScore = durations.length === 0 ? 100 :
|
|
185
|
+
Math.round(Math.min(100, Math.max(20, 100 - (avgLatency - 500) / 50)));
|
|
186
|
+
const latencyDetail = durations.length > 0
|
|
187
|
+
? `avg ${avgLatency.toFixed(0)}ms, max ${maxLatency.toFixed(0)}ms across ${durations.length} steps`
|
|
188
|
+
: 'No timing data';
|
|
189
|
+
if (latencyScore < 60) recommendations.push(`High latency — avg ${avgLatency.toFixed(0)}ms. Consider faster models or reducing prompt size`);
|
|
190
|
+
|
|
191
|
+
// Overall score (weighted average)
|
|
192
|
+
const weights = { completion: 0.3, errors: 0.25, costEfficiency: 0.15, toolReliability: 0.2, latency: 0.1 };
|
|
193
|
+
const overallScore = Math.round(
|
|
194
|
+
completionScore * weights.completion +
|
|
195
|
+
errorScore * weights.errors +
|
|
196
|
+
costScore * weights.costEfficiency +
|
|
197
|
+
toolScore * weights.toolReliability +
|
|
198
|
+
latencyScore * weights.latency
|
|
199
|
+
);
|
|
200
|
+
|
|
201
|
+
const grade = overallScore >= 90 ? 'A' : overallScore >= 80 ? 'B' : overallScore >= 70 ? 'C' :
|
|
202
|
+
overallScore >= 60 ? 'D' : 'F';
|
|
203
|
+
|
|
204
|
+
// Summary
|
|
205
|
+
const parts: string[] = [];
|
|
206
|
+
if (crewStarts.length > 0) parts.push(`${crewStarts.length} workflow(s)`);
|
|
207
|
+
if (llmCalls.length > 0) parts.push(`${llmCalls.length} LLM calls ($${totalCost.toFixed(4)})`);
|
|
208
|
+
if (totalToolCalls > 0) parts.push(`${totalToolCalls} tool calls`);
|
|
209
|
+
if (errorEvents.length > 0) parts.push(`${errorEvents.length} errors`);
|
|
210
|
+
const summary = parts.join(', ') || 'No agent activity detected';
|
|
211
|
+
|
|
212
|
+
return {
|
|
213
|
+
overallScore,
|
|
214
|
+
grade,
|
|
215
|
+
dimensions: {
|
|
216
|
+
completion: { score: completionScore, detail: completionDetail },
|
|
217
|
+
errors: { score: errorScore, detail: errorDetail },
|
|
218
|
+
costEfficiency: { score: costScore, detail: costDetail },
|
|
219
|
+
toolReliability: { score: toolScore, detail: toolDetail },
|
|
220
|
+
latency: { score: latencyScore, detail: latencyDetail },
|
|
221
|
+
},
|
|
222
|
+
summary,
|
|
223
|
+
recommendations,
|
|
224
|
+
};
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
function formatTokens(n: number): string {
|
|
228
|
+
if (n >= 1_000_000) return (n / 1_000_000).toFixed(1) + 'M';
|
|
229
|
+
if (n >= 1_000) return (n / 1_000).toFixed(1) + 'K';
|
|
230
|
+
return String(n);
|
|
231
|
+
}
|
package/src/commands/run-diff.ts
CHANGED
|
@@ -48,6 +48,26 @@ export interface RunDiff {
|
|
|
48
48
|
newAlerts: string[];
|
|
49
49
|
resolvedAlerts: string[];
|
|
50
50
|
};
|
|
51
|
+
llm: {
|
|
52
|
+
beforeCalls: number;
|
|
53
|
+
afterCalls: number;
|
|
54
|
+
beforeCost: number;
|
|
55
|
+
afterCost: number;
|
|
56
|
+
costDelta: number;
|
|
57
|
+
beforeTokens: number;
|
|
58
|
+
afterTokens: number;
|
|
59
|
+
modelChanges: string[];
|
|
60
|
+
};
|
|
61
|
+
agents: {
|
|
62
|
+
beforeSteps: number;
|
|
63
|
+
afterSteps: number;
|
|
64
|
+
beforeTools: string[];
|
|
65
|
+
afterTools: string[];
|
|
66
|
+
newTools: string[];
|
|
67
|
+
removedTools: string[];
|
|
68
|
+
beforeErrors: number;
|
|
69
|
+
afterErrors: number;
|
|
70
|
+
};
|
|
51
71
|
verdict: 'improved' | 'regressed' | 'unchanged' | 'mixed';
|
|
52
72
|
}
|
|
53
73
|
|
|
@@ -72,7 +92,23 @@ function collectRunData(dir: string) {
|
|
|
72
92
|
const errorMessages = new Set(errors.map((e: any) => (e.message || '').substring(0, 100)));
|
|
73
93
|
const alertMessages = new Set(alerts.map((a: any) => (a.message || '').substring(0, 100)));
|
|
74
94
|
|
|
75
|
-
|
|
95
|
+
// LLM data
|
|
96
|
+
const llmCalls = readJsonl(path.join(dir, 'llm.jsonl'));
|
|
97
|
+
const llmCost = llmCalls.reduce((s: number, c: any) => s + (c.estimatedCostUsd || 0), 0);
|
|
98
|
+
const llmTokens = llmCalls.reduce((s: number, c: any) => s + (c.totalTokens || 0), 0);
|
|
99
|
+
const llmModels = new Set(llmCalls.map((c: any) => `${c.provider}/${c.model}`));
|
|
100
|
+
|
|
101
|
+
// Agent data
|
|
102
|
+
const agentEvents = readJsonl(path.join(dir, 'agents.jsonl'));
|
|
103
|
+
const agentTools = new Set(agentEvents.filter((e: any) => e.event === 'tool_start' || e.event === 'tool_end').map((e: any) => e.tool || ''));
|
|
104
|
+
const agentErrors = agentEvents.filter((e: any) => e.event?.includes('error'));
|
|
105
|
+
|
|
106
|
+
return {
|
|
107
|
+
funcMap, queryPatterns, errorMessages, alertMessages,
|
|
108
|
+
queryCount: queries.length, errorCount: errors.length, alertCount: alerts.length,
|
|
109
|
+
llmCalls: llmCalls.length, llmCost, llmTokens, llmModels,
|
|
110
|
+
agentEvents: agentEvents.length, agentTools, agentErrors: agentErrors.length,
|
|
111
|
+
};
|
|
76
112
|
}
|
|
77
113
|
|
|
78
114
|
export function diffRuns(beforeDir: string, afterDir: string): RunDiff {
|
|
@@ -112,9 +148,27 @@ export function diffRuns(beforeDir: string, afterDir: string): RunDiff {
|
|
|
112
148
|
const newAlerts = [...after.alertMessages].filter(a => !before.alertMessages.has(a));
|
|
113
149
|
const resolvedAlerts = [...before.alertMessages].filter(a => !after.alertMessages.has(a));
|
|
114
150
|
|
|
151
|
+
// LLM comparison
|
|
152
|
+
const costDelta = after.llmCost - before.llmCost;
|
|
153
|
+
const afterModels = [...after.llmModels];
|
|
154
|
+
const beforeModels = [...before.llmModels];
|
|
155
|
+
const modelChanges: string[] = [];
|
|
156
|
+
for (const m of afterModels) if (!before.llmModels.has(m)) modelChanges.push(`+ ${m}`);
|
|
157
|
+
for (const m of beforeModels) if (!after.llmModels.has(m)) modelChanges.push(`- ${m}`);
|
|
158
|
+
|
|
159
|
+
// Agent comparison
|
|
160
|
+
const afterTools = [...after.agentTools];
|
|
161
|
+
const beforeTools = [...before.agentTools];
|
|
162
|
+
const newAgentTools = afterTools.filter(t => !before.agentTools.has(t));
|
|
163
|
+
const removedAgentTools = beforeTools.filter(t => !after.agentTools.has(t));
|
|
164
|
+
|
|
115
165
|
// Verdict
|
|
116
|
-
const improvements = resolvedErrors.length + resolvedAlerts.length + fasterBy.length +
|
|
117
|
-
|
|
166
|
+
const improvements = resolvedErrors.length + resolvedAlerts.length + fasterBy.length +
|
|
167
|
+
(nPlusOneAfter < nPlusOneBefore ? 1 : 0) + (costDelta < -0.001 ? 1 : 0) +
|
|
168
|
+
(after.agentErrors < before.agentErrors ? 1 : 0);
|
|
169
|
+
const regressions = newErrors.length + newAlerts.length + slowerBy.length +
|
|
170
|
+
(nPlusOneAfter > nPlusOneBefore ? 1 : 0) + (costDelta > before.llmCost * 0.2 ? 1 : 0) +
|
|
171
|
+
(after.agentErrors > before.agentErrors ? 1 : 0);
|
|
118
172
|
const verdict: RunDiff['verdict'] = improvements > 0 && regressions === 0 ? 'improved' :
|
|
119
173
|
regressions > 0 && improvements === 0 ? 'regressed' :
|
|
120
174
|
improvements > 0 && regressions > 0 ? 'mixed' : 'unchanged';
|
|
@@ -124,6 +178,19 @@ export function diffRuns(beforeDir: string, afterDir: string): RunDiff {
|
|
|
124
178
|
queries: { beforeTotal: before.queryCount, afterTotal: after.queryCount, newPatterns: newPatterns.slice(0, 5), removedPatterns: removedPatterns.slice(0, 5), nPlusOneBefore, nPlusOneAfter },
|
|
125
179
|
errors: { beforeCount: before.errorCount, afterCount: after.errorCount, newErrors, resolvedErrors },
|
|
126
180
|
alerts: { beforeCount: before.alertCount, afterCount: after.alertCount, newAlerts, resolvedAlerts },
|
|
181
|
+
llm: {
|
|
182
|
+
beforeCalls: before.llmCalls, afterCalls: after.llmCalls,
|
|
183
|
+
beforeCost: Math.round(before.llmCost * 10000) / 10000, afterCost: Math.round(after.llmCost * 10000) / 10000,
|
|
184
|
+
costDelta: Math.round(costDelta * 10000) / 10000,
|
|
185
|
+
beforeTokens: before.llmTokens, afterTokens: after.llmTokens,
|
|
186
|
+
modelChanges,
|
|
187
|
+
},
|
|
188
|
+
agents: {
|
|
189
|
+
beforeSteps: before.agentEvents, afterSteps: after.agentEvents,
|
|
190
|
+
beforeTools, afterTools,
|
|
191
|
+
newTools: newAgentTools, removedTools: removedAgentTools,
|
|
192
|
+
beforeErrors: before.agentErrors, afterErrors: after.agentErrors,
|
|
193
|
+
},
|
|
127
194
|
verdict,
|
|
128
195
|
};
|
|
129
196
|
}
|
|
@@ -146,7 +213,7 @@ export function runDiffCommand(opts: DiffOptions): void {
|
|
|
146
213
|
return;
|
|
147
214
|
}
|
|
148
215
|
if (!fs.existsSync(snapshotDir)) fs.mkdirSync(snapshotDir, { recursive: true });
|
|
149
|
-
for (const f of ['observations.jsonl', 'queries.jsonl', 'errors.jsonl', 'alerts.jsonl', 'calltrace.jsonl']) {
|
|
216
|
+
for (const f of ['observations.jsonl', 'queries.jsonl', 'errors.jsonl', 'alerts.jsonl', 'calltrace.jsonl', 'llm.jsonl', 'agents.jsonl', 'mcp.jsonl']) {
|
|
150
217
|
const src = path.join(trickleDir, f);
|
|
151
218
|
if (fs.existsSync(src)) fs.copyFileSync(src, path.join(snapshotDir, f));
|
|
152
219
|
}
|
|
@@ -199,6 +266,26 @@ export function runDiffCommand(opts: DiffOptions): void {
|
|
|
199
266
|
if (diff.errors.newErrors.length > 0) console.log(chalk.red(` New errors: ${diff.errors.newErrors.join(', ').substring(0, 80)}`));
|
|
200
267
|
if (diff.errors.resolvedErrors.length > 0) console.log(chalk.green(` Resolved: ${diff.errors.resolvedErrors.join(', ').substring(0, 80)}`));
|
|
201
268
|
|
|
269
|
+
// LLM diff
|
|
270
|
+
if (diff.llm.beforeCalls > 0 || diff.llm.afterCalls > 0) {
|
|
271
|
+
console.log(` LLM calls: ${diff.llm.beforeCalls} → ${diff.llm.afterCalls}`);
|
|
272
|
+
const costColor = diff.llm.costDelta > 0 ? chalk.red : diff.llm.costDelta < 0 ? chalk.green : chalk.gray;
|
|
273
|
+
const costSign = diff.llm.costDelta > 0 ? '+' : '';
|
|
274
|
+
console.log(` LLM cost: $${diff.llm.beforeCost} → $${diff.llm.afterCost} (${costColor(costSign + '$' + diff.llm.costDelta.toFixed(4))})`);
|
|
275
|
+
if (diff.llm.modelChanges.length > 0) console.log(chalk.cyan(` Model changes: ${diff.llm.modelChanges.join(', ')}`));
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
// Agent diff
|
|
279
|
+
if (diff.agents.beforeSteps > 0 || diff.agents.afterSteps > 0) {
|
|
280
|
+
console.log(` Agent steps: ${diff.agents.beforeSteps} → ${diff.agents.afterSteps}`);
|
|
281
|
+
if (diff.agents.newTools.length > 0) console.log(chalk.green(` + New tools: ${diff.agents.newTools.join(', ')}`));
|
|
282
|
+
if (diff.agents.removedTools.length > 0) console.log(chalk.red(` - Removed tools: ${diff.agents.removedTools.join(', ')}`));
|
|
283
|
+
if (diff.agents.beforeErrors !== diff.agents.afterErrors) {
|
|
284
|
+
const errColor = diff.agents.afterErrors > diff.agents.beforeErrors ? chalk.red : chalk.green;
|
|
285
|
+
console.log(errColor(` Agent errors: ${diff.agents.beforeErrors} → ${diff.agents.afterErrors}`));
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
202
289
|
console.log(chalk.gray(' ' + '─'.repeat(50)));
|
|
203
290
|
console.log('');
|
|
204
291
|
}
|
package/src/index.ts
CHANGED
|
@@ -946,6 +946,16 @@ program
|
|
|
946
946
|
whyCommand(query, opts);
|
|
947
947
|
});
|
|
948
948
|
|
|
949
|
+
// trickle eval
|
|
950
|
+
program
|
|
951
|
+
.command("eval")
|
|
952
|
+
.description("Score agent runs on reliability — completion, errors, cost efficiency, tool reliability, latency")
|
|
953
|
+
.option("--json", "Output raw JSON for CI integration")
|
|
954
|
+
.action(async (opts) => {
|
|
955
|
+
const { evalCommand } = await import("./commands/eval");
|
|
956
|
+
evalCommand(opts);
|
|
957
|
+
});
|
|
958
|
+
|
|
949
959
|
// trickle cost-report
|
|
950
960
|
program
|
|
951
961
|
.command("cost-report")
|