trickle-cli 0.1.187 → 0.1.189

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ /**
2
+ * trickle eval — Score agent runs using traces already captured.
3
+ *
4
+ * Analyzes agents.jsonl, llm.jsonl, mcp.jsonl, errors.jsonl to produce reliability
5
+ * scores without needing an LLM-as-judge. Zero cost, zero API keys.
6
+ *
7
+ * Scoring dimensions:
8
+ * - Completion: Did the agent finish successfully?
9
+ * - Error rate: How many errors during execution?
10
+ * - Cost efficiency: Ratio of output tokens to input tokens
11
+ * - Tool reliability: Success rate of tool calls
12
+ * - Latency: Was execution time reasonable?
13
+ */
14
+ export declare function evalCommand(opts: {
15
+ json?: boolean;
16
+ }): void;
@@ -0,0 +1,250 @@
1
+ "use strict";
2
+ /**
3
+ * trickle eval — Score agent runs using traces already captured.
4
+ *
5
+ * Analyzes agents.jsonl, llm.jsonl, mcp.jsonl, errors.jsonl to produce reliability
6
+ * scores without needing an LLM-as-judge. Zero cost, zero API keys.
7
+ *
8
+ * Scoring dimensions:
9
+ * - Completion: Did the agent finish successfully?
10
+ * - Error rate: How many errors during execution?
11
+ * - Cost efficiency: Ratio of output tokens to input tokens
12
+ * - Tool reliability: Success rate of tool calls
13
+ * - Latency: Was execution time reasonable?
14
+ */
15
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
16
+ if (k2 === undefined) k2 = k;
17
+ var desc = Object.getOwnPropertyDescriptor(m, k);
18
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
19
+ desc = { enumerable: true, get: function() { return m[k]; } };
20
+ }
21
+ Object.defineProperty(o, k2, desc);
22
+ }) : (function(o, m, k, k2) {
23
+ if (k2 === undefined) k2 = k;
24
+ o[k2] = m[k];
25
+ }));
26
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
27
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
28
+ }) : function(o, v) {
29
+ o["default"] = v;
30
+ });
31
+ var __importStar = (this && this.__importStar) || (function () {
32
+ var ownKeys = function(o) {
33
+ ownKeys = Object.getOwnPropertyNames || function (o) {
34
+ var ar = [];
35
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
36
+ return ar;
37
+ };
38
+ return ownKeys(o);
39
+ };
40
+ return function (mod) {
41
+ if (mod && mod.__esModule) return mod;
42
+ var result = {};
43
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
44
+ __setModuleDefault(result, mod);
45
+ return result;
46
+ };
47
+ })();
48
+ var __importDefault = (this && this.__importDefault) || function (mod) {
49
+ return (mod && mod.__esModule) ? mod : { "default": mod };
50
+ };
51
+ Object.defineProperty(exports, "__esModule", { value: true });
52
+ exports.evalCommand = evalCommand;
53
+ const fs = __importStar(require("fs"));
54
+ const path = __importStar(require("path"));
55
+ const chalk_1 = __importDefault(require("chalk"));
56
+ function readJsonl(fp) {
57
+ if (!fs.existsSync(fp))
58
+ return [];
59
+ return fs.readFileSync(fp, 'utf-8').split('\n').filter(Boolean)
60
+ .map(l => { try {
61
+ return JSON.parse(l);
62
+ }
63
+ catch {
64
+ return null;
65
+ } }).filter(Boolean);
66
+ }
67
/**
 * Entry point for `trickle eval`: load the captured traces, score the run and
 * print the result.
 *
 * Traces are read from `$TRICKLE_LOCAL_DIR`, or `<cwd>/.trickle` when the
 * variable is unset. With `opts.json` the score object is printed as JSON on
 * stdout; otherwise a coloured report is printed.
 *
 * When there is nothing to evaluate a notice is printed and the function
 * returns. In JSON mode that notice goes to stderr, so stdout never carries
 * non-JSON text that would break a consumer parsing it.
 *
 * @param opts Command options; `json` selects machine-readable output.
 */
function evalCommand(opts) {
    const dir = process.env.TRICKLE_LOCAL_DIR || path.join(process.cwd(), '.trickle');
    const load = (file) => readJsonl(path.join(dir, file));
    const agentEvents = load('agents.jsonl');
    const llmCalls = load('llm.jsonl');
    const errors = load('errors.jsonl');
    const mcpCalls = load('mcp.jsonl');
    if (agentEvents.length === 0 && llmCalls.length === 0) {
        const notice = chalk_1.default.yellow(' No agent or LLM data to evaluate. Run an agent with trickle first.');
        if (opts.json)
            console.error(notice);
        else
            console.log(notice);
        return;
    }
    const result = scoreRun(agentEvents, llmCalls, errors, mcpCalls);
    if (opts.json) {
        console.log(JSON.stringify(result, null, 2));
        return;
    }
    // Pretty print
    const rule = '─'.repeat(60);
    console.log('');
    console.log(chalk_1.default.bold(' trickle eval'));
    console.log(chalk_1.default.gray(' ' + rule));
    let gradeColor = chalk_1.default.red;
    if (result.overallScore >= 80)
        gradeColor = chalk_1.default.green;
    else if (result.overallScore >= 60)
        gradeColor = chalk_1.default.yellow;
    console.log(` Overall: ${gradeColor(result.grade + ' (' + result.overallScore + '/100)')}`);
    console.log('');
    // One bar per scoring dimension, in a fixed order
    const dims = result.dimensions;
    printDimension('Completion', dims.completion);
    printDimension('Errors', dims.errors);
    printDimension('Cost Efficiency', dims.costEfficiency);
    printDimension('Tool Reliability', dims.toolReliability);
    printDimension('Latency', dims.latency);
    console.log(chalk_1.default.gray('\n ' + rule));
    console.log(chalk_1.default.bold(' Summary'));
    console.log(` ${result.summary}`);
    if (result.recommendations.length > 0) {
        console.log(chalk_1.default.bold('\n Recommendations'));
        for (const rec of result.recommendations) {
            console.log(` ${chalk_1.default.yellow('→')} ${rec}`);
        }
    }
    console.log('');
}
108
/**
 * Print one dimension row: padded label, bar, coloured score and grey detail.
 *
 * The score is coloured green at 80 or above, yellow at 60 or above, and red
 * otherwise.
 *
 * @param name Label shown at the start of the row.
 * @param dim  The dimension's `score` and `detail`.
 */
function printDimension(name, dim) {
    const { score, detail } = dim;
    const bar = renderBar(score);
    let paint = chalk_1.default.red;
    if (score >= 80) {
        paint = chalk_1.default.green;
    }
    else if (score >= 60) {
        paint = chalk_1.default.yellow;
    }
    const label = name.padEnd(18);
    const value = paint(String(score).padStart(3));
    const note = chalk_1.default.gray(detail);
    console.log(` ${label} ${bar} ${value}/100 ${note}`);
}
113
/**
 * Render a 20-cell progress bar for a 0-100 score (one cell per 5 points).
 *
 * The score is clamped to 0..100 (NaN counts as 0) before the cell counts are
 * computed: `String.prototype.repeat` throws a RangeError for a negative
 * count, so an out-of-range score would otherwise crash the report.
 *
 * @param score Dimension score, nominally 0-100.
 * @returns Coloured filled cells followed by grey empty cells.
 */
function renderBar(score) {
    const safe = Number.isNaN(score) ? 0 : Math.min(100, Math.max(0, score));
    const filled = Math.round(safe / 5);
    const empty = 20 - filled;
    const color = safe >= 80 ? chalk_1.default.green : safe >= 60 ? chalk_1.default.yellow : chalk_1.default.red;
    return color('█'.repeat(filled)) + chalk_1.default.gray('░'.repeat(empty));
}
119
/**
 * Score a captured run on five dimensions and combine them into a grade.
 *
 * Every dimension score is an integer in 0..100. The overall score is the
 * weighted average (completion 30%, errors 25%, tool reliability 20%,
 * cost efficiency 15%, latency 10%), mapped to a grade: A >= 90, B >= 80,
 * C >= 70, D >= 60, otherwise F.
 *
 * @param agentEvents Records from agents.jsonl (reads `event`, `tool`, `durationMs`).
 * @param llmCalls    Records from llm.jsonl (reads `error`, token and cost fields, `durationMs`).
 * @param errors      Records from errors.jsonl; each one counts as one error.
 * @param mcpCalls    Records from mcp.jsonl (reads `tool`, `isError`).
 * @returns Overall score, grade, per-dimension score/detail, summary and recommendations.
 */
function scoreRun(agentEvents, llmCalls, errors, mcpCalls) {
    const recommendations = [];
    const eventsNamed = (...names) => agentEvents.filter(e => names.includes(e.event));
    const sumLlmField = (field) => llmCalls.reduce((s, c) => s + (c[field] || 0), 0);
    // 1. Completion score (0-100)
    const crewStarts = eventsNamed('crew_start', 'chain_start');
    const crewEnds = eventsNamed('crew_end', 'chain_end');
    const crewErrors = eventsNamed('crew_error', 'chain_error');
    const succeededLlmCalls = llmCalls.filter(c => !c.error).length;
    let completionRate = 1;
    if (crewStarts.length > 0)
        completionRate = Math.min(1, crewEnds.length / crewStarts.length);
    else if (llmCalls.length > 0)
        completionRate = succeededLlmCalls / llmCalls.length;
    const completionScore = Math.round(completionRate * 100);
    let completionDetail = '';
    if (crewStarts.length > 0) {
        completionDetail = `${crewEnds.length}/${crewStarts.length} workflows completed`;
        if (crewErrors.length > 0)
            completionDetail += `, ${crewErrors.length} failed`;
    }
    else {
        completionDetail = `${succeededLlmCalls}/${llmCalls.length} LLM calls succeeded`;
    }
    if (completionScore < 80)
        recommendations.push('Improve completion rate — check agent error handling and tool reliability');
    // 2. Error score (0-100, inverse of error rate)
    const totalSteps = agentEvents.length + llmCalls.length + mcpCalls.length;
    const errorEvents = [
        // Guard the type: a non-string `event` has no .includes() and would throw.
        ...agentEvents.filter(e => typeof e.event === 'string' && e.event.includes('error')),
        ...llmCalls.filter(c => c.error),
        ...mcpCalls.filter(c => c.isError),
        ...errors,
    ];
    const errorRate = totalSteps > 0 ? errorEvents.length / totalSteps : 0;
    const errorScore = Math.round(Math.max(0, (1 - errorRate * 5)) * 100); // 20% errors = 0 score
    const errorDetail = `${errorEvents.length} errors in ${totalSteps} steps (${(errorRate * 100).toFixed(1)}%)`;
    if (errorScore < 80)
        recommendations.push(`Reduce error rate — ${errorEvents.length} errors detected. Use \`trickle why\` to investigate`);
    // 3. Cost efficiency (0-100)
    const totalCost = sumLlmField('estimatedCostUsd');
    const outputTokens = sumLlmField('outputTokens');
    const inputTokens = sumLlmField('inputTokens');
    // Efficiency: ratio of output tokens to input tokens (higher = more efficient)
    const ioRatio = inputTokens > 0 ? outputTokens / inputTokens : 1;
    // Score = ratio x 100 clamped to [10, 100]: 1:1 or better = 100, 1:2 = 50, 1:10 or worse = 10
    const costScore = llmCalls.length === 0 ? 100 : Math.round(Math.min(100, Math.max(10, ioRatio * 100)));
    const costDetail = llmCalls.length > 0
        ? `$${totalCost.toFixed(4)} total, ${formatTokens(inputTokens)} in → ${formatTokens(outputTokens)} out (${ioRatio.toFixed(2)} ratio)`
        : 'No LLM calls';
    if (costScore < 60 && llmCalls.length > 0)
        recommendations.push('Reduce prompt size — input tokens far exceed output. Consider summarizing context before sending');
    // 4. Tool reliability (0-100)
    const toolStarts = eventsNamed('tool_start');
    const toolErrors = eventsNamed('tool_error');
    // `__list_tools` is discovery, not a tool call: exclude it from calls AND errors.
    const mcpToolCalls = mcpCalls.filter(c => c.tool !== '__list_tools');
    const mcpErrors = mcpToolCalls.filter(c => c.isError);
    const totalToolCalls = toolStarts.length + mcpToolCalls.length;
    const totalToolErrors = toolErrors.length + mcpErrors.length;
    // Errors can outnumber recorded calls (e.g. tool_error without tool_start);
    // cap them so the score cannot go below 0.
    const failedToolCalls = Math.min(totalToolErrors, totalToolCalls);
    const toolSuccessRate = totalToolCalls > 0 ? 1 - (failedToolCalls / totalToolCalls) : 1;
    const toolScore = Math.round(toolSuccessRate * 100);
    const toolDetail = totalToolCalls > 0
        ? `${totalToolCalls - failedToolCalls}/${totalToolCalls} tool calls succeeded`
        : 'No tool calls';
    if (toolScore < 80)
        recommendations.push(`Fix failing tools — ${totalToolErrors} tool errors detected. Check tool implementations`);
    // Check for retry loops: longest run of consecutive calls to the same named tool
    const toolNames = toolStarts.map(e => e.tool || '');
    let maxConsecutive = 1;
    let current = 1;
    for (let i = 1; i < toolNames.length; i++) {
        if (toolNames[i] && toolNames[i] === toolNames[i - 1]) {
            current++;
            maxConsecutive = Math.max(maxConsecutive, current);
        }
        else {
            current = 1;
        }
    }
    if (maxConsecutive >= 3)
        recommendations.push(`Tool retry loop detected (${maxConsecutive} consecutive calls). Agent may be stuck`);
    // 5. Latency score (0-100)
    // Only positive, finite numeric durations count; anything else is treated as missing.
    const isDuration = (d) => typeof d === 'number' && Number.isFinite(d) && d > 0;
    const durations = [
        ...agentEvents.map(e => e.durationMs).filter(isDuration),
        ...llmCalls.map(c => c.durationMs).filter(isDuration),
    ];
    const avgLatency = durations.length > 0 ? durations.reduce((s, d) => s + d, 0) / durations.length : 0;
    // reduce instead of Math.max(...durations): spreading a very large array can overflow the call stack
    const maxLatency = durations.reduce((m, d) => (d > m ? d : m), 0);
    // Score: <= 500ms avg = 100, then -1 point per extra 50ms, floored at 20 (reached at 4500ms)
    const latencyScore = durations.length === 0 ? 100 :
        Math.round(Math.min(100, Math.max(20, 100 - (avgLatency - 500) / 50)));
    const latencyDetail = durations.length > 0
        ? `avg ${avgLatency.toFixed(0)}ms, max ${maxLatency.toFixed(0)}ms across ${durations.length} steps`
        : 'No timing data';
    if (latencyScore < 60)
        recommendations.push(`High latency — avg ${avgLatency.toFixed(0)}ms. Consider faster models or reducing prompt size`);
    // Overall score (weighted average)
    const weights = { completion: 0.3, errors: 0.25, costEfficiency: 0.15, toolReliability: 0.2, latency: 0.1 };
    const overallScore = Math.round(completionScore * weights.completion +
        errorScore * weights.errors +
        costScore * weights.costEfficiency +
        toolScore * weights.toolReliability +
        latencyScore * weights.latency);
    const grade = overallScore >= 90 ? 'A' : overallScore >= 80 ? 'B' : overallScore >= 70 ? 'C' :
        overallScore >= 60 ? 'D' : 'F';
    // Summary
    const parts = [];
    if (crewStarts.length > 0)
        parts.push(`${crewStarts.length} workflow(s)`);
    if (llmCalls.length > 0)
        parts.push(`${llmCalls.length} LLM calls ($${totalCost.toFixed(4)})`);
    if (totalToolCalls > 0)
        parts.push(`${totalToolCalls} tool calls`);
    if (errorEvents.length > 0)
        parts.push(`${errorEvents.length} errors`);
    const summary = parts.join(', ') || 'No agent activity detected';
    return {
        overallScore,
        grade,
        dimensions: {
            completion: { score: completionScore, detail: completionDetail },
            errors: { score: errorScore, detail: errorDetail },
            costEfficiency: { score: costScore, detail: costDetail },
            toolReliability: { score: toolScore, detail: toolDetail },
            latency: { score: latencyScore, detail: latencyDetail },
        },
        summary,
        recommendations,
    };
}
244
/**
 * Format a token count compactly: "999", "1.5K", "2.5M".
 *
 * Values that would round up to "1000.0K" (999,950 and above) are shown in
 * millions instead, so the output is "1.0M".
 *
 * @param n Token count.
 * @returns The count with one decimal and a K/M suffix, or the plain number below 1,000.
 */
function formatTokens(n) {
    if (n >= 1_000) {
        const thousands = (n / 1_000).toFixed(1);
        // Promote when rounding to one decimal spills over into the next unit.
        if (n >= 1_000_000 || thousands === '1000.0')
            return (n / 1_000_000).toFixed(1) + 'M';
        return thousands + 'K';
    }
    return String(n);
}
@@ -46,6 +46,26 @@ export interface RunDiff {
46
46
  newAlerts: string[];
47
47
  resolvedAlerts: string[];
48
48
  };
49
+ llm: {
50
+ beforeCalls: number;
51
+ afterCalls: number;
52
+ beforeCost: number;
53
+ afterCost: number;
54
+ costDelta: number;
55
+ beforeTokens: number;
56
+ afterTokens: number;
57
+ modelChanges: string[];
58
+ };
59
+ agents: {
60
+ beforeSteps: number;
61
+ afterSteps: number;
62
+ beforeTools: string[];
63
+ afterTools: string[];
64
+ newTools: string[];
65
+ removedTools: string[];
66
+ beforeErrors: number;
67
+ afterErrors: number;
68
+ };
49
69
  verdict: 'improved' | 'regressed' | 'unchanged' | 'mixed';
50
70
  }
51
71
  export declare function diffRuns(beforeDir: string, afterDir: string): RunDiff;
@@ -81,7 +81,21 @@ function collectRunData(dir) {
81
81
  }
82
82
  const errorMessages = new Set(errors.map((e) => (e.message || '').substring(0, 100)));
83
83
  const alertMessages = new Set(alerts.map((a) => (a.message || '').substring(0, 100)));
84
- return { funcMap, queryPatterns, errorMessages, alertMessages, queryCount: queries.length, errorCount: errors.length, alertCount: alerts.length };
84
+ // LLM data
85
+ const llmCalls = readJsonl(path.join(dir, 'llm.jsonl'));
86
+ const llmCost = llmCalls.reduce((s, c) => s + (c.estimatedCostUsd || 0), 0);
87
+ const llmTokens = llmCalls.reduce((s, c) => s + (c.totalTokens || 0), 0);
88
+ const llmModels = new Set(llmCalls.map((c) => `${c.provider}/${c.model}`));
89
+ // Agent data
90
+ const agentEvents = readJsonl(path.join(dir, 'agents.jsonl'));
91
+ const agentTools = new Set(agentEvents.filter((e) => e.event === 'tool_start' || e.event === 'tool_end').map((e) => e.tool || ''));
92
+ const agentErrors = agentEvents.filter((e) => e.event?.includes('error'));
93
+ return {
94
+ funcMap, queryPatterns, errorMessages, alertMessages,
95
+ queryCount: queries.length, errorCount: errors.length, alertCount: alerts.length,
96
+ llmCalls: llmCalls.length, llmCost, llmTokens, llmModels,
97
+ agentEvents: agentEvents.length, agentTools, agentErrors: agentErrors.length,
98
+ };
85
99
  }
86
100
  function diffRuns(beforeDir, afterDir) {
87
101
  const before = collectRunData(beforeDir);
@@ -116,9 +130,29 @@ function diffRuns(beforeDir, afterDir) {
116
130
  // Alerts
117
131
  const newAlerts = [...after.alertMessages].filter(a => !before.alertMessages.has(a));
118
132
  const resolvedAlerts = [...before.alertMessages].filter(a => !after.alertMessages.has(a));
133
+ // LLM comparison
134
+ const costDelta = after.llmCost - before.llmCost;
135
+ const afterModels = [...after.llmModels];
136
+ const beforeModels = [...before.llmModels];
137
+ const modelChanges = [];
138
+ for (const m of afterModels)
139
+ if (!before.llmModels.has(m))
140
+ modelChanges.push(`+ ${m}`);
141
+ for (const m of beforeModels)
142
+ if (!after.llmModels.has(m))
143
+ modelChanges.push(`- ${m}`);
144
+ // Agent comparison
145
+ const afterTools = [...after.agentTools];
146
+ const beforeTools = [...before.agentTools];
147
+ const newAgentTools = afterTools.filter(t => !before.agentTools.has(t));
148
+ const removedAgentTools = beforeTools.filter(t => !after.agentTools.has(t));
119
149
  // Verdict
120
- const improvements = resolvedErrors.length + resolvedAlerts.length + fasterBy.length + (nPlusOneAfter < nPlusOneBefore ? 1 : 0);
121
- const regressions = newErrors.length + newAlerts.length + slowerBy.length + (nPlusOneAfter > nPlusOneBefore ? 1 : 0);
150
+ const improvements = resolvedErrors.length + resolvedAlerts.length + fasterBy.length +
151
+ (nPlusOneAfter < nPlusOneBefore ? 1 : 0) + (costDelta < -0.001 ? 1 : 0) +
152
+ (after.agentErrors < before.agentErrors ? 1 : 0);
153
+ const regressions = newErrors.length + newAlerts.length + slowerBy.length +
154
+ (nPlusOneAfter > nPlusOneBefore ? 1 : 0) + (costDelta > before.llmCost * 0.2 ? 1 : 0) +
155
+ (after.agentErrors > before.agentErrors ? 1 : 0);
122
156
  const verdict = improvements > 0 && regressions === 0 ? 'improved' :
123
157
  regressions > 0 && improvements === 0 ? 'regressed' :
124
158
  improvements > 0 && regressions > 0 ? 'mixed' : 'unchanged';
@@ -127,6 +161,19 @@ function diffRuns(beforeDir, afterDir) {
127
161
  queries: { beforeTotal: before.queryCount, afterTotal: after.queryCount, newPatterns: newPatterns.slice(0, 5), removedPatterns: removedPatterns.slice(0, 5), nPlusOneBefore, nPlusOneAfter },
128
162
  errors: { beforeCount: before.errorCount, afterCount: after.errorCount, newErrors, resolvedErrors },
129
163
  alerts: { beforeCount: before.alertCount, afterCount: after.alertCount, newAlerts, resolvedAlerts },
164
+ llm: {
165
+ beforeCalls: before.llmCalls, afterCalls: after.llmCalls,
166
+ beforeCost: Math.round(before.llmCost * 10000) / 10000, afterCost: Math.round(after.llmCost * 10000) / 10000,
167
+ costDelta: Math.round(costDelta * 10000) / 10000,
168
+ beforeTokens: before.llmTokens, afterTokens: after.llmTokens,
169
+ modelChanges,
170
+ },
171
+ agents: {
172
+ beforeSteps: before.agentEvents, afterSteps: after.agentEvents,
173
+ beforeTools, afterTools,
174
+ newTools: newAgentTools, removedTools: removedAgentTools,
175
+ beforeErrors: before.agentErrors, afterErrors: after.agentErrors,
176
+ },
130
177
  verdict,
131
178
  };
132
179
  }
@@ -141,7 +188,7 @@ function runDiffCommand(opts) {
141
188
  }
142
189
  if (!fs.existsSync(snapshotDir))
143
190
  fs.mkdirSync(snapshotDir, { recursive: true });
144
- for (const f of ['observations.jsonl', 'queries.jsonl', 'errors.jsonl', 'alerts.jsonl', 'calltrace.jsonl']) {
191
+ for (const f of ['observations.jsonl', 'queries.jsonl', 'errors.jsonl', 'alerts.jsonl', 'calltrace.jsonl', 'llm.jsonl', 'agents.jsonl', 'mcp.jsonl']) {
145
192
  const src = path.join(trickleDir, f);
146
193
  if (fs.existsSync(src))
147
194
  fs.copyFileSync(src, path.join(snapshotDir, f));
@@ -188,6 +235,27 @@ function runDiffCommand(opts) {
188
235
  console.log(chalk_1.default.red(` New errors: ${diff.errors.newErrors.join(', ').substring(0, 80)}`));
189
236
  if (diff.errors.resolvedErrors.length > 0)
190
237
  console.log(chalk_1.default.green(` Resolved: ${diff.errors.resolvedErrors.join(', ').substring(0, 80)}`));
238
+ // LLM diff
239
+ if (diff.llm.beforeCalls > 0 || diff.llm.afterCalls > 0) {
240
+ console.log(` LLM calls: ${diff.llm.beforeCalls} → ${diff.llm.afterCalls}`);
241
+ const costColor = diff.llm.costDelta > 0 ? chalk_1.default.red : diff.llm.costDelta < 0 ? chalk_1.default.green : chalk_1.default.gray;
242
+ const costSign = diff.llm.costDelta > 0 ? '+' : '';
243
+ console.log(` LLM cost: $${diff.llm.beforeCost} → $${diff.llm.afterCost} (${costColor(costSign + '$' + diff.llm.costDelta.toFixed(4))})`);
244
+ if (diff.llm.modelChanges.length > 0)
245
+ console.log(chalk_1.default.cyan(` Model changes: ${diff.llm.modelChanges.join(', ')}`));
246
+ }
247
+ // Agent diff
248
+ if (diff.agents.beforeSteps > 0 || diff.agents.afterSteps > 0) {
249
+ console.log(` Agent steps: ${diff.agents.beforeSteps} → ${diff.agents.afterSteps}`);
250
+ if (diff.agents.newTools.length > 0)
251
+ console.log(chalk_1.default.green(` + New tools: ${diff.agents.newTools.join(', ')}`));
252
+ if (diff.agents.removedTools.length > 0)
253
+ console.log(chalk_1.default.red(` - Removed tools: ${diff.agents.removedTools.join(', ')}`));
254
+ if (diff.agents.beforeErrors !== diff.agents.afterErrors) {
255
+ const errColor = diff.agents.afterErrors > diff.agents.beforeErrors ? chalk_1.default.red : chalk_1.default.green;
256
+ console.log(errColor(` Agent errors: ${diff.agents.beforeErrors} → ${diff.agents.afterErrors}`));
257
+ }
258
+ }
191
259
  console.log(chalk_1.default.gray(' ' + '─'.repeat(50)));
192
260
  console.log('');
193
261
  }
package/dist/index.js CHANGED
@@ -913,6 +913,15 @@ program
913
913
  const { whyCommand } = await Promise.resolve().then(() => __importStar(require("./commands/why")));
914
914
  whyCommand(query, opts);
915
915
  });
916
+ // trickle eval
917
+ program
918
+ .command("eval")
919
+ .description("Score agent runs on reliability — completion, errors, cost efficiency, tool reliability, latency")
920
+ .option("--json", "Output raw JSON for CI integration")
921
+ .action(async (opts) => {
922
+ const { evalCommand } = await Promise.resolve().then(() => __importStar(require("./commands/eval")));
923
+ evalCommand(opts);
924
+ });
916
925
  // trickle cost-report
917
926
  program
918
927
  .command("cost-report")
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "trickle-cli",
3
- "version": "0.1.187",
3
+ "version": "0.1.189",
4
4
  "description": "CLI for trickle runtime type observability",
5
5
  "bin": {
6
6
  "trickle": "dist/index.js"
@@ -0,0 +1,231 @@
1
+ /**
2
+ * trickle eval — Score agent runs using traces already captured.
3
+ *
4
+ * Analyzes agents.jsonl, llm.jsonl, mcp.jsonl, errors.jsonl to produce reliability
5
+ * scores without needing an LLM-as-judge. Zero cost, zero API keys.
6
+ *
7
+ * Scoring dimensions:
8
+ * - Completion: Did the agent finish successfully?
9
+ * - Error rate: How many errors during execution?
10
+ * - Cost efficiency: Ratio of output tokens to input tokens
11
+ * - Tool reliability: Success rate of tool calls
12
+ * - Latency: Was execution time reasonable?
13
+ */
14
+
15
+ import * as fs from 'fs';
16
+ import * as path from 'path';
17
+ import chalk from 'chalk';
18
+
19
/** Score (0-100) and a human-readable explanation for one scoring dimension. */
type DimensionResult = { score: number; detail: string };

/** Result of scoring a run: overall score and grade plus the per-dimension breakdown. */
interface EvalResult {
  overallScore: number;
  grade: string;
  dimensions: Record<
    'completion' | 'errors' | 'costEfficiency' | 'toolReliability' | 'latency',
    DimensionResult
  >;
  summary: string;
  recommendations: string[];
}
32
+
33
/**
 * Read a JSON Lines file and return its object records.
 *
 * Best-effort by design: a missing file yields an empty array, and lines that
 * are blank or not valid JSON are skipped. Only JSON objects are returned;
 * scalar or array lines (e.g. `42`, `"text"`, `[1]`) are dropped so they are
 * not counted downstream as events or steps.
 *
 * @param fp Path of the .jsonl file to read.
 * @returns The parsed object records, in file order.
 */
function readJsonl(fp: string): any[] {
  if (!fs.existsSync(fp)) return [];
  const records: any[] = [];
  // Split on LF or CRLF so files written on Windows parse the same way.
  for (const line of fs.readFileSync(fp, 'utf-8').split(/\r?\n/)) {
    if (!line) continue;
    let value: unknown;
    try {
      value = JSON.parse(line);
    } catch {
      continue; // corrupt or partially written line: skip it
    }
    if (value !== null && typeof value === 'object' && !Array.isArray(value)) records.push(value);
  }
  return records;
}
38
+
39
/**
 * Entry point for `trickle eval`: load the captured traces, score the run and
 * print the result.
 *
 * Traces are read from `$TRICKLE_LOCAL_DIR`, or `<cwd>/.trickle` when the
 * variable is unset. With `opts.json` the score object is printed as JSON on
 * stdout; otherwise a coloured report is printed.
 *
 * When there is nothing to evaluate a notice is printed and the function
 * returns. In JSON mode that notice goes to stderr, so stdout never carries
 * non-JSON text that would break a consumer parsing it.
 *
 * @param opts Command options; `json` selects machine-readable output.
 */
export function evalCommand(opts: { json?: boolean }): void {
  const dir = process.env.TRICKLE_LOCAL_DIR || path.join(process.cwd(), '.trickle');
  const load = (file: string): any[] => readJsonl(path.join(dir, file));
  const agentEvents = load('agents.jsonl');
  const llmCalls = load('llm.jsonl');
  const errors = load('errors.jsonl');
  const mcpCalls = load('mcp.jsonl');

  if (agentEvents.length === 0 && llmCalls.length === 0) {
    const notice = chalk.yellow(' No agent or LLM data to evaluate. Run an agent with trickle first.');
    if (opts.json) console.error(notice);
    else console.log(notice);
    return;
  }

  const result = scoreRun(agentEvents, llmCalls, errors, mcpCalls);

  if (opts.json) {
    console.log(JSON.stringify(result, null, 2));
    return;
  }

  // Pretty print
  const rule = '─'.repeat(60);
  console.log('');
  console.log(chalk.bold(' trickle eval'));
  console.log(chalk.gray(' ' + rule));

  let gradeColor = chalk.red;
  if (result.overallScore >= 80) gradeColor = chalk.green;
  else if (result.overallScore >= 60) gradeColor = chalk.yellow;
  console.log(` Overall: ${gradeColor(result.grade + ' (' + result.overallScore + '/100)')}`);
  console.log('');

  // One bar per scoring dimension, in a fixed order
  const dims = result.dimensions;
  printDimension('Completion', dims.completion);
  printDimension('Errors', dims.errors);
  printDimension('Cost Efficiency', dims.costEfficiency);
  printDimension('Tool Reliability', dims.toolReliability);
  printDimension('Latency', dims.latency);

  console.log(chalk.gray('\n ' + rule));
  console.log(chalk.bold(' Summary'));
  console.log(` ${result.summary}`);

  if (result.recommendations.length > 0) {
    console.log(chalk.bold('\n Recommendations'));
    for (const rec of result.recommendations) {
      console.log(` ${chalk.yellow('→')} ${rec}`);
    }
  }

  console.log('');
}
89
+
90
/**
 * Print one dimension row: padded label, bar, coloured score and grey detail.
 *
 * The score is coloured green at 80 or above, yellow at 60 or above, and red
 * otherwise.
 *
 * @param name Label shown at the start of the row.
 * @param dim  The dimension's `score` and `detail`.
 */
function printDimension(name: string, dim: { score: number; detail: string }): void {
  const { score, detail } = dim;
  const bar = renderBar(score);
  let paint = chalk.red;
  if (score >= 80) {
    paint = chalk.green;
  } else if (score >= 60) {
    paint = chalk.yellow;
  }
  const label = name.padEnd(18);
  const value = paint(String(score).padStart(3));
  const note = chalk.gray(detail);
  console.log(` ${label} ${bar} ${value}/100 ${note}`);
}
95
+
96
/**
 * Render a 20-cell progress bar for a 0-100 score (one cell per 5 points).
 *
 * The score is clamped to 0..100 (NaN counts as 0) before the cell counts are
 * computed: `String.prototype.repeat` throws a RangeError for a negative
 * count, so an out-of-range score would otherwise crash the report.
 *
 * @param score Dimension score, nominally 0-100.
 * @returns Coloured filled cells followed by grey empty cells.
 */
function renderBar(score: number): string {
  const safe = Number.isNaN(score) ? 0 : Math.min(100, Math.max(0, score));
  const filled = Math.round(safe / 5);
  const empty = 20 - filled;
  const color = safe >= 80 ? chalk.green : safe >= 60 ? chalk.yellow : chalk.red;
  return color('█'.repeat(filled)) + chalk.gray('░'.repeat(empty));
}
102
+
103
/**
 * Score a captured run on five dimensions and combine them into a grade.
 *
 * Every dimension score is an integer in 0..100. The overall score is the
 * weighted average (completion 30%, errors 25%, tool reliability 20%,
 * cost efficiency 15%, latency 10%), mapped to a grade: A >= 90, B >= 80,
 * C >= 70, D >= 60, otherwise F.
 *
 * @param agentEvents Records from agents.jsonl (reads `event`, `tool`, `durationMs`).
 * @param llmCalls    Records from llm.jsonl (reads `error`, token and cost fields, `durationMs`).
 * @param errors      Records from errors.jsonl; each one counts as one error.
 * @param mcpCalls    Records from mcp.jsonl (reads `tool`, `isError`).
 * @returns Overall score, grade, per-dimension score/detail, summary and recommendations.
 */
function scoreRun(
  agentEvents: any[], llmCalls: any[], errors: any[], mcpCalls: any[],
): EvalResult {
  const recommendations: string[] = [];
  const eventsNamed = (...names: string[]): any[] => agentEvents.filter(e => names.includes(e.event));
  const sumLlmField = (field: string): number =>
    llmCalls.reduce((s: number, c: any) => s + (c[field] || 0), 0);

  // 1. Completion score (0-100)
  const crewStarts = eventsNamed('crew_start', 'chain_start');
  const crewEnds = eventsNamed('crew_end', 'chain_end');
  const crewErrors = eventsNamed('crew_error', 'chain_error');
  const succeededLlmCalls = llmCalls.filter(c => !c.error).length;
  let completionRate = 1;
  if (crewStarts.length > 0) completionRate = Math.min(1, crewEnds.length / crewStarts.length);
  else if (llmCalls.length > 0) completionRate = succeededLlmCalls / llmCalls.length;
  const completionScore = Math.round(completionRate * 100);
  let completionDetail = '';
  if (crewStarts.length > 0) {
    completionDetail = `${crewEnds.length}/${crewStarts.length} workflows completed`;
    if (crewErrors.length > 0) completionDetail += `, ${crewErrors.length} failed`;
  } else {
    completionDetail = `${succeededLlmCalls}/${llmCalls.length} LLM calls succeeded`;
  }
  if (completionScore < 80) recommendations.push('Improve completion rate — check agent error handling and tool reliability');

  // 2. Error score (0-100, inverse of error rate)
  const totalSteps = agentEvents.length + llmCalls.length + mcpCalls.length;
  const errorEvents = [
    // Guard the type: a non-string `event` has no .includes() and would throw.
    ...agentEvents.filter(e => typeof e.event === 'string' && e.event.includes('error')),
    ...llmCalls.filter(c => c.error),
    ...mcpCalls.filter(c => c.isError),
    ...errors,
  ];
  const errorRate = totalSteps > 0 ? errorEvents.length / totalSteps : 0;
  const errorScore = Math.round(Math.max(0, (1 - errorRate * 5)) * 100); // 20% errors = 0 score
  const errorDetail = `${errorEvents.length} errors in ${totalSteps} steps (${(errorRate * 100).toFixed(1)}%)`;
  if (errorScore < 80) recommendations.push(`Reduce error rate — ${errorEvents.length} errors detected. Use \`trickle why\` to investigate`);

  // 3. Cost efficiency (0-100)
  const totalCost = sumLlmField('estimatedCostUsd');
  const outputTokens = sumLlmField('outputTokens');
  const inputTokens = sumLlmField('inputTokens');
  // Efficiency: ratio of output tokens to input tokens (higher = more efficient)
  const ioRatio = inputTokens > 0 ? outputTokens / inputTokens : 1;
  // Score = ratio x 100 clamped to [10, 100]: 1:1 or better = 100, 1:2 = 50, 1:10 or worse = 10
  const costScore = llmCalls.length === 0 ? 100 : Math.round(Math.min(100, Math.max(10, ioRatio * 100)));
  const costDetail = llmCalls.length > 0
    ? `$${totalCost.toFixed(4)} total, ${formatTokens(inputTokens)} in → ${formatTokens(outputTokens)} out (${ioRatio.toFixed(2)} ratio)`
    : 'No LLM calls';
  if (costScore < 60 && llmCalls.length > 0) recommendations.push('Reduce prompt size — input tokens far exceed output. Consider summarizing context before sending');

  // 4. Tool reliability (0-100)
  const toolStarts = eventsNamed('tool_start');
  const toolErrors = eventsNamed('tool_error');
  // `__list_tools` is discovery, not a tool call: exclude it from calls AND errors.
  const mcpToolCalls = mcpCalls.filter(c => c.tool !== '__list_tools');
  const mcpErrors = mcpToolCalls.filter(c => c.isError);
  const totalToolCalls = toolStarts.length + mcpToolCalls.length;
  const totalToolErrors = toolErrors.length + mcpErrors.length;
  // Errors can outnumber recorded calls (e.g. tool_error without tool_start);
  // cap them so the score cannot go below 0.
  const failedToolCalls = Math.min(totalToolErrors, totalToolCalls);
  const toolSuccessRate = totalToolCalls > 0 ? 1 - (failedToolCalls / totalToolCalls) : 1;
  const toolScore = Math.round(toolSuccessRate * 100);
  const toolDetail = totalToolCalls > 0
    ? `${totalToolCalls - failedToolCalls}/${totalToolCalls} tool calls succeeded`
    : 'No tool calls';
  if (toolScore < 80) recommendations.push(`Fix failing tools — ${totalToolErrors} tool errors detected. Check tool implementations`);

  // Check for retry loops: longest run of consecutive calls to the same named tool
  const toolNames: string[] = toolStarts.map(e => e.tool || '');
  let maxConsecutive = 1;
  let current = 1;
  for (let i = 1; i < toolNames.length; i++) {
    if (toolNames[i] && toolNames[i] === toolNames[i - 1]) {
      current++;
      maxConsecutive = Math.max(maxConsecutive, current);
    } else {
      current = 1;
    }
  }
  if (maxConsecutive >= 3) recommendations.push(`Tool retry loop detected (${maxConsecutive} consecutive calls). Agent may be stuck`);

  // 5. Latency score (0-100)
  // Only positive, finite numeric durations count; anything else is treated as missing.
  const isDuration = (d: unknown): d is number => typeof d === 'number' && Number.isFinite(d) && d > 0;
  const durations: number[] = [
    ...agentEvents.map(e => e.durationMs).filter(isDuration),
    ...llmCalls.map(c => c.durationMs).filter(isDuration),
  ];
  const avgLatency = durations.length > 0 ? durations.reduce((s, d) => s + d, 0) / durations.length : 0;
  // reduce instead of Math.max(...durations): spreading a very large array can overflow the call stack
  const maxLatency = durations.reduce((m, d) => (d > m ? d : m), 0);
  // Score: <= 500ms avg = 100, then -1 point per extra 50ms, floored at 20 (reached at 4500ms)
  const latencyScore = durations.length === 0 ? 100 :
    Math.round(Math.min(100, Math.max(20, 100 - (avgLatency - 500) / 50)));
  const latencyDetail = durations.length > 0
    ? `avg ${avgLatency.toFixed(0)}ms, max ${maxLatency.toFixed(0)}ms across ${durations.length} steps`
    : 'No timing data';
  if (latencyScore < 60) recommendations.push(`High latency — avg ${avgLatency.toFixed(0)}ms. Consider faster models or reducing prompt size`);

  // Overall score (weighted average)
  const weights = { completion: 0.3, errors: 0.25, costEfficiency: 0.15, toolReliability: 0.2, latency: 0.1 };
  const overallScore = Math.round(
    completionScore * weights.completion +
    errorScore * weights.errors +
    costScore * weights.costEfficiency +
    toolScore * weights.toolReliability +
    latencyScore * weights.latency
  );

  const grade = overallScore >= 90 ? 'A' : overallScore >= 80 ? 'B' : overallScore >= 70 ? 'C' :
    overallScore >= 60 ? 'D' : 'F';

  // Summary
  const parts: string[] = [];
  if (crewStarts.length > 0) parts.push(`${crewStarts.length} workflow(s)`);
  if (llmCalls.length > 0) parts.push(`${llmCalls.length} LLM calls ($${totalCost.toFixed(4)})`);
  if (totalToolCalls > 0) parts.push(`${totalToolCalls} tool calls`);
  if (errorEvents.length > 0) parts.push(`${errorEvents.length} errors`);
  const summary = parts.join(', ') || 'No agent activity detected';

  return {
    overallScore,
    grade,
    dimensions: {
      completion: { score: completionScore, detail: completionDetail },
      errors: { score: errorScore, detail: errorDetail },
      costEfficiency: { score: costScore, detail: costDetail },
      toolReliability: { score: toolScore, detail: toolDetail },
      latency: { score: latencyScore, detail: latencyDetail },
    },
    summary,
    recommendations,
  };
}
226
+
227
/**
 * Format a token count compactly: "999", "1.5K", "2.5M".
 *
 * Values that would round up to "1000.0K" (999,950 and above) are shown in
 * millions instead, so the output is "1.0M".
 *
 * @param n Token count.
 * @returns The count with one decimal and a K/M suffix, or the plain number below 1,000.
 */
function formatTokens(n: number): string {
  if (n >= 1_000) {
    const thousands = (n / 1_000).toFixed(1);
    // Promote when rounding to one decimal spills over into the next unit.
    if (n >= 1_000_000 || thousands === '1000.0') return (n / 1_000_000).toFixed(1) + 'M';
    return thousands + 'K';
  }
  return String(n);
}
@@ -48,6 +48,26 @@ export interface RunDiff {
48
48
  newAlerts: string[];
49
49
  resolvedAlerts: string[];
50
50
  };
51
+ llm: {
52
+ beforeCalls: number;
53
+ afterCalls: number;
54
+ beforeCost: number;
55
+ afterCost: number;
56
+ costDelta: number;
57
+ beforeTokens: number;
58
+ afterTokens: number;
59
+ modelChanges: string[];
60
+ };
61
+ agents: {
62
+ beforeSteps: number;
63
+ afterSteps: number;
64
+ beforeTools: string[];
65
+ afterTools: string[];
66
+ newTools: string[];
67
+ removedTools: string[];
68
+ beforeErrors: number;
69
+ afterErrors: number;
70
+ };
51
71
  verdict: 'improved' | 'regressed' | 'unchanged' | 'mixed';
52
72
  }
53
73
 
@@ -72,7 +92,23 @@ function collectRunData(dir: string) {
72
92
  const errorMessages = new Set(errors.map((e: any) => (e.message || '').substring(0, 100)));
73
93
  const alertMessages = new Set(alerts.map((a: any) => (a.message || '').substring(0, 100)));
74
94
 
75
- return { funcMap, queryPatterns, errorMessages, alertMessages, queryCount: queries.length, errorCount: errors.length, alertCount: alerts.length };
95
+ // LLM data
96
+ const llmCalls = readJsonl(path.join(dir, 'llm.jsonl'));
97
+ const llmCost = llmCalls.reduce((s: number, c: any) => s + (c.estimatedCostUsd || 0), 0);
98
+ const llmTokens = llmCalls.reduce((s: number, c: any) => s + (c.totalTokens || 0), 0);
99
+ const llmModels = new Set(llmCalls.map((c: any) => `${c.provider}/${c.model}`));
100
+
101
+ // Agent data
102
+ const agentEvents = readJsonl(path.join(dir, 'agents.jsonl'));
103
+ const agentTools = new Set(agentEvents.filter((e: any) => e.event === 'tool_start' || e.event === 'tool_end').map((e: any) => e.tool || ''));
104
+ const agentErrors = agentEvents.filter((e: any) => e.event?.includes('error'));
105
+
106
+ return {
107
+ funcMap, queryPatterns, errorMessages, alertMessages,
108
+ queryCount: queries.length, errorCount: errors.length, alertCount: alerts.length,
109
+ llmCalls: llmCalls.length, llmCost, llmTokens, llmModels,
110
+ agentEvents: agentEvents.length, agentTools, agentErrors: agentErrors.length,
111
+ };
76
112
  }
77
113
 
78
114
  export function diffRuns(beforeDir: string, afterDir: string): RunDiff {
@@ -112,9 +148,27 @@ export function diffRuns(beforeDir: string, afterDir: string): RunDiff {
112
148
  const newAlerts = [...after.alertMessages].filter(a => !before.alertMessages.has(a));
113
149
  const resolvedAlerts = [...before.alertMessages].filter(a => !after.alertMessages.has(a));
114
150
 
151
+ // LLM comparison
152
+ const costDelta = after.llmCost - before.llmCost;
153
+ const afterModels = [...after.llmModels];
154
+ const beforeModels = [...before.llmModels];
155
+ const modelChanges: string[] = [];
156
+ for (const m of afterModels) if (!before.llmModels.has(m)) modelChanges.push(`+ ${m}`);
157
+ for (const m of beforeModels) if (!after.llmModels.has(m)) modelChanges.push(`- ${m}`);
158
+
159
+ // Agent comparison
160
+ const afterTools = [...after.agentTools];
161
+ const beforeTools = [...before.agentTools];
162
+ const newAgentTools = afterTools.filter(t => !before.agentTools.has(t));
163
+ const removedAgentTools = beforeTools.filter(t => !after.agentTools.has(t));
164
+
115
165
  // Verdict
116
- const improvements = resolvedErrors.length + resolvedAlerts.length + fasterBy.length + (nPlusOneAfter < nPlusOneBefore ? 1 : 0);
117
- const regressions = newErrors.length + newAlerts.length + slowerBy.length + (nPlusOneAfter > nPlusOneBefore ? 1 : 0);
166
+ const improvements = resolvedErrors.length + resolvedAlerts.length + fasterBy.length +
167
+ (nPlusOneAfter < nPlusOneBefore ? 1 : 0) + (costDelta < -0.001 ? 1 : 0) +
168
+ (after.agentErrors < before.agentErrors ? 1 : 0);
169
+ const regressions = newErrors.length + newAlerts.length + slowerBy.length +
170
+ (nPlusOneAfter > nPlusOneBefore ? 1 : 0) + (costDelta > before.llmCost * 0.2 ? 1 : 0) +
171
+ (after.agentErrors > before.agentErrors ? 1 : 0);
118
172
  const verdict: RunDiff['verdict'] = improvements > 0 && regressions === 0 ? 'improved' :
119
173
  regressions > 0 && improvements === 0 ? 'regressed' :
120
174
  improvements > 0 && regressions > 0 ? 'mixed' : 'unchanged';
@@ -124,6 +178,19 @@ export function diffRuns(beforeDir: string, afterDir: string): RunDiff {
124
178
  queries: { beforeTotal: before.queryCount, afterTotal: after.queryCount, newPatterns: newPatterns.slice(0, 5), removedPatterns: removedPatterns.slice(0, 5), nPlusOneBefore, nPlusOneAfter },
125
179
  errors: { beforeCount: before.errorCount, afterCount: after.errorCount, newErrors, resolvedErrors },
126
180
  alerts: { beforeCount: before.alertCount, afterCount: after.alertCount, newAlerts, resolvedAlerts },
181
+ llm: {
182
+ beforeCalls: before.llmCalls, afterCalls: after.llmCalls,
183
+ beforeCost: Math.round(before.llmCost * 10000) / 10000, afterCost: Math.round(after.llmCost * 10000) / 10000,
184
+ costDelta: Math.round(costDelta * 10000) / 10000,
185
+ beforeTokens: before.llmTokens, afterTokens: after.llmTokens,
186
+ modelChanges,
187
+ },
188
+ agents: {
189
+ beforeSteps: before.agentEvents, afterSteps: after.agentEvents,
190
+ beforeTools, afterTools,
191
+ newTools: newAgentTools, removedTools: removedAgentTools,
192
+ beforeErrors: before.agentErrors, afterErrors: after.agentErrors,
193
+ },
127
194
  verdict,
128
195
  };
129
196
  }
@@ -146,7 +213,7 @@ export function runDiffCommand(opts: DiffOptions): void {
146
213
  return;
147
214
  }
148
215
  if (!fs.existsSync(snapshotDir)) fs.mkdirSync(snapshotDir, { recursive: true });
149
- for (const f of ['observations.jsonl', 'queries.jsonl', 'errors.jsonl', 'alerts.jsonl', 'calltrace.jsonl']) {
216
+ for (const f of ['observations.jsonl', 'queries.jsonl', 'errors.jsonl', 'alerts.jsonl', 'calltrace.jsonl', 'llm.jsonl', 'agents.jsonl', 'mcp.jsonl']) {
150
217
  const src = path.join(trickleDir, f);
151
218
  if (fs.existsSync(src)) fs.copyFileSync(src, path.join(snapshotDir, f));
152
219
  }
@@ -199,6 +266,26 @@ export function runDiffCommand(opts: DiffOptions): void {
199
266
  if (diff.errors.newErrors.length > 0) console.log(chalk.red(` New errors: ${diff.errors.newErrors.join(', ').substring(0, 80)}`));
200
267
  if (diff.errors.resolvedErrors.length > 0) console.log(chalk.green(` Resolved: ${diff.errors.resolvedErrors.join(', ').substring(0, 80)}`));
201
268
 
269
+ // LLM diff
270
+ if (diff.llm.beforeCalls > 0 || diff.llm.afterCalls > 0) {
271
+ console.log(` LLM calls: ${diff.llm.beforeCalls} → ${diff.llm.afterCalls}`);
272
+ const costColor = diff.llm.costDelta > 0 ? chalk.red : diff.llm.costDelta < 0 ? chalk.green : chalk.gray;
273
+ const costSign = diff.llm.costDelta > 0 ? '+' : '';
274
+ console.log(` LLM cost: $${diff.llm.beforeCost} → $${diff.llm.afterCost} (${costColor(costSign + '$' + diff.llm.costDelta.toFixed(4))})`);
275
+ if (diff.llm.modelChanges.length > 0) console.log(chalk.cyan(` Model changes: ${diff.llm.modelChanges.join(', ')}`));
276
+ }
277
+
278
+ // Agent diff
279
+ if (diff.agents.beforeSteps > 0 || diff.agents.afterSteps > 0) {
280
+ console.log(` Agent steps: ${diff.agents.beforeSteps} → ${diff.agents.afterSteps}`);
281
+ if (diff.agents.newTools.length > 0) console.log(chalk.green(` + New tools: ${diff.agents.newTools.join(', ')}`));
282
+ if (diff.agents.removedTools.length > 0) console.log(chalk.red(` - Removed tools: ${diff.agents.removedTools.join(', ')}`));
283
+ if (diff.agents.beforeErrors !== diff.agents.afterErrors) {
284
+ const errColor = diff.agents.afterErrors > diff.agents.beforeErrors ? chalk.red : chalk.green;
285
+ console.log(errColor(` Agent errors: ${diff.agents.beforeErrors} → ${diff.agents.afterErrors}`));
286
+ }
287
+ }
288
+
202
289
  console.log(chalk.gray(' ' + '─'.repeat(50)));
203
290
  console.log('');
204
291
  }
package/src/index.ts CHANGED
@@ -946,6 +946,16 @@ program
946
946
  whyCommand(query, opts);
947
947
  });
948
948
 
949
+ // trickle eval
950
+ program
951
+ .command("eval")
952
+ .description("Score agent runs on reliability — completion, errors, cost efficiency, tool reliability, latency")
953
+ .option("--json", "Output raw JSON for CI integration")
954
+ .action(async (opts) => {
955
+ const { evalCommand } = await import("./commands/eval");
956
+ evalCommand(opts);
957
+ });
958
+
949
959
  // trickle cost-report
950
960
  program
951
961
  .command("cost-report")