@townco/debugger 0.1.31 → 0.1.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,12 +1,13 @@
1
1
  {
2
2
  "name": "@townco/debugger",
3
- "version": "0.1.31",
3
+ "version": "0.1.33",
4
4
  "type": "module",
5
5
  "engines": {
6
6
  "bun": ">=1.3.0"
7
7
  },
8
8
  "files": [
9
9
  "src",
10
+ "styles",
10
11
  "tsconfig.json"
11
12
  ],
12
13
  "scripts": {
@@ -18,27 +19,26 @@
18
19
  "@anthropic-ai/sdk": "^0.70.0",
19
20
  "@lancedb/lancedb": "^0.22.3",
20
21
  "@radix-ui/react-dialog": "^1.1.15",
21
- "@radix-ui/react-label": "^2.1.7",
22
+ "@radix-ui/react-label": "^2.1.8",
22
23
  "@radix-ui/react-select": "^2.2.6",
23
- "@radix-ui/react-slot": "^1.2.3",
24
- "@radix-ui/react-tabs": "^1.1.0",
25
- "@townco/otlp-server": "0.1.31",
26
- "@townco/ui": "0.1.76",
27
- "bun-plugin-tailwind": "^0.1.2",
24
+ "@radix-ui/react-slot": "^1.2.4",
25
+ "@radix-ui/react-tabs": "^1.1.13",
26
+ "@townco/otlp-server": "0.1.33",
27
+ "@townco/ui": "^0.1.77",
28
28
  "class-variance-authority": "^0.7.1",
29
29
  "clsx": "^2.1.1",
30
- "lucide-react": "^0.545.0",
30
+ "lucide-react": "^0.556.0",
31
31
  "openai": "^4.77.3",
32
- "react": "19.2.1",
33
- "react-dom": "19.2.1",
34
- "tailwind-merge": "^3.3.1",
32
+ "react-dom": "^19.2.1",
33
+ "tailwind-merge": "^3.4.0",
35
34
  "zod": "^4.1.13"
36
35
  },
37
36
  "devDependencies": {
38
- "@townco/tsconfig": "0.1.73",
37
+ "@townco/tsconfig": "0.1.75",
39
38
  "@types/bun": "latest",
40
- "@types/react": "^19",
41
- "@types/react-dom": "^19",
39
+ "@types/react": "^19.2.7",
40
+ "@types/react-dom": "^19.2.3",
41
+ "bun-plugin-tailwind": "^0.1.2",
42
42
  "tailwindcss": "^4.1.11",
43
43
  "tw-animate-css": "^1.4.0",
44
44
  "typescript": "^5.9.3"
package/src/App.tsx CHANGED
@@ -57,6 +57,7 @@ class ErrorBoundary extends Component<
57
57
  {this.state.error?.message}
58
58
  </pre>
59
59
  <button
60
+ type="button"
60
61
  onClick={() => window.location.reload()}
61
62
  style={{
62
63
  marginTop: "1rem",
@@ -7,7 +7,6 @@ import { LLMAnalysisOutputSchema, SessionAnalysisSchema } from "./schema";
7
7
  import type {
8
8
  AnalysisMetrics,
9
9
  DetailedToolCall,
10
- LLMAnalysisOutput,
11
10
  PreComputedFields,
12
11
  SessionAnalysis,
13
12
  } from "./types";
@@ -118,7 +117,7 @@ function formatConversationTranscript(session: StoredSession): string {
118
117
 
119
118
  for (const block of msg.content) {
120
119
  if (block.type === "text") {
121
- transcript += block.text + "\n";
120
+ transcript += `${block.text}\n`;
122
121
  } else if (block.type === "tool_call") {
123
122
  transcript += `[Tool: ${block.title}`;
124
123
  if (block.status === "completed") {
@@ -0,0 +1,528 @@
1
+ /**
2
+ * Comparison analyzer - analyzes comparison runs using Claude to produce
3
+ * Reproducibility Reports and Change Impact Reports.
4
+ */
5
+
6
+ import Anthropic from "@anthropic-ai/sdk";
7
+ import type { ComparisonConfig, SessionMetrics } from "../types";
8
+ import {
9
+ LLMComparisonOutputSchema,
10
+ SessionComparisonAnalysisSchema,
11
+ } from "./comparison-schema";
12
+ import type {
13
+ ConfigSummary,
14
+ LLMComparisonOutput,
15
+ SessionComparisonAnalysis,
16
+ } from "./comparison-types";
17
+
18
+ // Session types (same as analyzer.ts)
19
/**
 * A persisted agent session in the shape this analyzer consumes.
 *
 * Within this file only `sessionId` and `messages` are read; `metadata` and
 * `context` are declared but never inspected here.
 * NOTE(review): presumably kept in sync by hand with the session type in
 * analyzer.ts — confirm, or move to a shared module.
 */
type StoredSession = {
	// Copied into the final analysis record to identify the compared session.
	sessionId: string;
	// Conversation turns, rendered in array order when a transcript is built.
	messages: SessionMessage[];
	metadata: {
		createdAt: string;
		updatedAt: string;
		agentName: string;
	};
	// Opaque to this module.
	context: unknown[];
};
29
+
30
/**
 * One conversation turn inside a stored session.
 */
type SessionMessage = {
	// Upper-cased and used as the turn heading in the rendered transcript.
	role: "user" | "assistant";
	// Ordered blocks that make up the turn (text, images, tool calls).
	content: ContentBlock[];
	// Not read anywhere in this file.
	timestamp: string;
};
35
+
36
/**
 * A single piece of message content, discriminated by `type`.
 *
 * The transcript formatter in this file renders `text` and `tool_call`
 * blocks; it has no branch for `image`, so image blocks are left out of the
 * transcript.
 */
type ContentBlock =
	| { type: "text"; text: string }
	| { type: "image"; [key: string]: unknown }
	| {
			type: "tool_call";
			id: string;
			// Tool name shown in the transcript's [TOOL:...] marker.
			title: string;
			status: "pending" | "in_progress" | "completed" | "failed";
			// Appended to the [TOOL:...] marker when present.
			error?: string;
	  };
46
+
47
// Anthropic client shared by every request in this module. It is constructed
// once at module load, with the API key taken from the ANTHROPIC_API_KEY
// environment variable.
const anthropic = new Anthropic({
	apiKey: process.env.ANTHROPIC_API_KEY,
});

// Model identifier sent with the comparison-analysis request.
const ANALYSIS_MODEL = "claude-sonnet-4-5-20250929";
52
+
53
+ const COMPARISON_SYSTEM_PROMPT = `You are an expert AI agent behavior analyst helping software engineers improve their agents.
54
+
55
+ You will analyze 3 versions of the same agent task:
56
+ - ORIGINAL: The historical session the engineer is trying to understand/improve
57
+ - CONTROL: A fresh replay with the same configuration (tests reproducibility)
58
+ - VARIANT: A replay with modified configuration (tests the engineer's changes)
59
+
60
+ Your job is to produce TWO reports:
61
+
62
+ 1. REPRODUCIBILITY REPORT (Original vs Control)
63
+ - Assess whether the baseline behavior is stable
64
+ - Identify any concerning divergences that suggest non-determinism
65
+ - Help the engineer understand if they can trust A/B comparisons
66
+
67
+ 2. CHANGE IMPACT REPORT (Control vs Variant)
68
+ - Evaluate whether the engineer's changes achieved their stated hypothesis
69
+ - Identify intended effects (did the change work?)
70
+ - Identify unintended effects (regressions, side effects)
71
+ - Provide specific, actionable recommendations
72
+
73
+ CRITICAL ANALYSIS PRINCIPLES:
74
+
75
+ 1. SPECIFICITY: Reference exact tool names, quote transcripts, cite evidence
76
+ - BAD: "The variant used fewer tool calls"
77
+ - GOOD: "Variant made 3 [TOOL:read_file] calls vs control's 7. It skipped redundant reads of config.json (called 3x in control) by caching the result."
78
+
79
+ 2. EVIDENCE-BASED: Always cite your evidence using these formats:
80
+ - [TOOL:name] for tool names
81
+ - [ARG:key=value] for tool arguments
82
+ - [MSG:"quote..."] for quoting assistant messages
83
+ - [OUTPUT:"excerpt..."] for tool outputs
84
+
85
+ 3. ACTIONABLE RECOMMENDATIONS:
86
+ - BAD: "Consider making the prompt more specific"
87
+ - GOOD: "Add to system prompt: 'When reading configuration files, cache the contents for the session to avoid redundant reads.'"
88
+
89
+ 4. HYPOTHESIS-FOCUSED: The user stated a hypothesis about what their change would do. Directly evaluate whether that hypothesis was achieved.
90
+
91
+ 5. METRICS INTERPRETATION: Don't just report numbers - explain what they mean for the user.
92
+ - BAD: "Token usage increased by 20%"
93
+ - GOOD: "Token usage increased by 20% (from 5,000 to 6,000), adding ~$0.03 per session, but the improved answer quality likely justifies this cost."
94
+
95
+ You must respond with valid JSON matching this schema:
96
+ {
97
+ "reproducibility": {
98
+ "verdict": "STABLE" | "UNSTABLE" | "PARTIALLY_STABLE",
99
+ "confidence": "HIGH" | "MEDIUM" | "LOW",
100
+ "summary": "2-3 sentences summarizing reproducibility findings",
101
+ "behavioral_differences": [
102
+ {
103
+ "category": "TOOL_USAGE" | "RESPONSE_CONTENT" | "REASONING_PATH" | "ERROR_HANDLING" | "PERFORMANCE",
104
+ "observation": "What was different",
105
+ "evidence": "Specific quotes/citations",
106
+ "significance": "CRITICAL" | "NOTABLE" | "MINOR"
107
+ }
108
+ ],
109
+ "metric_comparison": {
110
+ "duration_delta_pct": number,
111
+ "token_delta_pct": number,
112
+ "cost_delta_pct": number,
113
+ "tool_call_delta": number,
114
+ "interpretation": "What these metrics mean"
115
+ },
116
+ "recommendations": [
117
+ {
118
+ "priority": "HIGH" | "MEDIUM" | "LOW",
119
+ "action": "Specific action to take",
120
+ "rationale": "Why this helps"
121
+ }
122
+ ]
123
+ },
124
+ "change_impact": {
125
+ "verdict": "IMPROVED" | "DEGRADED" | "NEUTRAL" | "MIXED",
126
+ "confidence": "HIGH" | "MEDIUM" | "LOW",
127
+ "summary": "2-3 sentences summarizing change impact",
128
+ "hypothesis_assessment": "Did the changes achieve the user's hypothesis?",
129
+ "intended_effects": [
130
+ {
131
+ "expected_change": "What was supposed to happen",
132
+ "observed_outcome": "What actually happened",
133
+ "evidence": "Specific quotes/citations",
134
+ "assessment": "ACHIEVED" | "PARTIALLY_ACHIEVED" | "NOT_ACHIEVED" | "OPPOSITE_EFFECT"
135
+ }
136
+ ],
137
+ "unintended_effects": [
138
+ {
139
+ "observation": "What unexpected thing happened",
140
+ "evidence": "Specific quotes/citations",
141
+ "impact": "POSITIVE" | "NEGATIVE" | "NEUTRAL",
142
+ "severity": "CRITICAL" | "NOTABLE" | "MINOR"
143
+ }
144
+ ],
145
+ "metric_comparison": {
146
+ "duration_delta_pct": number,
147
+ "token_delta_pct": number,
148
+ "cost_delta_pct": number,
149
+ "tool_call_delta": number,
150
+ "interpretation": "What these metrics mean"
151
+ },
152
+ "tool_usage_changes": [
153
+ {
154
+ "tool_name": "name",
155
+ "control_calls": number,
156
+ "variant_calls": number,
157
+ "pattern_change": "How usage changed"
158
+ }
159
+ ],
160
+ "recommendations": [
161
+ {
162
+ "priority": "HIGH" | "MEDIUM" | "LOW",
163
+ "action": "Specific action to take",
164
+ "rationale": "Why this helps",
165
+ "expected_impact": "What improvement to expect"
166
+ }
167
+ ]
168
+ },
169
+ "next_experiments": [
170
+ {
171
+ "hypothesis": "What you want to test",
172
+ "suggested_change": {
173
+ "dimension": "model" | "system_prompt" | "tools",
174
+ "description": "What to change",
175
+ "example": "Concrete example if prompt change"
176
+ },
177
+ "expected_outcome": "What you expect to happen"
178
+ }
179
+ ]
180
+ }
181
+
182
+ Respond with ONLY the JSON object, no additional text.`;
183
+
184
/**
 * Render a session as a plain-text transcript for the comparison prompt,
 * stopping once a character budget is reached.
 *
 * Output starts with a `### <label>` heading. Each message contributes a
 * `## ROLE` heading followed by its text blocks and one `[TOOL:...]` line per
 * tool call; image blocks are not rendered. Text blocks longer than 1500
 * characters are shortened to their first 800 and last 400 characters.
 *
 * @param session - Session whose messages are rendered, in order.
 * @param label - Heading text identifying the session (e.g. "CONTROL SESSION").
 * @param maxChars - Approximate character budget for the transcript body. The
 *   closing truncation notice is appended on top of this budget.
 * @returns The transcript, ending with a truncation notice if it was cut short.
 */
function formatTranscript(
	session: StoredSession,
	label: string,
	maxChars: number = 15000,
): string {
	const truncationNotice = "\n[...transcript truncated...]\n";
	let transcript = `### ${label}\n\n`;
	let currentLength = transcript.length;

	for (const msg of session.messages) {
		const roleHeader = `## ${msg.role.toUpperCase()}\n`;

		if (currentLength + roleHeader.length > maxChars) {
			transcript += truncationNotice;
			break;
		}

		transcript += roleHeader;
		currentLength += roleHeader.length;

		for (const block of msg.content) {
			if (block.type === "text") {
				let text = block.text;

				// Keep the head and tail of very long text blocks.
				if (text.length > 1500) {
					text = `${text.slice(0, 800)}\n[...truncated...]\n${text.slice(-400)}`;
				}

				if (currentLength + text.length > maxChars) {
					// Leave ~50 characters of headroom before the notice. The clamp
					// matters: with less than 50 characters of budget left the end
					// index would be negative, and slice() would then count from the
					// END of the string and emit almost the entire block.
					const room = Math.max(0, maxChars - currentLength - 50);
					transcript += text.slice(0, room);
					transcript += truncationNotice;
					return transcript;
				}

				transcript += `${text}\n`;
				currentLength += text.length + 1;
			} else if (block.type === "tool_call") {
				const toolInfo = `[TOOL:${block.title} - ${block.status}${block.error ? `: ${block.error}` : ""}]\n`;

				if (currentLength + toolInfo.length > maxChars) {
					transcript += truncationNotice;
					return transcript;
				}

				transcript += toolInfo;
				currentLength += toolInfo.length;
			}
		}

		// Blank line between messages.
		transcript += "\n";
		currentLength += 1;
	}

	return transcript;
}
242
+
243
/**
 * Describe, as prompt text, how the variant configuration differs from the
 * control configuration.
 *
 * A section is produced for each compared dimension that also has a variant
 * value: model, system prompt, and tools. Sections are separated by a blank
 * line.
 *
 * @param config - Comparison settings: the compared dimensions and variant values.
 * @param originalSystemPrompt - Control system prompt; shown as
 *   "[not available]" when missing or empty.
 * @param originalTools - Control tool names; treated as empty when missing.
 * @returns The joined sections, or "No config changes specified" if none apply.
 */
function formatConfigDiff(
	config: ComparisonConfig,
	originalSystemPrompt?: string,
	originalTools?: string[],
): string {
	// Prompts over 2000 characters are shown as their first 1500 plus a marker.
	const clip = (prompt: string): string =>
		prompt.length > 2000
			? `${prompt.slice(0, 1500)}\n[...truncated...]`
			: prompt;
	const listOrNone = (names: string[]): string =>
		names.length > 0 ? names.join(", ") : "none";

	const { variantModel, variantSystemPrompt, variantTools } = config;
	const sections: string[] = [];

	if (variantModel && config.dimensions.includes("model")) {
		sections.push(
			[
				"MODEL CHANGE:",
				`- Control: ${config.controlModel || "default"}`,
				`- Variant: ${variantModel}`,
			].join("\n"),
		);
	}

	if (variantSystemPrompt && config.dimensions.includes("system_prompt")) {
		const controlText = originalSystemPrompt
			? clip(originalSystemPrompt)
			: "[not available]";

		sections.push(
			[
				"SYSTEM PROMPT CHANGE:",
				"",
				"Control (Original):",
				'"""',
				controlText,
				'"""',
				"",
				"Variant (Modified):",
				'"""',
				clip(variantSystemPrompt),
				'"""',
			].join("\n"),
		);
	}

	if (variantTools && config.dimensions.includes("tools")) {
		const baseline = originalTools || [];
		const added = variantTools.filter((name) => !baseline.includes(name));
		const removed = baseline.filter((name) => !variantTools.includes(name));

		sections.push(
			[
				"TOOLS CHANGE:",
				`- Added: ${listOrNone(added)}`,
				`- Removed: ${listOrNone(removed)}`,
			].join("\n"),
		);
	}

	if (sections.length === 0) {
		return "No config changes specified";
	}
	return sections.join("\n\n");
}
301
+
302
/**
 * Build a Markdown table comparing metrics across the three sessions.
 *
 * The table has one row each for duration, input/output/total tokens,
 * estimated cost and tool-call count. Any value that is missing (session
 * metrics are `null`, or the field is absent/`null`) or not a finite number
 * is rendered as "N/A".
 *
 * Numbers are grouped with the fixed "en-US" locale so the text sent to the
 * model does not vary with the host machine's locale settings.
 *
 * @param original - Metrics of the original session, or `null` if unavailable.
 * @param control - Metrics of the control replay, or `null` if unavailable.
 * @param variant - Metrics of the variant replay, or `null` if unavailable.
 * @returns The table as a newline-joined string.
 */
function formatMetricsTable(
	original: SessionMetrics | null,
	control: SessionMetrics | null,
	variant: SessionMetrics | null,
): string {
	type MetricValue = number | null | undefined;

	const isUsable = (val: MetricValue): val is number =>
		typeof val === "number" && Number.isFinite(val);

	const fmt = (val: MetricValue): string =>
		isUsable(val) ? val.toLocaleString("en-US") : "N/A";
	const fmtCost = (val: MetricValue): string =>
		isUsable(val) ? `$${val.toFixed(4)}` : "N/A";
	// Durations are stored in milliseconds and displayed in seconds.
	const fmtDur = (val: MetricValue): string =>
		isUsable(val) ? `${(val / 1000).toFixed(1)}s` : "N/A";

	return [
		"| Metric | Original | Control | Variant |",
		"|--------|----------|---------|---------|",
		`| Duration | ${fmtDur(original?.durationMs)} | ${fmtDur(control?.durationMs)} | ${fmtDur(variant?.durationMs)} |`,
		`| Input Tokens | ${fmt(original?.inputTokens)} | ${fmt(control?.inputTokens)} | ${fmt(variant?.inputTokens)} |`,
		`| Output Tokens | ${fmt(original?.outputTokens)} | ${fmt(control?.outputTokens)} | ${fmt(variant?.outputTokens)} |`,
		`| Total Tokens | ${fmt(original?.totalTokens)} | ${fmt(control?.totalTokens)} | ${fmt(variant?.totalTokens)} |`,
		`| Est. Cost | ${fmtCost(original?.estimatedCost)} | ${fmtCost(control?.estimatedCost)} | ${fmtCost(variant?.estimatedCost)} |`,
		`| Tool Calls | ${fmt(original?.toolCallCount)} | ${fmt(control?.toolCallCount)} | ${fmt(variant?.toolCallCount)} |`,
	].join("\n");
}
326
+
327
/**
 * Assemble the user message sent to the model for a comparison analysis.
 *
 * The message contains, in order: the user's hypothesis, the config diff,
 * the metrics table, the three transcripts, and closing instructions asking
 * for the two reports.
 *
 * @param hypothesis - What the engineer expects the change to do. When empty
 *   or whitespace-only, a sentence telling the model to infer the expected
 *   change from the config diff is used instead.
 * @param configDiff - Pre-formatted description of the configuration changes.
 * @param metricsTable - Pre-formatted metrics comparison table.
 * @param originalTranscript - Pre-formatted transcript of the original session.
 * @param controlTranscript - Pre-formatted transcript of the control replay.
 * @param variantTranscript - Pre-formatted transcript of the variant replay.
 * @returns The complete prompt text.
 */
function buildComparisonPrompt(
	hypothesis: string,
	configDiff: string,
	metricsTable: string,
	originalTranscript: string,
	controlTranscript: string,
	variantTranscript: string,
): string {
	// Trim first so a hypothesis made only of spaces/newlines falls through to
	// the fallback instead of leaving an empty section in the prompt.
	const statedHypothesis =
		hypothesis.trim() ||
		"No hypothesis provided - infer the expected change from the config diff.";

	return `# COMPARISON ANALYSIS REQUEST

## USER'S HYPOTHESIS
${statedHypothesis}

## CONFIG DIFF
${configDiff}

## METRICS COMPARISON
${metricsTable}

## TRANSCRIPTS

${originalTranscript}

${controlTranscript}

${variantTranscript}

---

Analyze these three sessions and produce:
1. A REPRODUCIBILITY REPORT comparing Original vs Control
2. A CHANGE IMPACT REPORT comparing Control vs Variant

Focus on specific, evidence-based observations with actionable recommendations.`;
}
365
+
366
/**
 * Pull the JSON payload out of a model reply.
 *
 * Handles three shapes, in this order:
 * 1. A Markdown code fence (with or without a `json` tag, any letter case,
 *    LF or CRLF line endings, optional trailing spaces after the tag) — the
 *    fence's contents are returned.
 * 2. A reply that already starts with `{` or `[` — returned trimmed.
 * 3. A reply with prose around an object — the span from the first `{` to
 *    the last `}` is returned.
 *
 * No parsing happens here; the caller is expected to run `JSON.parse` on the
 * result and handle failure.
 *
 * @param text - Raw text of the model reply.
 * @returns The best candidate JSON substring, or the trimmed input when no
 *   candidate is found.
 */
function extractJSON(text: string): string {
	const fenced = text.match(
		/```(?:json)?[ \t]*\r?\n([\s\S]*?)\r?\n[ \t]*```/i,
	);
	if (fenced?.[1]) {
		return fenced[1];
	}

	const trimmed = text.trim();
	if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
		return trimmed;
	}

	// Unfenced reply with commentary before/after the object.
	const start = trimmed.indexOf("{");
	const end = trimmed.lastIndexOf("}");
	if (start !== -1 && end > start) {
		return trimmed.slice(start, end + 1);
	}

	return trimmed;
}
376
+
377
/**
 * Condense a comparison config into the structured summary stored with the
 * analysis.
 *
 * - `system_prompt_changed` is true whenever "system_prompt" is one of the
 *   compared dimensions.
 * - `model_change` is set only when "model" is compared and a variant model
 *   is given; a missing control model is reported as "default".
 * - `tools_added` / `tools_removed` are filled only when "tools" is compared
 *   and a variant tool list is given; otherwise both stay empty.
 *
 * @param config - Comparison settings.
 * @param originalTools - Control tool names; treated as empty when missing.
 * @returns The summary object.
 */
function buildConfigSummary(
	config: ComparisonConfig,
	originalTools?: string[],
): ConfigSummary {
	const { dimensions, variantModel } = config;
	const variantToolNames = dimensions.includes("tools")
		? config.variantTools
		: undefined;

	let toolsAdded: string[] = [];
	let toolsRemoved: string[] = [];
	if (variantToolNames) {
		const baseline = originalTools || [];
		toolsAdded = variantToolNames.filter((name) => !baseline.includes(name));
		toolsRemoved = baseline.filter((name) => !variantToolNames.includes(name));
	}

	const summary: ConfigSummary = {
		system_prompt_changed: dimensions.includes("system_prompt"),
		tools_added: toolsAdded,
		tools_removed: toolsRemoved,
	};

	if (variantModel && dimensions.includes("model")) {
		summary.model_change = {
			from: config.controlModel || "default",
			to: variantModel,
		};
	}

	return summary;
}
409
+
410
/**
 * Inputs for a single comparison analysis: the three sessions being compared,
 * their metrics, and the configuration change under test.
 */
export interface AnalyzeComparisonOptions {
	// Identifier of the comparison run; stored on the result as `comparison_run_id`.
	runId: string;
	// The engineer's stated expectation for the change; included in the prompt
	// and stored on the result.
	hypothesis: string;
	// Which dimensions are compared and the variant values for each.
	config: ComparisonConfig;
	// The historical session being investigated.
	originalSession: StoredSession;
	// Replay of the original with unchanged configuration.
	controlSession: StoredSession;
	// Replay of the original with the modified configuration.
	variantSession: StoredSession;
	// Metrics per session; pass `null` when unavailable (shown as "N/A" in the
	// metrics table).
	originalMetrics: SessionMetrics | null;
	controlMetrics: SessionMetrics | null;
	variantMetrics: SessionMetrics | null;
	// Control system prompt, shown next to the variant prompt in the config
	// diff; rendered as "[not available]" when omitted.
	originalSystemPrompt?: string | undefined;
	// Control tool names, used as the baseline for added/removed tools; treated
	// as an empty list when omitted.
	originalTools?: string[] | undefined;
}
426
+
427
/**
 * Analyze a comparison run with Claude and return the validated analysis.
 *
 * Steps: format the config diff, metrics table and three transcripts; build
 * the prompt; send it to the model with temperature 0; extract and parse the
 * JSON reply; validate it against the LLM output schema; combine it with run
 * metadata; validate the combined record against the full analysis schema.
 *
 * @param options - Sessions, metrics and configuration for the run.
 * @returns The complete, schema-validated comparison analysis.
 * @throws Error if the reply has no content, has no text block, or is not
 *   valid JSON (the raw reply is logged to stderr in that case).
 * @throws Whatever the schema `parse` calls throw when validation fails.
 * @throws Any error raised by the Anthropic client request.
 */
export async function analyzeComparison(
	options: AnalyzeComparisonOptions,
): Promise<SessionComparisonAnalysis> {
	const {
		runId,
		hypothesis,
		config,
		originalSession,
		controlSession,
		variantSession,
		originalMetrics,
		controlMetrics,
		variantMetrics,
		originalSystemPrompt,
		originalTools,
	} = options;

	// 1. Format all components
	const configDiff = formatConfigDiff(
		config,
		originalSystemPrompt,
		originalTools,
	);
	const metricsTable = formatMetricsTable(
		originalMetrics,
		controlMetrics,
		variantMetrics,
	);
	const originalTranscript = formatTranscript(
		originalSession,
		"ORIGINAL SESSION",
	);
	const controlTranscript = formatTranscript(controlSession, "CONTROL SESSION");
	const variantTranscript = formatTranscript(variantSession, "VARIANT SESSION");

	// 2. Build the prompt
	const prompt = buildComparisonPrompt(
		hypothesis,
		configDiff,
		metricsTable,
		originalTranscript,
		controlTranscript,
		variantTranscript,
	);

	// 3. Call Claude
	const response = await anthropic.messages.create({
		model: ANALYSIS_MODEL,
		max_tokens: 8192,
		temperature: 0,
		system: COMPARISON_SYSTEM_PROMPT,
		messages: [{ role: "user", content: prompt }],
	});

	// 4. Parse and validate response
	if (response.content.length === 0) {
		throw new Error("No content in response");
	}
	// Use the first text block rather than assuming it sits at index 0, so a
	// leading non-text block does not cause a usable reply to be rejected.
	const content = response.content.find((block) => block.type === "text");
	if (!content || content.type !== "text") {
		throw new Error("Unexpected response format");
	}

	const jsonText = extractJSON(content.text);

	// Debug: log the raw response if parsing fails
	let parsed: unknown;
	try {
		parsed = JSON.parse(jsonText);
	} catch (parseError) {
		console.error("Failed to parse LLM response as JSON:");
		console.error("Raw response:", content.text.slice(0, 1000));
		console.error("Extracted JSON text:", jsonText.slice(0, 1000));

		const reason =
			parseError instanceof Error ? parseError.message : "Unknown parse error";
		// A reply cut off at the token limit is the usual cause of broken JSON
		// here; say so instead of leaving only a syntax error to go on.
		const truncationHint =
			response.stop_reason === "max_tokens"
				? " (response was truncated at the max_tokens limit)"
				: "";
		throw new Error(`Invalid JSON in LLM response: ${reason}${truncationHint}`);
	}

	const llmOutput: LLMComparisonOutput =
		LLMComparisonOutputSchema.parse(parsed);

	// 5. Build full analysis object
	const analysis: SessionComparisonAnalysis = {
		comparison_run_id: runId,
		created_at: new Date().toISOString(),
		original_session_id: originalSession.sessionId,
		control_session_id: controlSession.sessionId,
		variant_session_id: variantSession.sessionId,
		hypothesis,
		dimensions_compared: config.dimensions,
		config_summary: buildConfigSummary(config, originalTools),
		reproducibility: llmOutput.reproducibility,
		change_impact: llmOutput.change_impact,
		next_experiments: llmOutput.next_experiments,
	};

	// 6. Validate final schema
	return SessionComparisonAnalysisSchema.parse(analysis);
}