@townco/debugger 0.1.31 → 0.1.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/App.tsx +1 -0
- package/src/analysis/analyzer.ts +1 -2
- package/src/analysis/comparison-analyzer.ts +528 -0
- package/src/analysis/comparison-schema.ts +151 -0
- package/src/analysis/comparison-types.ts +194 -0
- package/src/analysis-db.ts +13 -6
- package/src/comparison-db.ts +75 -3
- package/src/components/AnalyzeAllButton.tsx +6 -2
- package/src/components/ComparisonAnalysisDialog.tsx +591 -0
- package/src/components/DebuggerHeader.tsx +0 -1
- package/src/components/LogList.tsx +9 -0
- package/src/components/SessionTraceList.tsx +9 -0
- package/src/components/SpanDetailsPanel.tsx +20 -1
- package/src/components/SpanTimeline.tsx +31 -4
- package/src/components/SpanTree.tsx +10 -1
- package/src/components/TurnMetadataPanel.tsx +0 -1
- package/src/components/UnifiedTimeline.tsx +25 -35
- package/src/components/ui/button.tsx +1 -1
- package/src/components/ui/card.tsx +1 -1
- package/src/components/ui/checkbox.tsx +43 -0
- package/src/components/ui/input.tsx +1 -1
- package/src/components/ui/label.tsx +1 -1
- package/src/components/ui/select.tsx +1 -1
- package/src/components/ui/textarea.tsx +1 -1
- package/src/frontend.tsx +2 -0
- package/src/lib/metrics.test.ts +2 -0
- package/src/lib/turnExtractor.ts +28 -0
- package/src/pages/ComparisonView.tsx +1310 -322
- package/src/pages/FindSessions.tsx +3 -1
- package/src/pages/TownHall.tsx +30 -14
- package/src/server.ts +177 -7
- package/src/types.ts +4 -0
- package/styles/globals.css +120 -0
- package/tsconfig.json +2 -2
package/package.json
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@townco/debugger",
|
|
3
|
-
"version": "0.1.31",
|
|
3
|
+
"version": "0.1.33",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"engines": {
|
|
6
6
|
"bun": ">=1.3.0"
|
|
7
7
|
},
|
|
8
8
|
"files": [
|
|
9
9
|
"src",
|
|
10
|
+
"styles",
|
|
10
11
|
"tsconfig.json"
|
|
11
12
|
],
|
|
12
13
|
"scripts": {
|
|
@@ -18,27 +19,26 @@
|
|
|
18
19
|
"@anthropic-ai/sdk": "^0.70.0",
|
|
19
20
|
"@lancedb/lancedb": "^0.22.3",
|
|
20
21
|
"@radix-ui/react-dialog": "^1.1.15",
|
|
21
|
-
"@radix-ui/react-label": "^2.1.
|
|
22
|
+
"@radix-ui/react-label": "^2.1.8",
|
|
22
23
|
"@radix-ui/react-select": "^2.2.6",
|
|
23
|
-
"@radix-ui/react-slot": "^1.2.
|
|
24
|
-
"@radix-ui/react-tabs": "^1.1.
|
|
25
|
-
"@townco/otlp-server": "0.1.
|
|
26
|
-
"@townco/ui": "0.1.
|
|
27
|
-
"bun-plugin-tailwind": "^0.1.2",
|
|
24
|
+
"@radix-ui/react-slot": "^1.2.4",
|
|
25
|
+
"@radix-ui/react-tabs": "^1.1.13",
|
|
26
|
+
"@townco/otlp-server": "0.1.33",
|
|
27
|
+
"@townco/ui": "^0.1.77",
|
|
28
28
|
"class-variance-authority": "^0.7.1",
|
|
29
29
|
"clsx": "^2.1.1",
|
|
30
|
-
"lucide-react": "^0.
|
|
30
|
+
"lucide-react": "^0.556.0",
|
|
31
31
|
"openai": "^4.77.3",
|
|
32
|
-
"react": "19.2.1",
|
|
33
|
-
"
|
|
34
|
-
"tailwind-merge": "^3.3.1",
|
|
32
|
+
"react-dom": "^19.2.1",
|
|
33
|
+
"tailwind-merge": "^3.4.0",
|
|
35
34
|
"zod": "^4.1.13"
|
|
36
35
|
},
|
|
37
36
|
"devDependencies": {
|
|
38
|
-
"@townco/tsconfig": "0.1.
|
|
37
|
+
"@townco/tsconfig": "0.1.75",
|
|
39
38
|
"@types/bun": "latest",
|
|
40
|
-
"@types/react": "^19",
|
|
41
|
-
"@types/react-dom": "^19",
|
|
39
|
+
"@types/react": "^19.2.7",
|
|
40
|
+
"@types/react-dom": "^19.2.3",
|
|
41
|
+
"bun-plugin-tailwind": "^0.1.2",
|
|
42
42
|
"tailwindcss": "^4.1.11",
|
|
43
43
|
"tw-animate-css": "^1.4.0",
|
|
44
44
|
"typescript": "^5.9.3"
|
package/src/App.tsx
CHANGED
package/src/analysis/analyzer.ts
CHANGED
|
@@ -7,7 +7,6 @@ import { LLMAnalysisOutputSchema, SessionAnalysisSchema } from "./schema";
|
|
|
7
7
|
import type {
|
|
8
8
|
AnalysisMetrics,
|
|
9
9
|
DetailedToolCall,
|
|
10
|
-
LLMAnalysisOutput,
|
|
11
10
|
PreComputedFields,
|
|
12
11
|
SessionAnalysis,
|
|
13
12
|
} from "./types";
|
|
@@ -118,7 +117,7 @@ function formatConversationTranscript(session: StoredSession): string {
|
|
|
118
117
|
|
|
119
118
|
for (const block of msg.content) {
|
|
120
119
|
if (block.type === "text") {
|
|
121
|
-
transcript += block.text
|
|
120
|
+
transcript += `${block.text}\n`;
|
|
122
121
|
} else if (block.type === "tool_call") {
|
|
123
122
|
transcript += `[Tool: ${block.title}`;
|
|
124
123
|
if (block.status === "completed") {
|
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Comparison analyzer - analyzes comparison runs using Claude to produce
|
|
3
|
+
* Reproducibility Reports and Change Impact Reports.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
7
|
+
import type { ComparisonConfig, SessionMetrics } from "../types";
|
|
8
|
+
import {
|
|
9
|
+
LLMComparisonOutputSchema,
|
|
10
|
+
SessionComparisonAnalysisSchema,
|
|
11
|
+
} from "./comparison-schema";
|
|
12
|
+
import type {
|
|
13
|
+
ConfigSummary,
|
|
14
|
+
LLMComparisonOutput,
|
|
15
|
+
SessionComparisonAnalysis,
|
|
16
|
+
} from "./comparison-types";
|
|
17
|
+
|
|
18
|
+
// Session types (same as analyzer.ts)

/**
 * A stored agent session as consumed by this analyzer: an identifier, the
 * ordered list of messages, descriptive metadata and an opaque context array
 * (not inspected in this module).
 */
type StoredSession = {
	sessionId: string;
	messages: SessionMessage[];
	metadata: {
		createdAt: string;
		updatedAt: string;
		agentName: string;
	};
	context: unknown[];
};

/**
 * One message of a session. `role` is rendered as the section header of the
 * transcript; `content` holds the blocks rendered beneath it.
 */
type SessionMessage = {
	role: "user" | "assistant";
	content: ContentBlock[];
	timestamp: string;
};

/**
 * Discriminated union (on `type`) of the block kinds a message may contain.
 * Only "text" and "tool_call" blocks are rendered into transcripts by this
 * module; "image" blocks are carried with arbitrary extra keys.
 */
type ContentBlock =
	| { type: "text"; text: string }
	| { type: "image"; [key: string]: unknown }
	| {
			type: "tool_call";
			id: string;
			title: string;
			status: "pending" | "in_progress" | "completed" | "failed";
			// Optional error text; appended to the tool line in transcripts when set.
			error?: string;
	  };
|
|
46
|
+
|
|
47
|
+
const anthropic = new Anthropic({
|
|
48
|
+
apiKey: process.env.ANTHROPIC_API_KEY,
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
const ANALYSIS_MODEL = "claude-sonnet-4-5-20250929";
|
|
52
|
+
|
|
53
|
+
const COMPARISON_SYSTEM_PROMPT = `You are an expert AI agent behavior analyst helping software engineers improve their agents.
|
|
54
|
+
|
|
55
|
+
You will analyze 3 versions of the same agent task:
|
|
56
|
+
- ORIGINAL: The historical session the engineer is trying to understand/improve
|
|
57
|
+
- CONTROL: A fresh replay with the same configuration (tests reproducibility)
|
|
58
|
+
- VARIANT: A replay with modified configuration (tests the engineer's changes)
|
|
59
|
+
|
|
60
|
+
Your job is to produce TWO reports:
|
|
61
|
+
|
|
62
|
+
1. REPRODUCIBILITY REPORT (Original vs Control)
|
|
63
|
+
- Assess whether the baseline behavior is stable
|
|
64
|
+
- Identify any concerning divergences that suggest non-determinism
|
|
65
|
+
- Help the engineer understand if they can trust A/B comparisons
|
|
66
|
+
|
|
67
|
+
2. CHANGE IMPACT REPORT (Control vs Variant)
|
|
68
|
+
- Evaluate whether the engineer's changes achieved their stated hypothesis
|
|
69
|
+
- Identify intended effects (did the change work?)
|
|
70
|
+
- Identify unintended effects (regressions, side effects)
|
|
71
|
+
- Provide specific, actionable recommendations
|
|
72
|
+
|
|
73
|
+
CRITICAL ANALYSIS PRINCIPLES:
|
|
74
|
+
|
|
75
|
+
1. SPECIFICITY: Reference exact tool names, quote transcripts, cite evidence
|
|
76
|
+
- BAD: "The variant used fewer tool calls"
|
|
77
|
+
- GOOD: "Variant made 3 [TOOL:read_file] calls vs control's 7. It skipped redundant reads of config.json (called 3x in control) by caching the result."
|
|
78
|
+
|
|
79
|
+
2. EVIDENCE-BASED: Always cite your evidence using these formats:
|
|
80
|
+
- [TOOL:name] for tool names
|
|
81
|
+
- [ARG:key=value] for tool arguments
|
|
82
|
+
- [MSG:"quote..."] for quoting assistant messages
|
|
83
|
+
- [OUTPUT:"excerpt..."] for tool outputs
|
|
84
|
+
|
|
85
|
+
3. ACTIONABLE RECOMMENDATIONS:
|
|
86
|
+
- BAD: "Consider making the prompt more specific"
|
|
87
|
+
- GOOD: "Add to system prompt: 'When reading configuration files, cache the contents for the session to avoid redundant reads.'"
|
|
88
|
+
|
|
89
|
+
4. HYPOTHESIS-FOCUSED: The user stated a hypothesis about what their change would do. Directly evaluate whether that hypothesis was achieved.
|
|
90
|
+
|
|
91
|
+
5. METRICS INTERPRETATION: Don't just report numbers - explain what they mean for the user.
|
|
92
|
+
- BAD: "Token usage increased by 20%"
|
|
93
|
+
- GOOD: "Token usage increased by 20% (from 5,000 to 6,000), adding ~$0.03 per session, but the improved answer quality likely justifies this cost."
|
|
94
|
+
|
|
95
|
+
You must respond with valid JSON matching this schema:
|
|
96
|
+
{
|
|
97
|
+
"reproducibility": {
|
|
98
|
+
"verdict": "STABLE" | "UNSTABLE" | "PARTIALLY_STABLE",
|
|
99
|
+
"confidence": "HIGH" | "MEDIUM" | "LOW",
|
|
100
|
+
"summary": "2-3 sentences summarizing reproducibility findings",
|
|
101
|
+
"behavioral_differences": [
|
|
102
|
+
{
|
|
103
|
+
"category": "TOOL_USAGE" | "RESPONSE_CONTENT" | "REASONING_PATH" | "ERROR_HANDLING" | "PERFORMANCE",
|
|
104
|
+
"observation": "What was different",
|
|
105
|
+
"evidence": "Specific quotes/citations",
|
|
106
|
+
"significance": "CRITICAL" | "NOTABLE" | "MINOR"
|
|
107
|
+
}
|
|
108
|
+
],
|
|
109
|
+
"metric_comparison": {
|
|
110
|
+
"duration_delta_pct": number,
|
|
111
|
+
"token_delta_pct": number,
|
|
112
|
+
"cost_delta_pct": number,
|
|
113
|
+
"tool_call_delta": number,
|
|
114
|
+
"interpretation": "What these metrics mean"
|
|
115
|
+
},
|
|
116
|
+
"recommendations": [
|
|
117
|
+
{
|
|
118
|
+
"priority": "HIGH" | "MEDIUM" | "LOW",
|
|
119
|
+
"action": "Specific action to take",
|
|
120
|
+
"rationale": "Why this helps"
|
|
121
|
+
}
|
|
122
|
+
]
|
|
123
|
+
},
|
|
124
|
+
"change_impact": {
|
|
125
|
+
"verdict": "IMPROVED" | "DEGRADED" | "NEUTRAL" | "MIXED",
|
|
126
|
+
"confidence": "HIGH" | "MEDIUM" | "LOW",
|
|
127
|
+
"summary": "2-3 sentences summarizing change impact",
|
|
128
|
+
"hypothesis_assessment": "Did the changes achieve the user's hypothesis?",
|
|
129
|
+
"intended_effects": [
|
|
130
|
+
{
|
|
131
|
+
"expected_change": "What was supposed to happen",
|
|
132
|
+
"observed_outcome": "What actually happened",
|
|
133
|
+
"evidence": "Specific quotes/citations",
|
|
134
|
+
"assessment": "ACHIEVED" | "PARTIALLY_ACHIEVED" | "NOT_ACHIEVED" | "OPPOSITE_EFFECT"
|
|
135
|
+
}
|
|
136
|
+
],
|
|
137
|
+
"unintended_effects": [
|
|
138
|
+
{
|
|
139
|
+
"observation": "What unexpected thing happened",
|
|
140
|
+
"evidence": "Specific quotes/citations",
|
|
141
|
+
"impact": "POSITIVE" | "NEGATIVE" | "NEUTRAL",
|
|
142
|
+
"severity": "CRITICAL" | "NOTABLE" | "MINOR"
|
|
143
|
+
}
|
|
144
|
+
],
|
|
145
|
+
"metric_comparison": {
|
|
146
|
+
"duration_delta_pct": number,
|
|
147
|
+
"token_delta_pct": number,
|
|
148
|
+
"cost_delta_pct": number,
|
|
149
|
+
"tool_call_delta": number,
|
|
150
|
+
"interpretation": "What these metrics mean"
|
|
151
|
+
},
|
|
152
|
+
"tool_usage_changes": [
|
|
153
|
+
{
|
|
154
|
+
"tool_name": "name",
|
|
155
|
+
"control_calls": number,
|
|
156
|
+
"variant_calls": number,
|
|
157
|
+
"pattern_change": "How usage changed"
|
|
158
|
+
}
|
|
159
|
+
],
|
|
160
|
+
"recommendations": [
|
|
161
|
+
{
|
|
162
|
+
"priority": "HIGH" | "MEDIUM" | "LOW",
|
|
163
|
+
"action": "Specific action to take",
|
|
164
|
+
"rationale": "Why this helps",
|
|
165
|
+
"expected_impact": "What improvement to expect"
|
|
166
|
+
}
|
|
167
|
+
]
|
|
168
|
+
},
|
|
169
|
+
"next_experiments": [
|
|
170
|
+
{
|
|
171
|
+
"hypothesis": "What you want to test",
|
|
172
|
+
"suggested_change": {
|
|
173
|
+
"dimension": "model" | "system_prompt" | "tools",
|
|
174
|
+
"description": "What to change",
|
|
175
|
+
"example": "Concrete example if prompt change"
|
|
176
|
+
},
|
|
177
|
+
"expected_outcome": "What you expect to happen"
|
|
178
|
+
}
|
|
179
|
+
]
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
Respond with ONLY the JSON object, no additional text.`;
|
|
183
|
+
|
|
184
|
+
/**
 * Render one session as a labelled plain-text transcript, bounded by a
 * character budget.
 *
 * Output shape: a `### <label>` heading, then for every message a
 * `## <ROLE>` header followed by its text blocks (one per line) and tool
 * calls (`[TOOL:<title> - <status>[: <error>]]`). Image blocks are skipped.
 *
 * Truncation rules:
 * - A single text block longer than 1500 chars is reduced to its first 800
 *   and last 400 chars around a `[...truncated...]` marker.
 * - Once the next piece would push the transcript past `maxChars`, a
 *   `[...transcript truncated...]` marker is appended and rendering stops.
 *
 * @param session  Session whose messages are rendered.
 * @param label    Heading text identifying the session (e.g. "CONTROL SESSION").
 * @param maxChars Approximate upper bound on the transcript length.
 * @returns The formatted transcript.
 */
function formatTranscript(
	session: StoredSession,
	label: string,
	maxChars: number = 15000,
): string {
	const truncationMarker = "\n[...transcript truncated...]\n";

	let transcript = `### ${label}\n\n`;
	let currentLength = transcript.length;

	for (const msg of session.messages) {
		const roleHeader = `## ${msg.role.toUpperCase()}\n`;

		if (currentLength + roleHeader.length > maxChars) {
			transcript += truncationMarker;
			break;
		}

		transcript += roleHeader;
		currentLength += roleHeader.length;

		for (const block of msg.content) {
			if (block.type === "text") {
				let text = block.text;

				// Truncate very long text blocks, keeping the head and the tail.
				if (text.length > 1500) {
					text = `${text.slice(0, 800)}\n[...truncated...]\n${text.slice(-400)}`;
				}

				if (currentLength + text.length > maxChars) {
					// Leave ~50 chars of headroom for the marker. The remaining room
					// must be clamped at 0: a negative end index makes slice() count
					// from the END of the string, which would emit most of the
					// oversized text instead of none of it.
					const room = Math.max(0, maxChars - currentLength - 50);
					transcript += text.slice(0, room);
					transcript += truncationMarker;
					return transcript;
				}

				transcript += `${text}\n`;
				currentLength += text.length + 1;
			} else if (block.type === "tool_call") {
				const toolInfo = `[TOOL:${block.title} - ${block.status}${block.error ? `: ${block.error}` : ""}]\n`;

				if (currentLength + toolInfo.length > maxChars) {
					transcript += truncationMarker;
					return transcript;
				}

				transcript += toolInfo;
				currentLength += toolInfo.length;
			}
		}

		// Blank line between messages.
		transcript += "\n";
		currentLength += 1;
	}

	return transcript;
}
|
|
242
|
+
|
|
243
|
+
/**
 * Describe, as prompt text, what differs between the control and variant
 * configurations for each compared dimension.
 *
 * A section is emitted only when the dimension is listed in
 * `config.dimensions` AND the corresponding variant value is set:
 * - "model": control model (or "default") vs variant model.
 * - "system_prompt": original and variant prompts, each clipped to its first
 *   1500 chars when longer than 2000 chars.
 * - "tools": tools added/removed relative to `originalTools`. When
 *   `originalTools` is not supplied the baseline is unknown, so the variant
 *   list is reported as-is instead of labelling every tool as "added".
 *
 * @param config               Comparison configuration of the run.
 * @param originalSystemPrompt System prompt of the original session, if known.
 * @param originalTools        Tool names of the original session, if known.
 * @returns The sections joined by blank lines, or
 *          "No config changes specified" when no section applies.
 */
function formatConfigDiff(
	config: ComparisonConfig,
	originalSystemPrompt?: string,
	originalTools?: string[],
): string {
	// Truncate very long prompts for display.
	const clipPrompt = (prompt: string): string =>
		prompt.length > 2000
			? `${prompt.slice(0, 1500)}\n[...truncated...]`
			: prompt;
	const listOrNone = (names: string[]): string =>
		names.length > 0 ? names.join(", ") : "none";

	const parts: string[] = [];

	if (config.dimensions.includes("model") && config.variantModel) {
		parts.push(`MODEL CHANGE:
- Control: ${config.controlModel || "default"}
- Variant: ${config.variantModel}`);
	}

	if (
		config.dimensions.includes("system_prompt") &&
		config.variantSystemPrompt
	) {
		const originalTruncated = originalSystemPrompt
			? clipPrompt(originalSystemPrompt)
			: "[not available]";
		const variantTruncated = clipPrompt(config.variantSystemPrompt);

		parts.push(`SYSTEM PROMPT CHANGE:

Control (Original):
"""
${originalTruncated}
"""

Variant (Modified):
"""
${variantTruncated}
"""`);
	}

	if (config.dimensions.includes("tools") && config.variantTools) {
		const variantTools = config.variantTools;

		if (!originalTools) {
			// Without a baseline an added/removed diff would be fabricated.
			parts.push(`TOOLS CHANGE:
- Original tools: [not available]
- Variant tools: ${listOrNone(variantTools)}`);
		} else {
			const added = variantTools.filter((t) => !originalTools.includes(t));
			const removed = originalTools.filter((t) => !variantTools.includes(t));

			parts.push(`TOOLS CHANGE:
- Added: ${listOrNone(added)}
- Removed: ${listOrNone(removed)}`);
		}
	}

	return parts.length > 0 ? parts.join("\n\n") : "No config changes specified";
}
|
|
301
|
+
|
|
302
|
+
/**
 * Render the metrics of the three sessions as a Markdown table for the prompt.
 *
 * Rows: duration (seconds, one decimal), input/output/total tokens, estimated
 * cost (dollars, four decimals) and tool-call count. A missing session or a
 * missing/null value is rendered as "N/A".
 *
 * Counts are grouped with the fixed "en-US" locale so the prompt text does
 * not vary with the locale of the host running the debugger.
 *
 * @param original Metrics of the original session, or null when unavailable.
 * @param control  Metrics of the control replay, or null when unavailable.
 * @param variant  Metrics of the variant replay, or null when unavailable.
 * @returns The Markdown table, without a trailing newline.
 */
function formatMetricsTable(
	original: SessionMetrics | null,
	control: SessionMetrics | null,
	variant: SessionMetrics | null,
): string {
	// `== null` deliberately covers both undefined and null values.
	const fmt = (val: number | null | undefined) =>
		val == null ? "N/A" : val.toLocaleString("en-US");
	const fmtCost = (val: number | null | undefined) =>
		val == null ? "N/A" : `$${val.toFixed(4)}`;
	const fmtDur = (val: number | null | undefined) =>
		val == null ? "N/A" : `${(val / 1000).toFixed(1)}s`;

	return `| Metric | Original | Control | Variant |
|--------|----------|---------|---------|
| Duration | ${fmtDur(original?.durationMs)} | ${fmtDur(control?.durationMs)} | ${fmtDur(variant?.durationMs)} |
| Input Tokens | ${fmt(original?.inputTokens)} | ${fmt(control?.inputTokens)} | ${fmt(variant?.inputTokens)} |
| Output Tokens | ${fmt(original?.outputTokens)} | ${fmt(control?.outputTokens)} | ${fmt(variant?.outputTokens)} |
| Total Tokens | ${fmt(original?.totalTokens)} | ${fmt(control?.totalTokens)} | ${fmt(variant?.totalTokens)} |
| Est. Cost | ${fmtCost(original?.estimatedCost)} | ${fmtCost(control?.estimatedCost)} | ${fmtCost(variant?.estimatedCost)} |
| Tool Calls | ${fmt(original?.toolCallCount)} | ${fmt(control?.toolCallCount)} | ${fmt(variant?.toolCallCount)} |`;
}
|
|
326
|
+
|
|
327
|
+
/**
 * Assemble the user message sent to the model: the hypothesis, the config
 * diff, the metrics table and the three transcripts, followed by the task
 * statement.
 *
 * An empty or whitespace-only hypothesis is replaced by an instruction to
 * infer the expected change from the config diff; otherwise the hypothesis is
 * inserted verbatim.
 *
 * @param hypothesis         The engineer's stated hypothesis (may be empty).
 * @param configDiff         Pre-formatted config diff section.
 * @param metricsTable       Pre-formatted metrics table.
 * @param originalTranscript Pre-formatted transcript of the original session.
 * @param controlTranscript  Pre-formatted transcript of the control session.
 * @param variantTranscript  Pre-formatted transcript of the variant session.
 * @returns The complete prompt text.
 */
function buildComparisonPrompt(
	hypothesis: string,
	configDiff: string,
	metricsTable: string,
	originalTranscript: string,
	controlTranscript: string,
	variantTranscript: string,
): string {
	// A blank-looking hypothesis carries no information; treat it as absent.
	const hypothesisSection = hypothesis?.trim()
		? hypothesis
		: "No hypothesis provided - infer the expected change from the config diff.";

	return `# COMPARISON ANALYSIS REQUEST

## USER'S HYPOTHESIS
${hypothesisSection}

## CONFIG DIFF
${configDiff}

## METRICS COMPARISON
${metricsTable}

## TRANSCRIPTS

${originalTranscript}

${controlTranscript}

${variantTranscript}

---

Analyze these three sessions and produce:
1. A REPRODUCIBILITY REPORT comparing Original vs Control
2. A CHANGE IMPACT REPORT comparing Control vs Variant

Focus on specific, evidence-based observations with actionable recommendations.`;
}
|
|
365
|
+
|
|
366
|
+
/**
 * Pull the JSON payload out of a model reply.
 *
 * Handles, in order:
 * 1. A reply that already is a bare JSON object: returned trimmed.
 * 2. A Markdown code fence (with or without a `json` tag, any letter case,
 *    LF or CRLF line endings, optional spaces after the tag): the fenced
 *    content is used.
 * 3. Prose around an object: the span from the first `{` to the last `}`
 *    is returned.
 *
 * No validation is performed; the caller is expected to JSON.parse the result.
 *
 * @param text Raw text returned by the model.
 * @returns The best candidate JSON text, trimmed.
 */
function extractJSON(text: string): string {
	const whole = text.trim();

	// Fast path: avoids mistaking ``` sequences inside JSON string values for a fence.
	if (whole.startsWith("{") && whole.endsWith("}")) {
		return whole;
	}

	const fenced = whole.match(/```(?:json)?\s*([\s\S]*?)\s*```/i);
	const candidate = (fenced?.[1] ? fenced[1] : whole).trim();

	// Top-level arrays are left untouched; brace slicing would strip their brackets.
	if (candidate.startsWith("[")) {
		return candidate;
	}

	const firstBrace = candidate.indexOf("{");
	const lastBrace = candidate.lastIndexOf("}");
	if (firstBrace !== -1 && lastBrace > firstBrace) {
		return candidate.slice(firstBrace, lastBrace + 1);
	}

	return candidate;
}
|
|
376
|
+
|
|
377
|
+
/**
 * Condense a comparison config into the summary stored with the analysis:
 * whether the system prompt was a compared dimension, the model switch (when
 * the model dimension is compared and a variant model is set) and the tool
 * names added/removed relative to `originalTools` (treated as empty when not
 * supplied).
 */
function buildConfigSummary(
	config: ComparisonConfig,
	originalTools?: string[],
): ConfigSummary {
	const { variantModel, variantTools } = config;

	const result: ConfigSummary = {
		system_prompt_changed: config.dimensions.includes("system_prompt"),
		tools_added: [],
		tools_removed: [],
	};

	const modelCompared = config.dimensions.includes("model");
	if (modelCompared && variantModel) {
		const from = config.controlModel || "default";
		result.model_change = { from, to: variantModel };
	}

	const toolsCompared = config.dimensions.includes("tools");
	if (toolsCompared && variantTools) {
		const baseline = originalTools || [];
		const isNew = (tool: string) => !baseline.includes(tool);
		const isGone = (tool: string) => !variantTools.includes(tool);

		result.tools_added = variantTools.filter(isNew);
		result.tools_removed = baseline.filter(isGone);
	}

	return result;
}
|
|
409
|
+
|
|
410
|
+
/**
 * Options for comparison analysis.
 *
 * Bundles everything `analyzeComparison` needs for one run: the three
 * sessions, their metrics and the configuration that produced the variant.
 */
export interface AnalyzeComparisonOptions {
	// Identifier of the comparison run; stored on the result as `comparison_run_id`.
	runId: string;
	// The engineer's stated hypothesis; may be empty, in which case the prompt
	// asks the model to infer the expected change from the config diff.
	hypothesis: string;
	// Compared dimensions and variant values.
	config: ComparisonConfig;
	// The three sessions whose transcripts are placed in the prompt.
	originalSession: StoredSession;
	controlSession: StoredSession;
	variantSession: StoredSession;
	// Per-session metrics; null renders as "N/A" in the metrics table.
	originalMetrics: SessionMetrics | null;
	controlMetrics: SessionMetrics | null;
	variantMetrics: SessionMetrics | null;
	// Baseline system prompt, shown next to the variant prompt in the config diff.
	originalSystemPrompt?: string | undefined;
	// Baseline tool names, used to compute the tools added/removed.
	originalTools?: string[] | undefined;
}
|
|
426
|
+
|
|
427
|
+
/**
 * Analyze a comparison run using Claude.
 *
 * Steps:
 * 1. Format the config diff, the metrics table and the three transcripts.
 * 2. Build the user prompt from those parts and the hypothesis.
 * 3. Call the Anthropic Messages API with the comparison system prompt.
 * 4. Take the first text block of the reply, extract its JSON and validate it
 *    against `LLMComparisonOutputSchema`.
 * 5. Combine the model output with run identifiers and the config summary,
 *    and validate the result against `SessionComparisonAnalysisSchema`.
 *
 * @param options Sessions, metrics and configuration of the run.
 * @returns The validated comparison analysis.
 * @throws Error when the reply has no content, contains no text block, or its
 *         text is not valid JSON (the message notes when the reply was cut
 *         off at the token limit). Schema `parse` failures and API errors
 *         propagate unchanged.
 */
export async function analyzeComparison(
	options: AnalyzeComparisonOptions,
): Promise<SessionComparisonAnalysis> {
	const {
		runId,
		hypothesis,
		config,
		originalSession,
		controlSession,
		variantSession,
		originalMetrics,
		controlMetrics,
		variantMetrics,
		originalSystemPrompt,
		originalTools,
	} = options;

	// 1. Format all components
	const configDiff = formatConfigDiff(
		config,
		originalSystemPrompt,
		originalTools,
	);
	const metricsTable = formatMetricsTable(
		originalMetrics,
		controlMetrics,
		variantMetrics,
	);
	const originalTranscript = formatTranscript(
		originalSession,
		"ORIGINAL SESSION",
	);
	const controlTranscript = formatTranscript(controlSession, "CONTROL SESSION");
	const variantTranscript = formatTranscript(variantSession, "VARIANT SESSION");

	// 2. Build the prompt
	const prompt = buildComparisonPrompt(
		hypothesis,
		configDiff,
		metricsTable,
		originalTranscript,
		controlTranscript,
		variantTranscript,
	);

	// 3. Call Claude
	const response = await anthropic.messages.create({
		model: ANALYSIS_MODEL,
		max_tokens: 8192,
		temperature: 0,
		system: COMPARISON_SYSTEM_PROMPT,
		messages: [{ role: "user", content: prompt }],
	});

	// 4. Parse and validate response
	if (response.content.length === 0) {
		throw new Error("No content in response");
	}

	// Use the first text block rather than assuming it sits at index 0.
	let responseText: string | undefined;
	for (const block of response.content) {
		if (block.type === "text") {
			responseText = block.text;
			break;
		}
	}
	if (responseText === undefined) {
		throw new Error("Unexpected response format");
	}

	const jsonText = extractJSON(responseText);

	// Debug: log the raw response if parsing fails
	let parsed: unknown;
	try {
		parsed = JSON.parse(jsonText);
	} catch (parseError) {
		console.error("Failed to parse LLM response as JSON:");
		console.error("Raw response:", responseText.slice(0, 1000));
		console.error("Extracted JSON text:", jsonText.slice(0, 1000));

		// A reply cut off at the token limit is the usual cause of broken JSON;
		// say so instead of leaving only a low-level parser message.
		const truncationHint =
			response.stop_reason === "max_tokens"
				? " (response was truncated at the max_tokens limit)"
				: "";
		throw new Error(
			`Invalid JSON in LLM response: ${parseError instanceof Error ? parseError.message : "Unknown parse error"}${truncationHint}`,
		);
	}

	const llmOutput: LLMComparisonOutput =
		LLMComparisonOutputSchema.parse(parsed);

	// 5. Build full analysis object
	const analysis: SessionComparisonAnalysis = {
		comparison_run_id: runId,
		created_at: new Date().toISOString(),
		original_session_id: originalSession.sessionId,
		control_session_id: controlSession.sessionId,
		variant_session_id: variantSession.sessionId,
		hypothesis,
		dimensions_compared: config.dimensions,
		config_summary: buildConfigSummary(config, originalTools),
		reproducibility: llmOutput.reproducibility,
		change_impact: llmOutput.change_impact,
		next_experiments: llmOutput.next_experiments,
	};

	// 6. Validate final schema
	return SessionComparisonAnalysisSchema.parse(analysis);
}
|