@townco/debugger 0.1.28 → 0.1.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +7 -4
- package/src/App.tsx +6 -0
- package/src/analysis/analyzer.ts +272 -0
- package/src/analysis/embeddings.ts +97 -0
- package/src/analysis/schema.ts +91 -0
- package/src/analysis/types.ts +157 -0
- package/src/analysis-db.ts +238 -0
- package/src/comparison-db.test.ts +28 -5
- package/src/comparison-db.ts +57 -9
- package/src/components/AnalyzeAllButton.tsx +81 -0
- package/src/components/DebuggerHeader.tsx +12 -0
- package/src/components/SessionAnalysisButton.tsx +109 -0
- package/src/components/SessionAnalysisDialog.tsx +240 -0
- package/src/components/UnifiedTimeline.tsx +3 -3
- package/src/components/ui/dialog.tsx +120 -0
- package/src/db.ts +3 -2
- package/src/lib/metrics.ts +131 -11
- package/src/pages/ComparisonView.tsx +618 -177
- package/src/pages/FindSessions.tsx +247 -0
- package/src/pages/SessionList.tsx +76 -10
- package/src/pages/SessionView.tsx +33 -1
- package/src/pages/TownHall.tsx +345 -187
- package/src/schemas.ts +27 -8
- package/src/server.ts +423 -3
- package/src/types.ts +11 -2
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ChevronDown, ChevronUp, Loader2 } from "lucide-react";
|
|
1
2
|
import { useCallback, useEffect, useRef, useState } from "react";
|
|
2
3
|
import { Button } from "@/components/ui/button";
|
|
3
4
|
import {
|
|
@@ -7,6 +8,7 @@ import {
|
|
|
7
8
|
CardHeader,
|
|
8
9
|
CardTitle,
|
|
9
10
|
} from "@/components/ui/card";
|
|
11
|
+
import type { SessionAnalysis } from "../analysis/types";
|
|
10
12
|
import { DebuggerLayout } from "../components/DebuggerLayout";
|
|
11
13
|
import { formatCost, formatDuration, formatTokens } from "../lib/metrics";
|
|
12
14
|
import type { ComparisonConfig, ComparisonRun, SessionMetrics } from "../types";
|
|
@@ -33,6 +35,259 @@ const AGENT_SERVER_URL =
|
|
|
33
35
|
? window.location.origin.replace(":4000", ":3100")
|
|
34
36
|
: "http://localhost:3100";
|
|
35
37
|
|
|
38
|
+
// Expandable Session Analysis Panel
|
|
39
|
+
/**
 * Collapsible card that summarises the analysis of one comparison session.
 *
 * Renders a spinner while `isLoading` is true, renders nothing when there is
 * no analysis, and otherwise shows a header button (with a colour-coded
 * outcome badge) that toggles the detail section through `onToggle`.
 *
 * @param analysis - Analysis to display, or `null` when none is available.
 * @param isLoading - Whether the analysis is still being loaded.
 * @param isExpanded - Whether the detail section is currently shown.
 * @param onToggle - Invoked when the header button is clicked.
 * @param accentColor - Colour theme of the panel.
 */
function SessionAnalysisPanel({
  analysis,
  isLoading,
  isExpanded,
  onToggle,
  accentColor,
}: {
  analysis: SessionAnalysis | null;
  isLoading: boolean;
  isExpanded: boolean;
  onToggle: () => void;
  accentColor: "blue" | "orange";
}) {
  const colorClasses =
    accentColor === "blue"
      ? "border-blue-200 dark:border-blue-800 bg-blue-50/50 dark:bg-blue-950/30"
      : "border-orange-200 dark:border-orange-800 bg-orange-50/50 dark:bg-orange-950/30";

  const headerColorClasses =
    accentColor === "blue"
      ? "hover:bg-blue-100/50 dark:hover:bg-blue-900/30"
      : "hover:bg-orange-100/50 dark:hover:bg-orange-900/30";

  if (isLoading) {
    return (
      <div className={`border rounded-md p-3 ${colorClasses}`}>
        <div className="flex items-center gap-2 text-xs text-muted-foreground">
          <Loader2 className="w-3 h-3 animate-spin" />
          Loading analysis...
        </div>
      </div>
    );
  }

  if (!analysis) {
    return null;
  }

  // Green for success, red for failure, yellow for any other status.
  const status = analysis.outcome.status;
  let statusBadgeClasses =
    "bg-yellow-100 text-yellow-700 dark:bg-yellow-900/50 dark:text-yellow-300";
  if (status === "SUCCESS") {
    statusBadgeClasses =
      "bg-green-100 text-green-700 dark:bg-green-900/50 dark:text-green-300";
  } else if (status === "FAILURE") {
    statusBadgeClasses =
      "bg-red-100 text-red-700 dark:bg-red-900/50 dark:text-red-300";
  }

  return (
    <div className={`border rounded-md overflow-hidden ${colorClasses}`}>
      {/* aria-expanded exposes the open/closed state to assistive technology */}
      <button
        type="button"
        onClick={onToggle}
        aria-expanded={isExpanded}
        className={`w-full px-3 py-2 flex items-center justify-between text-left transition-colors ${headerColorClasses}`}
      >
        <div className="flex items-center gap-2">
          <span className="text-xs font-semibold">Session Analysis</span>
          <span
            className={`text-[10px] px-1.5 py-0.5 rounded ${statusBadgeClasses}`}
          >
            {status}
          </span>
        </div>
        {isExpanded ? (
          <ChevronUp className="w-4 h-4 text-muted-foreground" />
        ) : (
          <ChevronDown className="w-4 h-4 text-muted-foreground" />
        )}
      </button>

      {isExpanded && (
        <div className="px-3 pb-3 space-y-3 text-xs">
          {/* Task */}
          <div>
            <div className="font-semibold text-muted-foreground mb-1">
              Task Summary
            </div>
            <div className="text-foreground">{analysis.task.task_summary}</div>
          </div>

          {/* Intent */}
          <div className="flex items-center gap-2">
            <span className="font-semibold text-muted-foreground">Intent:</span>
            <span className="px-2 py-0.5 bg-primary/10 text-primary rounded text-[11px] font-medium">
              {analysis.task.intent_type}
            </span>
          </div>

          {/* Trajectory */}
          <div>
            <div className="font-semibold text-muted-foreground mb-1">
              High Level Plan
            </div>
            <div className="text-foreground text-[11px] leading-relaxed">
              {analysis.trajectory.high_level_plan}
            </div>
          </div>

          {/* Outcome */}
          <div>
            <div className="font-semibold text-muted-foreground mb-1">
              Assessment
            </div>
            <div className="text-foreground text-[11px] leading-relaxed">
              {analysis.outcome.assessment}
            </div>
          </div>

          {/* Answer Type */}
          <div className="flex items-center gap-2">
            <span className="font-semibold text-muted-foreground">
              Answer Type:
            </span>
            <span className="px-2 py-0.5 bg-secondary text-secondary-foreground rounded text-[11px] font-medium">
              {analysis.outcome.answer_type}
            </span>
          </div>

          {/* Metrics Summary */}
          {analysis.metrics && (
            <div className="grid grid-cols-5 gap-2 pt-2 border-t border-border/50">
              <div>
                <div className="text-[10px] text-muted-foreground">
                  Duration
                </div>
                <div className="font-medium">
                  {formatDuration(analysis.metrics.durationMs)}
                </div>
              </div>
              <div>
                <div className="text-[10px] text-muted-foreground">Input</div>
                <div className="font-medium">
                  {formatTokens(analysis.metrics.inputTokens)}
                </div>
              </div>
              <div>
                <div className="text-[10px] text-muted-foreground">Output</div>
                <div className="font-medium">
                  {formatTokens(analysis.metrics.outputTokens)}
                </div>
              </div>
              <div>
                <div className="text-[10px] text-muted-foreground">Total</div>
                <div className="font-medium">
                  {formatTokens(analysis.metrics.totalTokens)}
                </div>
              </div>
              <div>
                <div className="text-[10px] text-muted-foreground">Cost</div>
                <div className="font-medium text-green-600 dark:text-green-400">
                  {formatCost(analysis.metrics.estimatedCost)}
                </div>
              </div>
            </div>
          )}
        </div>
      )}
    </div>
  );
}
|
|
195
|
+
|
|
196
|
+
// Collapsible Tool Calls Panel
|
|
197
|
+
/**
 * Collapsible card listing the tool calls recorded for one session.
 *
 * The header shows the number of calls and toggles the body through
 * `onToggle`. When expanded, each call is rendered as a `<details>` element
 * containing its JSON-formatted input and output.
 *
 * @param toolCalls - Tool calls to list; may be missing or empty.
 * @param isExpanded - Whether the list is currently shown.
 * @param onToggle - Invoked when the header button is clicked.
 * @param accentColor - Colour theme of the panel.
 */
function ToolCallsPanel({
  toolCalls,
  isExpanded,
  onToggle,
  accentColor,
}: {
  toolCalls: SessionMetrics["toolCalls"];
  isExpanded: boolean;
  onToggle: () => void;
  accentColor: "blue" | "orange";
}) {
  const colorClasses =
    accentColor === "blue"
      ? "border-blue-200 dark:border-blue-800 bg-blue-50/50 dark:bg-blue-950/30"
      : "border-orange-200 dark:border-orange-800 bg-orange-50/50 dark:bg-orange-950/30";

  const headerColorClasses =
    accentColor === "blue"
      ? "hover:bg-blue-100/50 dark:hover:bg-blue-900/30"
      : "hover:bg-orange-100/50 dark:hover:bg-orange-900/30";

  const toolCallCount = toolCalls?.length ?? 0;

  return (
    <div className={`border rounded-md overflow-hidden ${colorClasses}`}>
      {/* aria-expanded exposes the open/closed state to assistive technology */}
      <button
        type="button"
        onClick={onToggle}
        aria-expanded={isExpanded}
        className={`w-full px-3 py-2 flex items-center justify-between text-left transition-colors ${headerColorClasses}`}
      >
        <div className="flex items-center gap-2">
          <span className="text-xs font-semibold">Tool Calls</span>
          <span className="text-[10px] px-1.5 py-0.5 rounded bg-secondary text-secondary-foreground">
            {toolCallCount}
          </span>
        </div>
        {isExpanded ? (
          <ChevronUp className="w-4 h-4 text-muted-foreground" />
        ) : (
          <ChevronDown className="w-4 h-4 text-muted-foreground" />
        )}
      </button>

      {isExpanded && (
        <div className="px-3 pb-3">
          {!toolCalls || toolCalls.length === 0 ? (
            <div className="text-xs text-muted-foreground">No tool calls</div>
          ) : (
            <div className="space-y-2">
              {toolCalls.map((call, idx) => (
                <details
                  // The index is always part of the key: two calls can share
                  // both a name and a start timestamp, which would otherwise
                  // produce duplicate React keys.
                  key={`${idx}-${call.name}-${call.startTimeUnixNano ?? ""}`}
                  className="rounded-md border px-3 py-2 bg-background/50"
                >
                  <summary className="text-xs font-medium cursor-pointer flex items-center justify-between">
                    <span>
                      {call.name}{" "}
                      {call.startTimeUnixNano ? (
                        <span className="text-muted-foreground">
                          @{" "}
                          {/* nanoseconds -> milliseconds for Date */}
                          {new Date(
                            call.startTimeUnixNano / 1_000_000,
                          ).toLocaleTimeString()}
                        </span>
                      ) : null}
                    </span>
                    <span className="text-muted-foreground text-[11px]">
                      view
                    </span>
                  </summary>
                  <div className="mt-2 text-[11px] space-y-1 break-words">
                    <div>
                      <span className="font-semibold">Args:</span>{" "}
                      <pre className="break-words whitespace-pre-wrap bg-muted rounded p-2 mt-1 overflow-x-auto max-h-40">
                        {JSON.stringify(call.input, null, 2)}
                      </pre>
                    </div>
                    <div>
                      <span className="font-semibold">Result:</span>{" "}
                      <pre className="break-words whitespace-pre-wrap bg-muted rounded p-2 mt-1 overflow-x-auto max-h-40">
                        {JSON.stringify(call.output, null, 2)}
                      </pre>
                    </div>
                  </div>
                </details>
              ))}
            </div>
          )}
        </div>
      )}
    </div>
  );
}
|
|
290
|
+
|
|
36
291
|
export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
37
292
|
const [run, setRun] = useState<ComparisonRun | null>(null);
|
|
38
293
|
const [config, setConfig] = useState<ComparisonConfig | null>(null);
|
|
@@ -58,6 +313,28 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
58
313
|
const [isRunning, setIsRunning] = useState(false);
|
|
59
314
|
const [hasRun, setHasRun] = useState(false);
|
|
60
315
|
|
|
316
|
+
// Session analysis state
|
|
317
|
+
const [controlAnalysis, setControlAnalysis] =
|
|
318
|
+
useState<SessionAnalysis | null>(null);
|
|
319
|
+
const [variantAnalysis, setVariantAnalysis] =
|
|
320
|
+
useState<SessionAnalysis | null>(null);
|
|
321
|
+
const [controlAnalysisLoading, setControlAnalysisLoading] = useState(false);
|
|
322
|
+
const [variantAnalysisLoading, setVariantAnalysisLoading] = useState(false);
|
|
323
|
+
const [analysisExpanded, setAnalysisExpanded] = useState<{
|
|
324
|
+
control: boolean;
|
|
325
|
+
variant: boolean;
|
|
326
|
+
}>({
|
|
327
|
+
control: false,
|
|
328
|
+
variant: false,
|
|
329
|
+
});
|
|
330
|
+
const [toolCallsExpanded, setToolCallsExpanded] = useState<{
|
|
331
|
+
control: boolean;
|
|
332
|
+
variant: boolean;
|
|
333
|
+
}>({
|
|
334
|
+
control: false,
|
|
335
|
+
variant: false,
|
|
336
|
+
});
|
|
337
|
+
|
|
61
338
|
// Fetch comparison run details and restore saved messages
|
|
62
339
|
useEffect(() => {
|
|
63
340
|
Promise.all([
|
|
@@ -99,8 +376,10 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
99
376
|
}
|
|
100
377
|
}
|
|
101
378
|
|
|
102
|
-
// Fetch the config
|
|
103
|
-
return fetch(`/api/comparison-config`).then((res) =>
|
|
379
|
+
// Fetch the config by the run's configId (not the latest config!)
|
|
380
|
+
return fetch(`/api/comparison-config/${runData.configId}`).then((res) =>
|
|
381
|
+
res.json(),
|
|
382
|
+
);
|
|
104
383
|
})
|
|
105
384
|
.then((configData) => {
|
|
106
385
|
setConfig(configData);
|
|
@@ -112,6 +391,14 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
112
391
|
});
|
|
113
392
|
}, [runId]);
|
|
114
393
|
|
|
394
|
+
/**
 * Builds a request id of the form `<prefix>-[<sessionId>-]<unique suffix>`.
 *
 * The suffix is `crypto.randomUUID()` when the runtime offers it; otherwise
 * it is a hex-encoded random number followed by the hex-encoded current time.
 *
 * @param prefix - Leading segment of the id.
 * @param sessionId - Optional session id inserted after the prefix; omitted
 *   when empty or undefined.
 * @returns The assembled id, segments joined with "-".
 */
const generateRequestId = (prefix: string, sessionId?: string) => {
  let uniqueSuffix: string;
  if (typeof crypto !== "undefined" && "randomUUID" in crypto) {
    uniqueSuffix = crypto.randomUUID();
  } else {
    const randomHex = Math.random().toString(16).slice(2);
    const timeHex = Date.now().toString(16);
    uniqueSuffix = `${randomHex}-${timeHex}`;
  }

  const segments: string[] = [prefix];
  if (sessionId) {
    segments.push(sessionId);
  }
  segments.push(uniqueSuffix);
  return segments.join("-");
};
|
|
401
|
+
|
|
115
402
|
// Create a new session with the agent server
|
|
116
403
|
const createSession = async (
|
|
117
404
|
configOverrides?: Record<string, unknown>,
|
|
@@ -121,7 +408,7 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
121
408
|
headers: { "Content-Type": "application/json" },
|
|
122
409
|
body: JSON.stringify({
|
|
123
410
|
jsonrpc: "2.0",
|
|
124
|
-
id:
|
|
411
|
+
id: generateRequestId("init"),
|
|
125
412
|
method: "initialize",
|
|
126
413
|
params: {
|
|
127
414
|
protocolVersion: 1,
|
|
@@ -136,7 +423,7 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
136
423
|
headers: { "Content-Type": "application/json" },
|
|
137
424
|
body: JSON.stringify({
|
|
138
425
|
jsonrpc: "2.0",
|
|
139
|
-
id:
|
|
426
|
+
id: generateRequestId("session"),
|
|
140
427
|
method: "session/new",
|
|
141
428
|
params: {
|
|
142
429
|
cwd: "/",
|
|
@@ -154,7 +441,7 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
154
441
|
sessionId: string,
|
|
155
442
|
message: string,
|
|
156
443
|
onUpdate: (content: string) => void,
|
|
157
|
-
): Promise<
|
|
444
|
+
): Promise<string> => {
|
|
158
445
|
let accumulatedContent = "";
|
|
159
446
|
let abortController: AbortController | null = new AbortController();
|
|
160
447
|
|
|
@@ -228,7 +515,7 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
228
515
|
headers: { "Content-Type": "application/json" },
|
|
229
516
|
body: JSON.stringify({
|
|
230
517
|
jsonrpc: "2.0",
|
|
231
|
-
id:
|
|
518
|
+
id: generateRequestId("prompt", sessionId),
|
|
232
519
|
method: "session/prompt",
|
|
233
520
|
params: {
|
|
234
521
|
sessionId,
|
|
@@ -243,6 +530,9 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
243
530
|
// Abort the SSE connection since we're done
|
|
244
531
|
abortController.abort();
|
|
245
532
|
abortController = null;
|
|
533
|
+
|
|
534
|
+
// Return the accumulated content
|
|
535
|
+
return accumulatedContent;
|
|
246
536
|
};
|
|
247
537
|
|
|
248
538
|
// Run the comparison
|
|
@@ -271,15 +561,16 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
271
561
|
});
|
|
272
562
|
|
|
273
563
|
try {
|
|
274
|
-
// Build config overrides based on
|
|
564
|
+
// Build config overrides based on all selected dimensions
|
|
275
565
|
const variantOverrides: Record<string, unknown> = {};
|
|
276
|
-
|
|
566
|
+
const dimensions = config.dimensions || [];
|
|
567
|
+
if (dimensions.includes("model") && config.variantModel) {
|
|
277
568
|
variantOverrides.model = config.variantModel;
|
|
278
569
|
}
|
|
279
|
-
if (
|
|
570
|
+
if (dimensions.includes("system_prompt") && config.variantSystemPrompt) {
|
|
280
571
|
variantOverrides.systemPrompt = config.variantSystemPrompt;
|
|
281
572
|
}
|
|
282
|
-
if (
|
|
573
|
+
if (dimensions.includes("tools") && config.variantTools) {
|
|
283
574
|
variantOverrides.tools = config.variantTools;
|
|
284
575
|
}
|
|
285
576
|
|
|
@@ -307,123 +598,157 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
307
598
|
const startTime = Date.now();
|
|
308
599
|
|
|
309
600
|
// Track final responses and metrics
|
|
310
|
-
let
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
601
|
+
let finalControlMetrics: SessionMetrics = {
|
|
602
|
+
durationMs: 0,
|
|
603
|
+
inputTokens: 0,
|
|
604
|
+
outputTokens: 0,
|
|
605
|
+
totalTokens: 0,
|
|
606
|
+
estimatedCost: 0,
|
|
607
|
+
toolCallCount: 0,
|
|
608
|
+
};
|
|
609
|
+
let finalVariantMetrics: SessionMetrics = {
|
|
610
|
+
durationMs: 0,
|
|
611
|
+
inputTokens: 0,
|
|
612
|
+
outputTokens: 0,
|
|
613
|
+
totalTokens: 0,
|
|
614
|
+
estimatedCost: 0,
|
|
615
|
+
toolCallCount: 0,
|
|
616
|
+
};
|
|
617
|
+
|
|
618
|
+
// Helper to run a session and fetch metrics
|
|
619
|
+
/**
 * Sends the first message to one session, then fetches that session's metrics.
 *
 * Streaming updates are forwarded to `onContentUpdate`. Once the response is
 * complete, the metrics endpoint is polled every 2 s until two consecutive
 * polls report the same non-zero token total and the same tool-call count,
 * or until 60 s of polling delay has accumulated. The final metrics are
 * written into the session state through `setState`.
 *
 * @param sessionId - Session to prompt and collect metrics for.
 * @param model - Model name passed to the metrics endpoint.
 * @param setState - State setter of the session being run.
 * @param onContentUpdate - Receives the accumulated assistant content.
 * @returns The final response text and metrics. On failure the error is
 *   stored in the session state and an empty response with zeroed metrics
 *   is returned instead of throwing.
 */
const runSession = async (
  sessionId: string,
  model: string,
  setState: typeof setControlState,
  onContentUpdate: (content: string) => void,
): Promise<{ response: string; metrics: SessionMetrics }> => {
  // Placeholder metrics used when nothing usable was ever received.
  const zeroMetrics = (durationMs: number): SessionMetrics => ({
    durationMs,
    inputTokens: 0,
    outputTokens: 0,
    totalTokens: 0,
    estimatedCost: 0,
    toolCallCount: 0,
  });

  try {
    const response = await sendMessageAndCollect(
      sessionId,
      firstMessage,
      onContentUpdate,
    );

    const duration = Date.now() - startTime;

    // Poll metrics until they stabilize or we hit a max wait window.
    const fetchMetricsWithRetry = async (): Promise<SessionMetrics> => {
      const maxWaitMs = 60_000;
      const pollIntervalMs = 2_000;
      let elapsed = 0;
      let previousTokens = -1;
      let previousTools = -1;
      let lastMetrics: SessionMetrics | null = null;

      while (elapsed <= maxWaitMs) {
        try {
          const metricsRes = await fetch(
            `/api/session-metrics/${sessionId}?model=${encodeURIComponent(model)}`,
          );
          // fetch() resolves on HTTP error statuses too; without this check
          // an error payload would be recorded as if it were metrics.
          if (!metricsRes.ok) {
            throw new Error(
              `Metrics request failed with status ${metricsRes.status}`,
            );
          }
          const metrics = await metricsRes.json();
          const snapshot: SessionMetrics = { ...metrics, durationMs: duration };
          lastMetrics = snapshot;

          // If tokens/tool calls stopped changing and we have data, treat as final.
          if (
            metrics.totalTokens > 0 &&
            metrics.totalTokens === previousTokens &&
            metrics.toolCallCount === previousTools
          ) {
            return snapshot;
          }

          previousTokens = metrics.totalTokens ?? 0;
          previousTools = metrics.toolCallCount ?? 0;
        } catch {
          // Best effort: ignore this poll and try again after the delay.
        }

        await new Promise((r) => setTimeout(r, pollIntervalMs));
        elapsed += pollIntervalMs;
      }

      // Return whatever we last saw (or zeros if nothing ever arrived)
      return lastMetrics ?? zeroMetrics(duration);
    };

    const metrics = await fetchMetricsWithRetry();

    setState((prev) => ({
      ...prev,
      isStreaming: false,
      metrics,
    }));

    return { response, metrics };
  } catch (err) {
    setState((prev) => ({
      ...prev,
      isStreaming: false,
      error: err instanceof Error ? err.message : "Unknown error",
    }));
    return {
      response: "",
      metrics: zeroMetrics(0),
    };
  }
};
|
|
711
|
+
|
|
712
|
+
const controlModel = config.controlModel || "claude-sonnet-4-5-20250929";
|
|
713
|
+
const variantModel =
|
|
714
|
+
config.variantModel ||
|
|
715
|
+
config.controlModel ||
|
|
716
|
+
"claude-sonnet-4-5-20250929";
|
|
717
|
+
|
|
718
|
+
const [controlResult, variantResult] = await Promise.all([
|
|
719
|
+
runSession(
|
|
720
|
+
controlSessionId,
|
|
721
|
+
controlModel,
|
|
722
|
+
setControlState,
|
|
723
|
+
(content) => {
|
|
724
|
+
setControlState((prev) => ({
|
|
725
|
+
...prev,
|
|
726
|
+
messages: [
|
|
727
|
+
{ role: "user", content: firstMessage },
|
|
728
|
+
{ role: "assistant", content },
|
|
729
|
+
],
|
|
730
|
+
}));
|
|
731
|
+
},
|
|
732
|
+
),
|
|
733
|
+
runSession(
|
|
734
|
+
variantSessionId,
|
|
735
|
+
variantModel,
|
|
736
|
+
setVariantState,
|
|
737
|
+
(content) => {
|
|
419
738
|
setVariantState((prev) => ({
|
|
420
739
|
...prev,
|
|
421
|
-
|
|
422
|
-
|
|
740
|
+
messages: [
|
|
741
|
+
{ role: "user", content: firstMessage },
|
|
742
|
+
{ role: "assistant", content },
|
|
743
|
+
],
|
|
423
744
|
}));
|
|
424
|
-
}
|
|
745
|
+
},
|
|
746
|
+
),
|
|
425
747
|
]);
|
|
426
748
|
|
|
749
|
+
finalControlMetrics = controlResult.metrics;
|
|
750
|
+
finalVariantMetrics = variantResult.metrics;
|
|
751
|
+
|
|
427
752
|
// Update run status with responses and metrics
|
|
428
753
|
await fetch(`/api/comparison-run/${runId}/update`, {
|
|
429
754
|
method: "POST",
|
|
@@ -432,8 +757,8 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
432
757
|
status: "completed",
|
|
433
758
|
controlMetrics: finalControlMetrics,
|
|
434
759
|
variantMetrics: finalVariantMetrics,
|
|
435
|
-
controlResponse:
|
|
436
|
-
variantResponse:
|
|
760
|
+
controlResponse: controlResult.response,
|
|
761
|
+
variantResponse: variantResult.response,
|
|
437
762
|
}),
|
|
438
763
|
});
|
|
439
764
|
} catch (err) {
|
|
@@ -443,6 +768,91 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
443
768
|
}
|
|
444
769
|
}, [run, config, runId]);
|
|
445
770
|
|
|
771
|
+
// Function to fetch existing or trigger new session analysis
|
|
772
|
+
// Session ids whose analysis attempt already failed. The auto-trigger effects
// re-run whenever the loading flag flips back to false while no analysis is
// stored, so without this record a persistent failure would be retried in an
// endless request loop.
const failedAnalysisSessionsRef = useRef(new Set<string>());

/**
 * Loads the analysis for a session and stores it for the given side.
 *
 * A stored analysis is requested first; if none is returned, a new analysis
 * is requested via POST. On success the matching panel is expanded. On
 * failure the error is logged and the session id is remembered so later
 * calls for the same session return immediately.
 *
 * @param sessionId - Session to analyze.
 * @param type - Which side of the comparison the session belongs to.
 */
const triggerAnalysis = useCallback(
  async (sessionId: string, type: "control" | "variant") => {
    if (failedAnalysisSessionsRef.current.has(sessionId)) {
      return;
    }

    const setLoading =
      type === "control"
        ? setControlAnalysisLoading
        : setVariantAnalysisLoading;
    const setAnalysis =
      type === "control" ? setControlAnalysis : setVariantAnalysis;
    const encodedSessionId = encodeURIComponent(sessionId);

    setLoading(true);
    try {
      // First try to fetch existing analysis from cache
      const existingRes = await fetch(
        `/api/session-analyses?sessionId=${encodedSessionId}`,
      );
      if (existingRes.ok) {
        const existingAnalysis = await existingRes.json();
        if (existingAnalysis && !existingAnalysis.error) {
          setAnalysis(existingAnalysis);
          setAnalysisExpanded((prev) => ({ ...prev, [type]: true }));
          return;
        }
      }

      // No existing analysis, trigger new one
      const res = await fetch(`/api/analyze-session/${encodedSessionId}`, {
        method: "POST",
      });
      if (!res.ok) {
        throw new Error(`Analysis request failed with status ${res.status}`);
      }
      const analysis = await res.json();
      if (!analysis || analysis.error) {
        throw new Error("Analysis response did not contain an analysis");
      }
      setAnalysis(analysis);
      // Auto-expand when analysis completes
      setAnalysisExpanded((prev) => ({ ...prev, [type]: true }));
    } catch (err) {
      failedAnalysisSessionsRef.current.add(sessionId);
      console.error(`Failed to analyze ${type} session:`, err);
    } finally {
      setLoading(false);
    }
  },
  [],
);
|
|
814
|
+
|
|
815
|
+
// Auto-trigger analysis when sessions complete
|
|
816
|
+
useEffect(() => {
|
|
817
|
+
// Control session completed
|
|
818
|
+
if (
|
|
819
|
+
controlState.sessionId &&
|
|
820
|
+
!controlState.isStreaming &&
|
|
821
|
+
controlState.metrics &&
|
|
822
|
+
!controlAnalysis &&
|
|
823
|
+
!controlAnalysisLoading
|
|
824
|
+
) {
|
|
825
|
+
triggerAnalysis(controlState.sessionId, "control");
|
|
826
|
+
}
|
|
827
|
+
}, [
|
|
828
|
+
controlState.sessionId,
|
|
829
|
+
controlState.isStreaming,
|
|
830
|
+
controlState.metrics,
|
|
831
|
+
controlAnalysis,
|
|
832
|
+
controlAnalysisLoading,
|
|
833
|
+
triggerAnalysis,
|
|
834
|
+
]);
|
|
835
|
+
|
|
836
|
+
useEffect(() => {
|
|
837
|
+
// Variant session completed
|
|
838
|
+
if (
|
|
839
|
+
variantState.sessionId &&
|
|
840
|
+
!variantState.isStreaming &&
|
|
841
|
+
variantState.metrics &&
|
|
842
|
+
!variantAnalysis &&
|
|
843
|
+
!variantAnalysisLoading
|
|
844
|
+
) {
|
|
845
|
+
triggerAnalysis(variantState.sessionId, "variant");
|
|
846
|
+
}
|
|
847
|
+
}, [
|
|
848
|
+
variantState.sessionId,
|
|
849
|
+
variantState.isStreaming,
|
|
850
|
+
variantState.metrics,
|
|
851
|
+
variantAnalysis,
|
|
852
|
+
variantAnalysisLoading,
|
|
853
|
+
triggerAnalysis,
|
|
854
|
+
]);
|
|
855
|
+
|
|
446
856
|
if (loading) {
|
|
447
857
|
return (
|
|
448
858
|
<DebuggerLayout title="Comparison" showBackButton backHref="/town-hall">
|
|
@@ -464,31 +874,49 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
464
874
|
}
|
|
465
875
|
|
|
466
876
|
/**
 * Describes the control side of the comparison, one entry per selected
 * dimension, joined with " | ". Returns an empty string when there is no
 * config or no dimension is selected.
 */
const getControlDimensionLabel = () => {
  const selected = config?.dimensions ?? [];
  const controlModelName = config?.controlModel || "original";

  const parts: string[] = [];
  for (const dimension of selected) {
    if (dimension === "model") {
      parts.push(`Model: ${controlModelName}`);
    } else if (dimension === "system_prompt") {
      parts.push("System Prompt: original");
    } else if (dimension === "tools") {
      parts.push("Tools: original");
    }
  }
  return parts.join(" | ");
};
|
|
479
895
|
|
|
480
896
|
/**
 * Describes the variant side of the comparison, one entry per selected
 * dimension, joined with " | ". Returns an empty string when there is no
 * config or no dimension is selected.
 *
 * A selected dimension whose variant value is missing is labelled
 * "unchanged" (and an empty tool list "none") instead of rendering the
 * literal text "undefined".
 */
const getDimensionLabel = () => {
  if (!config || !config.dimensions || config.dimensions.length === 0)
    return "";
  const labels: string[] = [];
  for (const dim of config.dimensions) {
    switch (dim) {
      case "model":
        labels.push(`Model: ${config.variantModel || "unchanged"}`);
        break;
      case "system_prompt":
        labels.push("System Prompt: modified");
        break;
      case "tools": {
        const tools = config.variantTools;
        let toolsLabel = "unchanged";
        if (tools) {
          toolsLabel = tools.length === 0 ? "none" : tools.join(", ");
        }
        labels.push(`Tools: ${toolsLabel}`);
        break;
      }
    }
  }
  return labels.join(" | ");
};
|
|
915
|
+
|
|
916
|
+
/**
 * Lists the selected comparison dimensions as a comma-separated string with
 * underscores shown as spaces (e.g. "system_prompt" becomes "system prompt").
 * Returns an empty string when there is no config or no dimension is selected.
 */
const getDimensionsSummary = () => {
  if (!config || !config.dimensions || config.dimensions.length === 0)
    return "";
  // Global regex: a string pattern would only replace the first underscore.
  return config.dimensions.map((d) => d.replace(/_/g, " ")).join(", ");
};
|
|
493
921
|
|
|
494
922
|
return (
|
|
@@ -499,8 +927,7 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
499
927
|
<div>
|
|
500
928
|
<h2 className="text-lg font-semibold">A/B Comparison</h2>
|
|
501
929
|
<p className="text-sm text-muted-foreground">
|
|
502
|
-
Comparing: {
|
|
503
|
-
{getDimensionLabel()}
|
|
930
|
+
Comparing: {getDimensionsSummary()}
|
|
504
931
|
</p>
|
|
505
932
|
</div>
|
|
506
933
|
{!hasRun && (
|
|
@@ -592,27 +1019,34 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
592
1019
|
</div>
|
|
593
1020
|
)}
|
|
594
1021
|
</CardContent>
|
|
595
|
-
{/*
|
|
1022
|
+
{/* Session Analysis & Tool Calls */}
|
|
596
1023
|
{controlState.metrics && (
|
|
597
|
-
<div className="border-t p-3 shrink-0 bg-muted/50">
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
1024
|
+
<div className="border-t p-3 shrink-0 bg-muted/50 space-y-3">
|
|
1025
|
+
{/* Session Analysis */}
|
|
1026
|
+
<SessionAnalysisPanel
|
|
1027
|
+
analysis={controlAnalysis}
|
|
1028
|
+
isLoading={controlAnalysisLoading}
|
|
1029
|
+
isExpanded={analysisExpanded.control}
|
|
1030
|
+
onToggle={() =>
|
|
1031
|
+
setAnalysisExpanded((prev) => ({
|
|
1032
|
+
...prev,
|
|
1033
|
+
control: !prev.control,
|
|
1034
|
+
}))
|
|
1035
|
+
}
|
|
1036
|
+
accentColor="blue"
|
|
1037
|
+
/>
|
|
1038
|
+
{/* Tool Calls */}
|
|
1039
|
+
<ToolCallsPanel
|
|
1040
|
+
toolCalls={controlState.metrics.toolCalls}
|
|
1041
|
+
isExpanded={toolCallsExpanded.control}
|
|
1042
|
+
onToggle={() =>
|
|
1043
|
+
setToolCallsExpanded((prev) => ({
|
|
1044
|
+
...prev,
|
|
1045
|
+
control: !prev.control,
|
|
1046
|
+
}))
|
|
1047
|
+
}
|
|
1048
|
+
accentColor="blue"
|
|
1049
|
+
/>
|
|
616
1050
|
</div>
|
|
617
1051
|
)}
|
|
618
1052
|
</Card>
|
|
@@ -653,27 +1087,34 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
653
1087
|
</div>
|
|
654
1088
|
)}
|
|
655
1089
|
</CardContent>
|
|
656
|
-
{/*
|
|
1090
|
+
{/* Session Analysis & Tool Calls */}
|
|
657
1091
|
{variantState.metrics && (
|
|
658
|
-
<div className="border-t p-3 shrink-0 bg-muted/50">
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
1092
|
+
<div className="border-t p-3 shrink-0 bg-muted/50 space-y-3">
|
|
1093
|
+
{/* Session Analysis */}
|
|
1094
|
+
<SessionAnalysisPanel
|
|
1095
|
+
analysis={variantAnalysis}
|
|
1096
|
+
isLoading={variantAnalysisLoading}
|
|
1097
|
+
isExpanded={analysisExpanded.variant}
|
|
1098
|
+
onToggle={() =>
|
|
1099
|
+
setAnalysisExpanded((prev) => ({
|
|
1100
|
+
...prev,
|
|
1101
|
+
variant: !prev.variant,
|
|
1102
|
+
}))
|
|
1103
|
+
}
|
|
1104
|
+
accentColor="orange"
|
|
1105
|
+
/>
|
|
1106
|
+
{/* Tool Calls */}
|
|
1107
|
+
<ToolCallsPanel
|
|
1108
|
+
toolCalls={variantState.metrics.toolCalls}
|
|
1109
|
+
isExpanded={toolCallsExpanded.variant}
|
|
1110
|
+
onToggle={() =>
|
|
1111
|
+
setToolCallsExpanded((prev) => ({
|
|
1112
|
+
...prev,
|
|
1113
|
+
variant: !prev.variant,
|
|
1114
|
+
}))
|
|
1115
|
+
}
|
|
1116
|
+
accentColor="orange"
|
|
1117
|
+
/>
|
|
677
1118
|
</div>
|
|
678
1119
|
)}
|
|
679
1120
|
</Card>
|