@townco/debugger 0.1.31 → 0.1.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/src/components/ui/checkbox.tsx +42 -0
- package/src/pages/ComparisonView.tsx +761 -267
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@townco/debugger",
|
|
3
|
-
"version": "0.1.31",
|
|
3
|
+
"version": "0.1.32",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"engines": {
|
|
6
6
|
"bun": ">=1.3.0"
|
|
@@ -22,8 +22,8 @@
|
|
|
22
22
|
"@radix-ui/react-select": "^2.2.6",
|
|
23
23
|
"@radix-ui/react-slot": "^1.2.3",
|
|
24
24
|
"@radix-ui/react-tabs": "^1.1.0",
|
|
25
|
-
"@townco/otlp-server": "0.1.
|
|
26
|
-
"@townco/ui": "0.1.
|
|
25
|
+
"@townco/otlp-server": "0.1.32",
|
|
26
|
+
"@townco/ui": "0.1.77",
|
|
27
27
|
"bun-plugin-tailwind": "^0.1.2",
|
|
28
28
|
"class-variance-authority": "^0.7.1",
|
|
29
29
|
"clsx": "^2.1.1",
|
|
@@ -35,7 +35,7 @@
|
|
|
35
35
|
"zod": "^4.1.13"
|
|
36
36
|
},
|
|
37
37
|
"devDependencies": {
|
|
38
|
-
"@townco/tsconfig": "0.1.
|
|
38
|
+
"@townco/tsconfig": "0.1.74",
|
|
39
39
|
"@types/bun": "latest",
|
|
40
40
|
"@types/react": "^19",
|
|
41
41
|
"@types/react-dom": "^19",
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { Check } from "lucide-react";
|
|
2
|
+
import * as React from "react";
|
|
3
|
+
|
|
4
|
+
import { cn } from "@/lib/utils";
|
|
5
|
+
|
|
6
|
+
/**
 * Props accepted by {@link Checkbox}.
 *
 * Besides the controlled-checkbox props below, any native `<button>`
 * attribute (for example `aria-label`, `aria-labelledby`, `name`, `title`,
 * `data-*`) is forwarded to the underlying element. `type`, `role`,
 * `aria-checked` and `children` are owned by the component and cannot be
 * overridden.
 */
interface CheckboxProps
  extends Omit<
    React.ButtonHTMLAttributes<HTMLButtonElement>,
    "type" | "role" | "aria-checked" | "children"
  > {
  id?: string;
  /** Current checked state; the component is controlled and defaults to `false`. */
  checked?: boolean;
  /** Called with the negated `checked` value when the button is clicked. */
  onCheckedChange?: (checked: boolean) => void;
  disabled?: boolean;
  /** Extra class names merged after the built-in ones via `cn`. */
  className?: string;
}

/**
 * Controlled checkbox rendered as a `<button role="checkbox">`.
 *
 * The ref is forwarded to the button. A caller-supplied `onClick` runs first;
 * if it calls `event.preventDefault()`, `onCheckedChange` is not invoked.
 */
const Checkbox = React.forwardRef<HTMLButtonElement, CheckboxProps>(
  (
    {
      id,
      checked = false,
      onCheckedChange,
      disabled,
      className,
      onClick,
      ...buttonProps
    },
    ref,
  ) => {
    return (
      <button
        // Spread first so the attributes this component owns always win.
        {...buttonProps}
        ref={ref}
        type="button"
        role="checkbox"
        id={id}
        aria-checked={checked}
        disabled={disabled}
        onClick={(event) => {
          onClick?.(event);
          if (!event.defaultPrevented) {
            onCheckedChange?.(!checked);
          }
        }}
        className={cn(
          "peer h-4 w-4 shrink-0 rounded-sm border border-primary ring-offset-background focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50",
          checked && "bg-primary text-primary-foreground",
          className,
        )}
      >
        {checked && (
          <span className="flex items-center justify-center text-current">
            <Check className="h-3.5 w-3.5" />
          </span>
        )}
      </button>
    );
  },
);
Checkbox.displayName = "Checkbox";
|
|
41
|
+
|
|
42
|
+
export { Checkbox };
|
|
@@ -1,5 +1,12 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
1
|
+
import {
|
|
2
|
+
ChevronDown,
|
|
3
|
+
ChevronUp,
|
|
4
|
+
Loader2,
|
|
5
|
+
Play,
|
|
6
|
+
ToggleLeft,
|
|
7
|
+
ToggleRight,
|
|
8
|
+
} from "lucide-react";
|
|
9
|
+
import { useCallback, useEffect, useRef, useState } from "react";
|
|
3
10
|
import { Button } from "@/components/ui/button";
|
|
4
11
|
import {
|
|
5
12
|
Card,
|
|
@@ -8,6 +15,7 @@ import {
|
|
|
8
15
|
CardHeader,
|
|
9
16
|
CardTitle,
|
|
10
17
|
} from "@/components/ui/card";
|
|
18
|
+
import { Checkbox } from "@/components/ui/checkbox";
|
|
11
19
|
import type { SessionAnalysis } from "../analysis/types";
|
|
12
20
|
import { DebuggerLayout } from "../components/DebuggerLayout";
|
|
13
21
|
import { formatCost, formatDuration, formatTokens } from "../lib/metrics";
|
|
@@ -26,8 +34,17 @@ interface SessionState {
|
|
|
26
34
|
sessionId: string | null;
|
|
27
35
|
messages: ChatMessage[];
|
|
28
36
|
isStreaming: boolean;
|
|
37
|
+
isSending: boolean; // true while sending is in progress (before streaming starts)
|
|
29
38
|
metrics: SessionMetrics | null;
|
|
30
39
|
error: string | null;
|
|
40
|
+
autoRun: boolean;
|
|
41
|
+
turnIndex: number; // last completed user message index for this arm
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/** Progress of the multi-message replay queue. */
interface QueueState {
  /** Last completed turn (both arms finished). */
  currentIndex: number;
  /** Next user message ready to send. */
  stagedIndex: number;
  status: "idle" | "running" | "completed";
}
|
|
32
49
|
|
|
33
50
|
const AGENT_SERVER_URL =
|
|
@@ -41,12 +58,14 @@ function SessionAnalysisPanel({
|
|
|
41
58
|
isLoading,
|
|
42
59
|
isExpanded,
|
|
43
60
|
onToggle,
|
|
61
|
+
onRunAnalysis,
|
|
44
62
|
accentColor,
|
|
45
63
|
}: {
|
|
46
64
|
analysis: SessionAnalysis | null;
|
|
47
65
|
isLoading: boolean;
|
|
48
66
|
isExpanded: boolean;
|
|
49
67
|
onToggle: () => void;
|
|
68
|
+
onRunAnalysis: () => void;
|
|
50
69
|
accentColor: "blue" | "orange";
|
|
51
70
|
}) {
|
|
52
71
|
const colorClasses =
|
|
@@ -64,14 +83,31 @@ function SessionAnalysisPanel({
|
|
|
64
83
|
<div className={`border rounded-md p-3 ${colorClasses}`}>
|
|
65
84
|
<div className="flex items-center gap-2 text-xs text-muted-foreground">
|
|
66
85
|
<Loader2 className="w-3 h-3 animate-spin" />
|
|
67
|
-
|
|
86
|
+
Running analysis...
|
|
68
87
|
</div>
|
|
69
88
|
</div>
|
|
70
89
|
);
|
|
71
90
|
}
|
|
72
91
|
|
|
73
92
|
if (!analysis) {
|
|
74
|
-
return
|
|
93
|
+
return (
|
|
94
|
+
<div className={`border rounded-md p-3 ${colorClasses}`}>
|
|
95
|
+
<div className="flex items-center justify-between">
|
|
96
|
+
<span className="text-xs text-muted-foreground">
|
|
97
|
+
Session Analysis
|
|
98
|
+
</span>
|
|
99
|
+
<Button
|
|
100
|
+
size="sm"
|
|
101
|
+
variant="outline"
|
|
102
|
+
onClick={onRunAnalysis}
|
|
103
|
+
className="h-6 text-xs px-2"
|
|
104
|
+
>
|
|
105
|
+
<Play className="w-3 h-3 mr-1" />
|
|
106
|
+
Run Analysis
|
|
107
|
+
</Button>
|
|
108
|
+
</div>
|
|
109
|
+
</div>
|
|
110
|
+
);
|
|
75
111
|
}
|
|
76
112
|
|
|
77
113
|
return (
|
|
@@ -294,22 +330,62 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
294
330
|
const [loading, setLoading] = useState(true);
|
|
295
331
|
const [error, setError] = useState<string | null>(null);
|
|
296
332
|
|
|
333
|
+
// Ordered user messages taken from the source session
const [userMessages, setUserMessages] = useState<string[]>([]);
const [initialAutoRun, setInitialAutoRun] = useState(false);

// Replay queue: nothing completed yet (-1), first message (index 0) staged
const [queueState, setQueueState] = useState<QueueState>(() => ({
  currentIndex: -1,
  stagedIndex: 0,
  status: "idle",
}));

// Per-arm session state; both arms start from the same empty snapshot
const createIdleSession = (): SessionState => ({
  sessionId: null,
  messages: [],
  isStreaming: false,
  isSending: false,
  metrics: null,
  error: null,
  autoRun: false,
  turnIndex: -1,
});
const [controlState, setControlState] =
  useState<SessionState>(createIdleSession);
const [variantState, setVariantState] =
  useState<SessionState>(createIdleSession);

// Mirrors of the state above so stable callbacks can read the latest values
const controlStateRef = useRef(controlState);
const variantStateRef = useRef(variantState);
const queueStateRef = useRef(queueState);
const userMessagesRef = useRef(userMessages);

// Send locks live in their own refs: they are written synchronously, so a
// second send attempt is rejected before any re-render happens
const controlSendingRef = useRef(false);
const variantSendingRef = useRef(false);

// Keep each mirror ref in step with its state value after every change
useEffect(() => {
  controlStateRef.current = controlState;
}, [controlState]);

useEffect(() => {
  variantStateRef.current = variantState;
}, [variantState]);

useEffect(() => {
  queueStateRef.current = queueState;
}, [queueState]);

useEffect(() => {
  userMessagesRef.current = userMessages;
}, [userMessages]);
|
|
388
|
+
|
|
313
389
|
const [isRunning, setIsRunning] = useState(false);
|
|
314
390
|
const [hasRun, setHasRun] = useState(false);
|
|
315
391
|
|
|
@@ -335,45 +411,115 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
335
411
|
variant: false,
|
|
336
412
|
});
|
|
337
413
|
|
|
338
|
-
// Fetch comparison run details and restore saved messages
|
|
414
|
+
// Fetch comparison run details, conversation, and restore saved messages
|
|
339
415
|
useEffect(() => {
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
.then((
|
|
416
|
+
let runData: ComparisonRun;
|
|
417
|
+
|
|
418
|
+
fetch(`/api/comparison-run/${runId}`)
|
|
419
|
+
.then((res) => res.json())
|
|
420
|
+
.then(async (data) => {
|
|
421
|
+
runData = data;
|
|
344
422
|
setRun(runData);
|
|
345
423
|
|
|
346
|
-
//
|
|
424
|
+
// Fetch conversation from source session to get all user messages
|
|
425
|
+
const conversationRes = await fetch(
|
|
426
|
+
`/api/session-conversation?sessionId=${runData.sourceSessionId}`,
|
|
427
|
+
);
|
|
428
|
+
const conversation = await conversationRes.json();
|
|
429
|
+
|
|
430
|
+
// Extract user messages in order
|
|
431
|
+
const messages: string[] = [];
|
|
432
|
+
for (const trace of conversation) {
|
|
433
|
+
if (trace.userInput) {
|
|
434
|
+
messages.push(trace.userInput);
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
// If no messages found in conversation, fall back to firstUserMessage
|
|
439
|
+
if (messages.length === 0 && runData.firstUserMessage) {
|
|
440
|
+
messages.push(runData.firstUserMessage);
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
setUserMessages(messages);
|
|
444
|
+
|
|
445
|
+
// Restore saved messages if the run has been completed or running
|
|
347
446
|
if (runData.status === "completed" || runData.status === "running") {
|
|
348
447
|
setHasRun(true);
|
|
349
448
|
|
|
449
|
+
// Fetch full conversation history from control and variant sessions
|
|
450
|
+
const [controlConversation, variantConversation] = await Promise.all([
|
|
451
|
+
runData.controlSessionId
|
|
452
|
+
? fetch(
|
|
453
|
+
`/api/session-conversation?sessionId=${runData.controlSessionId}`,
|
|
454
|
+
).then((res) => res.json())
|
|
455
|
+
: Promise.resolve([]),
|
|
456
|
+
runData.variantSessionId
|
|
457
|
+
? fetch(
|
|
458
|
+
`/api/session-conversation?sessionId=${runData.variantSessionId}`,
|
|
459
|
+
).then((res) => res.json())
|
|
460
|
+
: Promise.resolve([]),
|
|
461
|
+
]);
|
|
462
|
+
|
|
463
|
+
// Convert traces to chat messages
|
|
464
|
+
const tracesToChatMessages = (
|
|
465
|
+
traces: Array<{ userInput?: string; llmOutput?: string }>,
|
|
466
|
+
): ChatMessage[] => {
|
|
467
|
+
const chatMessages: ChatMessage[] = [];
|
|
468
|
+
for (const trace of traces) {
|
|
469
|
+
if (trace.userInput) {
|
|
470
|
+
chatMessages.push({
|
|
471
|
+
role: "user" as const,
|
|
472
|
+
content: trace.userInput,
|
|
473
|
+
});
|
|
474
|
+
}
|
|
475
|
+
if (trace.llmOutput) {
|
|
476
|
+
chatMessages.push({
|
|
477
|
+
role: "assistant" as const,
|
|
478
|
+
content: trace.llmOutput,
|
|
479
|
+
});
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
return chatMessages;
|
|
483
|
+
};
|
|
484
|
+
|
|
350
485
|
// Restore control messages
|
|
351
|
-
if (runData.
|
|
352
|
-
|
|
486
|
+
if (runData.controlSessionId) {
|
|
487
|
+
const controlMessages = tracesToChatMessages(controlConversation);
|
|
488
|
+
setControlState((prev) => ({
|
|
489
|
+
...prev,
|
|
353
490
|
sessionId: runData.controlSessionId,
|
|
354
|
-
messages:
|
|
355
|
-
{ role: "user", content: runData.firstUserMessage },
|
|
356
|
-
{ role: "assistant", content: runData.controlResponse },
|
|
357
|
-
],
|
|
491
|
+
messages: controlMessages,
|
|
358
492
|
isStreaming: false,
|
|
359
493
|
metrics: runData.controlMetrics,
|
|
360
494
|
error: null,
|
|
361
|
-
|
|
495
|
+
turnIndex: Math.floor(controlMessages.length / 2) - 1,
|
|
496
|
+
}));
|
|
362
497
|
}
|
|
363
498
|
|
|
364
499
|
// Restore variant messages
|
|
365
|
-
if (runData.
|
|
366
|
-
|
|
500
|
+
if (runData.variantSessionId) {
|
|
501
|
+
const variantMessages = tracesToChatMessages(variantConversation);
|
|
502
|
+
setVariantState((prev) => ({
|
|
503
|
+
...prev,
|
|
367
504
|
sessionId: runData.variantSessionId,
|
|
368
|
-
messages:
|
|
369
|
-
{ role: "user", content: runData.firstUserMessage },
|
|
370
|
-
{ role: "assistant", content: runData.variantResponse },
|
|
371
|
-
],
|
|
505
|
+
messages: variantMessages,
|
|
372
506
|
isStreaming: false,
|
|
373
507
|
metrics: runData.variantMetrics,
|
|
374
508
|
error: null,
|
|
375
|
-
|
|
509
|
+
turnIndex: Math.floor(variantMessages.length / 2) - 1,
|
|
510
|
+
}));
|
|
376
511
|
}
|
|
512
|
+
|
|
513
|
+
// Set queue state based on completed messages
|
|
514
|
+
const completedTurns = Math.min(
|
|
515
|
+
Math.floor(tracesToChatMessages(controlConversation).length / 2),
|
|
516
|
+
Math.floor(tracesToChatMessages(variantConversation).length / 2),
|
|
517
|
+
);
|
|
518
|
+
setQueueState({
|
|
519
|
+
currentIndex: completedTurns - 1,
|
|
520
|
+
stagedIndex: completedTurns,
|
|
521
|
+
status: runData.status === "completed" ? "completed" : "running",
|
|
522
|
+
});
|
|
377
523
|
}
|
|
378
524
|
|
|
379
525
|
// Fetch the config by the run's configId (not the latest config!)
|
|
@@ -535,29 +681,372 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
535
681
|
return accumulatedContent;
|
|
536
682
|
};
|
|
537
683
|
|
|
538
|
-
//
|
|
684
|
+
/**
 * Polls `/api/session-metrics/:sessionId` until the reported numbers settle.
 *
 * A reading is treated as final when `totalTokens` is positive and both
 * `totalTokens` and `toolCallCount` equal the previous reading. Polling runs
 * every 2 s while the accumulated wait is within 60 s. Failed or non-OK
 * requests are ignored and retried on the next tick.
 *
 * @param sessionId Session whose metrics are requested.
 * @param model Model name sent as the `model` query parameter.
 * @param duration Value written to `durationMs` on the returned metrics.
 * @returns The settled metrics, otherwise the last successful reading, or
 *   all-zero metrics (with `durationMs` = `duration`) if none ever arrived.
 */
const fetchMetricsWithRetry = useCallback(
  async (
    sessionId: string,
    model: string,
    duration: number,
  ): Promise<SessionMetrics> => {
    const maxWaitMs = 60_000;
    const pollIntervalMs = 2_000;
    let elapsed = 0;
    let previousTokens = -1;
    let previousTools = -1;
    let lastMetrics: SessionMetrics | null = null;

    while (elapsed <= maxWaitMs) {
      try {
        const metricsRes = await fetch(
          `/api/session-metrics/${sessionId}?model=${encodeURIComponent(model)}`,
        );
        // An error payload must never be recorded as a metrics reading.
        if (!metricsRes.ok) {
          throw new Error(
            `Metrics request failed with status ${metricsRes.status}`,
          );
        }
        const metrics = await metricsRes.json();
        const snapshot: SessionMetrics = { ...metrics, durationMs: duration };
        lastMetrics = snapshot;

        // Normalize before comparing so a missing counter compares equal to
        // the 0 stored on the previous pass instead of never stabilizing.
        const totalTokens: number = metrics.totalTokens ?? 0;
        const toolCallCount: number = metrics.toolCallCount ?? 0;

        // If tokens/tool calls stopped changing and we have data, treat as final.
        if (
          totalTokens > 0 &&
          totalTokens === previousTokens &&
          toolCallCount === previousTools
        ) {
          return snapshot;
        }

        previousTokens = totalTokens;
        previousTools = toolCallCount;
      } catch {
        // Best-effort polling: ignore this attempt and retry on the next tick.
      }

      await new Promise((r) => setTimeout(r, pollIntervalMs));
      elapsed += pollIntervalMs;
    }

    // Return whatever we last saw (or zeros if nothing ever arrived)
    return (
      lastMetrics ?? {
        durationMs: duration,
        inputTokens: 0,
        outputTokens: 0,
        totalTokens: 0,
        estimatedCost: 0,
        toolCallCount: 0,
      }
    );
  },
  [],
);
|
|
739
|
+
|
|
740
|
+
/**
 * Sends one user message to a single arm and mirrors the reply into that
 * arm's state.
 *
 * Steps: append the user message and set `isStreaming`; on every content
 * update from `sendMessageAndCollect`, replace the trailing assistant message
 * (or append one if the last message is not from the assistant); once the
 * call resolves, fetch metrics and record `turnIndex = messageIndex`.
 *
 * @param sessionId Session the message is sent to.
 * @param message User message text.
 * @param messageIndex Index stored as the arm's `turnIndex` on success.
 * @param model Model name forwarded to the metrics lookup.
 * @param arm Which arm's state to update.
 * @param startTime Epoch milliseconds used to compute the duration.
 * @returns The collected response and metrics. On failure the error message
 *   is stored on the arm's state and an empty response with all-zero metrics
 *   is returned; this function does not rethrow.
 */
const sendMessageToArm = useCallback(
  async (
    sessionId: string,
    message: string,
    messageIndex: number,
    model: string,
    arm: "control" | "variant",
    startTime: number,
  ): Promise<{ response: string; metrics: SessionMetrics }> => {
    const setState = arm === "control" ? setControlState : setVariantState;

    try {
      // Add user message and set streaming
      setState((prev) => ({
        ...prev,
        isStreaming: true,
        messages: [...prev.messages, { role: "user", content: message }],
      }));

      const response = await sendMessageAndCollect(
        sessionId,
        message,
        (content) => {
          setState((prev) => {
            // Overwrite the in-progress assistant message, or start one
            const messages = [...prev.messages];
            const lastMsg = messages[messages.length - 1];
            if (lastMsg && lastMsg.role === "assistant") {
              messages[messages.length - 1] = {
                role: "assistant",
                content,
              };
            } else {
              messages.push({ role: "assistant", content });
            }
            return { ...prev, messages };
          });
        },
      );

      const duration = Date.now() - startTime;
      const metrics = await fetchMetricsWithRetry(sessionId, model, duration);

      setState((prev) => ({
        ...prev,
        isStreaming: false,
        turnIndex: messageIndex,
        metrics,
        error: null,
      }));

      return { response, metrics };
    } catch (err) {
      // turnIndex is deliberately left untouched: the turn did not complete
      setState((prev) => ({
        ...prev,
        isStreaming: false,
        error: err instanceof Error ? err.message : "Unknown error",
      }));
      return {
        response: "",
        metrics: {
          durationMs: 0,
          inputTokens: 0,
          outputTokens: 0,
          totalTokens: 0,
          estimatedCost: 0,
          toolCallCount: 0,
        },
      };
    }
  },
  [fetchMetricsWithRetry],
);
|
|
817
|
+
|
|
818
|
+
/**
 * Sends the currently staged user message to one arm.
 *
 * Reads the latest arm, queue and message values from refs. Does nothing when
 * the arm's send lock is held, the arm has no session or is streaming, or no
 * message is staged. The lock and `isSending` flag are always cleared when
 * the send finishes.
 */
const sendStagedToArm = useCallback(
  async (arm: "control" | "variant") => {
    const isControl = arm === "control";
    const armState = isControl
      ? controlStateRef.current
      : variantStateRef.current;
    const updateArm = isControl ? setControlState : setVariantState;
    const lock = isControl ? controlSendingRef : variantSendingRef;
    const { stagedIndex } = queueStateRef.current;
    const pending = userMessagesRef.current;

    // The lock ref is checked first: it changes synchronously, so it catches
    // duplicate sends that state alone would miss
    if (lock.current) return;
    if (!armState.sessionId || armState.isStreaming) return;
    if (stagedIndex >= pending.length) return;

    const text = pending[stagedIndex];
    if (!text) return;

    // Take the lock immediately (synchronously) before any await
    lock.current = true;
    updateArm((prev) => ({ ...prev, isSending: true }));

    // The variant prefers its own model, then the control model; both arms
    // end at the same default
    const model =
      (isControl ? undefined : config?.variantModel) ||
      config?.controlModel ||
      "claude-sonnet-4-5-20250929";

    try {
      await sendMessageToArm(
        armState.sessionId,
        text,
        stagedIndex,
        model,
        arm,
        Date.now(),
      );
    } finally {
      // Release the lock whether or not the send succeeded
      lock.current = false;
      updateArm((prev) => ({ ...prev, isSending: false }));
    }
  },
  [config, sendMessageToArm],
);
|
|
865
|
+
|
|
866
|
+
/**
 * Sends the staged message to control and variant in parallel, but only when
 * both arms have a session and neither is streaming or holding a send lock.
 */
const sendStagedToBoth = useCallback(async () => {
  const arms = [controlStateRef.current, variantStateRef.current];

  const missingSession = arms.some((armState) => !armState.sessionId);
  if (missingSession) return;

  const streaming = arms.some((armState) => armState.isStreaming);
  if (streaming) return;

  const locked = controlSendingRef.current || variantSendingRef.current;
  if (locked) return;

  await Promise.all([sendStagedToArm("control"), sendStagedToArm("variant")]);
}, [sendStagedToArm]);
|
|
877
|
+
|
|
878
|
+
// Check and advance queue after both arms complete a turn.
// When the completed turn was the last user message, the queue is marked
// "completed" and the final metrics and last assistant responses are posted
// to the run's update endpoint; otherwise the next message is staged.
useEffect(() => {
  // Both arms must have completed the same turn and not be in the middle of sending
  if (
    controlState.isStreaming ||
    variantState.isStreaming ||
    controlState.isSending ||
    variantState.isSending ||
    queueState.status !== "running"
  )
    return;
  if (controlState.turnIndex !== variantState.turnIndex) return;

  const completedIndex = controlState.turnIndex;

  // Nothing new has completed since the queue was last advanced
  if (completedIndex <= queueState.currentIndex) return;

  const nextIndex = completedIndex + 1;

  if (nextIndex < userMessages.length) {
    // Stage next message
    setQueueState((prev) => ({
      ...prev,
      currentIndex: completedIndex,
      stagedIndex: nextIndex,
    }));
    return;
  }

  // All messages completed
  setQueueState((prev) => ({
    ...prev,
    currentIndex: completedIndex,
    status: "completed",
  }));
  setIsRunning(false);

  // Persist final state
  if (run && controlState.sessionId && variantState.sessionId) {
    // Last response = content of the final message, if the assistant wrote it
    const controlMsgs = controlState.messages;
    const variantMsgs = variantState.messages;
    const lastControlResponse =
      controlMsgs[controlMsgs.length - 1]?.role === "assistant"
        ? controlMsgs[controlMsgs.length - 1]?.content
        : "";
    const lastVariantResponse =
      variantMsgs[variantMsgs.length - 1]?.role === "assistant"
        ? variantMsgs[variantMsgs.length - 1]?.content
        : "";

    // Fire-and-forget, but a failed save must not become an unhandled rejection
    void fetch(`/api/comparison-run/${runId}/update`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        status: "completed",
        controlMetrics: controlState.metrics,
        variantMetrics: variantState.metrics,
        controlResponse: lastControlResponse,
        variantResponse: lastVariantResponse,
      }),
    }).catch((err: unknown) => {
      console.error("Failed to persist comparison run results", err);
    });
  }
}, [
  controlState.isStreaming,
  controlState.isSending,
  controlState.turnIndex,
  controlState.messages,
  controlState.metrics,
  controlState.sessionId,
  variantState.isStreaming,
  variantState.isSending,
  variantState.turnIndex,
  variantState.messages,
  variantState.metrics,
  variantState.sessionId,
  queueState.status,
  queueState.currentIndex,
  userMessages.length,
  run,
  runId,
]);
|
|
960
|
+
|
|
961
|
+
// Auto-send: while the queue is running and a message is staged, send it to
// every arm that has auto-run on, is idle, has a session, and whose last
// completed turn matches the queue's current index.
useEffect(() => {
  const queueIsRunning = queueState.status === "running";
  const hasStagedSlot = queueState.stagedIndex < userMessages.length;
  if (!queueIsRunning || !hasStagedSlot) return;

  // Skip empty or missing entries
  if (!userMessages[queueState.stagedIndex]) return;

  const controlReady =
    controlState.autoRun &&
    !controlState.isStreaming &&
    !controlState.isSending &&
    controlState.sessionId &&
    controlState.turnIndex === queueState.currentIndex;

  const variantReady =
    variantState.autoRun &&
    !variantState.isStreaming &&
    !variantState.isSending &&
    variantState.sessionId &&
    variantState.turnIndex === queueState.currentIndex;

  if (controlReady) {
    sendStagedToArm("control");
  }
  if (variantReady) {
    sendStagedToArm("variant");
  }
}, [
  queueState.status,
  queueState.stagedIndex,
  queueState.currentIndex,
  userMessages,
  controlState.autoRun,
  controlState.isStreaming,
  controlState.isSending,
  controlState.sessionId,
  controlState.turnIndex,
  variantState.autoRun,
  variantState.isStreaming,
  variantState.isSending,
  variantState.sessionId,
  variantState.turnIndex,
  sendStagedToArm,
]);
|
|
1007
|
+
|
|
1008
|
+
/** Flips the `autoRun` flag of the given arm. */
const toggleAutoRun = useCallback((arm: "control" | "variant") => {
  const flipAutoRun = (prev: SessionState): SessionState => ({
    ...prev,
    autoRun: !prev.autoRun,
  });

  if (arm === "control") {
    setControlState(flipAutoRun);
  } else {
    setVariantState(flipAutoRun);
  }
}, []);
|
|
1013
|
+
|
|
1014
|
+
// Start the comparison (initialize sessions, first message sent by auto-send effect)
|
|
539
1015
|
const runComparison = useCallback(async () => {
|
|
540
|
-
if (!run || !config) return;
|
|
1016
|
+
if (!run || !config || userMessages.length === 0) return;
|
|
541
1017
|
|
|
542
1018
|
setIsRunning(true);
|
|
543
1019
|
setHasRun(true);
|
|
544
1020
|
|
|
545
|
-
|
|
1021
|
+
// Reset sending refs
|
|
1022
|
+
controlSendingRef.current = false;
|
|
1023
|
+
variantSendingRef.current = false;
|
|
546
1024
|
|
|
547
|
-
// Reset states
|
|
1025
|
+
// Reset states with initial autoRun setting
|
|
548
1026
|
setControlState({
|
|
549
1027
|
sessionId: null,
|
|
550
|
-
messages: [
|
|
551
|
-
isStreaming:
|
|
1028
|
+
messages: [],
|
|
1029
|
+
isStreaming: false,
|
|
1030
|
+
isSending: false,
|
|
552
1031
|
metrics: null,
|
|
553
1032
|
error: null,
|
|
1033
|
+
autoRun: initialAutoRun,
|
|
1034
|
+
turnIndex: -1,
|
|
554
1035
|
});
|
|
555
1036
|
setVariantState({
|
|
556
1037
|
sessionId: null,
|
|
557
|
-
messages: [
|
|
558
|
-
isStreaming:
|
|
1038
|
+
messages: [],
|
|
1039
|
+
isStreaming: false,
|
|
1040
|
+
isSending: false,
|
|
559
1041
|
metrics: null,
|
|
560
1042
|
error: null,
|
|
1043
|
+
autoRun: initialAutoRun,
|
|
1044
|
+
turnIndex: -1,
|
|
1045
|
+
});
|
|
1046
|
+
setQueueState({
|
|
1047
|
+
currentIndex: -1,
|
|
1048
|
+
stagedIndex: 0,
|
|
1049
|
+
status: "running",
|
|
561
1050
|
});
|
|
562
1051
|
|
|
563
1052
|
try {
|
|
@@ -594,179 +1083,13 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
594
1083
|
}),
|
|
595
1084
|
});
|
|
596
1085
|
|
|
597
|
-
//
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
// Track final responses and metrics
|
|
601
|
-
let finalControlMetrics: SessionMetrics = {
|
|
602
|
-
durationMs: 0,
|
|
603
|
-
inputTokens: 0,
|
|
604
|
-
outputTokens: 0,
|
|
605
|
-
totalTokens: 0,
|
|
606
|
-
estimatedCost: 0,
|
|
607
|
-
toolCallCount: 0,
|
|
608
|
-
};
|
|
609
|
-
let finalVariantMetrics: SessionMetrics = {
|
|
610
|
-
durationMs: 0,
|
|
611
|
-
inputTokens: 0,
|
|
612
|
-
outputTokens: 0,
|
|
613
|
-
totalTokens: 0,
|
|
614
|
-
estimatedCost: 0,
|
|
615
|
-
toolCallCount: 0,
|
|
616
|
-
};
|
|
617
|
-
|
|
618
|
-
// Helper to run a session and fetch metrics
|
|
619
|
-
const runSession = async (
|
|
620
|
-
sessionId: string,
|
|
621
|
-
model: string,
|
|
622
|
-
setState: typeof setControlState,
|
|
623
|
-
onContentUpdate: (content: string) => void,
|
|
624
|
-
): Promise<{ response: string; metrics: SessionMetrics }> => {
|
|
625
|
-
try {
|
|
626
|
-
const response = await sendMessageAndCollect(
|
|
627
|
-
sessionId,
|
|
628
|
-
firstMessage,
|
|
629
|
-
onContentUpdate,
|
|
630
|
-
);
|
|
631
|
-
|
|
632
|
-
const duration = Date.now() - startTime;
|
|
633
|
-
|
|
634
|
-
// Poll metrics until they stabilize or we hit a max wait window.
|
|
635
|
-
const fetchMetricsWithRetry = async (): Promise<SessionMetrics> => {
|
|
636
|
-
const maxWaitMs = 60_000;
|
|
637
|
-
const pollIntervalMs = 2_000;
|
|
638
|
-
let elapsed = 0;
|
|
639
|
-
let previousTokens = -1;
|
|
640
|
-
let previousTools = -1;
|
|
641
|
-
let lastMetrics: SessionMetrics | null = null;
|
|
642
|
-
|
|
643
|
-
while (elapsed <= maxWaitMs) {
|
|
644
|
-
try {
|
|
645
|
-
const metricsRes = await fetch(
|
|
646
|
-
`/api/session-metrics/${sessionId}?model=${encodeURIComponent(model)}`,
|
|
647
|
-
);
|
|
648
|
-
const metrics = await metricsRes.json();
|
|
649
|
-
lastMetrics = { ...metrics, durationMs: duration };
|
|
650
|
-
|
|
651
|
-
// If tokens/tool calls stopped changing and we have data, treat as final.
|
|
652
|
-
if (
|
|
653
|
-
metrics.totalTokens > 0 &&
|
|
654
|
-
metrics.totalTokens === previousTokens &&
|
|
655
|
-
metrics.toolCallCount === previousTools
|
|
656
|
-
) {
|
|
657
|
-
return lastMetrics!;
|
|
658
|
-
}
|
|
659
|
-
|
|
660
|
-
previousTokens = metrics.totalTokens ?? 0;
|
|
661
|
-
previousTools = metrics.toolCallCount ?? 0;
|
|
662
|
-
} catch {
|
|
663
|
-
// swallow and retry
|
|
664
|
-
}
|
|
665
|
-
|
|
666
|
-
await new Promise((r) => setTimeout(r, pollIntervalMs));
|
|
667
|
-
elapsed += pollIntervalMs;
|
|
668
|
-
}
|
|
669
|
-
|
|
670
|
-
// Return whatever we last saw (or zeros if nothing ever arrived)
|
|
671
|
-
return (
|
|
672
|
-
lastMetrics ?? {
|
|
673
|
-
durationMs: duration,
|
|
674
|
-
inputTokens: 0,
|
|
675
|
-
outputTokens: 0,
|
|
676
|
-
totalTokens: 0,
|
|
677
|
-
estimatedCost: 0,
|
|
678
|
-
toolCallCount: 0,
|
|
679
|
-
}
|
|
680
|
-
);
|
|
681
|
-
};
|
|
682
|
-
|
|
683
|
-
const metrics = await fetchMetricsWithRetry();
|
|
684
|
-
|
|
685
|
-
setState((prev) => ({
|
|
686
|
-
...prev,
|
|
687
|
-
isStreaming: false,
|
|
688
|
-
metrics,
|
|
689
|
-
}));
|
|
690
|
-
|
|
691
|
-
return { response, metrics };
|
|
692
|
-
} catch (err) {
|
|
693
|
-
setState((prev) => ({
|
|
694
|
-
...prev,
|
|
695
|
-
isStreaming: false,
|
|
696
|
-
error: err instanceof Error ? err.message : "Unknown error",
|
|
697
|
-
}));
|
|
698
|
-
return {
|
|
699
|
-
response: "",
|
|
700
|
-
metrics: {
|
|
701
|
-
durationMs: 0,
|
|
702
|
-
inputTokens: 0,
|
|
703
|
-
outputTokens: 0,
|
|
704
|
-
totalTokens: 0,
|
|
705
|
-
estimatedCost: 0,
|
|
706
|
-
toolCallCount: 0,
|
|
707
|
-
},
|
|
708
|
-
};
|
|
709
|
-
}
|
|
710
|
-
};
|
|
711
|
-
|
|
712
|
-
const controlModel = config.controlModel || "claude-sonnet-4-5-20250929";
|
|
713
|
-
const variantModel =
|
|
714
|
-
config.variantModel ||
|
|
715
|
-
config.controlModel ||
|
|
716
|
-
"claude-sonnet-4-5-20250929";
|
|
717
|
-
|
|
718
|
-
const [controlResult, variantResult] = await Promise.all([
|
|
719
|
-
runSession(
|
|
720
|
-
controlSessionId,
|
|
721
|
-
controlModel,
|
|
722
|
-
setControlState,
|
|
723
|
-
(content) => {
|
|
724
|
-
setControlState((prev) => ({
|
|
725
|
-
...prev,
|
|
726
|
-
messages: [
|
|
727
|
-
{ role: "user", content: firstMessage },
|
|
728
|
-
{ role: "assistant", content },
|
|
729
|
-
],
|
|
730
|
-
}));
|
|
731
|
-
},
|
|
732
|
-
),
|
|
733
|
-
runSession(
|
|
734
|
-
variantSessionId,
|
|
735
|
-
variantModel,
|
|
736
|
-
setVariantState,
|
|
737
|
-
(content) => {
|
|
738
|
-
setVariantState((prev) => ({
|
|
739
|
-
...prev,
|
|
740
|
-
messages: [
|
|
741
|
-
{ role: "user", content: firstMessage },
|
|
742
|
-
{ role: "assistant", content },
|
|
743
|
-
],
|
|
744
|
-
}));
|
|
745
|
-
},
|
|
746
|
-
),
|
|
747
|
-
]);
|
|
748
|
-
|
|
749
|
-
finalControlMetrics = controlResult.metrics;
|
|
750
|
-
finalVariantMetrics = variantResult.metrics;
|
|
751
|
-
|
|
752
|
-
// Update run status with responses and metrics
|
|
753
|
-
await fetch(`/api/comparison-run/${runId}/update`, {
|
|
754
|
-
method: "POST",
|
|
755
|
-
headers: { "Content-Type": "application/json" },
|
|
756
|
-
body: JSON.stringify({
|
|
757
|
-
status: "completed",
|
|
758
|
-
controlMetrics: finalControlMetrics,
|
|
759
|
-
variantMetrics: finalVariantMetrics,
|
|
760
|
-
controlResponse: controlResult.response,
|
|
761
|
-
variantResponse: variantResult.response,
|
|
762
|
-
}),
|
|
763
|
-
});
|
|
1086
|
+
// Don't send first message here - let the auto-send effect handle it
|
|
1087
|
+
// This ensures all messages go through the same code path and prevents duplicates
|
|
764
1088
|
} catch (err) {
|
|
765
1089
|
setError(err instanceof Error ? err.message : "Failed to run comparison");
|
|
766
|
-
} finally {
|
|
767
1090
|
setIsRunning(false);
|
|
768
1091
|
}
|
|
769
|
-
}, [run, config, runId]);
|
|
1092
|
+
}, [run, config, userMessages, initialAutoRun, runId, createSession]);
|
|
770
1093
|
|
|
771
1094
|
// Function to fetch existing or trigger new session analysis
|
|
772
1095
|
const triggerAnalysis = useCallback(
|
|
@@ -812,47 +1135,6 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
812
1135
|
[],
|
|
813
1136
|
);
|
|
814
1137
|
|
|
815
|
-
// Auto-trigger analysis when sessions complete
|
|
816
|
-
useEffect(() => {
|
|
817
|
-
// Control session completed
|
|
818
|
-
if (
|
|
819
|
-
controlState.sessionId &&
|
|
820
|
-
!controlState.isStreaming &&
|
|
821
|
-
controlState.metrics &&
|
|
822
|
-
!controlAnalysis &&
|
|
823
|
-
!controlAnalysisLoading
|
|
824
|
-
) {
|
|
825
|
-
triggerAnalysis(controlState.sessionId, "control");
|
|
826
|
-
}
|
|
827
|
-
}, [
|
|
828
|
-
controlState.sessionId,
|
|
829
|
-
controlState.isStreaming,
|
|
830
|
-
controlState.metrics,
|
|
831
|
-
controlAnalysis,
|
|
832
|
-
controlAnalysisLoading,
|
|
833
|
-
triggerAnalysis,
|
|
834
|
-
]);
|
|
835
|
-
|
|
836
|
-
useEffect(() => {
|
|
837
|
-
// Variant session completed
|
|
838
|
-
if (
|
|
839
|
-
variantState.sessionId &&
|
|
840
|
-
!variantState.isStreaming &&
|
|
841
|
-
variantState.metrics &&
|
|
842
|
-
!variantAnalysis &&
|
|
843
|
-
!variantAnalysisLoading
|
|
844
|
-
) {
|
|
845
|
-
triggerAnalysis(variantState.sessionId, "variant");
|
|
846
|
-
}
|
|
847
|
-
}, [
|
|
848
|
-
variantState.sessionId,
|
|
849
|
-
variantState.isStreaming,
|
|
850
|
-
variantState.metrics,
|
|
851
|
-
variantAnalysis,
|
|
852
|
-
variantAnalysisLoading,
|
|
853
|
-
triggerAnalysis,
|
|
854
|
-
]);
|
|
855
|
-
|
|
856
1138
|
if (loading) {
|
|
857
1139
|
return (
|
|
858
1140
|
<DebuggerLayout title="Comparison" showBackButton backHref="/town-hall">
|
|
@@ -924,37 +1206,159 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
924
1206
|
<div className="container mx-auto p-4 h-[calc(100vh-4rem)] flex flex-col overflow-hidden">
|
|
925
1207
|
{/* Header */}
|
|
926
1208
|
<div className="flex items-center justify-between mb-4">
|
|
927
|
-
<div>
|
|
928
|
-
<
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
1209
|
+
<div className="flex items-center gap-3">
|
|
1210
|
+
<div>
|
|
1211
|
+
<h2 className="text-lg font-semibold">A/B Comparison</h2>
|
|
1212
|
+
<p className="text-sm text-muted-foreground">
|
|
1213
|
+
Comparing: {getDimensionsSummary()}
|
|
1214
|
+
</p>
|
|
1215
|
+
</div>
|
|
1216
|
+
{/* Message count badge when running */}
|
|
1217
|
+
{hasRun && userMessages.length > 1 && (
|
|
1218
|
+
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full bg-muted text-sm">
|
|
1219
|
+
<span className="font-medium">
|
|
1220
|
+
{queueState.currentIndex + 1}/{userMessages.length}
|
|
1221
|
+
</span>
|
|
1222
|
+
<span className="text-muted-foreground">messages</span>
|
|
1223
|
+
{queueState.status === "completed" && (
|
|
1224
|
+
<span className="text-green-600 dark:text-green-400 text-xs">
|
|
1225
|
+
Complete
|
|
1226
|
+
</span>
|
|
1227
|
+
)}
|
|
1228
|
+
</div>
|
|
1229
|
+
)}
|
|
932
1230
|
</div>
|
|
933
1231
|
{!hasRun && (
|
|
934
|
-
<Button
|
|
935
|
-
{
|
|
1232
|
+
<Button
|
|
1233
|
+
onClick={runComparison}
|
|
1234
|
+
disabled={isRunning || userMessages.length === 0}
|
|
1235
|
+
>
|
|
1236
|
+
{isRunning ? "Running..." : "Start Comparison"}
|
|
936
1237
|
</Button>
|
|
937
1238
|
)}
|
|
938
1239
|
</div>
|
|
939
1240
|
|
|
1241
|
+
{/* Queue Banner - shown when there's a staged message waiting */}
|
|
1242
|
+
{hasRun &&
|
|
1243
|
+
queueState.status === "running" &&
|
|
1244
|
+
queueState.stagedIndex > queueState.currentIndex &&
|
|
1245
|
+
queueState.stagedIndex < userMessages.length &&
|
|
1246
|
+
!controlState.isStreaming &&
|
|
1247
|
+
!variantState.isStreaming && (
|
|
1248
|
+
<div className="mb-4 p-3 rounded-lg border bg-muted/50 flex items-center gap-4">
|
|
1249
|
+
<div className="flex-1">
|
|
1250
|
+
<div className="text-xs font-medium text-muted-foreground mb-1">
|
|
1251
|
+
Next message ready (#{queueState.stagedIndex + 1})
|
|
1252
|
+
</div>
|
|
1253
|
+
<div className="text-sm truncate">
|
|
1254
|
+
{userMessages[queueState.stagedIndex]?.slice(0, 100)}
|
|
1255
|
+
{(userMessages[queueState.stagedIndex]?.length ?? 0) > 100
|
|
1256
|
+
? "..."
|
|
1257
|
+
: ""}
|
|
1258
|
+
</div>
|
|
1259
|
+
</div>
|
|
1260
|
+
<div className="flex items-center gap-2 shrink-0">
|
|
1261
|
+
{/* Per-arm send buttons when that arm is not auto-running */}
|
|
1262
|
+
{!controlState.autoRun &&
|
|
1263
|
+
controlState.turnIndex === queueState.currentIndex && (
|
|
1264
|
+
<Button
|
|
1265
|
+
size="sm"
|
|
1266
|
+
variant="outline"
|
|
1267
|
+
onClick={() => sendStagedToArm("control")}
|
|
1268
|
+
className="text-blue-600 border-blue-300 hover:bg-blue-50 dark:text-blue-400 dark:border-blue-700 dark:hover:bg-blue-950"
|
|
1269
|
+
>
|
|
1270
|
+
<Play className="w-3 h-3 mr-1" />
|
|
1271
|
+
Control
|
|
1272
|
+
</Button>
|
|
1273
|
+
)}
|
|
1274
|
+
{!variantState.autoRun &&
|
|
1275
|
+
variantState.turnIndex === queueState.currentIndex && (
|
|
1276
|
+
<Button
|
|
1277
|
+
size="sm"
|
|
1278
|
+
variant="outline"
|
|
1279
|
+
onClick={() => sendStagedToArm("variant")}
|
|
1280
|
+
className="text-orange-600 border-orange-300 hover:bg-orange-50 dark:text-orange-400 dark:border-orange-700 dark:hover:bg-orange-950"
|
|
1281
|
+
>
|
|
1282
|
+
<Play className="w-3 h-3 mr-1" />
|
|
1283
|
+
Variant
|
|
1284
|
+
</Button>
|
|
1285
|
+
)}
|
|
1286
|
+
{/* Send to both button */}
|
|
1287
|
+
{!controlState.autoRun &&
|
|
1288
|
+
!variantState.autoRun &&
|
|
1289
|
+
controlState.turnIndex === queueState.currentIndex &&
|
|
1290
|
+
variantState.turnIndex === queueState.currentIndex && (
|
|
1291
|
+
<Button size="sm" onClick={sendStagedToBoth}>
|
|
1292
|
+
<Play className="w-3 h-3 mr-1" />
|
|
1293
|
+
Send to Both
|
|
1294
|
+
</Button>
|
|
1295
|
+
)}
|
|
1296
|
+
</div>
|
|
1297
|
+
</div>
|
|
1298
|
+
)}
|
|
1299
|
+
|
|
940
1300
|
{/* Pre-run state */}
|
|
941
1301
|
{!hasRun && (
|
|
942
1302
|
<div className="flex-1 flex items-center justify-center">
|
|
943
|
-
<Card className="max-w-
|
|
1303
|
+
<Card className="max-w-lg w-full">
|
|
944
1304
|
<CardHeader className="text-center">
|
|
945
1305
|
<CardTitle>Ready to Compare</CardTitle>
|
|
946
1306
|
<CardDescription>
|
|
947
|
-
This comparison will
|
|
948
|
-
|
|
1307
|
+
This comparison will replay {userMessages.length} user message
|
|
1308
|
+
{userMessages.length !== 1 ? "s" : ""} to both configurations
|
|
1309
|
+
and display the results side by side.
|
|
949
1310
|
</CardDescription>
|
|
950
1311
|
</CardHeader>
|
|
951
1312
|
<CardContent className="space-y-4">
|
|
952
|
-
|
|
1313
|
+
{/* User messages list */}
|
|
1314
|
+
<div className="bg-muted rounded-lg p-4 max-h-64 overflow-y-auto">
|
|
953
1315
|
<div className="text-xs font-medium uppercase text-muted-foreground mb-2">
|
|
954
|
-
|
|
1316
|
+
User Messages ({userMessages.length})
|
|
1317
|
+
</div>
|
|
1318
|
+
<div className="space-y-2">
|
|
1319
|
+
{userMessages.map((msg, idx) => (
|
|
1320
|
+
<details key={idx} className="group">
|
|
1321
|
+
<summary className="text-sm cursor-pointer flex items-center gap-2 hover:text-foreground">
|
|
1322
|
+
<span className="text-xs font-mono text-muted-foreground w-5">
|
|
1323
|
+
{idx + 1}.
|
|
1324
|
+
</span>
|
|
1325
|
+
<span className="truncate flex-1">
|
|
1326
|
+
{msg.slice(0, 80)}
|
|
1327
|
+
{msg.length > 80 ? "..." : ""}
|
|
1328
|
+
</span>
|
|
1329
|
+
<ChevronDown className="w-3 h-3 text-muted-foreground group-open:rotate-180 transition-transform" />
|
|
1330
|
+
</summary>
|
|
1331
|
+
<div className="mt-2 ml-7 text-sm whitespace-pre-wrap bg-background/50 rounded p-2 text-muted-foreground">
|
|
1332
|
+
{msg}
|
|
1333
|
+
</div>
|
|
1334
|
+
</details>
|
|
1335
|
+
))}
|
|
955
1336
|
</div>
|
|
956
|
-
<div className="text-sm">{run?.firstUserMessage}</div>
|
|
957
1337
|
</div>
|
|
1338
|
+
|
|
1339
|
+
{/* Auto-run checkbox */}
|
|
1340
|
+
{userMessages.length > 1 && (
|
|
1341
|
+
<div className="flex items-center gap-3 p-3 rounded-lg border bg-background">
|
|
1342
|
+
<Checkbox
|
|
1343
|
+
id="auto-run"
|
|
1344
|
+
checked={initialAutoRun}
|
|
1345
|
+
onCheckedChange={(checked) => setInitialAutoRun(checked)}
|
|
1346
|
+
/>
|
|
1347
|
+
<div className="flex-1">
|
|
1348
|
+
<label
|
|
1349
|
+
htmlFor="auto-run"
|
|
1350
|
+
className="text-sm font-medium cursor-pointer"
|
|
1351
|
+
>
|
|
1352
|
+
Auto run all messages
|
|
1353
|
+
</label>
|
|
1354
|
+
<p className="text-xs text-muted-foreground">
|
|
1355
|
+
If off, next messages are enqueued after each turn.
|
|
1356
|
+
</p>
|
|
1357
|
+
</div>
|
|
1358
|
+
</div>
|
|
1359
|
+
)}
|
|
1360
|
+
|
|
1361
|
+
{/* Control vs Variant labels */}
|
|
958
1362
|
<div className="grid grid-cols-2 gap-4 text-sm">
|
|
959
1363
|
<div className="space-y-1">
|
|
960
1364
|
<div className="flex items-center gap-2">
|
|
@@ -986,10 +1390,36 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
986
1390
|
{/* Control */}
|
|
987
1391
|
<Card className="flex flex-col h-full min-h-0 overflow-hidden">
|
|
988
1392
|
<CardHeader className="py-3 border-b shrink-0">
|
|
989
|
-
<
|
|
990
|
-
<
|
|
991
|
-
|
|
992
|
-
|
|
1393
|
+
<div className="flex items-center justify-between">
|
|
1394
|
+
<CardTitle className="text-sm flex items-center gap-2">
|
|
1395
|
+
<span className="w-2 h-2 rounded-full bg-blue-500" />
|
|
1396
|
+
Control (Original)
|
|
1397
|
+
{controlState.isStreaming && (
|
|
1398
|
+
<Loader2 className="w-3 h-3 animate-spin text-blue-500" />
|
|
1399
|
+
)}
|
|
1400
|
+
</CardTitle>
|
|
1401
|
+
{/* Auto-run toggle for Control */}
|
|
1402
|
+
{userMessages.length > 1 &&
|
|
1403
|
+
queueState.status === "running" && (
|
|
1404
|
+
<button
|
|
1405
|
+
type="button"
|
|
1406
|
+
onClick={() => toggleAutoRun("control")}
|
|
1407
|
+
className="flex items-center gap-1.5 text-xs text-muted-foreground hover:text-foreground transition-colors"
|
|
1408
|
+
title={
|
|
1409
|
+
controlState.autoRun
|
|
1410
|
+
? "Disable auto-run"
|
|
1411
|
+
: "Enable auto-run"
|
|
1412
|
+
}
|
|
1413
|
+
>
|
|
1414
|
+
{controlState.autoRun ? (
|
|
1415
|
+
<ToggleRight className="w-4 h-4 text-blue-500" />
|
|
1416
|
+
) : (
|
|
1417
|
+
<ToggleLeft className="w-4 h-4" />
|
|
1418
|
+
)}
|
|
1419
|
+
<span>Auto</span>
|
|
1420
|
+
</button>
|
|
1421
|
+
)}
|
|
1422
|
+
</div>
|
|
993
1423
|
<CardDescription className="text-xs">
|
|
994
1424
|
{getControlDimensionLabel()}
|
|
995
1425
|
</CardDescription>
|
|
@@ -1014,8 +1444,23 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
1014
1444
|
</div>
|
|
1015
1445
|
))}
|
|
1016
1446
|
{controlState.error && (
|
|
1017
|
-
<div className="
|
|
1018
|
-
|
|
1447
|
+
<div className="p-3 rounded-lg border border-red-200 bg-red-50 dark:border-red-800 dark:bg-red-950/30">
|
|
1448
|
+
<div className="text-red-600 dark:text-red-400 text-sm mb-2">
|
|
1449
|
+
Error: {controlState.error}
|
|
1450
|
+
</div>
|
|
1451
|
+
{queueState.status === "running" && (
|
|
1452
|
+
<Button
|
|
1453
|
+
size="sm"
|
|
1454
|
+
variant="outline"
|
|
1455
|
+
onClick={() => {
|
|
1456
|
+
setControlState((prev) => ({ ...prev, error: null }));
|
|
1457
|
+
sendStagedToArm("control");
|
|
1458
|
+
}}
|
|
1459
|
+
className="text-red-600 border-red-300 hover:bg-red-100 dark:text-red-400"
|
|
1460
|
+
>
|
|
1461
|
+
Retry
|
|
1462
|
+
</Button>
|
|
1463
|
+
)}
|
|
1019
1464
|
</div>
|
|
1020
1465
|
)}
|
|
1021
1466
|
</CardContent>
|
|
@@ -1033,6 +1478,10 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
1033
1478
|
control: !prev.control,
|
|
1034
1479
|
}))
|
|
1035
1480
|
}
|
|
1481
|
+
onRunAnalysis={() =>
|
|
1482
|
+
controlState.sessionId &&
|
|
1483
|
+
triggerAnalysis(controlState.sessionId, "control")
|
|
1484
|
+
}
|
|
1036
1485
|
accentColor="blue"
|
|
1037
1486
|
/>
|
|
1038
1487
|
{/* Tool Calls */}
|
|
@@ -1054,10 +1503,36 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
1054
1503
|
{/* Variant */}
|
|
1055
1504
|
<Card className="flex flex-col h-full min-h-0 overflow-hidden">
|
|
1056
1505
|
<CardHeader className="py-3 border-b shrink-0">
|
|
1057
|
-
<
|
|
1058
|
-
<
|
|
1059
|
-
|
|
1060
|
-
|
|
1506
|
+
<div className="flex items-center justify-between">
|
|
1507
|
+
<CardTitle className="text-sm flex items-center gap-2">
|
|
1508
|
+
<span className="w-2 h-2 rounded-full bg-orange-500" />
|
|
1509
|
+
Variant
|
|
1510
|
+
{variantState.isStreaming && (
|
|
1511
|
+
<Loader2 className="w-3 h-3 animate-spin text-orange-500" />
|
|
1512
|
+
)}
|
|
1513
|
+
</CardTitle>
|
|
1514
|
+
{/* Auto-run toggle for Variant */}
|
|
1515
|
+
{userMessages.length > 1 &&
|
|
1516
|
+
queueState.status === "running" && (
|
|
1517
|
+
<button
|
|
1518
|
+
type="button"
|
|
1519
|
+
onClick={() => toggleAutoRun("variant")}
|
|
1520
|
+
className="flex items-center gap-1.5 text-xs text-muted-foreground hover:text-foreground transition-colors"
|
|
1521
|
+
title={
|
|
1522
|
+
variantState.autoRun
|
|
1523
|
+
? "Disable auto-run"
|
|
1524
|
+
: "Enable auto-run"
|
|
1525
|
+
}
|
|
1526
|
+
>
|
|
1527
|
+
{variantState.autoRun ? (
|
|
1528
|
+
<ToggleRight className="w-4 h-4 text-orange-500" />
|
|
1529
|
+
) : (
|
|
1530
|
+
<ToggleLeft className="w-4 h-4" />
|
|
1531
|
+
)}
|
|
1532
|
+
<span>Auto</span>
|
|
1533
|
+
</button>
|
|
1534
|
+
)}
|
|
1535
|
+
</div>
|
|
1061
1536
|
<CardDescription className="text-xs">
|
|
1062
1537
|
{getDimensionLabel()}
|
|
1063
1538
|
</CardDescription>
|
|
@@ -1082,8 +1557,23 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
1082
1557
|
</div>
|
|
1083
1558
|
))}
|
|
1084
1559
|
{variantState.error && (
|
|
1085
|
-
<div className="
|
|
1086
|
-
|
|
1560
|
+
<div className="p-3 rounded-lg border border-red-200 bg-red-50 dark:border-red-800 dark:bg-red-950/30">
|
|
1561
|
+
<div className="text-red-600 dark:text-red-400 text-sm mb-2">
|
|
1562
|
+
Error: {variantState.error}
|
|
1563
|
+
</div>
|
|
1564
|
+
{queueState.status === "running" && (
|
|
1565
|
+
<Button
|
|
1566
|
+
size="sm"
|
|
1567
|
+
variant="outline"
|
|
1568
|
+
onClick={() => {
|
|
1569
|
+
setVariantState((prev) => ({ ...prev, error: null }));
|
|
1570
|
+
sendStagedToArm("variant");
|
|
1571
|
+
}}
|
|
1572
|
+
className="text-red-600 border-red-300 hover:bg-red-100 dark:text-red-400"
|
|
1573
|
+
>
|
|
1574
|
+
Retry
|
|
1575
|
+
</Button>
|
|
1576
|
+
)}
|
|
1087
1577
|
</div>
|
|
1088
1578
|
)}
|
|
1089
1579
|
</CardContent>
|
|
@@ -1101,6 +1591,10 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
|
|
|
1101
1591
|
variant: !prev.variant,
|
|
1102
1592
|
}))
|
|
1103
1593
|
}
|
|
1594
|
+
onRunAnalysis={() =>
|
|
1595
|
+
variantState.sessionId &&
|
|
1596
|
+
triggerAnalysis(variantState.sessionId, "variant")
|
|
1597
|
+
}
|
|
1104
1598
|
accentColor="orange"
|
|
1105
1599
|
/>
|
|
1106
1600
|
{/* Tool Calls */}
|