@townco/debugger 0.1.31 → 0.1.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@townco/debugger",
3
- "version": "0.1.31",
3
+ "version": "0.1.32",
4
4
  "type": "module",
5
5
  "engines": {
6
6
  "bun": ">=1.3.0"
@@ -22,8 +22,8 @@
22
22
  "@radix-ui/react-select": "^2.2.6",
23
23
  "@radix-ui/react-slot": "^1.2.3",
24
24
  "@radix-ui/react-tabs": "^1.1.0",
25
- "@townco/otlp-server": "0.1.31",
26
- "@townco/ui": "0.1.76",
25
+ "@townco/otlp-server": "0.1.32",
26
+ "@townco/ui": "0.1.77",
27
27
  "bun-plugin-tailwind": "^0.1.2",
28
28
  "class-variance-authority": "^0.7.1",
29
29
  "clsx": "^2.1.1",
@@ -35,7 +35,7 @@
35
35
  "zod": "^4.1.13"
36
36
  },
37
37
  "devDependencies": {
38
- "@townco/tsconfig": "0.1.73",
38
+ "@townco/tsconfig": "0.1.74",
39
39
  "@types/bun": "latest",
40
40
  "@types/react": "^19",
41
41
  "@types/react-dom": "^19",
@@ -0,0 +1,42 @@
1
+ import { Check } from "lucide-react";
2
+ import * as React from "react";
3
+
4
+ import { cn } from "@/lib/utils";
5
+
6
+ interface CheckboxProps {
7
+ id?: string;
8
+ checked?: boolean;
9
+ onCheckedChange?: (checked: boolean) => void;
10
+ disabled?: boolean;
11
+ className?: string;
12
+ }
13
+
14
+ const Checkbox = React.forwardRef<HTMLButtonElement, CheckboxProps>(
15
+ ({ id, checked = false, onCheckedChange, disabled, className }, ref) => {
16
+ return (
17
+ <button
18
+ ref={ref}
19
+ type="button"
20
+ role="checkbox"
21
+ id={id}
22
+ aria-checked={checked}
23
+ disabled={disabled}
24
+ onClick={() => onCheckedChange?.(!checked)}
25
+ className={cn(
26
+ "peer h-4 w-4 shrink-0 rounded-sm border border-primary ring-offset-background focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50",
27
+ checked && "bg-primary text-primary-foreground",
28
+ className,
29
+ )}
30
+ >
31
+ {checked && (
32
+ <span className="flex items-center justify-center text-current">
33
+ <Check className="h-3.5 w-3.5" />
34
+ </span>
35
+ )}
36
+ </button>
37
+ );
38
+ },
39
+ );
40
+ Checkbox.displayName = "Checkbox";
41
+
42
+ export { Checkbox };
@@ -1,5 +1,12 @@
1
- import { ChevronDown, ChevronUp, Loader2 } from "lucide-react";
2
- import { useCallback, useEffect, useState } from "react";
1
+ import {
2
+ ChevronDown,
3
+ ChevronUp,
4
+ Loader2,
5
+ Play,
6
+ ToggleLeft,
7
+ ToggleRight,
8
+ } from "lucide-react";
9
+ import { useCallback, useEffect, useRef, useState } from "react";
3
10
  import { Button } from "@/components/ui/button";
4
11
  import {
5
12
  Card,
@@ -8,6 +15,7 @@ import {
8
15
  CardHeader,
9
16
  CardTitle,
10
17
  } from "@/components/ui/card";
18
+ import { Checkbox } from "@/components/ui/checkbox";
11
19
  import type { SessionAnalysis } from "../analysis/types";
12
20
  import { DebuggerLayout } from "../components/DebuggerLayout";
13
21
  import { formatCost, formatDuration, formatTokens } from "../lib/metrics";
@@ -26,8 +34,17 @@ interface SessionState {
26
34
  sessionId: string | null;
27
35
  messages: ChatMessage[];
28
36
  isStreaming: boolean;
37
+ isSending: boolean; // true while sending is in progress (before streaming starts)
29
38
  metrics: SessionMetrics | null;
30
39
  error: string | null;
40
+ autoRun: boolean;
41
+ turnIndex: number; // last completed user message index for this arm
42
+ }
43
+
44
+ interface QueueState {
45
+ currentIndex: number; // last completed turn (both arms finished)
46
+ stagedIndex: number; // next user message ready to send
47
+ status: "idle" | "running" | "completed";
31
48
  }
32
49
 
33
50
  const AGENT_SERVER_URL =
@@ -41,12 +58,14 @@ function SessionAnalysisPanel({
41
58
  isLoading,
42
59
  isExpanded,
43
60
  onToggle,
61
+ onRunAnalysis,
44
62
  accentColor,
45
63
  }: {
46
64
  analysis: SessionAnalysis | null;
47
65
  isLoading: boolean;
48
66
  isExpanded: boolean;
49
67
  onToggle: () => void;
68
+ onRunAnalysis: () => void;
50
69
  accentColor: "blue" | "orange";
51
70
  }) {
52
71
  const colorClasses =
@@ -64,14 +83,31 @@ function SessionAnalysisPanel({
64
83
  <div className={`border rounded-md p-3 ${colorClasses}`}>
65
84
  <div className="flex items-center gap-2 text-xs text-muted-foreground">
66
85
  <Loader2 className="w-3 h-3 animate-spin" />
67
- Loading analysis...
86
+ Running analysis...
68
87
  </div>
69
88
  </div>
70
89
  );
71
90
  }
72
91
 
73
92
  if (!analysis) {
74
- return null;
93
+ return (
94
+ <div className={`border rounded-md p-3 ${colorClasses}`}>
95
+ <div className="flex items-center justify-between">
96
+ <span className="text-xs text-muted-foreground">
97
+ Session Analysis
98
+ </span>
99
+ <Button
100
+ size="sm"
101
+ variant="outline"
102
+ onClick={onRunAnalysis}
103
+ className="h-6 text-xs px-2"
104
+ >
105
+ <Play className="w-3 h-3 mr-1" />
106
+ Run Analysis
107
+ </Button>
108
+ </div>
109
+ </div>
110
+ );
75
111
  }
76
112
 
77
113
  return (
@@ -294,22 +330,62 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
294
330
  const [loading, setLoading] = useState(true);
295
331
  const [error, setError] = useState<string | null>(null);
296
332
 
333
+ // User messages from source session
334
+ const [userMessages, setUserMessages] = useState<string[]>([]);
335
+ const [initialAutoRun, setInitialAutoRun] = useState(false);
336
+
337
+ // Queue state for multi-message replay
338
+ const [queueState, setQueueState] = useState<QueueState>({
339
+ currentIndex: -1,
340
+ stagedIndex: 0,
341
+ status: "idle",
342
+ });
343
+
297
344
  // Session states
298
345
  const [controlState, setControlState] = useState<SessionState>({
299
346
  sessionId: null,
300
347
  messages: [],
301
348
  isStreaming: false,
349
+ isSending: false,
302
350
  metrics: null,
303
351
  error: null,
352
+ autoRun: false,
353
+ turnIndex: -1,
304
354
  });
305
355
  const [variantState, setVariantState] = useState<SessionState>({
306
356
  sessionId: null,
307
357
  messages: [],
308
358
  isStreaming: false,
359
+ isSending: false,
309
360
  metrics: null,
310
361
  error: null,
362
+ autoRun: false,
363
+ turnIndex: -1,
311
364
  });
312
365
 
366
+ // Refs for stable callbacks
367
+ const controlStateRef = useRef(controlState);
368
+ const variantStateRef = useRef(variantState);
369
+ const queueStateRef = useRef(queueState);
370
+ const userMessagesRef = useRef(userMessages);
371
+
372
+ // Separate refs for send locks - these update synchronously to prevent race conditions
373
+ const controlSendingRef = useRef(false);
374
+ const variantSendingRef = useRef(false);
375
+
376
+ useEffect(() => {
377
+ controlStateRef.current = controlState;
378
+ }, [controlState]);
379
+ useEffect(() => {
380
+ variantStateRef.current = variantState;
381
+ }, [variantState]);
382
+ useEffect(() => {
383
+ queueStateRef.current = queueState;
384
+ }, [queueState]);
385
+ useEffect(() => {
386
+ userMessagesRef.current = userMessages;
387
+ }, [userMessages]);
388
+
313
389
  const [isRunning, setIsRunning] = useState(false);
314
390
  const [hasRun, setHasRun] = useState(false);
315
391
 
@@ -335,45 +411,115 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
335
411
  variant: false,
336
412
  });
337
413
 
338
- // Fetch comparison run details and restore saved messages
414
+ // Fetch comparison run details, conversation, and restore saved messages
339
415
  useEffect(() => {
340
- Promise.all([
341
- fetch(`/api/comparison-run/${runId}`).then((res) => res.json()),
342
- ])
343
- .then(([runData]) => {
416
+ let runData: ComparisonRun;
417
+
418
+ fetch(`/api/comparison-run/${runId}`)
419
+ .then((res) => res.json())
420
+ .then(async (data) => {
421
+ runData = data;
344
422
  setRun(runData);
345
423
 
346
- // Restore saved messages if the run has been completed
424
+ // Fetch conversation from source session to get all user messages
425
+ const conversationRes = await fetch(
426
+ `/api/session-conversation?sessionId=${runData.sourceSessionId}`,
427
+ );
428
+ const conversation = await conversationRes.json();
429
+
430
+ // Extract user messages in order
431
+ const messages: string[] = [];
432
+ for (const trace of conversation) {
433
+ if (trace.userInput) {
434
+ messages.push(trace.userInput);
435
+ }
436
+ }
437
+
438
+ // If no messages found in conversation, fall back to firstUserMessage
439
+ if (messages.length === 0 && runData.firstUserMessage) {
440
+ messages.push(runData.firstUserMessage);
441
+ }
442
+
443
+ setUserMessages(messages);
444
+
445
+ // Restore saved messages if the run has been completed or running
347
446
  if (runData.status === "completed" || runData.status === "running") {
348
447
  setHasRun(true);
349
448
 
449
+ // Fetch full conversation history from control and variant sessions
450
+ const [controlConversation, variantConversation] = await Promise.all([
451
+ runData.controlSessionId
452
+ ? fetch(
453
+ `/api/session-conversation?sessionId=${runData.controlSessionId}`,
454
+ ).then((res) => res.json())
455
+ : Promise.resolve([]),
456
+ runData.variantSessionId
457
+ ? fetch(
458
+ `/api/session-conversation?sessionId=${runData.variantSessionId}`,
459
+ ).then((res) => res.json())
460
+ : Promise.resolve([]),
461
+ ]);
462
+
463
+ // Convert traces to chat messages
464
+ const tracesToChatMessages = (
465
+ traces: Array<{ userInput?: string; llmOutput?: string }>,
466
+ ): ChatMessage[] => {
467
+ const chatMessages: ChatMessage[] = [];
468
+ for (const trace of traces) {
469
+ if (trace.userInput) {
470
+ chatMessages.push({
471
+ role: "user" as const,
472
+ content: trace.userInput,
473
+ });
474
+ }
475
+ if (trace.llmOutput) {
476
+ chatMessages.push({
477
+ role: "assistant" as const,
478
+ content: trace.llmOutput,
479
+ });
480
+ }
481
+ }
482
+ return chatMessages;
483
+ };
484
+
350
485
  // Restore control messages
351
- if (runData.controlResponse) {
352
- setControlState({
486
+ if (runData.controlSessionId) {
487
+ const controlMessages = tracesToChatMessages(controlConversation);
488
+ setControlState((prev) => ({
489
+ ...prev,
353
490
  sessionId: runData.controlSessionId,
354
- messages: [
355
- { role: "user", content: runData.firstUserMessage },
356
- { role: "assistant", content: runData.controlResponse },
357
- ],
491
+ messages: controlMessages,
358
492
  isStreaming: false,
359
493
  metrics: runData.controlMetrics,
360
494
  error: null,
361
- });
495
+ turnIndex: Math.floor(controlMessages.length / 2) - 1,
496
+ }));
362
497
  }
363
498
 
364
499
  // Restore variant messages
365
- if (runData.variantResponse) {
366
- setVariantState({
500
+ if (runData.variantSessionId) {
501
+ const variantMessages = tracesToChatMessages(variantConversation);
502
+ setVariantState((prev) => ({
503
+ ...prev,
367
504
  sessionId: runData.variantSessionId,
368
- messages: [
369
- { role: "user", content: runData.firstUserMessage },
370
- { role: "assistant", content: runData.variantResponse },
371
- ],
505
+ messages: variantMessages,
372
506
  isStreaming: false,
373
507
  metrics: runData.variantMetrics,
374
508
  error: null,
375
- });
509
+ turnIndex: Math.floor(variantMessages.length / 2) - 1,
510
+ }));
376
511
  }
512
+
513
+ // Set queue state based on completed messages
514
+ const completedTurns = Math.min(
515
+ Math.floor(tracesToChatMessages(controlConversation).length / 2),
516
+ Math.floor(tracesToChatMessages(variantConversation).length / 2),
517
+ );
518
+ setQueueState({
519
+ currentIndex: completedTurns - 1,
520
+ stagedIndex: completedTurns,
521
+ status: runData.status === "completed" ? "completed" : "running",
522
+ });
377
523
  }
378
524
 
379
525
  // Fetch the config by the run's configId (not the latest config!)
@@ -535,29 +681,372 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
535
681
  return accumulatedContent;
536
682
  };
537
683
 
538
- // Run the comparison
684
+ // Helper to fetch metrics with retry
685
+ const fetchMetricsWithRetry = useCallback(
686
+ async (
687
+ sessionId: string,
688
+ model: string,
689
+ duration: number,
690
+ ): Promise<SessionMetrics> => {
691
+ const maxWaitMs = 60_000;
692
+ const pollIntervalMs = 2_000;
693
+ let elapsed = 0;
694
+ let previousTokens = -1;
695
+ let previousTools = -1;
696
+ let lastMetrics: SessionMetrics | null = null;
697
+
698
+ while (elapsed <= maxWaitMs) {
699
+ try {
700
+ const metricsRes = await fetch(
701
+ `/api/session-metrics/${sessionId}?model=${encodeURIComponent(model)}`,
702
+ );
703
+ const metrics = await metricsRes.json();
704
+ lastMetrics = { ...metrics, durationMs: duration };
705
+
706
+ // If tokens/tool calls stopped changing and we have data, treat as final.
707
+ if (
708
+ metrics.totalTokens > 0 &&
709
+ metrics.totalTokens === previousTokens &&
710
+ metrics.toolCallCount === previousTools
711
+ ) {
712
+ return lastMetrics!;
713
+ }
714
+
715
+ previousTokens = metrics.totalTokens ?? 0;
716
+ previousTools = metrics.toolCallCount ?? 0;
717
+ } catch {
718
+ // swallow and retry
719
+ }
720
+
721
+ await new Promise((r) => setTimeout(r, pollIntervalMs));
722
+ elapsed += pollIntervalMs;
723
+ }
724
+
725
+ // Return whatever we last saw (or zeros if nothing ever arrived)
726
+ return (
727
+ lastMetrics ?? {
728
+ durationMs: duration,
729
+ inputTokens: 0,
730
+ outputTokens: 0,
731
+ totalTokens: 0,
732
+ estimatedCost: 0,
733
+ toolCallCount: 0,
734
+ }
735
+ );
736
+ },
737
+ [],
738
+ );
739
+
740
+ // Send a single message to one arm and handle the response
741
+ const sendMessageToArm = useCallback(
742
+ async (
743
+ sessionId: string,
744
+ message: string,
745
+ messageIndex: number,
746
+ model: string,
747
+ arm: "control" | "variant",
748
+ startTime: number,
749
+ ): Promise<{ response: string; metrics: SessionMetrics }> => {
750
+ const setState = arm === "control" ? setControlState : setVariantState;
751
+
752
+ try {
753
+ // Add user message and set streaming
754
+ setState((prev) => ({
755
+ ...prev,
756
+ isStreaming: true,
757
+ messages: [...prev.messages, { role: "user", content: message }],
758
+ }));
759
+
760
+ let accumulatedContent = "";
761
+
762
+ const response = await sendMessageAndCollect(
763
+ sessionId,
764
+ message,
765
+ (content) => {
766
+ accumulatedContent = content;
767
+ setState((prev) => {
768
+ // Find the last assistant message or add one
769
+ const messages = [...prev.messages];
770
+ const lastMsg = messages[messages.length - 1];
771
+ if (lastMsg && lastMsg.role === "assistant") {
772
+ messages[messages.length - 1] = {
773
+ role: "assistant",
774
+ content,
775
+ };
776
+ } else {
777
+ messages.push({ role: "assistant", content });
778
+ }
779
+ return { ...prev, messages };
780
+ });
781
+ },
782
+ );
783
+
784
+ const duration = Date.now() - startTime;
785
+ const metrics = await fetchMetricsWithRetry(sessionId, model, duration);
786
+
787
+ setState((prev) => ({
788
+ ...prev,
789
+ isStreaming: false,
790
+ turnIndex: messageIndex,
791
+ metrics,
792
+ error: null,
793
+ }));
794
+
795
+ return { response, metrics };
796
+ } catch (err) {
797
+ setState((prev) => ({
798
+ ...prev,
799
+ isStreaming: false,
800
+ error: err instanceof Error ? err.message : "Unknown error",
801
+ }));
802
+ return {
803
+ response: "",
804
+ metrics: {
805
+ durationMs: 0,
806
+ inputTokens: 0,
807
+ outputTokens: 0,
808
+ totalTokens: 0,
809
+ estimatedCost: 0,
810
+ toolCallCount: 0,
811
+ },
812
+ };
813
+ }
814
+ },
815
+ [fetchMetricsWithRetry],
816
+ );
817
+
818
+ // Send staged message to a specific arm
819
+ const sendStagedToArm = useCallback(
820
+ async (arm: "control" | "variant") => {
821
+ const state =
822
+ arm === "control" ? controlStateRef.current : variantStateRef.current;
823
+ const setState = arm === "control" ? setControlState : setVariantState;
824
+ const sendingRef =
825
+ arm === "control" ? controlSendingRef : variantSendingRef;
826
+ const queue = queueStateRef.current;
827
+ const messages = userMessagesRef.current;
828
+
829
+ // Check the synchronous ref first to prevent duplicate sends
830
+ if (sendingRef.current) return;
831
+ if (!state.sessionId || state.isStreaming) return;
832
+ if (queue.stagedIndex >= messages.length) return;
833
+
834
+ const message = messages[queue.stagedIndex];
835
+ if (!message) return;
836
+
837
+ // Set sending lock immediately (synchronously) to prevent race conditions
838
+ sendingRef.current = true;
839
+ setState((prev) => ({ ...prev, isSending: true }));
840
+
841
+ const model =
842
+ arm === "control"
843
+ ? config?.controlModel || "claude-sonnet-4-5-20250929"
844
+ : config?.variantModel ||
845
+ config?.controlModel ||
846
+ "claude-sonnet-4-5-20250929";
847
+
848
+ try {
849
+ await sendMessageToArm(
850
+ state.sessionId,
851
+ message,
852
+ queue.stagedIndex,
853
+ model,
854
+ arm,
855
+ Date.now(),
856
+ );
857
+ } finally {
858
+ // Clear sending lock after completion
859
+ sendingRef.current = false;
860
+ setState((prev) => ({ ...prev, isSending: false }));
861
+ }
862
+ },
863
+ [config, sendMessageToArm],
864
+ );
865
+
866
+ // Send staged message to both arms
867
+ const sendStagedToBoth = useCallback(async () => {
868
+ const control = controlStateRef.current;
869
+ const variant = variantStateRef.current;
870
+
871
+ if (!control.sessionId || !variant.sessionId) return;
872
+ if (control.isStreaming || variant.isStreaming) return;
873
+ if (controlSendingRef.current || variantSendingRef.current) return;
874
+
875
+ await Promise.all([sendStagedToArm("control"), sendStagedToArm("variant")]);
876
+ }, [sendStagedToArm]);
877
+
878
+ // Check and advance queue after both arms complete a turn
879
+ useEffect(() => {
880
+ // Both arms must have completed the same turn and not be in the middle of sending
881
+ if (
882
+ controlState.isStreaming ||
883
+ variantState.isStreaming ||
884
+ controlState.isSending ||
885
+ variantState.isSending ||
886
+ queueState.status !== "running"
887
+ )
888
+ return;
889
+ if (controlState.turnIndex !== variantState.turnIndex) return;
890
+
891
+ const completedIndex = controlState.turnIndex;
892
+
893
+ // Advance currentIndex if both completed
894
+ if (completedIndex > queueState.currentIndex) {
895
+ const nextIndex = completedIndex + 1;
896
+
897
+ if (nextIndex >= userMessages.length) {
898
+ // All messages completed
899
+ setQueueState((prev) => ({
900
+ ...prev,
901
+ currentIndex: completedIndex,
902
+ status: "completed",
903
+ }));
904
+ setIsRunning(false);
905
+
906
+ // Persist final state
907
+ if (run && controlState.sessionId && variantState.sessionId) {
908
+ // Get last responses from messages
909
+ const controlMsgs = controlState.messages;
910
+ const variantMsgs = variantState.messages;
911
+ const lastControlResponse =
912
+ controlMsgs[controlMsgs.length - 1]?.role === "assistant"
913
+ ? controlMsgs[controlMsgs.length - 1]?.content
914
+ : "";
915
+ const lastVariantResponse =
916
+ variantMsgs[variantMsgs.length - 1]?.role === "assistant"
917
+ ? variantMsgs[variantMsgs.length - 1]?.content
918
+ : "";
919
+
920
+ fetch(`/api/comparison-run/${runId}/update`, {
921
+ method: "POST",
922
+ headers: { "Content-Type": "application/json" },
923
+ body: JSON.stringify({
924
+ status: "completed",
925
+ controlMetrics: controlState.metrics,
926
+ variantMetrics: variantState.metrics,
927
+ controlResponse: lastControlResponse,
928
+ variantResponse: lastVariantResponse,
929
+ }),
930
+ });
931
+ }
932
+ } else {
933
+ // Stage next message
934
+ setQueueState((prev) => ({
935
+ ...prev,
936
+ currentIndex: completedIndex,
937
+ stagedIndex: nextIndex,
938
+ }));
939
+ }
940
+ }
941
+ }, [
942
+ controlState.isStreaming,
943
+ controlState.isSending,
944
+ controlState.turnIndex,
945
+ controlState.messages,
946
+ controlState.metrics,
947
+ controlState.sessionId,
948
+ variantState.isStreaming,
949
+ variantState.isSending,
950
+ variantState.turnIndex,
951
+ variantState.messages,
952
+ variantState.metrics,
953
+ variantState.sessionId,
954
+ queueState.status,
955
+ queueState.currentIndex,
956
+ userMessages.length,
957
+ run,
958
+ runId,
959
+ ]);
960
+
961
+ // Auto-send staged message when conditions are met
962
+ useEffect(() => {
963
+ if (queueState.status !== "running") return;
964
+ if (queueState.stagedIndex >= userMessages.length) return;
965
+
966
+ const message = userMessages[queueState.stagedIndex];
967
+ if (!message) return;
968
+
969
+ // Check if control should auto-send
970
+ if (
971
+ controlState.autoRun &&
972
+ !controlState.isStreaming &&
973
+ !controlState.isSending &&
974
+ controlState.sessionId &&
975
+ controlState.turnIndex === queueState.currentIndex
976
+ ) {
977
+ sendStagedToArm("control");
978
+ }
979
+
980
+ // Check if variant should auto-send
981
+ if (
982
+ variantState.autoRun &&
983
+ !variantState.isStreaming &&
984
+ !variantState.isSending &&
985
+ variantState.sessionId &&
986
+ variantState.turnIndex === queueState.currentIndex
987
+ ) {
988
+ sendStagedToArm("variant");
989
+ }
990
+ }, [
991
+ queueState.status,
992
+ queueState.stagedIndex,
993
+ queueState.currentIndex,
994
+ userMessages,
995
+ controlState.autoRun,
996
+ controlState.isStreaming,
997
+ controlState.isSending,
998
+ controlState.sessionId,
999
+ controlState.turnIndex,
1000
+ variantState.autoRun,
1001
+ variantState.isStreaming,
1002
+ variantState.isSending,
1003
+ variantState.sessionId,
1004
+ variantState.turnIndex,
1005
+ sendStagedToArm,
1006
+ ]);
1007
+
1008
+ // Toggle auto-run for an arm
1009
+ const toggleAutoRun = useCallback((arm: "control" | "variant") => {
1010
+ const setState = arm === "control" ? setControlState : setVariantState;
1011
+ setState((prev) => ({ ...prev, autoRun: !prev.autoRun }));
1012
+ }, []);
1013
+
1014
+ // Start the comparison (initialize sessions, first message sent by auto-send effect)
539
1015
  const runComparison = useCallback(async () => {
540
- if (!run || !config) return;
1016
+ if (!run || !config || userMessages.length === 0) return;
541
1017
 
542
1018
  setIsRunning(true);
543
1019
  setHasRun(true);
544
1020
 
545
- const firstMessage = run.firstUserMessage;
1021
+ // Reset sending refs
1022
+ controlSendingRef.current = false;
1023
+ variantSendingRef.current = false;
546
1024
 
547
- // Reset states
1025
+ // Reset states with initial autoRun setting
548
1026
  setControlState({
549
1027
  sessionId: null,
550
- messages: [{ role: "user", content: firstMessage }],
551
- isStreaming: true,
1028
+ messages: [],
1029
+ isStreaming: false,
1030
+ isSending: false,
552
1031
  metrics: null,
553
1032
  error: null,
1033
+ autoRun: initialAutoRun,
1034
+ turnIndex: -1,
554
1035
  });
555
1036
  setVariantState({
556
1037
  sessionId: null,
557
- messages: [{ role: "user", content: firstMessage }],
558
- isStreaming: true,
1038
+ messages: [],
1039
+ isStreaming: false,
1040
+ isSending: false,
559
1041
  metrics: null,
560
1042
  error: null,
1043
+ autoRun: initialAutoRun,
1044
+ turnIndex: -1,
1045
+ });
1046
+ setQueueState({
1047
+ currentIndex: -1,
1048
+ stagedIndex: 0,
1049
+ status: "running",
561
1050
  });
562
1051
 
563
1052
  try {
@@ -594,179 +1083,13 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
594
1083
  }),
595
1084
  });
596
1085
 
597
- // Run both sessions in parallel
598
- const startTime = Date.now();
599
-
600
- // Track final responses and metrics
601
- let finalControlMetrics: SessionMetrics = {
602
- durationMs: 0,
603
- inputTokens: 0,
604
- outputTokens: 0,
605
- totalTokens: 0,
606
- estimatedCost: 0,
607
- toolCallCount: 0,
608
- };
609
- let finalVariantMetrics: SessionMetrics = {
610
- durationMs: 0,
611
- inputTokens: 0,
612
- outputTokens: 0,
613
- totalTokens: 0,
614
- estimatedCost: 0,
615
- toolCallCount: 0,
616
- };
617
-
618
- // Helper to run a session and fetch metrics
619
- const runSession = async (
620
- sessionId: string,
621
- model: string,
622
- setState: typeof setControlState,
623
- onContentUpdate: (content: string) => void,
624
- ): Promise<{ response: string; metrics: SessionMetrics }> => {
625
- try {
626
- const response = await sendMessageAndCollect(
627
- sessionId,
628
- firstMessage,
629
- onContentUpdate,
630
- );
631
-
632
- const duration = Date.now() - startTime;
633
-
634
- // Poll metrics until they stabilize or we hit a max wait window.
635
- const fetchMetricsWithRetry = async (): Promise<SessionMetrics> => {
636
- const maxWaitMs = 60_000;
637
- const pollIntervalMs = 2_000;
638
- let elapsed = 0;
639
- let previousTokens = -1;
640
- let previousTools = -1;
641
- let lastMetrics: SessionMetrics | null = null;
642
-
643
- while (elapsed <= maxWaitMs) {
644
- try {
645
- const metricsRes = await fetch(
646
- `/api/session-metrics/${sessionId}?model=${encodeURIComponent(model)}`,
647
- );
648
- const metrics = await metricsRes.json();
649
- lastMetrics = { ...metrics, durationMs: duration };
650
-
651
- // If tokens/tool calls stopped changing and we have data, treat as final.
652
- if (
653
- metrics.totalTokens > 0 &&
654
- metrics.totalTokens === previousTokens &&
655
- metrics.toolCallCount === previousTools
656
- ) {
657
- return lastMetrics!;
658
- }
659
-
660
- previousTokens = metrics.totalTokens ?? 0;
661
- previousTools = metrics.toolCallCount ?? 0;
662
- } catch {
663
- // swallow and retry
664
- }
665
-
666
- await new Promise((r) => setTimeout(r, pollIntervalMs));
667
- elapsed += pollIntervalMs;
668
- }
669
-
670
- // Return whatever we last saw (or zeros if nothing ever arrived)
671
- return (
672
- lastMetrics ?? {
673
- durationMs: duration,
674
- inputTokens: 0,
675
- outputTokens: 0,
676
- totalTokens: 0,
677
- estimatedCost: 0,
678
- toolCallCount: 0,
679
- }
680
- );
681
- };
682
-
683
- const metrics = await fetchMetricsWithRetry();
684
-
685
- setState((prev) => ({
686
- ...prev,
687
- isStreaming: false,
688
- metrics,
689
- }));
690
-
691
- return { response, metrics };
692
- } catch (err) {
693
- setState((prev) => ({
694
- ...prev,
695
- isStreaming: false,
696
- error: err instanceof Error ? err.message : "Unknown error",
697
- }));
698
- return {
699
- response: "",
700
- metrics: {
701
- durationMs: 0,
702
- inputTokens: 0,
703
- outputTokens: 0,
704
- totalTokens: 0,
705
- estimatedCost: 0,
706
- toolCallCount: 0,
707
- },
708
- };
709
- }
710
- };
711
-
712
- const controlModel = config.controlModel || "claude-sonnet-4-5-20250929";
713
- const variantModel =
714
- config.variantModel ||
715
- config.controlModel ||
716
- "claude-sonnet-4-5-20250929";
717
-
718
- const [controlResult, variantResult] = await Promise.all([
719
- runSession(
720
- controlSessionId,
721
- controlModel,
722
- setControlState,
723
- (content) => {
724
- setControlState((prev) => ({
725
- ...prev,
726
- messages: [
727
- { role: "user", content: firstMessage },
728
- { role: "assistant", content },
729
- ],
730
- }));
731
- },
732
- ),
733
- runSession(
734
- variantSessionId,
735
- variantModel,
736
- setVariantState,
737
- (content) => {
738
- setVariantState((prev) => ({
739
- ...prev,
740
- messages: [
741
- { role: "user", content: firstMessage },
742
- { role: "assistant", content },
743
- ],
744
- }));
745
- },
746
- ),
747
- ]);
748
-
749
- finalControlMetrics = controlResult.metrics;
750
- finalVariantMetrics = variantResult.metrics;
751
-
752
- // Update run status with responses and metrics
753
- await fetch(`/api/comparison-run/${runId}/update`, {
754
- method: "POST",
755
- headers: { "Content-Type": "application/json" },
756
- body: JSON.stringify({
757
- status: "completed",
758
- controlMetrics: finalControlMetrics,
759
- variantMetrics: finalVariantMetrics,
760
- controlResponse: controlResult.response,
761
- variantResponse: variantResult.response,
762
- }),
763
- });
1086
+ // Don't send first message here - let the auto-send effect handle it
1087
+ // This ensures all messages go through the same code path and prevents duplicates
764
1088
  } catch (err) {
765
1089
  setError(err instanceof Error ? err.message : "Failed to run comparison");
766
- } finally {
767
1090
  setIsRunning(false);
768
1091
  }
769
- }, [run, config, runId]);
1092
+ }, [run, config, userMessages, initialAutoRun, runId, createSession]);
770
1093
 
771
1094
  // Function to fetch existing or trigger new session analysis
772
1095
  const triggerAnalysis = useCallback(
@@ -812,47 +1135,6 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
812
1135
  [],
813
1136
  );
814
1137
 
815
- // Auto-trigger analysis when sessions complete
816
- useEffect(() => {
817
- // Control session completed
818
- if (
819
- controlState.sessionId &&
820
- !controlState.isStreaming &&
821
- controlState.metrics &&
822
- !controlAnalysis &&
823
- !controlAnalysisLoading
824
- ) {
825
- triggerAnalysis(controlState.sessionId, "control");
826
- }
827
- }, [
828
- controlState.sessionId,
829
- controlState.isStreaming,
830
- controlState.metrics,
831
- controlAnalysis,
832
- controlAnalysisLoading,
833
- triggerAnalysis,
834
- ]);
835
-
836
- useEffect(() => {
837
- // Variant session completed
838
- if (
839
- variantState.sessionId &&
840
- !variantState.isStreaming &&
841
- variantState.metrics &&
842
- !variantAnalysis &&
843
- !variantAnalysisLoading
844
- ) {
845
- triggerAnalysis(variantState.sessionId, "variant");
846
- }
847
- }, [
848
- variantState.sessionId,
849
- variantState.isStreaming,
850
- variantState.metrics,
851
- variantAnalysis,
852
- variantAnalysisLoading,
853
- triggerAnalysis,
854
- ]);
855
-
856
1138
  if (loading) {
857
1139
  return (
858
1140
  <DebuggerLayout title="Comparison" showBackButton backHref="/town-hall">
@@ -924,37 +1206,159 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
924
1206
  <div className="container mx-auto p-4 h-[calc(100vh-4rem)] flex flex-col overflow-hidden">
925
1207
  {/* Header */}
926
1208
  <div className="flex items-center justify-between mb-4">
927
- <div>
928
- <h2 className="text-lg font-semibold">A/B Comparison</h2>
929
- <p className="text-sm text-muted-foreground">
930
- Comparing: {getDimensionsSummary()}
931
- </p>
1209
+ <div className="flex items-center gap-3">
1210
+ <div>
1211
+ <h2 className="text-lg font-semibold">A/B Comparison</h2>
1212
+ <p className="text-sm text-muted-foreground">
1213
+ Comparing: {getDimensionsSummary()}
1214
+ </p>
1215
+ </div>
1216
+ {/* Message count badge when running */}
1217
+ {hasRun && userMessages.length > 1 && (
1218
+ <div className="flex items-center gap-2 px-3 py-1.5 rounded-full bg-muted text-sm">
1219
+ <span className="font-medium">
1220
+ {queueState.currentIndex + 1}/{userMessages.length}
1221
+ </span>
1222
+ <span className="text-muted-foreground">messages</span>
1223
+ {queueState.status === "completed" && (
1224
+ <span className="text-green-600 dark:text-green-400 text-xs">
1225
+ Complete
1226
+ </span>
1227
+ )}
1228
+ </div>
1229
+ )}
932
1230
  </div>
933
1231
  {!hasRun && (
934
- <Button onClick={runComparison} disabled={isRunning}>
935
- {isRunning ? "Running..." : "Run Comparison"}
1232
+ <Button
1233
+ onClick={runComparison}
1234
+ disabled={isRunning || userMessages.length === 0}
1235
+ >
1236
+ {isRunning ? "Running..." : "Start Comparison"}
936
1237
  </Button>
937
1238
  )}
938
1239
  </div>
939
1240
 
1241
+ {/* Queue Banner - shown when there's a staged message waiting */}
1242
+ {hasRun &&
1243
+ queueState.status === "running" &&
1244
+ queueState.stagedIndex > queueState.currentIndex &&
1245
+ queueState.stagedIndex < userMessages.length &&
1246
+ !controlState.isStreaming &&
1247
+ !variantState.isStreaming && (
1248
+ <div className="mb-4 p-3 rounded-lg border bg-muted/50 flex items-center gap-4">
1249
+ <div className="flex-1">
1250
+ <div className="text-xs font-medium text-muted-foreground mb-1">
1251
+ Next message ready (#{queueState.stagedIndex + 1})
1252
+ </div>
1253
+ <div className="text-sm truncate">
1254
+ {userMessages[queueState.stagedIndex]?.slice(0, 100)}
1255
+ {(userMessages[queueState.stagedIndex]?.length ?? 0) > 100
1256
+ ? "..."
1257
+ : ""}
1258
+ </div>
1259
+ </div>
1260
+ <div className="flex items-center gap-2 shrink-0">
1261
+ {/* Per-arm send buttons when that arm is not auto-running */}
1262
+ {!controlState.autoRun &&
1263
+ controlState.turnIndex === queueState.currentIndex && (
1264
+ <Button
1265
+ size="sm"
1266
+ variant="outline"
1267
+ onClick={() => sendStagedToArm("control")}
1268
+ className="text-blue-600 border-blue-300 hover:bg-blue-50 dark:text-blue-400 dark:border-blue-700 dark:hover:bg-blue-950"
1269
+ >
1270
+ <Play className="w-3 h-3 mr-1" />
1271
+ Control
1272
+ </Button>
1273
+ )}
1274
+ {!variantState.autoRun &&
1275
+ variantState.turnIndex === queueState.currentIndex && (
1276
+ <Button
1277
+ size="sm"
1278
+ variant="outline"
1279
+ onClick={() => sendStagedToArm("variant")}
1280
+ className="text-orange-600 border-orange-300 hover:bg-orange-50 dark:text-orange-400 dark:border-orange-700 dark:hover:bg-orange-950"
1281
+ >
1282
+ <Play className="w-3 h-3 mr-1" />
1283
+ Variant
1284
+ </Button>
1285
+ )}
1286
+ {/* Send to both button */}
1287
+ {!controlState.autoRun &&
1288
+ !variantState.autoRun &&
1289
+ controlState.turnIndex === queueState.currentIndex &&
1290
+ variantState.turnIndex === queueState.currentIndex && (
1291
+ <Button size="sm" onClick={sendStagedToBoth}>
1292
+ <Play className="w-3 h-3 mr-1" />
1293
+ Send to Both
1294
+ </Button>
1295
+ )}
1296
+ </div>
1297
+ </div>
1298
+ )}
1299
+
940
1300
  {/* Pre-run state */}
941
1301
  {!hasRun && (
942
1302
  <div className="flex-1 flex items-center justify-center">
943
- <Card className="max-w-md w-full">
1303
+ <Card className="max-w-lg w-full">
944
1304
  <CardHeader className="text-center">
945
1305
  <CardTitle>Ready to Compare</CardTitle>
946
1306
  <CardDescription>
947
- This comparison will send the same prompt to both
948
- configurations and display the results side by side.
1307
+ This comparison will replay {userMessages.length} user message
1308
+ {userMessages.length !== 1 ? "s" : ""} to both configurations
1309
+ and display the results side by side.
949
1310
  </CardDescription>
950
1311
  </CardHeader>
951
1312
  <CardContent className="space-y-4">
952
- <div className="bg-muted rounded-lg p-4">
1313
+ {/* User messages list */}
1314
+ <div className="bg-muted rounded-lg p-4 max-h-64 overflow-y-auto">
953
1315
  <div className="text-xs font-medium uppercase text-muted-foreground mb-2">
954
- First message
1316
+ User Messages ({userMessages.length})
1317
+ </div>
1318
+ <div className="space-y-2">
1319
+ {userMessages.map((msg, idx) => (
1320
+ <details key={idx} className="group">
1321
+ <summary className="text-sm cursor-pointer flex items-center gap-2 hover:text-foreground">
1322
+ <span className="text-xs font-mono text-muted-foreground w-5">
1323
+ {idx + 1}.
1324
+ </span>
1325
+ <span className="truncate flex-1">
1326
+ {msg.slice(0, 80)}
1327
+ {msg.length > 80 ? "..." : ""}
1328
+ </span>
1329
+ <ChevronDown className="w-3 h-3 text-muted-foreground group-open:rotate-180 transition-transform" />
1330
+ </summary>
1331
+ <div className="mt-2 ml-7 text-sm whitespace-pre-wrap bg-background/50 rounded p-2 text-muted-foreground">
1332
+ {msg}
1333
+ </div>
1334
+ </details>
1335
+ ))}
955
1336
  </div>
956
- <div className="text-sm">{run?.firstUserMessage}</div>
957
1337
  </div>
1338
+
1339
+ {/* Auto-run checkbox */}
1340
+ {userMessages.length > 1 && (
1341
+ <div className="flex items-center gap-3 p-3 rounded-lg border bg-background">
1342
+ <Checkbox
1343
+ id="auto-run"
1344
+ checked={initialAutoRun}
1345
+ onCheckedChange={(checked) => setInitialAutoRun(checked)}
1346
+ />
1347
+ <div className="flex-1">
1348
+ <label
1349
+ htmlFor="auto-run"
1350
+ className="text-sm font-medium cursor-pointer"
1351
+ >
1352
+ Auto run all messages
1353
+ </label>
1354
+ <p className="text-xs text-muted-foreground">
1355
+ If off, next messages are enqueued after each turn.
1356
+ </p>
1357
+ </div>
1358
+ </div>
1359
+ )}
1360
+
1361
+ {/* Control vs Variant labels */}
958
1362
  <div className="grid grid-cols-2 gap-4 text-sm">
959
1363
  <div className="space-y-1">
960
1364
  <div className="flex items-center gap-2">
@@ -986,10 +1390,36 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
986
1390
  {/* Control */}
987
1391
  <Card className="flex flex-col h-full min-h-0 overflow-hidden">
988
1392
  <CardHeader className="py-3 border-b shrink-0">
989
- <CardTitle className="text-sm flex items-center gap-2">
990
- <span className="w-2 h-2 rounded-full bg-blue-500" />
991
- Control (Original)
992
- </CardTitle>
1393
+ <div className="flex items-center justify-between">
1394
+ <CardTitle className="text-sm flex items-center gap-2">
1395
+ <span className="w-2 h-2 rounded-full bg-blue-500" />
1396
+ Control (Original)
1397
+ {controlState.isStreaming && (
1398
+ <Loader2 className="w-3 h-3 animate-spin text-blue-500" />
1399
+ )}
1400
+ </CardTitle>
1401
+ {/* Auto-run toggle for Control */}
1402
+ {userMessages.length > 1 &&
1403
+ queueState.status === "running" && (
1404
+ <button
1405
+ type="button"
1406
+ onClick={() => toggleAutoRun("control")}
1407
+ className="flex items-center gap-1.5 text-xs text-muted-foreground hover:text-foreground transition-colors"
1408
+ title={
1409
+ controlState.autoRun
1410
+ ? "Disable auto-run"
1411
+ : "Enable auto-run"
1412
+ }
1413
+ >
1414
+ {controlState.autoRun ? (
1415
+ <ToggleRight className="w-4 h-4 text-blue-500" />
1416
+ ) : (
1417
+ <ToggleLeft className="w-4 h-4" />
1418
+ )}
1419
+ <span>Auto</span>
1420
+ </button>
1421
+ )}
1422
+ </div>
993
1423
  <CardDescription className="text-xs">
994
1424
  {getControlDimensionLabel()}
995
1425
  </CardDescription>
@@ -1014,8 +1444,23 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
1014
1444
  </div>
1015
1445
  ))}
1016
1446
  {controlState.error && (
1017
- <div className="text-red-500 text-sm">
1018
- Error: {controlState.error}
1447
+ <div className="p-3 rounded-lg border border-red-200 bg-red-50 dark:border-red-800 dark:bg-red-950/30">
1448
+ <div className="text-red-600 dark:text-red-400 text-sm mb-2">
1449
+ Error: {controlState.error}
1450
+ </div>
1451
+ {queueState.status === "running" && (
1452
+ <Button
1453
+ size="sm"
1454
+ variant="outline"
1455
+ onClick={() => {
1456
+ setControlState((prev) => ({ ...prev, error: null }));
1457
+ sendStagedToArm("control");
1458
+ }}
1459
+ className="text-red-600 border-red-300 hover:bg-red-100 dark:text-red-400"
1460
+ >
1461
+ Retry
1462
+ </Button>
1463
+ )}
1019
1464
  </div>
1020
1465
  )}
1021
1466
  </CardContent>
@@ -1033,6 +1478,10 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
1033
1478
  control: !prev.control,
1034
1479
  }))
1035
1480
  }
1481
+ onRunAnalysis={() =>
1482
+ controlState.sessionId &&
1483
+ triggerAnalysis(controlState.sessionId, "control")
1484
+ }
1036
1485
  accentColor="blue"
1037
1486
  />
1038
1487
  {/* Tool Calls */}
@@ -1054,10 +1503,36 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
1054
1503
  {/* Variant */}
1055
1504
  <Card className="flex flex-col h-full min-h-0 overflow-hidden">
1056
1505
  <CardHeader className="py-3 border-b shrink-0">
1057
- <CardTitle className="text-sm flex items-center gap-2">
1058
- <span className="w-2 h-2 rounded-full bg-orange-500" />
1059
- Variant
1060
- </CardTitle>
1506
+ <div className="flex items-center justify-between">
1507
+ <CardTitle className="text-sm flex items-center gap-2">
1508
+ <span className="w-2 h-2 rounded-full bg-orange-500" />
1509
+ Variant
1510
+ {variantState.isStreaming && (
1511
+ <Loader2 className="w-3 h-3 animate-spin text-orange-500" />
1512
+ )}
1513
+ </CardTitle>
1514
+ {/* Auto-run toggle for Variant */}
1515
+ {userMessages.length > 1 &&
1516
+ queueState.status === "running" && (
1517
+ <button
1518
+ type="button"
1519
+ onClick={() => toggleAutoRun("variant")}
1520
+ className="flex items-center gap-1.5 text-xs text-muted-foreground hover:text-foreground transition-colors"
1521
+ title={
1522
+ variantState.autoRun
1523
+ ? "Disable auto-run"
1524
+ : "Enable auto-run"
1525
+ }
1526
+ >
1527
+ {variantState.autoRun ? (
1528
+ <ToggleRight className="w-4 h-4 text-orange-500" />
1529
+ ) : (
1530
+ <ToggleLeft className="w-4 h-4" />
1531
+ )}
1532
+ <span>Auto</span>
1533
+ </button>
1534
+ )}
1535
+ </div>
1061
1536
  <CardDescription className="text-xs">
1062
1537
  {getDimensionLabel()}
1063
1538
  </CardDescription>
@@ -1082,8 +1557,23 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
1082
1557
  </div>
1083
1558
  ))}
1084
1559
  {variantState.error && (
1085
- <div className="text-red-500 text-sm">
1086
- Error: {variantState.error}
1560
+ <div className="p-3 rounded-lg border border-red-200 bg-red-50 dark:border-red-800 dark:bg-red-950/30">
1561
+ <div className="text-red-600 dark:text-red-400 text-sm mb-2">
1562
+ Error: {variantState.error}
1563
+ </div>
1564
+ {queueState.status === "running" && (
1565
+ <Button
1566
+ size="sm"
1567
+ variant="outline"
1568
+ onClick={() => {
1569
+ setVariantState((prev) => ({ ...prev, error: null }));
1570
+ sendStagedToArm("variant");
1571
+ }}
1572
+ className="text-red-600 border-red-300 hover:bg-red-100 dark:text-red-400"
1573
+ >
1574
+ Retry
1575
+ </Button>
1576
+ )}
1087
1577
  </div>
1088
1578
  )}
1089
1579
  </CardContent>
@@ -1101,6 +1591,10 @@ export function ComparisonView({ runId }: ComparisonViewProps) {
1101
1591
  variant: !prev.variant,
1102
1592
  }))
1103
1593
  }
1594
+ onRunAnalysis={() =>
1595
+ variantState.sessionId &&
1596
+ triggerAnalysis(variantState.sessionId, "variant")
1597
+ }
1104
1598
  accentColor="orange"
1105
1599
  />
1106
1600
  {/* Tool Calls */}