bosun 0.40.21 → 0.41.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/.env.example +8 -0
  2. package/README.md +20 -0
  3. package/agent/agent-custom-tools.mjs +23 -5
  4. package/agent/agent-event-bus.mjs +248 -6
  5. package/agent/agent-pool.mjs +131 -30
  6. package/agent/agent-work-analyzer.mjs +8 -16
  7. package/agent/primary-agent.mjs +81 -7
  8. package/agent/retry-queue.mjs +164 -0
  9. package/bench/swebench/bosun-swebench.mjs +5 -0
  10. package/bosun.config.example.json +25 -0
  11. package/bosun.schema.json +825 -183
  12. package/cli.mjs +267 -8
  13. package/config/config-doctor.mjs +51 -2
  14. package/config/config.mjs +232 -5
  15. package/github/github-auth-manager.mjs +70 -19
  16. package/infra/library-manager.mjs +894 -60
  17. package/infra/monitor.mjs +701 -69
  18. package/infra/runtime-accumulator.mjs +376 -84
  19. package/infra/session-tracker.mjs +95 -28
  20. package/infra/test-runtime.mjs +267 -0
  21. package/lib/codebase-audit.mjs +133 -18
  22. package/package.json +30 -8
  23. package/server/setup-web-server.mjs +29 -1
  24. package/server/ui-server.mjs +1571 -49
  25. package/setup.mjs +27 -24
  26. package/shell/codex-shell.mjs +34 -3
  27. package/shell/copilot-shell.mjs +50 -8
  28. package/task/msg-hub.mjs +193 -0
  29. package/task/pipeline.mjs +544 -0
  30. package/task/task-claims.mjs +6 -10
  31. package/task/task-cli.mjs +38 -2
  32. package/task/task-executor-pipeline.mjs +143 -0
  33. package/task/task-executor.mjs +36 -27
  34. package/telegram/get-telegram-chat-id.mjs +57 -47
  35. package/ui/components/chat-view.js +18 -1
  36. package/ui/components/workspace-switcher.js +321 -9
  37. package/ui/demo-defaults.js +17830 -10433
  38. package/ui/demo.html +9 -1
  39. package/ui/modules/router.js +1 -1
  40. package/ui/modules/settings-schema.js +2 -0
  41. package/ui/modules/state.js +54 -57
  42. package/ui/modules/voice-client-sdk.js +376 -37
  43. package/ui/modules/voice-client.js +173 -33
  44. package/ui/setup.html +68 -2
  45. package/ui/styles/components.css +571 -1
  46. package/ui/styles.css +201 -1
  47. package/ui/tabs/dashboard.js +74 -0
  48. package/ui/tabs/library.js +410 -55
  49. package/ui/tabs/logs.js +10 -0
  50. package/ui/tabs/settings.js +178 -99
  51. package/ui/tabs/tasks.js +1083 -507
  52. package/ui/tabs/telemetry.js +34 -0
  53. package/ui/tabs/workflow-canvas-utils.mjs +38 -1
  54. package/ui/tabs/workflows.js +1275 -402
  55. package/voice/voice-agents-sdk.mjs +2 -2
  56. package/voice/voice-relay.mjs +28 -20
  57. package/workflow/declarative-workflows.mjs +145 -0
  58. package/workflow/msg-hub.mjs +237 -0
  59. package/workflow/pipeline-workflows.mjs +287 -0
  60. package/workflow/pipeline.mjs +828 -315
  61. package/workflow/project-detection.mjs +559 -0
  62. package/workflow/workflow-cli.mjs +128 -0
  63. package/workflow/workflow-contract.mjs +433 -232
  64. package/workflow/workflow-engine.mjs +510 -47
  65. package/workflow/workflow-nodes/custom-loader.mjs +251 -0
  66. package/workflow/workflow-nodes.mjs +2024 -184
  67. package/workflow/workflow-templates.mjs +118 -24
  68. package/workflow-templates/agents.mjs +20 -20
  69. package/workflow-templates/bosun-native.mjs +212 -2
  70. package/workflow-templates/code-quality.mjs +20 -14
  71. package/workflow-templates/continuation-loop.mjs +339 -0
  72. package/workflow-templates/github.mjs +516 -40
  73. package/workflow-templates/planning.mjs +446 -17
  74. package/workflow-templates/reliability.mjs +65 -12
  75. package/workflow-templates/task-batch.mjs +27 -10
  76. package/workflow-templates/task-execution.mjs +752 -0
  77. package/workflow-templates/task-lifecycle.mjs +117 -14
  78. package/workspace/context-cache.mjs +66 -18
  79. package/workspace/workspace-manager.mjs +153 -1
  80. package/workflow-templates/issue-continuation.mjs +0 -243
package/ui/modules/voice-client-sdk.js
@@ -39,9 +39,76 @@ export const isSdkVoiceActive = computed(() =>
   sdkVoiceState.value !== "idle" && sdkVoiceState.value !== "error"
 );
 
-// Noise-control default: disable user-side live ASR transcript output/persistence.
-// Assistant response text remains enabled.
-const ENABLE_USER_TRANSCRIPT = false;
+// User transcript is always enabled; transcription is surfaced from the API's
+// input_audio_transcription feature (primary) or browser SpeechRecognition (backup).
+const ENABLE_USER_TRANSCRIPT = true;
+
+// ── Browser SpeechRecognition (parallel backup for user transcription) ──────
+
+const _BrowserSpeechRecognition = typeof globalThis !== "undefined"
+  ? (globalThis.SpeechRecognition || globalThis.webkitSpeechRecognition)
+  : null;
+let _browserRecognition = null;
+let _browserTranscriptActive = false;
+// When the API-level transcription delivers a user transcript, we prefer it
+// over the browser's; this flag suppresses duplicate browser results.
+let _apiTranscriptDelivered = false;
+
+function _startBrowserTranscription() {
+  if (!_BrowserSpeechRecognition || _browserRecognition) return;
+  try {
+    const recognition = new _BrowserSpeechRecognition();
+    recognition.continuous = true;
+    recognition.interimResults = true;
+    recognition.maxAlternatives = 1;
+    // Attempt to match the user's language; fall back to English.
+    recognition.lang = navigator?.language || "en-US";
+
+    recognition.onresult = (event) => {
+      // Only use the browser transcript when API-level transcription hasn't delivered yet.
+      if (_apiTranscriptDelivered) return;
+      let transcript = "";
+      for (let i = event.resultIndex; i < event.results.length; i++) {
+        transcript += event.results[i][0].transcript;
+      }
+      const text = transcript.trim();
+      if (!text) return;
+      sdkVoiceTranscript.value = text;
+      emit("transcript", { text, final: event.results[event.resultIndex]?.isFinal || false, source: "browser" });
+      if (event.results[event.resultIndex]?.isFinal) {
+        _persistTranscriptIfNew("user", text, "browser.speech_recognition.final");
+      }
+    };
+
+    recognition.onerror = (e) => {
+      // Non-fatal: browser recognition may fail on some systems.
+      if (e.error !== "no-speech" && e.error !== "aborted") {
+        console.warn("[voice-client-sdk] Browser SpeechRecognition error:", e.error);
+      }
+    };
+
+    recognition.onend = () => {
+      // Auto-restart while the session is active.
+      if (_browserTranscriptActive && _session) {
+        try { recognition.start(); } catch { /* already running or stopped */ }
+      }
+    };
+
+    recognition.start();
+    _browserRecognition = recognition;
+    _browserTranscriptActive = true;
+  } catch (err) {
+    console.warn("[voice-client-sdk] Browser SpeechRecognition unavailable:", err?.message);
+  }
+}
+
+function _stopBrowserTranscription() {
+  _browserTranscriptActive = false;
+  if (_browserRecognition) {
+    try { _browserRecognition.stop(); } catch { /* ignore */ }
+    _browserRecognition = null;
+  }
+}
 
 // ── Module-scope state ──────────────────────────────────────────────────────
 
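Note: the transcription rewrite above runs two parallel sources, the provider's API-level transcription (primary) and browser SpeechRecognition (backup), with _apiTranscriptDelivered suppressing duplicates. For orientation, a minimal sketch of how the API-side source is typically enabled on an OpenAI Realtime session, assuming the public session.update / transport-event shapes (illustrative only, not code from this package):

    // Ask the provider to transcribe user audio server-side (the "primary" source).
    session.sendEvent({
      type: "session.update",
      session: { input_audio_transcription: { model: "whisper-1" } },
    });
    // Completed user transcripts then arrive as transport events.
    session.on("transport_event", (event) => {
      if (event?.type === "conversation.item.input_audio_transcription.completed") {
        console.log("user transcript (api):", event.transcript);
      }
    });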
@@ -74,7 +141,13 @@ let _toolCompletionAckTimer = null;
 let _assistantBaselineBeforeToolAck = "";
 const _sdkCapturedMicStreams = new Set();
 let _lastAutoBargeInAt = 0;
-const AUTO_BARGE_IN_COOLDOWN_MS = 700;
+const AUTO_BARGE_IN_COOLDOWN_MS = 1200;
+// Minimum speech duration (ms) before an interrupt is allowed — filters keyboard/click noise.
+let _speechStartedAt = 0;
+const MIN_SPEECH_DURATION_FOR_INTERRUPT_MS = 400;
+// Delayed response clear — keep response visible in center after turn ends.
+let _responseClearTimer = null;
+const RESPONSE_DISPLAY_HOLD_MS = 8000;
 let _traceTurnCounter = 0;
 let _traceCurrentTurnId = null;
 let _traceTurnActive = false;
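Note: these knobs work together. A barge-in may fire only after speech has persisted for MIN_SPEECH_DURATION_FOR_INTERRUPT_MS, and (judging by the _lastAutoBargeInAt bookkeeping) repeated barge-ins appear to be rate-limited by AUTO_BARGE_IN_COOLDOWN_MS. Condensed into a single predicate (canBargeIn is a hypothetical helper, not present in the file; the real checks live in maybeAutoInterruptSdkResponse and shouldAutoBargeIn):

    function canBargeIn(now = Date.now()) {
      const speechLongEnough =
        _speechStartedAt > 0 &&
        now - _speechStartedAt >= MIN_SPEECH_DURATION_FOR_INTERRUPT_MS; // real speech, not a click
      const cooledDown = now - _lastAutoBargeInAt >= AUTO_BARGE_IN_COOLDOWN_MS; // rate limit
      return speechLongEnough && cooledDown;
    }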
@@ -108,6 +181,14 @@ function emit(event, data) {
 
 function maybeAutoInterruptSdkResponse(reason = "speech-started") {
   const now = Date.now();
+  // Only interrupt if speech has been ongoing long enough to be real speech
+  // (filters out keyboard clicks, mouse clicks, coughs, etc.).
+  if (_speechStartedAt > 0) {
+    const speechDuration = now - _speechStartedAt;
+    if (speechDuration < MIN_SPEECH_DURATION_FOR_INTERRUPT_MS) {
+      return false;
+    }
+  }
   if (!shouldAutoBargeIn({
     muted: isVoiceMicMuted.value,
     audioActive: Boolean(_session),
@@ -378,6 +459,13 @@ function _resetTranscriptPersistenceState() {
   _traceTurnActive = false;
   _traceLlmFirstTokenMarked = false;
   _traceTtsFirstAudioMarked = false;
+  _apiTranscriptDelivered = false;
+  // Clean up tool result injection state.
+  for (const timer of _pendingToolResultTimers.values()) {
+    clearTimeout(timer);
+  }
+  _pendingToolResultTimers.clear();
+  _toolResultInjected.clear();
 }
 
 function _flushPendingTranscriptBuffers() {
@@ -391,7 +479,7 @@ function _flushPendingTranscriptBuffers() {
   }
 
   const finalUser = String(_pendingUserTranscriptText || "").trim();
-  if (finalUser && ENABLE_USER_TRANSCRIPT) {
+  if (finalUser) {
     _persistTranscriptIfNew("user", finalUser, "sdk.history_updated.user.flush");
   }
 
@@ -447,25 +535,201 @@ function _markAssistantToolResponseObserved(latestAssistantText = "") {
   }
 }
 
+// ── Robust tool result injection ────────────────────────────────────────────
+// After the SDK processes a tool call, we inject the function_call_output into
+// the model's realtime conversation context ourselves via sendEvent(), rather
+// than relying on the SDK's auto-injection, which has proven unreliable.
+
+let _pendingToolResultTimers = new Map();
+let _toolResultInjected = new Set(); // call IDs that were manually injected
+
+function _ensureToolResultInjected(session, callId, toolName, resultStr) {
+  // Immediately inject the tool result into the model's conversation context.
+  // The SDK's auto-injection is unreliable — the result gets stored in the
+  // session tracker / chat history but doesn't always reach the model's
+  // realtime conversation context, causing the model to say "I'm having
+  // trouble" even though the tool succeeded.
+  const key = String(callId || "");
+  if (!key) return;
+
+  // Truncate large results for the voice context.
+  const VOICE_TOOL_OUTPUT_MAX = 6000;
+  let output = resultStr || "Done";
+  if (output.length > VOICE_TOOL_OUTPUT_MAX) {
+    output = output.slice(0, VOICE_TOOL_OUTPUT_MAX)
+      + "\n... (truncated for voice — full result available in chat)";
+  }
+
+  // Mark as injected immediately to prevent duplicate injections.
+  _toolResultInjected.add(key);
+
+  // Inject now — don't wait for SDK auto-injection.
+  if (session && typeof session.sendEvent === "function") {
+    try {
+      session.sendEvent({
+        type: "conversation.item.create",
+        item: {
+          type: "function_call_output",
+          call_id: callId,
+          output,
+        },
+      });
+      session.sendEvent({ type: "response.create" });
+      console.info(`[voice-client-sdk] Injected tool result for ${toolName} (${callId})`);
+      return;
+    } catch (err) {
+      console.warn("[voice-client-sdk] sendEvent injection failed:", err?.message);
+    }
+  }
+
+  // Fallback: inject as a user-role context message.
+  _injectContextMessage(
+    session,
+    `[Tool Result — ${toolName}]\n${output}`,
+  );
+}
+
+/**
+ * Inject a context message directly into the voice agent's conversation.
+ * Used for tool result fallback injection and background progress updates.
+ */
+function _injectContextMessage(session, text) {
+  if (!session || !text) return;
+  const inputText = String(text).trim();
+  if (!inputText) return;
+
+  if (typeof session.sendMessage === "function") {
+    // @openai/agents SDK — sendMessage injects as user text and triggers a response.
+    session.sendMessage(inputText);
+  } else if (typeof session.sendEvent === "function") {
+    session.sendEvent({
+      type: "conversation.item.create",
+      item: {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: inputText }],
+      },
+    });
+    session.sendEvent({ type: "response.create" });
+  } else if (session.readyState === WebSocket.OPEN) {
+    // Gemini WebSocket transport.
+    session.send(JSON.stringify({ type: "text.input", text: inputText }));
+  }
+}
+
+// ── Background agent progress tracking ──────────────────────────────────────
+// When a voice tool dispatches a background task/agent, we track it and
+// periodically inject progress updates into the voice conversation so the
+// model stays aware without the user having to ask.
+
+let _backgroundProgressTimer = null;
+let _trackedBackgroundTasks = new Map(); // taskId → { name, startedAt, lastStatus, sessionId }
+
+function _trackBackgroundTask(taskId, info = {}) {
+  const key = String(taskId || "").trim();
+  if (!key) return;
+  _trackedBackgroundTasks.set(key, {
+    name: String(info.name || "background task").trim(),
+    startedAt: Date.now(),
+    lastStatus: "started",
+    lastCheckedAt: 0,
+    sessionId: String(info.sessionId || "").trim() || null,
+    completionInjected: false,
+  });
+  _ensureBackgroundProgressPolling();
+}
+
+function _ensureBackgroundProgressPolling() {
+  if (_backgroundProgressTimer) return;
+  if (_trackedBackgroundTasks.size === 0) return;
+
+  _backgroundProgressTimer = setInterval(async () => {
+    if (!_session || _trackedBackgroundTasks.size === 0) {
+      _stopBackgroundProgressPolling();
+      return;
+    }
+
+    for (const [taskId, task] of _trackedBackgroundTasks) {
+      const now = Date.now();
+      // Don't check any single task more often than every 15 seconds.
+      if (now - task.lastCheckedAt < 15_000) continue;
+      task.lastCheckedAt = now;
+
+      try {
+        const res = await fetch("/api/voice/tool", {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({
+            toolName: "poll_background_session",
+            args: { sessionId: task.sessionId || taskId },
+            sessionId: sdkVoiceSessionId.value,
+          }),
+        });
+        const result = await res.json();
+        const statusText = typeof result?.result === "string"
+          ? result.result
+          : JSON.stringify(result?.result || "");
+
+        // Detect completion/failure.
+        const isComplete = /complete|finished|done|failed|error/i.test(statusText);
+        task.lastStatus = statusText.slice(0, 200);
+
+        // Only inject once the task has completed or failed.
+        if (isComplete && !task.completionInjected) {
+          task.completionInjected = true;
+          const summary = statusText.length > 500
+            ? statusText.slice(0, 500) + "..."
+            : statusText;
+          _injectContextMessage(
+            _session,
+            `[Background Task Update — ${task.name}]\nStatus: ${summary}\n` +
+            "(You don't need to tell the user about this unless they ask about it.)",
+          );
+          // Remove the completed task.
+          _trackedBackgroundTasks.delete(taskId);
+        }
+      } catch {
+        // Non-fatal — will retry on the next interval.
+      }
+    }
+
+    if (_trackedBackgroundTasks.size === 0) {
+      _stopBackgroundProgressPolling();
+    }
+  }, 10_000); // Check every 10 seconds.
+}
+
+function _stopBackgroundProgressPolling() {
+  if (_backgroundProgressTimer) {
+    clearInterval(_backgroundProgressTimer);
+    _backgroundProgressTimer = null;
+  }
+  _trackedBackgroundTasks.clear();
+  // Clean up pending tool result timers.
+  for (const timer of _pendingToolResultTimers.values()) {
+    clearTimeout(timer);
+  }
+  _pendingToolResultTimers.clear();
+  _toolResultInjected.clear();
+}
+
 function _scheduleUserTranscriptFinalize(text) {
   const value = String(text || "").trim();
   if (!value) return;
   _pendingUserTranscriptText = value;
+  // API-level transcript arrived — prefer it over browser SpeechRecognition.
+  _apiTranscriptDelivered = true;
   if (_pendingUserTranscriptTimer) clearTimeout(_pendingUserTranscriptTimer);
   _pendingUserTranscriptTimer = setTimeout(() => {
     _pendingUserTranscriptTimer = null;
     const finalText = String(_pendingUserTranscriptText || "").trim();
     if (!finalText) return;
-    if (ENABLE_USER_TRANSCRIPT) {
-      sdkVoiceTranscript.value = finalText;
-      emit("transcript", { text: finalText, final: true });
-      _persistTranscriptIfNew("user", finalText, "sdk.history_updated.user.final");
-    } else {
-      sdkVoiceTranscript.value = "";
-      // Skip persisting user transcript — ASR often hallucinates wrong
-      // languages from short fragments; the model still receives the raw
-      // audio correctly so nothing is lost.
-    }
+    sdkVoiceTranscript.value = finalText;
+    emit("transcript", { text: finalText, final: true, source: "api" });
+    _persistTranscriptIfNew("user", finalText, "sdk.history_updated.user.final");
+    // Reset for the next utterance.
+    _apiTranscriptDelivered = false;
   }, 350);
 }
 
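Note: taken together, the helpers above give the voice layer two injection paths (sendEvent with a function_call_output item, falling back to a user-role context message) plus a poller that surfaces background-task completion. A hypothetical call site, with made-up IDs, might look like this:

    // Register a long-running agent dispatched by a voice tool; the 10s poller
    // will call poll_background_session until it reports completion.
    _trackBackgroundTask("bg-1714000000000", {
      name: "delegate_to_agent: refactor the login flow",
      sessionId: "sess-abc123",
    });

    // Push ad-hoc context into the live conversation at any time.
    _injectContextMessage(_session, "[Progress] The background agent is still running.");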
@@ -478,7 +742,6 @@ function _scheduleAssistantTranscriptFinalize(text) {
     _pendingAssistantTranscriptTimer = null;
     const finalText = String(_pendingAssistantTranscriptText || "").trim();
     if (!finalText) return;
-    sdkVoiceState.value = "thinking";
     _sdkTraceMarkLlmFirstToken("llm_first_token", { reason: "assistant_transcript.final" });
     _sdkTraceMarkTtsFirstAudio("tts_first_audio", { reason: "assistant_transcript.final" });
     sdkVoiceResponse.value = finalText;
@@ -486,10 +749,39 @@ function _scheduleAssistantTranscriptFinalize(text) {
     _persistTranscriptIfNew("assistant", finalText, "sdk.history_updated.assistant.final");
     _markAssistantToolResponseObserved(finalText);
     _sdkTraceEndTurn("turn_end", { reason: "assistant_transcript.final" });
+    // Keep response visible in center — schedule delayed clear instead of
+    // immediately setting sdkVoiceResponse to "". The response will persist
+    // until the user starts speaking or the hold timer expires.
+    _scheduleResponseClear();
     sdkVoiceState.value = "listening";
   }, 700);
 }
 
+/**
+ * Schedule a delayed clear of the assistant response from the center display.
+ * The response stays visible for RESPONSE_DISPLAY_HOLD_MS or until the user
+ * starts speaking (whichever comes first).
+ */
+function _scheduleResponseClear() {
+  if (_responseClearTimer) clearTimeout(_responseClearTimer);
+  _responseClearTimer = setTimeout(() => {
+    _responseClearTimer = null;
+    sdkVoiceResponse.value = "";
+  }, RESPONSE_DISPLAY_HOLD_MS);
+}
+
+/**
+ * Immediately clear the response display — called when the user starts
+ * speaking so the center area shows their new transcript.
+ */
+function _clearResponseForNewTurn() {
+  if (_responseClearTimer) {
+    clearTimeout(_responseClearTimer);
+    _responseClearTimer = null;
+  }
+  sdkVoiceResponse.value = "";
+}
+
 // ── OpenAI/Azure Agents SDK Session ─────────────────────────────────────────
 
 /**
@@ -568,7 +860,22 @@ async function startAgentsSdkSession(config, options = {}) {
     }
     // SDK expects string results — ensure we always return a string.
    const output = result.result ?? result.output ?? "Done";
-    return typeof output === "string" ? output : JSON.stringify(output);
+    const outputStr = typeof output === "string" ? output : JSON.stringify(output);
+
+    // Track background tasks for progress polling.
+    const BACKGROUND_TOOLS = new Set([
+      "delegate_to_agent", "execute_workflow", "create_task",
+    ]);
+    if (BACKGROUND_TOOLS.has(t.name)) {
+      const taskId = result?.taskId || result?.sessionId
+        || args?.sessionId || `bg-${Date.now()}`;
+      _trackBackgroundTask(taskId, {
+        name: `${t.name}: ${String(args?.prompt || args?.title || args?.workflowId || "").slice(0, 60)}`,
+        sessionId: result?.sessionId || args?.sessionId || taskId,
+      });
+    }
+
+    return outputStr;
   };
 
   // The @openai/agents SDK calls invokeFunctionTool → tool.invoke(runContext, input, details)
@@ -618,22 +925,22 @@ async function startAgentsSdkSession(config, options = {}) {
   type: turnDetection,
   ...(turnDetection === "server_vad"
     ? {
-        threshold: 0.7,
-        prefix_padding_ms: 400,
-        silence_duration_ms: 1300,
+        threshold: 0.82,
+        prefix_padding_ms: 500,
+        silence_duration_ms: 1600,
         create_response: true,
-        interrupt_response: true,
+        interrupt_response: false,
         createResponse: true,
-        interruptResponse: true,
+        interruptResponse: false,
       }
     : {}),
   ...(turnDetection === "semantic_vad"
     ? {
-        eagerness: "medium",
+        eagerness: "low",
         create_response: true,
-        interrupt_response: true,
+        interrupt_response: false,
         createResponse: true,
-        interruptResponse: true,
+        interruptResponse: false,
       }
     : {}),
 };
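Note: for reference, the server_vad branch above now resolves to the object below. Both snake_case and camelCase keys are sent to cover SDK naming variations, and interrupt_response is disabled because barge-in is now gated client-side (the variable name here is illustrative):

    const serverVadTurnDetection = {
      type: "server_vad",
      threshold: 0.82,           // was 0.7: less sensitive to ambient noise
      prefix_padding_ms: 500,    // was 400
      silence_duration_ms: 1600, // was 1300: waits longer before ending a turn
      create_response: true,
      interrupt_response: false,
      createResponse: true,
      interruptResponse: false,
    };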
@@ -704,10 +1011,23 @@ async function startAgentsSdkSession(config, options = {}) {
   session.on("transport_event", (event) => {
     const eventType = event?.type || "";
     if (eventType === "input_audio_buffer.speech_started") {
+      _speechStartedAt = Date.now();
       _sdkTraceBeginTurn("turn_start", { reason: "speech_started" });
-      maybeAutoInterruptSdkResponse("speech-started");
+      // Clear any lingering response so the center shows the user's new transcript.
+      _clearResponseForNewTurn();
+      // Don't interrupt immediately: maybeAutoInterruptSdkResponse enforces the
+      // minimum speech duration, so only attempt the barge-in once
+      // MIN_SPEECH_DURATION_FOR_INTERRUPT_MS has elapsed.
+      setTimeout(() => {
+        if (_speechStartedAt > 0 && (Date.now() - _speechStartedAt) >= MIN_SPEECH_DURATION_FOR_INTERRUPT_MS) {
+          maybeAutoInterruptSdkResponse("speech-started-confirmed");
+        }
+      }, MIN_SPEECH_DURATION_FOR_INTERRUPT_MS);
       emit("speech-started", {});
     }
+    if (eventType === "input_audio_buffer.speech_stopped") {
+      _speechStartedAt = 0;
+    }
   });
 
   // ── Tool call events ──
@@ -731,13 +1051,22 @@ async function startAgentsSdkSession(config, options = {}) {
   session.on("agent_tool_end", (_ctx, _agent, tool, result, details) => {
     const toolCall = details?.toolCall || {};
     const callId = toolCall?.callId || toolCall?.call_id;
-    const resultPreview = typeof result === "string"
-      ? result.slice(0, 120) + (result.length > 120 ? "..." : "")
-      : "(non-string result)";
-    console.info(`[voice-client-sdk] tool call done: ${tool?.name} (${callId}) ${resultPreview}`);
+    const name = tool?.name || "unknown";
+    const resultStr = typeof result === "string" ? result : JSON.stringify(result ?? "");
+    const resultPreview = resultStr.length > 120
+      ? resultStr.slice(0, 120) + "..."
+      : resultStr;
+    console.info(`[voice-client-sdk] tool call done: ${name} (${callId}) → ${resultPreview}`);
     sdkVoiceToolCalls.value = sdkVoiceToolCalls.value.map((tc) =>
       tc.callId === callId ? { ...tc, status: "complete" } : tc
     );
+
+    // ── Robust tool result injection ──
+    // The SDK should auto-inject the function_call_output, but in case it
+    // doesn't (race condition, SDK bug, etc.), we inject the result into the
+    // model's conversation context ourselves.
+    _ensureToolResultInjected(session, callId, name, resultStr);
+
     // Return to listening once all tool calls have resolved.
     const stillRunning = sdkVoiceToolCalls.value.some((tc) => tc.status === "running");
     if (!stillRunning && sdkVoiceState.value === "thinking") {
@@ -746,7 +1075,7 @@ async function startAgentsSdkSession(config, options = {}) {
     if (!stillRunning) {
       _markToolCompletionPending();
     }
-    emit("tool-call-complete", { callId, name: tool?.name, result });
+    emit("tool-call-complete", { callId, name, result });
   });
 
   session.on("error", (err) => {
@@ -870,6 +1199,10 @@ async function startAgentsSdkSession(config, options = {}) {
   sdkVoiceSessionId.value = _callContext.sessionId || `voice-sdk-${Date.now()}`;
   startDurationTimer();
 
+  // Start browser SpeechRecognition as a parallel/backup transcription source.
+  _apiTranscriptDelivered = false;
+  _startBrowserTranscription();
+
   emit("connected", {
     provider: tokenData.provider,
     sessionId: sdkVoiceSessionId.value,
@@ -926,6 +1259,10 @@ async function startGeminiLiveSession(config, options = {}) {
   sdkVoiceSessionId.value = _callContext.sessionId || `voice-gemini-${Date.now()}`;
   startDurationTimer();
 
+  // Start browser SpeechRecognition as a parallel/backup transcription source.
+  _apiTranscriptDelivered = false;
+  _startBrowserTranscription();
+
   // Start mic capture and stream to the server.
   startGeminiMicCapture(ws).catch((err) => {
     console.error("[voice-client-sdk] Gemini mic capture failed:", err);
@@ -1133,12 +1470,8 @@ function handleGeminiServerEvent(msg) {
 
   switch (type) {
     case "transcript.user":
-      if (ENABLE_USER_TRANSCRIPT) {
-        sdkVoiceTranscript.value = msg.text || "";
-        emit("transcript", { text: msg.text, final: true });
-      } else {
-        sdkVoiceTranscript.value = "";
-      }
+      sdkVoiceTranscript.value = msg.text || "";
+      emit("transcript", { text: msg.text, final: true, source: "api" });
       _persistTranscriptIfNew("user", msg.text, "gemini.user_transcript");
       break;
 
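Note: both the Agents SDK and Gemini paths now emit the same transcript payload, tagged with its origin. A downstream listener can branch on that tag; a sketch with hypothetical names (onVoiceEvent and renderUserTranscript are not from this package):

    onVoiceEvent("transcript", ({ text, final, source }) => {
      // source is "api" for provider-side transcription, "browser" for SpeechRecognition.
      if (final) renderUserTranscript(text, source);
    });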
@@ -1289,6 +1622,8 @@ export async function startSdkVoiceSession(options = {}) {
   sdkVoiceToolCalls.value = [];
   _usingLegacyFallback = false;
   _lastAutoBargeInAt = 0;
+  _speechStartedAt = 0;
+  if (_responseClearTimer) { clearTimeout(_responseClearTimer); _responseClearTimer = null; }
   _resetTranscriptPersistenceState();
 
   try {
@@ -1367,6 +1702,8 @@ export function stopSdkVoiceSession() {
   _sdkExplicitStop = true;
   emit("session-ending", { sessionId: sdkVoiceSessionId.value });
   _flushPendingTranscriptBuffers();
+  _stopBrowserTranscription();
+  _stopBackgroundProgressPolling();
   if (_geminiRecorder) {
     try { _geminiRecorder.stop(); } catch { /* ignore */ }
     _geminiRecorder = null;
@@ -1400,6 +1737,8 @@ export function stopSdkVoiceSession() {
 
   clearInterval(_durationTimer);
   _durationTimer = null;
+  if (_responseClearTimer) { clearTimeout(_responseClearTimer); _responseClearTimer = null; }
+  _speechStartedAt = 0;
 
   sdkVoiceState.value = "idle";
   sdkVoiceTranscript.value = "";