bosun 0.40.21 → 0.41.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +8 -0
- package/README.md +20 -0
- package/agent/agent-custom-tools.mjs +23 -5
- package/agent/agent-event-bus.mjs +248 -6
- package/agent/agent-pool.mjs +131 -30
- package/agent/agent-work-analyzer.mjs +8 -16
- package/agent/primary-agent.mjs +81 -7
- package/agent/retry-queue.mjs +164 -0
- package/bench/swebench/bosun-swebench.mjs +5 -0
- package/bosun.config.example.json +25 -0
- package/bosun.schema.json +825 -183
- package/cli.mjs +267 -8
- package/config/config-doctor.mjs +51 -2
- package/config/config.mjs +232 -5
- package/github/github-auth-manager.mjs +70 -19
- package/infra/library-manager.mjs +894 -60
- package/infra/monitor.mjs +701 -69
- package/infra/runtime-accumulator.mjs +376 -84
- package/infra/session-tracker.mjs +95 -28
- package/infra/test-runtime.mjs +267 -0
- package/lib/codebase-audit.mjs +133 -18
- package/package.json +30 -8
- package/server/setup-web-server.mjs +29 -1
- package/server/ui-server.mjs +1571 -49
- package/setup.mjs +27 -24
- package/shell/codex-shell.mjs +34 -3
- package/shell/copilot-shell.mjs +50 -8
- package/task/msg-hub.mjs +193 -0
- package/task/pipeline.mjs +544 -0
- package/task/task-claims.mjs +6 -10
- package/task/task-cli.mjs +38 -2
- package/task/task-executor-pipeline.mjs +143 -0
- package/task/task-executor.mjs +36 -27
- package/telegram/get-telegram-chat-id.mjs +57 -47
- package/ui/components/chat-view.js +18 -1
- package/ui/components/workspace-switcher.js +321 -9
- package/ui/demo-defaults.js +17830 -10433
- package/ui/demo.html +9 -1
- package/ui/modules/router.js +1 -1
- package/ui/modules/settings-schema.js +2 -0
- package/ui/modules/state.js +54 -57
- package/ui/modules/voice-client-sdk.js +376 -37
- package/ui/modules/voice-client.js +173 -33
- package/ui/setup.html +68 -2
- package/ui/styles/components.css +571 -1
- package/ui/styles.css +201 -1
- package/ui/tabs/dashboard.js +74 -0
- package/ui/tabs/library.js +410 -55
- package/ui/tabs/logs.js +10 -0
- package/ui/tabs/settings.js +178 -99
- package/ui/tabs/tasks.js +1083 -507
- package/ui/tabs/telemetry.js +34 -0
- package/ui/tabs/workflow-canvas-utils.mjs +38 -1
- package/ui/tabs/workflows.js +1275 -402
- package/voice/voice-agents-sdk.mjs +2 -2
- package/voice/voice-relay.mjs +28 -20
- package/workflow/declarative-workflows.mjs +145 -0
- package/workflow/msg-hub.mjs +237 -0
- package/workflow/pipeline-workflows.mjs +287 -0
- package/workflow/pipeline.mjs +828 -315
- package/workflow/project-detection.mjs +559 -0
- package/workflow/workflow-cli.mjs +128 -0
- package/workflow/workflow-contract.mjs +433 -232
- package/workflow/workflow-engine.mjs +510 -47
- package/workflow/workflow-nodes/custom-loader.mjs +251 -0
- package/workflow/workflow-nodes.mjs +2024 -184
- package/workflow/workflow-templates.mjs +118 -24
- package/workflow-templates/agents.mjs +20 -20
- package/workflow-templates/bosun-native.mjs +212 -2
- package/workflow-templates/code-quality.mjs +20 -14
- package/workflow-templates/continuation-loop.mjs +339 -0
- package/workflow-templates/github.mjs +516 -40
- package/workflow-templates/planning.mjs +446 -17
- package/workflow-templates/reliability.mjs +65 -12
- package/workflow-templates/task-batch.mjs +27 -10
- package/workflow-templates/task-execution.mjs +752 -0
- package/workflow-templates/task-lifecycle.mjs +117 -14
- package/workspace/context-cache.mjs +66 -18
- package/workspace/workspace-manager.mjs +153 -1
- package/workflow-templates/issue-continuation.mjs +0 -243

package/ui/modules/voice-client-sdk.js (+376 -37) — removed lines are truncated in the registry's diff view:

```diff
@@ -39,9 +39,76 @@ export const isSdkVoiceActive = computed(() =>
   sdkVoiceState.value !== "idle" && sdkVoiceState.value !== "error"
 );
 
-//
-//
-const ENABLE_USER_TRANSCRIPT =
+// User transcript is always enabled — transcription is surfaced from the API's
+// input_audio_transcription feature (primary) or browser SpeechRecognition (backup).
+const ENABLE_USER_TRANSCRIPT = true;
+
+// ── Browser SpeechRecognition (parallel backup for user transcription) ──────
+
+const _BrowserSpeechRecognition = typeof globalThis !== "undefined"
+  ? (globalThis.SpeechRecognition || globalThis.webkitSpeechRecognition)
+  : null;
+let _browserRecognition = null;
+let _browserTranscriptActive = false;
+// When the API-level transcription delivers a user transcript, we prefer it
+// over the browser's; this flag suppresses duplicate browser results.
+let _apiTranscriptDelivered = false;
+
+function _startBrowserTranscription() {
+  if (!_BrowserSpeechRecognition || _browserRecognition) return;
+  try {
+    const recognition = new _BrowserSpeechRecognition();
+    recognition.continuous = true;
+    recognition.interimResults = true;
+    recognition.maxAlternatives = 1;
+    // Attempt to match user's language, fall back to English
+    recognition.lang = navigator?.language || "en-US";
+
+    recognition.onresult = (event) => {
+      // Only use browser transcript when API-level transcription hasn't delivered yet
+      if (_apiTranscriptDelivered) return;
+      let transcript = "";
+      for (let i = event.resultIndex; i < event.results.length; i++) {
+        transcript += event.results[i][0].transcript;
+      }
+      const text = transcript.trim();
+      if (!text) return;
+      sdkVoiceTranscript.value = text;
+      emit("transcript", { text, final: event.results[event.resultIndex]?.isFinal || false, source: "browser" });
+      if (event.results[event.resultIndex]?.isFinal) {
+        _persistTranscriptIfNew("user", text, "browser.speech_recognition.final");
+      }
+    };
+
+    recognition.onerror = (e) => {
+      // Non-fatal: browser recognition may fail on some systems
+      if (e.error !== "no-speech" && e.error !== "aborted") {
+        console.warn("[voice-client-sdk] Browser SpeechRecognition error:", e.error);
+      }
+    };
+
+    recognition.onend = () => {
+      // Auto-restart while session is active
+      if (_browserTranscriptActive && _session) {
+        try { recognition.start(); } catch { /* already running or stopped */ }
+      }
+    };
+
+    recognition.start();
+    _browserRecognition = recognition;
+    _browserTranscriptActive = true;
+  } catch (err) {
+    console.warn("[voice-client-sdk] Browser SpeechRecognition unavailable:", err?.message);
+  }
+}
+
+function _stopBrowserTranscription() {
+  _browserTranscriptActive = false;
+  if (_browserRecognition) {
+    try { _browserRecognition.stop(); } catch { /* ignore */ }
+    _browserRecognition = null;
+  }
+}
 
 // ── Module-scope state ──────────────────────────────────────────────────────
 
```

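The backup transcription path added above rides on the browser's Web Speech API, which silently stops after pauses and must be restarted from `onend` to behave like a continuous stream. A minimal standalone sketch of that keep-alive pattern — `keepAliveTranscription` and `onFinalText` are illustrative names, not bosun APIs:

```js
// Sketch: continuous browser transcription with auto-restart.
// Returns a stop function, or null when the API is unavailable (e.g. Firefox).
function keepAliveTranscription(onFinalText) {
  const SR = globalThis.SpeechRecognition || globalThis.webkitSpeechRecognition;
  if (!SR) return null;

  let active = true;
  const rec = new SR();
  rec.continuous = true;       // keep listening across utterances
  rec.interimResults = true;   // surface partial results as they form
  rec.lang = navigator.language || "en-US";

  rec.onresult = (event) => {
    for (let i = event.resultIndex; i < event.results.length; i++) {
      const result = event.results[i];
      if (result.isFinal) onFinalText(result[0].transcript.trim());
    }
  };
  // The engine ends itself after silence; restart while still wanted.
  rec.onend = () => {
    if (active) { try { rec.start(); } catch { /* already running */ } }
  };

  rec.start();
  return () => { active = false; rec.stop(); };
}
```
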
```diff
@@ -74,7 +141,13 @@ let _toolCompletionAckTimer = null;
 let _assistantBaselineBeforeToolAck = "";
 const _sdkCapturedMicStreams = new Set();
 let _lastAutoBargeInAt = 0;
-const AUTO_BARGE_IN_COOLDOWN_MS =
+const AUTO_BARGE_IN_COOLDOWN_MS = 1200;
+// Minimum speech duration (ms) before an interrupt is allowed — filters keyboard/click noise
+let _speechStartedAt = 0;
+const MIN_SPEECH_DURATION_FOR_INTERRUPT_MS = 400;
+// Delayed response clear — keep response visible in center after turn ends
+let _responseClearTimer = null;
+const RESPONSE_DISPLAY_HOLD_MS = 8000;
 let _traceTurnCounter = 0;
 let _traceCurrentTurnId = null;
 let _traceTurnActive = false;
```

```diff
@@ -108,6 +181,14 @@ function emit(event, data) {
 
 function maybeAutoInterruptSdkResponse(reason = "speech-started") {
   const now = Date.now();
+  // Only interrupt if speech has been ongoing long enough to be real speech
+  // (filters out keyboard clicks, mouse clicks, coughs, etc.)
+  if (_speechStartedAt > 0) {
+    const speechDuration = now - _speechStartedAt;
+    if (speechDuration < MIN_SPEECH_DURATION_FOR_INTERRUPT_MS) {
+      return false;
+    }
+  }
   if (!shouldAutoBargeIn({
     muted: isVoiceMicMuted.value,
     audioActive: Boolean(_session),
```

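Pulled out of the surrounding function, the new gate is a single comparison: an interrupt is only considered once measured speech exceeds a floor, so sub-threshold bursts (key clicks, coughs) never reach the barge-in path. A sketch of the same rule as a pure function — `speechLongEnoughToInterrupt` is a hypothetical name, not in the package:

```js
const MIN_SPEECH_DURATION_FOR_INTERRUPT_MS = 400; // value from the diff above

// True only when speech has run long enough to be treated as intentional.
// A zero/unset start marker defers the decision to the other barge-in checks,
// matching the `if (_speechStartedAt > 0)` guard in the diff.
function speechLongEnoughToInterrupt(speechStartedAt, now = Date.now()) {
  if (speechStartedAt <= 0) return true;
  return now - speechStartedAt >= MIN_SPEECH_DURATION_FOR_INTERRUPT_MS;
}

// speechLongEnoughToInterrupt(Date.now() - 100) → false (a 100 ms click)
// speechLongEnoughToInterrupt(Date.now() - 450) → true  (sustained speech)
```
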
```diff
@@ -378,6 +459,13 @@ function _resetTranscriptPersistenceState() {
   _traceTurnActive = false;
   _traceLlmFirstTokenMarked = false;
   _traceTtsFirstAudioMarked = false;
+  _apiTranscriptDelivered = false;
+  // Clean up tool result injection state
+  for (const timer of _pendingToolResultTimers.values()) {
+    clearTimeout(timer);
+  }
+  _pendingToolResultTimers.clear();
+  _toolResultInjected.clear();
 }
 
 function _flushPendingTranscriptBuffers() {
```

```diff
@@ -391,7 +479,7 @@ function _flushPendingTranscriptBuffers() {
   }
 
   const finalUser = String(_pendingUserTranscriptText || "").trim();
-  if (finalUser
+  if (finalUser) {
     _persistTranscriptIfNew("user", finalUser, "sdk.history_updated.user.flush");
   }
 
```

```diff
@@ -447,25 +535,201 @@ function _markAssistantToolResponseObserved(latestAssistantText = "") {
   }
 }
 
+// ── Robust tool result injection ────────────────────────────────────────────
+// After the SDK processes a tool call, we verify that the model has received
+// the function_call_output. If the model hasn't responded within a short
+// window, we manually inject the result via sendEvent() as a fallback.
+
+let _pendingToolResultTimers = new Map();
+let _toolResultInjected = new Set(); // call IDs that were manually injected
+
+function _ensureToolResultInjected(session, callId, toolName, resultStr) {
+  // Immediately inject the tool result into the model's conversation context.
+  // The SDK's auto-injection is unreliable — the result gets stored in the
+  // session tracker / chat history but doesn't always reach the model's
+  // realtime conversation context, causing the model to say "I'm having
+  // trouble" even though the tool succeeded.
+  const key = String(callId || "");
+  if (!key) return;
+
+  // Truncate large results for voice context
+  const VOICE_TOOL_OUTPUT_MAX = 6000;
+  let output = resultStr || "Done";
+  if (output.length > VOICE_TOOL_OUTPUT_MAX) {
+    output = output.slice(0, VOICE_TOOL_OUTPUT_MAX)
+      + "\n... (truncated for voice — full result available in chat)";
+  }
+
+  // Mark as injected immediately to prevent duplicate injections
+  _toolResultInjected.add(key);
+
+  // Inject NOW — don't wait for SDK auto-injection
+  if (session && typeof session.sendEvent === "function") {
+    try {
+      session.sendEvent({
+        type: "conversation.item.create",
+        item: {
+          type: "function_call_output",
+          call_id: callId,
+          output,
+        },
+      });
+      session.sendEvent({ type: "response.create" });
+      console.info(`[voice-client-sdk] Injected tool result for ${toolName} (${callId})`);
+      return;
+    } catch (err) {
+      console.warn("[voice-client-sdk] sendEvent injection failed:", err?.message);
+    }
+  }
+
+  // Fallback: inject as user-role context message
+  _injectContextMessage(
+    session,
+    `[Tool Result — ${toolName}]\n${output}`,
+  );
+}
+
+/**
+ * Inject a context message directly into the voice agent's conversation.
+ * Used for tool result fallback injection and background progress updates.
+ */
+function _injectContextMessage(session, text) {
+  if (!session || !text) return;
+  const inputText = String(text).trim();
+  if (!inputText) return;
+
+  if (typeof session.sendMessage === "function") {
+    // @openai/agents SDK — sendMessage injects as user text and triggers response
+    session.sendMessage(inputText);
+  } else if (typeof session.sendEvent === "function") {
+    session.sendEvent({
+      type: "conversation.item.create",
+      item: {
+        type: "message",
+        role: "user",
+        content: [{ type: "input_text", text: inputText }],
+      },
+    });
+    session.sendEvent({ type: "response.create" });
+  } else if (session.readyState === WebSocket.OPEN) {
+    // Gemini WebSocket
+    session.send(JSON.stringify({ type: "text.input", text: inputText }));
+  }
+}
+
+// ── Background agent progress tracking ──────────────────────────────────────
+// When a voice tool dispatches a background task/agent, we track it and
+// periodically inject progress updates into the voice conversation so the
+// model stays aware without the user having to ask.
+
+let _backgroundProgressTimer = null;
+let _trackedBackgroundTasks = new Map(); // taskId → { name, startedAt, lastStatus, sessionId }
+
+function _trackBackgroundTask(taskId, info = {}) {
+  const key = String(taskId || "").trim();
+  if (!key) return;
+  _trackedBackgroundTasks.set(key, {
+    name: String(info.name || "background task").trim(),
+    startedAt: Date.now(),
+    lastStatus: "started",
+    lastCheckedAt: 0,
+    sessionId: String(info.sessionId || "").trim() || null,
+    completionInjected: false,
+  });
+  _ensureBackgroundProgressPolling();
+}
+
+function _ensureBackgroundProgressPolling() {
+  if (_backgroundProgressTimer) return;
+  if (_trackedBackgroundTasks.size === 0) return;
+
+  _backgroundProgressTimer = setInterval(async () => {
+    if (!_session || _trackedBackgroundTasks.size === 0) {
+      _stopBackgroundProgressPolling();
+      return;
+    }
+
+    for (const [taskId, task] of _trackedBackgroundTasks) {
+      const now = Date.now();
+      // Don't check more than every 15 seconds
+      if (now - task.lastCheckedAt < 15_000) continue;
+      task.lastCheckedAt = now;
+
+      try {
+        const res = await fetch("/api/voice/tool", {
+          method: "POST",
+          headers: { "Content-Type": "application/json" },
+          body: JSON.stringify({
+            toolName: "poll_background_session",
+            args: { sessionId: task.sessionId || taskId },
+            sessionId: sdkVoiceSessionId.value,
+          }),
+        });
+        const result = await res.json();
+        const statusText = typeof result?.result === "string"
+          ? result.result
+          : JSON.stringify(result?.result || "");
+
+        // Detect completion/failure
+        const isComplete = /complete|finished|done|failed|error/i.test(statusText);
+        const previousStatus = task.lastStatus;
+        task.lastStatus = statusText.slice(0, 200);
+
+        // Only inject if status meaningfully changed or task completed
+        if (isComplete && !task.completionInjected) {
+          task.completionInjected = true;
+          const summary = statusText.length > 500
+            ? statusText.slice(0, 500) + "..."
+            : statusText;
+          _injectContextMessage(
+            _session,
+            `[Background Task Update — ${task.name}]\nStatus: ${summary}\n` +
+            "(You don't need to tell the user about this unless they ask about it.)",
+          );
+          // Remove completed task
+          _trackedBackgroundTasks.delete(taskId);
+        }
+      } catch {
+        // Non-fatal — will retry on next interval
+      }
+    }
+
+    if (_trackedBackgroundTasks.size === 0) {
+      _stopBackgroundProgressPolling();
+    }
+  }, 10_000); // Check every 10 seconds
+}
+
+function _stopBackgroundProgressPolling() {
+  if (_backgroundProgressTimer) {
+    clearInterval(_backgroundProgressTimer);
+    _backgroundProgressTimer = null;
+  }
+  _trackedBackgroundTasks.clear();
+  // Clean up pending tool result timers
+  for (const timer of _pendingToolResultTimers.values()) {
+    clearTimeout(timer);
+  }
+  _pendingToolResultTimers.clear();
+  _toolResultInjected.clear();
+}
+
 function _scheduleUserTranscriptFinalize(text) {
   const value = String(text || "").trim();
   if (!value) return;
   _pendingUserTranscriptText = value;
+  // API-level transcript arrived — prefer it over browser SpeechRecognition
+  _apiTranscriptDelivered = true;
   if (_pendingUserTranscriptTimer) clearTimeout(_pendingUserTranscriptTimer);
   _pendingUserTranscriptTimer = setTimeout(() => {
     _pendingUserTranscriptTimer = null;
     const finalText = String(_pendingUserTranscriptText || "").trim();
     if (!finalText) return;
-
-
-
-
-
-    sdkVoiceTranscript.value = "";
-    // Skip persisting user transcript — ASR often hallucinates wrong
-    // languages from short fragments; the model still receives the raw
-    // audio correctly so nothing is lost.
-  }
+    sdkVoiceTranscript.value = finalText;
+    emit("transcript", { text: finalText, final: true, source: "api" });
+    _persistTranscriptIfNew("user", finalText, "sdk.history_updated.user.final");
+    // Reset for next utterance
+    _apiTranscriptDelivered = false;
   }, 350);
 }
 
```

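The manual injection path speaks the OpenAI Realtime API client-event protocol directly: a `conversation.item.create` event carrying a `function_call_output` item ties the result to its pending call via `call_id`, and a follow-up `response.create` asks the model to speak again. A minimal sketch of the two payloads over a raw WebSocket — `ws` and `callId` are assumed to come from an already-established Realtime session:

```js
// Sketch: hand-rolled equivalent of the fallback injection, assuming an open
// Realtime API WebSocket `ws` and the `callId` of a pending function call.
function injectFunctionResult(ws, callId, output) {
  // 1. Attach the tool output to the pending function call.
  ws.send(JSON.stringify({
    type: "conversation.item.create",
    item: {
      type: "function_call_output",
      call_id: callId,
      output: typeof output === "string" ? output : JSON.stringify(output),
    },
  }));
  // 2. The model will not speak on its own after an injected item —
  //    explicitly request a new response.
  ws.send(JSON.stringify({ type: "response.create" }));
}
```
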
```diff
@@ -478,7 +742,6 @@ function _scheduleAssistantTranscriptFinalize(text) {
     _pendingAssistantTranscriptTimer = null;
     const finalText = String(_pendingAssistantTranscriptText || "").trim();
     if (!finalText) return;
-    sdkVoiceState.value = "thinking";
     _sdkTraceMarkLlmFirstToken("llm_first_token", { reason: "assistant_transcript.final" });
     _sdkTraceMarkTtsFirstAudio("tts_first_audio", { reason: "assistant_transcript.final" });
     sdkVoiceResponse.value = finalText;
```

```diff
@@ -486,10 +749,39 @@ function _scheduleAssistantTranscriptFinalize(text) {
     _persistTranscriptIfNew("assistant", finalText, "sdk.history_updated.assistant.final");
     _markAssistantToolResponseObserved(finalText);
     _sdkTraceEndTurn("turn_end", { reason: "assistant_transcript.final" });
+    // Keep response visible in center — schedule delayed clear instead of
+    // immediately setting sdkVoiceResponse to "". The response will persist
+    // until the user starts speaking or the hold timer expires.
+    _scheduleResponseClear();
     sdkVoiceState.value = "listening";
   }, 700);
 }
 
+/**
+ * Schedule a delayed clear of the assistant response from the center display.
+ * The response stays visible for RESPONSE_DISPLAY_HOLD_MS or until the user
+ * starts speaking (whichever comes first).
+ */
+function _scheduleResponseClear() {
+  if (_responseClearTimer) clearTimeout(_responseClearTimer);
+  _responseClearTimer = setTimeout(() => {
+    _responseClearTimer = null;
+    sdkVoiceResponse.value = "";
+  }, RESPONSE_DISPLAY_HOLD_MS);
+}
+
+/**
+ * Immediately clear the response display — called when the user starts
+ * speaking so the center area shows their new transcript.
+ */
+function _clearResponseForNewTurn() {
+  if (_responseClearTimer) {
+    clearTimeout(_responseClearTimer);
+    _responseClearTimer = null;
+  }
+  sdkVoiceResponse.value = "";
+}
+
 // ── OpenAI/Azure Agents SDK Session ─────────────────────────────────────────
 
 /**
```

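The hold-then-clear change above is a small debounced-reset pattern: arm a timer when the turn ends, cancel it if the user speaks first. A generic version — `makeDeferredClear` is a hypothetical helper, not part of bosun:

```js
// Sketch mirroring _scheduleResponseClear/_clearResponseForNewTurn.
function makeDeferredClear(clearFn, holdMs) {
  let timer = null;
  return {
    // Arm (or re-arm) the delayed clear after a turn ends.
    schedule() {
      if (timer) clearTimeout(timer);
      timer = setTimeout(() => { timer = null; clearFn(); }, holdMs);
    },
    // Clear immediately (user started speaking) and cancel the pending timer.
    clearNow() {
      if (timer) { clearTimeout(timer); timer = null; }
      clearFn();
    },
  };
}

// Usage: const hold = makeDeferredClear(() => { sdkVoiceResponse.value = ""; }, 8000);
```
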
```diff
@@ -568,7 +860,22 @@ async function startAgentsSdkSession(config, options = {}) {
       }
       // SDK expects string results — ensure we always return a string.
       const output = result.result ?? result.output ?? "Done";
-
+      const outputStr = typeof output === "string" ? output : JSON.stringify(output);
+
+      // Track background tasks for progress polling
+      const BACKGROUND_TOOLS = new Set([
+        "delegate_to_agent", "execute_workflow", "create_task",
+      ]);
+      if (BACKGROUND_TOOLS.has(t.name)) {
+        const taskId = result?.taskId || result?.sessionId
+          || args?.sessionId || `bg-${Date.now()}`;
+        _trackBackgroundTask(taskId, {
+          name: `${t.name}: ${String(args?.prompt || args?.title || args?.workflowId || "").slice(0, 60)}`,
+          sessionId: result?.sessionId || args?.sessionId || taskId,
+        });
+      }
+
+      return outputStr;
     };
 
     // The @openai/agents SDK calls invokeFunctionTool → tool.invoke(runContext, input, details)
```

```diff
@@ -618,22 +925,22 @@
     type: turnDetection,
     ...(turnDetection === "server_vad"
       ? {
-          threshold: 0.
-          prefix_padding_ms:
-          silence_duration_ms:
+          threshold: 0.82,
+          prefix_padding_ms: 500,
+          silence_duration_ms: 1600,
           create_response: true,
-          interrupt_response:
+          interrupt_response: false,
           createResponse: true,
-          interruptResponse:
+          interruptResponse: false,
         }
       : {}),
     ...(turnDetection === "semantic_vad"
       ? {
-          eagerness: "
+          eagerness: "low",
           create_response: true,
-          interrupt_response:
+          interrupt_response: false,
           createResponse: true,
-          interruptResponse:
+          interruptResponse: false,
         }
       : {}),
   };
```

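The retuned values above live in the Realtime API's `turn_detection` session config. Raising `threshold` and `silence_duration_ms` makes the server VAD slower to trigger, and `interrupt_response: false` stops the server from cancelling assistant audio on its own, leaving barge-in to the client-side duration gate. A sketch of the resulting `session.update` payload, with values copied from the diff (the snake_case fields are the documented Realtime API ones; the camelCase duplicates in the diff cover the SDK's own option names):

```js
// Sketch: session.update carrying the tuned server-VAD settings.
const sessionUpdate = {
  type: "session.update",
  session: {
    turn_detection: {
      type: "server_vad",
      threshold: 0.82,           // less sensitive — ignore quiet background audio
      prefix_padding_ms: 500,    // audio retained from just before speech onset
      silence_duration_ms: 1600, // wait longer before declaring end of turn
      create_response: true,     // still auto-respond when the turn ends
      interrupt_response: false, // client code decides barge-in explicitly
    },
  },
};
// e.g. ws.send(JSON.stringify(sessionUpdate)) on an open Realtime API socket
```
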
```diff
@@ -704,10 +1011,23 @@
   session.on("transport_event", (event) => {
     const eventType = event?.type || "";
     if (eventType === "input_audio_buffer.speech_started") {
+      _speechStartedAt = Date.now();
       _sdkTraceBeginTurn("turn_start", { reason: "speech_started" });
-
+      // Clear any lingering response so the center shows user's new transcript
+      _clearResponseForNewTurn();
+      // Don't interrupt immediately — the barge-in will check speech duration
+      // in maybeAutoInterruptSdkResponse when called from the debounced path.
+      // Only attempt barge-in after MIN_SPEECH_DURATION_FOR_INTERRUPT_MS.
+      setTimeout(() => {
+        if (_speechStartedAt > 0 && (Date.now() - _speechStartedAt) >= MIN_SPEECH_DURATION_FOR_INTERRUPT_MS) {
+          maybeAutoInterruptSdkResponse("speech-started-confirmed");
+        }
+      }, MIN_SPEECH_DURATION_FOR_INTERRUPT_MS);
       emit("speech-started", {});
     }
+    if (eventType === "input_audio_buffer.speech_stopped") {
+      _speechStartedAt = 0;
+    }
   });
 
   // ── Tool call events ──
```

```diff
@@ -731,13 +1051,22 @@
   session.on("agent_tool_end", (_ctx, _agent, tool, result, details) => {
     const toolCall = details?.toolCall || {};
     const callId = toolCall?.callId || toolCall?.call_id;
-    const
-
-
-
+    const name = tool?.name || "unknown";
+    const resultStr = typeof result === "string" ? result : JSON.stringify(result ?? "");
+    const resultPreview = resultStr.length > 120
+      ? resultStr.slice(0, 120) + "..."
+      : resultStr;
+    console.info(`[voice-client-sdk] tool call done: ${name} (${callId}) → ${resultPreview}`);
     sdkVoiceToolCalls.value = sdkVoiceToolCalls.value.map((tc) =>
       tc.callId === callId ? { ...tc, status: "complete" } : tc
     );
+
+    // ── Robust tool result injection ──
+    // The SDK should auto-inject the function_call_output, but in case it
+    // doesn't (race condition, SDK bug, etc.), we verify via a short delay
+    // and manually inject if the model hasn't acknowledged the result.
+    _ensureToolResultInjected(session, callId, name, resultStr);
+
     // Return to listening once all tool calls have resolved.
     const stillRunning = sdkVoiceToolCalls.value.some((tc) => tc.status === "running");
     if (!stillRunning && sdkVoiceState.value === "thinking") {
```

```diff
@@ -746,7 +1075,7 @@
     if (!stillRunning) {
       _markToolCompletionPending();
     }
-    emit("tool-call-complete", { callId, name
+    emit("tool-call-complete", { callId, name, result });
   });
 
   session.on("error", (err) => {
```

```diff
@@ -870,6 +1199,10 @@
   sdkVoiceSessionId.value = _callContext.sessionId || `voice-sdk-${Date.now()}`;
   startDurationTimer();
 
+  // Start browser SpeechRecognition as parallel/backup transcription source
+  _apiTranscriptDelivered = false;
+  _startBrowserTranscription();
+
   emit("connected", {
     provider: tokenData.provider,
     sessionId: sdkVoiceSessionId.value,
```

```diff
@@ -926,6 +1259,10 @@ async function startGeminiLiveSession(config, options = {}) {
   sdkVoiceSessionId.value = _callContext.sessionId || `voice-gemini-${Date.now()}`;
   startDurationTimer();
 
+  // Start browser SpeechRecognition as parallel/backup transcription
+  _apiTranscriptDelivered = false;
+  _startBrowserTranscription();
+
   // Start mic capture and stream to server
   startGeminiMicCapture(ws).catch((err) => {
     console.error("[voice-client-sdk] Gemini mic capture failed:", err);
```

```diff
@@ -1133,12 +1470,8 @@ function handleGeminiServerEvent(msg) {
 
   switch (type) {
     case "transcript.user":
-
-
-      emit("transcript", { text: msg.text, final: true });
-      } else {
-        sdkVoiceTranscript.value = "";
-      }
+      sdkVoiceTranscript.value = msg.text || "";
+      emit("transcript", { text: msg.text, final: true, source: "api" });
       _persistTranscriptIfNew("user", msg.text, "gemini.user_transcript");
       break;
 
```

```diff
@@ -1289,6 +1622,8 @@ export async function startSdkVoiceSession(options = {}) {
   sdkVoiceToolCalls.value = [];
   _usingLegacyFallback = false;
   _lastAutoBargeInAt = 0;
+  _speechStartedAt = 0;
+  if (_responseClearTimer) { clearTimeout(_responseClearTimer); _responseClearTimer = null; }
   _resetTranscriptPersistenceState();
 
   try {
```

```diff
@@ -1367,6 +1702,8 @@ export function stopSdkVoiceSession() {
   _sdkExplicitStop = true;
   emit("session-ending", { sessionId: sdkVoiceSessionId.value });
   _flushPendingTranscriptBuffers();
+  _stopBrowserTranscription();
+  _stopBackgroundProgressPolling();
   if (_geminiRecorder) {
     try { _geminiRecorder.stop(); } catch { /* ignore */ }
     _geminiRecorder = null;
```

```diff
@@ -1400,6 +1737,8 @@ export function stopSdkVoiceSession() {
 
   clearInterval(_durationTimer);
   _durationTimer = null;
+  if (_responseClearTimer) { clearTimeout(_responseClearTimer); _responseClearTimer = null; }
+  _speechStartedAt = 0;
 
   sdkVoiceState.value = "idle";
   sdkVoiceTranscript.value = "";
```