bosun 0.37.0 → 0.37.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.env.example +4 -1
  2. package/agent-tool-config.mjs +338 -0
  3. package/bosun-skills.mjs +59 -4
  4. package/bosun.schema.json +1 -1
  5. package/desktop/launch.mjs +18 -0
  6. package/desktop/main.mjs +52 -13
  7. package/fleet-coordinator.mjs +34 -1
  8. package/kanban-adapter.mjs +30 -3
  9. package/library-manager.mjs +66 -0
  10. package/maintenance.mjs +30 -5
  11. package/monitor.mjs +56 -0
  12. package/package.json +4 -1
  13. package/setup-web-server.mjs +73 -12
  14. package/setup.mjs +3 -3
  15. package/ui/app.js +40 -3
  16. package/ui/components/session-list.js +25 -7
  17. package/ui/components/workspace-switcher.js +48 -1
  18. package/ui/demo.html +176 -0
  19. package/ui/modules/mic-track-registry.js +83 -0
  20. package/ui/modules/settings-schema.js +4 -1
  21. package/ui/modules/state.js +25 -0
  22. package/ui/modules/streaming.js +1 -1
  23. package/ui/modules/voice-barge-in.js +27 -0
  24. package/ui/modules/voice-client-sdk.js +268 -42
  25. package/ui/modules/voice-client.js +665 -61
  26. package/ui/modules/voice-overlay.js +829 -47
  27. package/ui/setup.html +151 -9
  28. package/ui/styles.css +258 -0
  29. package/ui/tabs/chat.js +11 -0
  30. package/ui/tabs/library.js +890 -15
  31. package/ui/tabs/settings.js +51 -11
  32. package/ui/tabs/telemetry.js +327 -105
  33. package/ui/tabs/workflows.js +86 -0
  34. package/ui-server.mjs +1201 -107
  35. package/voice-action-dispatcher.mjs +81 -0
  36. package/voice-agents-sdk.mjs +2 -2
  37. package/voice-relay.mjs +131 -14
  38. package/voice-tools.mjs +475 -9
  39. package/workflow-engine.mjs +54 -0
  40. package/workflow-nodes.mjs +177 -28
  41. package/workflow-templates/github.mjs +205 -94
  42. package/workflow-templates/task-batch.mjs +247 -0
  43. package/workflow-templates.mjs +15 -0
@@ -746,6 +746,83 @@ registerAction("workflow.run_get", async (params) => {
746
746
  return run;
747
747
  });
748
748
 
749
+ registerAction("workflow.save", async (params) => {
750
+ const wfEngineMod = await getWorkflowEngineModule();
751
+ const engine = typeof wfEngineMod.getWorkflowEngine === "function"
752
+ ? wfEngineMod.getWorkflowEngine()
753
+ : null;
754
+ if (!engine?.save) throw new Error("Workflow engine is unavailable");
755
+ const def = params?.definition;
756
+ if (!def || typeof def !== "object" || Array.isArray(def)) {
757
+ throw new Error("definition object is required");
758
+ }
759
+ const workflowId = String(params.workflowId || def.id || "").trim();
760
+ const payload = { ...def };
761
+ if (workflowId) payload.id = workflowId;
762
+ if (!Array.isArray(payload.nodes)) payload.nodes = [];
763
+ if (!Array.isArray(payload.edges)) payload.edges = [];
764
+ if (!payload.name) payload.name = payload.id || "Voice Workflow";
765
+ const saved = engine.save(payload);
766
+ return {
767
+ id: saved.id,
768
+ name: saved.name || saved.id,
769
+ enabled: saved.enabled !== false,
770
+ nodeCount: Array.isArray(saved.nodes) ? saved.nodes.length : 0,
771
+ edgeCount: Array.isArray(saved.edges) ? saved.edges.length : 0,
772
+ };
773
+ });
774
+
775
+ registerAction("workflow.delete", async (params) => {
776
+ const wfEngineMod = await getWorkflowEngineModule();
777
+ const engine = typeof wfEngineMod.getWorkflowEngine === "function"
778
+ ? wfEngineMod.getWorkflowEngine()
779
+ : null;
780
+ if (!engine?.delete) throw new Error("Workflow engine is unavailable");
781
+ const workflowId = String(params.workflowId || params.id || "").trim();
782
+ if (!workflowId) throw new Error("workflowId is required");
783
+ const deleted = await engine.delete(workflowId);
784
+ return { ok: Boolean(deleted), workflowId };
785
+ });
786
+
787
+ registerAction("workflow.execute", async (params) => {
788
+ const wfEngineMod = await getWorkflowEngineModule();
789
+ const engine = typeof wfEngineMod.getWorkflowEngine === "function"
790
+ ? wfEngineMod.getWorkflowEngine()
791
+ : null;
792
+ if (!engine?.execute) throw new Error("Workflow engine is unavailable");
793
+ const workflowId = String(params.workflowId || params.id || "").trim();
794
+ if (!workflowId) throw new Error("workflowId is required");
795
+ const input = params?.input && typeof params.input === "object" ? params.input : {};
796
+ const force = params?.force === true;
797
+ const ctx = await engine.execute(workflowId, input, { force });
798
+ return {
799
+ runId: ctx?.id || null,
800
+ workflowId,
801
+ status: Array.isArray(ctx?.errors) && ctx.errors.length > 0 ? "failed" : "completed",
802
+ errorCount: Array.isArray(ctx?.errors) ? ctx.errors.length : 0,
803
+ };
804
+ });
805
+
806
+ registerAction("workflow.retry", async (params) => {
807
+ const wfEngineMod = await getWorkflowEngineModule();
808
+ const engine = typeof wfEngineMod.getWorkflowEngine === "function"
809
+ ? wfEngineMod.getWorkflowEngine()
810
+ : null;
811
+ if (!engine?.retryRun) throw new Error("Workflow retry is unavailable");
812
+ const runId = String(params.runId || params.id || "").trim();
813
+ if (!runId) throw new Error("runId is required");
814
+ const mode = String(params.mode || "from_failed").trim().toLowerCase() === "from_scratch"
815
+ ? "from_scratch"
816
+ : "from_failed";
817
+ const currentRun = engine?.getRunDetail ? engine.getRunDetail(runId) : null;
818
+ if (!currentRun) throw new Error(`Workflow run "${runId}" not found`);
819
+ const currentStatus = String(currentRun?.status || "").trim().toLowerCase();
820
+ if (mode === "from_failed" && currentStatus !== "failed") {
821
+ throw new Error(`retry mode "from_failed" requires a failed run (current=${currentRun?.status || "unknown"})`);
822
+ }
823
+ return engine.retryRun(runId, { mode });
824
+ });
825
+
749
826
  // ── Skill/prompt actions ────────────────────────────────────────────────────
750
827
 
751
828
  registerAction("skill.list", async () => {
@@ -860,6 +937,10 @@ export function getActionManifest() {
860
937
  { action: "workflow.saved_list", description: "List installed workflow definitions. params: {}" },
861
938
  { action: "workflow.runs", description: "List workflow run history. params: { workflowId?, status?, limit? }" },
862
939
  { action: "workflow.run_get", description: "Get a workflow run detail. params: { runId }" },
940
+ { action: "workflow.save", description: "Create/update a workflow definition. params: { workflowId?, definition }" },
941
+ { action: "workflow.delete", description: "Delete a workflow definition. params: { workflowId }" },
942
+ { action: "workflow.execute", description: "Execute a workflow now. params: { workflowId, input?, force? }" },
943
+ { action: "workflow.retry", description: "Retry a workflow run. params: { runId, mode?: from_failed|from_scratch }" },
863
944
  { action: "skill.list", description: "List available skills. params: {}" },
864
945
  { action: "prompt.list", description: "List agent prompt definitions. params: {}" },
865
946
  { action: "prompt.get", description: "Get a prompt template. params: { key }" },
@@ -275,7 +275,7 @@ export async function createRealtimeSession(agent, provider, config = {}, option
275
275
  ).trim() || "alloy";
276
276
 
277
277
  const turnDetection = String(
278
- options.turnDetection || config.turnDetection || "server_vad",
278
+ options.turnDetection || config.turnDetection || "semantic_vad",
279
279
  ).trim();
280
280
 
281
281
  const sessionConfig = {
@@ -668,7 +668,7 @@ export async function getClientSdkConfig(voiceConfig = {}) {
668
668
  tier: availability.info?.tier || 2,
669
669
  model: configuredModel,
670
670
  voiceId: voiceConfig.voiceId || "alloy",
671
- turnDetection: voiceConfig.turnDetection || "server_vad",
671
+ turnDetection: voiceConfig.turnDetection || "semantic_vad",
672
672
  fallbackReason,
673
673
  };
674
674
  }
package/voice-relay.mjs CHANGED
@@ -26,7 +26,7 @@ const OPENAI_REALTIME_MODEL = "gpt-realtime-1.5";
26
26
  const OPENAI_AUDIO_RESPONSES_MODEL = "gpt-audio-1.5";
27
27
  const OPENAI_RESPONSES_URL = "https://api.openai.com/v1/responses";
28
28
  const OPENAI_DEFAULT_VISION_MODEL = "gpt-4.1-nano";
29
- const REALTIME_TRANSCRIBE_MODEL = "gpt-4o-transcribe";
29
+ const DEFAULT_TRANSCRIBE_MODEL = "gpt-4o-transcribe";
30
30
 
31
31
  const AZURE_API_VERSION = "2025-04-01-preview";
32
32
 
@@ -73,6 +73,8 @@ function buildOpenAIRealtimeWebRtcUrl(model, overrideBase = "") {
73
73
 
74
74
  // GA models (gpt-realtime, gpt-realtime-1.5, gpt-realtime-mini, etc.) use /openai/v1/ paths.
75
75
  // Preview models (for example gpt-4o-realtime-preview-*) use legacy /openai/realtimeapi/ paths.
76
+ // NOTE: Azure AI Foundry "Global Standard" deployments may only support preview paths
77
+ // even for GA model names. We try GA first. If it 404s the caller falls back to preview.
76
78
  function isAzureGaProtocol(deployment) {
77
79
  const d = String(deployment || "").toLowerCase().trim();
78
80
  return d.startsWith("gpt-realtime") && !d.startsWith("gpt-4o-realtime");
@@ -93,6 +95,13 @@ function normalizeAzureRealtimeDeployment(rawDeployment) {
93
95
  return deployment;
94
96
  }
95
97
 
98
+ function parseOptionalBoolean(rawValue) {
99
+ if (rawValue == null) return null;
100
+ const normalized = String(rawValue).trim().toLowerCase();
101
+ if (!normalized) return null;
102
+ return !["0", "false", "no", "off"].includes(normalized);
103
+ }
104
+
96
105
  function isOpenAIAudioResponsesModel(rawModel) {
97
106
  const model = String(rawModel || "").trim().toLowerCase();
98
107
  return /^gpt-audio/.test(model);
@@ -305,18 +314,42 @@ function sanitizeVoiceCallContext(context = {}) {
305
314
  const rawExecutor = String(context?.executor || "").trim().toLowerCase();
306
315
  const rawMode = String(context?.mode || "").trim().toLowerCase();
307
316
  const rawModel = String(context?.model || "").trim();
317
+ const rawVoiceAgentId = String(context?.voiceAgentId || "").trim();
318
+ const rawVoiceAgentName = String(context?.voiceAgentName || "").trim();
319
+ const rawVoiceAgentInstructions = String(context?.voiceAgentInstructions || "").trim();
320
+ const rawVoiceToolCapabilityPrompt = String(context?.voiceToolCapabilityPrompt || "").trim();
321
+ const rawVoiceAgentSkills = Array.isArray(context?.voiceAgentSkills)
322
+ ? context.voiceAgentSkills.map((s) => String(s || "").trim()).filter(Boolean)
323
+ : [];
324
+ const rawEnabledMcpServers = Array.isArray(context?.enabledMcpServers)
325
+ ? context.enabledMcpServers.map((s) => String(s || "").trim()).filter(Boolean)
326
+ : [];
308
327
 
309
328
  return {
310
329
  sessionId: rawSessionId || null,
311
330
  executor: VALID_EXECUTORS.has(rawExecutor) ? rawExecutor : null,
312
331
  mode: VALID_AGENT_MODES.has(rawMode) ? rawMode : null,
313
332
  model: rawModel || null,
333
+ voiceAgentId: rawVoiceAgentId || null,
334
+ voiceAgentName: rawVoiceAgentName || null,
335
+ voiceAgentInstructions: rawVoiceAgentInstructions || null,
336
+ voiceToolCapabilityPrompt: rawVoiceToolCapabilityPrompt || null,
337
+ voiceAgentSkills: rawVoiceAgentSkills,
338
+ enabledMcpServers: rawEnabledMcpServers,
314
339
  };
315
340
  }
316
341
 
317
342
  async function buildSessionScopedInstructions(baseInstructions, callContext = {}) {
318
343
  const context = sanitizeVoiceCallContext(callContext);
319
- if (!context.sessionId && !context.executor && !context.mode && !context.model) {
344
+ if (
345
+ !context.sessionId
346
+ && !context.executor
347
+ && !context.mode
348
+ && !context.model
349
+ && !context.voiceAgentId
350
+ && !context.voiceAgentInstructions
351
+ && !context.voiceToolCapabilityPrompt
352
+ ) {
320
353
  return baseInstructions;
321
354
  }
322
355
 
@@ -381,6 +414,22 @@ async function buildSessionScopedInstructions(baseInstructions, callContext = {}
381
414
  "",
382
415
  "## Bosun Voice Call Context",
383
416
  `Active chat session id: ${context.sessionId || "none"}.`,
417
+ context.voiceAgentId
418
+ ? `Active voice agent id: ${context.voiceAgentId}.`
419
+ : "Active voice agent id: default.",
420
+ context.voiceAgentName
421
+ ? `Active voice agent name: ${context.voiceAgentName}.`
422
+ : "",
423
+ context.voiceAgentInstructions
424
+ ? `Voice agent instruction emphasis: ${context.voiceAgentInstructions}`
425
+ : "",
426
+ context.voiceToolCapabilityPrompt || "",
427
+ context.enabledMcpServers?.length
428
+ ? `Enabled MCP servers for this session: ${context.enabledMcpServers.join(", ")}.`
429
+ : "",
430
+ context.voiceAgentSkills?.length
431
+ ? `Voice agent skills: ${context.voiceAgentSkills.join(", ")}.`
432
+ : "",
384
433
  context.executor
385
434
  ? `Preferred executor for delegated work: ${context.executor}.`
386
435
  : "Preferred executor for delegated work: use configured default.",
@@ -783,6 +832,12 @@ export function getVoiceConfig(forceReload = false) {
783
832
  azureDeployment: String(ep.deployment || ep.azureDeployment || "").trim() || null,
784
833
  voiceId: String(ep.voiceId || "").trim() || null,
785
834
  visionModel: String(ep.visionModel || "").trim() || null,
835
+ transcriptionModel: String(ep.transcriptionModel || "").trim() || null,
836
+ // Azure defaults to transcription OFF unless explicitly enabled because
837
+ // item-level ASR failures can spam and destabilize long-running calls.
838
+ transcriptionEnabled: String(ep.provider || "").toLowerCase() === "azure"
839
+ ? (ep.transcriptionEnabled === true)
840
+ : (ep.transcriptionEnabled !== false),
786
841
  role: String(ep.role || "primary").trim() || "primary",
787
842
  weight: typeof ep.weight === "number" ? ep.weight : 100,
788
843
  name: String(ep.name || "").trim() || null,
@@ -852,7 +907,7 @@ export function getVoiceConfig(forceReload = false) {
852
907
  const model = String(voice.model || process.env.VOICE_MODEL || defaultModel).trim() || defaultModel;
853
908
  const voiceId = voice.voiceId || process.env.VOICE_ID || "alloy";
854
909
  const turnDetection =
855
- voice.turnDetection || process.env.VOICE_TURN_DETECTION || "server_vad";
910
+ voice.turnDetection || process.env.VOICE_TURN_DETECTION || "semantic_vad";
856
911
  const defaultVisionModel =
857
912
  provider === "claude"
858
913
  ? CLAUDE_DEFAULT_VISION_MODEL
@@ -861,6 +916,19 @@ export function getVoiceConfig(forceReload = false) {
861
916
  : OPENAI_DEFAULT_VISION_MODEL;
862
917
  const visionModel =
863
918
  voice.visionModel || process.env.VOICE_VISION_MODEL || defaultVisionModel;
919
+ const transcriptionModel =
920
+ voice.transcriptionModel || process.env.VOICE_TRANSCRIPTION_MODEL || DEFAULT_TRANSCRIBE_MODEL;
921
+ const transcriptionEnabledRaw =
922
+ voice.transcriptionEnabled ?? process.env.VOICE_TRANSCRIPTION_ENABLED;
923
+ const transcriptionEnabled =
924
+ transcriptionEnabledRaw == null
925
+ ? true
926
+ : !["0", "false", "no", "off"].includes(
927
+ String(transcriptionEnabledRaw).trim().toLowerCase(),
928
+ );
929
+ const azureTranscriptionEnabled = parseOptionalBoolean(
930
+ voice.azureTranscriptionEnabled ?? process.env.VOICE_AZURE_TRANSCRIPTION_ENABLED,
931
+ );
864
932
  const fallbackMode =
865
933
  voice.fallbackMode || process.env.VOICE_FALLBACK_MODE || "browser";
866
934
  const delegateExecutor =
@@ -906,6 +974,9 @@ For complex operations like writing code or creating PRs, delegate to the approp
906
974
  turnDetection,
907
975
  visionModel,
908
976
  instructions,
977
+ transcriptionModel,
978
+ transcriptionEnabled,
979
+ azureTranscriptionEnabled,
909
980
  fallbackMode,
910
981
  delegateExecutor,
911
982
  enabled,
@@ -1120,6 +1191,13 @@ async function createOpenAIEphemeralToken(cfg, toolDefinitions = [], callContext
1120
1191
  const instructions = await buildSessionScopedInstructions(cfg.instructions, context);
1121
1192
  const model = normalizeOpenAIRealtimeModel(candidate?.model || cfg.model || OPENAI_REALTIME_MODEL);
1122
1193
  const voiceId = String(candidate?.voiceId || cfg.voiceId || "alloy").trim() || "alloy";
1194
+ // Per-endpoint transcription overrides
1195
+ const transcriptionModel = String(candidate?.transcriptionModel || "").trim() || cfg.transcriptionModel;
1196
+ const transcriptionEnabled = candidate?.transcriptionEnabled !== undefined
1197
+ ? candidate.transcriptionEnabled !== false
1198
+ : cfg.azureTranscriptionEnabled != null
1199
+ ? cfg.azureTranscriptionEnabled !== false
1200
+ : false;
1123
1201
 
1124
1202
  const sessionConfig = {
1125
1203
  model,
@@ -1132,9 +1210,9 @@ async function createOpenAIEphemeralToken(cfg, toolDefinitions = [], callContext
1132
1210
  turn_detection: {
1133
1211
  type: cfg.turnDetection,
1134
1212
  ...(cfg.turnDetection === "server_vad" ? {
1135
- threshold: 0.35,
1213
+ threshold: 0.7,
1136
1214
  prefix_padding_ms: 400,
1137
- silence_duration_ms: 700,
1215
+ silence_duration_ms: 1200,
1138
1216
  create_response: true,
1139
1217
  interrupt_response: true,
1140
1218
  } : {}),
@@ -1144,7 +1222,7 @@ async function createOpenAIEphemeralToken(cfg, toolDefinitions = [], callContext
1144
1222
  interrupt_response: true,
1145
1223
  } : {}),
1146
1224
  },
1147
- input_audio_transcription: { model: REALTIME_TRANSCRIBE_MODEL },
1225
+ ...(transcriptionEnabled ? { input_audio_transcription: { model: transcriptionModel } } : {}),
1148
1226
  tools: toolDefinitions,
1149
1227
  };
1150
1228
 
@@ -1197,11 +1275,17 @@ async function createAzureEphemeralToken(cfg, toolDefinitions = [], callContext
1197
1275
  candidate?.azureDeployment || candidate?.model || cfg.azureDeployment || OPENAI_REALTIME_MODEL,
1198
1276
  );
1199
1277
  const voiceId = String(candidate?.voiceId || cfg.voiceId || "alloy").trim() || "alloy";
1278
+ // Per-endpoint transcription overrides
1279
+ const transcriptionModel = String(candidate?.transcriptionModel || "").trim() || cfg.transcriptionModel;
1280
+ const transcriptionEnabled = candidate?.transcriptionEnabled !== undefined ? candidate.transcriptionEnabled !== false : cfg.transcriptionEnabled;
1200
1281
  // GA protocol (gpt-realtime-1.5, gpt-realtime, etc.) uses /openai/v1/realtime/sessions?api-version=...
1201
1282
  // Preview protocol uses /openai/realtimeapi/sessions?api-version=...
1202
- const url = isAzureGaProtocol(deployment)
1203
- ? `${resolvedEndpoint}/openai/v1/realtime/sessions?api-version=${AZURE_API_VERSION}`
1204
- : `${resolvedEndpoint}/openai/realtimeapi/sessions?api-version=${AZURE_API_VERSION}&deployment=${encodeURIComponent(deployment)}`;
1283
+ // Azure AI Foundry "Global Standard" resources may not support GA paths even for GA model names,
1284
+ // so we build both and try GA first with automatic fallback to preview.
1285
+ const gaUrl = `${resolvedEndpoint}/openai/v1/realtime/sessions?api-version=${AZURE_API_VERSION}`;
1286
+ const previewUrl = `${resolvedEndpoint}/openai/realtimeapi/sessions?api-version=${AZURE_API_VERSION}&deployment=${encodeURIComponent(deployment)}`;
1287
+ const useGa = isAzureGaProtocol(deployment);
1288
+ const url = useGa ? gaUrl : previewUrl;
1205
1289
 
1206
1290
  const headers = {
1207
1291
  "Content-Type": "application/json",
@@ -1226,9 +1310,9 @@ async function createAzureEphemeralToken(cfg, toolDefinitions = [], callContext
1226
1310
  turn_detection: {
1227
1311
  type: cfg.turnDetection,
1228
1312
  ...(cfg.turnDetection === "server_vad" ? {
1229
- threshold: 0.35,
1313
+ threshold: 0.7,
1230
1314
  prefix_padding_ms: 400,
1231
- silence_duration_ms: 700,
1315
+ silence_duration_ms: 1200,
1232
1316
  create_response: true,
1233
1317
  interrupt_response: true,
1234
1318
  } : {}),
@@ -1238,16 +1322,28 @@ async function createAzureEphemeralToken(cfg, toolDefinitions = [], callContext
1238
1322
  interrupt_response: true,
1239
1323
  } : {}),
1240
1324
  },
1241
- input_audio_transcription: { model: REALTIME_TRANSCRIBE_MODEL },
1325
+ ...(transcriptionEnabled ? { input_audio_transcription: { model: transcriptionModel } } : {}),
1242
1326
  tools: toolDefinitions,
1243
1327
  };
1244
1328
 
1245
- const response = await fetch(url, {
1329
+ let response = await fetch(url, {
1246
1330
  method: "POST",
1247
1331
  headers,
1248
1332
  body: JSON.stringify(sessionConfig),
1249
1333
  });
1250
1334
 
1335
+ // Azure AI Foundry "Global Standard" deployments may 404 on the GA path.
1336
+ // Automatically fall back to the preview path before giving up.
1337
+ if (!response.ok && response.status === 404 && useGa) {
1338
+ const previewConfig = { ...sessionConfig };
1339
+ delete previewConfig.type; // preview path does not accept type: "realtime"
1340
+ response = await fetch(previewUrl, {
1341
+ method: "POST",
1342
+ headers,
1343
+ body: JSON.stringify(previewConfig),
1344
+ });
1345
+ }
1346
+
1251
1347
  if (!response.ok) {
1252
1348
  const errorText = await buildProviderErrorDetails(response, "unknown");
1253
1349
  throw new Error(`Azure Realtime session failed (${response.status}): ${errorText}`);
@@ -1255,9 +1351,22 @@ async function createAzureEphemeralToken(cfg, toolDefinitions = [], callContext
1255
1351
 
1256
1352
  const data = await response.json();
1257
1353
  // WebRTC URL diverges from /sessions URL: GA uses /openai/v1/realtime, preview uses /openai/realtime.
1258
- const webrtcUrl = isAzureGaProtocol(deployment)
1354
+ // If the GA session was created via fallback to preview, use preview WebRTC URL too.
1355
+ const gaSessionSucceeded = useGa && response.url?.includes("/v1/realtime");
1356
+ const webrtcUrl = (useGa && gaSessionSucceeded)
1259
1357
  ? `${resolvedEndpoint}/openai/v1/realtime?api-version=${AZURE_API_VERSION}`
1260
1358
  : `${resolvedEndpoint}/openai/realtime?api-version=${AZURE_API_VERSION}&deployment=${encodeURIComponent(deployment)}`;
1359
+
1360
+ // WebSocket fallback URL — Azure Realtime API always supports WebSocket even
1361
+ // when WebRTC SDP is unavailable (404). The api-key query parameter provides
1362
+ // authentication since browsers cannot set custom headers on WebSocket.
1363
+ const wsAuthParam = resolvedOAuthToken
1364
+ ? `access_token=${encodeURIComponent(resolvedOAuthToken)}`
1365
+ : `api-key=${encodeURIComponent(resolvedApiKey)}`;
1366
+ const wsUrl = (useGa && gaSessionSucceeded)
1367
+ ? `wss://${new URL(resolvedEndpoint).host}/openai/v1/realtime?api-version=${AZURE_API_VERSION}&${wsAuthParam}`
1368
+ : `wss://${new URL(resolvedEndpoint).host}/openai/realtime?api-version=${AZURE_API_VERSION}&deployment=${encodeURIComponent(deployment)}&${wsAuthParam}`;
1369
+
1261
1370
  return {
1262
1371
  token: data.client_secret?.value || data.token,
1263
1372
  expiresAt: data.client_secret?.expires_at || (Date.now() / 1000 + 60),
@@ -1265,6 +1374,7 @@ async function createAzureEphemeralToken(cfg, toolDefinitions = [], callContext
1265
1374
  voiceId,
1266
1375
  provider: "azure",
1267
1376
  url: webrtcUrl,
1377
+ wsUrl,
1268
1378
  sessionConfig,
1269
1379
  azureEndpoint: resolvedEndpoint,
1270
1380
  azureDeployment: deployment,
@@ -1420,9 +1530,16 @@ const VOICE_SESSION_ALLOWED_TOOLS = new Set([
1420
1530
  "list_directory",
1421
1531
  "get_recent_logs",
1422
1532
  "list_workflows",
1533
+ "create_workflow",
1534
+ "update_workflow_definition",
1535
+ "delete_workflow",
1536
+ "create_workflow_from_template",
1537
+ "generate_workflow_with_agent",
1423
1538
  "get_workflow_definition",
1539
+ "execute_workflow",
1424
1540
  "list_workflow_runs",
1425
1541
  "get_workflow_run",
1542
+ "analyze_workflow",
1426
1543
  "retry_workflow_run",
1427
1544
  "list_skills",
1428
1545
  "list_prompts",