@oscharko-dev/keiko-server 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/chat-handlers.d.ts +18 -2
- package/dist/chat-handlers.d.ts.map +1 -1
- package/dist/chat-handlers.js +185 -3
- package/dist/command-runner-errors.d.ts +17 -0
- package/dist/command-runner-errors.d.ts.map +1 -0
- package/dist/command-runner-errors.js +37 -0
- package/dist/command-runner-evidence.d.ts +23 -0
- package/dist/command-runner-evidence.d.ts.map +1 -0
- package/dist/command-runner-evidence.js +69 -0
- package/dist/command-runner-routes.d.ts +7 -0
- package/dist/command-runner-routes.d.ts.map +1 -0
- package/dist/command-runner-routes.js +175 -0
- package/dist/command-runner.d.ts +29 -0
- package/dist/command-runner.d.ts.map +1 -0
- package/dist/command-runner.js +348 -0
- package/dist/conversation-prompt.d.ts +2 -2
- package/dist/conversation-prompt.d.ts.map +1 -1
- package/dist/conversation-prompt.js +17 -1
- package/dist/csp.d.ts.map +1 -1
- package/dist/csp.js +3 -0
- package/dist/deps.d.ts +27 -1
- package/dist/deps.d.ts.map +1 -1
- package/dist/deps.js +288 -13
- package/dist/discussion-prompt.d.ts +4 -0
- package/dist/discussion-prompt.d.ts.map +1 -0
- package/dist/discussion-prompt.js +19 -0
- package/dist/editor/agentActionAudit.d.ts +18 -0
- package/dist/editor/agentActionAudit.d.ts.map +1 -0
- package/dist/editor/agentActionAudit.js +80 -0
- package/dist/editor/agentRoutes.d.ts +1 -0
- package/dist/editor/agentRoutes.d.ts.map +1 -1
- package/dist/editor/agentRoutes.js +292 -55
- package/dist/editor/agentSessionRegistry.d.ts +35 -0
- package/dist/editor/agentSessionRegistry.d.ts.map +1 -0
- package/dist/editor/agentSessionRegistry.js +243 -0
- package/dist/editor/completionRoutes.d.ts.map +1 -1
- package/dist/editor/completionRoutes.js +5 -10
- package/dist/editor/languageRoutes.d.ts +12 -1
- package/dist/editor/languageRoutes.d.ts.map +1 -1
- package/dist/editor/languageRoutes.js +71 -8
- package/dist/editor/languageService.d.ts +3 -2
- package/dist/editor/languageService.d.ts.map +1 -1
- package/dist/editor/languageService.js +41 -3
- package/dist/editor/languageServiceHost.d.ts.map +1 -1
- package/dist/editor/languageServiceHost.js +2 -2
- package/dist/editor/lsp/hostLanguageOperation.d.ts +17 -0
- package/dist/editor/lsp/hostLanguageOperation.d.ts.map +1 -0
- package/dist/editor/lsp/hostLanguageOperation.js +436 -0
- package/dist/editor/lsp/hostLanguageProviders.d.ts +26 -0
- package/dist/editor/lsp/hostLanguageProviders.d.ts.map +1 -0
- package/dist/editor/lsp/hostLanguageProviders.js +161 -0
- package/dist/editor/lsp/lspFrameCodec.d.ts +13 -0
- package/dist/editor/lsp/lspFrameCodec.d.ts.map +1 -0
- package/dist/editor/lsp/lspFrameCodec.js +164 -0
- package/dist/editor/lsp/lspJsonRpcClient.d.ts +34 -0
- package/dist/editor/lsp/lspJsonRpcClient.d.ts.map +1 -0
- package/dist/editor/lsp/lspJsonRpcClient.js +173 -0
- package/dist/editor/lsp/lspLanguageProvider.d.ts +7 -0
- package/dist/editor/lsp/lspLanguageProvider.d.ts.map +1 -0
- package/dist/editor/lsp/lspLanguageProvider.js +29 -0
- package/dist/editor/lsp/lspLifecycleLedger.d.ts +5 -0
- package/dist/editor/lsp/lspLifecycleLedger.d.ts.map +1 -0
- package/dist/editor/lsp/lspLifecycleLedger.js +37 -0
- package/dist/editor/lsp/lspNodeAdapter.d.ts +31 -0
- package/dist/editor/lsp/lspNodeAdapter.d.ts.map +1 -0
- package/dist/editor/lsp/lspNodeAdapter.js +230 -0
- package/dist/editor/lsp/lspProcessManager.d.ts +24 -0
- package/dist/editor/lsp/lspProcessManager.d.ts.map +1 -0
- package/dist/editor/lsp/lspProcessManager.js +255 -0
- package/dist/editor/lsp/lspRestartThrottle.d.ts +6 -0
- package/dist/editor/lsp/lspRestartThrottle.d.ts.map +1 -0
- package/dist/editor/lsp/lspRestartThrottle.js +24 -0
- package/dist/editor/lsp/lspStatusRoute.d.ts +8 -0
- package/dist/editor/lsp/lspStatusRoute.d.ts.map +1 -0
- package/dist/editor/lsp/lspStatusRoute.js +22 -0
- package/dist/editor/lsp/lspTransport.d.ts +19 -0
- package/dist/editor/lsp/lspTransport.d.ts.map +1 -0
- package/dist/editor/lsp/lspTransport.js +55 -0
- package/dist/editor/lsp/testing/fakeLspProcess.d.ts +23 -0
- package/dist/editor/lsp/testing/fakeLspProcess.d.ts.map +1 -0
- package/dist/editor/lsp/testing/fakeLspProcess.js +132 -0
- package/dist/files.d.ts +45 -0
- package/dist/files.d.ts.map +1 -1
- package/dist/files.js +631 -7
- package/dist/gateway-readiness.js +3 -3
- package/dist/gateway-setup.d.ts +2 -0
- package/dist/gateway-setup.d.ts.map +1 -1
- package/dist/gateway-setup.js +275 -11
- package/dist/gitDelivery/actionSheetProjection.d.ts +30 -0
- package/dist/gitDelivery/actionSheetProjection.d.ts.map +1 -0
- package/dist/gitDelivery/actionSheetProjection.js +206 -0
- package/dist/gitDelivery/actionSheetRoutes.d.ts +29 -0
- package/dist/gitDelivery/actionSheetRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/actionSheetRoutes.js +293 -0
- package/dist/gitDelivery/agentOperationsRoutes.d.ts +33 -0
- package/dist/gitDelivery/agentOperationsRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/agentOperationsRoutes.js +405 -0
- package/dist/gitDelivery/commitRoutes.d.ts +23 -0
- package/dist/gitDelivery/commitRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/commitRoutes.js +204 -0
- package/dist/gitDelivery/evidenceRoutes.d.ts +9 -0
- package/dist/gitDelivery/evidenceRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/evidenceRoutes.js +101 -0
- package/dist/gitDelivery/execution.d.ts +38 -0
- package/dist/gitDelivery/execution.d.ts.map +1 -0
- package/dist/gitDelivery/execution.js +117 -0
- package/dist/gitDelivery/localMutationRoutes.d.ts +30 -0
- package/dist/gitDelivery/localMutationRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/localMutationRoutes.js +165 -0
- package/dist/gitDelivery/mergeExecution.d.ts +63 -0
- package/dist/gitDelivery/mergeExecution.d.ts.map +1 -0
- package/dist/gitDelivery/mergeExecution.js +168 -0
- package/dist/gitDelivery/mergeRoutes.d.ts +12 -0
- package/dist/gitDelivery/mergeRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/mergeRoutes.js +218 -0
- package/dist/gitDelivery/mutationEvidenceLedger.d.ts +23 -0
- package/dist/gitDelivery/mutationEvidenceLedger.d.ts.map +1 -0
- package/dist/gitDelivery/mutationEvidenceLedger.js +87 -0
- package/dist/gitDelivery/prExecution.d.ts +54 -0
- package/dist/gitDelivery/prExecution.d.ts.map +1 -0
- package/dist/gitDelivery/prExecution.js +192 -0
- package/dist/gitDelivery/prRoutes.d.ts +12 -0
- package/dist/gitDelivery/prRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/prRoutes.js +256 -0
- package/dist/gitDelivery/pushExecution.d.ts +43 -0
- package/dist/gitDelivery/pushExecution.d.ts.map +1 -0
- package/dist/gitDelivery/pushExecution.js +124 -0
- package/dist/gitDelivery/pushRoutes.d.ts +12 -0
- package/dist/gitDelivery/pushRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/pushRoutes.js +200 -0
- package/dist/gitDelivery/requestGuards.d.ts +15 -0
- package/dist/gitDelivery/requestGuards.d.ts.map +1 -0
- package/dist/gitDelivery/requestGuards.js +97 -0
- package/dist/gitDelivery/syncEvidence.d.ts +37 -0
- package/dist/gitDelivery/syncEvidence.d.ts.map +1 -0
- package/dist/gitDelivery/syncEvidence.js +85 -0
- package/dist/gitDelivery/syncExecution.d.ts +30 -0
- package/dist/gitDelivery/syncExecution.d.ts.map +1 -0
- package/dist/gitDelivery/syncExecution.js +266 -0
- package/dist/gitDelivery/syncRoutes.d.ts +13 -0
- package/dist/gitDelivery/syncRoutes.d.ts.map +1 -0
- package/dist/gitDelivery/syncRoutes.js +200 -0
- package/dist/gitPorcelainStatus.d.ts +15 -0
- package/dist/gitPorcelainStatus.d.ts.map +1 -0
- package/dist/gitPorcelainStatus.js +104 -0
- package/dist/gitRepositoryReads.d.ts +10 -0
- package/dist/gitRepositoryReads.d.ts.map +1 -0
- package/dist/gitRepositoryReads.js +314 -0
- package/dist/gitRepositoryRoutes.d.ts +7 -0
- package/dist/gitRepositoryRoutes.d.ts.map +1 -0
- package/dist/gitRepositoryRoutes.js +221 -0
- package/dist/gitRoutes.d.ts +66 -0
- package/dist/gitRoutes.d.ts.map +1 -0
- package/dist/gitRoutes.js +543 -0
- package/dist/governed-workflow.d.ts +2 -0
- package/dist/governed-workflow.d.ts.map +1 -1
- package/dist/governed-workflow.js +4 -0
- package/dist/grounded-qa.d.ts +11 -0
- package/dist/grounded-qa.d.ts.map +1 -1
- package/dist/grounded-qa.js +13 -4
- package/dist/headers.d.ts +4 -1
- package/dist/headers.d.ts.map +1 -1
- package/dist/headers.js +11 -4
- package/dist/index.d.ts +8 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +9 -1
- package/dist/qualityIntelligence/figmaSnapshotRoutes.d.ts +1 -1
- package/dist/qualityIntelligence/figmaSnapshotRoutes.d.ts.map +1 -1
- package/dist/qualityIntelligence/figmaSnapshotRoutes.js +1 -1
- package/dist/read-handlers.d.ts +5 -0
- package/dist/read-handlers.d.ts.map +1 -1
- package/dist/read-handlers.js +57 -1
- package/dist/routes.d.ts.map +1 -1
- package/dist/routes.js +259 -6
- package/dist/run-engine.d.ts.map +1 -1
- package/dist/run-engine.js +3 -0
- package/dist/run-handlers.d.ts.map +1 -1
- package/dist/run-handlers.js +74 -4
- package/dist/run-request.d.ts +11 -0
- package/dist/run-request.d.ts.map +1 -1
- package/dist/run-request.js +158 -10
- package/dist/runtime/capabilityDetector.d.ts +38 -0
- package/dist/runtime/capabilityDetector.d.ts.map +1 -0
- package/dist/runtime/capabilityDetector.js +443 -0
- package/dist/runtime/capabilityRoutes.d.ts +9 -0
- package/dist/runtime/capabilityRoutes.d.ts.map +1 -0
- package/dist/runtime/capabilityRoutes.js +45 -0
- package/dist/runtime/containerEngineDetector.d.ts +17 -0
- package/dist/runtime/containerEngineDetector.d.ts.map +1 -0
- package/dist/runtime/containerEngineDetector.js +222 -0
- package/dist/runtime/containerRoutes.d.ts +8 -0
- package/dist/runtime/containerRoutes.d.ts.map +1 -0
- package/dist/runtime/containerRoutes.js +207 -0
- package/dist/runtime/containerRunner-errors.d.ts +18 -0
- package/dist/runtime/containerRunner-errors.d.ts.map +1 -0
- package/dist/runtime/containerRunner-errors.js +42 -0
- package/dist/runtime/containerRunner-evidence.d.ts +24 -0
- package/dist/runtime/containerRunner-evidence.d.ts.map +1 -0
- package/dist/runtime/containerRunner-evidence.js +74 -0
- package/dist/runtime/containerRunner.d.ts +37 -0
- package/dist/runtime/containerRunner.d.ts.map +1 -0
- package/dist/runtime/containerRunner.js +443 -0
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +24 -4
- package/dist/store/schema.d.ts +1 -1
- package/dist/store/schema.d.ts.map +1 -1
- package/dist/store/schema.js +62 -1
- package/dist/task-workspace/active-store.d.ts +21 -0
- package/dist/task-workspace/active-store.d.ts.map +1 -0
- package/dist/task-workspace/active-store.js +55 -0
- package/dist/task-workspace/authorization.d.ts +7 -0
- package/dist/task-workspace/authorization.d.ts.map +1 -0
- package/dist/task-workspace/authorization.js +54 -0
- package/dist/task-workspace/binding.d.ts +3 -0
- package/dist/task-workspace/binding.d.ts.map +1 -0
- package/dist/task-workspace/binding.js +22 -0
- package/dist/task-workspace/cleanup.d.ts +4 -0
- package/dist/task-workspace/cleanup.d.ts.map +1 -0
- package/dist/task-workspace/cleanup.js +428 -0
- package/dist/task-workspace/errors.d.ts +14 -0
- package/dist/task-workspace/errors.d.ts.map +1 -0
- package/dist/task-workspace/errors.js +81 -0
- package/dist/task-workspace/evidence.d.ts +32 -0
- package/dist/task-workspace/evidence.d.ts.map +1 -0
- package/dist/task-workspace/evidence.js +52 -0
- package/dist/task-workspace/field-safety.d.ts +3 -0
- package/dist/task-workspace/field-safety.d.ts.map +1 -0
- package/dist/task-workspace/field-safety.js +42 -0
- package/dist/task-workspace/health.d.ts +4 -0
- package/dist/task-workspace/health.d.ts.map +1 -0
- package/dist/task-workspace/health.js +163 -0
- package/dist/task-workspace/lifecycle.d.ts +3 -0
- package/dist/task-workspace/lifecycle.d.ts.map +1 -0
- package/dist/task-workspace/lifecycle.js +248 -0
- package/dist/task-workspace/locks.d.ts +13 -0
- package/dist/task-workspace/locks.d.ts.map +1 -0
- package/dist/task-workspace/locks.js +44 -0
- package/dist/task-workspace/managed-root.d.ts +7 -0
- package/dist/task-workspace/managed-root.d.ts.map +1 -0
- package/dist/task-workspace/managed-root.js +98 -0
- package/dist/task-workspace/mutex.d.ts +8 -0
- package/dist/task-workspace/mutex.d.ts.map +1 -0
- package/dist/task-workspace/mutex.js +82 -0
- package/dist/task-workspace/naming.d.ts +15 -0
- package/dist/task-workspace/naming.d.ts.map +1 -0
- package/dist/task-workspace/naming.js +0 -0
- package/dist/task-workspace/provisioning.d.ts +3 -0
- package/dist/task-workspace/provisioning.d.ts.map +1 -0
- package/dist/task-workspace/provisioning.js +528 -0
- package/dist/task-workspace/reconciliation.d.ts +15 -0
- package/dist/task-workspace/reconciliation.d.ts.map +1 -0
- package/dist/task-workspace/reconciliation.js +274 -0
- package/dist/task-workspace/repair.d.ts +3 -0
- package/dist/task-workspace/repair.d.ts.map +1 -0
- package/dist/task-workspace/repair.js +286 -0
- package/dist/task-workspace/routes.d.ts +19 -0
- package/dist/task-workspace/routes.d.ts.map +1 -0
- package/dist/task-workspace/routes.js +481 -0
- package/dist/task-workspace/store.d.ts +12 -0
- package/dist/task-workspace/store.d.ts.map +1 -0
- package/dist/task-workspace/store.js +128 -0
- package/dist/task-workspace/types.d.ts +170 -0
- package/dist/task-workspace/types.d.ts.map +1 -0
- package/dist/task-workspace/types.js +5 -0
- package/dist/voice-action-governance.d.ts +23 -0
- package/dist/voice-action-governance.d.ts.map +1 -0
- package/dist/voice-action-governance.js +126 -0
- package/dist/voice-handlers.d.ts +6 -0
- package/dist/voice-handlers.d.ts.map +1 -0
- package/dist/voice-handlers.js +570 -0
- package/dist/voice-realtime-grounded-tool.d.ts +31 -0
- package/dist/voice-realtime-grounded-tool.d.ts.map +1 -0
- package/dist/voice-realtime-grounded-tool.js +322 -0
- package/dist/voice-realtime.d.ts +69 -0
- package/dist/voice-realtime.d.ts.map +1 -0
- package/dist/voice-realtime.js +787 -0
- package/dist/workspace-state-handlers.d.ts +5 -0
- package/dist/workspace-state-handlers.d.ts.map +1 -0
- package/dist/workspace-state-handlers.js +106 -0
- package/package.json +20 -19
|
@@ -0,0 +1,787 @@
|
|
|
1
|
+
// BFF realtime voice control plane (Issue #497, Epic #491, ADR-0058 D3/D6, ADR-0059). Re-opens the
|
|
2
|
+
// BFF WebSocket upgrade — deliberately hard-rejected for every other path (server.ts) — for the single
|
|
3
|
+
// loopback control path `/api/voice/control`, and ONLY when the resolved voice capability is the
|
|
4
|
+
// full-realtime profile and policy permits it (AC1). The WebSocket carries the #496 control/signaling
|
|
5
|
+
// protocol; raw audio never rides it — real-time media flows browser↔provider over native WebRTC
|
|
6
|
+
// (DTLS-SRTP), negotiated by the preferred proxied-SDP mode so the long-lived provider credential
|
|
7
|
+
// never reaches the browser (AC2).
|
|
8
|
+
//
|
|
9
|
+
// Security posture (ADR-0058 D6): the upgrade reuses the same loopback `isAllowedHost` Host/Origin
|
|
10
|
+
// check as the HTTP path (host-check.ts) — a WebSocket handshake cannot carry the JSON+CSRF guard, so
|
|
11
|
+
// the loopback-origin check (which rejects opaque `Origin: null` and any non-loopback origin) plus the
|
|
12
|
+
// capability gate are the load-bearing cross-origin defenses. SDP/ICE payloads are opaque,
|
|
13
|
+
// `secret-bearing` strings: they are forwarded verbatim through the Model Gateway egress seam and are
|
|
14
|
+
// never logged or persisted (privacy-contract §2/§4). Transcript text is `reviewable-text`: it is run
|
|
15
|
+
// through `stripUnsafeFormatChars` and the BFF redactor before it may enter the bounded replay buffer.
|
|
16
|
+
// Raw audio is never a control message (a binary frame is rejected) and is never persisted (AC1/AC6).
|
|
17
|
+
import { WebSocketServer } from "ws";
|
|
18
|
+
import { findConfiguredCapability, requestRealtimeNegotiation, resolveRealtimeVoice, resolveVoiceCapability, selectRealtimeVoiceModel, } from "@oscharko-dev/keiko-model-gateway";
|
|
19
|
+
import { CONVERSATION_SYSTEM_PROMPT } from "./conversation-prompt.js";
|
|
20
|
+
import { DEFAULT_VOICE_PROTOCOL_TIMEOUTS, isVoiceReplayEligible, stripUnsafeFormatChars, validateVoiceControlMessage, VOICE_PERSONAS, VOICE_PROFILE_NEGOTIATION_MODE, VOICE_PROTOCOL_VERSION, voiceMessageAllowedForProfile, } from "@oscharko-dev/keiko-contracts";
|
|
21
|
+
import { retrieveMemoryContext } from "@oscharko-dev/keiko-memory-retrieval";
|
|
22
|
+
import { isAllowedHost } from "./host-check.js";
|
|
23
|
+
import { currentGatewayConfig, currentGatewayEgressConfig } from "./deps.js";
|
|
24
|
+
import { isVoiceDisabledByPolicy, isVoiceRealtimeCapable } from "./read-handlers.js";
|
|
25
|
+
import { conversationMemoryScopes, resolveConversationMemoryContext, } from "./memory-conversation-context.js";
|
|
26
|
+
import { vaultAsQueryPort } from "./memory-conv-handlers.js";
|
|
27
|
+
// The one loopback path for which the BFF WebSocket upgrade is re-opened. Any other upgrade request
// keeps the hard 404 + socket.destroy() default (server.ts).
export const VOICE_CONTROL_PATH = "/api/voice/control";

// Size cap for an SDP offer. A legitimate offer is small (a single audio m-line plus ICE/DTLS
// metadata), so anything larger is rejected before a provider call is made; a hostile client thus
// cannot push an unbounded body through the egress seam.
const MAX_OFFER_SDP_BYTES = 256_000;

// Length cap applied to reviewable transcript text before strip/redact, so a hostile client cannot
// grow the replay buffer without limit.
const MAX_TRANSCRIPT_CHARS = 8_000;

// Capacity of the bounded per-session replay-diagnostic record (AC6): the `replayable` host→client
// events re-delivered to a reconnecting client. Past the cap the oldest entries are evicted.
const MAX_REPLAY_EVENTS = 200;

// How long a disconnected session stays resumable (by idempotency key) before it is swept: 60 s.
const SESSION_RESUME_TTL_MS = 60 * 1_000;

// Upper bound on tracked sessions on the loopback control plane (single local user).
const MAX_ACTIVE_SESSIONS = 64;

// Number of stored chat messages requested when building context for a realtime session.
const MAX_REALTIME_CONTEXT_MESSAGES = 12;

// Character cap applied to any context text folded into the realtime session instructions.
const MAX_REALTIME_CONTEXT_CHARS = 12_000;

// Length bound for a client-chosen identifier. Together with the charset check it is applied before
// such an id may be tracked or logged, so a hostile id cannot inject into a log/audit line
// (content-free redaction class, protocol §8).
const MAX_ID_LENGTH = 200;
|
|
48
|
+
// Printable, non-whitespace ASCII (0x21–0x7e) with the three quote characters carved out:
// `"` (0x22), `'` (0x27) and the backtick (0x60). Control characters and whitespace already fall
// outside the range; the quotes are excluded explicitly so a client-chosen id cannot terminate a
// quoted field when it is written to a log/audit line. (The previous pattern, [\x21-\x7e], accepted
// quotes even though it was documented as rejecting them.)
const SAFE_IDENTIFIER = /^[\x21\x23-\x26\x28-\x5f\x61-\x7e]+$/;
|
|
50
|
+
// Reports whether `value` is a non-null object that is not an array.
function isRecord(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
53
|
+
// Reports whether `value` is a non-empty string of at most MAX_ID_LENGTH characters that matches
// the SAFE_IDENTIFIER pattern.
function isSafeIdentifier(value) {
    if (typeof value !== "string") {
        return false;
    }
    const lengthInBounds = value.length > 0 && value.length <= MAX_ID_LENGTH;
    return lengthInBounds && SAFE_IDENTIFIER.test(value);
}
|
|
59
|
+
// Reports whether `value` is a string listed in the contract's VOICE_PERSONAS.
function isVoicePersona(value) {
    if (typeof value !== "string") {
        return false;
    }
    return VOICE_PERSONAS.includes(value);
}
|
|
62
|
+
// Validates the shape of a voice-session chat context: a record with a safe `chatId`, plus
// optional `memory` and `grounding` members that, when present, must each pass their own validator.
function isVoiceSessionChatContext(value) {
    const hasSafeChatId = isRecord(value) && isSafeIdentifier(value.chatId);
    if (!hasSafeChatId) {
        return false;
    }
    const memoryIsValid = value.memory === undefined || isVoiceSessionMemoryContext(value.memory);
    if (!memoryIsValid) {
        return false;
    }
    if (value.grounding === undefined) {
        return true;
    }
    return isVoiceSessionGroundingContext(value.grounding);
}
|
|
71
|
+
// Validates the `memory` member of a chat context: a record with a boolean `enabled` and an
// optional `budgetTokens` that, when present, must be a non-negative integer.
function isVoiceSessionMemoryContext(value) {
    if (!isRecord(value)) {
        return false;
    }
    if (typeof value.enabled !== "boolean") {
        return false;
    }
    const { budgetTokens } = value;
    if (budgetTokens === undefined) {
        return true;
    }
    return typeof budgetTokens === "number" && Number.isInteger(budgetTokens) && budgetTokens >= 0;
}
|
|
79
|
+
// Validates the `grounding` member of a chat context: a record with a boolean `enabled`, a
// non-negative integer `sourceCount`, and a `kind` drawn from the four accepted literals.
function isVoiceSessionGroundingContext(value) {
    if (!isRecord(value) || typeof value.enabled !== "boolean") {
        return false;
    }
    const { sourceCount } = value;
    const countIsValid = typeof sourceCount === "number" && Number.isInteger(sourceCount) && sourceCount >= 0;
    if (!countIsValid) {
        return false;
    }
    switch (value.kind) {
        case "files":
        case "knowledge":
        case "hybrid":
        case "multi":
            return true;
        default:
            return false;
    }
}
|
|
92
|
+
// Finds the configured provider entry whose model id is the one selected for realtime voice.
// Yields undefined when no realtime model is selected or no provider carries that model id.
// Mirrors voice-handlers.ts resolveSttProvider.
function resolveRealtimeProvider(config) {
    const realtimeModelId = selectRealtimeVoiceModel(config);
    return realtimeModelId === undefined
        ? undefined
        : config.providers.find((candidate) => candidate.modelId === realtimeModelId);
}
|
|
101
|
+
// Voice-delivery addendum appended to CONVERSATION_SYSTEM_PROMPT for a realtime session. Spoken
// Keiko therefore runs on the SAME system persona as text chat and the two cannot diverge. Setting
// the instructions explicitly also replaces the provider's default demo persona ("a helpful, witty,
// and friendly AI … Talk quickly … knowledge cutoff 2023-10"); left in place, that default would
// make realtime voice a separate, ungrounded assistant — a direct violation of the dialogue-mode
// invariant.
const REALTIME_SPOKEN_ADDENDUM = [
    " You are speaking with the user by voice. Keep replies short, natural, and conversational. ",
    "Do not read code, file paths, or long identifiers aloud verbatim; summarize them in words.",
].join("");

// Extra instructions added when grounding is enabled for the session: they steer the model to call
// the search_keiko_grounding tool before answering substantive questions about connected sources.
const REALTIME_GROUNDED_VOICE_ADDENDUM = [
    " This voice session is connected to Keiko grounding sources. For any substantive question about ",
    "the connected repository, files, documents, knowledge capsules, or project context, call the ",
    "search_keiko_grounding tool before giving the final answer. You may briefly say that you are ",
    "checking, but do not state factual conclusions until the tool result is available. After the tool ",
    "result arrives, speak the answer faithfully and do not add unsupported facts.",
].join("");
|
|
113
|
+
// Function-tool definition offered to the realtime model when grounding is enabled. It declares a
// single required string argument, `query`, and forbids any additional properties.
const REALTIME_GROUNDING_TOOL = {
    type: "function",
    name: "search_keiko_grounding",
    description: "Search Keiko's connected repository, file, document, and knowledge sources for the current chat and return a citation-backed answer.",
    parameters: {
        type: "object",
        additionalProperties: false,
        properties: {
            // The only argument: the question to run against the grounding sources.
            query: {
                type: "string",
                description: "The user's grounded question, rewritten only enough to preserve the intended meaning.",
            },
        },
        required: ["query"],
    },
};
|
|
129
|
+
// Transcription model requested for the realtime session. Enabling input-audio transcription lets
// the chat transcript capture what the user said by voice (the dialogue-mode grounding/evidence
// path).
const DEFAULT_REALTIME_TRANSCRIPTION_MODEL = "whisper-1";

// Ceiling for the realtime SDP negotiation timeout: 8 s. Negotiation is interactive — a user is
// waiting with the microphone open — and the generic provider request timeout (commonly 30s) is far
// too long for that: a hung provider would freeze the session on "negotiating" for half a minute
// before an error surfaced. Clamping to this short bound makes a stalled handshake fail fast so the
// composer can degrade to text.
const REALTIME_NEGOTIATION_TIMEOUT_MS = 8 * 1_000;
|
|
137
|
+
// Strips unsafe format characters from `text`, trims surrounding whitespace, and caps the result at
// MAX_REALTIME_CONTEXT_CHARS. When the text is too long the TAIL is kept (the leading part is cut).
function trimContextText(text) {
    const sanitized = stripUnsafeFormatChars(text).trim();
    if (sanitized.length > MAX_REALTIME_CONTEXT_CHARS) {
        return sanitized.slice(-MAX_REALTIME_CONTEXT_CHARS);
    }
    return sanitized;
}
|
|
143
|
+
// Renders the user/assistant messages among those returned by
// `deps.store.listMessages(chatId, MAX_REALTIME_CONTEXT_MESSAGES)` as "User: …" / "Assistant: …"
// lines joined by newlines, then passes the text through trimContextText. Messages with any other
// role are left out.
function recentChatContext(deps, chatId) {
    const transcriptLines = [];
    for (const message of deps.store.listMessages(chatId, MAX_REALTIME_CONTEXT_MESSAGES)) {
        if (message.role === "user") {
            transcriptLines.push(`User: ${message.content}`);
        }
        else if (message.role === "assistant") {
            transcriptLines.push(`Assistant: ${message.content}`);
        }
    }
    return trimContextText(transcriptLines.join("\n"));
}
|
|
152
|
+
// Reports whether `value` is a record carrying a numeric `status` property.
function isRouteLikeResult(value) {
    if (!isRecord(value)) {
        return false;
    }
    return typeof value.status === "number";
}
|
|
155
|
+
// Decides whether memory context should be gathered for the session. All three must hold: the chat
// context has `memory.enabled === true`, `deps.memoryVault` is defined, and `chatId` is non-empty.
function shouldIncludeRealtimeMemory(deps, chatContext) {
    if (chatContext?.memory?.enabled !== true) {
        return false;
    }
    if (deps.memoryVault === undefined) {
        return false;
    }
    return chatContext.chatId.length > 0;
}
|
|
160
|
+
// Picks the text used as the memory query: the content of the last user message among those
// returned by `deps.store.listMessages(chatId, MAX_REALTIME_CONTEXT_MESSAGES)`. `fallback` is
// returned when there is no user message or its content is null/undefined.
function latestRealtimeMemoryQuery(deps, chatId, fallback) {
    const userMessages = deps.store
        .listMessages(chatId, MAX_REALTIME_CONTEXT_MESSAGES)
        .filter((message) => message.role === "user");
    const newestUserMessage = userMessages.at(-1);
    return newestUserMessage?.content ?? fallback;
}
|
|
166
|
+
// Builds the memory context text for a realtime session, or "" when none applies.
// The empty string is returned when memory is not enabled/available, the chat cannot be found, the
// conversation memory context resolves to a route-like (status-bearing) result, or retrieval
// throws. Otherwise the retrieved context block text is returned after trimContextText.
function realtimeMemoryContext(deps, chatContext) {
    if (!shouldIncludeRealtimeMemory(deps, chatContext)) {
        return "";
    }
    const vault = deps.memoryVault;
    if (vault === undefined) {
        return "";
    }
    const chat = deps.store.findChatById(chatContext.chatId);
    if (chat === undefined) {
        return "";
    }
    const runtime = resolveConversationMemoryContext(deps, chat.projectPath, chat.id);
    if (isRouteLikeResult(runtime)) {
        return "";
    }
    // Query with the latest user message; the chat title is the fallback.
    const queryText = latestRealtimeMemoryQuery(deps, chat.id, chat.title);
    try {
        const request = { scopes: conversationMemoryScopes(runtime), queryText };
        const { budgetTokens } = chatContext.memory;
        if (budgetTokens !== undefined) {
            // Only forward a budget the client actually supplied.
            request.budgetTokens = budgetTokens;
        }
        request.nowMs = Date.now();
        const { contextBlock } = retrieveMemoryContext(request, vaultAsQueryPort(vault));
        return trimContextText(contextBlock.text);
    }
    catch {
        // Best-effort: a retrieval failure degrades to "no memory context" rather than failing
        // the session.
        return "";
    }
}
|
|
198
|
+
// Assembles the instruction text for a realtime session as blank-line-separated sections:
//   1. the conversation system prompt with the spoken-delivery addendum (always);
//   2. the grounded-voice addendum, when the chat context has grounding enabled;
//   3. the recent chat transcript, when a chat context is given and the transcript is non-empty;
//   4. the memory context, when a chat context is given and the memory text is non-empty.
function realtimeInstructions(deps, chatContext) {
    const sections = [`${CONVERSATION_SYSTEM_PROMPT}${REALTIME_SPOKEN_ADDENDUM}`];
    if (chatContext?.grounding?.enabled === true) {
        sections.push(REALTIME_GROUNDED_VOICE_ADDENDUM);
    }
    const contextSections = chatContext === undefined
        ? []
        : [
            ["Current Keiko chat context:", recentChatContext(deps, chatContext.chatId)],
            [
                "MemoriaViva context available for this voice session:",
                realtimeMemoryContext(deps, chatContext),
            ],
        ];
    for (const [heading, body] of contextSections) {
        if (body.length > 0) {
            sections.push(`${heading}\n${body}`);
        }
    }
    return sections.join("\n\n");
}
|
|
215
|
+
// Reports whether the chat context has `grounding.enabled` set to exactly `true`. A missing
// context or missing grounding member yields false.
function realtimeGroundingEnabled(chatContext) {
    const enabledFlag = chatContext?.grounding?.enabled;
    return enabledFlag === true;
}
|
|
218
|
+
// Decides whether tools may be offered to the provider's realtime model. With no configured
// capability entry for the model the answer is `true`; otherwise it is the entry's `toolCalling`
// value, falling back to whether `supportsRealtimeVoice` is exactly `true`.
function realtimeProviderSupportsTools(config, provider) {
    const capability = findConfiguredCapability(config, provider.modelId);
    if (capability !== undefined) {
        return capability.toolCalling || capability.supportsRealtimeVoice === true;
    }
    return true;
}
|
|
225
|
+
// Chooses the realtime session voice from the provider's persona→voice mapping. Precedence: the
// voice mapped to the client's selected persona, then the voice mapped to the "neutral" persona,
// then the first configured profile's voice. Whatever is chosen (possibly undefined) is passed
// through `resolveRealtimeVoice`, so the result is always realtime-valid: without an explicit voice
// the session would use the provider default, and with a TTS-only id (e.g. the operator-config
// `nova`) the realtime model rejects the session.
function resolveRealtimeVoiceId(provider, persona) {
    const profiles = provider.voiceProfiles ?? [];
    const voiceFor = (wanted) => profiles.find((profile) => profile.persona === wanted)?.voiceId;
    const selected = persona === undefined ? undefined : voiceFor(persona);
    const candidate = selected ?? voiceFor("neutral") ?? profiles[0]?.voiceId;
    return resolveRealtimeVoice(candidate);
}
|
|
238
|
+
// Builds the request object for a realtime SDP negotiation against `provider`.
//
// Parameters:
//   provider         - configured provider entry; baseUrl, apiKey, modelId, timeoutMs and the
//                      optional apiKeyHeaderName / realtimeAuthMode / egress / voiceProfiles are read.
//   offerSdp         - the client's SDP offer, forwarded as-is.
//   persona          - selected voice persona (may be undefined); drives the voice id choice.
//   instructions     - session instruction text, forwarded as-is.
//   groundingEnabled - whether grounding is on for this session.
//   toolsSupported   - whether the provider's model may be offered tools.
//   deps             - server dependencies; only consulted for the default egress config when the
//                      provider carries none of its own.
//   signal           - abort signal, forwarded as-is.
//
// Returns a plain object. Optional members (apiKeyHeaderName, realtimeAuthMode, tools/toolChoice,
// egress) are present only when they apply, never as explicit `undefined`.
function buildNegotiationRequest(provider, offerSdp, persona, instructions, groundingEnabled, toolsSupported, deps, signal) {
    const egress = provider.egress ?? currentGatewayEgressConfig(deps);
    // Clamp the provider timeout to the interactive bound. Math.min returns NaN when the configured
    // value is missing or non-numeric, which would silently drop the clamp; fall back to the bound
    // in that case so negotiation always has a finite, short timeout.
    const clampedTimeoutMs = Math.min(provider.timeoutMs, REALTIME_NEGOTIATION_TIMEOUT_MS);
    const timeoutMs = Number.isNaN(clampedTimeoutMs)
        ? REALTIME_NEGOTIATION_TIMEOUT_MS
        : clampedTimeoutMs;
    return {
        endpoint: provider.baseUrl,
        apiKey: provider.apiKey,
        ...(provider.apiKeyHeaderName !== undefined
            ? { apiKeyHeaderName: provider.apiKeyHeaderName }
            : {}),
        ...(provider.realtimeAuthMode !== undefined
            ? { realtimeAuthMode: provider.realtimeAuthMode }
            : {}),
        modelId: provider.modelId,
        instructions,
        voiceId: resolveRealtimeVoiceId(provider, persona),
        transcriptionModel: DEFAULT_REALTIME_TRANSCRIPTION_MODEL,
        // The grounding tool is offered only when grounding is on AND the model may take tools.
        ...(groundingEnabled && toolsSupported
            ? { tools: [REALTIME_GROUNDING_TOOL], toolChoice: "auto" }
            : {}),
        offerSdp,
        signal,
        timeoutMs,
        ...(egress !== undefined ? { egress } : {}),
    };
}
|
|
262
|
+
// The protocol state machine for one attached control socket. Pure of WebSocket/IO concerns beyond
// the injected `socket.send` — every inbound frame is validated against the #496 contract, gated by
// the session profile, and answered per the protocol; every outbound frame is sequenced, redacted,
// and (when replay-eligible) buffered. Negotiation is delegated to the injected `negotiate` seam.
export class VoiceControlConnection {
    // Injected transport: `send(data)` for outbound frames and `close(code, reason)` for shutdown.
    socket;
    // Session record shared with the plane: sequence counters, replay buffer, profile, persona, chat context.
    session;
    // Injected negotiation seam, called as negotiate(offerSdp, persona, chatContext, signal).
    negotiate;
    // Injected redactor applied to every outbound message right before it is serialised.
    redact;
    // AbortController of the negotiation currently in flight, or undefined when none is running.
    negotiation;
    // Set once the connection is disposed or shut down; inbound and outbound frames are dropped afterwards.
    closed = false;
    /**
     * @param {object} options
     * @param {{send: Function, close: Function}} options.socket - Outbound transport.
     * @param {object} options.session - Session record this connection reads and mutates.
     * @param {Function} options.negotiate - Negotiation seam returning (a promise of) an outcome object.
     * @param {Function} options.redact - Redactor applied to each outbound message.
     */
    constructor(options) {
        this.socket = options.socket;
        this.session = options.session;
        this.negotiate = options.negotiate;
        this.redact = options.redact;
    }
    // Re-delivers the buffered replayable events to a (re)attached client, then announces the resolved
    // session — the reconnect catch-up of protocol §7. The control transport is `loopback-websocket`,
    // recorded per session so the contract's `VOICE_CONTROL_TRANSPORT_V1` baseline is never mutated.
    start(resume) {
        if (resume) {
            // Replayed messages keep their original sequence numbers: they are sent as buffered,
            // not rebuilt.
            for (const buffered of this.session.replay) {
                this.dispatchOut(buffered);
            }
        }
        this.emit({
            kind: "session.created",
            profile: this.session.profile,
            controlTransport: "loopback-websocket",
            mediaTransport: "webrtc",
            negotiationMode: VOICE_PROFILE_NEGOTIATION_MODE[this.session.profile],
            // providerLocality is only included when the session actually carries one.
            ...(this.session.providerLocality !== undefined
                ? { providerLocality: this.session.providerLocality }
                : {}),
        });
        this.emit({
            kind: "capability.offer",
            profile: this.session.profile,
            capabilities: { speechToText: true, speechOutput: true, realtimeVoice: true },
        });
    }
    /**
     * Handles one inbound text frame: parse, validate, de-duplicate by sequence, gate by profile,
     * then dispatch. Frames received after the connection is closed are ignored.
     *
     * @param {string} raw - The frame payload as text.
     * @returns {Promise<void>}
     */
    async receive(raw) {
        if (this.closed) {
            return;
        }
        let parsed;
        try {
            parsed = JSON.parse(raw);
        }
        catch {
            this.fail("invalid-message");
            return;
        }
        const validation = validateVoiceControlMessage(parsed);
        if (!validation.ok) {
            // The envelope (including the version-compatibility rule) is malformed. If the version is the
            // problem, answer the protocol's dedicated unsupported-version error before closing.
            const code = isRecord(parsed) && parsed.protocolVersion !== VOICE_PROTOCOL_VERSION
                ? "unsupported-version"
                : "invalid-message";
            this.fail(code);
            return;
        }
        const message = parsed;
        // Idempotency on (sessionId, seq): an already-seen or stale client sequence is ignored, never
        // re-processed (protocol §7). session.create is the resume anchor and is always handled.
        if (message.kind !== "session.create" && message.seq <= this.session.lastClientSeq) {
            return;
        }
        if (!voiceMessageAllowedForProfile(message.kind, this.session.profile)) {
            // Rejected frames do not advance lastClientSeq.
            this.emitError("not-allowed-for-profile");
            return;
        }
        this.session.lastClientSeq = Math.max(this.session.lastClientSeq, message.seq);
        await this.dispatchIn(message);
    }
    // Aborts any in-flight negotiation and marks the connection closed (socket lifecycle ended). The
    // session record is retained by the plane for the resume window; this only detaches the socket.
    dispose() {
        this.closed = true;
        this.cancelNegotiation();
    }
    /**
     * Routes a validated, profile-permitted message to its handler.
     *
     * @param {object} message - The validated inbound control message.
     * @returns {Promise<void>}
     */
    async dispatchIn(message) {
        // Only the kinds that drive a host action are handled. Every other permitted kind is an
        // observable no-op: a repeated session.create (start() already announced the session), a late
        // signal.ice.candidate (proxied single-shot SDP carries ICE in the offer/answer — no provider
        // trickle channel), client-reported media.track.state / transcript.partial / playback.state, and
        // any host-originated kind a client should not send. A future contract kind is likewise ignored —
        // unknown control messages are never trusted.
        switch (message.kind) {
            case "signal.sdp.offer":
                await this.handleOffer(message.sdp);
                return;
            case "capability.select":
                // The only selectable profile is the resolved full-realtime profile; acknowledge by policy.
                this.emit({ kind: "policy.decision", decision: "allow" });
                return;
            case "control.interrupt":
                // Barge-in is observed on the control plane; the actual interruption happens on the media
                // plane. Acknowledge playback interruption so the state is observable (AC6).
                this.emit({ kind: "playback.state", state: "interrupted" });
                return;
            case "control.cancel":
                this.cancelNegotiation();
                return;
            case "transcript.committed":
                this.recordTranscript(message.text);
                return;
            case "transcript.discarded":
                // Replayable + content-free: record it into the reconnect buffer without echoing.
                this.record({ kind: "transcript.discarded" });
                return;
            case "session.close":
                this.emit({ kind: "session.closed", reason: "client-request" });
                this.shutdown(1000, "session closed");
                return;
            default:
                return;
        }
    }
    // Aborts the in-flight negotiation (if any) and forgets it, so its eventual result is dropped by
    // the identity check in handleOffer().
    cancelNegotiation() {
        this.negotiation?.abort();
        this.negotiation = undefined;
    }
    /**
     * Validates an SDP offer, runs the injected negotiation, and reports the outcome to the client.
     * An offer that is not a non-empty string starting with "v=", or that exceeds
     * MAX_OFFER_SDP_BYTES, is answered with an `invalid-message` error and not negotiated.
     *
     * @param {unknown} offerSdp - The `sdp` field of the inbound offer.
     * @returns {Promise<void>}
     */
    async handleOffer(offerSdp) {
        if (typeof offerSdp !== "string" || offerSdp.length === 0 || !offerSdp.startsWith("v=")) {
            this.emitError("invalid-message");
            return;
        }
        if (Buffer.byteLength(offerSdp, "utf8") > MAX_OFFER_SDP_BYTES) {
            this.emitError("invalid-message");
            return;
        }
        // A newer offer supersedes a negotiation that is still in flight. Abort the older one rather
        // than merely dropping its result, so the superseded request does not keep running.
        this.cancelNegotiation();
        this.emit({ kind: "media.track.state", track: "audio-in", state: "negotiating" });
        const controller = new AbortController();
        this.negotiation = controller;
        let outcome;
        try {
            outcome = await this.negotiate(offerSdp, this.session.persona, this.session.chatContext, controller.signal);
        }
        catch {
            // Any thrown negotiation error is reported as a transport failure outcome.
            outcome = { ok: false, kind: "transport" };
        }
        if (this.negotiation !== controller) {
            // A cancel/close/newer offer superseded this negotiation; drop its result.
            return;
        }
        this.negotiation = undefined;
        if (this.closed) {
            return;
        }
        if (!outcome.ok) {
            this.emitError("negotiation-failed");
            this.emit({ kind: "media.track.state", track: "audio-in", state: "ended" });
            return;
        }
        // signal.sdp.answer is ephemeral + secret-bearing: it is sent to the live negotiation but never
        // buffered into the replay record (emit() only buffers replay-eligible kinds).
        this.emit({ kind: "signal.sdp.answer", sdp: outcome.value.answerSdp });
        this.emit({ kind: "media.track.state", track: "audio-in", state: "live" });
        this.emit({ kind: "media.track.state", track: "audio-out", state: "live" });
    }
    /**
     * Records a client-relayed committed transcript into the replay buffer. Non-string text is ignored.
     *
     * @param {unknown} text - The `text` field of the inbound transcript.
     */
    recordTranscript(text) {
        if (typeof text !== "string") {
            return;
        }
        // reviewable-text: neutralise Trojan-source / bidi rendering before the text may enter the replay
        // buffer (protocol §8). Bounded length. A
        // committed transcript the client relayed from the provider is recorded into the reconnect buffer
        // (it is `replayable`) but not echoed back to the client that already holds it.
        const safe = stripUnsafeFormatChars(text).slice(0, MAX_TRANSCRIPT_CHARS);
        this.record({ kind: "transcript.committed", text: safe });
    }
    // Sends a protocol `error` message carrying the given code.
    emitError(code) {
        this.emit({ kind: "error", code });
    }
    // Builds a sequenced host→client control message from a payload, appends it to the bounded replay
    // buffer when the kind is replay-eligible (protocol §7), and returns it for sending.
    build(payload) {
        const message = {
            protocolVersion: VOICE_PROTOCOL_VERSION,
            sessionId: this.session.sessionId,
            seq: this.session.hostSeq,
            direction: "host-to-client",
            ...payload,
        };
        this.session.hostSeq += 1;
        if (isVoiceReplayEligible(message.kind)) {
            this.session.replay.push(message);
            // Bounded buffer: drop the oldest entry once the cap is exceeded.
            if (this.session.replay.length > MAX_REPLAY_EVENTS) {
                this.session.replay.shift();
            }
        }
        return message;
    }
    // Sequences, redacts, and sends a host→client message now (and buffers it when replay-eligible).
    emit(payload) {
        this.dispatchOut(this.build(payload));
    }
    // Records a replay-eligible event into the reconnect buffer WITHOUT sending it — used for durable
    // events the client relayed (e.g. a committed transcript) which the connected client already holds,
    // but which a future reconnect must be able to replay.
    record(payload) {
        this.build(payload);
    }
    // Redacts, serialises, and sends one already-built message. Nothing is sent once closed.
    dispatchOut(message) {
        if (this.closed) {
            return;
        }
        try {
            this.socket.send(JSON.stringify(this.redact(message)));
        }
        catch {
            // a send failure means the socket is gone; the close handler will dispose the connection.
        }
    }
    // Answers a fatal protocol violation: emit the typed error, then close with 1008 using the error
    // code as the close reason.
    fail(code) {
        this.emitError(code);
        this.shutdown(1008, code);
    }
    // Closes the connection once: marks it closed, aborts any in-flight negotiation, and closes the
    // socket with the given code and reason. Repeated calls are no-ops.
    shutdown(code, reason) {
        if (this.closed) {
            return;
        }
        this.closed = true;
        this.cancelNegotiation();
        try {
            this.socket.close(code, reason);
        }
        catch {
            // ignore — socket already gone
        }
    }
}
|
|
499
|
+
// Tells whether a raw upgrade request is aimed at the single allowed voice control path. Only the
// pathname is compared (any query string is ignored); a request URL that cannot be parsed is
// treated as "not the voice path".
function isVoiceControlUpgrade(req) {
    let pathname;
    try {
        // The base only makes a relative request target parseable.
        ({ pathname } = new URL(req.url ?? "/", "http://127.0.0.1"));
    }
    catch {
        return false;
    }
    return pathname === VOICE_CONTROL_PATH;
}
|
|
510
|
+
// Converts an inbound WebSocket payload to text. Binary frames yield undefined (the caller rejects
// them); strings pass through untouched; a Buffer, an array of Buffers, or any other input accepted
// by Buffer.from is decoded as UTF-8.
function rawDataToString(data, isBinary) {
    if (isBinary) {
        return undefined;
    }
    if (typeof data === "string") {
        return data;
    }
    let bytes;
    if (Buffer.isBuffer(data)) {
        bytes = data;
    }
    else if (Array.isArray(data)) {
        // Fragmented payload: join the chunks before decoding.
        bytes = Buffer.concat(data);
    }
    else {
        bytes = Buffer.from(data);
    }
    return bytes.toString("utf8");
}
|
|
525
|
+
// A single liveness pass over the control sockets. A socket still flagged `isAlive === false` from
// the previous pass never answered its ping and is terminated, so a half-open connection cannot
// linger and hold a session slot. Every other socket has its flag cleared and is pinged again.
// The function only walks the iterable it is given — no clock, no timer — which keeps it directly
// unit-testable without a real WebSocket server.
export function sweepControlHeartbeat(sockets) {
    for (const peer of sockets) {
        const answeredLastPing = peer.isAlive !== false;
        if (answeredLastPing) {
            // Re-arm: the flag is cleared here and is expected to be set again when the peer answers.
            peer.isAlive = false;
            peer.ping();
        }
        else {
            peer.terminate();
        }
    }
}
|
|
539
|
+
// Parses the opening frame of a control socket and returns it only when it is a valid
// `session.create` control message. Otherwise the socket is closed with 1008 and undefined is
// returned.
function readSessionCreateFrame(ws, raw) {
    const reject = (reason) => {
        ws.close(1008, reason);
        return undefined;
    };
    let frame;
    try {
        frame = JSON.parse(raw);
    }
    catch {
        return reject("invalid opening frame");
    }
    const isSessionCreate = validateVoiceControlMessage(frame).ok &&
        isRecord(frame) &&
        frame.kind === "session.create";
    return isSessionCreate ? frame : reject("expected session.create");
}
|
|
554
|
+
// True when the opening frame requests exactly the resolved voice profile AND names the negotiation
// mode that VOICE_PROFILE_NEGOTIATION_MODE maps that profile to.
function sessionCreateMatchesVoiceProfile(parsed, voice) {
    const { profile } = voice;
    if (parsed.requestedProfile !== profile) {
        return false;
    }
    return parsed.negotiationMode === VOICE_PROFILE_NEGOTIATION_MODE[profile];
}
|
|
558
|
+
// Extracts the chat context from a validated opening frame. The context must pass
// isVoiceSessionChatContext and its chatId must be known to deps.store; otherwise the client gets a
// standalone `not-allowed-for-profile` error, the socket is closed with 1008, and undefined is
// returned.
function resolveSessionChatContext(ws, deps, sessionId, parsed) {
    const refuse = (reason) => {
        emitStandaloneError(ws, sessionId, "not-allowed-for-profile", deps.redactor);
        ws.close(1008, reason);
        return undefined;
    };
    const { chatContext } = parsed;
    if (!isVoiceSessionChatContext(chatContext)) {
        return refuse("missing chat context");
    }
    const knownChat = deps.store.findChatById(chatContext.chatId);
    if (knownChat === undefined) {
        return refuse("unknown chat context");
    }
    return chatContext;
}
|
|
572
|
+
// The realtime voice control plane for one server: accepts WebSocket upgrades on the voice control
// path, resolves (creates or resumes) a session from each socket's opening frame, attaches a
// VoiceControlConnection to it, and keeps detached sessions around for the resume window.
class VoiceControlPlaneImpl {
    // Injected plane dependencies; this class reads `handlerDeps()` and `port` from it.
    planeDeps;
    // Detached WebSocket server: upgrades are handed to it explicitly from handleUpgrade().
    wss = new WebSocketServer({ noServer: true });
    // Session records keyed by the client's idempotency key.
    sessions = new Map();
    // Liveness sweep timer, started lazily on the first connection and cleared on closeAll (shutdown).
    // A socket that misses a ping/pong cycle is terminated by the next sweep.
    heartbeat;
    constructor(planeDeps) {
        this.planeDeps = planeDeps;
    }
    // Starts the periodic heartbeat sweep once; later calls are no-ops while the timer exists.
    startHeartbeat() {
        if (this.heartbeat !== undefined) {
            return;
        }
        const timer = setInterval(() => {
            sweepControlHeartbeat(this.wss.clients);
        }, DEFAULT_VOICE_PROTOCOL_TIMEOUTS.heartbeatIntervalMs);
        // Do not keep the Node process alive solely for the heartbeat.
        timer.unref();
        this.heartbeat = timer;
    }
    /**
     * Attempts to take over an HTTP upgrade request.
     *
     * @returns {boolean} true when the upgrade was handed to the WebSocket server; false when the
     *   request is not for the voice control path or fails one of the gates (the upgrade is then
     *   left untouched by this method).
     */
    handleUpgrade(req, sock, head) {
        if (!isVoiceControlUpgrade(req)) {
            return false;
        }
        const deps = this.planeDeps.handlerDeps();
        // Three load-bearing gates: (1) a present, loopback Origin — a browser always sends Origin on a WS
        // handshake, so requiring it (in addition to `isAllowedHost` rejecting a non-loopback or opaque
        // `null` Origin) keeps the control plane reachable only from the loopback browser origin and never
        // from a non-browser local process; (2) loopback Host; (3) the full-realtime capability gate (a
        // no-voice / STT-only / policy-disabled deployment keeps the WebSocket hard-rejected, AC1/AC3).
        if (typeof req.headers.origin !== "string" ||
            !isAllowedHost(req, this.planeDeps.port) ||
            !isVoiceRealtimeCapable(deps)) {
            return false;
        }
        this.wss.handleUpgrade(req, sock, head, (ws) => {
            this.onConnection(ws, deps);
        });
        return true;
    }
    // Shutdown: stops the heartbeat, closes every live control socket with 1001, closes the WebSocket
    // server, and forgets all sessions.
    closeAll() {
        if (this.heartbeat !== undefined) {
            clearInterval(this.heartbeat);
            this.heartbeat = undefined;
        }
        for (const client of this.wss.clients) {
            client.close(1001, "server shutting down");
        }
        this.wss.close();
        this.sessions.clear();
    }
    // Drops every detached session whose resume window (SESSION_RESUME_TTL_MS) has elapsed at `now`.
    // Sessions that are currently attached (detachedAt undefined) are never removed here.
    sweepExpired(now) {
        for (const [key, state] of this.sessions) {
            if (state.detachedAt !== undefined && now - state.detachedAt > SESSION_RESUME_TTL_MS) {
                this.sessions.delete(key);
            }
        }
    }
    // Builds the negotiation seam handed to each connection. The gateway config and realtime provider
    // are resolved at call time; a missing config or provider, or grounding requested on a provider
    // without tool support, yields an `unsupported-model` failure outcome instead of a request.
    buildNegotiate(deps) {
        return async (offerSdp, persona, chatContext, signal) => {
            const config = currentGatewayConfig(deps);
            if (config === undefined) {
                return { ok: false, kind: "unsupported-model" };
            }
            const provider = resolveRealtimeProvider(config);
            if (provider === undefined) {
                return { ok: false, kind: "unsupported-model" };
            }
            // An injected request function (when present on deps) takes precedence over the default.
            const negotiate = deps.voiceRealtimeNegotiationRequest ?? requestRealtimeNegotiation;
            const instructions = realtimeInstructions(deps, chatContext);
            const groundingEnabled = realtimeGroundingEnabled(chatContext);
            const toolsSupported = realtimeProviderSupportsTools(config, provider);
            if (groundingEnabled && !toolsSupported) {
                return { ok: false, kind: "unsupported-model" };
            }
            return negotiate(buildNegotiationRequest(provider, offerSdp, persona, instructions, groundingEnabled, toolsSupported, deps, signal));
        };
    }
    // Resolves (creating or resuming) the session for a validated opening frame, or closes the socket
    // and returns undefined on a bad frame. The negotiation mode must match the canonical mode for the
    // effective profile (an inconsistent mode is rejected, never advisory).
    resolveSession(ws, deps, voice, raw) {
        const parsed = readSessionCreateFrame(ws, raw);
        if (parsed === undefined) {
            return undefined;
        }
        const sessionId = parsed.sessionId;
        if (!isSafeIdentifier(sessionId) || !isSafeIdentifier(parsed.idempotencyKey)) {
            ws.close(1008, "invalid session identifiers");
            return undefined;
        }
        if (!sessionCreateMatchesVoiceProfile(parsed, voice)) {
            emitStandaloneError(ws, sessionId, "not-allowed-for-profile", deps.redactor);
            ws.close(1008, "profile/negotiation mismatch");
            return undefined;
        }
        // Optional, content-free persona: validated against the closed VOICE_PERSONAS set. An absent or
        // malformed value is treated as "no preference" (undefined) rather than rejected, so a stale client
        // can never break session creation; the voice then falls back to the configured default.
        const persona = isVoicePersona(parsed.persona) ? parsed.persona : undefined;
        const chatContext = resolveSessionChatContext(ws, deps, sessionId, parsed);
        if (chatContext === undefined) {
            return undefined;
        }
        return this.createOrResumeSession(ws, voice, sessionId, parsed.idempotencyKey, persona, chatContext, deps.redactor);
    }
    /**
     * Returns the session for (idempotencyKey, sessionId): the stored one when both match (resume),
     * otherwise a fresh record stored under the idempotency key. Expired detached sessions are swept
     * first. When the session cap is reached, the client gets a `rate-limited` error, the socket is
     * closed with 1013, and undefined is returned.
     *
     * @returns {{state: object, resume: boolean} | undefined}
     */
    createOrResumeSession(ws, voice, sessionId, idempotencyKey, persona, chatContext, redact) {
        this.sweepExpired(Date.now());
        const resumed = this.sessions.get(idempotencyKey);
        if (resumed?.sessionId === sessionId) {
            resumed.detachedAt = undefined;
            // The resuming socket becomes the owner, so a late close of the previously attached socket
            // can no longer mark this session as detached.
            resumed.attachedSocket = ws;
            return { state: resumed, resume: true };
        }
        if (this.sessions.size >= MAX_ACTIVE_SESSIONS) {
            emitStandaloneError(ws, sessionId, "rate-limited", redact);
            ws.close(1013, "too many sessions");
            return undefined;
        }
        const state = {
            sessionId,
            idempotencyKey,
            profile: voice.profile,
            providerLocality: voice.providerLocality,
            persona,
            chatContext,
            hostSeq: 0,
            lastClientSeq: 0,
            replay: [],
            detachedAt: undefined,
            // Socket currently attached to this session; cleared when that socket closes.
            attachedSocket: ws,
        };
        this.sessions.set(idempotencyKey, state);
        return { state, resume: false };
    }
    // Heartbeat liveness for one socket: mark it alive, refresh on each pong, and ensure the sweep timer
    // is running. A socket that stops answering pings is terminated by the next sweep (no half-open leaks).
    attachHeartbeat(ws) {
        const live = ws;
        live.isAlive = true;
        ws.on("pong", () => {
            live.isAlive = true;
        });
        this.startHeartbeat();
    }
    // Wires one accepted control socket. The first text frame must resolve a session; every later
    // frame is forwarded to the attached VoiceControlConnection. Binary frames close the socket.
    onConnection(ws, deps) {
        this.attachHeartbeat(ws);
        const voice = resolveVoiceCapability(currentGatewayConfig(deps) ?? { providers: [] }, {
            policyDisabled: isVoiceDisabledByPolicy(deps.env),
        });
        const negotiate = this.buildNegotiate(deps);
        // Narrow transport facade handed to the connection: only send and close.
        const socket = {
            send: (data) => {
                ws.send(data);
            },
            close: (code, reason) => {
                ws.close(code, reason);
            },
        };
        let connection;
        let activeSession;
        ws.on("message", (data, isBinary) => {
            const raw = rawDataToString(data, isBinary);
            if (raw === undefined) {
                // Raw audio / binary frames are never a control message (AC1): reject and close.
                ws.close(1003, "binary frames are not permitted on the control plane");
                return;
            }
            if (connection !== undefined) {
                // receive() is deliberately not awaited here.
                void connection.receive(raw);
                return;
            }
            const resolved = this.resolveSession(ws, deps, voice, raw);
            if (resolved === undefined) {
                return;
            }
            activeSession = resolved.state;
            connection = new VoiceControlConnection({
                socket,
                session: resolved.state,
                negotiate,
                redact: deps.redactor,
            });
            connection.start(resolved.resume);
        });
        ws.on("close", () => {
            connection?.dispose();
            // Mark the session resumable for the reconnect window rather than deleting it immediately —
            // but only when this socket still owns the session. If a newer socket has already resumed
            // it, the session is live and must not be stamped as detached (which would let
            // sweepExpired delete it while attached).
            if (activeSession !== undefined && activeSession.attachedSocket === ws) {
                activeSession.attachedSocket = undefined;
                activeSession.detachedAt = Date.now();
            }
        });
        ws.on("error", () => {
            ws.close(1011, "control plane error");
        });
    }
}
|
|
766
|
+
// Factory for the realtime voice control plane bound to one server; `planeDeps` is passed through
// to the plane implementation unchanged.
export function createVoiceControlPlane(planeDeps) {
    const plane = new VoiceControlPlaneImpl(planeDeps);
    return plane;
}
|
|
770
|
+
// Sends a one-off protocol error on a socket that has no attached connection yet (e.g. a rejected
// opening frame), so the client receives a typed error before the close. The frame always carries
// seq 0 and goes through the same redactor as every other frame. Failures while redacting,
// serialising, or sending are ignored.
function emitStandaloneError(ws, sessionId, code, redact) {
    const envelope = {
        protocolVersion: VOICE_PROTOCOL_VERSION,
        sessionId,
        seq: 0,
        direction: "host-to-client",
    };
    const frame = { ...envelope, kind: "error", code };
    try {
        const wire = JSON.stringify(redact(frame));
        ws.send(wire);
    }
    catch {
        // ignore — socket already gone
    }
}
|