@vellumai/vellum-gateway 0.6.3 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/ARCHITECTURE.md +111 -28
  2. package/Dockerfile +2 -2
  3. package/bun.lock +1 -0
  4. package/bunfig.toml +6 -0
  5. package/package.json +3 -2
  6. package/src/__tests__/config-file-cache.test.ts +6 -6
  7. package/src/__tests__/config.test.ts +1 -1
  8. package/src/__tests__/credential-reader.test.ts +12 -12
  9. package/src/__tests__/credential-watcher-managed-bootstrap.test.ts +77 -11
  10. package/src/__tests__/credential-watcher.test.ts +3 -2
  11. package/src/__tests__/feature-flags-route.test.ts +5 -5
  12. package/src/__tests__/ipc-contact-routes.test.ts +302 -0
  13. package/src/__tests__/ipc-feature-flag-routes.test.ts +284 -0
  14. package/src/__tests__/privacy-config-route.test.ts +911 -0
  15. package/src/__tests__/remote-feature-flag-sync.test.ts +5 -5
  16. package/src/__tests__/runtime-proxy.test.ts +114 -0
  17. package/src/__tests__/schema.test.ts +2 -0
  18. package/src/__tests__/slack-deliver.test.ts +287 -0
  19. package/src/__tests__/slack-errors.test.ts +14 -0
  20. package/src/__tests__/stt-stream-websocket.test.ts +392 -0
  21. package/src/__tests__/test-preload.ts +28 -0
  22. package/src/__tests__/twilio-media-websocket.test.ts +618 -0
  23. package/src/auth/token-service.ts +4 -9
  24. package/src/avatar-sync/avatar-channel-syncer.ts +78 -0
  25. package/src/avatar-sync/avatar-sync-watcher.ts +80 -0
  26. package/src/avatar-sync/slack-avatar-syncer.ts +70 -0
  27. package/src/avatar-sync/types.ts +16 -0
  28. package/src/cli/enable-proxy.ts +3 -6
  29. package/src/config.ts +3 -18
  30. package/src/credential-reader.ts +11 -23
  31. package/src/credential-watcher.ts +3 -3
  32. package/src/db/connection.ts +97 -6
  33. package/src/db/contact-store.ts +156 -0
  34. package/src/db/data-migrations/index.ts +73 -0
  35. package/src/db/data-migrations/m0001-guardian-init-lock.ts +62 -0
  36. package/src/email/register-callback.ts +7 -0
  37. package/src/feature-flag-registry.json +46 -14
  38. package/src/feature-flag-remote-store.ts +4 -9
  39. package/src/feature-flag-store.ts +4 -9
  40. package/src/http/routes/channel-verification-session-proxy.ts +2 -2
  41. package/src/http/routes/email-webhook.ts +6 -2
  42. package/src/http/routes/privacy-config.ts +217 -8
  43. package/src/http/routes/slack-deliver.ts +147 -24
  44. package/src/http/routes/stt-stream-websocket.ts +277 -0
  45. package/src/http/routes/twilio-media-websocket.ts +271 -0
  46. package/src/index.ts +185 -1
  47. package/src/ipc/contact-handlers.ts +65 -0
  48. package/src/ipc/feature-flag-handlers.ts +61 -0
  49. package/src/ipc/server.ts +272 -0
  50. package/src/logger.ts +1 -1
  51. package/src/paths.ts +83 -0
  52. package/src/platform-url.ts +24 -0
  53. package/src/schema.ts +259 -1
  54. package/src/slack/errors.ts +10 -0
  55. package/src/telegram/webhook-manager.ts +18 -0
  56. package/src/trust-store.ts +2 -6
package/ARCHITECTURE.md CHANGED
@@ -32,6 +32,56 @@ Internet
32
32
  +-- /webhooks/* --> BLOCKED (404, never forwarded to runtime)
33
33
  ```
34
34
 
35
+ ### STT Route Proxying (Assistant-Scoped Rewrite)
36
+
37
+ Native clients (macOS, iOS) send speech-to-text transcription requests through the gateway to the daemon's STT service. Clients POST to the assistant-scoped path `/v1/assistants/:assistantId/stt/transcribe`, which the gateway's runtime proxy rewrites to the flat daemon path `/v1/stt/transcribe`. This follows the same assistant-scoped rewrite pattern used by other client-facing endpoints (feature flags, privacy config, etc.).
38
+
39
+ The request carries base64-encoded WAV audio and a MIME type. The daemon resolves the configured STT provider via `resolveBatchTranscriber()` and returns the transcribed text. Clients use the response to implement a service-first strategy: the service transcription takes precedence when available, with Apple-native `SFSpeechRecognizer` as fallback when the service returns 503 (not configured) or fails.
40
+
41
+ | Client path (gateway) | Daemon path (after rewrite) | Method |
42
+ | ----------------------------------- | --------------------------- | ------ |
43
+ | `/v1/assistants/:id/stt/transcribe` | `/v1/stt/transcribe` | POST |
44
+
45
+ **Key source files:**
46
+
47
+ | File | Purpose |
48
+ | ------------------------------------------------ | ------------------------------------------------------------------------- |
49
+ | `gateway/src/http/routes/runtime-proxy.ts` | Assistant-scoped path rewriting (`/v1/assistants/:id/...` → `/v1/...`) |
50
+ | `assistant/src/runtime/routes/stt-routes.ts` | Daemon HTTP endpoint: validates audio, resolves transcriber, returns text |
51
+ | `clients/shared/Network/STTClient.swift` | Shared client: POSTs audio to the gateway, returns typed `STTResult` |
52
+ | `clients/shared/Utilities/AudioWavEncoder.swift` | WAV encoding utility for PCM audio buffers |
53
+
54
+ ### STT Streaming WebSocket Proxy
55
+
56
+ Native clients (macOS, iOS) open WebSocket connections through the gateway to the daemon's real-time STT streaming endpoint for conversation chat message capture. The gateway authenticates the downstream client using an edge JWT (actor principal required), then opens an upstream WebSocket connection to the daemon's `/v1/stt/stream` endpoint with a short-lived gateway service token. This keeps the daemon's WebSocket endpoint unreachable from the public internet while allowing authenticated clients to stream audio for real-time transcription.
57
+
58
+ **Config-authoritative model:** The runtime always resolves the streaming transcriber from `services.stt.provider` in the assistant config, regardless of any `provider` query parameter. The `provider` parameter is optional compatibility metadata — when it is supplied and disagrees with the configured provider, the runtime logs a mismatch warning for operator visibility.
59
+
60
+ **Client path:** `wss://<gateway>/v1/stt/stream?mimeType=<mime>[&provider=<id>][&sampleRate=<hz>]`
61
+
62
+ **Query parameters:**
63
+
64
+ | Parameter | Required | Description |
65
+ | ------------ | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
66
+ | `mimeType` | Yes | MIME type of the audio being streamed (e.g. `audio/webm;codecs=opus`) |
67
+ | `provider` | No | Optional STT provider identifier (`deepgram`, `google-gemini`). Forwarded as compatibility metadata — the runtime resolves the transcriber from config, not from this parameter. |
68
+ | `sampleRate` | No | Sample rate in Hz (e.g. `16000`). Passed through to the daemon. |
69
+ | `token` | No | Edge JWT (alternative to `Authorization: Bearer` header for WS upgrades) |
70
+
71
+ **Auth model:** STT streaming is an authenticated, assistant-scoped path. The client must present a valid edge JWT with an actor principal. Service tokens are rejected. When `runtimeProxyRequireAuth` is globally disabled (dev bypass), the upgrade proceeds without token validation.
72
+
73
+ **Proxy behavior:** The gateway buffers up to 100 downstream messages while the upstream connection to the daemon is being established. If the buffer overflows, the downstream connection is closed with code 1008 (policy violation). Once the upstream connection opens, buffered messages are flushed in order. All subsequent messages are forwarded bidirectionally: client audio frames flow upstream, daemon transcript events (JSON text frames: `ready`, `partial`, `final`, `error`, `closed`) flow downstream. When either side closes, the other side is closed with the same code/reason.
74
+
75
+ **Key source files:**
76
+
77
+ | File | Purpose |
78
+ | ------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
79
+ | `gateway/src/http/routes/stt-stream-websocket.ts` | WebSocket upgrade handler (`createSttStreamWebsocketHandler`) and proxy handlers (`getSttStreamWebsocketHandlers`) |
80
+ | `gateway/src/index.ts` | Route registration: wires upgrade handler to the gateway's Bun HTTP server |
81
+ | `assistant/src/runtime/http-server.ts` | Daemon-side WebSocket upgrade at `/v1/stt/stream`, session creation and registry |
82
+ | `assistant/src/stt/stt-stream-session.ts` | Runtime session orchestrator: drives the `StreamingTranscriber` from the WebSocket |
83
+ | `clients/shared/Network/STTStreamingClient.swift` | Swift client: builds the gateway WS URL via `GatewayHTTPClient.buildWebSocketRequest` |
84
+
35
85
  ### Assistant Feature Flags API
36
86
 
37
87
  The gateway exposes a REST API for reading and mutating assistant feature flags. Assistant feature flags are assistant-scoped, declaration-driven booleans that can gate any assistant behavior. Skill availability is one consumer, but not a required coupling (see [`assistant/ARCHITECTURE.md`](../assistant/ARCHITECTURE.md) for resolver and skill enforcement details).
@@ -551,12 +601,23 @@ If no guardian binding exists for the channel, escalation fails closed -- the me
551
601
 
552
602
  #### SQLite Tables
553
603
 
604
+ **Assistant DB** (`assistant.db` — current owner, migrating to gateway):
605
+
554
606
  | Table | Purpose |
555
607
  | --------------------------- | --------------------------------------------------------------------- |
556
608
  | `assistant_ingress_invites` | Invite tokens with SHA-256 hashes, expiry, use counts |
557
609
  | `contacts` | Contact records with role, relationship, and per-contact metadata |
558
610
  | `contact_channels` | Channel bindings per contact with access policy (allow/deny/escalate) |
559
611
 
612
+ **Gateway DB** (`gateway.sqlite` — future owner of auth/authz):
613
+
614
+ | Table | Purpose |
615
+ | ------------------ | ---------------------------------------------------------------------- |
616
+ | `contacts` | Contact auth/authz: id, display_name, role, principal_id |
617
+ | `contact_channels` | Channel bindings with policy, status, external IDs, verification state |
618
+
619
+ The gateway declares `contacts` and `contact_channels` tables and exposes them via IPC (`list_contacts`, `get_contact`, `get_contact_by_channel`, `get_channels_for_contact`). Endpoint cutover and data migration are in progress — the gateway will become the canonical owner once dual-writing is enabled.
620
+
560
621
  #### Key Modules
561
622
 
562
623
  | Module | Purpose |
@@ -566,6 +627,8 @@ If no guardian binding exists for the channel, escalation fails closed -- the me
566
627
  | `assistant/src/contacts/contacts-write.ts` | Contact and channel writes (upsert, policy changes, invite redemption) |
567
628
  | `assistant/src/daemon/handlers/config-inbox.ts` | Handlers for invite and member contracts |
568
629
  | `assistant/src/runtime/routes/channel-routes.ts` | ACL enforcement point -- member lookup, policy check, escalation creation |
630
+ | `gateway/src/db/contact-store.ts` | Gateway-side read-only ContactStore (prepared-statement queries) |
631
+ | `gateway/src/ipc/contact-handlers.ts` | IPC route handlers for contact reads |
569
632
 
570
633
  ### Telegram Credential Flow
571
634
 
@@ -678,9 +741,9 @@ The Socket Mode client auto-reconnects on any WebSocket close or error. The back
678
741
 
679
742
  ---
680
743
 
681
- ## AI Phone Calls — Twilio ConversationRelay
744
+ ## AI Phone Calls — Twilio Voice
682
745
 
683
- The Calls subsystem supports both **outbound** and **inbound** voice calls via Twilio's ConversationRelay protocol. The assistant uses an LLM-driven conversation loop to speak in real time. Voice is a first-class channel with its own per-call conversation (outbound key: `asst:${assistantId}:voice:call:${callSessionId}`, inbound key: `asst:${assistantId}:voice:inbound:${callSid}`). When the AI needs guardian input during a call, it dispatches ASK_GUARDIAN requests cross-channel to mac/telegram via the guardian dispatch engine. Answer resolution uses first-writer-wins semantics -- the first channel to respond provides the answer, and remaining channels receive a "already answered" notice.
746
+ The Calls subsystem supports both **outbound** and **inbound** voice calls via Twilio. The Twilio integration path is provider-conditional: `services.stt.provider` determines whether calls use ConversationRelay (Twilio-native STT for Deepgram/Google) or Media Streams (daemon-side STT for OpenAI Whisper). The assistant uses an LLM-driven conversation loop to speak in real time. Voice is a first-class channel with its own per-call conversation (outbound key: `asst:${assistantId}:voice:call:${callSessionId}`, inbound key: `asst:${assistantId}:voice:inbound:${callSid}`). When the AI needs guardian input during a call, it dispatches ASK_GUARDIAN requests cross-channel to mac/telegram via the guardian dispatch engine. Answer resolution uses first-writer-wins semantics -- the first channel to respond provides the answer, and remaining channels receive an "already answered" notice.
684
747
 
685
748
  ### Outbound Call Flow
686
749
 
@@ -891,7 +954,8 @@ sequenceDiagram
891
954
  | `gateway/src/http/routes/twilio-voice-webhook.ts` | Gateway route: validates Twilio signature, forwards voice webhook to runtime |
892
955
  | `gateway/src/http/routes/twilio-status-webhook.ts` | Gateway route: validates Twilio signature, forwards status callback to runtime |
893
956
  | `gateway/src/http/routes/twilio-connect-action-webhook.ts` | Gateway route: validates Twilio signature, forwards connect-action to runtime |
894
- | `gateway/src/http/routes/twilio-relay-websocket.ts` | Gateway route: WebSocket proxy for ConversationRelay frames between Twilio and runtime |
957
+ | `gateway/src/http/routes/twilio-relay-websocket.ts` | Gateway route: WebSocket proxy for ConversationRelay frames between Twilio and runtime (used for Deepgram/Google native STT) |
958
+ | `gateway/src/http/routes/twilio-media-websocket.ts` | Gateway route: WebSocket proxy for Media Streams frames between Twilio and runtime (used for OpenAI Whisper media-stream STT) |
895
959
  | `gateway/src/twilio/validate-webhook.ts` | Twilio webhook validation: HMAC-SHA1 signature verification, payload size limits, fail-closed when auth token missing |
896
960
 
897
961
  ### Call State Machine
@@ -960,14 +1024,17 @@ All five tables live in `~/.vellum/workspace/data/db/assistant.db` alongside exi
960
1024
 
961
1025
  Internet-facing Twilio callbacks terminate at the gateway, which validates signatures before forwarding to the runtime. This keeps the runtime behind the gateway's bearer-auth boundary.
962
1026
 
963
- | Gateway Route | Validates | Forwards To (Runtime) |
964
- | -------------------------------------- | --------------------------------- | ---------------------------------------------------------------------------------------- |
965
- | `POST /webhooks/twilio/voice` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/voice-webhook` (JSON: `{ params, originalUrl, assistantId? }`) |
966
- | `POST /webhooks/twilio/status` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/status` (JSON: `{ params }`) |
967
- | `POST /webhooks/twilio/connect-action` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/connect-action` (JSON: `{ params }`) |
968
- | `WS /webhooks/twilio/relay` | WebSocket upgrade | `WS /v1/calls/relay` (bidirectional proxy) |
1027
+ | Gateway Route | Validates | Forwards To (Runtime) |
1028
+ | ---------------------------------------------------------- | --------------------------------- | ---------------------------------------------------------------------------------------- |
1029
+ | `POST /webhooks/twilio/voice` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/voice-webhook` (JSON: `{ params, originalUrl, assistantId? }`) |
1030
+ | `POST /webhooks/twilio/status` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/status` (JSON: `{ params }`) |
1031
+ | `POST /webhooks/twilio/connect-action` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/connect-action` (JSON: `{ params }`) |
1032
+ | `WS /webhooks/twilio/relay` | WebSocket upgrade | `WS /v1/calls/relay` (bidirectional proxy) — ConversationRelay path |
1033
+ | `WS /webhooks/twilio/media-stream/<callSessionId>/<token>` | WebSocket upgrade | `WS /v1/calls/media-stream` (bidirectional proxy) — Media Streams path |
1034
+
1035
+ In gateway-fronted deployments, the TwiML WebSocket URL (returned by the voice webhook) should point to the gateway's `/webhooks/twilio/relay` (ConversationRelay) or `/webhooks/twilio/media-stream/<callSessionId>/<token>` (Media Streams) endpoint rather than directly to the runtime. The gateway proxies frames bidirectionally between Twilio and the runtime, preserving close and error semantics for proper cleanup.
969
1036
 
970
- In gateway-fronted deployments, the TwiML WebSocket URL (returned by the voice webhook) should point to the gateway's `/webhooks/twilio/relay` endpoint rather than directly to the runtime. The gateway proxies ConversationRelay frames bidirectionally between Twilio and the runtime, preserving close and error semantics for proper cleanup.
1037
+ **Media Streams handshake metadata:** Twilio Media Streams does not reliably preserve URL query parameters across the WebSocket upgrade, so handshake metadata (`callSessionId` and auth `token`) is encoded as **URL path segments** (primary transport). The gateway also supports legacy query-parameter-based handshake as a fallback for backward compatibility. The metadata extractor in `twilio-media-websocket.ts` resolves values from path segments first, falling back to query parameters.
971
1038
 
972
1039
  Signature validation is **fail-closed**: if the Twilio auth token is not configured, all webhook requests are rejected with `403`. Missing or invalid `X-Twilio-Signature` headers are also rejected with `403`. Payload size is capped by `maxWebhookPayloadBytes` (checked via both `Content-Length` header and actual body size).
973
1040
 
@@ -999,7 +1066,8 @@ This makes ingress URL updates smoother in local tunnel workflows because Twilio
999
1066
  | POST | `/v1/calls/:callSessionId/instruction` | Relay a steering instruction to an active call's controller (alternative to in-conversation bridge) |
1000
1067
  | POST | `/v1/internal/twilio/status` | Internal status callback used by gateway; accepts JSON `{ params }` |
1001
1068
  | POST | `/v1/internal/twilio/connect-action` | Internal connect action callback used by gateway; accepts JSON `{ params }` |
1002
- | WS | `/v1/calls/relay` | ConversationRelay WebSocket (bidirectional: prompt/interrupt/dtmf from Twilio, text tokens/end to Twilio) |
1069
+ | WS | `/v1/calls/relay` | ConversationRelay WebSocket (bidirectional: prompt/interrupt/dtmf from Twilio, text tokens/end to Twilio) — Deepgram/Google path |
1070
+ | WS | `/v1/calls/media-stream` | Media Streams WebSocket (raw audio from Twilio, daemon-side STT) — OpenAI Whisper path |
1003
1071
 
1004
1072
  ### Tools
1005
1073
 
@@ -1016,7 +1084,7 @@ Both tools and HTTP routes delegate to the same domain functions in `call-domain
1016
1084
  The CallController detects two special markers in the LLM's response text:
1017
1085
 
1018
1086
  - **`[ASK_GUARDIAN: question]`** — The AI needs to consult the guardian. The controller creates a pending question, notifies the session via `fireCallQuestionNotifier`, puts the caller on hold, and waits for a guardian answer (timeout configured via `calls.userConsultTimeoutSeconds`).
1019
- - **`[END_CALL]`** — The AI has determined the call's purpose is fulfilled. The controller sends a goodbye, closes the ConversationRelay session, and marks the call as completed.
1087
+ - **`[END_CALL]`** — The AI has determined the call's purpose is fulfilled. The controller sends a goodbye, closes the call session, and marks the call as completed.
1020
1088
 
1021
1089
  Both markers are stripped from the TTS output so the callee never hears the raw control text.
1022
1090
 
@@ -1035,19 +1103,20 @@ Malformed or unprocessable provider callback payloads are logged as dead-letter
1035
1103
 
1036
1104
  Call behavior is controlled via the `calls` config block in the assistant configuration (`config/schema.ts`). All values have sensible defaults and are validated via Zod:
1037
1105
 
1038
- | Field | Type | Default | Description |
1039
- | ----------------------------------- | -------- | ------------------------------ | --------------------------------------------------------------------------------------------------- |
1040
- | `calls.enabled` | boolean | `true` | Master toggle for the calls feature. When `false`, call routes return 403 and tools return errors. |
1041
- | `calls.provider` | enum | `'twilio'` | Voice provider to use (currently only Twilio is supported). |
1042
- | `calls.maxDurationSeconds` | int | `3600` | Maximum allowed duration per call. |
1043
- | `calls.userConsultTimeoutSeconds` | int | `120` | How long to wait for a user answer before timing out a pending question. |
1044
- | `calls.disclosure.enabled` | boolean | `true` | Whether the AI should disclose it is an AI at the start of the call. |
1045
- | `calls.disclosure.text` | string | _(default disclosure prompt)_ | The disclosure instruction included in the system prompt. |
1046
- | `calls.safety.denyCategories` | string[] | `[]` | Categories of calls to deny (e.g., emergency numbers are always denied regardless of this setting). |
1047
- | `calls.model` | string | _(unset — uses default model)_ | Optional override for the LLM model used in voice call conversations. |
1048
- | `calls.voice.language` | string | `'en-US'` | Language code for TTS and transcription. |
1049
- | `calls.voice.transcriptionProvider` | enum | `'Deepgram'` | Speech-to-text provider (`Deepgram` or `Google`). |
1050
- | `elevenlabs.voiceId` | string | `'ZF6FPAbjXT4488VcRRnw'` | ElevenLabs voice ID used by both in-app TTS and phone calls. Defaults to Amelia. |
1106
+ | Field | Type | Default | Description |
1107
+ | --------------------------------- | -------- | ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
1108
+ | `calls.enabled` | boolean | `true` | Master toggle for the calls feature. When `false`, call routes return 403 and tools return errors. |
1109
+ | `calls.provider` | enum | `'twilio'` | Voice provider to use (currently only Twilio is supported). |
1110
+ | `calls.maxDurationSeconds` | int | `3600` | Maximum allowed duration per call. |
1111
+ | `calls.userConsultTimeoutSeconds` | int | `120` | How long to wait for a user answer before timing out a pending question. |
1112
+ | `calls.disclosure.enabled` | boolean | `true` | Whether the AI should disclose it is an AI at the start of the call. |
1113
+ | `calls.disclosure.text` | string | _(default disclosure prompt)_ | The disclosure instruction included in the system prompt. |
1114
+ | `calls.safety.denyCategories` | string[] | `[]` | Categories of calls to deny (e.g., emergency numbers are always denied regardless of this setting). |
1115
+ | `calls.model` | string | _(unset — uses default model)_ | Optional override for the LLM model used in voice call conversations. |
1116
+ | `calls.voice.language` | string | `'en-US'` | Language code for TTS and transcription. |
1117
+ | `services.stt.provider` | enum | `'deepgram'` | STT provider for all boundaries including telephony. Determines the Twilio integration path (ConversationRelay-native for `deepgram`/`google-gemini`, media-stream for `openai-whisper`). |
1118
+ | `services.tts.provider` | enum | `'elevenlabs'` | Active TTS provider for speech synthesis (catalog-driven; see [TTS Provider Abstraction](../assistant/ARCHITECTURE.md#tts-provider-abstraction-servicestts)). |
1119
+ | `services.tts.providers.<id>.*` | object | _(per-provider defaults)_ | Provider-specific settings block. One block per catalog entry (e.g. `elevenlabs`, `fish-audio`). |
1051
1120
 
1052
1121
  ### Caller Identity Resolution
1053
1122
 
@@ -1064,10 +1133,24 @@ Both the resolved mode and source are logged at info level on success, and rejec
1064
1133
 
1065
1134
  ### Voice Quality Profile Resolution
1066
1135
 
1067
- Voice and TTS settings are configurable via the `calls.voice` config block — they are not hardcoded. The function `resolveVoiceQualityProfile()` in `voice-quality.ts` reads the current config and resolves it into a `VoiceQualityProfile` containing the TTS provider, voice spec string, language, and transcription provider.
1136
+ Voice and TTS settings are configurable via the `calls.voice` and `services.tts` config blocks — they are not hardcoded. The function `resolveVoiceQualityProfile()` in `voice-quality.ts` uses the catalog-driven call strategy abstraction to determine how the active TTS provider integrates with the Twilio telephony path, then resolves the result into a `VoiceQualityProfile` containing the TTS provider, voice spec string, and language.
1137
+
1138
+ The active TTS provider is determined by `services.tts.provider` (default: `"elevenlabs"`). Provider-specific settings (voice ID, model, tuning parameters) are read from `services.tts.providers.<id>`. The call mode (`native-twilio` or `synthesized-play`) is resolved from the canonical provider catalog via `resolveCallStrategy()` in `tts-call-strategy.ts` — it reads the provider's declared `callMode` rather than inferring behavior from runtime capabilities.
1139
+
1140
+ For `native-twilio` providers (e.g. ElevenLabs), the voice quality profile looks up a registered `NativeTwilioVoiceSpecBuilder` to construct the provider-specific voice spec string for the ConversationRelay `voice` attribute. New native providers plug in by registering their own voice spec builder — no edits to core call routing logic required. For `synthesized-play` providers (e.g. Fish Audio), `ttsProvider` is set to `"Google"` as a placeholder in TwiML and actual audio is delivered via `play` messages — the assistant synthesises audio via the provider's HTTP API.
1141
+
1142
+ The voice webhook in `twilio-routes.ts` calls `resolveVoiceQualityProfile()` for TTS settings and separately resolves the telephony STT strategy via `resolveTelephonySttRouting()`. The routing result determines which TwiML generator to use: `generateTwiML()` for Twilio-native ConversationRelay, or `generateStreamTwiML()` for the media-stream path. This separation keeps TTS and STT resolution independent — the voice quality profile controls the TTS provider, voice, and language, while the routing strategy controls the STT integration path.
1143
+
1144
+ For full details on the catalog-driven TTS architecture, provider catalog, call strategy abstraction, and the provider-add checklist, see the [TTS Provider Abstraction](../assistant/ARCHITECTURE.md#tts-provider-abstraction-servicestts) section in the assistant architecture docs.
1145
+
1146
+ ### Telephony STT: Provider-Conditional Hybrid Routing
1147
+
1148
+ Telephony STT is unified under `services.stt.provider`. The voice webhook in `twilio-routes.ts` calls `resolveTelephonySttRouting()` to determine the Twilio integration path based on the active provider:
1149
+
1150
+ - **Deepgram / Google** (`conversation-relay-native` strategy) — TwiML emits `<Connect><ConversationRelay>` with Twilio-native `transcriptionProvider` and `speechModel` attributes. The gateway proxies ConversationRelay frames via `/webhooks/twilio/relay`. The daemon receives transcribed text, not raw audio.
1068
1151
 
1069
- All calls use **ElevenLabs** as the TTS provider via Twilio ConversationRelay. The voice ID is read from the shared `elevenlabs.voiceId` config key (defaulting to Amelia `ZF6FPAbjXT4488VcRRnw`). Optional tuning parameters (`voiceModelId`, `speed`, `stability`, `similarityBoost`) are also read from the top-level `elevenlabs` config. When `voiceModelId` is set, the emitted voice spec uses the Twilio ConversationRelay extended format: `voiceId-model-speed_stability_similarity`. When `voiceModelId` is empty (the default), only the bare `voiceId` is sent.
1152
+ - **OpenAI Whisper** (`media-stream-custom` strategy) — TwiML emits `<Connect><Stream>` pointing to the gateway's media-stream proxy (`/webhooks/twilio/media-stream`). The gateway forwards raw audio frames to the daemon's media-stream server, which transcribes server-side.
1070
1153
 
1071
- The voice webhook in `twilio-routes.ts` calls `resolveVoiceQualityProfile()` and passes the result directly to `generateTwiML()` to produce ConversationRelay TwiML.
1154
+ Both paths are active in production. The strategy selection happens at call setup time based on the current `services.stt.provider` value. See `docs/internal-reference.md` for a provider-specific troubleshooting matrix.
1072
1155
 
1073
1156
  ---
package/Dockerfile CHANGED
@@ -2,7 +2,7 @@
2
2
  FROM oven/bun:1.3.11@sha256:0733e50325078969732ebe3b15ce4c4be5082f18c4ac1a0f0ca4839c2e4e42a7 AS bun
3
3
 
4
4
  # Build stage
5
- FROM debian:trixie@sha256:3615a749858a1cba49b408fb49c37093db813321355a9ab7c1f9f4836341e9db AS builder
5
+ FROM debian:trixie@sha256:3352c2e13876c8a5c5873ef20870e1939e73cb9a3c1aeba5e3e72172a85ce9ed AS builder
6
6
 
7
7
  WORKDIR /app
8
8
 
@@ -14,7 +14,7 @@ RUN bun install --frozen-lockfile --production
14
14
  COPY . .
15
15
 
16
16
  # Runtime stage
17
- FROM debian:trixie-slim@sha256:1d3c811171a08a5adaa4a163fbafd96b61b87aa871bbc7aa15431ac275d3d430 AS runner
17
+ FROM debian:trixie-slim@sha256:4ffb3a1511099754cddc70eb1b12e50ffdb67619aa0ab6c13fcd800a78ef7c7a AS runner
18
18
 
19
19
  WORKDIR /app
20
20
 
package/bun.lock CHANGED
@@ -10,6 +10,7 @@
10
10
  "pino": "9.14.0",
11
11
  "pino-pretty": "13.1.3",
12
12
  "uuid": "13.0.0",
13
+ "zod": "4.3.6",
13
14
  },
14
15
  "devDependencies": {
15
16
  "@types/bun": "1.3.9",
package/bunfig.toml ADDED
@@ -0,0 +1,6 @@
1
+ [install]
2
+ exact = true
3
+
4
+ [test]
5
+ root = "./src"
6
+ preload = ["./src/__tests__/test-preload.ts"]
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vellumai/vellum-gateway",
3
- "version": "0.6.3",
3
+ "version": "0.6.4",
4
4
  "license": "MIT",
5
5
  "type": "module",
6
6
  "exports": {
@@ -27,7 +27,8 @@
27
27
  "minimatch": "10.2.4",
28
28
  "pino": "9.14.0",
29
29
  "pino-pretty": "13.1.3",
30
- "uuid": "13.0.0"
30
+ "uuid": "13.0.0",
31
+ "zod": "4.3.6"
31
32
  },
32
33
  "devDependencies": {
33
34
  "@types/bun": "1.3.9",
@@ -13,26 +13,26 @@ import { ConfigFileCache } from "../config-file-cache.js";
13
13
  let testBaseDir: string;
14
14
  let workspaceDir: string;
15
15
  let configPath: string;
16
- let savedBaseDataDir: string | undefined;
16
+ let savedWorkspaceDir: string | undefined;
17
17
 
18
18
  function writeConfig(data: Record<string, unknown>): void {
19
19
  writeFileSync(configPath, JSON.stringify(data));
20
20
  }
21
21
 
22
22
  beforeEach(() => {
23
- savedBaseDataDir = process.env.BASE_DATA_DIR;
23
+ savedWorkspaceDir = process.env.VELLUM_WORKSPACE_DIR;
24
24
  testBaseDir = mkdtempSync(join(tmpdir(), "config-file-cache-test-"));
25
25
  workspaceDir = join(testBaseDir, ".vellum", "workspace");
26
26
  mkdirSync(workspaceDir, { recursive: true });
27
27
  configPath = join(workspaceDir, "config.json");
28
- process.env.BASE_DATA_DIR = testBaseDir;
28
+ process.env.VELLUM_WORKSPACE_DIR = workspaceDir;
29
29
  });
30
30
 
31
31
  afterEach(() => {
32
- if (savedBaseDataDir === undefined) {
33
- delete process.env.BASE_DATA_DIR;
32
+ if (savedWorkspaceDir === undefined) {
33
+ delete process.env.VELLUM_WORKSPACE_DIR;
34
34
  } else {
35
- process.env.BASE_DATA_DIR = savedBaseDataDir;
35
+ process.env.VELLUM_WORKSPACE_DIR = savedWorkspaceDir;
36
36
  }
37
37
  rmSync(testBaseDir, { recursive: true, force: true });
38
38
  });
@@ -23,7 +23,7 @@ describe("config: hardcoded defaults", () => {
23
23
  expect(config.unmappedPolicy).toBe("reject");
24
24
  expect(config.routingEntries).toEqual([]);
25
25
  expect(config.defaultAssistantId).toBeUndefined();
26
- expect(config.logFile.dir).toMatch(/\.vellum\/logs$/);
26
+ expect(config.logFile.dir).toMatch(/logs$/);
27
27
  expect(config.logFile.retentionDays).toBe(30);
28
28
  });
29
29
 
@@ -39,7 +39,7 @@ const testDir = join(
39
39
  );
40
40
 
41
41
  function metadataDir(): string {
42
- return join(testDir, ".vellum", "workspace", "data", "credentials");
42
+ return join(testDir, "data", "credentials");
43
43
  }
44
44
 
45
45
  function writeMetadata(
@@ -106,8 +106,8 @@ function encryptEntries(
106
106
  }
107
107
 
108
108
  function writeEncryptedStore(entries: Record<string, string>): void {
109
- const storePath = join(testDir, ".vellum", "protected", "keys.enc");
110
- mkdirSync(join(testDir, ".vellum", "protected"), { recursive: true });
109
+ mkdirSync(testDir, { recursive: true });
110
+ const storePath = join(testDir, "keys.enc");
111
111
 
112
112
  const salt = randomBytes(16);
113
113
  const key = pbkdf2Sync(
@@ -131,17 +131,16 @@ function writeEncryptedStore(entries: Record<string, string>): void {
131
131
  * The store.key is used directly as the AES-256-GCM key (no PBKDF2).
132
132
  */
133
133
  function writeEncryptedStoreV2(entries: Record<string, string>): void {
134
- const protectedDir = join(testDir, ".vellum", "protected");
135
- mkdirSync(protectedDir, { recursive: true });
134
+ mkdirSync(testDir, { recursive: true });
136
135
 
137
136
  const storeKey = randomBytes(KEY_LENGTH);
138
- writeFileSync(join(protectedDir, "store.key"), storeKey);
137
+ writeFileSync(join(testDir, "store.key"), storeKey);
139
138
 
140
139
  const store = {
141
140
  version: 2,
142
141
  entries: encryptEntries(entries, storeKey),
143
142
  };
144
- writeFileSync(join(protectedDir, "keys.enc"), JSON.stringify(store));
143
+ writeFileSync(join(testDir, "keys.enc"), JSON.stringify(store));
145
144
  }
146
145
 
147
146
  // ---------------------------------------------------------------------------
@@ -149,12 +148,14 @@ function writeEncryptedStoreV2(entries: Record<string, string>): void {
149
148
  // ---------------------------------------------------------------------------
150
149
 
151
150
  beforeEach(() => {
152
- process.env.BASE_DATA_DIR = testDir;
151
+ process.env.GATEWAY_SECURITY_DIR = testDir;
152
+ process.env.VELLUM_WORKSPACE_DIR = testDir;
153
153
  logCalls.length = 0;
154
154
  });
155
155
 
156
156
  afterEach(() => {
157
- delete process.env.BASE_DATA_DIR;
157
+ delete process.env.GATEWAY_SECURITY_DIR;
158
+ delete process.env.VELLUM_WORKSPACE_DIR;
158
159
  try {
159
160
  rmSync(testDir, { recursive: true, force: true });
160
161
  } catch {
@@ -178,8 +179,7 @@ describe("v2 encrypted store with store.key", () => {
178
179
 
179
180
  test("returns undefined for v2 store when store.key is missing", async () => {
180
181
  // Write a v2 store but without the store.key file
181
- const protectedDir = join(testDir, ".vellum", "protected");
182
- mkdirSync(protectedDir, { recursive: true });
182
+ mkdirSync(testDir, { recursive: true });
183
183
 
184
184
  const storeKey = randomBytes(KEY_LENGTH);
185
185
  const store = {
@@ -189,7 +189,7 @@ describe("v2 encrypted store with store.key", () => {
189
189
  storeKey,
190
190
  ),
191
191
  };
192
- writeFileSync(join(protectedDir, "keys.enc"), JSON.stringify(store));
192
+ writeFileSync(join(testDir, "keys.enc"), JSON.stringify(store));
193
193
  // Deliberately do NOT write store.key
194
194
 
195
195
  const result = await readCredential(credentialKey("test", "key"));
@@ -1,4 +1,5 @@
1
1
  import { afterEach, describe, expect, test } from "bun:test";
2
+ import { createServer } from "node:net";
2
3
  import { spawn, type ChildProcess } from "node:child_process";
3
4
  import { mkdirSync, renameSync, rmSync, writeFileSync } from "node:fs";
4
5
  import { tmpdir } from "node:os";
@@ -54,19 +55,51 @@ let gatewayPort = 0;
54
55
  let cesPort = 0;
55
56
  let cesServer: ReturnType<typeof Bun.serve> | null = null;
56
57
 
57
- function assignPorts(): void {
58
- if (gatewayPort !== 0 && cesPort !== 0) return;
59
- gatewayPort = 49152 + Math.floor(Math.random() * 8_192);
60
- cesPort = gatewayPort + 1;
58
+ /** Ask the OS for a free port by briefly binding to port 0. */
59
+ function getFreePort(): Promise<number> {
60
+ return new Promise((resolve, reject) => {
61
+ const srv = createServer();
62
+ srv.listen(0, "127.0.0.1", () => {
63
+ const addr = srv.address();
64
+ if (!addr || typeof addr === "string") {
65
+ srv.close();
66
+ reject(new Error("Failed to get free port"));
67
+ return;
68
+ }
69
+ const port = addr.port;
70
+ srv.close(() => resolve(port));
71
+ });
72
+ srv.on("error", reject);
73
+ });
74
+ }
75
+
76
+ /** Wait for a child process to exit, with a safety timeout. */
77
+ function waitForExit(proc: ChildProcess, timeoutMs = 5_000): Promise<void> {
78
+ return new Promise<void>((resolve) => {
79
+ if (proc.exitCode !== null || proc.signalCode !== null) {
80
+ resolve();
81
+ return;
82
+ }
83
+ const timer = setTimeout(resolve, timeoutMs);
84
+ proc.on("exit", () => {
85
+ clearTimeout(timer);
86
+ resolve();
87
+ });
88
+ });
61
89
  }
62
90
 
63
91
  async function startGateway(): Promise<void> {
64
- assignPorts();
92
+ if (cesPort === 0)
93
+ throw new Error(
94
+ "CES port not assigned — call startFakeCes or reserveCesPort first",
95
+ );
96
+ gatewayPort = await getFreePort();
65
97
 
66
98
  gatewayProc = spawn("bun", ["run", gatewayEntry], {
67
99
  env: {
68
100
  ...process.env,
69
- BASE_DATA_DIR: testDir,
101
+ GATEWAY_SECURITY_DIR: join(testDir, ".vellum", "protected"),
102
+ VELLUM_WORKSPACE_DIR: join(testDir, ".vellum", "workspace"),
70
103
  GATEWAY_PORT: String(gatewayPort),
71
104
  CES_CREDENTIAL_URL: `http://127.0.0.1:${cesPort}`,
72
105
  CES_SERVICE_TOKEN: TEST_SERVICE_TOKEN,
@@ -76,8 +109,27 @@ async function startGateway(): Promise<void> {
76
109
  stdio: ["ignore", "pipe", "pipe"],
77
110
  });
78
111
 
112
+ // Collect stderr for diagnostics on failure.
113
+ const stderrChunks: Buffer[] = [];
114
+ gatewayProc.stderr?.on("data", (chunk: Buffer) => stderrChunks.push(chunk));
115
+
116
+ // Track early exit so we can fail fast instead of polling for 30s.
117
+ let earlyExitCode: number | null = null;
118
+ let earlyExitSignal: string | null = null;
119
+ gatewayProc.on("exit", (code, signal) => {
120
+ earlyExitCode = code;
121
+ earlyExitSignal = signal;
122
+ });
123
+
79
124
  const deadline = Date.now() + 30_000;
80
125
  while (Date.now() < deadline) {
126
+ // If the process already died, fail immediately with stderr.
127
+ if (earlyExitCode !== null || earlyExitSignal !== null) {
128
+ const stderr = Buffer.concat(stderrChunks).toString().slice(-2000);
129
+ throw new Error(
130
+ `Gateway exited early (code=${earlyExitCode}, signal=${earlyExitSignal})\n${stderr}`,
131
+ );
132
+ }
81
133
  try {
82
134
  const res = await fetch(`http://localhost:${gatewayPort}/healthz`);
83
135
  if (res.ok) return;
@@ -86,7 +138,10 @@ async function startGateway(): Promise<void> {
86
138
  }
87
139
  await new Promise((resolve) => setTimeout(resolve, 100));
88
140
  }
89
- throw new Error("Gateway failed to start within 30 seconds");
141
+ const stderr = Buffer.concat(stderrChunks).toString().slice(-2000);
142
+ throw new Error(
143
+ `Gateway failed to start within 30 seconds\nstderr: ${stderr}`,
144
+ );
90
145
  }
91
146
 
92
147
  function startFakeCes(opts: {
@@ -94,11 +149,12 @@ function startFakeCes(opts: {
94
149
  credentials?: Record<string, string>;
95
150
  resolveValue?: (account: string) => string | undefined;
96
151
  }): void {
97
- assignPorts();
98
152
  const accounts = opts.accounts ?? Object.keys(opts.credentials ?? {});
99
153
  const credentials = opts.credentials ?? {};
100
154
  cesServer = Bun.serve({
101
- port: cesPort,
155
+ // If cesPort was pre-reserved (for tests that start the gateway before
156
+ // the CES), bind to that port. Otherwise let the OS pick a free one.
157
+ port: cesPort || 0,
102
158
  fetch(req) {
103
159
  const authHeader = req.headers.get("authorization");
104
160
  if (authHeader !== `Bearer ${TEST_SERVICE_TOKEN}`) {
@@ -130,17 +186,22 @@ function startFakeCes(opts: {
130
186
  return new Response("Not Found", { status: 404 });
131
187
  },
132
188
  });
189
+ cesPort = cesServer.port!;
133
190
  }
134
191
 
135
- afterEach(() => {
192
+ afterEach(async () => {
136
193
  cesServer?.stop(true);
137
194
  cesServer = null;
138
195
  gatewayPort = 0;
139
196
  cesPort = 0;
140
197
 
141
198
  if (gatewayProc) {
142
- gatewayProc.kill("SIGKILL");
199
+ const proc = gatewayProc;
143
200
  gatewayProc = null;
201
+ proc.kill("SIGKILL");
202
+ // Wait for the process to actually exit so ports and file handles are
203
+ // fully released before the next test starts.
204
+ await waitForExit(proc);
144
205
  }
145
206
 
146
207
  rmSync(testDir, { recursive: true, force: true });
@@ -151,6 +212,11 @@ describe("gateway managed credential bootstrap retry", () => {
151
212
  mkdirSync(testDir, { recursive: true });
152
213
  writeCredentialMetadata();
153
214
 
215
+ // Reserve the CES port before starting the gateway so the gateway
216
+ // knows where CES will eventually appear. CES isn't running yet —
217
+ // the gateway's managed bootstrap will get ECONNREFUSED until we
218
+ // start the fake CES below.
219
+ cesPort = await getFreePort();
154
220
  await startGateway();
155
221
 
156
222
  const base = `http://localhost:${gatewayPort}`;
@@ -86,7 +86,7 @@ function encrypt(
86
86
 
87
87
  /**
88
88
  * Write Telegram bot_token and webhook_secret into the encrypted store
89
- * at $BASE_DATA_DIR/.vellum/protected/keys.enc, using the same key
89
+ * at $GATEWAY_SECURITY_DIR/keys.enc, using the same key
90
90
  * derivation the gateway's credential-reader will use to decrypt.
91
91
  */
92
92
  function writeEncryptedStore(botToken: string, webhookSecret: string): void {
@@ -195,7 +195,8 @@ async function startGateway(): Promise<void> {
195
195
  gatewayProc = spawn("bun", ["run", gatewayEntry], {
196
196
  env: {
197
197
  ...process.env,
198
- BASE_DATA_DIR: testDir,
198
+ GATEWAY_SECURITY_DIR: join(testDir, ".vellum", "protected"),
199
+ VELLUM_WORKSPACE_DIR: join(testDir, ".vellum", "workspace"),
199
200
  GATEWAY_PORT: String(port),
200
201
  // Ensure Telegram is NOT configured via env vars
201
202
  TELEGRAM_BOT_TOKEN: "",
@@ -57,10 +57,10 @@ const TEST_REGISTRY = {
57
57
  ],
58
58
  };
59
59
 
60
- const savedBaseDataDir = process.env.BASE_DATA_DIR;
60
+ const savedGatewaySecurityDir = process.env.GATEWAY_SECURITY_DIR;
61
61
 
62
62
  beforeEach(() => {
63
- process.env.BASE_DATA_DIR = testDir;
63
+ process.env.GATEWAY_SECURITY_DIR = protectedDir;
64
64
  mkdirSync(protectedDir, { recursive: true });
65
65
  writeFileSync(defaultsPath, JSON.stringify(TEST_REGISTRY, null, 2));
66
66
  // Point registry resolution at the isolated test file first
@@ -71,10 +71,10 @@ beforeEach(() => {
71
71
  });
72
72
 
73
73
  afterEach(() => {
74
- if (savedBaseDataDir === undefined) {
75
- delete process.env.BASE_DATA_DIR;
74
+ if (savedGatewaySecurityDir === undefined) {
75
+ delete process.env.GATEWAY_SECURITY_DIR;
76
76
  } else {
77
- process.env.BASE_DATA_DIR = savedBaseDataDir;
77
+ process.env.GATEWAY_SECURITY_DIR = savedGatewaySecurityDir;
78
78
  }
79
79
  try {
80
80
  rmSync(testDir, { recursive: true, force: true });