npm - @vellumai/vellum-gateway - Versions diffs - 0.6.3 → 0.6.4 - Mend

@vellumai/vellum-gateway 0.6.3 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)

package/ARCHITECTURE.md +111 -28
package/Dockerfile +2 -2
package/bun.lock +1 -0
package/bunfig.toml +6 -0
package/package.json +3 -2
package/src/__tests__/config-file-cache.test.ts +6 -6
package/src/__tests__/config.test.ts +1 -1
package/src/__tests__/credential-reader.test.ts +12 -12
package/src/__tests__/credential-watcher-managed-bootstrap.test.ts +77 -11
package/src/__tests__/credential-watcher.test.ts +3 -2
package/src/__tests__/feature-flags-route.test.ts +5 -5
package/src/__tests__/ipc-contact-routes.test.ts +302 -0
package/src/__tests__/ipc-feature-flag-routes.test.ts +284 -0
package/src/__tests__/privacy-config-route.test.ts +911 -0
package/src/__tests__/remote-feature-flag-sync.test.ts +5 -5
package/src/__tests__/runtime-proxy.test.ts +114 -0
package/src/__tests__/schema.test.ts +2 -0
package/src/__tests__/slack-deliver.test.ts +287 -0
package/src/__tests__/slack-errors.test.ts +14 -0
package/src/__tests__/stt-stream-websocket.test.ts +392 -0
package/src/__tests__/test-preload.ts +28 -0
package/src/__tests__/twilio-media-websocket.test.ts +618 -0
package/src/auth/token-service.ts +4 -9
package/src/avatar-sync/avatar-channel-syncer.ts +78 -0
package/src/avatar-sync/avatar-sync-watcher.ts +80 -0
package/src/avatar-sync/slack-avatar-syncer.ts +70 -0
package/src/avatar-sync/types.ts +16 -0
package/src/cli/enable-proxy.ts +3 -6
package/src/config.ts +3 -18
package/src/credential-reader.ts +11 -23
package/src/credential-watcher.ts +3 -3
package/src/db/connection.ts +97 -6
package/src/db/contact-store.ts +156 -0
package/src/db/data-migrations/index.ts +73 -0
package/src/db/data-migrations/m0001-guardian-init-lock.ts +62 -0
package/src/email/register-callback.ts +7 -0
package/src/feature-flag-registry.json +46 -14
package/src/feature-flag-remote-store.ts +4 -9
package/src/feature-flag-store.ts +4 -9
package/src/http/routes/channel-verification-session-proxy.ts +2 -2
package/src/http/routes/email-webhook.ts +6 -2
package/src/http/routes/privacy-config.ts +217 -8
package/src/http/routes/slack-deliver.ts +147 -24
package/src/http/routes/stt-stream-websocket.ts +277 -0
package/src/http/routes/twilio-media-websocket.ts +271 -0
package/src/index.ts +185 -1
package/src/ipc/contact-handlers.ts +65 -0
package/src/ipc/feature-flag-handlers.ts +61 -0
package/src/ipc/server.ts +272 -0
package/src/logger.ts +1 -1
package/src/paths.ts +83 -0
package/src/platform-url.ts +24 -0
package/src/schema.ts +259 -1
package/src/slack/errors.ts +10 -0
package/src/telegram/webhook-manager.ts +18 -0
package/src/trust-store.ts +2 -6

package/ARCHITECTURE.md CHANGED Viewed

@@ -32,6 +32,56 @@ Internet
        +-- /webhooks/* --> BLOCKED (404, never forwarded to runtime)
 ```
+### STT Route Proxying (Assistant-Scoped Rewrite)
+Native clients (macOS, iOS) send speech-to-text transcription requests through the gateway to the daemon's STT service. Clients POST to the assistant-scoped path `/v1/assistants/:assistantId/stt/transcribe`, which the gateway's runtime proxy rewrites to the flat daemon path `/v1/stt/transcribe`. This follows the same assistant-scoped rewrite pattern used by other client-facing endpoints (feature flags, privacy config, etc.).
+The request carries base64-encoded WAV audio and a MIME type. The daemon resolves the configured STT provider via `resolveBatchTranscriber()` and returns the transcribed text. Clients use the response to implement a service-first strategy: the service transcription takes precedence when available, with Apple-native `SFSpeechRecognizer` as fallback when the service returns 503 (not configured) or fails.
+| Client path (gateway)               | Daemon path (after rewrite) | Method |
+| ----------------------------------- | --------------------------- | ------ |
+| `/v1/assistants/:id/stt/transcribe` | `/v1/stt/transcribe`        | POST   |
+**Key source files:**
+| File                                             | Purpose                                                                   |
+| ------------------------------------------------ | ------------------------------------------------------------------------- |
+| `gateway/src/http/routes/runtime-proxy.ts`       | Assistant-scoped path rewriting (`/v1/assistants/:id/...` → `/v1/...`)    |
+| `assistant/src/runtime/routes/stt-routes.ts`     | Daemon HTTP endpoint: validates audio, resolves transcriber, returns text |
+| `clients/shared/Network/STTClient.swift`         | Shared client: POSTs audio to the gateway, returns typed `STTResult`      |
+| `clients/shared/Utilities/AudioWavEncoder.swift` | WAV encoding utility for PCM audio buffers                                |
+### STT Streaming WebSocket Proxy
+Native clients (macOS, iOS) open WebSocket connections through the gateway to the daemon's real-time STT streaming endpoint to capture conversation chat messages. The gateway authenticates the downstream client using an edge JWT (actor principal required), then opens an upstream WebSocket connection to the daemon's `/v1/stt/stream` endpoint with a short-lived gateway service token. This keeps the daemon's WebSocket endpoint unreachable from the public internet while allowing authenticated clients to stream audio for real-time transcription.
+**Config-authoritative model:** The runtime always resolves the streaming transcriber from `services.stt.provider` in the assistant config, regardless of any `provider` query parameter. The `provider` parameter is optional compatibility metadata — when supplied and it disagrees with the configured provider, the runtime logs a mismatch warning for operator visibility.
+**Client path:** `wss://<gateway>/v1/stt/stream?mimeType=<mime>[&provider=<id>][&sampleRate=<hz>]`
+**Query parameters:**
+| Parameter    | Required | Description                                                                                                                                                                      |
+| ------------ | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `mimeType`   | Yes      | MIME type of the audio being streamed (e.g. `audio/webm;codecs=opus`)                                                                                                            |
+| `provider`   | No       | Optional STT provider identifier (`deepgram`, `google-gemini`). Forwarded as compatibility metadata — the runtime resolves the transcriber from config, not from this parameter. |
+| `sampleRate` | No       | Sample rate in Hz (e.g. `16000`). Passed through to the daemon.                                                                                                                  |
+| `token`      | No       | Edge JWT (alternative to `Authorization: Bearer` header for WS upgrades)                                                                                                         |
+**Auth model:** STT streaming is an authenticated, assistant-scoped path. The client must present a valid edge JWT with an actor principal. Service tokens are rejected. When `runtimeProxyRequireAuth` is globally disabled (dev bypass), the upgrade proceeds without token validation.
+**Proxy behavior:** The gateway buffers up to 100 downstream messages while the upstream connection to the daemon is being established. If the buffer overflows, the downstream connection is closed with code 1008 (policy violation). Once the upstream connection opens, buffered messages are flushed in order. All subsequent messages are forwarded bidirectionally: client audio frames flow upstream, daemon transcript events (JSON text frames: `ready`, `partial`, `final`, `error`, `closed`) flow downstream. When either side closes, the other side is closed with the same code/reason.
+**Key source files:**
+| File                                              | Purpose                                                                                                            |
+| ------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
+| `gateway/src/http/routes/stt-stream-websocket.ts` | WebSocket upgrade handler (`createSttStreamWebsocketHandler`) and proxy handlers (`getSttStreamWebsocketHandlers`) |
+| `gateway/src/index.ts`                            | Route registration: wires upgrade handler to the gateway's Bun HTTP server                                         |
+| `assistant/src/runtime/http-server.ts`            | Daemon-side WebSocket upgrade at `/v1/stt/stream`, session creation and registry                                   |
+| `assistant/src/stt/stt-stream-session.ts`         | Runtime session orchestrator: drives the `StreamingTranscriber` from the WebSocket                                 |
+| `clients/shared/Network/STTStreamingClient.swift` | Swift client: builds the gateway WS URL via `GatewayHTTPClient.buildWebSocketRequest`                              |
 ### Assistant Feature Flags API
 The gateway exposes a REST API for reading and mutating assistant feature flags. Assistant feature flags are assistant-scoped, declaration-driven booleans that can gate any assistant behavior. Skill availability is one consumer, but not a required coupling (see [`assistant/ARCHITECTURE.md`](../assistant/ARCHITECTURE.md) for resolver and skill enforcement details).
@@ -551,12 +601,23 @@ If no guardian binding exists for the channel, escalation fails closed -- the me
 #### SQLite Tables
+**Assistant DB** (`assistant.db` — current owner, migrating to gateway):
 | Table                       | Purpose                                                               |
 | --------------------------- | --------------------------------------------------------------------- |
 | `assistant_ingress_invites` | Invite tokens with SHA-256 hashes, expiry, use counts                 |
 | `contacts`                  | Contact records with role, relationship, and per-contact metadata     |
 | `contact_channels`          | Channel bindings per contact with access policy (allow/deny/escalate) |
+**Gateway DB** (`gateway.sqlite` — future owner of auth/authz):
+| Table              | Purpose                                                                |
+| ------------------ | ---------------------------------------------------------------------- |
+| `contacts`         | Contact auth/authz: id, display_name, role, principal_id               |
+| `contact_channels` | Channel bindings with policy, status, external IDs, verification state |
+The gateway declares `contacts` and `contact_channels` tables and exposes them via IPC (`list_contacts`, `get_contact`, `get_contact_by_channel`, `get_channels_for_contact`). Endpoint cutover and data migration are in progress — the gateway will become the canonical owner once dual-writing is enabled.
 #### Key Modules
 | Module                                           | Purpose                                                                   |
@@ -566,6 +627,8 @@ If no guardian binding exists for the channel, escalation fails closed -- the me
 | `assistant/src/contacts/contacts-write.ts`       | Contact and channel writes (upsert, policy changes, invite redemption)    |
 | `assistant/src/daemon/handlers/config-inbox.ts`  | Handlers for invite and member contracts                                  |
 | `assistant/src/runtime/routes/channel-routes.ts` | ACL enforcement point -- member lookup, policy check, escalation creation |
+| `gateway/src/db/contact-store.ts`                | Gateway-side read-only ContactStore (prepared-statement queries)          |
+| `gateway/src/ipc/contact-handlers.ts`            | IPC route handlers for contact reads                                      |
 ### Telegram Credential Flow
@@ -678,9 +741,9 @@ The Socket Mode client auto-reconnects on any WebSocket close or error. The back
 ---
-## AI Phone Calls — Twilio ConversationRelay
+## AI Phone Calls — Twilio Voice
-The Calls subsystem supports both **outbound** and **inbound** voice calls via Twilio's ConversationRelay protocol. The assistant uses an LLM-driven conversation loop to speak in real time. Voice is a first-class channel with its own per-call conversation (outbound key: `asst:${assistantId}:voice:call:${callSessionId}`, inbound key: `asst:${assistantId}:voice:inbound:${callSid}`). When the AI needs guardian input during a call, it dispatches ASK_GUARDIAN requests cross-channel to mac/telegram via the guardian dispatch engine. Answer resolution uses first-writer-wins semantics -- the first channel to respond provides the answer, and remaining channels receive a "already answered" notice.
+The Calls subsystem supports both **outbound** and **inbound** voice calls via Twilio. The Twilio integration path is provider-conditional: `services.stt.provider` determines whether calls use ConversationRelay (Twilio-native STT for Deepgram/Google) or Media Streams (daemon-side STT for OpenAI Whisper). The assistant uses an LLM-driven conversation loop to speak in real time. Voice is a first-class channel with its own per-call conversation (outbound key: `asst:${assistantId}:voice:call:${callSessionId}`, inbound key: `asst:${assistantId}:voice:inbound:${callSid}`). When the AI needs guardian input during a call, it dispatches ASK_GUARDIAN requests cross-channel to mac/telegram via the guardian dispatch engine. Answer resolution uses first-writer-wins semantics -- the first channel to respond provides the answer, and remaining channels receive an "already answered" notice.
 ### Outbound Call Flow
@@ -891,7 +954,8 @@ sequenceDiagram
 | `gateway/src/http/routes/twilio-voice-webhook.ts`                | Gateway route: validates Twilio signature, forwards voice webhook to runtime                                                                                                                                           |
 | `gateway/src/http/routes/twilio-status-webhook.ts`               | Gateway route: validates Twilio signature, forwards status callback to runtime                                                                                                                                         |
 | `gateway/src/http/routes/twilio-connect-action-webhook.ts`       | Gateway route: validates Twilio signature, forwards connect-action to runtime                                                                                                                                          |
-| `gateway/src/http/routes/twilio-relay-websocket.ts`              | Gateway route: WebSocket proxy for ConversationRelay frames between Twilio and runtime                                                                                                                                 |
+| `gateway/src/http/routes/twilio-relay-websocket.ts`              | Gateway route: WebSocket proxy for ConversationRelay frames between Twilio and runtime (used for Deepgram/Google native STT)                                                                                           |
+| `gateway/src/http/routes/twilio-media-websocket.ts`              | Gateway route: WebSocket proxy for Media Streams frames between Twilio and runtime (used for OpenAI Whisper media-stream STT)                                                                                          |
 | `gateway/src/twilio/validate-webhook.ts`                         | Twilio webhook validation: HMAC-SHA1 signature verification, payload size limits, fail-closed when auth token missing                                                                                                  |
 ### Call State Machine
@@ -960,14 +1024,17 @@ All five tables live in `~/.vellum/workspace/data/db/assistant.db` alongside exi
 Internet-facing Twilio callbacks terminate at the gateway, which validates signatures before forwarding to the runtime. This keeps the runtime behind the gateway's bearer-auth boundary.
-| Gateway Route                          | Validates                         | Forwards To (Runtime)                                                                    |
-| -------------------------------------- | --------------------------------- | ---------------------------------------------------------------------------------------- |
-| `POST /webhooks/twilio/voice`          | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/voice-webhook` (JSON: `{ params, originalUrl, assistantId? }`) |
-| `POST /webhooks/twilio/status`         | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/status` (JSON: `{ params }`)                                   |
-| `POST /webhooks/twilio/connect-action` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/connect-action` (JSON: `{ params }`)                           |
-| `WS /webhooks/twilio/relay`            | WebSocket upgrade                 | `WS /v1/calls/relay` (bidirectional proxy)                                               |
+| Gateway Route                                              | Validates                         | Forwards To (Runtime)                                                                    |
+| ---------------------------------------------------------- | --------------------------------- | ---------------------------------------------------------------------------------------- |
+| `POST /webhooks/twilio/voice`                              | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/voice-webhook` (JSON: `{ params, originalUrl, assistantId? }`) |
+| `POST /webhooks/twilio/status`                             | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/status` (JSON: `{ params }`)                                   |
+| `POST /webhooks/twilio/connect-action`                     | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/connect-action` (JSON: `{ params }`)                           |
+| `WS /webhooks/twilio/relay`                                | WebSocket upgrade                 | `WS /v1/calls/relay` (bidirectional proxy) — ConversationRelay path                      |
+| `WS /webhooks/twilio/media-stream/<callSessionId>/<token>` | WebSocket upgrade                 | `WS /v1/calls/media-stream` (bidirectional proxy) — Media Streams path                   |
+In gateway-fronted deployments, the TwiML WebSocket URL (returned by the voice webhook) should point to the gateway's `/webhooks/twilio/relay` (ConversationRelay) or `/webhooks/twilio/media-stream/<callSessionId>/<token>` (Media Streams) endpoint rather than directly to the runtime. The gateway proxies frames bidirectionally between Twilio and the runtime, preserving close and error semantics for proper cleanup.
-In gateway-fronted deployments, the TwiML WebSocket URL (returned by the voice webhook) should point to the gateway's `/webhooks/twilio/relay` endpoint rather than directly to the runtime. The gateway proxies ConversationRelay frames bidirectionally between Twilio and the runtime, preserving close and error semantics for proper cleanup.
+**Media Streams handshake metadata:** Twilio Media Streams does not reliably preserve URL query parameters across the WebSocket upgrade, so handshake metadata (`callSessionId` and auth `token`) is encoded as **URL path segments** (primary transport). The gateway also supports legacy query-parameter-based handshake as a fallback for backward compatibility. The metadata extractor in `twilio-media-websocket.ts` resolves values from path segments first, falling back to query parameters.
 Signature validation is **fail-closed**: if the Twilio auth token is not configured, all webhook requests are rejected with `403`. Missing or invalid `X-Twilio-Signature` headers are also rejected with `403`. Payload size is capped by `maxWebhookPayloadBytes` (checked via both `Content-Length` header and actual body size).
@@ -999,7 +1066,8 @@ This makes ingress URL updates smoother in local tunnel workflows because Twilio
 | POST   | `/v1/calls/:callSessionId/instruction` | Relay a steering instruction to an active call's controller (alternative to in-conversation bridge)                                    |
 | POST   | `/v1/internal/twilio/status`           | Internal status callback used by gateway; accepts JSON `{ params }`                                                                    |
 | POST   | `/v1/internal/twilio/connect-action`   | Internal connect action callback used by gateway; accepts JSON `{ params }`                                                            |
-| WS     | `/v1/calls/relay`                      | ConversationRelay WebSocket (bidirectional: prompt/interrupt/dtmf from Twilio, text tokens/end to Twilio)                              |
+| WS     | `/v1/calls/relay`                      | ConversationRelay WebSocket (bidirectional: prompt/interrupt/dtmf from Twilio, text tokens/end to Twilio) — Deepgram/Google path       |
+| WS     | `/v1/calls/media-stream`               | Media Streams WebSocket (raw audio from Twilio, daemon-side STT) — OpenAI Whisper path                                                 |
 ### Tools
@@ -1016,7 +1084,7 @@ Both tools and HTTP routes delegate to the same domain functions in `call-domain
 The CallController detects two special markers in the LLM's response text:
 - **`[ASK_GUARDIAN: question]`** — The AI needs to consult the guardian. The controller creates a pending question, notifies the session via `fireCallQuestionNotifier`, puts the caller on hold, and waits for a guardian answer (timeout configured via `calls.userConsultTimeoutSeconds`).
-- **`[END_CALL]`** — The AI has determined the call's purpose is fulfilled. The controller sends a goodbye, closes the ConversationRelay session, and marks the call as completed.
+- **`[END_CALL]`** — The AI has determined the call's purpose is fulfilled. The controller sends a goodbye, closes the call session, and marks the call as completed.
 Both markers are stripped from the TTS output so the callee never hears the raw control text.
@@ -1035,19 +1103,20 @@ Malformed or unprocessable provider callback payloads are logged as dead-letter
 Call behavior is controlled via the `calls` config block in the assistant configuration (`config/schema.ts`). All values have sensible defaults and are validated via Zod:
-| Field                               | Type     | Default                        | Description                                                                                         |
-| ----------------------------------- | -------- | ------------------------------ | --------------------------------------------------------------------------------------------------- |
-| `calls.enabled`                     | boolean  | `true`                         | Master toggle for the calls feature. When `false`, call routes return 403 and tools return errors.  |
-| `calls.provider`                    | enum     | `'twilio'`                     | Voice provider to use (currently only Twilio is supported).                                         |
-| `calls.maxDurationSeconds`          | int      | `3600`                         | Maximum allowed duration per call.                                                                  |
-| `calls.userConsultTimeoutSeconds`   | int      | `120`                          | How long to wait for a user answer before timing out a pending question.                            |
-| `calls.disclosure.enabled`          | boolean  | `true`                         | Whether the AI should disclose it is an AI at the start of the call.                                |
-| `calls.disclosure.text`             | string   | _(default disclosure prompt)_  | The disclosure instruction included in the system prompt.                                           |
-| `calls.safety.denyCategories`       | string[] | `[]`                           | Categories of calls to deny (e.g., emergency numbers are always denied regardless of this setting). |
-| `calls.model`                       | string   | _(unset — uses default model)_ | Optional override for the LLM model used in voice call conversations.                               |
-| `calls.voice.language`              | string   | `'en-US'`                      | Language code for TTS and transcription.                                                            |
-| `calls.voice.transcriptionProvider` | enum     | `'Deepgram'`                   | Speech-to-text provider (`Deepgram` or `Google`).                                                   |
-| `elevenlabs.voiceId`                | string   | `'ZF6FPAbjXT4488VcRRnw'`       | ElevenLabs voice ID used by both in-app TTS and phone calls. Defaults to Amelia.                    |
+| Field                             | Type     | Default                        | Description                                                                                                                                                                               |
+| --------------------------------- | -------- | ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `calls.enabled`                   | boolean  | `true`                         | Master toggle for the calls feature. When `false`, call routes return 403 and tools return errors.                                                                                        |
+| `calls.provider`                  | enum     | `'twilio'`                     | Voice provider to use (currently only Twilio is supported).                                                                                                                               |
+| `calls.maxDurationSeconds`        | int      | `3600`                         | Maximum allowed duration per call.                                                                                                                                                        |
+| `calls.userConsultTimeoutSeconds` | int      | `120`                          | How long to wait for a user answer before timing out a pending question.                                                                                                                  |
+| `calls.disclosure.enabled`        | boolean  | `true`                         | Whether the AI should disclose it is an AI at the start of the call.                                                                                                                      |
+| `calls.disclosure.text`           | string   | _(default disclosure prompt)_  | The disclosure instruction included in the system prompt.                                                                                                                                 |
+| `calls.safety.denyCategories`     | string[] | `[]`                           | Categories of calls to deny (e.g., emergency numbers are always denied regardless of this setting).                                                                                       |
+| `calls.model`                     | string   | _(unset — uses default model)_ | Optional override for the LLM model used in voice call conversations.                                                                                                                     |
+| `calls.voice.language`            | string   | `'en-US'`                      | Language code for TTS and transcription.                                                                                                                                                  |
+| `services.stt.provider`           | enum     | `'deepgram'`                   | STT provider for all boundaries including telephony. Determines the Twilio integration path (ConversationRelay-native for `deepgram`/`google-gemini`, media-stream for `openai-whisper`). |
+| `services.tts.provider`           | enum     | `'elevenlabs'`                 | Active TTS provider for speech synthesis (catalog-driven; see [TTS Provider Abstraction](../assistant/ARCHITECTURE.md#tts-provider-abstraction-servicestts)).                             |
+| `services.tts.providers.<id>.*`   | object   | _(per-provider defaults)_      | Provider-specific settings block. One block per catalog entry (e.g. `elevenlabs`, `fish-audio`).                                                                                          |
 ### Caller Identity Resolution
@@ -1064,10 +1133,24 @@ Both the resolved mode and source are logged at info level on success, and rejec
 ### Voice Quality Profile Resolution
-Voice and TTS settings are configurable via the `calls.voice` config block — they are not hardcoded. The function `resolveVoiceQualityProfile()` in `voice-quality.ts` reads the current config and resolves it into a `VoiceQualityProfile` containing the TTS provider, voice spec string, language, and transcription provider.
+Voice and TTS settings are configurable via the `calls.voice` and `services.tts` config blocks — they are not hardcoded. The function `resolveVoiceQualityProfile()` in `voice-quality.ts` uses the catalog-driven call strategy abstraction to determine how the active TTS provider integrates with the Twilio telephony path, then resolves the result into a `VoiceQualityProfile` containing the TTS provider, voice spec string, and language.
+The active TTS provider is determined by `services.tts.provider` (default: `"elevenlabs"`). Provider-specific settings (voice ID, model, tuning parameters) are read from `services.tts.providers.<id>`. The call mode (`native-twilio` or `synthesized-play`) is resolved from the canonical provider catalog via `resolveCallStrategy()` in `tts-call-strategy.ts` — it reads the provider's declared `callMode` rather than inferring behavior from runtime capabilities.
+For `native-twilio` providers (e.g. ElevenLabs), the voice quality profile looks up a registered `NativeTwilioVoiceSpecBuilder` to construct the provider-specific voice spec string for the ConversationRelay `voice` attribute. New native providers plug in by registering their own voice spec builder — no edits to core call routing logic required. For `synthesized-play` providers (e.g. Fish Audio), `ttsProvider` is set to `"Google"` as a placeholder in TwiML and actual audio is delivered via `play` messages — the assistant synthesises audio via the provider's HTTP API.
+The voice webhook in `twilio-routes.ts` calls `resolveVoiceQualityProfile()` for TTS settings and separately resolves the telephony STT strategy via `resolveTelephonySttRouting()`. The routing result determines which TwiML generator to use: `generateTwiML()` for Twilio-native ConversationRelay, or `generateStreamTwiML()` for the media-stream path. This separation keeps TTS and STT resolution independent — the voice quality profile controls the TTS provider, voice, and language, while the routing strategy controls the STT integration path.
+For full details on the catalog-driven TTS architecture, provider catalog, call strategy abstraction, and the provider-add checklist, see the [TTS Provider Abstraction](../assistant/ARCHITECTURE.md#tts-provider-abstraction-servicestts) section in the assistant architecture docs.
+### Telephony STT: Provider-Conditional Hybrid Routing
+Telephony STT is unified under `services.stt.provider`. The voice webhook in `twilio-routes.ts` calls `resolveTelephonySttRouting()` to determine the Twilio integration path based on the active provider:
+- **Deepgram / Google** (`conversation-relay-native` strategy) — TwiML emits `<Connect><ConversationRelay>` with Twilio-native `transcriptionProvider` and `speechModel` attributes. The gateway proxies ConversationRelay frames via `/webhooks/twilio/relay`. The daemon receives transcribed text, not raw audio.
-All calls use **ElevenLabs** as the TTS provider via Twilio ConversationRelay. The voice ID is read from the shared `elevenlabs.voiceId` config key (defaulting to Amelia — `ZF6FPAbjXT4488VcRRnw`). Optional tuning parameters (`voiceModelId`, `speed`, `stability`, `similarityBoost`) are also read from the top-level `elevenlabs` config. When `voiceModelId` is set, the emitted voice spec uses the Twilio ConversationRelay extended format: `voiceId-model-speed_stability_similarity`. When `voiceModelId` is empty (the default), only the bare `voiceId` is sent.
+- **OpenAI Whisper** (`media-stream-custom` strategy) — TwiML emits `<Connect><Stream>` pointing to the gateway's media-stream proxy (`/webhooks/twilio/media-stream`). The gateway forwards raw audio frames to the daemon's media-stream server, which transcribes server-side.
-The voice webhook in `twilio-routes.ts` calls `resolveVoiceQualityProfile()` and passes the result directly to `generateTwiML()` to produce ConversationRelay TwiML.
+Both paths are active in production. The strategy selection happens at call setup time based on the current `services.stt.provider` value. See `docs/internal-reference.md` for a provider-specific troubleshooting matrix.
 ---

package/Dockerfile CHANGED Viewed

@@ -2,7 +2,7 @@
 FROM oven/bun:1.3.11@sha256:0733e50325078969732ebe3b15ce4c4be5082f18c4ac1a0f0ca4839c2e4e42a7 AS bun
 # Build stage
-FROM debian:trixie@sha256:3615a749858a1cba49b408fb49c37093db813321355a9ab7c1f9f4836341e9db AS builder
+FROM debian:trixie@sha256:3352c2e13876c8a5c5873ef20870e1939e73cb9a3c1aeba5e3e72172a85ce9ed AS builder
 WORKDIR /app
@@ -14,7 +14,7 @@ RUN bun install --frozen-lockfile --production
 COPY . .
 # Runtime stage
-FROM debian:trixie-slim@sha256:1d3c811171a08a5adaa4a163fbafd96b61b87aa871bbc7aa15431ac275d3d430 AS runner
+FROM debian:trixie-slim@sha256:4ffb3a1511099754cddc70eb1b12e50ffdb67619aa0ab6c13fcd800a78ef7c7a AS runner
 WORKDIR /app

package/bun.lock CHANGED Viewed

@@ -10,6 +10,7 @@
         "pino": "9.14.0",
         "pino-pretty": "13.1.3",
         "uuid": "13.0.0",
+        "zod": "4.3.6",
       },
       "devDependencies": {
         "@types/bun": "1.3.9",

package/bunfig.toml ADDED Viewed

@@ -0,0 +1,6 @@
+[install]
+exact = true
+[test]
+root = "./src"
+preload = ["./src/__tests__/test-preload.ts"]

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vellumai/vellum-gateway",
-  "version": "0.6.3",
+  "version": "0.6.4",
   "license": "MIT",
   "type": "module",
   "exports": {
@@ -27,7 +27,8 @@
     "minimatch": "10.2.4",
     "pino": "9.14.0",
     "pino-pretty": "13.1.3",
-    "uuid": "13.0.0"
+    "uuid": "13.0.0",
+    "zod": "4.3.6"
   },
   "devDependencies": {
     "@types/bun": "1.3.9",

package/src/__tests__/config-file-cache.test.ts CHANGED Viewed

@@ -13,26 +13,26 @@ import { ConfigFileCache } from "../config-file-cache.js";
 let testBaseDir: string;
 let workspaceDir: string;
 let configPath: string;
-let savedBaseDataDir: string | undefined;
+let savedWorkspaceDir: string | undefined;
 function writeConfig(data: Record<string, unknown>): void {
   writeFileSync(configPath, JSON.stringify(data));
 }
 beforeEach(() => {
-  savedBaseDataDir = process.env.BASE_DATA_DIR;
+  savedWorkspaceDir = process.env.VELLUM_WORKSPACE_DIR;
   testBaseDir = mkdtempSync(join(tmpdir(), "config-file-cache-test-"));
   workspaceDir = join(testBaseDir, ".vellum", "workspace");
   mkdirSync(workspaceDir, { recursive: true });
   configPath = join(workspaceDir, "config.json");
-  process.env.BASE_DATA_DIR = testBaseDir;
+  process.env.VELLUM_WORKSPACE_DIR = workspaceDir;
 });
 afterEach(() => {
-  if (savedBaseDataDir === undefined) {
-    delete process.env.BASE_DATA_DIR;
+  if (savedWorkspaceDir === undefined) {
+    delete process.env.VELLUM_WORKSPACE_DIR;
   } else {
-    process.env.BASE_DATA_DIR = savedBaseDataDir;
+    process.env.VELLUM_WORKSPACE_DIR = savedWorkspaceDir;
   }
   rmSync(testBaseDir, { recursive: true, force: true });
 });

package/src/__tests__/config.test.ts CHANGED Viewed

@@ -23,7 +23,7 @@ describe("config: hardcoded defaults", () => {
     expect(config.unmappedPolicy).toBe("reject");
     expect(config.routingEntries).toEqual([]);
     expect(config.defaultAssistantId).toBeUndefined();
-    expect(config.logFile.dir).toMatch(/\.vellum\/logs$/);
+    expect(config.logFile.dir).toMatch(/logs$/);
     expect(config.logFile.retentionDays).toBe(30);
   });

package/src/__tests__/credential-reader.test.ts CHANGED Viewed

@@ -39,7 +39,7 @@ const testDir = join(
 );
 function metadataDir(): string {
-  return join(testDir, ".vellum", "workspace", "data", "credentials");
+  return join(testDir, "data", "credentials");
 }
 function writeMetadata(
@@ -106,8 +106,8 @@ function encryptEntries(
 }
 function writeEncryptedStore(entries: Record<string, string>): void {
-  const storePath = join(testDir, ".vellum", "protected", "keys.enc");
-  mkdirSync(join(testDir, ".vellum", "protected"), { recursive: true });
+  mkdirSync(testDir, { recursive: true });
+  const storePath = join(testDir, "keys.enc");
   const salt = randomBytes(16);
   const key = pbkdf2Sync(
@@ -131,17 +131,16 @@ function writeEncryptedStore(entries: Record<string, string>): void {
  * The store.key is used directly as the AES-256-GCM key (no PBKDF2).
  */
 function writeEncryptedStoreV2(entries: Record<string, string>): void {
-  const protectedDir = join(testDir, ".vellum", "protected");
-  mkdirSync(protectedDir, { recursive: true });
+  mkdirSync(testDir, { recursive: true });
   const storeKey = randomBytes(KEY_LENGTH);
-  writeFileSync(join(protectedDir, "store.key"), storeKey);
+  writeFileSync(join(testDir, "store.key"), storeKey);
   const store = {
     version: 2,
     entries: encryptEntries(entries, storeKey),
   };
-  writeFileSync(join(protectedDir, "keys.enc"), JSON.stringify(store));
+  writeFileSync(join(testDir, "keys.enc"), JSON.stringify(store));
 }
 // ---------------------------------------------------------------------------
@@ -149,12 +148,14 @@ function writeEncryptedStoreV2(entries: Record<string, string>): void {
 // ---------------------------------------------------------------------------
 beforeEach(() => {
-  process.env.BASE_DATA_DIR = testDir;
+  process.env.GATEWAY_SECURITY_DIR = testDir;
+  process.env.VELLUM_WORKSPACE_DIR = testDir;
   logCalls.length = 0;
 });
 afterEach(() => {
-  delete process.env.BASE_DATA_DIR;
+  delete process.env.GATEWAY_SECURITY_DIR;
+  delete process.env.VELLUM_WORKSPACE_DIR;
   try {
     rmSync(testDir, { recursive: true, force: true });
   } catch {
@@ -178,8 +179,7 @@ describe("v2 encrypted store with store.key", () => {
   test("returns undefined for v2 store when store.key is missing", async () => {
     // Write a v2 store but without the store.key file
-    const protectedDir = join(testDir, ".vellum", "protected");
-    mkdirSync(protectedDir, { recursive: true });
+    mkdirSync(testDir, { recursive: true });
     const storeKey = randomBytes(KEY_LENGTH);
     const store = {
@@ -189,7 +189,7 @@ describe("v2 encrypted store with store.key", () => {
         storeKey,
       ),
     };
-    writeFileSync(join(protectedDir, "keys.enc"), JSON.stringify(store));
+    writeFileSync(join(testDir, "keys.enc"), JSON.stringify(store));
     // Deliberately do NOT write store.key
     const result = await readCredential(credentialKey("test", "key"));

package/src/__tests__/credential-watcher-managed-bootstrap.test.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import { afterEach, describe, expect, test } from "bun:test";
+import { createServer } from "node:net";
 import { spawn, type ChildProcess } from "node:child_process";
 import { mkdirSync, renameSync, rmSync, writeFileSync } from "node:fs";
 import { tmpdir } from "node:os";
@@ -54,19 +55,51 @@ let gatewayPort = 0;
 let cesPort = 0;
 let cesServer: ReturnType<typeof Bun.serve> | null = null;
-function assignPorts(): void {
-  if (gatewayPort !== 0 && cesPort !== 0) return;
-  gatewayPort = 49152 + Math.floor(Math.random() * 8_192);
-  cesPort = gatewayPort + 1;
+/** Ask the OS for a free port by briefly binding to port 0. */
+function getFreePort(): Promise<number> {
+  return new Promise((resolve, reject) => {
+    const srv = createServer();
+    srv.listen(0, "127.0.0.1", () => {
+      const addr = srv.address();
+      if (!addr || typeof addr === "string") {
+        srv.close();
+        reject(new Error("Failed to get free port"));
+        return;
+      }
+      const port = addr.port;
+      srv.close(() => resolve(port));
+    });
+    srv.on("error", reject);
+  });
+}
+/** Wait for a child process to exit, with a safety timeout. */
+function waitForExit(proc: ChildProcess, timeoutMs = 5_000): Promise<void> {
+  return new Promise<void>((resolve) => {
+    if (proc.exitCode !== null || proc.signalCode !== null) {
+      resolve();
+      return;
+    }
+    const timer = setTimeout(resolve, timeoutMs);
+    proc.on("exit", () => {
+      clearTimeout(timer);
+      resolve();
+    });
+  });
 }
 async function startGateway(): Promise<void> {
-  assignPorts();
+  if (cesPort === 0)
+    throw new Error(
+      "CES port not assigned — call startFakeCes or reserveCesPort first",
+    );
+  gatewayPort = await getFreePort();
   gatewayProc = spawn("bun", ["run", gatewayEntry], {
     env: {
       ...process.env,
-      BASE_DATA_DIR: testDir,
+      GATEWAY_SECURITY_DIR: join(testDir, ".vellum", "protected"),
+      VELLUM_WORKSPACE_DIR: join(testDir, ".vellum", "workspace"),
       GATEWAY_PORT: String(gatewayPort),
       CES_CREDENTIAL_URL: `http://127.0.0.1:${cesPort}`,
       CES_SERVICE_TOKEN: TEST_SERVICE_TOKEN,
@@ -76,8 +109,27 @@ async function startGateway(): Promise<void> {
     stdio: ["ignore", "pipe", "pipe"],
   });
+  // Collect stderr for diagnostics on failure.
+  const stderrChunks: Buffer[] = [];
+  gatewayProc.stderr?.on("data", (chunk: Buffer) => stderrChunks.push(chunk));
+  // Track early exit so we can fail fast instead of polling for 30s.
+  let earlyExitCode: number | null = null;
+  let earlyExitSignal: string | null = null;
+  gatewayProc.on("exit", (code, signal) => {
+    earlyExitCode = code;
+    earlyExitSignal = signal;
+  });
   const deadline = Date.now() + 30_000;
   while (Date.now() < deadline) {
+    // If the process already died, fail immediately with stderr.
+    if (earlyExitCode !== null || earlyExitSignal !== null) {
+      const stderr = Buffer.concat(stderrChunks).toString().slice(-2000);
+      throw new Error(
+        `Gateway exited early (code=${earlyExitCode}, signal=${earlyExitSignal})\n${stderr}`,
+      );
+    }
     try {
       const res = await fetch(`http://localhost:${gatewayPort}/healthz`);
       if (res.ok) return;
@@ -86,7 +138,10 @@ async function startGateway(): Promise<void> {
     }
     await new Promise((resolve) => setTimeout(resolve, 100));
   }
-  throw new Error("Gateway failed to start within 30 seconds");
+  const stderr = Buffer.concat(stderrChunks).toString().slice(-2000);
+  throw new Error(
+    `Gateway failed to start within 30 seconds\nstderr: ${stderr}`,
+  );
 }
 function startFakeCes(opts: {
@@ -94,11 +149,12 @@ function startFakeCes(opts: {
   credentials?: Record<string, string>;
   resolveValue?: (account: string) => string | undefined;
 }): void {
-  assignPorts();
   const accounts = opts.accounts ?? Object.keys(opts.credentials ?? {});
   const credentials = opts.credentials ?? {};
   cesServer = Bun.serve({
-    port: cesPort,
+    // If cesPort was pre-reserved (for tests that start the gateway before
+    // the CES), bind to that port. Otherwise let the OS pick a free one.
+    port: cesPort || 0,
     fetch(req) {
       const authHeader = req.headers.get("authorization");
       if (authHeader !== `Bearer ${TEST_SERVICE_TOKEN}`) {
@@ -130,17 +186,22 @@ function startFakeCes(opts: {
       return new Response("Not Found", { status: 404 });
     },
   });
+  cesPort = cesServer.port!;
 }
-afterEach(() => {
+afterEach(async () => {
   cesServer?.stop(true);
   cesServer = null;
   gatewayPort = 0;
   cesPort = 0;
   if (gatewayProc) {
-    gatewayProc.kill("SIGKILL");
+    const proc = gatewayProc;
     gatewayProc = null;
+    proc.kill("SIGKILL");
+    // Wait for the process to actually exit so ports and file handles are
+    // fully released before the next test starts.
+    await waitForExit(proc);
   }
   rmSync(testDir, { recursive: true, force: true });
@@ -151,6 +212,11 @@ describe("gateway managed credential bootstrap retry", () => {
     mkdirSync(testDir, { recursive: true });
     writeCredentialMetadata();
+    // Reserve the CES port before starting the gateway so the gateway
+    // knows where CES will eventually appear. CES isn't running yet —
+    // the gateway's managed bootstrap will get ECONNREFUSED until we
+    // start the fake CES below.
+    cesPort = await getFreePort();
     await startGateway();
     const base = `http://localhost:${gatewayPort}`;

package/src/__tests__/credential-watcher.test.ts CHANGED Viewed

@@ -86,7 +86,7 @@ function encrypt(
 /**
  * Write Telegram bot_token and webhook_secret into the encrypted store
- * at $BASE_DATA_DIR/.vellum/protected/keys.enc, using the same key
+ * at $GATEWAY_SECURITY_DIR/keys.enc, using the same key
  * derivation the gateway's credential-reader will use to decrypt.
  */
 function writeEncryptedStore(botToken: string, webhookSecret: string): void {
@@ -195,7 +195,8 @@ async function startGateway(): Promise<void> {
   gatewayProc = spawn("bun", ["run", gatewayEntry], {
     env: {
       ...process.env,
-      BASE_DATA_DIR: testDir,
+      GATEWAY_SECURITY_DIR: join(testDir, ".vellum", "protected"),
+      VELLUM_WORKSPACE_DIR: join(testDir, ".vellum", "workspace"),
       GATEWAY_PORT: String(port),
       // Ensure Telegram is NOT configured via env vars
       TELEGRAM_BOT_TOKEN: "",

package/src/__tests__/feature-flags-route.test.ts CHANGED Viewed

@@ -57,10 +57,10 @@ const TEST_REGISTRY = {
   ],
 };
-const savedBaseDataDir = process.env.BASE_DATA_DIR;
+const savedGatewaySecurityDir = process.env.GATEWAY_SECURITY_DIR;
 beforeEach(() => {
-  process.env.BASE_DATA_DIR = testDir;
+  process.env.GATEWAY_SECURITY_DIR = protectedDir;
   mkdirSync(protectedDir, { recursive: true });
   writeFileSync(defaultsPath, JSON.stringify(TEST_REGISTRY, null, 2));
   // Point registry resolution at the isolated test file first
@@ -71,10 +71,10 @@ beforeEach(() => {
 });
 afterEach(() => {
-  if (savedBaseDataDir === undefined) {
-    delete process.env.BASE_DATA_DIR;
+  if (savedGatewaySecurityDir === undefined) {
+    delete process.env.GATEWAY_SECURITY_DIR;
   } else {
-    process.env.BASE_DATA_DIR = savedBaseDataDir;
+    process.env.GATEWAY_SECURITY_DIR = savedGatewaySecurityDir;
   }
   try {
     rmSync(testDir, { recursive: true, force: true });