@vellumai/vellum-gateway 0.6.3 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +111 -28
- package/Dockerfile +2 -2
- package/bun.lock +1 -0
- package/bunfig.toml +6 -0
- package/package.json +3 -2
- package/src/__tests__/config-file-cache.test.ts +6 -6
- package/src/__tests__/config.test.ts +1 -1
- package/src/__tests__/credential-reader.test.ts +12 -12
- package/src/__tests__/credential-watcher-managed-bootstrap.test.ts +77 -11
- package/src/__tests__/credential-watcher.test.ts +3 -2
- package/src/__tests__/feature-flags-route.test.ts +5 -5
- package/src/__tests__/ipc-contact-routes.test.ts +302 -0
- package/src/__tests__/ipc-feature-flag-routes.test.ts +284 -0
- package/src/__tests__/privacy-config-route.test.ts +911 -0
- package/src/__tests__/remote-feature-flag-sync.test.ts +5 -5
- package/src/__tests__/runtime-proxy.test.ts +114 -0
- package/src/__tests__/schema.test.ts +2 -0
- package/src/__tests__/slack-deliver.test.ts +287 -0
- package/src/__tests__/slack-errors.test.ts +14 -0
- package/src/__tests__/stt-stream-websocket.test.ts +392 -0
- package/src/__tests__/test-preload.ts +28 -0
- package/src/__tests__/twilio-media-websocket.test.ts +618 -0
- package/src/auth/token-service.ts +4 -9
- package/src/avatar-sync/avatar-channel-syncer.ts +78 -0
- package/src/avatar-sync/avatar-sync-watcher.ts +80 -0
- package/src/avatar-sync/slack-avatar-syncer.ts +70 -0
- package/src/avatar-sync/types.ts +16 -0
- package/src/cli/enable-proxy.ts +3 -6
- package/src/config.ts +3 -18
- package/src/credential-reader.ts +11 -23
- package/src/credential-watcher.ts +3 -3
- package/src/db/connection.ts +97 -6
- package/src/db/contact-store.ts +156 -0
- package/src/db/data-migrations/index.ts +73 -0
- package/src/db/data-migrations/m0001-guardian-init-lock.ts +62 -0
- package/src/email/register-callback.ts +7 -0
- package/src/feature-flag-registry.json +46 -14
- package/src/feature-flag-remote-store.ts +4 -9
- package/src/feature-flag-store.ts +4 -9
- package/src/http/routes/channel-verification-session-proxy.ts +2 -2
- package/src/http/routes/email-webhook.ts +6 -2
- package/src/http/routes/privacy-config.ts +217 -8
- package/src/http/routes/slack-deliver.ts +147 -24
- package/src/http/routes/stt-stream-websocket.ts +277 -0
- package/src/http/routes/twilio-media-websocket.ts +271 -0
- package/src/index.ts +185 -1
- package/src/ipc/contact-handlers.ts +65 -0
- package/src/ipc/feature-flag-handlers.ts +61 -0
- package/src/ipc/server.ts +272 -0
- package/src/logger.ts +1 -1
- package/src/paths.ts +83 -0
- package/src/platform-url.ts +24 -0
- package/src/schema.ts +259 -1
- package/src/slack/errors.ts +10 -0
- package/src/telegram/webhook-manager.ts +18 -0
- package/src/trust-store.ts +2 -6
package/ARCHITECTURE.md
CHANGED
|
@@ -32,6 +32,56 @@ Internet
|
|
|
32
32
|
+-- /webhooks/* --> BLOCKED (404, never forwarded to runtime)
|
|
33
33
|
```
|
|
34
34
|
|
|
35
|
+
### STT Route Proxying (Assistant-Scoped Rewrite)
|
|
36
|
+
|
|
37
|
+
Native clients (macOS, iOS) send speech-to-text transcription requests through the gateway to the daemon's STT service. Clients POST to the assistant-scoped path `/v1/assistants/:assistantId/stt/transcribe`, which the gateway's runtime proxy rewrites to the flat daemon path `/v1/stt/transcribe`. This follows the same assistant-scoped rewrite pattern used by other client-facing endpoints (feature flags, privacy config, etc.).
|
|
38
|
+
|
|
39
|
+
The request carries base64-encoded WAV audio and a MIME type. The daemon resolves the configured STT provider via `resolveBatchTranscriber()` and returns the transcribed text. Clients use the response to implement a service-first strategy: the service transcription takes precedence when available, with Apple-native `SFSpeechRecognizer` as fallback when the service returns 503 (not configured) or fails.
|
|
40
|
+
|
|
41
|
+
| Client path (gateway) | Daemon path (after rewrite) | Method |
|
|
42
|
+
| ----------------------------------- | --------------------------- | ------ |
|
|
43
|
+
| `/v1/assistants/:id/stt/transcribe` | `/v1/stt/transcribe` | POST |
|
|
44
|
+
|
|
45
|
+
**Key source files:**
|
|
46
|
+
|
|
47
|
+
| File | Purpose |
|
|
48
|
+
| ------------------------------------------------ | ------------------------------------------------------------------------- |
|
|
49
|
+
| `gateway/src/http/routes/runtime-proxy.ts` | Assistant-scoped path rewriting (`/v1/assistants/:id/...` → `/v1/...`) |
|
|
50
|
+
| `assistant/src/runtime/routes/stt-routes.ts` | Daemon HTTP endpoint: validates audio, resolves transcriber, returns text |
|
|
51
|
+
| `clients/shared/Network/STTClient.swift` | Shared client: POSTs audio to the gateway, returns typed `STTResult` |
|
|
52
|
+
| `clients/shared/Utilities/AudioWavEncoder.swift` | WAV encoding utility for PCM audio buffers |
|
|
53
|
+
|
|
54
|
+
### STT Streaming WebSocket Proxy
|
|
55
|
+
|
|
56
|
+
Native clients (macOS, iOS) open WebSocket connections through the gateway to the daemon's real-time STT streaming endpoint for conversation chat message capture. The gateway authenticates the downstream client using an edge JWT (actor principal required), then opens an upstream WebSocket connection to the daemon's `/v1/stt/stream` endpoint with a short-lived gateway service token. This keeps the daemon's WebSocket endpoint unreachable from the public internet while allowing authenticated clients to stream audio for real-time transcription.
|
|
57
|
+
|
|
58
|
+
**Config-authoritative model:** The runtime always resolves the streaming transcriber from `services.stt.provider` in the assistant config, regardless of any `provider` query parameter. The `provider` parameter is optional compatibility metadata — when supplied and it disagrees with the configured provider, the runtime logs a mismatch warning for operator visibility.
|
|
59
|
+
|
|
60
|
+
**Client path:** `wss://<gateway>/v1/stt/stream?mimeType=<mime>[&provider=<id>][&sampleRate=<hz>]`
|
|
61
|
+
|
|
62
|
+
**Query parameters:**
|
|
63
|
+
|
|
64
|
+
| Parameter | Required | Description |
|
|
65
|
+
| ------------ | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
66
|
+
| `mimeType` | Yes | MIME type of the audio being streamed (e.g. `audio/webm;codecs=opus`) |
|
|
67
|
+
| `provider` | No | Optional STT provider identifier (`deepgram`, `google-gemini`). Forwarded as compatibility metadata — the runtime resolves the transcriber from config, not from this parameter. |
|
|
68
|
+
| `sampleRate` | No | Sample rate in Hz (e.g. `16000`). Passed through to the daemon. |
|
|
69
|
+
| `token` | No | Edge JWT (alternative to `Authorization: Bearer` header for WS upgrades) |
|
|
70
|
+
|
|
71
|
+
**Auth model:** STT streaming is an authenticated, assistant-scoped path. The client must present a valid edge JWT with an actor principal. Service tokens are rejected. When `runtimeProxyRequireAuth` is globally disabled (dev bypass), the upgrade proceeds without token validation.
|
|
72
|
+
|
|
73
|
+
**Proxy behavior:** The gateway buffers up to 100 downstream messages while the upstream connection to the daemon is being established. If the buffer overflows, the downstream connection is closed with code 1008 (policy violation). Once the upstream connection opens, buffered messages are flushed in order. All subsequent messages are forwarded bidirectionally: client audio frames flow upstream, daemon transcript events (JSON text frames: `ready`, `partial`, `final`, `error`, `closed`) flow downstream. When either side closes, the other side is closed with the same code/reason.
|
|
74
|
+
|
|
75
|
+
**Key source files:**
|
|
76
|
+
|
|
77
|
+
| File | Purpose |
|
|
78
|
+
| ------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
|
|
79
|
+
| `gateway/src/http/routes/stt-stream-websocket.ts` | WebSocket upgrade handler (`createSttStreamWebsocketHandler`) and proxy handlers (`getSttStreamWebsocketHandlers`) |
|
|
80
|
+
| `gateway/src/index.ts` | Route registration: wires upgrade handler to the gateway's Bun HTTP server |
|
|
81
|
+
| `assistant/src/runtime/http-server.ts` | Daemon-side WebSocket upgrade at `/v1/stt/stream`, session creation and registry |
|
|
82
|
+
| `assistant/src/stt/stt-stream-session.ts` | Runtime session orchestrator: drives the `StreamingTranscriber` from the WebSocket |
|
|
83
|
+
| `clients/shared/Network/STTStreamingClient.swift` | Swift client: builds the gateway WS URL via `GatewayHTTPClient.buildWebSocketRequest` |
|
|
84
|
+
|
|
35
85
|
### Assistant Feature Flags API
|
|
36
86
|
|
|
37
87
|
The gateway exposes a REST API for reading and mutating assistant feature flags. Assistant feature flags are assistant-scoped, declaration-driven booleans that can gate any assistant behavior. Skill availability is one consumer, but not a required coupling (see [`assistant/ARCHITECTURE.md`](../assistant/ARCHITECTURE.md) for resolver and skill enforcement details).
|
|
@@ -551,12 +601,23 @@ If no guardian binding exists for the channel, escalation fails closed -- the me
|
|
|
551
601
|
|
|
552
602
|
#### SQLite Tables
|
|
553
603
|
|
|
604
|
+
**Assistant DB** (`assistant.db` — current owner, migrating to gateway):
|
|
605
|
+
|
|
554
606
|
| Table | Purpose |
|
|
555
607
|
| --------------------------- | --------------------------------------------------------------------- |
|
|
556
608
|
| `assistant_ingress_invites` | Invite tokens with SHA-256 hashes, expiry, use counts |
|
|
557
609
|
| `contacts` | Contact records with role, relationship, and per-contact metadata |
|
|
558
610
|
| `contact_channels` | Channel bindings per contact with access policy (allow/deny/escalate) |
|
|
559
611
|
|
|
612
|
+
**Gateway DB** (`gateway.sqlite` — future owner of auth/authz):
|
|
613
|
+
|
|
614
|
+
| Table | Purpose |
|
|
615
|
+
| ------------------ | ---------------------------------------------------------------------- |
|
|
616
|
+
| `contacts` | Contact auth/authz: id, display_name, role, principal_id |
|
|
617
|
+
| `contact_channels` | Channel bindings with policy, status, external IDs, verification state |
|
|
618
|
+
|
|
619
|
+
The gateway declares `contacts` and `contact_channels` tables and exposes them via IPC (`list_contacts`, `get_contact`, `get_contact_by_channel`, `get_channels_for_contact`). Endpoint cutover and data migration are in progress — the gateway will become the canonical owner once dual-writing is enabled.
|
|
620
|
+
|
|
560
621
|
#### Key Modules
|
|
561
622
|
|
|
562
623
|
| Module | Purpose |
|
|
@@ -566,6 +627,8 @@ If no guardian binding exists for the channel, escalation fails closed -- the me
|
|
|
566
627
|
| `assistant/src/contacts/contacts-write.ts` | Contact and channel writes (upsert, policy changes, invite redemption) |
|
|
567
628
|
| `assistant/src/daemon/handlers/config-inbox.ts` | Handlers for invite and member contracts |
|
|
568
629
|
| `assistant/src/runtime/routes/channel-routes.ts` | ACL enforcement point -- member lookup, policy check, escalation creation |
|
|
630
|
+
| `gateway/src/db/contact-store.ts` | Gateway-side read-only ContactStore (prepared-statement queries) |
|
|
631
|
+
| `gateway/src/ipc/contact-handlers.ts` | IPC route handlers for contact reads |
|
|
569
632
|
|
|
570
633
|
### Telegram Credential Flow
|
|
571
634
|
|
|
@@ -678,9 +741,9 @@ The Socket Mode client auto-reconnects on any WebSocket close or error. The back
|
|
|
678
741
|
|
|
679
742
|
---
|
|
680
743
|
|
|
681
|
-
## AI Phone Calls — Twilio
|
|
744
|
+
## AI Phone Calls — Twilio Voice
|
|
682
745
|
|
|
683
|
-
The Calls subsystem supports both **outbound** and **inbound** voice calls via Twilio
|
|
746
|
+
The Calls subsystem supports both **outbound** and **inbound** voice calls via Twilio. The Twilio integration path is provider-conditional: `services.stt.provider` determines whether calls use ConversationRelay (Twilio-native STT for Deepgram/Google) or Media Streams (daemon-side STT for OpenAI Whisper). The assistant uses an LLM-driven conversation loop to speak in real time. Voice is a first-class channel with its own per-call conversation (outbound key: `asst:${assistantId}:voice:call:${callSessionId}`, inbound key: `asst:${assistantId}:voice:inbound:${callSid}`). When the AI needs guardian input during a call, it dispatches ASK_GUARDIAN requests cross-channel to mac/telegram via the guardian dispatch engine. Answer resolution uses first-writer-wins semantics -- the first channel to respond provides the answer, and remaining channels receive an "already answered" notice.
|
|
684
747
|
|
|
685
748
|
### Outbound Call Flow
|
|
686
749
|
|
|
@@ -891,7 +954,8 @@ sequenceDiagram
|
|
|
891
954
|
| `gateway/src/http/routes/twilio-voice-webhook.ts` | Gateway route: validates Twilio signature, forwards voice webhook to runtime |
|
|
892
955
|
| `gateway/src/http/routes/twilio-status-webhook.ts` | Gateway route: validates Twilio signature, forwards status callback to runtime |
|
|
893
956
|
| `gateway/src/http/routes/twilio-connect-action-webhook.ts` | Gateway route: validates Twilio signature, forwards connect-action to runtime |
|
|
894
|
-
| `gateway/src/http/routes/twilio-relay-websocket.ts` | Gateway route: WebSocket proxy for ConversationRelay frames between Twilio and runtime
|
|
957
|
+
| `gateway/src/http/routes/twilio-relay-websocket.ts` | Gateway route: WebSocket proxy for ConversationRelay frames between Twilio and runtime (used for Deepgram/Google native STT) |
|
|
958
|
+
| `gateway/src/http/routes/twilio-media-websocket.ts` | Gateway route: WebSocket proxy for Media Streams frames between Twilio and runtime (used for OpenAI Whisper media-stream STT) |
|
|
895
959
|
| `gateway/src/twilio/validate-webhook.ts` | Twilio webhook validation: HMAC-SHA1 signature verification, payload size limits, fail-closed when auth token missing |
|
|
896
960
|
|
|
897
961
|
### Call State Machine
|
|
@@ -960,14 +1024,17 @@ All five tables live in `~/.vellum/workspace/data/db/assistant.db` alongside exi
|
|
|
960
1024
|
|
|
961
1025
|
Internet-facing Twilio callbacks terminate at the gateway, which validates signatures before forwarding to the runtime. This keeps the runtime behind the gateway's bearer-auth boundary.
|
|
962
1026
|
|
|
963
|
-
| Gateway Route
|
|
964
|
-
|
|
|
965
|
-
| `POST /webhooks/twilio/voice`
|
|
966
|
-
| `POST /webhooks/twilio/status`
|
|
967
|
-
| `POST /webhooks/twilio/connect-action`
|
|
968
|
-
| `WS /webhooks/twilio/relay`
|
|
1027
|
+
| Gateway Route | Validates | Forwards To (Runtime) |
|
|
1028
|
+
| ---------------------------------------------------------- | --------------------------------- | ---------------------------------------------------------------------------------------- |
|
|
1029
|
+
| `POST /webhooks/twilio/voice` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/voice-webhook` (JSON: `{ params, originalUrl, assistantId? }`) |
|
|
1030
|
+
| `POST /webhooks/twilio/status` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/status` (JSON: `{ params }`) |
|
|
1031
|
+
| `POST /webhooks/twilio/connect-action` | HMAC-SHA1 signature, payload size | `POST /v1/internal/twilio/connect-action` (JSON: `{ params }`) |
|
|
1032
|
+
| `WS /webhooks/twilio/relay` | WebSocket upgrade | `WS /v1/calls/relay` (bidirectional proxy) — ConversationRelay path |
|
|
1033
|
+
| `WS /webhooks/twilio/media-stream/<callSessionId>/<token>` | WebSocket upgrade | `WS /v1/calls/media-stream` (bidirectional proxy) — Media Streams path |
|
|
1034
|
+
|
|
1035
|
+
In gateway-fronted deployments, the TwiML WebSocket URL (returned by the voice webhook) should point to the gateway's `/webhooks/twilio/relay` (ConversationRelay) or `/webhooks/twilio/media-stream/<callSessionId>/<token>` (Media Streams) endpoint rather than directly to the runtime. The gateway proxies frames bidirectionally between Twilio and the runtime, preserving close and error semantics for proper cleanup.
|
|
969
1036
|
|
|
970
|
-
|
|
1037
|
+
**Media Streams handshake metadata:** Twilio Media Streams does not reliably preserve URL query parameters across the WebSocket upgrade, so handshake metadata (`callSessionId` and auth `token`) is encoded as **URL path segments** (primary transport). The gateway also supports a legacy query-parameter-based handshake as a fallback for backward compatibility. The metadata extractor in `twilio-media-websocket.ts` resolves values from path segments first, falling back to query parameters.
|
|
971
1038
|
|
|
972
1039
|
Signature validation is **fail-closed**: if the Twilio auth token is not configured, all webhook requests are rejected with `403`. Missing or invalid `X-Twilio-Signature` headers are also rejected with `403`. Payload size is capped by `maxWebhookPayloadBytes` (checked via both `Content-Length` header and actual body size).
|
|
973
1040
|
|
|
@@ -999,7 +1066,8 @@ This makes ingress URL updates smoother in local tunnel workflows because Twilio
|
|
|
999
1066
|
| POST | `/v1/calls/:callSessionId/instruction` | Relay a steering instruction to an active call's controller (alternative to in-conversation bridge) |
|
|
1000
1067
|
| POST | `/v1/internal/twilio/status` | Internal status callback used by gateway; accepts JSON `{ params }` |
|
|
1001
1068
|
| POST | `/v1/internal/twilio/connect-action` | Internal connect action callback used by gateway; accepts JSON `{ params }` |
|
|
1002
|
-
| WS | `/v1/calls/relay` | ConversationRelay WebSocket (bidirectional: prompt/interrupt/dtmf from Twilio, text tokens/end to Twilio)
|
|
1069
|
+
| WS | `/v1/calls/relay` | ConversationRelay WebSocket (bidirectional: prompt/interrupt/dtmf from Twilio, text tokens/end to Twilio) — Deepgram/Google path |
|
|
1070
|
+
| WS | `/v1/calls/media-stream` | Media Streams WebSocket (raw audio from Twilio, daemon-side STT) — OpenAI Whisper path |
|
|
1003
1071
|
|
|
1004
1072
|
### Tools
|
|
1005
1073
|
|
|
@@ -1016,7 +1084,7 @@ Both tools and HTTP routes delegate to the same domain functions in `call-domain
|
|
|
1016
1084
|
The CallController detects two special markers in the LLM's response text:
|
|
1017
1085
|
|
|
1018
1086
|
- **`[ASK_GUARDIAN: question]`** — The AI needs to consult the guardian. The controller creates a pending question, notifies the session via `fireCallQuestionNotifier`, puts the caller on hold, and waits for a guardian answer (timeout configured via `calls.userConsultTimeoutSeconds`).
|
|
1019
|
-
- **`[END_CALL]`** — The AI has determined the call's purpose is fulfilled. The controller sends a goodbye, closes the
|
|
1087
|
+
- **`[END_CALL]`** — The AI has determined the call's purpose is fulfilled. The controller sends a goodbye, closes the call session, and marks the call as completed.
|
|
1020
1088
|
|
|
1021
1089
|
Both markers are stripped from the TTS output so the callee never hears the raw control text.
|
|
1022
1090
|
|
|
@@ -1035,19 +1103,20 @@ Malformed or unprocessable provider callback payloads are logged as dead-letter
|
|
|
1035
1103
|
|
|
1036
1104
|
Call behavior is controlled via the `calls` config block in the assistant configuration (`config/schema.ts`). All values have sensible defaults and are validated via Zod:
|
|
1037
1105
|
|
|
1038
|
-
| Field
|
|
1039
|
-
|
|
|
1040
|
-
| `calls.enabled`
|
|
1041
|
-
| `calls.provider`
|
|
1042
|
-
| `calls.maxDurationSeconds`
|
|
1043
|
-
| `calls.userConsultTimeoutSeconds`
|
|
1044
|
-
| `calls.disclosure.enabled`
|
|
1045
|
-
| `calls.disclosure.text`
|
|
1046
|
-
| `calls.safety.denyCategories`
|
|
1047
|
-
| `calls.model`
|
|
1048
|
-
| `calls.voice.language`
|
|
1049
|
-
| `
|
|
1050
|
-
| `
|
|
1106
|
+
| Field | Type | Default | Description |
|
|
1107
|
+
| --------------------------------- | -------- | ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
1108
|
+
| `calls.enabled` | boolean | `true` | Master toggle for the calls feature. When `false`, call routes return 403 and tools return errors. |
|
|
1109
|
+
| `calls.provider` | enum | `'twilio'` | Voice provider to use (currently only Twilio is supported). |
|
|
1110
|
+
| `calls.maxDurationSeconds` | int | `3600` | Maximum allowed duration per call. |
|
|
1111
|
+
| `calls.userConsultTimeoutSeconds` | int | `120` | How long to wait for a user answer before timing out a pending question. |
|
|
1112
|
+
| `calls.disclosure.enabled` | boolean | `true` | Whether the AI should disclose it is an AI at the start of the call. |
|
|
1113
|
+
| `calls.disclosure.text` | string | _(default disclosure prompt)_ | The disclosure instruction included in the system prompt. |
|
|
1114
|
+
| `calls.safety.denyCategories` | string[] | `[]` | Categories of calls to deny. Emergency numbers are always denied regardless of this setting. |
|
|
1115
|
+
| `calls.model` | string | _(unset — uses default model)_ | Optional override for the LLM model used in voice call conversations. |
|
|
1116
|
+
| `calls.voice.language` | string | `'en-US'` | Language code for TTS and transcription. |
|
|
1117
|
+
| `services.stt.provider` | enum | `'deepgram'` | STT provider for all boundaries including telephony. Determines the Twilio integration path (ConversationRelay-native for `deepgram`/`google-gemini`, media-stream for `openai-whisper`). |
|
|
1118
|
+
| `services.tts.provider` | enum | `'elevenlabs'` | Active TTS provider for speech synthesis (catalog-driven; see [TTS Provider Abstraction](../assistant/ARCHITECTURE.md#tts-provider-abstraction-servicestts)). |
|
|
1119
|
+
| `services.tts.providers.<id>.*` | object | _(per-provider defaults)_ | Provider-specific settings block. One block per catalog entry (e.g. `elevenlabs`, `fish-audio`). |
|
|
1051
1120
|
|
|
1052
1121
|
### Caller Identity Resolution
|
|
1053
1122
|
|
|
@@ -1064,10 +1133,24 @@ Both the resolved mode and source are logged at info level on success, and rejec
|
|
|
1064
1133
|
|
|
1065
1134
|
### Voice Quality Profile Resolution
|
|
1066
1135
|
|
|
1067
|
-
Voice and TTS settings are configurable via the `calls.voice` config
|
|
1136
|
+
Voice and TTS settings are configurable via the `calls.voice` and `services.tts` config blocks — they are not hardcoded. The function `resolveVoiceQualityProfile()` in `voice-quality.ts` uses the catalog-driven call strategy abstraction to determine how the active TTS provider integrates with the Twilio telephony path, then resolves the result into a `VoiceQualityProfile` containing the TTS provider, voice spec string, and language.
|
|
1137
|
+
|
|
1138
|
+
The active TTS provider is determined by `services.tts.provider` (default: `"elevenlabs"`). Provider-specific settings (voice ID, model, tuning parameters) are read from `services.tts.providers.<id>`. The call mode (`native-twilio` or `synthesized-play`) is resolved from the canonical provider catalog via `resolveCallStrategy()` in `tts-call-strategy.ts` — it reads the provider's declared `callMode` rather than inferring behavior from runtime capabilities.
|
|
1139
|
+
|
|
1140
|
+
For `native-twilio` providers (e.g. ElevenLabs), the voice quality profile looks up a registered `NativeTwilioVoiceSpecBuilder` to construct the provider-specific voice spec string for the ConversationRelay `voice` attribute. New native providers plug in by registering their own voice spec builder — no edits to core call routing logic required. For `synthesized-play` providers (e.g. Fish Audio), `ttsProvider` is set to `"Google"` as a placeholder in TwiML and actual audio is delivered via `play` messages — the assistant synthesises audio via the provider's HTTP API.
|
|
1141
|
+
|
|
1142
|
+
The voice webhook in `twilio-routes.ts` calls `resolveVoiceQualityProfile()` for TTS settings and separately resolves the telephony STT strategy via `resolveTelephonySttRouting()`. The routing result determines which TwiML generator to use: `generateTwiML()` for Twilio-native ConversationRelay, or `generateStreamTwiML()` for the media-stream path. This separation keeps TTS and STT resolution independent — the voice quality profile controls the TTS provider, voice, and language, while the routing strategy controls the STT integration path.
|
|
1143
|
+
|
|
1144
|
+
For full details on the catalog-driven TTS architecture, provider catalog, call strategy abstraction, and the provider-add checklist, see the [TTS Provider Abstraction](../assistant/ARCHITECTURE.md#tts-provider-abstraction-servicestts) section in the assistant architecture docs.
|
|
1145
|
+
|
|
1146
|
+
### Telephony STT: Provider-Conditional Hybrid Routing
|
|
1147
|
+
|
|
1148
|
+
Telephony STT is unified under `services.stt.provider`. The voice webhook in `twilio-routes.ts` calls `resolveTelephonySttRouting()` to determine the Twilio integration path based on the active provider:
|
|
1149
|
+
|
|
1150
|
+
- **Deepgram / Google** (`conversation-relay-native` strategy) — TwiML emits `<Connect><ConversationRelay>` with Twilio-native `transcriptionProvider` and `speechModel` attributes. The gateway proxies ConversationRelay frames via `/webhooks/twilio/relay`. The daemon receives transcribed text, not raw audio.
|
|
1068
1151
|
|
|
1069
|
-
|
|
1152
|
+
- **OpenAI Whisper** (`media-stream-custom` strategy) — TwiML emits `<Connect><Stream>` pointing to the gateway's media-stream proxy (`/webhooks/twilio/media-stream`). The gateway forwards raw audio frames to the daemon's media-stream server, which transcribes server-side.
|
|
1070
1153
|
|
|
1071
|
-
|
|
1154
|
+
Both paths are active in production. The strategy selection happens at call setup time based on the current `services.stt.provider` value. See `docs/internal-reference.md` for a provider-specific troubleshooting matrix.
|
|
1072
1155
|
|
|
1073
1156
|
---
|
package/Dockerfile
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
FROM oven/bun:1.3.11@sha256:0733e50325078969732ebe3b15ce4c4be5082f18c4ac1a0f0ca4839c2e4e42a7 AS bun
|
|
3
3
|
|
|
4
4
|
# Build stage
|
|
5
|
-
FROM debian:trixie@sha256:
|
|
5
|
+
FROM debian:trixie@sha256:3352c2e13876c8a5c5873ef20870e1939e73cb9a3c1aeba5e3e72172a85ce9ed AS builder
|
|
6
6
|
|
|
7
7
|
WORKDIR /app
|
|
8
8
|
|
|
@@ -14,7 +14,7 @@ RUN bun install --frozen-lockfile --production
|
|
|
14
14
|
COPY . .
|
|
15
15
|
|
|
16
16
|
# Runtime stage
|
|
17
|
-
FROM debian:trixie-slim@sha256:
|
|
17
|
+
FROM debian:trixie-slim@sha256:4ffb3a1511099754cddc70eb1b12e50ffdb67619aa0ab6c13fcd800a78ef7c7a AS runner
|
|
18
18
|
|
|
19
19
|
WORKDIR /app
|
|
20
20
|
|
package/bun.lock
CHANGED
package/bunfig.toml
ADDED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vellumai/vellum-gateway",
|
|
3
|
-
"version": "0.6.
|
|
3
|
+
"version": "0.6.4",
|
|
4
4
|
"license": "MIT",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": {
|
|
@@ -27,7 +27,8 @@
|
|
|
27
27
|
"minimatch": "10.2.4",
|
|
28
28
|
"pino": "9.14.0",
|
|
29
29
|
"pino-pretty": "13.1.3",
|
|
30
|
-
"uuid": "13.0.0"
|
|
30
|
+
"uuid": "13.0.0",
|
|
31
|
+
"zod": "4.3.6"
|
|
31
32
|
},
|
|
32
33
|
"devDependencies": {
|
|
33
34
|
"@types/bun": "1.3.9",
|
|
@@ -13,26 +13,26 @@ import { ConfigFileCache } from "../config-file-cache.js";
|
|
|
13
13
|
let testBaseDir: string;
|
|
14
14
|
let workspaceDir: string;
|
|
15
15
|
let configPath: string;
|
|
16
|
-
let
|
|
16
|
+
let savedWorkspaceDir: string | undefined;
|
|
17
17
|
|
|
18
18
|
function writeConfig(data: Record<string, unknown>): void {
|
|
19
19
|
writeFileSync(configPath, JSON.stringify(data));
|
|
20
20
|
}
|
|
21
21
|
|
|
22
22
|
beforeEach(() => {
|
|
23
|
-
|
|
23
|
+
savedWorkspaceDir = process.env.VELLUM_WORKSPACE_DIR;
|
|
24
24
|
testBaseDir = mkdtempSync(join(tmpdir(), "config-file-cache-test-"));
|
|
25
25
|
workspaceDir = join(testBaseDir, ".vellum", "workspace");
|
|
26
26
|
mkdirSync(workspaceDir, { recursive: true });
|
|
27
27
|
configPath = join(workspaceDir, "config.json");
|
|
28
|
-
process.env.
|
|
28
|
+
process.env.VELLUM_WORKSPACE_DIR = workspaceDir;
|
|
29
29
|
});
|
|
30
30
|
|
|
31
31
|
afterEach(() => {
|
|
32
|
-
if (
|
|
33
|
-
delete process.env.
|
|
32
|
+
if (savedWorkspaceDir === undefined) {
|
|
33
|
+
delete process.env.VELLUM_WORKSPACE_DIR;
|
|
34
34
|
} else {
|
|
35
|
-
process.env.
|
|
35
|
+
process.env.VELLUM_WORKSPACE_DIR = savedWorkspaceDir;
|
|
36
36
|
}
|
|
37
37
|
rmSync(testBaseDir, { recursive: true, force: true });
|
|
38
38
|
});
|
|
@@ -23,7 +23,7 @@ describe("config: hardcoded defaults", () => {
|
|
|
23
23
|
expect(config.unmappedPolicy).toBe("reject");
|
|
24
24
|
expect(config.routingEntries).toEqual([]);
|
|
25
25
|
expect(config.defaultAssistantId).toBeUndefined();
|
|
26
|
-
expect(config.logFile.dir).toMatch(
|
|
26
|
+
expect(config.logFile.dir).toMatch(/logs$/);
|
|
27
27
|
expect(config.logFile.retentionDays).toBe(30);
|
|
28
28
|
});
|
|
29
29
|
|
|
@@ -39,7 +39,7 @@ const testDir = join(
|
|
|
39
39
|
);
|
|
40
40
|
|
|
41
41
|
function metadataDir(): string {
|
|
42
|
-
return join(testDir, "
|
|
42
|
+
return join(testDir, "data", "credentials");
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
function writeMetadata(
|
|
@@ -106,8 +106,8 @@ function encryptEntries(
|
|
|
106
106
|
}
|
|
107
107
|
|
|
108
108
|
function writeEncryptedStore(entries: Record<string, string>): void {
|
|
109
|
-
|
|
110
|
-
|
|
109
|
+
mkdirSync(testDir, { recursive: true });
|
|
110
|
+
const storePath = join(testDir, "keys.enc");
|
|
111
111
|
|
|
112
112
|
const salt = randomBytes(16);
|
|
113
113
|
const key = pbkdf2Sync(
|
|
@@ -131,17 +131,16 @@ function writeEncryptedStore(entries: Record<string, string>): void {
|
|
|
131
131
|
* The store.key is used directly as the AES-256-GCM key (no PBKDF2).
|
|
132
132
|
*/
|
|
133
133
|
function writeEncryptedStoreV2(entries: Record<string, string>): void {
|
|
134
|
-
|
|
135
|
-
mkdirSync(protectedDir, { recursive: true });
|
|
134
|
+
mkdirSync(testDir, { recursive: true });
|
|
136
135
|
|
|
137
136
|
const storeKey = randomBytes(KEY_LENGTH);
|
|
138
|
-
writeFileSync(join(
|
|
137
|
+
writeFileSync(join(testDir, "store.key"), storeKey);
|
|
139
138
|
|
|
140
139
|
const store = {
|
|
141
140
|
version: 2,
|
|
142
141
|
entries: encryptEntries(entries, storeKey),
|
|
143
142
|
};
|
|
144
|
-
writeFileSync(join(
|
|
143
|
+
writeFileSync(join(testDir, "keys.enc"), JSON.stringify(store));
|
|
145
144
|
}
|
|
146
145
|
|
|
147
146
|
// ---------------------------------------------------------------------------
|
|
@@ -149,12 +148,14 @@ function writeEncryptedStoreV2(entries: Record<string, string>): void {
|
|
|
149
148
|
// ---------------------------------------------------------------------------
|
|
150
149
|
|
|
151
150
|
beforeEach(() => {
|
|
152
|
-
process.env.
|
|
151
|
+
process.env.GATEWAY_SECURITY_DIR = testDir;
|
|
152
|
+
process.env.VELLUM_WORKSPACE_DIR = testDir;
|
|
153
153
|
logCalls.length = 0;
|
|
154
154
|
});
|
|
155
155
|
|
|
156
156
|
afterEach(() => {
|
|
157
|
-
delete process.env.
|
|
157
|
+
delete process.env.GATEWAY_SECURITY_DIR;
|
|
158
|
+
delete process.env.VELLUM_WORKSPACE_DIR;
|
|
158
159
|
try {
|
|
159
160
|
rmSync(testDir, { recursive: true, force: true });
|
|
160
161
|
} catch {
|
|
@@ -178,8 +179,7 @@ describe("v2 encrypted store with store.key", () => {
|
|
|
178
179
|
|
|
179
180
|
test("returns undefined for v2 store when store.key is missing", async () => {
|
|
180
181
|
// Write a v2 store but without the store.key file
|
|
181
|
-
|
|
182
|
-
mkdirSync(protectedDir, { recursive: true });
|
|
182
|
+
mkdirSync(testDir, { recursive: true });
|
|
183
183
|
|
|
184
184
|
const storeKey = randomBytes(KEY_LENGTH);
|
|
185
185
|
const store = {
|
|
@@ -189,7 +189,7 @@ describe("v2 encrypted store with store.key", () => {
|
|
|
189
189
|
storeKey,
|
|
190
190
|
),
|
|
191
191
|
};
|
|
192
|
-
writeFileSync(join(
|
|
192
|
+
writeFileSync(join(testDir, "keys.enc"), JSON.stringify(store));
|
|
193
193
|
// Deliberately do NOT write store.key
|
|
194
194
|
|
|
195
195
|
const result = await readCredential(credentialKey("test", "key"));
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { afterEach, describe, expect, test } from "bun:test";
|
|
2
|
+
import { createServer } from "node:net";
|
|
2
3
|
import { spawn, type ChildProcess } from "node:child_process";
|
|
3
4
|
import { mkdirSync, renameSync, rmSync, writeFileSync } from "node:fs";
|
|
4
5
|
import { tmpdir } from "node:os";
|
|
@@ -54,19 +55,51 @@ let gatewayPort = 0;
|
|
|
54
55
|
let cesPort = 0;
|
|
55
56
|
let cesServer: ReturnType<typeof Bun.serve> | null = null;
|
|
56
57
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
/** Ask the OS for a free port by briefly binding to port 0. */
|
|
59
|
+
function getFreePort(): Promise<number> {
|
|
60
|
+
return new Promise((resolve, reject) => {
|
|
61
|
+
const srv = createServer();
|
|
62
|
+
srv.listen(0, "127.0.0.1", () => {
|
|
63
|
+
const addr = srv.address();
|
|
64
|
+
if (!addr || typeof addr === "string") {
|
|
65
|
+
srv.close();
|
|
66
|
+
reject(new Error("Failed to get free port"));
|
|
67
|
+
return;
|
|
68
|
+
}
|
|
69
|
+
const port = addr.port;
|
|
70
|
+
srv.close(() => resolve(port));
|
|
71
|
+
});
|
|
72
|
+
srv.on("error", reject);
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/** Wait for a child process to exit, with a safety timeout. */
|
|
77
|
+
function waitForExit(proc: ChildProcess, timeoutMs = 5_000): Promise<void> {
|
|
78
|
+
return new Promise<void>((resolve) => {
|
|
79
|
+
if (proc.exitCode !== null || proc.signalCode !== null) {
|
|
80
|
+
resolve();
|
|
81
|
+
return;
|
|
82
|
+
}
|
|
83
|
+
const timer = setTimeout(resolve, timeoutMs);
|
|
84
|
+
proc.on("exit", () => {
|
|
85
|
+
clearTimeout(timer);
|
|
86
|
+
resolve();
|
|
87
|
+
});
|
|
88
|
+
});
|
|
61
89
|
}
|
|
62
90
|
|
|
63
91
|
async function startGateway(): Promise<void> {
|
|
64
|
-
|
|
92
|
+
if (cesPort === 0)
|
|
93
|
+
throw new Error(
|
|
94
|
+
"CES port not assigned — call startFakeCes or reserveCesPort first",
|
|
95
|
+
);
|
|
96
|
+
gatewayPort = await getFreePort();
|
|
65
97
|
|
|
66
98
|
gatewayProc = spawn("bun", ["run", gatewayEntry], {
|
|
67
99
|
env: {
|
|
68
100
|
...process.env,
|
|
69
|
-
|
|
101
|
+
GATEWAY_SECURITY_DIR: join(testDir, ".vellum", "protected"),
|
|
102
|
+
VELLUM_WORKSPACE_DIR: join(testDir, ".vellum", "workspace"),
|
|
70
103
|
GATEWAY_PORT: String(gatewayPort),
|
|
71
104
|
CES_CREDENTIAL_URL: `http://127.0.0.1:${cesPort}`,
|
|
72
105
|
CES_SERVICE_TOKEN: TEST_SERVICE_TOKEN,
|
|
@@ -76,8 +109,27 @@ async function startGateway(): Promise<void> {
|
|
|
76
109
|
stdio: ["ignore", "pipe", "pipe"],
|
|
77
110
|
});
|
|
78
111
|
|
|
112
|
+
// Collect stderr for diagnostics on failure.
|
|
113
|
+
const stderrChunks: Buffer[] = [];
|
|
114
|
+
gatewayProc.stderr?.on("data", (chunk: Buffer) => stderrChunks.push(chunk));
|
|
115
|
+
|
|
116
|
+
// Track early exit so we can fail fast instead of polling for 30s.
|
|
117
|
+
let earlyExitCode: number | null = null;
|
|
118
|
+
let earlyExitSignal: string | null = null;
|
|
119
|
+
gatewayProc.on("exit", (code, signal) => {
|
|
120
|
+
earlyExitCode = code;
|
|
121
|
+
earlyExitSignal = signal;
|
|
122
|
+
});
|
|
123
|
+
|
|
79
124
|
const deadline = Date.now() + 30_000;
|
|
80
125
|
while (Date.now() < deadline) {
|
|
126
|
+
// If the process already died, fail immediately with stderr.
|
|
127
|
+
if (earlyExitCode !== null || earlyExitSignal !== null) {
|
|
128
|
+
const stderr = Buffer.concat(stderrChunks).toString().slice(-2000);
|
|
129
|
+
throw new Error(
|
|
130
|
+
`Gateway exited early (code=${earlyExitCode}, signal=${earlyExitSignal})\n${stderr}`,
|
|
131
|
+
);
|
|
132
|
+
}
|
|
81
133
|
try {
|
|
82
134
|
const res = await fetch(`http://localhost:${gatewayPort}/healthz`);
|
|
83
135
|
if (res.ok) return;
|
|
@@ -86,7 +138,10 @@ async function startGateway(): Promise<void> {
|
|
|
86
138
|
}
|
|
87
139
|
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
88
140
|
}
|
|
89
|
-
|
|
141
|
+
const stderr = Buffer.concat(stderrChunks).toString().slice(-2000);
|
|
142
|
+
throw new Error(
|
|
143
|
+
`Gateway failed to start within 30 seconds\nstderr: ${stderr}`,
|
|
144
|
+
);
|
|
90
145
|
}
|
|
91
146
|
|
|
92
147
|
function startFakeCes(opts: {
|
|
@@ -94,11 +149,12 @@ function startFakeCes(opts: {
|
|
|
94
149
|
credentials?: Record<string, string>;
|
|
95
150
|
resolveValue?: (account: string) => string | undefined;
|
|
96
151
|
}): void {
|
|
97
|
-
assignPorts();
|
|
98
152
|
const accounts = opts.accounts ?? Object.keys(opts.credentials ?? {});
|
|
99
153
|
const credentials = opts.credentials ?? {};
|
|
100
154
|
cesServer = Bun.serve({
|
|
101
|
-
|
|
155
|
+
// If cesPort was pre-reserved (for tests that start the gateway before
|
|
156
|
+
// the CES), bind to that port. Otherwise let the OS pick a free one.
|
|
157
|
+
port: cesPort || 0,
|
|
102
158
|
fetch(req) {
|
|
103
159
|
const authHeader = req.headers.get("authorization");
|
|
104
160
|
if (authHeader !== `Bearer ${TEST_SERVICE_TOKEN}`) {
|
|
@@ -130,17 +186,22 @@ function startFakeCes(opts: {
|
|
|
130
186
|
return new Response("Not Found", { status: 404 });
|
|
131
187
|
},
|
|
132
188
|
});
|
|
189
|
+
cesPort = cesServer.port!;
|
|
133
190
|
}
|
|
134
191
|
|
|
135
|
-
afterEach(() => {
|
|
192
|
+
afterEach(async () => {
|
|
136
193
|
cesServer?.stop(true);
|
|
137
194
|
cesServer = null;
|
|
138
195
|
gatewayPort = 0;
|
|
139
196
|
cesPort = 0;
|
|
140
197
|
|
|
141
198
|
if (gatewayProc) {
|
|
142
|
-
gatewayProc
|
|
199
|
+
const proc = gatewayProc;
|
|
143
200
|
gatewayProc = null;
|
|
201
|
+
proc.kill("SIGKILL");
|
|
202
|
+
// Wait for the process to actually exit so ports and file handles are
|
|
203
|
+
// fully released before the next test starts.
|
|
204
|
+
await waitForExit(proc);
|
|
144
205
|
}
|
|
145
206
|
|
|
146
207
|
rmSync(testDir, { recursive: true, force: true });
|
|
@@ -151,6 +212,11 @@ describe("gateway managed credential bootstrap retry", () => {
|
|
|
151
212
|
mkdirSync(testDir, { recursive: true });
|
|
152
213
|
writeCredentialMetadata();
|
|
153
214
|
|
|
215
|
+
// Reserve the CES port before starting the gateway so the gateway
|
|
216
|
+
// knows where CES will eventually appear. CES isn't running yet —
|
|
217
|
+
// the gateway's managed bootstrap will get ECONNREFUSED until we
|
|
218
|
+
// start the fake CES below.
|
|
219
|
+
cesPort = await getFreePort();
|
|
154
220
|
await startGateway();
|
|
155
221
|
|
|
156
222
|
const base = `http://localhost:${gatewayPort}`;
|
|
@@ -86,7 +86,7 @@ function encrypt(
|
|
|
86
86
|
|
|
87
87
|
/**
|
|
88
88
|
* Write Telegram bot_token and webhook_secret into the encrypted store
|
|
89
|
-
* at $
|
|
89
|
+
* at $GATEWAY_SECURITY_DIR/keys.enc, using the same key
|
|
90
90
|
* derivation the gateway's credential-reader will use to decrypt.
|
|
91
91
|
*/
|
|
92
92
|
function writeEncryptedStore(botToken: string, webhookSecret: string): void {
|
|
@@ -195,7 +195,8 @@ async function startGateway(): Promise<void> {
|
|
|
195
195
|
gatewayProc = spawn("bun", ["run", gatewayEntry], {
|
|
196
196
|
env: {
|
|
197
197
|
...process.env,
|
|
198
|
-
|
|
198
|
+
GATEWAY_SECURITY_DIR: join(testDir, ".vellum", "protected"),
|
|
199
|
+
VELLUM_WORKSPACE_DIR: join(testDir, ".vellum", "workspace"),
|
|
199
200
|
GATEWAY_PORT: String(port),
|
|
200
201
|
// Ensure Telegram is NOT configured via env vars
|
|
201
202
|
TELEGRAM_BOT_TOKEN: "",
|
|
@@ -57,10 +57,10 @@ const TEST_REGISTRY = {
|
|
|
57
57
|
],
|
|
58
58
|
};
|
|
59
59
|
|
|
60
|
-
const
|
|
60
|
+
const savedGatewaySecurityDir = process.env.GATEWAY_SECURITY_DIR;
|
|
61
61
|
|
|
62
62
|
beforeEach(() => {
|
|
63
|
-
process.env.
|
|
63
|
+
process.env.GATEWAY_SECURITY_DIR = protectedDir;
|
|
64
64
|
mkdirSync(protectedDir, { recursive: true });
|
|
65
65
|
writeFileSync(defaultsPath, JSON.stringify(TEST_REGISTRY, null, 2));
|
|
66
66
|
// Point registry resolution at the isolated test file first
|
|
@@ -71,10 +71,10 @@ beforeEach(() => {
|
|
|
71
71
|
});
|
|
72
72
|
|
|
73
73
|
afterEach(() => {
|
|
74
|
-
if (
|
|
75
|
-
delete process.env.
|
|
74
|
+
if (savedGatewaySecurityDir === undefined) {
|
|
75
|
+
delete process.env.GATEWAY_SECURITY_DIR;
|
|
76
76
|
} else {
|
|
77
|
-
process.env.
|
|
77
|
+
process.env.GATEWAY_SECURITY_DIR = savedGatewaySecurityDir;
|
|
78
78
|
}
|
|
79
79
|
try {
|
|
80
80
|
rmSync(testDir, { recursive: true, force: true });
|