@vellumai/assistant 0.4.22 → 0.4.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/bun.lock +3 -0
  2. package/package.json +2 -1
  3. package/scripts/ipc/check-swift-decoder-drift.ts +55 -44
  4. package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +0 -90
  5. package/src/__tests__/assistant-events-sse-hardening.test.ts +9 -3
  6. package/src/__tests__/config-schema.test.ts +38 -178
  7. package/src/__tests__/conversation-routes-guardian-reply.test.ts +4 -1
  8. package/src/__tests__/credential-security-invariants.test.ts +0 -2
  9. package/src/__tests__/guardian-verify-setup-skill-regression.test.ts +2 -2
  10. package/src/__tests__/headless-browser-interactions.test.ts +0 -4
  11. package/src/__tests__/ipc-snapshot.test.ts +0 -63
  12. package/src/__tests__/onboarding-template-contract.test.ts +10 -20
  13. package/src/__tests__/relay-server.test.ts +3 -3
  14. package/src/__tests__/resolve-guardian-trust-class.test.ts +61 -0
  15. package/src/__tests__/runtime-events-sse-parity.test.ts +10 -0
  16. package/src/__tests__/runtime-events-sse.test.ts +7 -0
  17. package/src/__tests__/session-init.benchmark.test.ts +0 -4
  18. package/src/__tests__/session-runtime-assembly.test.ts +34 -8
  19. package/src/__tests__/system-prompt.test.ts +7 -1
  20. package/src/__tests__/trusted-contact-approval-notifier.test.ts +12 -8
  21. package/src/__tests__/twilio-routes-twiml.test.ts +2 -2
  22. package/src/__tests__/twilio-routes.test.ts +2 -3
  23. package/src/__tests__/voice-quality.test.ts +21 -132
  24. package/src/calls/relay-server.ts +11 -5
  25. package/src/calls/twilio-routes.ts +4 -38
  26. package/src/calls/voice-quality.ts +7 -63
  27. package/src/config/bundled-skills/guardian-verify-setup/SKILL.md +7 -10
  28. package/src/config/bundled-skills/messaging/SKILL.md +3 -5
  29. package/src/config/bundled-skills/phone-calls/SKILL.md +143 -82
  30. package/src/config/bundled-skills/sms-setup/SKILL.md +0 -20
  31. package/src/config/bundled-skills/twilio-setup/SKILL.md +9 -17
  32. package/src/config/bundled-skills/voice-setup/SKILL.md +36 -1
  33. package/src/config/bundled-skills/voice-setup/icon.svg +20 -0
  34. package/src/config/calls-schema.ts +3 -53
  35. package/src/config/elevenlabs-schema.ts +33 -0
  36. package/src/config/schema.ts +183 -137
  37. package/src/config/types.ts +0 -1
  38. package/src/daemon/daemon-control.ts +3 -0
  39. package/src/daemon/handlers/browser.ts +2 -53
  40. package/src/daemon/ipc-contract/browser.ts +5 -84
  41. package/src/daemon/ipc-contract/surfaces.ts +51 -48
  42. package/src/daemon/ipc-contract-inventory.json +0 -9
  43. package/src/daemon/session-agent-loop-handlers.ts +3 -0
  44. package/src/daemon/session-agent-loop.ts +2 -1
  45. package/src/daemon/session-runtime-assembly.ts +9 -7
  46. package/src/daemon/session-tool-setup.ts +27 -13
  47. package/src/mcp/client.ts +2 -1
  48. package/src/memory/conversation-crud.ts +339 -166
  49. package/src/memory/migrations/102-alter-table-columns.ts +254 -37
  50. package/src/memory/schema.ts +1227 -1035
  51. package/src/runtime/routes/events-routes.ts +7 -0
  52. package/src/runtime/routes/inbound-message-handler.ts +3 -4
  53. package/src/schedule/scheduler.ts +159 -45
  54. package/src/security/secure-keys.ts +3 -3
  55. package/src/tools/browser/browser-execution.ts +314 -331
  56. package/src/tools/browser/browser-handoff.ts +11 -37
  57. package/src/tools/browser/browser-manager.ts +203 -352
  58. package/src/tools/browser/browser-screencast.ts +15 -76
  59. package/src/tools/network/script-proxy/certs.ts +7 -237
  60. package/src/tools/network/script-proxy/connect-tunnel.ts +1 -82
  61. package/src/tools/network/script-proxy/http-forwarder.ts +2 -151
  62. package/src/tools/network/script-proxy/logging.ts +12 -196
  63. package/src/tools/network/script-proxy/mitm-handler.ts +2 -270
  64. package/src/tools/network/script-proxy/policy.ts +4 -152
  65. package/src/tools/network/script-proxy/router.ts +2 -60
  66. package/src/tools/network/script-proxy/server.ts +5 -137
  67. package/src/tools/network/script-proxy/types.ts +19 -125
  68. package/src/tools/system/voice-config.ts +23 -1
  69. package/src/util/logger.ts +4 -1
  70. package/src/__tests__/elevenlabs-config.test.ts +0 -95
  71. package/src/__tests__/twilio-routes-elevenlabs.test.ts +0 -407
  72. package/src/calls/elevenlabs-config.ts +0 -32
@@ -11,7 +11,7 @@ You are helping the user set up and manage phone calls via Twilio. This skill co
11
11
 
12
12
  ## Overview
13
13
 
14
- The calling system uses Twilio's ConversationRelay for both **outbound** and **inbound** voice calls. Twilio works out of the box as the default voice provider. Optionally, you can enable ElevenLabs integration for higher-quality, more natural-sounding voices but this is entirely optional.
14
+ The calling system uses Twilio's ConversationRelay for both **outbound** and **inbound** voice calls with **ElevenLabs** providing the text-to-speech voice. After Twilio setup, the assistant configures ElevenLabs as the TTS provider and prompts the user to choose a voice from a curated list of supported options.
15
15
 
16
16
  ### Outbound calls
17
17
 
@@ -34,14 +34,6 @@ When someone dials the assistant's Twilio phone number:
34
34
  5. Once verified (or if no challenge is pending), the LLM orchestrator greets the caller in a receptionist style: "Hello, this is [user]'s assistant. How can I help you today?"
35
35
  6. The assistant converses naturally, using ASK_GUARDIAN to consult the user when needed, just like outbound calls.
36
36
 
37
- Three voice quality modes are available:
38
-
39
- - **`twilio_standard`** (default) — Fully supported. Standard Twilio TTS with Google voices. No extra setup required.
40
- - **`twilio_elevenlabs_tts`** — Fully supported. Uses ElevenLabs voices through Twilio ConversationRelay for more natural speech.
41
- - **`elevenlabs_agent`** — **Experimental/restricted.** Full ElevenLabs conversational agent mode. Consultation bridging (`waiting_on_user`) is not yet supported in this mode; the runtime guard blocks it before any ElevenLabs API calls are made. See the "Runtime behavior" section below for fallback and strict-fail details.
42
-
43
- You can keep using Twilio only — no changes needed. Enabling ElevenLabs can improve naturalness and quality.
44
-
45
37
  The user's assistant gets its own personal phone number through Twilio. All implicit calls (without an explicit mode) always use this assistant number. Optionally, users can call from their own phone number if it's authorized with the Twilio account — this must be explicitly requested per call via `caller_identity_mode="user_number"`.
46
38
 
47
39
  ## Step 1: Verify Twilio Setup
@@ -79,18 +71,105 @@ Verify:
79
71
  vellum config get calls.enabled
80
72
  ```
81
73
 
82
- ## Step 3: Verify Setup (Test Call)
74
+ ## Step 3: Choose a Voice
75
+
76
+ After enabling calls, let the user choose an ElevenLabs voice. Twilio has a native ElevenLabs integration — no separate ElevenLabs account or API key is needed.
77
+
78
+ ### Voice consistency with in-app TTS
79
+
80
+ The shared config key `elevenlabs.voiceId` is the single source of truth for ElevenLabs voice identity. Both in-app TTS and phone calls read from it (defaulting to **Rachel** — `21m00Tcm4TlvDq8ikWAM`).
81
+
82
+ Before presenting the voice list, check the current shared voice:
83
+
84
+ ```bash
85
+ vellum config get elevenlabs.voiceId
86
+ ```
87
+
88
+ **If a non-default voice is already set**, the user chose it during voice-setup or a previous session. Tell them:
89
+
90
+ > "Your assistant currently uses [voice name] for both in-app chat and phone calls. I'll keep the same voice for calls. You can change it if you'd like."
91
+
92
+ Skip the selection prompt unless the user wants to change.
93
+
94
+ **If the default (Rachel) is set or no override exists**, present the curated voice list below and let them pick. When they choose, set the shared config so both in-app TTS and phone calls use it, as described under **Voice selection** below.
95
+
96
+ ### Voice selection
97
+
98
+ Present the user with a list of supported ElevenLabs voices. These are pre-made voices with stable IDs that work with Twilio ConversationRelay out of the box.
99
+
100
+ **Ask the user: "Which voice would you like your assistant to use on phone calls?"**
101
+
102
+ Present these voices grouped by category:
103
+
104
+ #### Female voices
105
+
106
+ | Voice | Style | Voice ID |
107
+ | --------- | ------------------------------ | ------------------------------ |
108
+ | Rachel | Calm, warm, conversational | `21m00Tcm4TlvDq8ikWAM` |
109
+ | Sarah | Soft, young, approachable | `EXAVITQu4vr4xnSDxMaL` |
110
+ | Charlotte | Warm, Swedish-accented | `XB0fDUnXU5powFXDhCwa` |
111
+ | Alice | Confident, British | `Xb7hH8MSUJpSbSDYk0k2` |
112
+ | Matilda | Warm, friendly, young | `XrExE9yKIg1WjnnlVkGX` |
113
+ | Lily | Warm, British | `pFZP5JQG7iQjIQuC4Bku` |
114
+
115
+ #### Male voices
116
+
117
+ | Voice | Style | Voice ID |
118
+ | ------- | -------------------------------- | ------------------------------ |
119
+ | Antoni | Warm, well-rounded | `ErXwobaYiN019PkySvjV` |
120
+ | Josh | Deep, young, clear | `TxGEqnHWrfWFTfGW9XjX` |
121
+ | Arnold | Crisp, narrative | `VR6AewLTigWG4xSOukaG` |
122
+ | Adam | Deep, middle-aged, professional | `pNInz6obpgDQGcFmaJgB` |
123
+ | Bill | Trustworthy, American | `pqHfZKP75CvOlQylNhV4` |
124
+ | George | Warm, British, distinguished | `JBFqnCBsd6RMkjVDRZzb` |
125
+ | Daniel | Authoritative, British | `onwK4e9ZLuTAKqWW03F9` |
126
+ | Charlie | Casual, Australian | `IKne3meq5aSn9XLyUdCD` |
127
+ | Liam | Young, articulate | `TX3LPaxmHKxFdv7VOQHJ` |
128
+
129
+ After the user picks a voice, use `voice_config_update` to set the shared voice ID. This writes to the config file (`elevenlabs.voiceId`) for phone calls **and** pushes to the macOS app via IPC (`ttsVoiceId`) for in-app TTS in one call:
130
+
131
+ ```
132
+ voice_config_update setting="tts_voice_id" value="<selected-voice-id>"
133
+ ```
134
+
135
+ **If the user wants a voice not on this list**, they can browse more voices at https://elevenlabs.io/voice-library and provide the voice ID manually.
136
+
137
+ ## Step 4: Verify Setup (Test Call)
83
138
 
84
139
  Before making real calls, offer a quick verification:
85
140
 
86
141
  1. Confirm credentials are stored: check the Twilio config endpoint for `hasCredentials: true` and `phoneNumber`
87
142
  2. Confirm ingress is running: `ingress.publicBaseUrl` must be set and the tunnel active
88
143
  3. Confirm calls are enabled: `calls.enabled` must be `true`
144
+ 4. Confirm voice is configured: `elevenlabs.voiceId` should be set
89
145
 
90
- Suggest a test call to the user's own phone: **"Want to do a quick test call to your phone to make sure everything works?"**
146
+ Suggest a test call to the user's own phone: **"Want to do a quick test call to your phone to make sure everything works? This is a good way to hear how your chosen voice sounds."**
91
147
 
92
148
  If they agree, ask for their personal phone number and place a test call with a simple task like "Introduce yourself and confirm the call system is working."
93
149
 
150
+ ## Step 5: Verify Guardian Identity (Voice)
151
+
152
+ Now link the user's phone number as the trusted voice guardian. Tell the user: "Now let's verify your guardian identity for voice. This links your phone number so the assistant can verify inbound callers."
153
+
154
+ Load the **guardian-verify-setup** skill to handle the verification flow:
155
+
156
+ - Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
157
+
158
+ When invoking the skill, indicate the channel is `voice`. The guardian-verify-setup skill manages the full outbound verification flow, including:
159
+
160
+ - Collecting the user's phone number as the destination
161
+ - Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "voice"`
162
+ - Calling the phone number and providing a code for the user to enter via their phone's keypad
163
+ - Proactively polling for completion (voice auto-check) so the user gets instant confirmation
164
+ - Checking guardian status to confirm the binding was created
165
+ - Handling resend, cancel, and error cases
166
+
167
+ Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted voice guardian."_
168
+
169
+ After the guardian-verify-setup skill completes (or the user skips), continue to the next sections.
170
+
171
+ **Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed without blocking. Once verified, inbound callers can be prompted for voice verification before calls proceed (see the **Guardian voice verification for inbound calls** section below).
172
+
94
173
  ## Caller Identity
95
174
 
96
175
  All implicit calls (calls without an explicit `caller_identity_mode`) always use the assistant's Twilio phone number. This is the number that appears on the recipient's caller ID.
@@ -133,88 +212,83 @@ An optional verification step where the callee must enter a numeric code via the
133
212
  | `calls.verification.enabled` | Enable DTMF callee verification | `false` |
134
213
  | `calls.verification.codeLength` | Number of digits in the verification code | `6` |
135
214
 
136
- ## Optional: Higher Quality Voice with ElevenLabs
215
+ ## Advanced Voice Configuration
137
216
 
138
- ElevenLabs integration is entirely optional. The standard Twilio-only setup works unchanged — this section is only relevant if you want to improve voice quality.
217
+ ElevenLabs is the TTS provider for all calls. This section covers advanced voice selection and tuning.
139
218
 
140
- ### Mode: `twilio_elevenlabs_tts`
219
+ ### Changing the voice
141
220
 
142
- Uses ElevenLabs voices through Twilio's ConversationRelay. Speech is more natural-sounding than the default Google TTS voices.
221
+ To switch to a different voice after initial setup, use `voice_config_update` to set the shared voice ID. This writes to the config file (`elevenlabs.voiceId`) for phone calls **and** pushes to the macOS app via IPC for in-app TTS:
143
222
 
144
- **Recommended user-friendly workflow (no technical IDs required):**
145
-
146
- 1. Ask what kind of voice the user wants (examples: "warm", "professional", "playful", "calm", "deeper", "brighter")
147
- 2. If the user doesn't care, keep `twilio_standard` (simplest path)
148
- 3. If they want higher-quality voice, switch to `twilio_elevenlabs_tts` and choose a matching ElevenLabs voice on their behalf
223
+ ```
224
+ voice_config_update setting="tts_voice_id" value="<new-voice-id>"
225
+ ```
149
226
 
150
- The user should not need to know what a `voiceId` is unless they explicitly want advanced/manual control.
227
+ Browse more voices at https://elevenlabs.io/voice-library.
151
228
 
152
- **Manual/advanced setup (optional):**
229
+ ### Advanced voice selection with an ElevenLabs account
153
230
 
154
- ```bash
155
- vellum config set calls.voice.mode twilio_elevenlabs_tts
156
- vellum config set calls.voice.elevenlabs.voiceId "<your-voice-id>"
157
- ```
231
+ Users who have an ElevenLabs account and API key (e.g., from the **voice-setup** skill) can go beyond the curated voice list. With an API key, they can:
158
232
 
159
- By default, the system sends a **bare** `voiceId` to Twilio ConversationRelay (no model/tuning suffix). This is the safest default across voice IDs.
233
+ - **Browse the full ElevenLabs voice library programmatically** — the ElevenLabs API (`GET https://api.elevenlabs.io/v2/voices`) supports searching by name, category, language, and accent. This returns voice IDs, names, labels, and preview URLs.
234
+ - **Use custom or cloned voices** — if the user has created a custom voice or voice clone in their ElevenLabs account, they can use its voice ID here. These voices are available in Twilio ConversationRelay just like pre-made voices.
235
+ - **Preview voices before choosing** — each voice in the API response includes a `preview_url` with an audio sample.
160
236
 
161
- If you want to force Twilio's extended voice spec, you can optionally set a model ID:
237
+ To check if the user has an API key stored:
162
238
 
163
239
  ```bash
164
- vellum config set calls.voice.elevenlabs.voiceModelId "flash_v2_5"
240
+ credential_store action=get service=elevenlabs field=api_key
165
241
  ```
166
242
 
167
- When `voiceModelId` is set, the emitted voice string becomes:
168
- `voiceId-model-speed_stability_similarity`.
169
-
170
- ### Mode: `elevenlabs_agent` (experimental/restricted)
243
+ If they have a key and want to browse voices, fetch the voice list:
171
244
 
172
- Full ElevenLabs conversational agent mode. This requires an ElevenLabs account with an agent configured on their platform.
245
+ ```bash
246
+ curl -s "https://api.elevenlabs.io/v2/voices?category=premade&page_size=50" \
247
+ -H "xi-api-key: <api_key_from_credential_store>" | python3 -m json.tool
248
+ ```
173
249
 
174
- **Restriction:** This mode is currently restricted because consultation bridging (`waiting_on_user`) is not yet supported. A runtime guard in `handleVoiceWebhook` blocks `elevenlabs_agent` before any ElevenLabs API calls are made.
250
+ To search for a specific voice style:
175
251
 
176
- **Setup:**
252
+ ```bash
253
+ curl -s "https://api.elevenlabs.io/v2/voices?search=warm+female&page_size=10" \
254
+ -H "xi-api-key: <api_key_from_credential_store>" | python3 -m json.tool
255
+ ```
177
256
 
178
- 1. Store your ElevenLabs API key securely:
257
+ After the user picks a voice, set the shared voice ID:
179
258
 
180
259
  ```
181
- credential_store action=store service=elevenlabs field=api_key value=<your_api_key>
260
+ voice_config_update setting="tts_voice_id" value="<selected-voice-id>"
182
261
  ```
183
262
 
184
- 2. Set the voice mode and agent ID:
263
+ ### Voice tuning parameters
185
264
 
186
- ```bash
187
- vellum config set calls.voice.mode elevenlabs_agent
188
- vellum config set calls.voice.elevenlabs.agentId "<your-agent-id>"
189
- ```
265
+ Fine-tune how the selected voice sounds. These parameters apply to all ElevenLabs modes:
190
266
 
191
- ### Fallback behavior and `fallbackToStandardOnError`
267
+ ```bash
268
+ # Playback speed (0.7 = slower, 1.0 = normal, 1.2 = faster)
269
+ vellum config set elevenlabs.speed 1.0
192
270
 
193
- By default, `calls.voice.fallbackToStandardOnError` is `true`. This setting controls what happens when an ElevenLabs mode encounters errors or is restricted.
271
+ # Stability (0.0 = more expressive/variable, 1.0 = more consistent/monotone)
272
+ vellum config set elevenlabs.stability 0.5
194
273
 
195
- #### Invalid configuration (e.g., missing voiceId or agentId)
274
+ # Similarity boost (0.0 = more creative, 1.0 = closer to original voice)
275
+ vellum config set elevenlabs.similarityBoost 0.75
276
+ ```
196
277
 
197
- - **`true` (default):** The profile resolver silently falls back to `twilio_standard` mode and logs a warning. The call proceeds with standard Twilio TTS.
198
- - **`false`:** The voice webhook returns **HTTP 500** with the specific configuration error details (e.g., `"Voice quality configuration error: calls.voice.elevenlabs.voiceId is required..."`).
278
+ Lower stability makes the voice more expressive but less predictable — good for conversational calls. Higher stability is better for scripted/formal calls.
199
279
 
200
- #### `elevenlabs_agent` mode guard (consultation bridging unsupported)
280
+ ### Voice model tuning
201
281
 
202
- - **`true` (default):** The `elevenlabs_agent` mode is silently downgraded to standard ConversationRelay TwiML with a warning log. The call proceeds normally with standard Twilio TTS. No ElevenLabs API calls are made.
203
- - **`false`:** The voice webhook returns **HTTP 501** with the message: `"elevenlabs_agent mode is restricted: consultation bridging (waiting_on_user) is not yet supported."`. No ElevenLabs API calls are made.
282
+ By default, the system sends a **bare** `voiceId` to Twilio ConversationRelay (no model/tuning suffix). This is the safest default across voice IDs.
204
283
 
205
- You can disable fallback if you want strict ElevenLabs-only behavior:
284
+ If you want to force Twilio's extended voice spec, you can optionally set a model ID:
206
285
 
207
286
  ```bash
208
- vellum config set calls.voice.fallbackToStandardOnError false
287
+ vellum config set elevenlabs.voiceModelId "flash_v2_5"
209
288
  ```
210
289
 
211
- ### Reverting to standard Twilio
212
-
213
- To go back to the default voice at any time:
214
-
215
- ```bash
216
- vellum config set calls.voice.mode twilio_standard
217
- ```
290
+ When `voiceModelId` is set, the emitted voice string becomes:
291
+ `voiceId-model-speed_stability_similarity`.
218
292
 
219
293
  ## Making Outbound Calls
220
294
 
@@ -477,16 +551,13 @@ All call-related settings can be managed via `vellum config`:
477
551
  | `calls.model` | Override LLM model for call orchestration | _(uses default model)_ |
478
552
  | `calls.callerIdentity.allowPerCallOverride` | Allow per-call caller identity selection | `true` |
479
553
  | `calls.callerIdentity.userNumber` | E.164 phone number for user-number mode | _(empty)_ |
480
- | `calls.voice.mode` | Voice quality mode (`twilio_standard`, `twilio_elevenlabs_tts`, `elevenlabs_agent`) | `twilio_standard` |
481
554
  | `calls.voice.language` | Language code for TTS and transcription | `en-US` |
482
555
  | `calls.voice.transcriptionProvider` | Speech-to-text provider (`Deepgram`, `Google`) | `Deepgram` |
483
- | `calls.voice.fallbackToStandardOnError` | Auto-fallback to standard Twilio TTS on ElevenLabs errors | `true` |
484
- | `calls.voice.elevenlabs.voiceId` | Advanced/internal ElevenLabs voice identifier. Usually set by the assistant based on requested voice style | _(empty)_ |
485
- | `calls.voice.elevenlabs.voiceModelId` | Optional Twilio ConversationRelay model suffix. Leave empty to send bare `voiceId` | _(empty)_ |
486
- | `calls.voice.elevenlabs.agentId` | ElevenLabs agent ID (for `elevenlabs_agent` mode) | _(empty)_ |
487
- | `calls.voice.elevenlabs.speed` | Playback speed (`0.7` – `1.2`) | `1.0` |
488
- | `calls.voice.elevenlabs.stability` | Voice stability (`0.0` – `1.0`) | `0.5` |
489
- | `calls.voice.elevenlabs.similarityBoost` | Voice similarity boost (`0.0` – `1.0`) | `0.75` |
556
+ | `elevenlabs.voiceId` | ElevenLabs voice ID used by both in-app TTS and phone calls. Set during setup from the curated voice list. Defaults to Rachel | `21m00Tcm4TlvDq8ikWAM` |
557
+ | `elevenlabs.voiceModelId` | Optional Twilio ConversationRelay model suffix. Leave empty to send bare `voiceId` | _(empty)_ |
558
+ | `elevenlabs.speed` | Playback speed (`0.7` – `1.2`) | `1.0` |
559
+ | `elevenlabs.stability` | Voice stability (`0.0` – `1.0`) | `0.5` |
560
+ | `elevenlabs.similarityBoost` | Voice similarity boost (`0.0` – `1.0`) | `0.75` |
490
561
 
491
562
  ### Adjusting settings
492
563
 
@@ -560,25 +631,15 @@ Or re-run the public-ingress skill to auto-detect and save the new URL.
560
631
 
561
632
  The system has a 30-second silence timeout. If nobody speaks for 30 seconds, the agent will ask "Are you still there?" This is expected behavior.
562
633
 
563
- ### Call quality didn't improve after enabling ElevenLabs
634
+ ### Call quality sounds off
564
635
 
565
- - Verify `calls.voice.mode` is set to `twilio_elevenlabs_tts` or `elevenlabs_agent` (not still `twilio_standard`)
636
+ - Verify `elevenlabs.voiceId` is set to a valid ElevenLabs voice ID
566
637
  - Ask for the desired voice style again and try a different voice selection
567
- - If configuring manually: check that `calls.voice.elevenlabs.voiceId` contains a valid ElevenLabs voice ID
568
- - If mode is `elevenlabs_agent`, ensure `calls.voice.elevenlabs.agentId` is also set
569
638
 
570
639
  ### Twilio says "application error" right after answer
571
640
 
572
641
  - This often means ConversationRelay rejected voice configuration after TwiML fetch
573
- - Keep `calls.voice.elevenlabs.voiceModelId` empty first (bare `voiceId` mode)
642
+ - Keep `elevenlabs.voiceModelId` empty first (bare `voiceId` mode)
574
643
  - If you set `voiceModelId`, try clearing it and retesting:
575
- `vellum config set calls.voice.elevenlabs.voiceModelId ""`
576
-
577
- ### ElevenLabs mode falls back to standard
578
-
579
- When `calls.voice.fallbackToStandardOnError` is `true` (the default), the system silently falls back to standard Twilio TTS if ElevenLabs encounters an error or restriction. Check:
644
+ `vellum config set elevenlabs.voiceModelId ""`
580
645
 
581
- - For `elevenlabs_agent` mode: this mode is currently restricted (consultation bridging not yet supported) and will always fall back to standard when fallback is enabled. If fallback is disabled, the voice webhook returns HTTP 501.
582
- - For `twilio_elevenlabs_tts` mode: verify `calls.voice.elevenlabs.voiceId` is set to a valid voice ID
583
- - For invalid configs (missing voiceId/agentId): if fallback is disabled, the voice webhook returns HTTP 500 with the config error
584
- - Review daemon logs for warning messages about fallback or guard activation
@@ -144,26 +144,6 @@ After deletion, return to Step 3b to collect information and resubmit. Warn the
144
144
 
145
145
  **On failure:** Report the exact error message and guide the user through resolution.
146
146
 
147
- ## Step 3.5: Guardian Verification (SMS)
148
-
149
- Now link the user's phone number as the trusted SMS guardian. Tell the user: "Now let's verify your guardian identity for SMS. This links your phone number as the trusted guardian for SMS messaging."
150
-
151
- Load the **guardian-verify-setup** skill to handle the verification flow:
152
-
153
- - Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
154
-
155
- When invoking the skill, indicate the channel is `sms`. The guardian-verify-setup skill manages the full outbound verification flow, including:
156
-
157
- - Collecting the user's phone number as the destination (accepts any common format -- the API normalizes to E.164)
158
- - Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "sms"`
159
- - Sending a 6-digit code to the phone number that the user must reply with from the SMS channel
160
- - Checking guardian status to confirm the binding was created
161
- - Handling resend, cancel, and error cases
162
-
163
- Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted SMS guardian."_
164
-
165
- **Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed to Step 4 without blocking.
166
-
167
147
  ## Step 4: Test Send
168
148
 
169
149
  Run a test SMS to verify end-to-end delivery:
@@ -215,44 +215,36 @@ Confirm:
215
215
 
216
216
  Tell the user: **"Twilio is configured. Your assistant's phone number is {phoneNumber}. This number is used for both voice calls and SMS messaging."**
217
217
 
218
- ## Step 5.5: Guardian Verification (SMS and Voice)
218
+ ## Step 5.5: Guardian Verification (Voice)
219
219
 
220
- Now link the user's phone number as the trusted guardian for SMS and/or voice channels. Tell the user: "Now let's verify your guardian identity. This links your phone number as the trusted guardian for messaging and calls."
220
+ Now link the user's phone number as the trusted voice guardian. Tell the user: "Now let's verify your guardian identity for voice. This links your phone number so the assistant can verify inbound callers."
221
221
 
222
222
  Load the **guardian-verify-setup** skill to handle the verification flow:
223
223
 
224
224
  - Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
225
225
 
226
- The guardian-verify-setup skill manages the full outbound verification flow for **one channel at a time** (sms, voice, or telegram). Each invocation handles:
226
+ When invoking the skill, indicate the channel is `voice`. The guardian-verify-setup skill manages the full outbound verification flow, including:
227
227
 
228
228
  - Collecting the user's phone number as the destination (accepts any common format -- the API normalizes to E.164)
229
- - Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start`
230
- - For **SMS**: sending a 6-digit code to the phone number that the user must reply with from the SMS channel
231
- - For **voice**: calling the phone number and providing a code for the user to enter via their phone's keypad
229
+ - Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "voice"`
230
+ - Calling the phone number and providing a code for the user to enter via their phone's keypad
231
+ - Proactively polling for completion (voice auto-check) so the user gets instant confirmation
232
232
  - Checking guardian status to confirm the binding was created
233
233
  - Handling resend, cancel, and error cases
234
234
 
235
- **If the user wants to verify both SMS and voice**, load the skill twice -- once for SMS and once for voice. Each channel requires its own separate verification session.
235
+ Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted voice guardian."_
236
236
 
237
- Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted guardian. We'll verify one channel at a time."_
238
-
239
- After the guardian-verify-setup skill completes verification for a channel, load it again for the next channel if needed. Once all desired channels are verified (or the user skips), continue to Step 6.
237
+ After the guardian-verify-setup skill completes (or the user skips), continue to Step 6.
240
238
 
241
239
  **Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed to Step 6 without blocking.
242
240
 
243
- To re-check guardian status later, query the channel(s) that were verified:
241
+ To re-check guardian status later:
244
242
 
245
243
  ```bash
246
- # Check SMS guardian status
247
- curl -s "$INTERNAL_GATEWAY_BASE_URL/v1/integrations/guardian/status?channel=sms" \
248
- -H "Authorization: Bearer $GATEWAY_AUTH_TOKEN"
249
- # Check voice guardian status
250
244
  curl -s "$INTERNAL_GATEWAY_BASE_URL/v1/integrations/guardian/status?channel=voice" \
251
245
  -H "Authorization: Bearer $GATEWAY_AUTH_TOKEN"
252
246
  ```
253
247
 
254
- Check the status for whichever channel(s) the user actually verified (SMS, voice, or both). Report the guardian verification result per channel: **"Guardian identity — SMS: {verified | not configured}, Voice: {verified | not configured}."**
255
-
256
248
  ## Step 6: Enable Features
257
249
 
258
250
  Now that Twilio is configured, the user can enable the features that depend on it:
@@ -9,7 +9,7 @@ You are helping the user set up and troubleshoot voice features (push-to-talk, w
9
9
 
10
10
  ## Available Tools
11
11
 
12
- - `voice_config_update` — Change any voice setting (PTT key, wake word enabled/keyword/timeout)
12
+ - `voice_config_update` — Change any voice setting (PTT key, wake word enabled/keyword/timeout, TTS voice ID)
13
13
  - `open_system_settings` — Open macOS System Settings to a specific privacy pane
14
14
  - `navigate_settings_tab` — Open the Vellum settings panel to the Voice tab
15
15
  - `credential_store` — Collect API keys securely (for ElevenLabs TTS)
@@ -66,6 +66,41 @@ Ask if they want high-quality text-to-speech voices via ElevenLabs (optional —
66
66
  2. Use `credential_store` with `action: "prompt"`, `service: "elevenlabs"`, `field: "api_key"` to show a secure input dialog.
67
67
  3. After the key is stored, confirm success.
68
68
 
69
+ #### Choose an ElevenLabs voice
70
+
71
+ After storing the API key, let the user pick their preferred voice. The shared config key `elevenlabs.voiceId` controls the voice for **both** in-app TTS and phone calls (defaulting to Rachel).
72
+
73
+ Check the current voice:
74
+
75
+ ```bash
76
+ vellum config get elevenlabs.voiceId
77
+ ```
78
+
79
+ Ask the user if they want to change their TTS voice. If yes, use `voice_config_update` with `setting: "tts_voice_id"` and the chosen voice ID. In a single call, this both writes the value to the config file (`elevenlabs.voiceId`) and pushes it to the macOS app via IPC.
80
+
81
+ Common choices from the curated ElevenLabs list:
82
+ - **Rachel** (`21m00Tcm4TlvDq8ikWAM`) — Calm, warm, conversational (default)
83
+ - **Sarah** (`EXAVITQu4vr4xnSDxMaL`) — Soft, young, approachable
84
+ - **Charlotte** (`XB0fDUnXU5powFXDhCwa`) — Warm, Swedish-accented
85
+ - **Josh** (`TxGEqnHWrfWFTfGW9XjX`) — Deep, young, clear
86
+ - **Adam** (`pNInz6obpgDQGcFmaJgB`) — Deep, middle-aged, professional
87
+
88
+ If the user wants to browse more voices, they can search at https://elevenlabs.io/voice-library or use the ElevenLabs API with their key.
89
+
90
+ #### Sync with phone calls
91
+
92
+ After setting the voice, check whether phone calls are configured:
93
+
94
+ ```bash
95
+ vellum config get calls.enabled
96
+ ```
97
+
98
+ **If phone calls are enabled** (`calls.enabled` is `true`):
99
+ - Tell the user their phone calls will automatically use the same voice they just chose, since both in-app TTS and phone calls read from `elevenlabs.voiceId`.
100
+
101
+ **If phone calls are not yet configured** (`calls.enabled` is `false` or not set):
102
+ - Tell the user: "When you set up phone calls later, they'll automatically use the same voice for a consistent experience."
103
+
69
104
  ### 5. Verification
70
105
 
71
106
  After setup is complete:
@@ -0,0 +1,20 @@
1
+ <svg viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg">
2
+ <rect x="7" y="1" width="2" height="2" fill="#4A90E2"/>
3
+ <rect x="6" y="3" width="4" height="1" fill="#4A90E2"/>
4
+ <rect x="5" y="4" width="6" height="1" fill="#7B68EE"/>
5
+ <rect x="4" y="5" width="8" height="1" fill="#7B68EE"/>
6
+ <rect x="4" y="6" width="8" height="1" fill="#4A90E2"/>
7
+ <rect x="5" y="7" width="6" height="1" fill="#4A90E2"/>
8
+ <rect x="6" y="8" width="4" height="1" fill="#7B68EE"/>
9
+ <rect x="7" y="9" width="2" height="1" fill="#7B68EE"/>
10
+ <rect x="3" y="10" width="2" height="4" fill="#50C878"/>
11
+ <rect x="4" y="10" width="1" height="1" fill="#50C878"/>
12
+ <rect x="5" y="11" width="1" height="1" fill="#50C878"/>
13
+ <rect x="11" y="10" width="2" height="4" fill="#E74C3C"/>
14
+ <rect x="10" y="10" width="1" height="1" fill="#E74C3C"/>
15
+ <rect x="9" y="11" width="1" height="1" fill="#E74C3C"/>
16
+ <rect x="6" y="11" width="4" height="1" fill="#FFD700"/>
17
+ <rect x="5" y="12" width="6" height="1" fill="#FFD700"/>
18
+ <rect x="4" y="13" width="8" height="1" fill="#FFD700"/>
19
+ <rect x="7" y="14" width="2" height="2" fill="#4A90E2"/>
20
+ </svg>
@@ -1,7 +1,6 @@
1
1
  import { z } from 'zod';
2
2
 
3
3
  const VALID_CALL_PROVIDERS = ['twilio'] as const;
4
- const VALID_CALL_VOICE_MODES = ['twilio_standard', 'twilio_elevenlabs_tts', 'elevenlabs_agent'] as const;
5
4
  export const VALID_CALLER_IDENTITY_MODES = ['assistant_number', 'user_number'] as const;
6
5
  const VALID_CALL_TRANSCRIPTION_PROVIDERS = ['Deepgram', 'Google'] as const;
7
6
 
@@ -20,51 +19,7 @@ export const CallsSafetyConfigSchema = z.object({
20
19
  .default([]),
21
20
  });
22
21
 
23
- export const CallsElevenLabsConfigSchema = z.object({
24
- voiceId: z
25
- .string({ error: 'calls.voice.elevenlabs.voiceId must be a string' })
26
- .default(''),
27
- voiceModelId: z
28
- .string({ error: 'calls.voice.elevenlabs.voiceModelId must be a string' })
29
- .default(''),
30
- speed: z
31
- .number({ error: 'calls.voice.elevenlabs.speed must be a number' })
32
- .min(0.7, 'calls.voice.elevenlabs.speed must be >= 0.7')
33
- .max(1.2, 'calls.voice.elevenlabs.speed must be <= 1.2')
34
- .default(1.0),
35
- stability: z
36
- .number({ error: 'calls.voice.elevenlabs.stability must be a number' })
37
- .min(0, 'calls.voice.elevenlabs.stability must be >= 0')
38
- .max(1, 'calls.voice.elevenlabs.stability must be <= 1')
39
- .default(0.5),
40
- similarityBoost: z
41
- .number({ error: 'calls.voice.elevenlabs.similarityBoost must be a number' })
42
- .min(0, 'calls.voice.elevenlabs.similarityBoost must be >= 0')
43
- .max(1, 'calls.voice.elevenlabs.similarityBoost must be <= 1')
44
- .default(0.75),
45
- useSpeakerBoost: z
46
- .boolean({ error: 'calls.voice.elevenlabs.useSpeakerBoost must be a boolean' })
47
- .default(true),
48
- agentId: z
49
- .string({ error: 'calls.voice.elevenlabs.agentId must be a string' })
50
- .default(''),
51
- apiBaseUrl: z
52
- .string({ error: 'calls.voice.elevenlabs.apiBaseUrl must be a string' })
53
- .default('https://api.elevenlabs.io'),
54
- registerCallTimeoutMs: z
55
- .number({ error: 'calls.voice.elevenlabs.registerCallTimeoutMs must be a number' })
56
- .int('calls.voice.elevenlabs.registerCallTimeoutMs must be an integer')
57
- .min(1000, 'calls.voice.elevenlabs.registerCallTimeoutMs must be >= 1000')
58
- .max(15000, 'calls.voice.elevenlabs.registerCallTimeoutMs must be <= 15000')
59
- .default(5000),
60
- });
61
-
62
22
  export const CallsVoiceConfigSchema = z.object({
63
- mode: z
64
- .enum(VALID_CALL_VOICE_MODES, {
65
- error: `calls.voice.mode must be one of: ${VALID_CALL_VOICE_MODES.join(', ')}`,
66
- })
67
- .default('twilio_standard'),
68
23
  language: z
69
24
  .string({ error: 'calls.voice.language must be a string' })
70
25
  .default('en-US'),
@@ -73,10 +28,6 @@ export const CallsVoiceConfigSchema = z.object({
73
28
  error: `calls.voice.transcriptionProvider must be one of: ${VALID_CALL_TRANSCRIPTION_PROVIDERS.join(', ')}`,
74
29
  })
75
30
  .default('Deepgram'),
76
- fallbackToStandardOnError: z
77
- .boolean({ error: 'calls.voice.fallbackToStandardOnError must be a boolean' })
78
- .default(true),
79
- elevenlabs: CallsElevenLabsConfigSchema.default(CallsElevenLabsConfigSchema.parse({})),
80
31
  });
81
32
 
82
33
  export const CallerIdentityConfigSchema = z.object({
@@ -142,7 +93,7 @@ export const CallsConfigSchema = z.object({
142
93
  .int('calls.guardianWaitUpdateInitialIntervalMs must be an integer')
143
94
  .min(1000, 'calls.guardianWaitUpdateInitialIntervalMs must be >= 1000')
144
95
  .max(60_000, 'calls.guardianWaitUpdateInitialIntervalMs must be at most 60000')
145
- .default(5000),
96
+ .default(15_000),
146
97
  guardianWaitUpdateInitialWindowMs: z
147
98
  .number({ error: 'calls.guardianWaitUpdateInitialWindowMs must be a number' })
148
99
  .int('calls.guardianWaitUpdateInitialWindowMs must be an integer')
@@ -154,13 +105,13 @@ export const CallsConfigSchema = z.object({
154
105
  .int('calls.guardianWaitUpdateSteadyMinIntervalMs must be an integer')
155
106
  .min(1000, 'calls.guardianWaitUpdateSteadyMinIntervalMs must be >= 1000')
156
107
  .max(60_000, 'calls.guardianWaitUpdateSteadyMinIntervalMs must be at most 60000')
157
- .default(7000),
108
+ .default(20_000),
158
109
  guardianWaitUpdateSteadyMaxIntervalMs: z
159
110
  .number({ error: 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be a number' })
160
111
  .int('calls.guardianWaitUpdateSteadyMaxIntervalMs must be an integer')
161
112
  .min(1000, 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be >= 1000')
162
113
  .max(60_000, 'calls.guardianWaitUpdateSteadyMaxIntervalMs must be at most 60000')
163
- .default(10_000),
114
+ .default(30_000),
164
115
  disclosure: CallsDisclosureConfigSchema.default(CallsDisclosureConfigSchema.parse({})),
165
116
  safety: CallsSafetyConfigSchema.default(CallsSafetyConfigSchema.parse({})),
166
117
  voice: CallsVoiceConfigSchema.default(CallsVoiceConfigSchema.parse({})),
@@ -175,6 +126,5 @@ export type CallsConfig = z.infer<typeof CallsConfigSchema>;
175
126
  export type CallsDisclosureConfig = z.infer<typeof CallsDisclosureConfigSchema>;
176
127
  export type CallsSafetyConfig = z.infer<typeof CallsSafetyConfigSchema>;
177
128
  export type CallsVoiceConfig = z.infer<typeof CallsVoiceConfigSchema>;
178
- export type CallsElevenLabsConfig = z.infer<typeof CallsElevenLabsConfigSchema>;
179
129
  export type CallerIdentityConfig = z.infer<typeof CallerIdentityConfigSchema>;
180
130
  export type CallsVerificationConfig = z.infer<typeof CallsVerificationConfigSchema>;
@@ -0,0 +1,33 @@
1
+ import { z } from 'zod';
2
+
3
+ // Default ElevenLabs voice — "Rachel" (calm, warm, conversational).
4
+ // Used by both in-app TTS and phone calls (via Twilio ConversationRelay).
5
+ // Mirrored in: clients/macos/.../OpenAIVoiceService.swift (defaultVoiceId)
6
+ export const DEFAULT_ELEVENLABS_VOICE_ID = '21m00Tcm4TlvDq8ikWAM';
7
+
8
+ export const ElevenLabsConfigSchema = z.object({
9
+ voiceId: z
10
+ .string({ error: 'elevenlabs.voiceId must be a string' })
11
+ .min(1, 'elevenlabs.voiceId must not be empty')
12
+ .default(DEFAULT_ELEVENLABS_VOICE_ID),
13
+ voiceModelId: z
14
+ .string({ error: 'elevenlabs.voiceModelId must be a string' })
15
+ .default(''),
16
+ speed: z
17
+ .number({ error: 'elevenlabs.speed must be a number' })
18
+ .min(0.7, 'elevenlabs.speed must be >= 0.7')
19
+ .max(1.2, 'elevenlabs.speed must be <= 1.2')
20
+ .default(1.0),
21
+ stability: z
22
+ .number({ error: 'elevenlabs.stability must be a number' })
23
+ .min(0, 'elevenlabs.stability must be >= 0')
24
+ .max(1, 'elevenlabs.stability must be <= 1')
25
+ .default(0.5),
26
+ similarityBoost: z
27
+ .number({ error: 'elevenlabs.similarityBoost must be a number' })
28
+ .min(0, 'elevenlabs.similarityBoost must be >= 0')
29
+ .max(1, 'elevenlabs.similarityBoost must be <= 1')
30
+ .default(0.75),
31
+ });
32
+
33
+ export type ElevenLabsConfig = z.infer<typeof ElevenLabsConfigSchema>;