@vellumai/assistant 0.4.23 → 0.4.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bun.lock +3 -0
- package/package.json +2 -1
- package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +0 -15
- package/src/__tests__/assistant-events-sse-hardening.test.ts +9 -3
- package/src/__tests__/call-controller.test.ts +80 -0
- package/src/__tests__/config-schema.test.ts +38 -178
- package/src/__tests__/conversation-routes-guardian-reply.test.ts +4 -1
- package/src/__tests__/credential-security-invariants.test.ts +0 -2
- package/src/__tests__/guardian-verify-setup-skill-regression.test.ts +2 -2
- package/src/__tests__/ipc-snapshot.test.ts +0 -9
- package/src/__tests__/onboarding-template-contract.test.ts +10 -20
- package/src/__tests__/relay-server.test.ts +3 -3
- package/src/__tests__/runtime-events-sse-parity.test.ts +10 -0
- package/src/__tests__/runtime-events-sse.test.ts +7 -0
- package/src/__tests__/session-runtime-assembly.test.ts +34 -8
- package/src/__tests__/system-prompt.test.ts +7 -1
- package/src/__tests__/trusted-contact-approval-notifier.test.ts +12 -8
- package/src/__tests__/twilio-routes-twiml.test.ts +2 -2
- package/src/__tests__/twilio-routes.test.ts +2 -3
- package/src/__tests__/voice-quality.test.ts +21 -132
- package/src/calls/call-controller.ts +34 -29
- package/src/calls/relay-server.ts +11 -5
- package/src/calls/twilio-routes.ts +4 -38
- package/src/calls/voice-quality.ts +7 -63
- package/src/config/bundled-skills/guardian-verify-setup/SKILL.md +7 -10
- package/src/config/bundled-skills/messaging/SKILL.md +3 -5
- package/src/config/bundled-skills/phone-calls/SKILL.md +144 -83
- package/src/config/bundled-skills/sms-setup/SKILL.md +0 -20
- package/src/config/bundled-skills/twilio-setup/SKILL.md +9 -17
- package/src/config/bundled-skills/voice-setup/SKILL.md +36 -1
- package/src/config/bundled-skills/voice-setup/icon.svg +20 -0
- package/src/config/calls-schema.ts +3 -53
- package/src/config/elevenlabs-schema.ts +33 -0
- package/src/config/schema.ts +183 -137
- package/src/config/types.ts +0 -1
- package/src/daemon/handlers/browser.ts +1 -6
- package/src/daemon/ipc-contract/browser.ts +5 -14
- package/src/daemon/ipc-contract-inventory.json +0 -2
- package/src/daemon/session-agent-loop-handlers.ts +3 -0
- package/src/daemon/session-runtime-assembly.ts +9 -7
- package/src/mcp/client.ts +2 -1
- package/src/memory/conversation-crud.ts +339 -166
- package/src/runtime/auth/middleware.ts +87 -26
- package/src/runtime/routes/events-routes.ts +7 -0
- package/src/runtime/routes/inbound-message-handler.ts +3 -4
- package/src/schedule/scheduler.ts +159 -45
- package/src/security/secure-keys.ts +3 -3
- package/src/tools/browser/browser-manager.ts +72 -228
- package/src/tools/browser/browser-screencast.ts +0 -5
- package/src/tools/network/script-proxy/certs.ts +7 -237
- package/src/tools/network/script-proxy/connect-tunnel.ts +1 -82
- package/src/tools/network/script-proxy/http-forwarder.ts +2 -151
- package/src/tools/network/script-proxy/logging.ts +12 -196
- package/src/tools/network/script-proxy/mitm-handler.ts +2 -270
- package/src/tools/network/script-proxy/policy.ts +4 -152
- package/src/tools/network/script-proxy/router.ts +2 -60
- package/src/tools/network/script-proxy/server.ts +5 -137
- package/src/tools/network/script-proxy/types.ts +19 -125
- package/src/tools/system/voice-config.ts +23 -1
- package/src/util/logger.ts +4 -1
- package/src/__tests__/elevenlabs-config.test.ts +0 -95
- package/src/__tests__/twilio-routes-elevenlabs.test.ts +0 -407
- package/src/calls/elevenlabs-config.ts +0 -32
|
@@ -29,7 +29,7 @@ import {
|
|
|
29
29
|
} from './call-store.js';
|
|
30
30
|
import { getTwilioConfig } from './twilio-config.js';
|
|
31
31
|
import type { CallStatus } from './types.js';
|
|
32
|
-
import {
|
|
32
|
+
import { resolveVoiceQualityProfile } from './voice-quality.js';
|
|
33
33
|
|
|
34
34
|
const log = getLogger('twilio-routes');
|
|
35
35
|
|
|
@@ -144,7 +144,7 @@ function mapTwilioStatus(twilioStatus: string): CallStatus | null {
|
|
|
144
144
|
* Receives the initial voice webhook when Twilio connects the call.
|
|
145
145
|
* Returns TwiML XML that tells Twilio to open a ConversationRelay WebSocket.
|
|
146
146
|
*
|
|
147
|
-
* Supports two
|
|
147
|
+
* Supports two flows:
|
|
148
148
|
* - **Outbound** (callSessionId present in query): uses the existing session
|
|
149
149
|
* - **Inbound** (callSessionId absent): creates or reuses a session keyed
|
|
150
150
|
* by the Twilio CallSid. Uses daemon internal scope for assistant identity.
|
|
@@ -214,43 +214,9 @@ function buildVoiceWebhookTwiml(
|
|
|
214
214
|
task: string | null,
|
|
215
215
|
guardianVerificationSessionId?: string | null,
|
|
216
216
|
): Response {
|
|
217
|
-
|
|
217
|
+
const profile = resolveVoiceQualityProfile(loadConfig());
|
|
218
218
|
|
|
219
|
-
log.info({ callSessionId,
|
|
220
|
-
|
|
221
|
-
if (profile.validationErrors.length > 0) {
|
|
222
|
-
log.warn({ callSessionId, errors: profile.validationErrors }, 'Voice quality profile has validation warnings');
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
// WS-A: Enforce strict fallback semantics — reject invalid profiles when fallback is disabled
|
|
226
|
-
if (!isVoiceProfileValid(profile)) {
|
|
227
|
-
if (!profile.fallbackToStandardOnError) {
|
|
228
|
-
const errorMsg = `Voice quality configuration error: ${profile.validationErrors.join('; ')}`;
|
|
229
|
-
log.error({ callSessionId, errors: profile.validationErrors }, errorMsg);
|
|
230
|
-
return new Response(errorMsg, { status: 500 });
|
|
231
|
-
}
|
|
232
|
-
// Fallback is enabled — profile already resolved to standard; log explicitly
|
|
233
|
-
log.info({ callSessionId }, 'Profile invalid with fallback enabled; proceeding with standard mode');
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
// WS-B: Guard elevenlabs_agent until consultation bridge exists.
|
|
237
|
-
// This fires BEFORE any ElevenLabs API calls, blocking the entire mode.
|
|
238
|
-
if (profile.mode === 'elevenlabs_agent') {
|
|
239
|
-
if (!profile.fallbackToStandardOnError) {
|
|
240
|
-
const msg = 'elevenlabs_agent mode is restricted: consultation bridging (waiting_on_user) is not yet supported. Set calls.voice.fallbackToStandardOnError=true to fall back to standard mode.';
|
|
241
|
-
log.error({ callSessionId }, msg);
|
|
242
|
-
return new Response(msg, { status: 501 });
|
|
243
|
-
}
|
|
244
|
-
log.warn({ callSessionId }, 'elevenlabs_agent mode is restricted/experimental — consultation bridging is not yet supported; falling back to standard ConversationRelay TwiML');
|
|
245
|
-
const standardConfig = loadConfig();
|
|
246
|
-
profile = resolveVoiceQualityProfile({
|
|
247
|
-
...standardConfig,
|
|
248
|
-
calls: {
|
|
249
|
-
...standardConfig.calls,
|
|
250
|
-
voice: { ...standardConfig.calls.voice, mode: 'twilio_standard' },
|
|
251
|
-
},
|
|
252
|
-
});
|
|
253
|
-
}
|
|
219
|
+
log.info({ callSessionId, ttsProvider: profile.ttsProvider, voice: profile.voice }, 'Voice quality profile resolved');
|
|
254
220
|
|
|
255
221
|
const twilioConfig = getTwilioConfig();
|
|
256
222
|
let relayUrl: string;
|
|
@@ -1,14 +1,10 @@
|
|
|
1
1
|
import { loadConfig } from '../config/loader.js';
|
|
2
2
|
|
|
3
3
|
export interface VoiceQualityProfile {
|
|
4
|
-
mode: 'twilio_standard' | 'twilio_elevenlabs_tts' | 'elevenlabs_agent';
|
|
5
4
|
language: string;
|
|
6
5
|
transcriptionProvider: string;
|
|
7
6
|
ttsProvider: string;
|
|
8
7
|
voice: string;
|
|
9
|
-
agentId?: string;
|
|
10
|
-
fallbackToStandardOnError: boolean;
|
|
11
|
-
validationErrors: string[];
|
|
12
8
|
}
|
|
13
9
|
|
|
14
10
|
/**
|
|
@@ -45,70 +41,18 @@ export function buildElevenLabsVoiceSpec(config: {
|
|
|
45
41
|
|
|
46
42
|
/**
|
|
47
43
|
* Resolve the effective voice quality profile from config.
|
|
48
|
-
*
|
|
49
|
-
*
|
|
44
|
+
*
|
|
45
|
+
* Always uses ElevenLabs TTS via Twilio ConversationRelay.
|
|
46
|
+
* The voice ID comes from the shared `elevenlabs.voiceId` config
|
|
47
|
+
* (defaults to Rachel — 21m00Tcm4TlvDq8ikWAM).
|
|
50
48
|
*/
|
|
51
49
|
export function resolveVoiceQualityProfile(config?: ReturnType<typeof loadConfig>): VoiceQualityProfile {
|
|
52
50
|
const cfg = config ?? loadConfig();
|
|
53
51
|
const voice = cfg.calls.voice;
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
// Default/standard profile
|
|
57
|
-
const standardProfile: VoiceQualityProfile = {
|
|
58
|
-
mode: 'twilio_standard',
|
|
52
|
+
return {
|
|
59
53
|
language: voice.language,
|
|
60
54
|
transcriptionProvider: voice.transcriptionProvider,
|
|
61
|
-
ttsProvider: '
|
|
62
|
-
voice:
|
|
63
|
-
fallbackToStandardOnError: voice.fallbackToStandardOnError,
|
|
64
|
-
validationErrors: [],
|
|
55
|
+
ttsProvider: 'ElevenLabs',
|
|
56
|
+
voice: buildElevenLabsVoiceSpec(cfg.elevenlabs),
|
|
65
57
|
};
|
|
66
|
-
|
|
67
|
-
if (voice.mode === 'twilio_standard') {
|
|
68
|
-
return standardProfile;
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
if (voice.mode === 'twilio_elevenlabs_tts') {
|
|
72
|
-
if (!voice.elevenlabs.voiceId && !voice.fallbackToStandardOnError) {
|
|
73
|
-
errors.push('calls.voice.elevenlabs.voiceId is required for twilio_elevenlabs_tts mode when fallback is disabled');
|
|
74
|
-
}
|
|
75
|
-
if (!voice.elevenlabs.voiceId && voice.fallbackToStandardOnError) {
|
|
76
|
-
return { ...standardProfile, validationErrors: ['calls.voice.elevenlabs.voiceId is empty; falling back to twilio_standard'] };
|
|
77
|
-
}
|
|
78
|
-
return {
|
|
79
|
-
mode: 'twilio_elevenlabs_tts',
|
|
80
|
-
language: voice.language,
|
|
81
|
-
transcriptionProvider: voice.transcriptionProvider,
|
|
82
|
-
ttsProvider: 'ElevenLabs',
|
|
83
|
-
voice: buildElevenLabsVoiceSpec(voice.elevenlabs),
|
|
84
|
-
fallbackToStandardOnError: voice.fallbackToStandardOnError,
|
|
85
|
-
validationErrors: errors,
|
|
86
|
-
};
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
if (voice.mode === 'elevenlabs_agent') {
|
|
90
|
-
if (!voice.elevenlabs.agentId && !voice.fallbackToStandardOnError) {
|
|
91
|
-
errors.push('calls.voice.elevenlabs.agentId is required for elevenlabs_agent mode when fallback is disabled');
|
|
92
|
-
}
|
|
93
|
-
if (!voice.elevenlabs.agentId && voice.fallbackToStandardOnError) {
|
|
94
|
-
return { ...standardProfile, validationErrors: ['calls.voice.elevenlabs.agentId is empty; falling back to twilio_standard'] };
|
|
95
|
-
}
|
|
96
|
-
return {
|
|
97
|
-
mode: 'elevenlabs_agent',
|
|
98
|
-
language: voice.language,
|
|
99
|
-
transcriptionProvider: voice.transcriptionProvider,
|
|
100
|
-
ttsProvider: 'ElevenLabs',
|
|
101
|
-
voice: buildElevenLabsVoiceSpec(voice.elevenlabs),
|
|
102
|
-
agentId: voice.elevenlabs.agentId,
|
|
103
|
-
fallbackToStandardOnError: voice.fallbackToStandardOnError,
|
|
104
|
-
validationErrors: errors,
|
|
105
|
-
};
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
return standardProfile;
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
/** Returns false when the profile has any validation errors. */
|
|
112
|
-
export function isVoiceProfileValid(profile: VoiceQualityProfile): boolean {
|
|
113
|
-
return profile.validationErrors.length === 0;
|
|
114
58
|
}
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: "Guardian Verify Setup"
|
|
3
|
-
description: "Set up guardian verification for
|
|
3
|
+
description: "Set up guardian verification for voice or Telegram channels via outbound verification flow"
|
|
4
4
|
user-invocable: true
|
|
5
5
|
metadata: {"vellum": {"emoji": "\ud83d\udd10"}}
|
|
6
6
|
---
|
|
7
7
|
|
|
8
|
-
You are helping your user set up guardian verification for a messaging channel (
|
|
8
|
+
You are helping your user set up guardian verification for a messaging channel (voice or Telegram). This links their identity as the trusted guardian for the chosen channel. All API calls go through the gateway HTTP API using `curl` with bearer auth.
|
|
9
9
|
|
|
10
10
|
## Prerequisites
|
|
11
11
|
|
|
@@ -19,17 +19,16 @@ You are helping your user set up guardian verification for a messaging channel (
|
|
|
19
19
|
|
|
20
20
|
Ask the user which channel they want to verify:
|
|
21
21
|
|
|
22
|
-
- **sms** -- verify a phone number for SMS messaging
|
|
23
22
|
- **voice** -- verify a phone number for voice calls
|
|
24
23
|
- **telegram** -- verify a Telegram account
|
|
25
24
|
|
|
26
|
-
If the user's intent already specifies a channel (e.g. "verify my phone number for
|
|
25
|
+
If the user's intent already specifies a channel (e.g. "verify my phone number for voice calls"), skip the prompt and proceed.
|
|
27
26
|
|
|
28
27
|
## Step 2: Collect Destination
|
|
29
28
|
|
|
30
29
|
Based on the chosen channel, ask for the required destination:
|
|
31
30
|
|
|
32
|
-
- **
|
|
31
|
+
- **Voice**: Ask for their phone number. Accept any common format (e.g. +15551234567, (555) 123-4567, 555-123-4567). The API normalizes it to E.164.
|
|
33
32
|
- **Telegram**: Ask for their Telegram chat ID (numeric) or @handle. Explain:
|
|
34
33
|
- If they know their numeric chat ID, provide it directly. The bot will send the code to that chat.
|
|
35
34
|
- If they only know their @handle, the flow uses a bootstrap deep-link that they must click first.
|
|
@@ -45,13 +44,12 @@ curl -s -X POST "$INTERNAL_GATEWAY_BASE_URL/v1/integrations/guardian/outbound/st
|
|
|
45
44
|
-d '{"channel": "<channel>", "destination": "<destination>"}'
|
|
46
45
|
```
|
|
47
46
|
|
|
48
|
-
Replace `<channel>` with `
|
|
47
|
+
Replace `<channel>` with `voice` or `telegram`, and `<destination>` with the phone number or Telegram destination.
|
|
49
48
|
|
|
50
49
|
### On success (`success: true`)
|
|
51
50
|
|
|
52
51
|
Report the exact next action based on the channel:
|
|
53
52
|
|
|
54
|
-
- **SMS**: "I've sent a 6-digit verification code to [number]. Reply with the code from that SMS conversation (not here) to complete verification — the code can only be consumed through the SMS channel."
|
|
55
53
|
- **Voice**: The response includes a `secret` field with the verification code. Tell the user the code BEFORE the call connects: "I'm calling [number] now. Your verification code is [secret]. When you answer the call, enter this code using your phone's keypad." The `/outbound/start` API call already initiates the voice call. Do NOT place a separate `call_start` call. **After delivering the code, immediately begin the voice auto-check polling loop** (see [Voice Auto-Check Polling](#voice-auto-check-polling) below).
|
|
56
54
|
- **Telegram with chat ID** (no `telegramBootstrapUrl` in response): The response includes a `secret` field. Show it in the current chat: "Your verification code is **[secret]**. I've also sent it to your Telegram. Open the Telegram bot chat and reply with that 6-digit code to complete verification." If the response does not contain a `secret` field, treat this as a control-plane error: tell the user something went wrong and ask them to retry from Step 3 or resend (Step 4).
|
|
57
55
|
- **Telegram with handle** (`telegramBootstrapUrl` present in response): "Tap this deep-link first: [telegramBootstrapUrl]. After Telegram binds your identity, I'll send your verification code."
|
|
@@ -68,7 +66,7 @@ Handle each error code:
|
|
|
68
66
|
| `invalid_destination` | Tell the user the format is invalid. For phone: suggest E.164 format (+15551234567). For Telegram: explain that group chat IDs (negative numbers) are not supported. |
|
|
69
67
|
| `already_bound` | Tell the user a guardian is already bound for this channel. Ask if they want to replace it. If yes, re-run the start request with `"rebind": true` added to the JSON body. |
|
|
70
68
|
| `rate_limited` | Tell the user they have sent too many verification attempts to this destination. Ask them to wait and try again later. |
|
|
71
|
-
| `unsupported_channel` | Tell the user the channel is not supported. Only
|
|
69
|
+
| `unsupported_channel` | Tell the user the channel is not supported. Only voice and telegram are valid. |
|
|
72
70
|
| `no_bot_username` | Telegram bot is not configured. Load and run the `telegram-setup` skill first. |
|
|
73
71
|
|
|
74
72
|
## Step 4: Handle Resend
|
|
@@ -84,7 +82,6 @@ curl -s -X POST "$INTERNAL_GATEWAY_BASE_URL/v1/integrations/guardian/outbound/re
|
|
|
84
82
|
|
|
85
83
|
On success, report the next action based on the channel:
|
|
86
84
|
|
|
87
|
-
- **SMS**: "I've sent a new verification code to [number]. Reply with the code from that SMS conversation to complete verification."
|
|
88
85
|
- **Voice**: The resend response includes a fresh `secret` field with a new verification code. Tell the user the new code BEFORE the call connects — just like the initial start flow: "I'm calling [number] again. Your new verification code is [secret]. When you answer the call, enter this code using your phone's keypad." The `/outbound/resend` API call already initiates the voice call. Do NOT place a separate `call_start` call. **After delivering the code, immediately begin the voice auto-check polling loop** (see [Voice Auto-Check Polling](#voice-auto-check-polling) below).
|
|
89
86
|
- **Telegram**: The resend response includes a fresh `secret` field. Show the new code in the current chat: "Your new verification code is **[secret]**. I've also sent it to your Telegram. Open the Telegram bot chat and reply with that 6-digit code to complete verification." If the response does not contain a `secret` field, treat this as a control-plane error: tell the user something went wrong and ask them to retry from Step 3.
|
|
90
87
|
|
|
@@ -140,7 +137,7 @@ When in a **rebind flow** (i.e., the `start_outbound` request included `"rebind"
|
|
|
140
137
|
- Non-rebind flows (fresh verification with no prior binding) are unaffected — the first `bound: true` is trustworthy.
|
|
141
138
|
|
|
142
139
|
**Important polling rules:**
|
|
143
|
-
- This polling loop is voice-only. Do NOT poll for
|
|
140
|
+
- This polling loop is voice-only. Do NOT poll for Telegram channels (Telegram has its own bot-driven flow).
|
|
144
141
|
- Do NOT require the user to ask "did it work?" — the whole point is proactive confirmation.
|
|
145
142
|
- If the user sends a message while polling is in progress, handle their message normally. If their message is about verification status, the next poll iteration will provide the answer.
|
|
146
143
|
|
|
@@ -84,15 +84,13 @@ SMS messaging uses Twilio as the telephony provider. Twilio credentials and phon
|
|
|
84
84
|
|
|
85
85
|
The sms-setup skill handles: Twilio credential storage (Account SID + Auth Token), phone number provisioning or assignment, public ingress setup, SMS compliance verification, and end-to-end test sending. Once SMS is set up, messaging is available automatically — no additional feature flag is needed.
|
|
86
86
|
|
|
87
|
-
|
|
87
|
+
### Guardian Verification (Voice or Telegram)
|
|
88
88
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
If the user asks to verify their guardian identity for any channel (SMS, voice, or Telegram), load the **guardian-verify-setup** skill:
|
|
89
|
+
If the user asks to verify their guardian identity for voice or Telegram, load the **guardian-verify-setup** skill:
|
|
92
90
|
|
|
93
91
|
- Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
|
|
94
92
|
|
|
95
|
-
The guardian-verify-setup skill handles the full outbound verification flow for
|
|
93
|
+
The guardian-verify-setup skill handles the full outbound verification flow for voice and Telegram channels. It collects the user's destination (phone number or Telegram chat ID/handle), initiates an outbound verification session, and guides the user through entering or replying with the verification code. This is the single source of truth for guardian verification setup -- do not duplicate the verification flow inline.
|
|
96
94
|
|
|
97
95
|
## Error Recovery
|
|
98
96
|
|
|
@@ -11,7 +11,7 @@ You are helping the user set up and manage phone calls via Twilio. This skill co
|
|
|
11
11
|
|
|
12
12
|
## Overview
|
|
13
13
|
|
|
14
|
-
The calling system uses Twilio's ConversationRelay for both **outbound** and **inbound** voice calls. Twilio
|
|
14
|
+
The calling system uses Twilio's ConversationRelay for both **outbound** and **inbound** voice calls, with **ElevenLabs** providing the text-to-speech voice. After Twilio setup, the assistant configures ElevenLabs as the TTS provider and prompts the user to choose a voice from a curated list of supported options.
|
|
15
15
|
|
|
16
16
|
### Outbound calls
|
|
17
17
|
|
|
@@ -34,14 +34,6 @@ When someone dials the assistant's Twilio phone number:
|
|
|
34
34
|
5. Once verified (or if no challenge is pending), the LLM orchestrator greets the caller in a receptionist style: "Hello, this is [user]'s assistant. How can I help you today?"
|
|
35
35
|
6. The assistant converses naturally, using ASK_GUARDIAN to consult the user when needed, just like outbound calls.
|
|
36
36
|
|
|
37
|
-
Three voice quality modes are available:
|
|
38
|
-
|
|
39
|
-
- **`twilio_standard`** (default) — Fully supported. Standard Twilio TTS with Google voices. No extra setup required.
|
|
40
|
-
- **`twilio_elevenlabs_tts`** — Fully supported. Uses ElevenLabs voices through Twilio ConversationRelay for more natural speech.
|
|
41
|
-
- **`elevenlabs_agent`** — **Experimental/restricted.** Full ElevenLabs conversational agent mode. Consultation bridging (`waiting_on_user`) is not yet supported in this mode; the runtime guard blocks it before any ElevenLabs API calls are made. See the "Runtime behavior" section below for fallback and strict-fail details.
|
|
42
|
-
|
|
43
|
-
You can keep using Twilio only — no changes needed. Enabling ElevenLabs can improve naturalness and quality.
|
|
44
|
-
|
|
45
37
|
The user's assistant gets its own personal phone number through Twilio. All implicit calls (without an explicit mode) always use this assistant number. Optionally, users can call from their own phone number if it's authorized with the Twilio account — this must be explicitly requested per call via `caller_identity_mode="user_number"`.
|
|
46
38
|
|
|
47
39
|
## Step 1: Verify Twilio Setup
|
|
@@ -79,18 +71,105 @@ Verify:
|
|
|
79
71
|
vellum config get calls.enabled
|
|
80
72
|
```
|
|
81
73
|
|
|
82
|
-
## Step 3:
|
|
74
|
+
## Step 3: Choose a Voice
|
|
75
|
+
|
|
76
|
+
After enabling calls, let the user choose an ElevenLabs voice. Twilio has a native ElevenLabs integration — no separate ElevenLabs account or API key is needed.
|
|
77
|
+
|
|
78
|
+
### Voice consistency with in-app TTS
|
|
79
|
+
|
|
80
|
+
The shared config key `elevenlabs.voiceId` is the single source of truth for ElevenLabs voice identity. Both in-app TTS and phone calls read from it (defaulting to **Rachel** — `21m00Tcm4TlvDq8ikWAM`).
|
|
81
|
+
|
|
82
|
+
Before presenting the voice list, check the current shared voice:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
vellum config get elevenlabs.voiceId
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**If a non-default voice is already set**, the user chose it during voice-setup or a previous session. Tell them:
|
|
89
|
+
|
|
90
|
+
> "Your assistant currently uses [voice name] for both in-app chat and phone calls. I'll keep the same voice for calls. You can change it if you'd like."
|
|
91
|
+
|
|
92
|
+
Skip the selection prompt unless the user wants to change.
|
|
93
|
+
|
|
94
|
+
**If the default (Rachel) is set or no override exists**, present the curated voice list below and let them pick. When they choose, set the shared config so both in-app TTS and phone calls use it (via the `voice_config_update` command shown after the voice tables below).
|
|
95
|
+
|
|
96
|
+
### Voice selection
|
|
97
|
+
|
|
98
|
+
Present the user with a list of supported ElevenLabs voices. These are pre-made voices with stable IDs that work with Twilio ConversationRelay out of the box.
|
|
99
|
+
|
|
100
|
+
**Ask the user: "Which voice would you like your assistant to use on phone calls?"**
|
|
101
|
+
|
|
102
|
+
Present these voices grouped by category:
|
|
103
|
+
|
|
104
|
+
#### Female voices
|
|
105
|
+
|
|
106
|
+
| Voice | Style | Voice ID |
|
|
107
|
+
| --------- | ------------------------------ | ------------------------------ |
|
|
108
|
+
| Rachel | Calm, warm, conversational | `21m00Tcm4TlvDq8ikWAM` |
|
|
109
|
+
| Sarah | Soft, young, approachable | `EXAVITQu4vr4xnSDxMaL` |
|
|
110
|
+
| Charlotte | Warm, Swedish-accented | `XB0fDUnXU5powFXDhCwa` |
|
|
111
|
+
| Alice | Confident, British | `Xb7hH8MSUJpSbSDYk0k2` |
|
|
112
|
+
| Matilda | Warm, friendly, young | `XrExE9yKIg1WjnnlVkGX` |
|
|
113
|
+
| Lily | Warm, British | `pFZP5JQG7iQjIQuC4Bku` |
|
|
114
|
+
|
|
115
|
+
#### Male voices
|
|
116
|
+
|
|
117
|
+
| Voice | Style | Voice ID |
|
|
118
|
+
| ------- | -------------------------------- | ------------------------------ |
|
|
119
|
+
| Antoni | Warm, well-rounded | `ErXwobaYiN019PkySvjV` |
|
|
120
|
+
| Josh | Deep, young, clear | `TxGEqnHWrfWFTfGW9XjX` |
|
|
121
|
+
| Arnold | Crisp, narrative | `VR6AewLTigWG4xSOukaG` |
|
|
122
|
+
| Adam | Deep, middle-aged, professional | `pNInz6obpgDQGcFmaJgB` |
|
|
123
|
+
| Bill | Trustworthy, American | `pqHfZKP75CvOlQylNhV4` |
|
|
124
|
+
| George | Warm, British, distinguished | `JBFqnCBsd6RMkjVDRZzb` |
|
|
125
|
+
| Daniel | Authoritative, British | `onwK4e9ZLuTAKqWW03F9` |
|
|
126
|
+
| Charlie | Casual, Australian | `IKne3meq5aSn9XLyUdCD` |
|
|
127
|
+
| Liam | Young, articulate | `TX3LPaxmHKxFdv7VOQHJ` |
|
|
128
|
+
|
|
129
|
+
After the user picks a voice, use `voice_config_update` to set the shared voice ID. This writes to the config file (`elevenlabs.voiceId`) for phone calls **and** pushes to the macOS app via IPC (`ttsVoiceId`) for in-app TTS in one call:
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
voice_config_update setting="tts_voice_id" value="<selected-voice-id>"
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**If the user wants a voice not on this list**, they can browse more voices at https://elevenlabs.io/voice-library and provide the voice ID manually.
|
|
136
|
+
|
|
137
|
+
## Step 4: Verify Setup (Test Call)
|
|
83
138
|
|
|
84
139
|
Before making real calls, offer a quick verification:
|
|
85
140
|
|
|
86
141
|
1. Confirm credentials are stored: check the Twilio config endpoint for `hasCredentials: true` and `phoneNumber`
|
|
87
142
|
2. Confirm ingress is running: `ingress.publicBaseUrl` must be set and the tunnel active
|
|
88
143
|
3. Confirm calls are enabled: `calls.enabled` must be `true`
|
|
144
|
+
4. Confirm voice is configured: `elevenlabs.voiceId` should be set (check with `vellum config get elevenlabs.voiceId`)
|
|
89
145
|
|
|
90
|
-
Suggest a test call to the user's own phone: **"Want to do a quick test call to your phone to make sure everything works?"**
|
|
146
|
+
Suggest a test call to the user's own phone: **"Want to do a quick test call to your phone to make sure everything works? This is a good way to hear how your chosen voice sounds."**
|
|
91
147
|
|
|
92
148
|
If they agree, ask for their personal phone number and place a test call with a simple task like "Introduce yourself and confirm the call system is working."
|
|
93
149
|
|
|
150
|
+
## Step 5: Verify Guardian Identity (Voice)
|
|
151
|
+
|
|
152
|
+
Now link the user's phone number as the trusted voice guardian. Tell the user: "Now let's verify your guardian identity for voice. This links your phone number so the assistant can verify inbound callers."
|
|
153
|
+
|
|
154
|
+
Load the **guardian-verify-setup** skill to handle the verification flow:
|
|
155
|
+
|
|
156
|
+
- Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
|
|
157
|
+
|
|
158
|
+
When invoking the skill, indicate the channel is `voice`. The guardian-verify-setup skill manages the full outbound verification flow, including:
|
|
159
|
+
|
|
160
|
+
- Collecting the user's phone number as the destination
|
|
161
|
+
- Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "voice"`
|
|
162
|
+
- Calling the phone number and providing a code for the user to enter via their phone's keypad
|
|
163
|
+
- Proactively polling for completion (voice auto-check) so the user gets instant confirmation
|
|
164
|
+
- Checking guardian status to confirm the binding was created
|
|
165
|
+
- Handling resend, cancel, and error cases
|
|
166
|
+
|
|
167
|
+
Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted voice guardian."_
|
|
168
|
+
|
|
169
|
+
After the guardian-verify-setup skill completes (or the user skips), continue to the next sections.
|
|
170
|
+
|
|
171
|
+
**Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed without blocking. Once verified, inbound callers can be prompted for voice verification before calls proceed (see the **Guardian voice verification for inbound calls** section below).
|
|
172
|
+
|
|
94
173
|
## Caller Identity
|
|
95
174
|
|
|
96
175
|
All implicit calls (calls without an explicit `caller_identity_mode`) always use the assistant's Twilio phone number. This is the number that appears on the recipient's caller ID.
|
|
@@ -133,88 +212,83 @@ An optional verification step where the callee must enter a numeric code via the
|
|
|
133
212
|
| `calls.verification.enabled` | Enable DTMF callee verification | `false` |
|
|
134
213
|
| `calls.verification.codeLength` | Number of digits in the verification code | `6` |
|
|
135
214
|
|
|
136
|
-
##
|
|
215
|
+
## Advanced Voice Configuration
|
|
137
216
|
|
|
138
|
-
ElevenLabs
|
|
217
|
+
ElevenLabs is the TTS provider for all calls. This section covers advanced voice selection and tuning.
|
|
139
218
|
|
|
140
|
-
###
|
|
219
|
+
### Changing the voice
|
|
141
220
|
|
|
142
|
-
|
|
221
|
+
To switch to a different voice after initial setup, use `voice_config_update` to set the shared voice ID. It writes the value to `elevenlabs.voiceId` in the config file (used for phone calls) **and** pushes it to the macOS app via IPC (used for in-app TTS):
|
|
143
222
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
2. If the user doesn't care, keep `twilio_standard` (simplest path)
|
|
148
|
-
3. If they want higher-quality voice, switch to `twilio_elevenlabs_tts` and choose a matching ElevenLabs voice on their behalf
|
|
223
|
+
```
|
|
224
|
+
voice_config_update setting="tts_voice_id" value="<new-voice-id>"
|
|
225
|
+
```
|
|
149
226
|
|
|
150
|
-
|
|
227
|
+
Browse more voices at https://elevenlabs.io/voice-library.
|
|
151
228
|
|
|
152
|
-
|
|
229
|
+
### Advanced voice selection with an ElevenLabs account
|
|
153
230
|
|
|
154
|
-
|
|
155
|
-
vellum config set calls.voice.mode twilio_elevenlabs_tts
|
|
156
|
-
vellum config set calls.voice.elevenlabs.voiceId "<your-voice-id>"
|
|
157
|
-
```
|
|
231
|
+
Users who have an ElevenLabs account and API key (e.g., from the **voice-setup** skill) can go beyond the curated voice list. With an API key, they can:
|
|
158
232
|
|
|
159
|
-
|
|
233
|
+
- **Browse the full ElevenLabs voice library programmatically** — the ElevenLabs API (`GET https://api.elevenlabs.io/v2/voices`) supports searching by name, category, language, and accent. This returns voice IDs, names, labels, and preview URLs.
|
|
234
|
+
- **Use custom or cloned voices** — if the user has created a custom voice or voice clone in their ElevenLabs account, they can use its voice ID here. These voices are available in Twilio ConversationRelay just like pre-made voices.
|
|
235
|
+
- **Preview voices before choosing** — each voice in the API response includes a `preview_url` with an audio sample.
|
|
160
236
|
|
|
161
|
-
|
|
237
|
+
To check if the user has an API key stored:
|
|
162
238
|
|
|
163
239
|
```
|
|
164
|
-
|
|
240
|
+
credential_store action=get service=elevenlabs field=api_key
|
|
165
241
|
```
|
|
166
242
|
|
|
167
|
-
|
|
168
|
-
`voiceId-model-speed_stability_similarity`.
|
|
169
|
-
|
|
170
|
-
### Mode: `elevenlabs_agent` (experimental/restricted)
|
|
243
|
+
If they have a key and want to browse voices, fetch the voice list:
|
|
171
244
|
|
|
172
|
-
|
|
245
|
+
```bash
|
|
246
|
+
curl -s "https://api.elevenlabs.io/v2/voices?category=premade&page_size=50" \
|
|
247
|
+
-H "xi-api-key: <api_key_from_credential_store>" | python3 -m json.tool
|
|
248
|
+
```
|
|
173
249
|
|
|
174
|
-
|
|
250
|
+
To search for a specific voice style:
|
|
175
251
|
|
|
176
|
-
|
|
252
|
+
```bash
|
|
253
|
+
curl -s "https://api.elevenlabs.io/v2/voices?search=warm+female&page_size=10" \
|
|
254
|
+
-H "xi-api-key: <api_key_from_credential_store>" | python3 -m json.tool
|
|
255
|
+
```
|
|
177
256
|
|
|
178
|
-
|
|
257
|
+
After the user picks a voice, set the shared voice ID:
|
|
179
258
|
|
|
180
259
|
```
|
|
181
|
-
|
|
260
|
+
voice_config_update setting="tts_voice_id" value="<selected-voice-id>"
|
|
182
261
|
```
|
|
183
262
|
|
|
184
|
-
|
|
263
|
+
### Voice tuning parameters
|
|
185
264
|
|
|
186
|
-
|
|
187
|
-
vellum config set calls.voice.mode elevenlabs_agent
|
|
188
|
-
vellum config set calls.voice.elevenlabs.agentId "<your-agent-id>"
|
|
189
|
-
```
|
|
265
|
+
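Fine-tune how the selected voice sounds. For phone calls, these values are only included in the voice string sent to Twilio when `elevenlabs.voiceModelId` is set (by default a bare `voiceId` with no tuning suffix is sent):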
|
|
190
266
|
|
|
191
|
-
|
|
267
|
+
```bash
|
|
268
|
+
# Playback speed (0.7 = slower, 1.0 = normal, 1.2 = faster)
|
|
269
|
+
vellum config set elevenlabs.speed 1.0
|
|
192
270
|
|
|
193
|
-
|
|
271
|
+
# Stability (0.0 = more expressive/variable, 1.0 = more consistent/monotone)
|
|
272
|
+
vellum config set elevenlabs.stability 0.5
|
|
194
273
|
|
|
195
|
-
|
|
274
|
+
# Similarity boost (0.0 = more creative, 1.0 = closer to original voice)
|
|
275
|
+
vellum config set elevenlabs.similarityBoost 0.75
|
|
276
|
+
```
|
|
196
277
|
|
|
197
|
-
|
|
198
|
-
- **`false`:** The voice webhook returns **HTTP 500** with the specific configuration error details (e.g., `"Voice quality configuration error: calls.voice.elevenlabs.voiceId is required..."`).
|
|
278
|
+
Lower stability makes the voice more expressive but less predictable — good for conversational calls. Higher stability is better for scripted/formal calls.
|
|
199
279
|
|
|
200
|
-
|
|
280
|
+
### Voice model tuning
|
|
201
281
|
|
|
202
|
-
|
|
203
|
-
- **`false`:** The voice webhook returns **HTTP 501** with the message: `"elevenlabs_agent mode is restricted: consultation bridging (waiting_on_user) is not yet supported."`. No ElevenLabs API calls are made.
|
|
282
|
+
By default, the system sends a **bare** `voiceId` to Twilio ConversationRelay (no model/tuning suffix). This is the safest default across voice IDs.
|
|
204
283
|
|
|
205
|
-
|
|
284
|
+
To force Twilio's extended voice spec, set a model ID:
|
|
206
285
|
|
|
207
286
|
```bash
|
|
208
|
-
vellum config set
|
|
287
|
+
vellum config set elevenlabs.voiceModelId "flash_v2_5"
|
|
209
288
|
```
|
|
210
289
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
To go back to the default voice at any time:
|
|
214
|
-
|
|
215
|
-
```bash
|
|
216
|
-
vellum config set calls.voice.mode twilio_standard
|
|
217
|
-
```
|
|
290
|
+
When `voiceModelId` is set, the emitted voice string becomes:
|
|
291
|
+
`voiceId-model-speed_stability_similarity`.
|
|
218
292
|
|
|
219
293
|
## Making Outbound Calls
|
|
220
294
|
|
|
@@ -477,16 +551,13 @@ All call-related settings can be managed via `vellum config`:
|
|
|
477
551
|
| `calls.model` | Override LLM model for call orchestration | _(uses default model)_ |
|
|
478
552
|
| `calls.callerIdentity.allowPerCallOverride` | Allow per-call caller identity selection | `true` |
|
|
479
553
|
| `calls.callerIdentity.userNumber` | E.164 phone number for user-number mode | _(empty)_ |
|
|
480
|
-
| `calls.voice.mode` | Voice quality mode (`twilio_standard`, `twilio_elevenlabs_tts`, `elevenlabs_agent`) | `twilio_standard` |
|
|
481
554
|
| `calls.voice.language` | Language code for TTS and transcription | `en-US` |
|
|
482
555
|
| `calls.voice.transcriptionProvider` | Speech-to-text provider (`Deepgram`, `Google`) | `Deepgram` |
|
|
483
|
-
| `
|
|
484
|
-
| `
|
|
485
|
-
| `
|
|
486
|
-
| `
|
|
487
|
-
| `
|
|
488
|
-
| `calls.voice.elevenlabs.stability` | Voice stability (`0.0` – `1.0`) | `0.5` |
|
|
489
|
-
| `calls.voice.elevenlabs.similarityBoost` | Voice similarity boost (`0.0` – `1.0`) | `0.75` |
|
|
556
|
+
| `elevenlabs.voiceId` | ElevenLabs voice ID used by both in-app TTS and phone calls. Set during setup from the curated voice list. Defaults to Rachel | `21m00Tcm4TlvDq8ikWAM` |
|
|
557
|
+
| `elevenlabs.voiceModelId` | Optional Twilio ConversationRelay model suffix. Leave empty to send bare `voiceId` | _(empty)_ |
|
|
558
|
+
| `elevenlabs.speed` | Playback speed (`0.7` – `1.2`) | `1.0` |
|
|
559
|
+
| `elevenlabs.stability` | Voice stability (`0.0` – `1.0`) | `0.5` |
|
|
560
|
+
| `elevenlabs.similarityBoost` | Voice similarity boost (`0.0` – `1.0`) | `0.75` |
|
|
490
561
|
|
|
491
562
|
### Adjusting settings
|
|
492
563
|
|
|
@@ -558,27 +629,17 @@ Or re-run the public-ingress skill to auto-detect and save the new URL.
|
|
|
558
629
|
|
|
559
630
|
### Call drops after 30 seconds of silence
|
|
560
631
|
|
|
561
|
-
The system has a 30-second silence timeout. If nobody speaks for 30 seconds, the agent will ask "Are you still there?" This is expected behavior.
|
|
632
|
+
The system has a 30-second silence timeout. If nobody speaks for 30 seconds during normal conversation, the agent will ask "Are you still there?" This is expected behavior. During guardian wait states (inbound access-request wait or in-call guardian consultation wait), this generic silence nudge is suppressed — the guardian-wait heartbeat messaging is used instead.
|
|
562
633
|
|
|
563
|
-
### Call quality
|
|
634
|
+
### Call quality sounds off
|
|
564
635
|
|
|
565
|
-
- Verify `
|
|
636
|
+
- Verify `elevenlabs.voiceId` is set to a valid ElevenLabs voice ID
|
|
566
637
|
- Ask for the desired voice style again and try a different voice selection
|
|
567
|
-
- If configuring manually: check that `calls.voice.elevenlabs.voiceId` contains a valid ElevenLabs voice ID
|
|
568
|
-
- If mode is `elevenlabs_agent`, ensure `calls.voice.elevenlabs.agentId` is also set
|
|
569
638
|
|
|
570
639
|
### Twilio says "application error" right after answer
|
|
571
640
|
|
|
572
641
|
- This often means ConversationRelay rejected voice configuration after TwiML fetch
|
|
573
|
-
- Keep `
|
|
642
|
+
- Keep `elevenlabs.voiceModelId` empty first (bare `voiceId` mode)
|
|
574
643
|
- If you set `voiceModelId`, try clearing it and retesting:
|
|
575
|
-
`vellum config set
|
|
576
|
-
|
|
577
|
-
### ElevenLabs mode falls back to standard
|
|
578
|
-
|
|
579
|
-
When `calls.voice.fallbackToStandardOnError` is `true` (the default), the system silently falls back to standard Twilio TTS if ElevenLabs encounters an error or restriction. Check:
|
|
644
|
+
`vellum config set elevenlabs.voiceModelId ""`
|
|
580
645
|
|
|
581
|
-
- For `elevenlabs_agent` mode: this mode is currently restricted (consultation bridging not yet supported) and will always fall back to standard when fallback is enabled. If fallback is disabled, the voice webhook returns HTTP 501.
|
|
582
|
-
- For `twilio_elevenlabs_tts` mode: verify `calls.voice.elevenlabs.voiceId` is set to a valid voice ID
|
|
583
|
-
- For invalid configs (missing voiceId/agentId): if fallback is disabled, the voice webhook returns HTTP 500 with the config error
|
|
584
|
-
- Review daemon logs for warning messages about fallback or guard activation
|
|
@@ -144,26 +144,6 @@ After deletion, return to Step 3b to collect information and resubmit. Warn the
|
|
|
144
144
|
|
|
145
145
|
**On failure:** Report the exact error message and guide the user through resolution.
|
|
146
146
|
|
|
147
|
-
## Step 3.5: Guardian Verification (SMS)
|
|
148
|
-
|
|
149
|
-
Now link the user's phone number as the trusted SMS guardian. Tell the user: "Now let's verify your guardian identity for SMS. This links your phone number as the trusted guardian for SMS messaging."
|
|
150
|
-
|
|
151
|
-
Load the **guardian-verify-setup** skill to handle the verification flow:
|
|
152
|
-
|
|
153
|
-
- Call `skill_load` with `skill: "guardian-verify-setup"` to load the dependency skill.
|
|
154
|
-
|
|
155
|
-
When invoking the skill, indicate the channel is `sms`. The guardian-verify-setup skill manages the full outbound verification flow, including:
|
|
156
|
-
|
|
157
|
-
- Collecting the user's phone number as the destination (accepts any common format -- the API normalizes to E.164)
|
|
158
|
-
- Starting the outbound verification session via the gateway endpoint `POST /v1/integrations/guardian/outbound/start` with `channel: "sms"`
|
|
159
|
-
- Sending a 6-digit code to the phone number that the user must reply with from the SMS channel
|
|
160
|
-
- Checking guardian status to confirm the binding was created
|
|
161
|
-
- Handling resend, cancel, and error cases
|
|
162
|
-
|
|
163
|
-
Tell the user: _"I've loaded the guardian verification guide. It will walk you through linking your phone number as the trusted SMS guardian."_
|
|
164
|
-
|
|
165
|
-
**Note:** Guardian verification is optional but recommended. If the user declines or wants to skip, proceed to Step 4 without blocking.
|
|
166
|
-
|
|
167
147
|
## Step 4: Test Send
|
|
168
148
|
|
|
169
149
|
Run a test SMS to verify end-to-end delivery:
|